# SIMPLE REGRESSION EXAMPLE USING THE DETROIT MURDER DATASET, YEARS 1961-1974 # R CODE HERE RUNS A BASIC MODEL AND VARIOUS DIAGNOSTICS, EMAIL QUESTIONS # TO jgill@wustl.edu #FTP - Full-time police per 100,000 population #UEMP - % unemployed in the population #MAN - number of manufacturing workers in thousands #NMAN - Number of non-manufacturing workers in thousands #GOV - Number of government workers in thousands #LIC - Number of handgun licences per 100,000 population (you can buy one) #GR - Number of handgun registrations per 100,000 population (you own one) #CLEAR - % homicides cleared by arrests #WM - Number of white males in the population #HE - Average hourly earnings #WE - Average weekly earnings #HOM - Number of homicides per 100,000 of population #ACC - Death rate in accidents per 100,000 population #ASR - Number of assaults per 100,000 population # LOAD DATA FILE, CREATE DATA FRAME count.fields("public_html/data/detroit.data") count.fields("http://jgil.wustl.edu/data/detroit.data") detroit.df <- read.table("public_html/data/detroit.data",header=TRUE) detroit.df <- read.table("http://jgill.wustl.edu/data/detroit.data",header=TRUE) # ATTACH DATA FRAME AND CREATE A SUB-MATRIX attach(detroit.df) detroit.mat <- cbind(FTP,UEMP,LIC,CLEAR,HE,HOM) # LOOK AT DATA WITH SCATTERPLOTS # postscript("S.Dir/detroit.fig1.ps") par(oma=c(1,1,4,1)) pairs(detroit.mat) # dev.off() # RUN A MODEL, SUMMARIZE THE RESULTS detroit.ols <- lm(HOM~FTP+UEMP+LIC+CLEAR+HE) summary(detroit.ols) detroit.table <- cbind(summary(detroit.ols)$coef[,1:2], (summary(detroit.ols)$coef[,1] - 1.96*summary(detroit.ols)$coef[,2]), (summary(detroit.ols)$coef[,1] + 1.96*summary(detroit.ols)$coef[,2])) dimnames(detroit.table)[[2]] <- c("Estimate","Std. Error","95% CI Lower","95% CI Upper") detroit.table # QQNORM DIAGNOSTIC ON RESIDUALS # postscript("S.Dir/detroit.fig2.ps") par(mfrow=c(1,1),oma=c(4,4,4,4)) qqnorm(detroit.ols$residuals) qqline(detroit.ols$residuals) # dev.off() # MISC DIAGNOSTIC VALUES detroit.ols$residuals; detroit.ols$residual; residuals(detroit.ols) rstudent(detroit.ols) dfbetas(detroit.ols) dffits(detroit.ols) covratio(detroit.ols) cooks.distance(detroit.ols) X <- detroit.mat[,-6] diag(X%*%solve(t(X)%*%X)%*%t(X)) # COOK'S D DIAGNOSTIC # postscript("S.Dir/detroit.fig3.ps") cooks.vals <- cooks.distance(detroit.ols) R.vals <- detroit.ols$residual par(mfrow=c(2,1),mar=c(0,5,0,1),oma=c(5,5,5,5)) plot(seq(1,13,length=13),rep(max(cooks.vals),13),type="n",xaxt="n", ylab="Studentized Residuals by Year",xlab="",ylim=c(-3.7,3.7)) abline(h=0) for (i in 1:length(R.vals)) segments(i,0,i,R.vals[i]) plot(seq(1,13,length=13),rep(max(cooks.vals),13),type="n",xaxt="n", ylab="Cook's Distance by Year", ylim=c(0,6), xlab="") abline(h=0) for (i in 1:length(cooks.vals)) segments(i,0,i,cooks.vals[i]) mtext(side=3,cex=1.3,"Leverage and Influence: Detroit Murder Data",outer=T,line=2) # dev.off() # JACKKNIFE OUT THE 8TH CASE AND RERUN MODEL detroit2.df <- detroit.df[-8,] detroit2.ols <- lm(HOM~FTP+UEMP+LIC+CLEAR+HE,data=detroit.df2) detroit2.table <- cbind(summary(detroit2.ols)$coef[,1:2], (summary(detroit2.ols)$coef[,1] - 1.96*summary(detroit2.ols)$coef[,2]), (summary(detroit2.ols)$coef[,1] + 1.96*summary(detroit2.ols)$coef[,2])) dimnames(detroit2.table)[[2]] <- c("Estimate","Std. Error","95% CI Lower","95% CI Upper") detroit2.table