# SIMPLE REGRESSION EXAMPLE USING THE DETROIT MURDER DATASET, YEARS 1961-1974
# R CODE HERE RUNS A BASIC MODEL AND VARIOUS DIAGNOSTICS, EMAIL QUESTIONS
# TO jgill@wustl.edu

#FTP    - Full-time police per 100,000 population
#UEMP   - % unemployed in the population

#MAN    - number of manufacturing workers in thousands
#NMAN   - Number of non-manufacturing workers in thousands
#GOV    - Number of government workers in thousands

#LIC    - Number of handgun licences per 100,000 population (you can buy one)
#GR     - Number of handgun registrations per 100,000 population (you own one)

#CLEAR  - % homicides cleared by arrests
#WM     - Number of white males in the population

#HE     - Average hourly earnings
#WE     - Average weekly earnings

#HOM    - Number of homicides per 100,000 of population
#ACC    - Death rate in accidents per 100,000 population
#ASR    - Number of assaults per 100,000 population

# LOAD DATA FILE, CREATE DATA FRAME
count.fields("public_html/data/detroit.data")
count.fields("http://jgil.wustl.edu/data/detroit.data")

detroit.df <- read.table("public_html/data/detroit.data",header=TRUE)
detroit.df <- read.table("http://jgill.wustl.edu/data/detroit.data",header=TRUE)

# ATTACH DATA FRAME AND CREATE A SUB-MATRIX
attach(detroit.df)
detroit.mat <- cbind(FTP,UEMP,LIC,CLEAR,HE,HOM)

# LOOK AT DATA WITH SCATTERPLOTS
# postscript("S.Dir/detroit.fig1.ps")
par(oma=c(1,1,4,1))
pairs(detroit.mat)
# dev.off()

# RUN A MODEL, SUMMARIZE THE RESULTS
detroit.ols <- lm(HOM~FTP+UEMP+LIC+CLEAR+HE)
summary(detroit.ols)

detroit.table <- cbind(summary(detroit.ols)$coef[,1:2],
	(summary(detroit.ols)$coef[,1] - 1.96*summary(detroit.ols)$coef[,2]),
	(summary(detroit.ols)$coef[,1] + 1.96*summary(detroit.ols)$coef[,2]))
dimnames(detroit.table)[[2]] <- c("Estimate","Std. Error","95% CI Lower","95% CI Upper")
detroit.table

# QQNORM DIAGNOSTIC ON RESIDUALS
# postscript("S.Dir/detroit.fig2.ps")
par(mfrow=c(1,1),oma=c(4,4,4,4))
qqnorm(detroit.ols$residuals)
qqline(detroit.ols$residuals)
# dev.off()

# MISC DIAGNOSTIC VALUES
detroit.ols$residuals; detroit.ols$residual; residuals(detroit.ols)
rstudent(detroit.ols)
dfbetas(detroit.ols)
dffits(detroit.ols)
covratio(detroit.ols)
cooks.distance(detroit.ols)
X <- detroit.mat[,-6]
diag(X%*%solve(t(X)%*%X)%*%t(X))

# COOK'S D DIAGNOSTIC
# postscript("S.Dir/detroit.fig3.ps")
cooks.vals <- cooks.distance(detroit.ols)
R.vals <- detroit.ols$residual
par(mfrow=c(2,1),mar=c(0,5,0,1),oma=c(5,5,5,5))
plot(seq(1,13,length=13),rep(max(cooks.vals),13),type="n",xaxt="n",
        ylab="Studentized Residuals by Year",xlab="",ylim=c(-3.7,3.7))
abline(h=0)
for (i in 1:length(R.vals))  segments(i,0,i,R.vals[i])
plot(seq(1,13,length=13),rep(max(cooks.vals),13),type="n",xaxt="n",
        ylab="Cook's Distance by Year",
        ylim=c(0,6), xlab="")
abline(h=0)
for (i in 1:length(cooks.vals))
        segments(i,0,i,cooks.vals[i])
mtext(side=3,cex=1.3,"Leverage and Influence: Detroit Murder Data",outer=T,line=2)
# dev.off()

# JACKKNIFE OUT THE 8TH CASE AND RERUN MODEL
detroit2.df <- detroit.df[-8,]
detroit2.ols <- lm(HOM~FTP+UEMP+LIC+CLEAR+HE,data=detroit.df2)
detroit2.table <- cbind(summary(detroit2.ols)$coef[,1:2],
	(summary(detroit2.ols)$coef[,1] - 1.96*summary(detroit2.ols)$coef[,2]),
	(summary(detroit2.ols)$coef[,1] + 1.96*summary(detroit2.ols)$coef[,2]))
dimnames(detroit2.table)[[2]] <- c("Estimate","Std. Error","95% CI Lower","95% CI Upper")
detroit2.table


