# FARAWAY CH.4 CODE library(faraway) # SETUP DATA FOR WAFER EXAMPLE USING gl (GENERATE LEVELS) FUNCTION y <- c(320,14,80,36) particle <- gl(2,1,4,labels=c("no","yes")) quality <- gl(2,2,labels=c("good","bad")) wafer <- data.frame(y,particle,quality) wafer # TABULATE, PRESERVING STRUCTURE WITH y AND ov (ov <- xtabs(y ~ quality+particle)) ################################################################################### # RUN A POISSON REGRESSION, NULL MODEL ASSUMES THAT ALL 4 COUNTS ARE PRODUCED # FROM THE SAME INTENSITY PARAMETER, NOTE THE Null Deviance, ADDITIVE MODEL # ADDITIVE MODEL REQUIRES MARGINALS WITH TREATMENT (DUMMY) CONTRAST # MODEL SPECIFICATION: log(mu) = gamma + alpha_i + beta_j WHERE gamma IS THE # INTERCEPT, alpha_i IS THE particle EFFECT (i=1,2), and beta_j IS THE quality # EFFECT (j=1,2), EFFECTS ARE INDEPENDENT SINCE NO INTERACTION SPECIFIED, # SPECIFYING AN INTERACTION PRODUCES THE SATURATED MODEL (NO DF) modl <- glm(y ~ particle+quality, poisson) summary(modl) X.prime <- t(model.matrix(modl)) modl$fitted.values # DO A SINGLE DROP LRT COMPARISON WITH THE FULL MODEL drop1(modl,test="Chi") # Errata: p71 - second para should read: "By examining the coefficients, we see that # wafers without particles occur at a significantly higher rate than wafers with # particles. Similarly, we see that good-quality wafers occur at a significantly # higher rate than bad-quality wafers." # ASSERTED RELATIONSHIP: X'y = X'mu.hat, WHERE: (X.prime %*% y)[,] # WHICH SHOWS THAT THE COEFFICIENTS ARE TAKEN FROM THE MARGINALS DUE TO THE # TREATMENT (DUMMY) CONTRAST ################################################################################### ################################################################################### # NOW ASSUME A MULTINOMIAL SETUP AND TEST FOR ROW/COLUMN INDEPENDENCE # p.hat(j=1) AND p.hat(j=2), SUMS OVER ROWS (pp <- prop.table( xtabs(y ~ particle))) # SAME AS: (320+80)/450, (14+36)/450 # p.hat(i=1) AND p.hat(i=2), SUMS OVER COLUMNS (qp <- prop.table( xtabs(y ~ quality))) # SAME AS: (320+14)/450, (80+36)/450 # FITTED VALUES FROM mu_ij = np.hat_ip.hat_j (fv <- outer(qp,pp)*450) # COMPARE WITH SATURATED MODEL THAT GIVES mu_ij = y_ij # DEVIANCE TEST: 2*sum(ov*log(ov/fv)) # PEARSON's X^2 TEST: sum( (ov-fv)^2/fv) # PEARSON'S X^2 TEST WITH YATES CORRECTION (ADDS 0.5 TOWARDS ZERO FOR Y-mu) prop.test(ov) ################################################################################### ################################################################################### # NOW TREAT THE TABLE AS 2 BINOMIALS DOWN THE COLUMNS, EQUIVALENT TO A TEST OF # INDEPENDENCE (m <- matrix(y,nrow=2)) modb <- glm(m ~ 1, family=binomial) modb$fitted.values deviance(modb) ################################################################################### ################################################################################### # USE HYPERGEOMETRIC ASSUMPTION, WHICH IS FISHER'S EXACT TEXT fisher.test(ov) (320*36)/(14*80) ################################################################################### ################################################################################### # LARGER TABLES haireye (ct <- xtabs(y ~ hair + eye, haireye)) summary(ct) dotchart(ct) mosaicplot(ct,color=TRUE,main="") modc <- glm(y ~ hair+eye,family=poisson,haireye) summary(modc) ################################################################################### # CORRESPONDENCE ANALYSIS, WHICH WE'LL WORRY ABOUT ONLY IF WE HAVE TIME z <- xtabs(residuals(modc,type="pearson")~hair+eye,haireye) svdz <- svd(z,2,2) leftsv <- svdz$u %*% diag(sqrt(svdz$d[1:2])) rightsv <- svdz$v %*% diag(sqrt(svdz$d[1:2])) ll <- 1.1*max(abs(rightsv),abs(leftsv)) plot(rbind(leftsv,rightsv),asp=1,xlim=c(-ll,ll),ylim=c(-ll,ll),xlab="SV1",ylab="SV2",type="n") abline(h=0,v=0) text(leftsv,dimnames(z)[[1]]) text(rightsv,dimnames(z)[[2]]) ################################################################################### # OTHER STUFF WE'LL WORRY ABOUT LATER data(eyegrade) (ct <- xtabs(y ~ right+left, eyegrade)) summary(ct) (symfac <- factor(apply(eyegrade[,2:3],1,function(x) paste(sort(x),collapse="-")))) mods <- glm(y ~ symfac, eyegrade, family=poisson) c(deviance(mods),df.residual(mods)) pchisq(deviance(mods),df.residual(mods),lower=F) round(xtabs(residuals(mods) ~ right+left, eyegrade),3) margin.table(ct,1) margin.table(ct,2) modq <- glm(y ~ right+left+symfac, eyegrade, family=poisson) pchisq(deviance(modq),df.residual(modq),lower=F) anova(mods,modq,test="Chi") modqi <- glm(y ~ right+left, eyegrade, family=poisson, subset=-c(1,6,11,16)) pchisq(deviance(modqi),df.residual(modqi),lower=F) data(femsmoke) femsmoke (ct <- xtabs(y ~ smoker+dead,femsmoke)) prop.table(ct,1) summary(ct) (cta <- xtabs(y ~ smoker+dead,femsmoke, subset=(age=="55-64"))) prop.table(cta,1) prop.table(xtabs(y ~ smoker+age, femsmoke),2) fisher.test(cta) ct3 <- xtabs(y ~ smoker+dead+age,femsmoke) apply(ct3, 3, function(x) (x[1,1]*x[2,2])/(x[1,2]*x[2,1])) mantelhaen.test(ct3,exact=TRUE) summary(ct3) modi <- glm(y ~ smoker + dead + age, femsmoke, family=poisson) c(deviance(modi),df.residual(modi)) (coefsmoke <- exp(c(0,coef(modi)[2]))) coefsmoke/sum(coefsmoke) prop.table(xtabs(y ~ smoker, femsmoke)) modj <- glm(y ~ smoker*dead + age, femsmoke, family=poisson) c(deviance(modj),df.residual(modj)) modc <- glm(y ~ smoker*age + age*dead, femsmoke, family=poisson) c(deviance(modc),df.residual(modc)) modu <- glm(y ~ (smoker+age+dead)^2, femsmoke, family=poisson) ctf <- xtabs(fitted(modu) ~ smoker+dead+age,femsmoke) apply(ctf, 3, function(x) (x[1,1]*x[2,2])/(x[1,2]*x[2,1])) exp(coef(modu)['smokerno:deadno']) modsat <- glm(y ~ smoker*age*dead, femsmoke, family=poisson) drop1(modsat,test="Chi") drop1(modu,test="Chi") ybin <- matrix(femsmoke$y,ncol=2) modbin <- glm(ybin ~ smoker*age, femsmoke[1:14,], family=binomial) drop1(modbin,test="Chi") modbinr <- glm(ybin ~ smoker+age, femsmoke[1:14,], family=binomial) drop1(modbinr,test="Chi") deviance(modu) deviance(modbinr) exp(-coef(modbinr)[2]) modbinull <- glm(ybin ~ 1, femsmoke[1:14,], family=binomial) deviance(modbinull) modj <- glm(y ~ smoker*age + dead, femsmoke, family=poisson) deviance(modj) ################################################################################### # ORDINAL VARIABLE ANALYSIS # LOOK AT JUST PARTY AFFILIATION AND LEVEL OF EDUCATION data(nes96) xtabs( ~ PID + educ, nes96) # CONVERT TO A DATAFRAME (partyed <- as.data.frame.table(xtabs( ~ PID + educ, nes96))) # A NOMINAL-BY-NOMINAL MODEL SHOWS NO EVIDENCE AGAINST INDEPENDENCE nomod <- glm(Freq ~ PID + educ, partyed, family= poisson) pchisq(deviance(nomod),df.residual(nomod),lower=F) # ASSIGN EVENLY SPACED SCORES partyed$oPID <- unclass(partyed$PID) partyed$oeduc <- unclass(partyed$educ) # RUN LINEAR-BY-LINEAR ASSOCIATION MODEL ormod <- glm(Freq ~ PID + educ + I(oPID*oeduc), partyed, family= poisson) # COMPARE NOMINAL TO ORDINAL anova(nomod,ormod,test="Chi") # LOOK AT GAMMA COEFFICIENT SPECIFICALLY summary(ormod)$coef['I(oPID * oeduc)',] # SPREAD OUT PARTY ID SCORES TO SEE IF THERE IS A MORE DRAMATIC EFFECT apid <- c(1,2,5,6,7,10,11) # COMPRESS EDUCATION SCORES aedu <- c(1,1,1,2,2,3,3) # RUN NEW MODEL ormoda <- glm(Freq ~ PID + educ + I(apid[oPID]*aedu[oeduc]), partyed, family= poisson) anova(nomod,ormoda,test="Chi") # THE LOG-ODDS RATIO FOR ADJACENT ENTRIES IN ROWS/COLUMNS IS: # log( (u_{ij} * u_{i+1} * u_{j+1})/( u_{i,j+1} * u_{i+1,j}) ) = gamma(u_{i+1} - u_i)(v_{j+1} - v_j) # which will be gamma for evenly spaced outcomes # look at bottom 2*2: round(xtabs(predict(ormod,type="response") ~ PID + educ, partyed),2) log(39.28*28.85/(47.49*23.19)) # GIVES ESSENTIALLY GAMMA.HAT # LOOK AT UNSCALED RESIDUALS, WHICH SHOWS SOME NON-MONOTONIC EFFECTS round(xtabs(residuals(ormod,type="response") ~ PID + educ, partyed),2) # FIT AN ORDINAL-BY-NOMINAL MODEL, ALSO CALLED A COLUMN-EFFECTS MODEL: # logE_{ij} = log mu_{ij} = log n p_{ij} = log n + alpha_i + beta_j + mu_i*gamma_j cmod <- glm(Freq ~ PID + educ + educ:oPID, partyed, family= poisson) # COMPARE COLUMN-EFFECTS MODEL TO THE NOMINAL MODEL (C-E BETTER): anova(nomod,cmod,test="Chi") # LOOK AT THE INTERACTION EFFECTS FOR THE COLUMN-EFFECTS MODEL: summary(cmod)$coef[14:19,] # COMPARE COLUMN-EFFECTS MODEL TO THE ORDINAL MODEL (O BETTER): anova(ormod,cmod,test="Chi") # ORDINAL BEST SO FAR, BUT NO SIGNIFICANCE FOR HIGH SCHOOL AND ABOVE, SO COLLAPSE: aedu <- c(1,1,2,2,2,2,2) # AND RE-RUN ORDINAL MODEL: ormodb <- glm(Freq ~ PID + educ + I(oPID*aedu[oeduc]), partyed, family= poisson) # NOTE THAT DEVIANCE IS EVEN BETTER deviance(ormodb) deviance(ormod)