# SUMMARY OF CONTRASTS IN R. # Idea: there is more than one way to choose k-1 contrasts for k categories # TREATMENT: standard dummy coding, note that this is technically not a contrast since # the columns do not sum to 0 and are therefore not orthogonal to a vector # of all 1's. # SUM: reference category sums to others # HELMERT: the difference between negative higher levels where abs(row)=#cats # POLYNOMIAL: linear, quadratic, cubic,... terms in hypothetical underlying # numeric variable that takes on equally spaced values for the # levels of the factor. # EXAMPLE 1: 2 LEVELS (1 AND 4) > N <- factor(Nlevs <- c(1,4)) > N [1] 1 4 Levels: 1 4 > contr.sum(N) [,1] 1 1 2 -1 > contr.treatment(N) 2 1 0 2 1 > contr.helmert(N) [,1] 1 -1 2 1 > contr.poly(N) .L [1,] -0.7071068 [2,] 0.7071068 # EXAMPLE 2: 3 LEVELS (1, 4 AND 8) > N <- factor(Nlevs <- c(1,4,8)) > contr.sum(N) [,1] [,2] 1 1 0 2 0 1 8 -1 -1 > contr.treatment(N) 2 3 1 0 0 2 1 0 8 0 1 > contr.treatment(N,base=3) 1 4 1 1 0 4 0 1 8 0 0 > contr.helmert(N) [,1] [,2] 1 -1 -1 2 1 -1 8 0 2 > contr.poly(N) .L .Q [1,] -7.071068e-01 0.4082483 [2,] -7.850462e-17 -0.8164966 [8,] 7.071068e-01 0.4082483 > contr.helmert(4) [,1] [,2] [,3] 1 -1 -1 -1 2 1 -1 -1 3 0 2 -1 4 0 0 3 > contr.helmert(5) [,1] [,2] [,3] [,4] 1 -1 -1 -1 -1 2 1 -1 -1 -1 3 0 2 -1 -1 4 0 0 3 -1 5 0 0 0 4 # LOOK AT CONSEQUENCES FOR A SIMPLE LINEAR MODEL Y <- rnorm(100); X1 <- rgamma(100,3,2); X2 <- factor(rbinom(100,2,.6)) contrasts(X2) <- contr.sum(3) levels(X2) <- c("low", "medium", "high") summary(lm(Y~X1+X2)) contrasts(X2) <- contr.sum(3) summary(lm(Y~X1+X2)) contrasts(X2) <- contr.treatment(3) summary(lm(Y~X1+X2)) contrasts(X2) <- contr.poly(3) summary(lm(Y~X1+X2)) contrasts(X2) <- contr.helmert(3) summary(lm(Y~X1+X2)) # A GLM Setting: fdr <- read.table("http://jgill.wustl.edu/data/fdr.dat",header=TRUE) # FDR indicates whether or not Roosevelt carried that state in the 1932 presidential elections, # PRE.DEP is the mean per-state income before the onset of the Great Depression (1929) in dollars, # POST.DEP is the mean per-state income after the onset of the great depression (1932) in dollars, # REGION: 0=northeast, 1=south, 2=west, 3=midwest # FARM is the total farm wage and salary disbursements per state in 1932 in thousands of dollars, # POP is the population size from the 1930 U.S. Census. fdr$REGION <- factor(fdr$REGION) levels(fdr$REGION) <- c("northeast", "south", "west", "midwest") contrasts(fdr$REGION) <- contr.sum(4) fdr.out <- glm(FDR ~ I(POST.DEP - PRE.DEP) + REGION + FARM + POP, data=fdr,family = binomial(link = logit)) summary(fdr.out) contrasts(fdr$REGION) <- contr.treatment(4) fdr.out2 <- glm(FDR ~ I(POST.DEP - PRE.DEP) + REGION + FARM + POP, data=fdr,family = binomial(link = logit)) summary(fdr.out2) # p(FDR) for case: Ohio 1 771 661 400 18301 6646697 midwest # PREDICTION FOR SUM CONTRAST logit( c(1,fdr[34,3]-fdr[34,2],-1,-1,-1,fdr[34,5],fdr[34,6]) %*% fdr.out$coef ) # PREDICTION FOR TREATMENT CONTRAST logit( c(1,fdr[34,3]-fdr[34,2],0,0,1,fdr[34,5],fdr[34,6]) %*% fdr.out2$coef ) # Some non-FDR state: Connecticut 0 1024 921 620 10167 1606903 northeast logit( c(1,fdr[6,3]-fdr[6,2],1,0,0,fdr[6,5],fdr[6,6]) %*% fdr.out$coef ) logit( c(1,fdr[6,3]-fdr[6,2],0,0,0,fdr[6,5],fdr[6,6]) %*% fdr.out2$coef )