# FORMULA STRUCTURE AND SYNTAX IN R. JEFF GILL.
#
# Teaching script: each section shows a piece of R's model-formula syntax.
# OV / EV1 / EV2 / EV3 / county / state are placeholders for an outcome
# variable and explanatory variables; building a formula with `~` does not
# evaluate them, so those lines run without the variables existing.
# The lm()/glm() examples need the downloaded data sets (network access).

# GENERAL STRUCTURE ----
OV ~ EV1 + EV2
# - the tilde is a function that saves the formula as an unevaluated
#   expression: a formula object.
# - note that no actual "adding" is being done here in the arithmetic sense.
# - a constant is automatically implied, same as:
OV ~ 1 + EV1 + EV2
#   but we can explicitly exclude the constant by:
OV ~ -1 + EV1 + EV2
OV ~ 0 + EV1 + EV2

# LINEAR IMPLEMENTATION ----
osha.df <- read.table("http://jgill.wustl.edu/data/osha2.dat", header = TRUE)
osha.ols <- lm(INSPT ~ AP + DI, data = osha.df)
# I() protects its argument from formula interpretation; I(AP) is just AP.
osha.ols <- lm(INSPT ~ I(AP) + DI, data = osha.df)
summary(osha.ols)
# Regress column 3 on every other column, passed as a single matrix term.
kitchen.sink <- lm(as.vector(osha.df[, 3]) ~ as.matrix(osha.df[, -3]))

# INLINE MATH FORMULAS ----
log(OV) ~ EV1 + EV2
OV ~ exp(EV1) + cos(EV2)
# Arithmetic that reuses a formula operator (here "/") must be wrapped in I().
OV ~ I(EV1 / 2) + sqrt(EV2 - mean(EV2))
osha.ols <- lm(INSPT ~ AP + (DI > 40), data = osha.df)
osha.ols <- lm(INSPT ~ AP + cut(DI, 3), data = osha.df)

# INTERACTIONS ----
# EV1 * EV2 is shorthand for EV1 + EV2 + EV1:EV2.
OV ~ EV1 + EV2 + EV1:EV2
OV ~ EV1 * EV2
osha.ols <- lm(INSPT ~ AP * DI, data = osha.df)
osha.ols <- lm(INSPT ~ AP * DI * SIC1, data = osha.df)

# NESTING ----
# state/county is shorthand for state + county %in% state.
OV ~ EV1 + county %in% state
OV ~ EV1 + state/county
osha.ols <- lm(INSPT ~ D1 + D2 %in% D1, data = osha.df)
osha.ols <- lm(INSPT ~ D1 + D1:D2, data = osha.df)
osha.ols <- lm(INSPT ~ D1/D2, data = osha.df)

# SPECIAL CHARACTERS (+ - : * / . ^) ----
# - the "." is a default (all columns of `data` not already in the formula):
osha.ols <- lm(INSPT ~ ., data = osha.df)
# - the "^" gives all factors crossed to the order specified:
osha.ols <- lm(INSPT ~ (as.factor(D2) + as.factor(IP))^2, data = osha.df)

# UPDATE FUNCTION ----
# In update(), "." on either side of "~" means "what was there before".
osha.ols <- lm(INSPT ~ AP + DI + SIC1, data = osha.df)
osha.ols <- update(osha.ols, . ~ . - AP)
osha.ols <- lm(INSPT ~ AP + DI + SIC1, data = osha.df)
osha.ols <- update(osha.ols, . ~ . + R0)
osha.ols$call

# PARENTHESES ----
OV ~ EV1/(EV2:EV3)
OV ~ EV1/EV2:EV3
osha.ols <- lm(INSPT ~ SIC1 * (D2 + D1 %in% D2), data = osha.df)
osha.ols <- lm(INSPT ~ SIC1 * D2 + D1 %in% D2, data = osha.df)

# FACTORS ----
osha.df$D2 <- factor(osha.df$D2)
# Relabelling assumes D2 has exactly four levels.
levels(osha.df$D2) <- c("Winter", "Spring", "Summer", "Fall")
osha.ols <- lm(INSPT ~ D2 + SIC1, data = osha.df)
osha.df$D2 <- factor(as.ordered(osha.df$D2))
# Treatment contrasts with the 4th level as the baseline category.
contrasts(osha.df$D2) <- contr.treatment(base = 4, n = 4)
osha.ols <- lm(INSPT ~ D2 + SIC1, data = osha.df)
osha.ols <- lm(INSPT ~ D2/D1, data = osha.df)

# CONTRASTS ----
options()$contrasts
# Session-wide change: first entry is used for unordered factors, second for
# ordered factors. It stays in effect for everything fitted afterwards.
options(contrasts = c("contr.treatment", "contr.helmert"))
# - HELMERT: column j contrasts level j+1 with the mean of the levels below
#   it: -1 for each lower level, j for level j+1, 0 for higher levels.
# - POLYNOMIAL: linear, quadratic, cubic,... terms in hypothetical underlying
#   numeric variable that takes on equally spaced values for the levels of
#   the factor.
Nlevs <- c("men", "women")
N <- factor(Nlevs)
contr.sum(N)
contr.treatment(N)
contr.helmert(N)
contr.poly(N)
Nlevs <- c(1, 4, 8)
N <- factor(Nlevs)
contr.sum(N)
contr.treatment(N)
contr.poly(N)
contr.helmert(N)
contr.helmert(4)
contr.helmert(5)

# - LOOK AT CONSEQUENCES FOR A SIMPLE LINEAR MODEL
# Simulated data (no seed is set, so estimates differ from run to run).
# X2 takes values 0, 1, 2, i.e. three levels to match the 3-level contrasts.
Y <- rnorm(100)
X1 <- rgamma(100, 3, 2)
X2 <- factor(rbinom(100, 2, 0.6))
contrasts(X2) <- contr.sum(3)
summary(lm(Y ~ X1 + X2))
contrasts(X2) <- contr.treatment(3)
summary(lm(Y ~ X1 + X2))
contrasts(X2) <- contr.poly(3)
summary(lm(Y ~ X1 + X2))
contrasts(X2) <- contr.helmert(3)
summary(lm(Y ~ X1 + X2))

# FIXING A TERM FOR MODEL COMPARISON (GLM ONLY) ----
# NOTE(review): current R's lm() also accepts offset() terms (see ?lm), so
# "GLM ONLY" may be a holdover from older software - confirm before relying
# on it.
# Host corrected from "jgill.wust.edu" to "jgill.wustl.edu" so it matches the
# OSHA data URL used earlier in this script.
teach.df <- read.table(
  "http://jgill.wustl.edu/data/spector.mazzeo.data",
  col.names = c("GPA", "TUCE", "PSI", "GRADE")
)
teach.logit.fit <- glm(
  GRADE ~ GPA + TUCE + PSI,
  family = binomial(link = logit),
  data = teach.df
)
summary(teach.logit.fit)
# offset(GPA) puts GPA into the linear predictor with its coefficient fixed
# at 1, so no GPA coefficient is estimated.
teach.logit.fit <- glm(
  GRADE ~ offset(GPA) + TUCE + PSI,
  family = binomial(link = logit),
  data = teach.df
)
summary(teach.logit.fit)