# MULTIPLE IMPUTATION IN R 
# Data From Greene, "Econometrics".  National figures scaled by CPI already, 1968-1982.
# Where do you get mice?  http://www.multiple-imputation.com/
#
# Some theory:
# 	Goal: f(Y|X,beta) unbiased
#	Z_{mis} = (X_{mis},Y_{mis}), Z_{obs} = (X_{obs},Y_{obs}) 
#	R is a (n by k+1) matrix of 0=not missing, 1=missing
# 	phi is a parameter that "causes" missingness
# 	MCAR: P(R|Z_{obs},Z_{mis}) = P(R|phi), missingness not related to any observed or unobserved data
#	MAR: P(R|Z_{obs},Z_{mis}) = P(R|Z_{obs},phi), missingness depends only on observed data
# 	Non-Ignorable: P(R|Z_{obs},Z_{mis}) = P(R|Z_{obs},Z_{mis},phi), missingness depends on unobserved data
#
# How does the MI process work?
# 	3 steps: - impute missing the data m times to get m replicate datasets, 
#		 - analyze/regress each dataset separately,
#		 - combine results with summar process. 
#	Imputation step assumes a conditional distribution for the missing data conditioning on observed values.
#	Oddly enough m = 5 to 10 is sufficient.
#	Combining process uses means for coefficients and an intuitive ANOVA approach for standard errors.


invest.df <- read.table("http://jgill.wustl.edu/data/investment.dat",header=TRUE)

   real.investment trend real.gnp real.interest      inflation
1            0.161     1    1.058          5.16           4.40
2            0.172     2    1.088          5.87           5.15
3            0.158     3    1.086          5.95           5.37
4            0.173     4    1.122          4.88           4.99
5            0.195     5    1.186          4.50           4.16
6            0.217     6    1.254          6.44           5.75
7            0.199     7    1.246          7.83           8.82
8            0.163     8    1.232          6.25           9.31
9            0.195     9    1.298          5.50           5.21
10           0.231    10    1.370          5.46           5.83
11           0.257    11    1.439          7.46           7.40
12           0.259    12    1.479         10.28           8.64
13           0.225    13    1.474         11.77           9.31
14           0.241    14    1.503         13.42           9.44
15           0.204    15    1.475         11.02           5.99

##### FIRST DO A STANDARD LINEAR MODEL ON THE FULL DATA #####

attach(invest.df)
invest.lm <- lm(real.investment ~ trend + real.gnp + real.interest + inflation)
summary.lm(invest.lm)
detach(invest.df)

Residuals:
       Min         1Q     Median         3Q        Max 
-0.0102779 -0.0022946  0.0004119  0.0029377  0.0080418 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)
(Intercept)   -5.091e-01  5.513e-02  -9.234 3.28e-06
trend         -1.658e-02  1.972e-03  -8.409 7.59e-06
real.gnp       6.704e-01  5.500e-02  12.189 2.52e-07
real.interest -2.326e-03  1.219e-03  -1.908   0.0854
inflation     -9.401e-05  1.347e-03  -0.070   0.9458

Residual standard error: 0.006714 on 10 degrees of freedom
Multiple R-Squared: 0.9724,     Adjusted R-squared: 0.9614 
F-statistic: 88.19 on 4 and 10 DF,  p-value: 9.333e-08 

##### NOW CREATE THE MISSING DATASET FROM THE FULL DATASET AND RERUN LM #####

invest.missing.df <- invest.df
invest.missing.df[1,1] <- invest.missing.df[8,5]  <- invest.missing.df [9,3] <- 
			  invest.missing.df[13,4] <- NA

sum(is.na(invest.missing.df))/prod(dim(invest.missing.df))        
[1] 0.05333333

invest.missing.lm <- lm(real.investment ~ trend + real.gnp + real.interest + inflation,
	data=invest.missing.df)
summary.lm(invest.missing.lm)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.002759 -0.001341 -0.001181  0.001311  0.003809 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)
(Intercept)   -0.4459603  0.0324644 -13.737 9.25e-06
trend         -0.0144363  0.0012779 -11.297 2.88e-05
real.gnp       0.6032710  0.0332504  18.143 1.80e-06
real.interest -0.0031976  0.0007746  -4.128  0.00616
inflation      0.0023307  0.0009748   2.391  0.05395

Residual standard error: 0.002866 on 6 degrees of freedom
Multiple R-Squared: 0.9959,     Adjusted R-squared: 0.9931 
F-statistic: 362.5 on 4 and 6 DF,  p-value: 2.79e-07 


# FROM THE lm HELP FILE:
# na.action: a function which indicates what should happen when the data
#         contain `NA's.  The default is set by the `na.action' setting
#         of `options', and is `na.fail' if that is unset. The
#         ``factory-fresh'' default is `na.omit'.
#
# STRONGLY ADVISED: options(na.action="na.fail")


##### NOW USE MULTIPLE IMPUTATION FROM THE MICE PROGRAM #####
##### DOWNLOAD mice FROM: http://www.multiple-imputation.com/ ###### 

library(nnet)
source("S.Dir/mice.R")

# HERE IS THE MICE COMMAND, m=10 SPECIFIES 10 IMPUTATIONS (DEFAULT=5)
imp.invest <- mice(invest.missing.df,m=10)

# WE CAN NOW USE THE complete FUNCTION TEN TIMES FOLLOWED BY 10 lm CALLS, BUT 
# THE lm.mids FUNCTION DOES THIS IN ONE COMMAND:
invest.mids <- lm.mids(real.investment ~ trend + real.gnp + real.interest 
	+ inflation,data=imp.invest)
pool(invest.mids)

# A FUNCTION TO CONVENIENTLY GET COEFICIENTS AND STANDARD ERRORS FROM A lm.mids 
# OBJECT, param=1 GIVES THE COEFFICIENT ESTIMATES, param=2 GIVES THE STANDARD ERRORS
lm.mids.vals <- function(obj,param) { 
    out.mat <- NULL
    for (i in 1:obj$call1$m) 
	out.mat <- rbind(out.mat,summary.lm(obj$analyses[[i]])$coef[,param])
    out.mat
}

# USE THIS FUNCTION TO GET THREE REQUIRED VECTORS:
impute.coef.vec <- apply(lm.mids.vals(invest.mids,1),2,mean)

between.var <- apply(lm.mids.vals(invest.mids,1),2,var)

within.var <- apply(lm.mids.vals(invest.mids,2)^2,2,mean)


# COMPUTE THE STANDARD ERROR VECTOR
m <- 10
impute.se.vec <- sqrt(within.var + ((m+1)/m)*between.var)

# THE DEGREES OF FREEDOM FOR THE T-STATISTIC NEEDS TO BE ADJUSTED.
# SEE LITTLE AND RUBIN (1987), PAGE 257
impute.df <- (m-1)*(1 + (1/(m+1)) * within.var/between.var)^2

# REGRESSION TABLE:
out.table <- round( cbind( impute.coef.vec,impute.se.vec,impute.coef.vec/impute.se.vec,
	1-pt(abs(impute.coef.vec/impute.se.vec),impute.df) ),6 )
dimnames(out.table) <- list( c("(intercept)","trend","real.gnp","real.interest","inflation"),
	c("Estimate","Std. Error","t value","Pr(>|t|)") ) 
out.table

               Estimate Std. Error   t value Pr(>|t|)
(intercept)   -0.390189   0.062981 -6.195350 0.000001
trend         -0.011984   0.002325 -5.153349 0.000025
real.gnp       0.549227   0.066607  8.245852 0.000000
real.interest -0.005107   0.001576 -3.241617 0.001627
inflation      0.003673   0.002421  1.516784 0.071848


# NOT QUITE THE IMPROVEMENT WE HOPED FOR.  WHY?  TWO REASONS: (1) I  PARTICULARLY 
# CHOSE THE FOUR MISSING VALUES TO "DO THE MOST DAMAGE", AND (2) THIS IS A 
# RELATIVELY SMALL DATASET SO MI DOES NOT HAVE A LOT OF INFORMATION WITH WHICH
# TO BUILD THE POSTERIOR DISTRIBUTION OF THE MISSING DATA.  I'D LIKE TO HAVE A
# BETTER SIMPLE LINEAR MODELS EXAMPLE.  EXTRA CREDIT FOR FINDING AND DEMONSTRATING.


#    SOME TEXT FROM A FORTHCOMING PAPER:

#    \footnote{Missing data values are addressed here with
#    \emph{multiple imputation} (Little and Rubin 1983, 1987; Rubin 1987) using the \texttt{mice}
#    (\texttt{m}ultiple \texttt{i}mputation by \texttt{c}hained \texttt{e}quations) package in the
#    \texttt{R} statistical environment.  The commonly used methods of listwise deletion and mean
#    imputation lead to biased and misleading results.  Essentially, multiple imputation creates a
#    posterior distribution for the missing data conditional on the observed data, and draws randomly
#    from this distribution to create multiple replications (5-10) of the original dataset.  The
#    model analysis is performed on each of these replicates and then averaged (with a standard error
#    adjustment).  See King, \emph{et al.} (2001) for a review of missing data issues in political
#    science.  All data, \texttt{R} code, \texttt{WinBUGS} code, and diagnostics to implement the
#    statistical model described here, as well as our data imputations used in this analysis
#    are available at the website: \phantom{\code{http://jgill.wustl.edu/PA\_Model/}} for
#    replication purposes.}

# Proof that ignoring non-ignorable data is bad 
# \begin{itemize}
#    \item   First, segment the $\X$-matrix into two constituent parts: $\X=[\X_{obs},\X_{mis}]$,
#            and restate the distribution function:
#            \begin{equation}\label{PDF.separation}
#                f(\X|\T) = f(\X_{obs},\X_{mis}|\T) = f(\X_{obs}|\T)f(\X_{mis}|\X_{obs},\T). \nonumber
#            \end{equation}
#    \item   Now segment the log likelihood function into two distinct components:
#            \begin{align}\label{likelihood.separation}
#                \ell(\T|\X) &= \ell(\T|\X_{obs},\X_{mis})       \nonumber \\
#                            &= \ell(\T|\X_{obs}) + \text{log}f(\X_{mis}|\X_{obs},\T)   \nonumber
#            \end{align}
#    \item   Rearrange this form to create a statement with both unknowns collected on the right-hand side:
#            \begin{equation}\label{reordered.likelihood.separation}
#                \ell(\T|\X_{obs}) = \ell(\T|\X_{obs},\X_{mis}) - \ell(\X_{mis}|\X_{obs},\T).\nonumber
#            \end{equation}
# \end{itemize}
# So ignoring is only okay when $\ell(\X_{mis}|\X_{obs},\T)$ is zero.

# References to Note:
# \bibitem        Little, Roderick J. A., and Donald B. Rubin.  1987.
#                 \emph{Statistical Analysis with Missing Data.}
#                 New York: John Wiley \& Sons.
# 
# \bibitem        Rubin, Donald.  1987.
#                 \emph{Multiple Imputation for Nonresponse in Surveys.}
#                 New York: John Wiley \& Sons.




