## Chapter 13 ## Sec 13.1: Principal Component Scores in Regression ## Principal components: data frame socsupport (DAAG) ss.pr1 <- princomp(as.matrix(na.omit(socsupport[, 9:19])), cor=TRUE) pairs(ss.pr1$scores[, 1:3]) sort(-ss.pr1$scores[,1], decr=TRUE)[1:10] # Note the outlier ## Alternative to pairs(), using the lattice function splom() splom(~ss.pr1$scores[, 1:3]) not.na <- complete.cases(socsupport[, 9:19]) not.na[36] <- FALSE ss.pr <- princomp(as.matrix(socsupport[not.na, 9:19]), cor=TRUE) summary(ss.pr) # Examine the contributions of the components ss.lm <- lm(BDI[not.na] ~ ss.pr$scores[, 1:6], data=socsupport) summary(ss.lm)$coef ss.pr$loadings[, 1] ## Footnote code attach(socsupport) plot(BDI[not.na] ~ ss.pr$scores[ ,1], col=as.numeric(gender[not.na]), pch=as.numeric(gender[not.na]), xlab ="1st principal component", ylab="BDI") topleft <- par()$usr[c(1,4)] legend(topleft[1], topleft[2], col=1:2, pch=1:2, legend=levels(gender)) detach(socsupport) ## Sec 13.2: {\kern-4pt}$^*$ Propensity Scores in Regression Comparisons -- Labor Training Data ## The labor training data ## Intended analyses ## Potential sources of bias ## The propensity score methodology -- further comments ## Data exploration nsw74psid1.lm <- lm(re78 ~ trt+ (age + educ + re74 + re75) + (black + hisp + marr + nodeg), data = nsw74psid1) ## Footnote code linecols <- c(rgb(0,0.5,0.5), rgb(0.65,0,0.65)) trellis.par.set(superpose.symbol=list(cex=0.25), superpose.line=list(lwd=2, col=linecols)) vnames <- c("educ","age","re74","re75","re78") dframe <- nsw74psid1[, vnames] dframe[,-(1:2)] <- log(dframe[,-(1:2)] + 47) lab <- c(vnames[1:2], paste("log\n", vnames[-(1:2)], "+", 47)) trt <- factor(nsw74psid1$trt, labels=c("Control","Treatment")) splom(~ dframe, type=c("p","smooth"), groups=trt, varnames=lab, auto.key=list(columns=2)) ## ss 13.2.1: Regression analysis, using all covariates library(splines) A.lm <- lm(formula = log(re78 + 47) ~ trt + (ns(age, 4) + educ + log(re74 + 47) + log(re75 + 47)) + I(re74 == 0) + I(re75 == 0) + (black + hisp + marr + nodeg), data = nsw74psid1) summary(A.lm)$coef ## Footnote code "multilap" <- function(df=nsw74psid1, maxf=20, colnames=c("educ","age","re74","re75","re78")){ if(maxf==Inf) return(rep(TRUE, dim(df)[1])) if(length(maxf)==1)maxf <- c(1/maxf, maxf) trt <- df$trt common <- rep(TRUE, length(trt)) for(vname in colnames){ y0 <- df[trt==0,vname] y1 <- df[trt==1,vname] xchop <- overlapDensity(y0, y1, ratio=maxf, compare.numbers=FALSE, plotit=FALSE) common <- common & df[,vname]>=xchop[1] & df[,vname]<=xchop[2] } invisible(common) } ## Now run the function common <- multilap(maxf=30) xyplot(resid(A.lm) ~ fitted(A.lm), groups=nsw74psid1$trt[common], type=c("p","smooth")) ## ss 13.2.2: The use of propensity scores common <- multilap(maxf=30) # Mild preliminary filtering ## Calculate propensity scores: data frame nsw74psidA (DAAG) disc.glm <- glm(formula = trt ~ age + educ + black + hisp + marr + nodeg + re74 + re75, family = binomial, data = nsw74psid1, subset=common) Pscores <- predict(disc.glm) nsw74psidB <- subset(nsw74psid1, common) trt <- nsw74psidB$trt xchop <- overlapDensity(Pscores[trt==0], Pscores[trt==1], compare.numbers=FALSE, ratio=c(1/30, 30)) overlap <- Pscores > xchop[1] & Pscores < xchop[2] with(subset(nsw74psidB, overlap), table(re78>0, trt)) AS.glm <- glm(I(re78>0) ~ trt + Pscores, data=nsw74psidB, subset = overlap, family=binomial) summary(AS.glm)$coef AS.lm <- lm(log(re78+47) ~ trt + Pscores, data=nsw74psidB, subset = overlap&nsw74psidB$re78>0) summary(AS.lm)$coef ## Further comments ## Sec 13.3: Further Reading