## Chapter 5 ## Sec 5.1: Fitting a Line to Data ## ss 5.1.1: Lawn roller example ## Footnote code ## Global option settings used for this and most later output options(show.signif.stars=FALSE, digits=4) # show.signif.stars=FALSE suppresses interpretive features that, # for our use of the output, are unhelpful. ## Fit lm model: data from roller (DAAG); output in roller.lm roller.lm <- lm(depression ~ weight, data = roller) ## Use the extractor function summary() to summarize results summary(roller.lm) ## Footnote code ## Fit model that omits intercept term; i.e., y = bx lm(depression ~ -1 + weight, data=roller) ## ss 5.1.2: Calculating fitted values and residuals data.frame(roller, fitted.value=predict(roller.lm), residual=resid(roller.lm)) ## ss 5.1.3: Residual plots plot(roller.lm, which = 1) plot(roller.lm, which = 2) ## Footnote code ## Alternatively, use the following code: test <- residuals(roller.lm); n <- length(test) av <- mean(test); sdev <- sd(test) y <- c(test, rnorm(7*n, av, sdev)) fac <- c(rep("residuals(roller.lm)",n), paste("reference", rep(1:7, rep(n,7)))) fac <- factor(fac, levels=unique(fac)) library(lattice) qqmath(~ y|fac, aspect=1, layout=c(4,2)) ## Normal probability plot, plus 7 reference plots qreference(residuals(roller.lm), nrep=8, nrows=2) ## ss 5.1.4: Iron slag example: is there a pattern in the residuals? ## Footnote code ## requires data frame ironslag (DAAG) par(mfrow=c(2,2)) attach(ironslag) ## Panel A plot(chemical ~ magnetic) ironslag.lm <- lm(chemical ~ magnetic) abline(ironslag.lm) lines(lowess(chemical ~ magnetic, f=.9), lty=2) ## Footnote code res <- residuals(ironslag.lm) plot(res ~ magnetic, xlab="Residual") lines(lowess(res ~ magnetic, f=.9), lty=2) ## Footnote code yhat <- fitted(ironslag.lm) plot(chemical ~ yhat, xlab="Predicted chemical", ylab="Chemical") lines(lowess(chemical ~ yhat, f=.9), lty=2) ## Footnote code sqrtabs <- sqrt(abs(res)) plot(sqrtabs ~ yhat, xlab = "Predicted chemical", ylab = expression(sqrt(abs(residual))), type = "n") panel.smooth(yhat, sqrtabs, span = 0.95) detach(ironslag) par(mfrow=c(1,1)) ## ss 5.1.5: The analysis of variance table anova(roller.lm) ## Sec 5.2: Outliers, Influence and Robust Regression softbacks.lm <- lm(weight ~ volume, data=softbacks) summary(softbacks.lm) par(mfrow=c(2,2)) # Use par(mfrow=c(1,4)) for layout in figure plot(softbacks.lm, which=1:4) # By default, plots 1:3 and 5 [which=c(1:3,5)] are given par(mfrow=c(1,1)) ## Robust regression ## Sec 5.3: Standard Errors and Confidence Intervals ## ss 5.3.1: Confidence intervals and tests for the slope ## Footnote code SEb <- summary(roller.lm)$coefficients[2, 2] coef(roller.lm)[2] + qt(c(0.025,.975), 8)*SEb ## ss 5.3.2: SEs and confidence intervals for predicted values ## Footnote code ## Code to obtain fitted values and standard errors (SE, then SE.OBS) fit.with.se <- predict(roller.lm, se.fit=TRUE) fit.with.se$se.fit # SE sqrt(fit.with.se$se.fit^2+fit.with.se$residual.scale^2) # SE.OBS ## Footnote code plot(depression ~ weight, data=roller, xlab = "Weight of Roller (tonnes)", ylab = "Depression in Lawn (mm)", pch = 16) roller.lm <- lm(depression ~ weight, data = roller) abline(roller.lm$coef, lty = 1) xy <- data.frame(weight = pretty(roller$weight, 20)) yhat <- predict(roller.lm, newdata = xy, interval="confidence") ci <- data.frame(lower=yhat[, "lwr"], upper=yhat[, "upr"]) lines(xy$weight, ci$lower, lty = 2, lwd=2, col="grey") lines(xy$weight, ci$upper, lty = 2, lwd=2, col="grey") ## ss 5.3.3: {\kern-5pt}*{\kern5pt}Implications for design ## Sec 5.4: Regression versus Qualitative anova Comparisons ## Issues of power ## The pattern of change ## Sec 5.5: Assessing Predictive Accuracy ## ss 5.5.1: Training/test sets, and cross-validation ## ss 5.5.2: Cross-validation -- an example ## Footnote code rand <- sample(1:15)%%3 + 1 # a%%3 is the remainder of a, modulo 3 # It is the remainder after subtracting from a the # largest multiple of 3 that is <= a (1:15)[rand == 1] # Observation numbers for the first group (1:15)[rand == 2] # Observation numbers for the second group (1:15)[rand == 3] # Observation numbers for the third group. ## Cross-validate lm calculations: data frame houseprices (DAAG) houseprices.lm <- lm(sale.price ~ area, data=houseprices) CVlm(houseprices, houseprices.lm) ## Footnote code summary(houseprices.lm)$sigma^2 ## ss 5.5.3: {\kern-4pt}*~Bootstrapping houseprices.lm <- lm(sale.price ~ area, data=houseprices) summary(houseprices.lm)$coef houseprices.fn <- function (houseprices, index){ house.resample <- houseprices[index, ] house.lm <- lm(sale.price ~ area, data=house.resample) coef(house.lm)[2] # slope estimate for resampled data } set.seed(1028) # use to replicate the exact results below library(boot) # ensure that the boot package is loaded ## requires the data frame houseprices (DAAG) houseprices.boot <- boot(houseprices, R=999, statistic=houseprices.fn) houseprices.boot houseprices1.fn <- function (houseprices, index){ house.resample <- houseprices[index, ] house.lm <- lm(sale.price ~ area, data=house.resample) predict(house.lm, newdata=data.frame(area=1200)) } ## Footnote code houseprices1.boot <- boot(houseprices, R=999, statistic=houseprices1.fn) boot.ci(houseprices1.boot, type="perc") # "basic" is an alternative to "perc" ## Footnote code houseprices2.fn <- function (houseprices, index) { house.resample <- houseprices[index, ] house.lm <- lm(sale.price ~ area, data=house.resample) houseprices$sale.price - predict(house.lm, houseprices) # resampled prediction # errors } n <- length(houseprices$area) R <- 200 houseprices2.boot <- boot(houseprices, R=R, statistic=houseprices2.fn) house.fac <- factor(rep(1:n, rep(R, n))) plot(house.fac, as.vector(houseprices2.boot$t), ylab="Prediction Errors", xlab="House") ## Footnote code bootse <- apply(houseprices2.boot$t, 2, sd) usualse <- predict.lm(houseprices.lm, se.fit=TRUE)$se.fit plot(bootse/usualse, ylab="Ratio of Bootstrap SE's to Model-Based SE's", xlab="House", pch=16) abline(1, 0) ## Commentary ## Sec 5.6: {\kern-5pt}*{\kern5pt}A Note on Power Transformations ## *General power transformations ## Sec 5.7: Size and Shape Data ## ss 5.7.1: Allometric growth cfseal.lm <- lm(log(heart) ~ log(weight), data=cfseal) summary(cfseal.lm) ## ss 5.7.2: There are two regression lines! ## An alternative to a regression line ## Sec 5.8: The Model Matrix in Regression model.matrix(roller.lm) ## Sec 5.9: Recap ## Sec 5.10: Methodological References ## requires the data frame ironslag (DAAG) ironslag.loess <- loess(chemical ~ magnetic, data=ironslag) chemhat <- fitted(ironslag.loess) res2 <- resid(ironslag.loess) sqrtabs2 <- sqrt(abs(res2)) e1.lm <- lm(distance ~ stretch, data=elastic1) elastic1$newdistance <- cbind(rep(1, 7), elastic1$stretch)%*%coef(e1.lm) + rnorm(7, sd=summary(e1.lm)$sigma) "funRel" <- function(x=leafshape$logpet, y=leafshape$loglen, scale=c(1,1)){ ## Find principal components rotation; see Section 11.1 ## Here (unlike 11.1) the interest is in the final component xy.prc <- prcomp(cbind(x,y), scale=scale) b <- xy.prc$rotation[,2]/scale bxy <- -b[1]/b[2] # slope - functional eqn line c(bxy = bxy) } ## Try the following: funRel(scale=c(1,1)) # Take x and y errors as equally important funRel(scale=c(1,10)) # Error is mostly in y; structural relation # line is close to regression line of y on x funRel(scale=c(10,1)) # Error is mostly in x ... ## Note that all lines pass through (xbar, ybar)