## Chapter 6 ## Sec 6.1: Basic Ideas: Book Weight and Brain Weight Examples ## Footnote code ## Plot of weight vs volume: data frame allbacks (DAAG) plot(weight ~ volume, data=allbacks, pch=c(16,1)[unclass(cover)]) # unclass(cover) gives the integer codes that identify levels ## Use with() to specify data frame [text() does not take a 'data' parameter] with(allbacks, text(weight ~ volume, labels=paste(1:15), pos=c(2,4)[unclass(cover)])) allbacks.lm <- lm(weight ~ volume+area, data=allbacks) summary(allbacks.lm) # for coefficient estimates and SEs, without other output, enter # summary(allbacks.lm)$coef ## Footnote code qt(0.975, 12) anova(allbacks.lm) model.matrix(allbacks.lm) ## ss 6.1.1: Omission of the intercept term allbacks.lm0 <- lm(weight ~ -1+volume+area, data=allbacks) summary(allbacks.lm0) summary(allbacks.lm, corr=TRUE) ## ss 6.1.2: Diagnostic plots par(mfrow=c(2,2)) # Get all 4 plots on one page plot(allbacks.lm0, which=1:4) par(mfrow=c(1,1)) allbacks.lm13 <- lm(weight ~ -1+volume+area, data=allbacks[-13, ]) summary(allbacks.lm13) ## ss 6.1.3: Example: brain weight ## Footnote code ## Scatterplot matrix for data frame litters (DAAG); labels as in figure pairs(litters, labels=c("lsize\n\n(litter size)", "bodywt\n\n(Body Weight)", "brainwt\n\n(Brain Weight)")) ## Alternatively, use the lattice function splom() library(lattice); splom(~litters) pairs(litters) # For improved labeling, see the footnote. ## Footnote code ## Code for the several regressions mice1.lm <- lm(brainwt ~ lsize, data = litters) # Regress on lsize mice2.lm <- lm(brainwt ~ bodywt, data = litters) # Regress on bodywt mice12.lm <- lm(brainwt ~ lsize + bodywt, data = litters) # Both the above ## Now obtain coefficients and SEs, etc., for the first of these regressions summary(mice1.lm)$coef ## Similarly for the other regressions. ## ss 6.1.4: Plots that show the contribution of individual terms yterms <- predict(mice12.lm, type="terms") termplot(mice12.lm) termplot(mice12.lm, partial.resid=TRUE, smooth=panel.smooth, col.res=1) res <- resid(mice12.lm) yhat <- predict(mice12.lm, type="terms") plot(litters$lsize, yhat[, "lsize"] + res, type="n") panel.smooth(litters$lsize, yhat[, "lsize"]+res) plot(litters$bodywt, yhat[, "bodywt"] + res, type="n") panel.smooth(litters$bodywt, yhat[, "bodywt"]+res) ## Sec 6.2: Multiple Regression Assumptions and Diagnostics ## ss 6.2.1: Influential outliers and Cook's distance ## $^{*}$ Leverage and the hat matrix hatvalues(mice12.lm) plot(mice12.lm, which=5, add.smooth=FALSE) ## The points can be plotted using ## plot(hatvalues(model.matrix(mice12.lm)), residuals(mice12.lm)) ## ss 6.2.2: Influence on the regression coefficients ## ss 6.2.3: \!\!$^{*}$Additional diagnostic plots library(car) leverage.plots(allbacks.lm, term.name="volume", identify.points=FALSE) ## ss 6.2.4: Robust and resistant methods ## ss 6.2.5: The uses of model diagnostics ## Sec 6.3: A Strategy for Fitting Multiple Regression Models ## ss 6.3.1: Preliminaries ## *Explanatory variables that are not linearly related ## ss 6.3.2: Model fitting ## ss 6.3.3: An example -- the Scottish hill race data ## Footnote code ## Scatterplot matrix: data frame hills (DAAG) ## Panel A: untransformed data library(lattice) splom(~ hills, cex.labels=1.2, varnames=c("dist\n(miles)","climb\n(feet)", "time\n(hours)")) ## Panel B: log transformed data splom(~ log(hills), cex.labels=1.2, varnames=c("dist\n(log miles)", "climb\n(log feet)", "time\n(log hours)")) with(hills, hills[log(time)>0 & log(dist)<1.5,]) ## Footnote code ## Include observation 18 hills0.loglm <- lm(log(time) ~ log(dist) + log(climb), data = hills) plot(hills0.loglm) ## Exclude observation 18 hills.loglm <- lm(log(time) ~ log(dist) + log(climb), data = hills[-18, ]) plot(hills.loglm) ## Residuals from a resistant regression fit ## Footnote code ## Resistant regression fit library(MASS) h.lqs <- lqs(log(time) ~ log(climb) + log(dist), data = hills[-18,]) ## Compare residuals from lqs() with those from lm() plot(resid(hills.loglm), resid(h.lqs), xlab="Residuals, lm model", ylab="Residuals, lqs model") ## Residuals vs fitted values; resistant regression plot(fitted(h.lqs), resid(h.lqs), xlab="Fitted values, lqs model", ylab="Residuals, lqs model") ## Inclusion of an interaction term hills2.loglm <- lm(log(time) ~ log(dist)+log(climb)+log(dist):log(climb), data=hills[-18, ]) anova(hills.loglm, hills2.loglm) step(hills2.loglm) ## The model without the interaction term summary(hills.loglm, corr=TRUE)$coef ## Footnote code .909 + c(-.065, .065) * qt(.975, 31) ## The model with the interaction term summary(hills2.loglm)$coef ## What happens if we do not transform? ## Sec 6.4: Measures for the Assessment and Comparison of Regression Models ## ss 6.4.1: $R^2$ and adjusted $R^2$ ## ss 6.4.2: AIC and related statistics ## Calculation of\, {\rm R}'s version of the AIC ## ss 6.4.3: How accurately does the equation predict? ## Sec 6.5: Interpreting Regression Coefficients ## ss 6.5.1: Book dimensions and book weight ## Footnote code pairs(log(oddbooks), labels=c("thick\n\nlog(mm)", "breadth\n\nlog(cm)", "height\n\nlog(cm)", "weight\n\nlog(g)")) ## Footnote code volume <- apply(oddbooks[, 1:3], 1, prod) area <- apply(oddbooks[, 2:3], 1, prod) lob1.lm <- lm(log(weight) ~ log(volume), data=oddbooks) lob2.lm <- lm(log(weight) ~ log(thick)+log(area), data=oddbooks) lob3.lm <- lm(log(weight) ~ log(thick)+log(breadth)+log(height), data=oddbooks) coef(summary(lob1.lm)) ## Similarly for coefficients and SEs for other models coef(summary(lm(log(weight) ~ log(thick), data=oddbooks))) book.density <- oddbooks$weight/volume bookDensity.lm <- lm(log(book.density) ~ log(area), data=oddbooks) coef(summary(bookDensity.lm)) ## Sec 6.6: Problems with Many Explanatory Variables ## ss 6.6.1: Variable selection issues ## Variable selection -- a simulation with random data ## Footnote code y <- rnorm(100) xx <- matrix(rnorm(4000), ncol = 40) dimnames(xx)<- list(NULL, paste("X",1:40, sep="")) ## Footnote code library(leaps) xx.subsets <- regsubsets(xx, y, method = "exhaustive", nvmax = 3, nbest = 1) subvar <- summary(xx.subsets)$which[3,-1] best3.lm <- lm(y ~ -1+xx[, subvar]) print(summary(best3.lm, corr = FALSE)) rm(xx, y, xx.subsets, subvar, best3.lm) # For once, we note objects that should be removed. ## Note also our function bestset.noise(). The following ## call is effectively equivalent to the above. bestset.noise(m=100, n=40) ## Sec 6.7: Multicollinearity ## ss 6.7.1: A contrived example ## Footnote code ## Scatterplot matrix for data frame carprice (DAAG) pairs(carprice[, -c(1,8,9)]) ## Alternative, using the lattice function splom() splom(~carprice[, -c(1,8,9)]) carprice1.lm <- lm(gpm100 ~ Type+Min.Price+Price+Max.Price+ Range.Price, data=carprice) summary(carprice1.lm)$coef alias(carprice1.lm) carprice2.lm <- lm(gpm100 ~ Type + Min.Price + Price + Max.Price + RoughRange, data=carprice) summary(carprice2.lm) ## An analysis that makes modest sense carprice.lm <- lm(gpm100 ~ Type + Price, data = carprice) summary(carprice.lm)$coef ## Footnote code summary(carprice.lm)$sigma # residual standard error when only price is used summary(carprice1.lm)$sigma # residual se when fitting all 3 price variables ## ss 6.7.2: The variance inflation factor (VIF) library (DAAG) # Be sure to use vif() from the DAAG package vif(lm(gpm100 ~ Price, data=carprice)) # Baseline vif(carprice1.lm) # has Min.Price, Price & Max.Price vif(carprice2.lm) # Add RoughRange vif(carprice.lm) # Price is the only price variable ## ss 6.7.3: Remedies for multicollinearity ## Sec 6.8: Multiple Regression Models -- Additional Points ## ss 6.8.1: Errors in $x$ ## Measurement of dietary intake ## A simulation of the effect of measurement error ## Errors in variables -- multiple regression ## ss 6.8.2: Confusion between explanatory and response variables ## Footnote code ## For convenience, here again are the R commands: hills.loglm <- lm(log(dist) ~ log(time)+log(climb), data=hills[-18, ]) summary(hills.loglm) ## Unintended correlations ## ss 6.8.3: Missing explanatory variables ## ss 6.8.4: \!\!$^{*}$ The use of transformations ## ss 6.8.5: \!\!$^{*}$ Non-linear methods -- an alternative to transformation? hills$climb.mi <- hills$climb/5280 hills.nls0 <- nls(time ~ (dist^alpha)*(climb.mi^beta), start = c(alpha = .909, beta = .260), data = hills[-18,]) summary(hills.nls0) plot(residuals(hills.nls0) ~ predict(hills.nls0)) # residual plot hills.nls <- nls(time ~ alpha + beta*dist + gamma*(climb.mi^delta), start=c(alpha = 1, beta = 1, gamma = 1, delta = 1), data=hills[-18, ]) summary(hills.nls) plot(residuals(hills.nls) ~ predict(hills.nls)) # residual plot ## Sec 6.9: Recap ## Sec 6.10: Further Reading ## Set up factor that identifies the `have' cities ## Data frame cities (DAAG) cities$have <- factor((cities$REGION=="ON")| (cities$REGION=="WEST")) plot(POP1996 ~ POP1992, data=cities, col=as.integer(cities$have)) plot(log(POP1996) ~ log(POP1992), data=cities, col=as.integer(cities$have)) cities.lm1 <- lm(POP1996 ~ have+POP1992, data=cities) cities.lm2 <- lm(log(POP1996) ~ have+log(POP1992), data=cities) hills.lm <- lm(time ~ dist+climb, data=hills) hills2.lm <- lm(time ~ dist+climb+dist:climb, data=hills) anova(hills.lm, hills2.lm) x1 <- runif(10) # predictor which will be missing x2 <- rbinom(10, 1, 1-x1) # observed predictor which depends # on missing predictor y <- 5*x1 + x2 + rnorm(10,sd=.1) # simulated model; coef # of x2 is positive y.lm <- lm(y ~ factor(x2)) # model fitted to observed data coef(y.lm) y.lm2 <- lm(y ~ x1 + factor(x2)) # correct model coef(y.lm2)