## Chapter 8 ## Sec 8.1: Generalized Linear Models ## ss 8.1.1: Transformation of the expected value on the left ## ss 8.1.2: Noise terms need not be normal ## ss 8.1.3: Log odds in contingency tables ## Footnote code ## Simplified version of the figure p <- (1:999)/1000 gitp <- log(p/(1 - p)) plot(p, gitp, xlab = "Proportion", ylab = "", type = "l", pch = 1) ## ss 8.1.4: Logistic regression with a continuous explanatory variable library(DAAG) anestot <- aggregate(anesthetic[, c("move","nomove")], by=list(conc=anesthetic$conc), FUN=sum) ## The column 'conc', because from the 'by' list, is then a factor. ## The next line recovers the numeric values anestot$conc <- as.numeric(as.character(anestot$conc)) anestot$total <- apply(anestot[, c("move","nomove")], 1 , sum) anestot$prop <- anestot$nomove/anestot$total ## Footnote code ## Plot proportion moving vs conc: data frame anesthetic (DAAG) plot(prop ~ conc, data=anestot, xlab = "Concentration", ylab = "Proportion", xlim = c(.5,2.5), ylim = c(0, 1), pch = 16) with(anestot, {text(conc, prop, paste(total), pos=2) abline(h=sum(nomove)/sum(total), lty=2)}) ## Fit model directly to the 0/1 data in nomove anes.logit <- glm(nomove ~ conc, family=binomial(link="logit"), data=anesthetic) ## Fit model to the proportions; supply total numbers as weights anes1.logit <- glm(prop ~ conc, family=binomial(link="logit"), weights=total, data=anestot) anova(anes.logit) summary(anes.logit) ## Footnote code ## Graphical summary of logistic regression results anestot$emplogit <- with(anestot, log((nomove+0.5)/(move+0.5))) plot(emplogit ~ conc, data=anestot, xlab = "Concentration", xlim = c(0, 2.75), xaxs="i", ylab = "Empirical logit", ylim=c(-2, 2.4), cex=1.5, pch = 16) with(anestot, text(conc, emplogit, paste(round(prop,2)), pos=c(2,4,2,4,4,4))) abline(anes.logit) ## Sec 8.2: Logistic Multiple Regression ## Footnote code ## Presence/absence information: data frame frogs (DAAGS) plot(northing ~ easting, data=frogs, pch=c(1,16)[frogs$pres.abs+1], xlab="Meters east of reference point", ylab="Meters north") ## Footnote code pairs(frogs[, 4:10], oma=c(2,2,2,2), cex=0.5) ## Footnote code ## Here is code for the top row of plots par(mfrow=c(3,3)) for(nam in c("distance","NoOfPools","NoOfSites")){ y <- frogs[, nam] plot(density(y), main="", xlab=nam) } # The other rows can be obtained by replacing y by # sqrt(y) or log(y) (or, for NoOfSites, by log(y+1)) par(mfrow=c(1,1)) ## Footnote code with(frogs, pairs(cbind(altitude, log(distance), log(NoOfPools), NoOfSites), panel=panel.smooth, labels=c("altitude","log(distance)", "log(NoOfPools)","NoOfSites"))) ## ss 8.2.1: Selection of model terms, and fitting the model frogs.glm0 <- glm(formula = pres.abs ~ altitude + log(distance) + log(NoOfPools) + NoOfSites + avrain + meanmin + meanmax, family = binomial, data = frogs) summary(frogs.glm0) frogs.glm <- glm(formula = pres.abs ~ log(distance) + log(NoOfPools) + meanmin + meanmax, family = binomial, data = frogs) summary(frogs.glm) ## ss 8.2.2: A plot of contributions of explanatory variables ## Footnote code ## Displaying the residuals does not, for logit models, add useful insight. ## To see what they look like, try: termplot(frogs.glm, data=frogs, partial.resid=TRUE) par(mfrow=c(1,4), pty="s") termplot(frogs.glm, data=frogs) par(mfrow=c(1,1)) ## ss 8.2.3: Cross-validation estimates of predictive accuracy ## Footnote code CVbinary(frogs.glm0) # All explanatory variables CVbinary(frogs.glm) # Reduced set of explanatory variables ## Footnote code ## The cross-validated estimate of accuracy is stored in the list ## element acc.cv, in the output from the function CVbinary(), thus: for (j in 1:4){ randsam <- sample(1:10, 212, replace=TRUE) all.acc <- CVbinary(frogs.glm0, rand=randsam)$acc.cv reduced.acc <- CVbinary(frogs.glm, rand=randsam)$acc.cv cat("\nAll:", round(all.acc,3), " Reduced:", round(reduced.acc,3)) } ## Sec 8.3: Logistic Models for Categorical Data -- an Example ## Create data frame from multi-way table UCBAdmissions (datasets) dimnames(UCBAdmissions) # Check levels of table margins UCB <- as.data.frame.table(UCBAdmissions["Admitted", , ]) names(UCB)[3] <- "admit" UCB$reject <- as.data.frame.table(UCBAdmissions["Rejected", , ])$Freq UCB$Gender <- relevel(UCB$Gender, ref="Male") ## Add further columns total and p (proportion admitted) UCB$total <- UCB$admit + UCB$reject UCB$p <- UCB$admit/UCB$total UCB.glm <- glm(p ~ Dept*Gender, family=binomial, data=UCB, weights=total) anova(UCB.glm, test="Chisq") summary(UCB.glm)$coef ## Sec 8.4: Poisson and Quasi-Poisson Regression ## ss 8.4.1: Data on aberrant crypt foci ## Footnote code ## Plot count vs endtime: data frame ACF1 (DAAG) plot(count ~ endtime, data=ACF1, pch=16, log="x") ACF.glm0 <- glm(formula = count ~ endtime, family = poisson, data = ACF1) summary(ACF.glm0) ACF.glm <- glm(formula = count ~ endtime + I(endtime^2), family = poisson, data = ACF1) summary(ACF.glm) etime <- unique(ACF1$endtime) etime exp(-0.3215 + 0.1192*etime) # Linear term only exp(1.72235 - 0.26235*etime + 0.01514*etime^2) # Quadratic model sum(resid(ACF.glm, type="pearson")^2)/19 ACFq.glm <- glm(formula = count ~ endtime, family = quasipoisson, data = ACF1) summary(ACFq.glm)$coef sapply(split(residuals(ACFq.glm), ACF1$endtime), var) fligner.test(resid(ACFq.glm) ~ factor(ACF1$endtime)) ## ss 8.4.2: Moth habitat example ## Footnote code ## Number of moths by habitat: data frame moths (DAAG) rbind(Number=table(moths[, 4]), sapply(split(moths[, -4], moths$habitat), apply, 2, sum)) library(lattice) dotplot(habitat ~ A, data=moths, xlab="Number of moths (species A)", Panel=function(x, y, ...){ avmoth <- aggregate(x, by=list(habitat=y), FUN=mean) panel.dotplot(x,y, pch=1, ...) panel.dotplot(avmoth$x, avmoth$habitat, pch=3, cex=1.25, ...) }, key=list(text=list(c("Individual transects", "Mean")), points=list(pch=c(1,3), cex=c(1,1.25))), columns=2)) ## An unsatisfactory choice of reference level A.glm <- glm(A ~ habitat + log(meters), family = quasipoisson, data = moths) summary(A.glm) subset(moths, habitat=="Bank") ## Footnote code ## Analysis with tighter convergence criterion A.glm <- update(A.glm, epsilon=1e-10) summary(A.glm)$coef head(summary(A.glm)$coefficients, 3) ## SEs of predicted values A.se <- predict(A.glm, se=T)$se.fit A.se[moths$habitat=="Bank"] range(A.se[moths$habitat!="Bank"]) ## A more satisfactory choice of reference level moths$habitat <- relevel(moths$habitat, ref="Lowerside") A.glm <- glm(A ~ habitat + log(meters), family=quasipoisson, data=moths) summary(A.glm) ## The comparison between \texttt{Bank} and other habitats ## Compare Bank with Disturbed habitatD <- moths$habitat habitatD[habitatD=="Bank"] <- "Disturbed" habitatD <- factor(habitatD) table(habitatD) A.glm <- glm(A ~ habitat + log(meters), family = quasipoisson, data=moths) AD.glm <- glm(A ~ habitatD + log(meters), family = quasipoisson, data=moths) anova(A.glm, AD.glm, test="F") ## ## Compare Bank with NWsoak habitatNW <- moths$habitat habitatNW[habitatNW=="Bank"] <- "NWsoak" habitatNW <- factor(habitatNW) ANW.glm <- glm(A ~ habitatNW + log(meters), family = quasipoisson, data=moths) ANW.glm <- glm(A ~ habitatNW + log(meters), family = quasipoisson, data=moths) anova(A.glm, ANW.glm, test="F") ## Diagnostic plots A1.glm <- glm(formula = A ~ habitat + log(meters), family = quasipoisson, data = moths, subset=habitat!="Bank") plot(A1.glm, panel=panel.smooth) ## Sec 8.5: Additional Notes on Generalized Linear Models ## ss 8.5.1: $\!\!^{*}$ Residuals, and estimating the dispersion ## Other choices of link function for binomial models ## Quasi-binomial and quasi-poisson models ## ss 8.5.2: Standard errors and $z$- or $t$-statistics for binomial models fac <- factor(LETTERS[1:4]) p <- c(103, 30, 11, 3)/500 n <- rep(500,4) summary(glm(p ~ fac, family=binomial, weights=n))$coef ## ss 8.5.3: Leverage for binomial models ## Sec 8.6: Models with an Ordered Categorical or Categorical Response ## ss 8.6.1: Ordinal Regression Models ## Exploratory analysis ## $\!\!^{*}$ Proportional odds logistic regression library(MASS) inhaler <- data.frame(freq=c(99,76,41,55,2,13), choice=rep(c("inh1","inh2"), 3), ease=ordered(rep(c("easy","re-read", "unclear"), rep(2,3)))) inhaler1.polr <- polr(ease ~ 1, weights=freq, data=inhaler, Hess=TRUE, subset=inhaler$choice=="inh1") # Setting Hess=TRUE at this point averts possible numerical # problems if this calculation is deferred until later. inhaler2.polr <- polr(ease ~ 1, weights=freq, data=inhaler, Hess=TRUE, subset=inhaler$choice=="inh2") summary(inhaler1.polr) summary(inhaler2.polr) inhaler.polr <- polr(ease ~ choice, weights=freq, Hess=TRUE, data=inhaler) summary(inhaler.polr) deviance(inhaler.polr) - (deviance(inhaler1.polr) + deviance(inhaler2.polr)) summary(inhaler.polr) ## ss 8.6.2: $\!\!^{*}$ Loglinear Models library(MASS) example(loglm) ## Sec 8.7: Survival analysis ## ss 8.7.1: Analysis of the \texttt{Aids2} data bloodAids <- subset(Aids2, T.categ=="blood") bloodAids$days <- bloodAids$death-bloodAids$diag bloodAids$dead <- as.integer(bloodAids$status=="D") library(survival) plot(survfit(Surv(days, dead) ~ sex, data=bloodAids), col=c(2,4), conf.int=TRUE) ## ss 8.7.2: Right censoring prior to the termination of the study hsaids <- subset(Aids2, sex=="M" & T.categ=="hs") hsaids$days <- hsaids$death-hsaids$diag hsaids$dead <- as.integer(hsaids$status=="D") table(hsaids$status,hsaids$death==11504) ## ss 8.7.3: The survival curve for male homosexuals ## Footnote code hsaids.surv <- survfit(Surv(days, dead) ~ 1, data=hsaids) plot(hsaids.surv) hsaids.surv <- survfit(Surv(days, dead) ~ 1, data=hsaids) plot(hsaids.surv) ## ss 8.7.4: Hazard rates ## ss 8.7.5: The Cox proportional hazards model bloodAids.coxph <- coxph(Surv(days, dead) ~ sex, data=bloodAids) summary(bloodAids.coxph) cox.zph(bloodAids.coxph) plot(cox.zph(bloodAids.coxph)) plot(bloodAids$days, resid(bloodAids.coxph)) lines(lowess(bloodAids$days, resid(bloodAids.coxph))) ## Sec 8.8: Transformations for Count Data ## Sec 8.9: Further Reading