## Chapter 4 ## Sec 4.1: Basic Concepts of Estimation ## ss 4.1.1: Population parameters and sample statistics ## Why use the sample mean as an estimator? ## Maximum likelihood estimation ## ss 4.1.2: Sampling distributions ## ss 4.1.3: Assessing accuracy -- the standard error ## An example ## Footnote code ## Calculate heated-ambient; take heated & ambient from columns of pair65 test <- with(pair65, heated-ambient) c(mean = mean(test), SD = sd(test), SEM = sd(test)/sqrt(length(test))) ## ss 4.1.4: The standard error for the difference of means ## Footnote code heated <- c(254, 252, 239, 240, 250, 256, 267, 249, 259, 269) ambient <- c(233, 252, 237, 246, 255, 244, 248, 242, 217, 257, 254) v1 <- var(heated) # 10 numbers; 10-1 = 9 d.f. v2 <- var(ambient) # 11 numbers; 11-1 = 10 d.f. v <- (9*v1 + 10*v2)/(9+10) # Pooled estimate of variance # Estimate SED; variances may not be equal c(sem1 = sqrt(v1/10), sem2 = sqrt(v2/11), sed = sqrt(v1/10 + v2/11)) # Estimate SED; use pooled estimate c(sd = sqrt(v), sed = sqrt(v1/10 + v2/11)) ## ss 4.1.5: \kern-4pt$^*$ The standard error of the median ## Footnote code ## median and SD for length, by species: data frame cuckoos (DAAG) wren <- split(cuckoos$length, cuckoos$species)$wren median(wren) n <- length(wren) sqrt(pi/2)*sd(wren)/sqrt(n) # this SE computation assumes normality ## ss 4.1.6: The sampling distribution of the $t$ statistic ## Calculations for the $t$-distribution # Plus or minus 1.96SE normal distribution limits, e.g. pnorm(1.96) - pnorm(-1.96) # Plus or minus 2.31SE t distribution (8 df) limits, e.g. pt(2.31, 8) - pt(-2.31,8) # 2.31 SEs either side qnorm(0.975) # normal distribution qt(0.975, 8) # t-distribution with 8 d.f. ## How good is the normal theory approximation? ## Sec 4.2: Confidence Intervals and Hypothesis Tests ## ss 4.2.1: One- and two-sample intervals and tests for means ## Confidence intervals of 95\% or 99\% ## Footnote code ## 95% CI for mean of heated-ambient: data frame pair65 (DAAG) pair65.diff <- with(pair65, heated-ambient) pair65.n <- length(pair65.diff) pair65.se <- sd(pair65.diff)/sqrt(pair65.n) mean(pair65.diff) + qt(c(.025,.975),8)*pair65.se ## Footnote code qt(0.995, 8) ## Tests of hypotheses 1-pt(6.33/2.03, 8) # Equals pt(-6.33/2.03, 8) ## What is a small $p$-value? ## A summary of one- and two-sample calculations ## Footnote code ## t-test and confidence interval calculations heated <- c(254, 252, 239, 240, 250, 256, 267, 249, 259, 269) ambient <- c(233, 252, 237, 246, 255, 244, 248, 242, 217, 257, 254) t.test(heated, ambient, var.equal=TRUE) rm(heated, ambient) ## When is pairing helpful? ## Footnote code ## heated vs ambient: pair65 (DAAG); and cross vs self: mignonette (DAAG) par(mfrow=c(1,2)) attach(pair65); attach(mignonette) plot(heated ~ ambient); abline(0, 1) # left panel abline(mean(heated-ambient), 1, lty=2) plot(cross ~ self); abline(0, 1) # right panel abline(mean(cross-self), 1, lty=2) detach(pair65); detach(mignonette); par(mfrow = c(1,1)) ## What if the standard deviations are unequal? ## Different ways to report results ## Footnote code pair65.diff <- with(pair65, heated-ambient) n <- length(pair65.diff) av <- mean(pair65.diff); sd <- sqrt(var(pair65.diff)); se <- sd/sqrt(n) print(c(av, se)) # Item 1 print(av/se) # Item 2 t.test(pair65.diff) # Items 3 and 4 rm(pair65.diff, n, av, sd, se) ## ss 4.2.2: Confidence intervals and tests for proportions ## ss 4.2.3: Confidence intervals for the correlation ## ss 4.2.4: Confidence intervals versus hypothesis tests ## Sec 4.3: Contingency Tables ## Footnote code ## Compare #'s with a high school qualification, between ## `treated' and `untreated': data frame nsw74psid3 (DAAG) table(nsw74psid3$trt, nsw74psid3$nodeg) # Relative to Table 4.4, the rows will be interchanged. # PSID3 males are coded 0; NSW74 male trainees are coded 1. chisq.test(table(nsw74psid3$trt, nsw74psid3$nodeg)) # Specify correct=FALSE ## The mechanics of the calculation ## An example where a chi-squared test may not be valid ## Footnote code engineman <- matrix(c(5,3,17,85), 2,2) chisq.test(engineman) ## ss 4.3.1: Rare and endangered plant species ## Footnote code ## Enter the data thus: rareplants <- matrix(c(37,190,94,23,59,23,10,141,28,15,58,16), ncol=3, byrow=TRUE, dimnames=list(c("CC","CR","RC","RR"), c("D","W","WD"))) chisq.test(rareplants, correct=FALSE) ## Footnote code x2 <- chisq.test(rareplants) x2E <- stack(data.frame(t(x2$expected))) habitat <- rep(c(1,2,3), 4) plot(x2E$values ~ habitat, axes=FALSE, xlim=c(0.5, 3.5), pch=16, xlab="habitat", ylab="Expected Number of Species") text(x2E$values ~ habitat, labels=x2E$ind, pos=rep(c(4,4,2,2),3)) axis(1, at=seq(1,3), labels=c("D", "W", "WD")) axis(2) box() ## Examination of departures from a consistent overall row pattern x2 <- chisq.test(rareplants) ## Standardized residuals residuals(x2) x2$expected ## ss 4.3.2: Additional notes ## Interpretation issues ## Modeling approaches ## Sec 4.4: One-Way Unstructured Comparisons ## Footnote code library(lattice) lev <- c("F10", "NH4Cl", "NH4NO3", "F10 +ANU843", "NH4Cl +ANU843", "NH4NO3 +ANU843") ## Strip plot of ShootDryMass by trt: data frame rice (DAAG) rice$trt <- factor(rice$trt, levels=lev) stripplot(trt ~ ShootDryMass, data=rice, xlab="Shoot dry mass (g)", aspect=0.5) rice.aov <- aov(ShootDryMass ~ trt, data=rice) anova(rice.aov) summary.lm(rice.aov)$coef ests <- dummy.coef(rice.aov) ests[["(Intercept)"]] + ests[["trt"]] ## Footnote code ## Make NH4Cl the reference (or baseline) level rice$trt <- relevel(rice$trt, ref="NH4Cl") model.tables(rice.aov, se=TRUE) # By default, returns effects model.tables(rice.aov, type="means", se=TRUE) # Treatment means ## ss 4.4.1: Displaying means for the one-way layout ## Footnote code ## Use the function onewayPlot() from our DAAG package onewayPlot(rice.aov) # rice.aov was obtained in footnote 17 ## Is the analysis valid? ## ss 4.4.2: Multiple comparisons ## Footnote code ## The SED is 7.88 with 66 degrees of freedom. There are 6 means. sed <- 7.88 # For the t-critical value, use the sed sem <- sed/sqrt(2) # For the HSD statistic, the sem is required ## Alternatively, sem = (Residual SE)/sqrt(# plots per treatment) sem <- summary.lm(rice.aov)$sigma/sqrt(12) qt(p=.975, df=66) * 7.88 # Equals 15.7 qtukey(p=.95, nmeans=6, df=66) * 7.88 / sqrt(2) # Equals 23.1 # NB: We call qtukey() with p=0.95, not with p=.975 as for the pairwise # t-test. The test works with |difference|, and is inherently one-sided. # We divide by sqrt(2) because the Tukey statistic is expressed as a # multiple of the SEM = SED/sqrt(2) = 7.88/sqrt(2) ## $^*$Microarray data -- severe multiplicity ## ss 4.4.3: Data with a two-way structure, i.e.\ two factors ## Footnote code with(rice, interaction.plot(fert, variety, ShootDryMass)) # Do interaction.plot(fert, variety, ShootDryMass), with fert, variety # and ShootDryMass taken from the columns of rice ## ss 4.4.4: Presentation issues ## Sec 4.5: Response Curves ## Footnote code ## Plot distance.travelled vs starting.point: data frame modelcars (DAAG) plot(distance.traveled ~ starting.point, pch=15, data=modelcars, xlab = "Distance up ramp", ylab="Distance traveled") ## Sec 4.6: Data with a Nested Variation Structure ## ss 4.6.1: Degrees of freedom considerations ## ss 4.6.2: General multi-way analysis of variance designs ## Sec 4.7: Resampling Methods for Standard Errors, Tests and Confidence Intervals ## ss 4.7.1: The one-sample permutation test ## ss 4.7.2: The two-sample permutation test ## Footnote code ## Draw just one of these curves ## Permutation distribution of difference in means x1 <- two65$ambient; x2 <- two65$heated; x <- c(x1, x2) n1 <- length(x1); n2 <- length(x2); n <- n1+n2 dbar <- mean(x2) - mean(x1) z <- numeric(2000) for(i in 1:2000){ mn <- sample(n, n2, replace=FALSE) dbardash <- mean(x[mn]) - mean(x[-mn]) z[i]<- dbardash } pval <- (sum(z > abs(dbar)) + sum (z< -abs(dbar)))/2000 plot(density(z), yaxs="i") abline(v = dbar) abline(v = -dbar, lty=2) print(signif(pval,3)) rm(x1, x2, n1, n2, x, n, dbar, z, mn, dbardash, pval) ## ss 4.7.3: \kern-4pt$^*$ Estimating the standard error of the median: bootstrapping ## bootstrap estimate of median of wren length: data frame cuckoos (DAAG) wren <- split(cuckoos$length, cuckoos$species)$wren library(boot) ## First define median.fun(), with two required arguments: ## data specifies the data vector, ## indices selects vector elements for a each resample median.fun <- function(data, indices){median(data[indices])} ## Call the boot() function, with statistic=median.fun wren.boot <- boot(data = wren, statistic = median.fun, R = 999) # R = number of resamples wren.boot ## ss 4.7.4: Bootstrap estimates of confidence intervals ## The median median.fun <- function(data, indices){median(data[indices])} ## Call the boot() function, with statistic=median.fun wren <- cuckoos[cuckoos$species=="wren", "length"] wren.boot <- boot(data=wren, statistic=median.fun, R=9999) boot.ci(wren.boot, type=c("perc", "bca")) ## The correlation coefficient ## Footnote code ## Bootstrap estimate of 95% CI of cor(chest, belly): data frame possum (DAAG) possum.fun <- function(data, indices) { chest <- data$chest[indices] belly <- data$belly[indices] cor(belly, chest) } possum.boot <- boot(possum, possum.fun, R=9999) boot.ci(possum.boot, type=c("perc", "bca")) ## The bootstrap -- parting comments ## Sec 4.8: * Theories of Inference ## ss 4.8.1: Maximum likelihood estimation funlik <- function(mu, sigma, x=with(pair65, heated-ambient)) prod(dnorm(x, mu, sigma)) log(funlik(6.3, 6.1)) # Close to estimated mean and SD log(funlik(6.33, 5.75)) # Close to the actual mle's log(funlik(7, 5.75)) ## ss 4.8.2: Bayesian estimation ## ss 4.8.3: If there is strong prior information, use it! ## Sec 4.9: Recap ## Dos and don'ts ## Sec 4.10: Further Reading oldpar <- par(mfrow=c(2,2)) tenfold1000 <- rep(1:10, rep(1000,10)) boxplot(split(rnorm(1000*10), tenfold1000), ylab="normal - 1000") boxplot(split(rt(1000*10, 7), tenfold1000), ylab=expression(t[7]*" - 1000")) tenfold100 <- rep(1:10, rep(100, 10)) boxplot(split(rnorm(100*10), tenfold100), ylab="normal - 100") boxplot(split(rt(100*10, 7), tenfold100), ylab=expression(t[7]*" - 100")) par(oldpar) y1 <- rnorm(51) y <- y1[-1] + y1[-51] acf(y1) # acf is `autocorrelation function' # (see Chapter 9) acf(y) ## UCBAdmissions is in the datasets package ## For each combination of margins 1 and 2, calculate the sum UCBtotal <- apply(UCBAdmissions, c(1,2), sum) apply(UCBAdmissions, 3, function(x) (x[1,1]*x[2,2])/(x[1,2]*x[2,1])) admissions <- array(c(30,30,10,10,15,5,30,10), dim=c(2,2,2)) ## Compare densities for ambient & heated: list two65 (DAAG) with(two65, overlapDensity(ambient, heated)) # Do overlapDensity(ambient, heated) with ambient and heated # taken, if not found elsewhere, from the columns of two65 z.transform <- function(r) .5*log((1+r)/(1-r)) z.inverse <- function(z) (exp(2*z)-1)/(exp(2*z)+1) possum.fun <- function(data, indices) { chest <- data$chest[indices] belly <- data$belly[indices] z.transform(cor(belly, chest))} possum.boot <- boot(possum, possum.fun, R=999) z.inverse(boot.ci(possum.boot, type="perc")$percent[4:5]) # See help(bootci.object). The 4th and 5th elements of # the percent list element hold the interval endpoints.