## Chapter 2 ## Sec 2.1: Revealing Views of the Data ## ss 2.1.1: Views of a single sample ## Histograms and density plots library(DAAG) # Ensure that the DAAG package is attached ## Form the subset of possum that holds data on females only fossum <- subset(possum, sex=="f") ## Footnote code ## To get a 1 by 4 layout, precede with par(mfrow = c(1,4)) attach(fossum) hist(totlngth, breaks = 72.5 + (0:5) * 5, ylim = c(0, 22), xlab="Total length (cm)", main ="A: Breaks at 72.5, 77.5, ...") hist(totlngth, breaks = 75 + (0:5) * 5, ylim = c(0, 22), xlab="Total length (cm)", main="B: Breaks at 75, 80, ...") dens <- density(totlngth) # Assumes fossum is still attached xlim <- range(dens$x); ylim <- range(dens$y) hist(totlngth, breaks = 72.5 + (0:5) * 5, probability = T, xlim = xlim, ylim = ylim, xlab="Total length (cm)", main=" ") lines(dens) hist(totlngth, breaks = 75 + (0:5) * 5, probability = T, xlim = xlim, ylim = ylim, xlab="Total length (cm)", main= " ") lines(dens) par(mfrow=c(1,1)); detach(fossum) ## The stem-and-leaf display stem(ais$ht[ais$sport=="Row"]) ## Footnote code ## Use quantile() to obtain the quartiles of ht: data frame ais (DAAG package) quantile(ais$ht[ais$sport=="Row"], prob=c(.25,.5,.75)) # For the 50th percentile (the 2nd quartile), an alternative is median() ## Lattice Style Density Plots ## Density plot for earconch: data frame possum (DAAG package) library(lattice) densityplot(~ earconch | Pop, groups=sex, data=possum, auto.key=list(columns=2), aspect=1) ## Boxplots ## Base graphics boxplot function boxplot(fossum$totlngth, horiz=TRUE) ## Alternative: lattice graphics bwplot function bwplot(~totlngth, data=fossum) ## ss 2.1.2: Patterns in univariate time series ## Panel A; For plotting of population numbers, see the footnote plot(log10(measles), xlab="", ylim=log10 (c(1,5000*1000)), ylab=" Deaths; Population (log scale)", yaxt="n") ytiks <- c(1, 10, 100, 1000, 1000000, 5000000) axis(2, at=log10(ytiks), labels=paste(ytiks), las=2) ## Panel B; for plotting of population numbers, see the footnote Plot (window(measles, start=1840, end=1882), ylim=c (0, 4600), yaxt="n") axis(2, at=(0:4)* 1000, labels=paste(0:4), las=2) ## Footnote code ## Panel A: Plot full measles time series (DAAG) par(fig=c(0, 1, .38, 1)) # 38% to 100% of height of figure region plot(log10(measles), xlab="", ylim=log10(c(1,5000*1000)), ylab=" Deaths; Population (log scale)", yaxt="n") ytiks <- c(1, 10, 100,1000, 1000000, 5000000) axis(2, at=log10(ytiks), labels=paste(ytiks), las=2) ## London population in thousands londonpop <- ts(c(1088,1258,1504,1778,2073,2491,2921,3336,3881,4266, 4563,4541,4498,4408), start=1801, end=1931, deltat=10) points(log10(londonpop*1000), pch=16, cex=.5) mtext(side=3, line=0.5, "A (1629-1939)", adj=0) ## Panel B: window from 1840 to 1882 par(fig=c(0, 1, 0, .4), new=TRUE) # 0% to 38% of height of figure region plot(window(measles, start=1840, end=1882), xlab="Year", yaxt="n", ylim=c(0,4600), ylab="Deaths; Population in 1000s") points(londonpop, pch=16, cex=0.5) axis(2, at=(0:4)*1000, las=2) mtext(side=3, line=0.5, "B (1841-1881)", adj=0) par(fig=c(0, 1, 0, 1)) # Restore default figure region # This is an addition to code in the text. ## ss 2.1.3: Patterns in bivariate data ## Plot four vs one: data frame milk (DAAG) xyrange <- range(milk) plot(four ~ one, data = milk, xlim = xyrange, ylim = xyrange, pch = 16, pty="s") # pty="s": square plotting region rug(milk$one) # Rug plot on x-axis rug(milk$four, side = 2) # Rug plot on y-axis abline(0, 1) ## The fitting of a smooth trend curve ## Plot ohms vs juice: data frame fruitohms (DAAG) plot(ohms ~ juice, xlab="Apparent juice content (%)", ylab="Resistance (ohms)", data=fruitohms) ## Add a smooth curve, as in Panel B lines(lowess(fruitohms$juice, fruitohms$ohms), lwd=2) # With lwd=2, the curve is twice the default thickness ## What is the appropriate scale? ## The following omits the labeling information oldpar <- par(mfrow = c(1,2), pty="s") ## Plot brain vs body: data frame Animals (MASS package) library(MASS) plot(brain ~ body, data=Animals) plot(log(brain) ~ log(body), data=Animals) par(oldpar) ## ss 2.1.4: Patterns in grouped data ## Example: eggs of cuckoos ## Compare stripplot() with bwplot(), both from lattice package stripplot(species ~ length, xlab="Length of egg (mm)", data=cuckoos) bwplot(species ~ length, xlab="Length of egg (mm)", data=cuckoos) ## Footnote code ## For tidier labels replace ".", in several of the species names, by a space specnam <- sub(pattern="\\.", replacement=" ", x=levels(cuckoos$species)) # In a `regular expression', "." must be specified as "\\." levels(cuckoos$species) <- specnam ## Panel A: Strip plot: data frame cuckoos (DAAG) plt1 <- stripplot(species ~ length, xlab="Length of egg (mm)", data=cuckoos) print(plt1, position=c(0, 0.5, 1, 1)) # xmin, ymin, xmax, ymax # Use print() to display lattice graphics objects ## Panel B: Box plot plt2 <- bwplot(species ~ length, xlab="Length of egg (mm)", data=cuckoos) print(plt2, newpage=FALSE, position=c(0, 0, 1, 0.5)) ## ss 2.1.5: \hspace*{-5pt}*\hspace*{5pt}Multiple variables and times ## Apply function range to columns of data frame jobs (DAAG) sapply(jobs, range) ## Footnote code ## Code for Figure 2.9A jobts <- ts(jobs[,1:6], start=1995, frequency=12) plot(jobts, plot.type="single", xlim=c(1995,1997.2), lty=1:6, log="y", xaxt="n", xlab="", ylab="Number of Jobs") ## Move label positions so that labels do not overlap ylast <- bounce(window(jobts, 1996+11/12), d=strheight("O"), log=TRUE) text(rep(1996+11/12,6), ylast, colnames(ylast), pos=4) datlab <- format(seq(from=as.Date("1Jan1995", format="%d%b%Y"), by="3 month", length=8), "%b%Y") axis(1, at=seq(from=1995, by=0.25, length=8), datlab) ## First create a multivariate time series jobts <- ts(jobs[,1:6], start=1995, frequency=12) ## Simplified plot; all series in a single panel; use log scale plot(jobts, plot.type="single", xlim=c(1995,1997.2), lty=1:5, xlab="", log="y", ylab="Number of Jobs") ## Plot labels following the final value for each series ylast <- window(jobts, 1996+11/12) # Note the use of window() to extract a row from the series. text(rep(1996+11/12,6), ylast, colnames(ylast), pos=4) ## Code for Figure 2.9B ## Stack labor force numbers into one column: data frame jobs (DAAG) Jobs <- stack(jobs, select = 1:6) # Column 1 first, then 2, ... # The stack() function was discussed in Chapter 1 Jobs$Year <- rep(jobs[, 7], 6) names(Jobs) <- c("Number", "Province", "Year") xyplot(log(Number) ~ Year | Province, data = Jobs, type = "l", layout=c(2,3), scales = list(y=list(relation="sliced"))) # The parameter setting relation="sliced" causes the length of # the relevant scale(s) (slice(s)) to be the same for all panels. ## Footnote code ## Another possibility, with number of tick positions (~100) ## on the logarithmic scale chosen after some experimentation ylabpos <- pretty(log(Jobs$Number), 100) ylabels <- paste(round(exp(ylabpos)),"\n(", ylabpos, ")", sep="") xyplot(log(Number) ~ Year | Province, data = Jobs, type = "l", layout=c(2,3), scales = list(y =list(relation="sliced", at=ylabpos, labels=ylabels))) ## *Small proportional changes, on a scale of natural logarithms ## ss 2.1.6: Scatterplots, broken down by multiple factors xyplot(csoa ~ it | sex*agegp, data=tinting, groups=target, auto.key=list(columns=2)) ## Footnote code ## Settings used for Figure 2.10B (suitable for grayscale on a printed page) xyplot(csoa ~ it|sex*agegp, data=tinting, groups=target, par.settings=list(background=list(col="transparent"), superpose.symbol=list(col=c("black","gray20"), pch=c(1, 16))), auto.key=list(columns=2, text=levels(tinting$target), points=TRUE)) # In the above, par.settings changed settings for this use of xyplot() # Setting background color to "transparent" gives a "white" background ## Use trellis.par.set() to change settings while the current device is in use ## Footnote code ## Panel B, with refinements xyplot(csoa ~ it|sex*agegp, groups=tint, data=tinting, type=c("p","smooth"), span=0.8, par.settings= list(superpose.symbol=list(col=c("skyblue1", "skyblue4")[c(2,1,2)], pch=c(1,16,16)), # open, filled, filled superpose.line=list(col=c("skyblue1", "skyblue4")[c(2,1,2)], lwd=c(1,1,2)), background=list(col="transparent")), auto.key=list(columns=3, points=TRUE, lines=TRUE)) # A large value for span gives a smoother curve xyplot(csoa ~ it|sex*agegp, groups=tint, data=tinting, type=c("p","smooth"), span=1.25, auto.key=list(columns=3)) # "p": points; "smooth": a smooth curve # With span=1.25, the smooth curve is close to a straight line ## ss 2.1.7: What to look for in plots ## Outliers ## Asymmetry of the distribution ## Changes in variability ## Clustering ## Non-linearity ## Sec 2.2: Data Summary ## ss 2.2.1: Counts ## Table of counts example: data frame nsw74psid1 (DAAG) attach(nsw74psid1) trtgp <- factor(trt, labels=c("none", "training")) educgp <- factor(nodeg, labels=c("completed", "dropout")) table(trtgp, educgp) ## Check for NAs, which table() will have ignored: sum(is.na(trtgp) | is.na(educgp)) ## A potentially more informative alternative is: ## table(is.na(trtgp), is.na(educgp)) detach(nsw74psid1) ## Addition over one or more margins of a table stones <- array(c(81,6,234,36,192,71,55,25), dim=c(2,2,2), dimnames=list(Sucess=c("yes","no"), Method=c("open","ultrasound"), Size=c("<2cm", ">=2cm"))) mosaicplot(stones, sort=3:1) # Re-ordering the margins gives a more interpretable plot. ## Cross-Tabulation -- the \texttt{xtabs()} function ## From table UCBAdmissions, create data frame UCB.df UCB.df <- as.data.frame(UCBAdmissions) UCB.df ## Footnote code ## The following turns the 3-way array stones (created above) into a data ## frame with one column for each margin, plus a column of frequencies: stones.df <- as.data.frame(as.table(stones)) # First, turn into a table stones.df <- as.data.frame.table(stones) # Alternative xtabs(Freq ~ Admit + Gender, data=UCB.df) ## First, determine overall admission rates tabAll <- xtabs(Freq ~ Admit + Gender, data=UCB.df) tabAll[1,]/(tabAll[1,]+tabAll[2,]) ## Create a table whose first dimension is 'Admit' tabDept <- xtabs(Freq ~ Admit + Gender + Dept, data=UCB.df) ## With dimension 2 ('Gender') and dimension 3 ('Dept') as ## margins, find the proportion in level 1, as a fraction ## of the total over both levels of dimension 1: tabDept[1,,]/(tabDept[1,,] + tabDept[2,,]) tabDept[1,,] + tabDept[2,,] # Calculate totals ## ss 2.2.2: Summaries of information from data frames ## Summary as a prelude to analysis -- \texttt{aggregate()} ## mean yield by block by shade: data frame kiwishade (DAAG) attach(kiwishade) kiwimeans <- aggregate(yield, by=list(block, shade), mean) names(kiwimeans) <- c("block","shade","meanyield") detach(kiwishade) head(kiwimeans, 4) # . . . ## Footnote code ## Note the use of a panel function that calls panel.dotplot(), ## then adding the code needed to display plot means. dotplot(shade ~ yield | block, data=kiwishade, pch=1, aspect=1, panel=function(x,y,...){panel.dotplot(x, y ,...) av <- sapply(split(x,y), mean) ypos <- unique(y) lpoints(ypos~av, pch=3, cex=1.25)}, key=list(space="top", columns=2, text=list(c("Individual vine yields", "Plot means (4 vines)")), points=list(pch=c(1,3), cex=c(1,1.25))), layout=c(3,1)) # Note that parameter settings were given both in the function call and # in the list supplied to key. [With auto.key, this is unnecessary.] ## The benefits of data summary -- dengue status example ## ss 2.2.3: Standard deviation and inter-quartile range ## Cuckoo eggs example ## Footnote code ## SD of length, by species: data frame cuckoos (DAAG) sapply(split(cuckoos$length, cuckoos$species), sd) # Subsection 12.6.7 has information on split() ## Degrees of freedom ## The pooled standard deviation ## Elastic bands example ## ss 2.2.4: Correlation ## Correlation between body and brain: data frame Animals (MASS) attach(Animals) ## Pearson product--moment correlation cor.test(body, brain) ## Spearman rank correlation cor.test(body, brain, method="spearman") detach(Animals) ## Sec 2.3: Statistical Analysis Questions, Aims and Strategies ## ss 2.3.1: How relevant and how reliable are the data? ## ss 2.3.2: Helpful and unhelpful questions ## ss 2.3.3: How will results be used? ## ss 2.3.4: Formal and informal assessments ## Questionnaires and surveys ## ss 2.3.5: Statistical Analysis Strategies ## ss 2.3.6: Planning the formal analysis ## ss 2.3.7: Changes to the intended plan of analysis ## Sec 2.4: Recap ## Sec 2.5: Further Reading attach(cuckoohosts) plot(c(clength, hlength), c(cbreadth, hbreadth), col=rep(1:2,c(12,12))) for(i in 1:12)lines(c(clength[i], hlength[i]), c(cbreadth[i], hbreadth[i])) text(hlength, hbreadth, abbreviate(rownames(cuckoohosts),8)) detach(cuckoohosts) deathrate <- c(40.7, 36,27,30.5,27.6,83.5) hosp <- c("Cliniques of Vienna (1834-63)\n(> 2000 cases pa)", "Enfans Trouves at Petersburg\n(1845-59, 1000-2000 cases pa)", "Pesth (500-1000 cases pa)", "Edinburgh (200-500 cases pa)", "Frankfort (100-200 cases pa)", "Lund (< 100 cases pa)") hosp <- factor(hosp, levels=hosp[order(deathrate)]) dotplot(hosp~deathrate, xlim=c(0,95), xlab="Death rate per 1000 ", type=c("h","p")) ## Source: Nightingale (1871). Data are ascribed to Dr Le Fort library(Devore6) # ex10.22 is from Devore6 tomatoes <- ex10.22 with(Animals, cor(brain,body)) with(Animals, cor(log(brain),log(body))) with(Animals, cor(log(brain),log(body), method="spearman")) bwplot(shade ~ yield|block, data=kiwishade, layout=c(3,1))