## Chapter 12 ## Sec 12.1: Multivariate Exploratory Data Analysis ## ss 12.1.1: Scatterplot matrices ## Footnote code ## Scatterplot matrix, for columns 9-11 of possum (DAAG) ## The use of different colors for different sexes, and different ## symbols for different sites, makes for intricate code. library(lattice) ## Scatterplot matrix: columns 9:11 of data frame possum (DAAG) colr <- c("red", "blue") pchr <- c(3,4,0,8,2,10,1) ss <- expand.grid(site=1:7, sex=1:2) # Site varies fastest ## Add sexsite to possum; will be used again below possum$sexsite <- paste(possum$sex, possum$site, sep="-") splom(~ possum[, c(9:11)], groups = possum$sexsite, col = colr[ss$sex], pch = pchr[ss$site], varnames=c("tail\nlength","foot\nlength","ear conch\nlength"), key =list(points = list(pch=pchr), text=list(c("Cambarville","Bellbird","Whian Whian","Byrangery", "Conondale ","Allyn River", "Bulburin")), columns=4)) ## ss 12.1.2: Principal components analysis ## Footnote code pchr <- c(3,4,0,8,2,10,1) colr <- trellis.par.get()$superpose.symbol$col cloud(earconch~taill+footlgth, data=possum, pch=pchr, groups=site, cex=.65, key = list(space="top", corner=c(0,1), columns=4, cex=0.75, between=1, points = list(pch=pchr, col=colr), text=list(c("Cambarville","Bellbird","Whian Whian ", "Byrangery", "Conondale ","Allyn River","Bulburin")), between.columns=2)) ## Preliminary data scrutiny ## Principal components calculations: possum[, 6:14] (DAAG) possum.prc <- princomp(na.omit(possum[, 6:14])) ## Footnote code ## Plot of principal components: possum[, 6:14] here<- complete.cases(possum[, 6:14]) colr <- c("red", "blue") pchr <- c(3,4,0,8,2,10,1) ss <- expand.grid(site=1:7, sex=1:2) # Site varies fastest xyplot(possum.prc$scores[, 2] ~ possum.prc$scores[, 1], groups = possum$sexsite[here], col = colr[ss$sex], pch = pchr[ss$site], xlab="1st Principal Component", ylab="2nd Principal Component", key=list(points = list(pch=pchr), text=list(c("Cambarville", "Bellbird", "Whian Whian", "Byrangery", "Conondale", "Allyn River", "Bulburin" )), columns=4)) summary(possum.prc, loadings=TRUE, digits=2) ## The stability of the principal components plot library(lattice) ## Bootstrap principal components calculations: possum (DAAG) usepossum <- na.omit(possum[, -5]) usepossum$sexsite <- with(usepossum, paste(sex, site, sep=":")) n <- dim(usepossum)[1] ntimes <- 4 bootscores <- data.frame(matrix(0, nrow=ntimes*n, ncol=3)) for (i in 1:ntimes){ samprows <- sample(1:n, n, replace=TRUE) possumi <- usepossum[samprows, ] bootscores[n*(i-1)+(1:n), 1:2] <- princomp(possumi[, 5:13])$scores[, 1:2] bootscores[n*(i-1)+(1:n), 3] <- usepossum$sexsite[samprows] } names(bootscores) <- c("scores1","scores2", "sexsite") bootscores$another <- rep(1:ntimes, rep(n,ntimes)) colr <- c("red", "blue") pchr <- c(3,4,0,8,2,10,1) ss <- expand.grid(site=1:7, sex=1:2) xyplot(scores2 ~ scores1 | another, data=bootscores, groups = bootscores$sexsite, col=colr[ss$sex], pch=pchr[ss$site], xlab="1st PC", ylab="2nd PC", key = list(points = list(pch=pchr), columns=4, cex=.75, between=1, between.columns=2, text=list(c("Cambarville","Bellbird","Whian Whian ", "Byrangery", "Conondale ","Allyn River","Bulburin")) )) ## ss 12.1.3: Multi-dimensional scaling library(MASS) d.possum <- dist(possum[,6:14]) # Euclidean distance matrix possum.sammon <- sammon(d.possum, k=2) plot(possum.sammon$points) # Plot 2nd vs 1st ordinates possum.isoMDS <- isoMDS(d.possum, k=2) plot(possum.isoMDS$points) ## Binary data ## Sec 12.2: Discriminant Analysis ## ss 12.2.1: Example -- plant architecture ## Notation ## ss 12.2.2: Logistic discriminant analysis leaf17.glm <- glm(arch ~ logwid + loglen, family=binomial, data=leafshape17) options(digits=3) summary(leaf17.glm)$coef ## Predictive accuracy leaf17.cv <- CVbinary(leaf17.glm) tCV <- table(leafshape17$arch, round(leaf17.cv$acc.cv)) # CV cbind(tCV, c(tCV[1,1], class.acc=tCV[2,2])/(tCV[,1]+tCV[,2])) sum(tCV[row(tCV)==col(tCV)])/sum(tCV) # Overall accuracy tR <- table(leafshape17$arch, round(leaf17.cv$acc.resub)) # Resub cbind(tR, c(tR[1,1], class.acc=tR[2,2])/(tR[,1]+tR[,2])) sum(tR[row(tR)==col(tR)])/sum(tR) # Overall accuracy ## ss 12.2.3: Linear discriminant analysis library(MASS) ## Discriminant analysis; data frame leafshape17 (DAAG) leaf17.lda <- lda(arch ~ logwid+loglen, data=leafshape17) leaf17.lda ## Assessments of predictive accuracy leaf17cv.lda <- lda(arch ~ logwid+loglen, data=leafshape17, CV=TRUE) ## the list element 'class' gives the predicted class ## The list element 'posterior' holds posterior probabilities tab <- table(leafshape17$arch, leaf17cv.lda$class) cbind(tab, c(tab[1,1], class.acc=tab[2,2])/(tab[,1]+tab[,2])) sum(tab[row(tab)==col(tab)])/sum(tab) ## ss 12.2.4: An example with more than two groups ## Footnote code possum.lda <- lda(site ~ hdlngth + skullw + totlngth + taill + footlgth + earconch + eye + chest + belly, data=na.omit(possum)) # na.omit() omits any rows that have one or more missing values options(digits=4) possum.lda$svd # Examine the singular values plot(possum.lda, dimen=3) # Scatterplot matrix - scores on 1st 3 canonical variates (Figure 11.4) possum.lda ## Sec 12.3: *High-dimensional data, classification, and plots ## What groups are of interest? library(hddplot) data(golubInfo) with(golubInfo, table(cancer, tissue.mf)) attach(golubInfo) ## Identify allB samples for that are BM:f or BM:m or PB:m subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m") ## Form vector that identifies these as BM:f or BM:m or PB:m tissue.mfB <- tissue.mf[subsetB, drop=TRUE] ## Separate off the relevant columns of the matrix Golub data(Golub) GolubB <- Golub[, subsetB] detach(golubInfo) ## ss 12.3.1: Classifications and associated graphs ## Preliminary data manipulation ## Footnote code ## Try e.g. boxplot(data.frame(GolubB[, 1:20])) # First 20 columns (observations) ## Random selection of 20 rows (features) boxplot(data.frame(GolubB[sample(1:7129, 20), ])) ## ss 12.3.2: Flawed graphs ## Footnote code ## Uses orderFeatures() (hddplot); see below ord15 <- orderFeatures(GolubB, cl=tissue.mfB)[1:15] ## Footnote code dfB.ord <- data.frame(t(GolubB[ord15, ])) ## Calculations for the left panel ## Transpose to observations by features dfB15 <- data.frame(t(GolubB[ord15, ])) library(MASS) dfB15.lda <- lda(dfB15, grouping=tissue.mfB) scores <- predict(dfB15.lda, dimen=2)$x ## Scores for the single PB:f observation attach(golubInfo) df.PBf <- data.frame(t(Golub[ord15, tissue.mf=="PB:f" & cancer=="allB", drop=FALSE])) scores.PBf <- predict(dfB15.lda, newdata=df.PBf, dimen=2)$x detach(golubInfo) ## Warning! The plot that now follows may be misleading! ## Use scoreplot(), from the hddplot package scoreplot(list(scores=scores, cl=tissue.mfB, other=scores.PBf, cl.other="PB:f")) ## Footnote code simscores <- simulateScores(nrow=7129, cl=rep(1:3, c(19,10,2)), cl.other=4, nfeatures=15, seed=41) # Returns list elements: scores, cl, scores.other & cl.other scoreplot(simscores) ## Distributional extremes ## The calculation may take tens of minutes, even with adequate ## memory (e.g., 512MB) and a fast processor. ## If necessary, use a smaller value of B. library(multtest) GolubB.maxT <- mt.maxT(GolubB, unclass(tissue.mfB)-1, test="f", B=100000) ## Compare calculated F-statistics with permutation distribution qqthin(qf(1-GolubB.maxT$rawp, 2, 28), GolubB.maxT$teststat) ## Compare calculated F-statistics with theoretical F-distribution qqthin(qf(ppoints(7129), 2, 28), GolubB.maxT$teststat) # The theoretical F-distribution gives estimates of quantiles # that are too small ## NB also (not included in Figure 12.10) the comparison between ## the permutation distribution and the theoretical F: qqthin(qf(ppoints(7129), 2, 28), qf(1-GolubB.maxT$rawp, 2, 28)) # qqthin() is a version of qqplot() that thins out points where # overlap is substantial, thus giving smaller graphics files. ## Selection of features that discriminate ## ss 12.3.3: Accuracies and Scores for test data attach(golubInfo) Golub.BM <- Golub[, BM.PB=="BM"] cancer.BM <- cancer[BM.PB=="BM"] ## Now split each of the cancer.BM categories between two subsets ## Uses divideUp(), from hddplot gp.id <- divideUp(cancer.BM, nset=2, seed=29) # Set seed to allow exact reproduction of the results below detach(golubInfo) table(gp.id, cancer.BM) accboth <- accTrainTest(x = Golub.BM, cl = cancer.BM, traintest=gp.id) ## Footnote code ## Use function plotTrainTest() from hddplot plotTrainTest(x=Golub.BM, nfeatures=c(14,10), cl=cancer.BM, traintest=gp.id) rbind(accboth$sub1.2[1:20],accboth$sub2.1[1:20]) match(accboth$sub1.2[1:20],accboth$sub2.1[1:20]) ## Cross-validation to determine the optimum number of features ## Cross-validation to determine the optimum number of features ## Accuracy measure will be: tissue.mfB.cv$acc.cv tissue.mfB.cv <- cvdisc(GolubB, cl=tissue.mfB, nfeatures=1:23, nfold=c(10,4)) # 10-fold CV (x4) ## Defective measures will be in acc.resub (resubstitution) ## and acc.sel1 (select features prior to cross-validation) tissue.mfB.badcv <- defectiveCVdisc(GolubB, cl=tissue.mfB, foldids=tissue.mfB.cv$folds, nfeatures=1:23) ## NB: Warning messages have been omitted ## ## Calculations for random normal data: set.seed(43) rGolubB <- matrix(rnorm(prod(dim(GolubB))), nrow=dim(GolubB)[1]) rtissue.mfB.cv <- cvdisc(rGolubB, cl=tissue.mfB, nfeatures=1:23, nfold=c(10,4)) rtissue.mfB.badcv <- defectiveCVdisc(rGolubB, cl=tissue.mfB, nfeatures=1:23, foldids=rtissue.mfB.cv$folds) ## Which features? genelist <- matrix(tissue.mfB.cv$genelist[1:3, ,], nrow=3) tab <- table(genelist, row(genelist)) ord <- order(tab[,1], tab[,2], decreasing=TRUE) tab[ord,] ## Cross-validation: bone marrow (\texttt{BM}) samples only BMonly.cv <- cvdisc(Golub.BM, cl=cancer.BM, nfeatures=1:25, nfold=c(10,4)) ## ss 12.3.4: Graphs derived from the cross-validation process ## Panel A: Uses tissue.mfB.acc from above tissue.mfB.scores <- cvscores(cvlist = tissue.mfB.cv, nfeatures = 3, cl.other = NULL) scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL, prefix="B-cell subset -") ## Footnote code BMonly.scores <- cvscores(cvlist=BMonly.cv, nfeatures=19, cl.other=NULL) scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB, circle=tissue.mfB%in%c("BM:f","BM:m"), params=list(circle=list(col=c("cyan","gray"))), prefix="B: BM samples -") ## Further comments ## Sec 12.4: Further Reading library(DAAG); library(oz) oz(sections=c(3:5, 11:16)) attach(possumsites) points(latitude, longitude) posval <- c(2, 4, 2, 2, 4, 2, 2) text(latitude, longitude, row.names(possumsites), pos=posval)