## High-dimensional data, classification, and plots ## What groups are of interest? data(Golub) data(golubInfo) # 7129 rows by 72 columns with(golubInfo, table(cancer, tissue.mf)) attach(golubInfo) ## Identify allB samples for that are BM:f or BM:m or PB:m subsetB <- cancer=="allB" & tissue.mf%in%c("BM:f","BM:m","PB:m") ## Form vector that identifies these as BM:f or BM:m or PB:m tissue.mfB <- tissue.mf[subsetB, drop=TRUE] ## Separate off the relevant columns of the matrix Golub GolubB <- Golub[, subsetB] detach(golubInfo) ## ss 12.3.1: Classifications and associated graphs ## Preliminary data manipulation ## Footnote code ## Try e.g. ## Random selection of 20 rows (features) boxplot(data.frame(t(GolubB[sample(1:7129, 20), ])), las=2) ## ss 12.3.2: Flawed graphs ## Footnote code ## Uses orderFeatures() (hddplot); see below ord15 <- orderFeatures(GolubB, cl=tissue.mfB)[1:15] ## Footnote code dfB.ord <- data.frame(t(GolubB[ord15, ])) ## Calculations for the left panel ## Transpose to observations by features dfB15 <- data.frame(t(GolubB[ord15, ])) library(MASS) dfB15.lda <- lda(dfB15, grouping=tissue.mfB) scores <- predict(dfB15.lda, dimen=2)$x ## Scores for the single PB:f observation attach(golubInfo) df.PBf <- data.frame(t(Golub[ord15, tissue.mf=="PB:f" & cancer=="allB", drop=FALSE])) scores.PBf <- predict(dfB15.lda, newdata=df.PBf, dimen=2)$x detach(golubInfo) ## Warning! The plot that now follows may be misleading! ## Use scoreplot(), from the hddplot package scoreplot(scorelist=list(scores=scores, cl=tissue.mfB, other=scores.PBf, cl.other=factor("PB:f"), nfeatures=15), params=list(other=list(cex=1.2,col=4,pch=4))) simscores <- simulateScores(nrows=7129, cl=rep(1:3, c(19,10,2)), cl.other=4, nfeatures=15, seed=41) # Returns list elements: scores, cl, scores.other & cl.other with(simscores, scoreplot(list(scores=scores, cl=cl, other=scores.other, cl.other=cl.other), params=list(other=list(cex=0.9,col=4,pch=4)))) ## Footnote code ## Use the function simulate.scores() ## Alternatively, set: ## dimen <- dim(GolubB); rsetB <- array(rnorm(prod(dimen)), dim=dimen) ## Then replace GolubB with rsetB, and repeat the lines above. ## Distributional extremes ## The calculation may take tens of minutes, even with adequate ## memory (e.g., 512MB) and a fast processor. ## If necessary, use a smaller value of B. library(multtest) GolubB.maxT <- mt.maxT(GolubB, tissue.mfB, test="f", B=100000) ## Compare calculated F-statistics with permutation distribution qqthin(qf(1-GolubB.maxT$rawp, 2, 28), GolubB.maxT$teststat) ## Compare calculated F-statistics with theoretical F-distribution qqthin(qf(ppoints(7129), 2, 28), GolubB.maxT$teststat) # The theoretical F-distribution gives estimates of quantiles # that are too small ## NB also (not included in Figure 11.10) the comparison between ## the permutation distribution and the theoretical F: qqthin(qf(ppoints(7129), 2, 28), qf(1-GolubB.maxT$rawp, 2, 28)) # qqthin() is a version of qqplot() that thins out points where # overlap is substantial, thus giving smaller graphics files. ## The function \texttt{order.features()} ## Footnote code ## Simplified version. The hddplot version has an additional parameter ## subset (to extract a subset of observations) "simpleOrderFeatures" <- function(x, cl){ ## Ensure that cl is a factor & has no redundant levels cl <- factor(cl) stat <- apply(x, 1, function(y)anova(lm(y~cl))[["F value"]][1]) order(stat, decreasing=TRUE) # Require largest first } ## ss 12.3.3: Accuracies and Scores for test data attach(golubInfo) Golub.BM <- Golub[, BM.PB=="BM"] cancer.BM <- cancer[BM.PB=="BM"] ## Now split each of the cancer.BM categories between two subsets ## Uses divvyup(), from hddplot gp.id <- divvyup(cancer.BM, nset=2, seed=31) # Set seed to allow exact reproduction of the results below detach(golubInfo) table(gp.id, cancer.BM) accboth <- accTrainTest(x = Golub.BM, cl = cancer.BM, nfeatures=NULL, print.acc=FALSE) ## Footnote code ## Use function plotTrainTest() from hddplot plotTrainTest(x=Golub.BM, nfeatures=c(7,12), cl=cancer.BM, traintest=gp.id) gp.id <- divvyup(cl=cancer.BM, nset=2, seed=NULL) rbind(accboth$sub1.2[1:12],accboth$sub2.1[1:12]) ## Find the order of the first list in the second, if present match(accboth$sub1.2[1:12],accboth$sub2.1[1:12]) ## Cross-validation to determine the optimum number of features tissue.mfB.cv <- cvdisc(GolubB, cl=tissue.mfB, nfeatures=1:26, nfold=c(10,4)) # 10-fold CV, repeat 4 times # Accuracy measures is: tissue.mfB.cv$acc.cv ## Calculate defective accuracy measures, for comparison tissue.mfB.badcv <- defectiveCVdisc(GolubB, cl=tissue.mfB, foldids=tissue.mfB.cv$folds, nfeatures=1:26) # Resubstitution (red points): tissue.mfB.badcv$acc.resub # Select once (gray): tissue.mfB.badcv$acc.sel1 ## ## Calculations for random normal data: set.seed(43) rGolubB <- matrix(rnorm(prod(dim(GolubB))), nrow=dim(GolubB)[1]) rtissue.mfB.cv <- cvdisc(rGolubB, cl=tissue.mfB, nfeatures=1:26, nfold=c(10,4)) rtissue.mfB.badcv <- defectiveCVdisc(rGolubB, cl=tissue.mfB, nfeatures=1:26, foldids=rtissue.mfB.cv$folds) ## Cross-validation: bone marrow (\texttt{BM}) samples only BMonly.cv <- cvdisc(Golub.BM, cl=cancer.BM, nfeatures=1:20, nfold=c(10,4)) ## ss 12.3.4: Graphs derived from the cross-validation process ## Make each fold of each repeat a separate column genelist <- matrix(tissue.mfB.cv$genelist[1:3, ,], nrow=3) tab <- table(genelist, row(genelist)) ord <- order(apply(tab,1,sum), decreasing=TRUE) tab[ord,] ## Panel A: Uses tissue.mfB.cv from above tissue.mfB.scores <- cvscores(cvlist = tissue.mfB.cv, nfeatures = 3, cl.other = NULL) scoreplot(scorelist = tissue.mfB.scores, cl.circle=NULL, prefix="B-cell subset -") ## Footnote code BMonly.scores <- cvscores(cvlist=BMonly.cv, nfeatures=13, cl.other=NULL) scoreplot(scorelist=BMonly.scores, cl.circle=tissue.mfB, circle=tissue.mfB%in%c("BM:f","BM:m"), params=list(circle=list(col=c("cyan","gray"))), prefix="B: BM samples -")