## Chapter 11

## Note
## NOTE(review): spam7, mifem and compareTreecalcs() are not defined in this
## script. The comments below attribute spam7 and mifem to the DAAG package;
## presumably DAAG must be attached before running this code -- confirm.

## Sec 11.1: The Uses of Tree-based Methods

## ss 11.1.1: Problems for which tree-based regression may be used

## When are tree-based methods appropriate?

## Sec 11.2: Detecting Email Spam -- an Example

## Footnote code
## Sample rows from data frame spam7 (DAAG)
## NOTE(review): this heading previously said "50 rows" while the call below
## draws 500; confirm which sample size is intended.
## To get another sample and to obtain one of the plots, type:
## (row indices come from the data frame itself rather than a hard-coded count)
spam.sample <- spam7[sample(seq_len(nrow(spam7)), 500, replace = FALSE), ]
boxplot(split(spam.sample$crl.tot, spam.sample$yesno))

library(rpart)
spam.rpart <- rpart(
  formula = yesno ~ crl.tot + dollar + bang + money + n000 + make,
  method = "class",
  data = spam7
)
plot(spam.rpart)  # Draw tree
text(spam.rpart)  # Add labeling

## ss 11.2.1: Choosing the number of splits

## Sec 11.3: Terminology and Methodology

## Footnote code
## Code to plot such a tree is
Criterion <- factor(paste("Leaf", 1:5))
Node <- c(1, 2, 3, 4, 5)
demo.df <- data.frame(Criterion = Criterion, Node = Node)
demo.rpart <- rpart(
  Node ~ Criterion,
  data = demo.df,
  control = list(minsplit = 2, minbucket = 1)
)
plot(demo.rpart, uniform = TRUE)
text(demo.rpart)

## ss 11.3.1: Choosing the split -- regression trees

## ss 11.3.2: Within and between sums of squares

## ss 11.3.3: Choosing the split -- classification trees

## ss 11.3.4: Tree-based regression versus loess regression smoothing

## loess fit to Mileage vs Weight: data frame car.test.frame (rpart)
car.lo <- loess(Mileage ~ Weight, car.test.frame)
plot(car.lo, xlab = "Weight", ylab = "Miles per gallon")
## Overlay the fitted loess curve, evaluated on a grid of weights
lines(
  seq(1850, 3850),
  predict(car.lo, data.frame(Weight = seq(1850, 3850)))
)

## Regression tree with explicit control settings (smaller minsplit,
## minbucket and cp than the second fit below, which uses rpart's defaults)
car.tree <- rpart(
  Mileage ~ Weight,
  data = car.test.frame,
  control = list(minsplit = 10, minbucket = 5, cp = 0.0001),
  method = "anova"
)
plot(car.tree, uniform = TRUE)
text(car.tree, digits = 3, use.n = TRUE)

## Same model refitted with default control settings; overwrites car.tree
car.tree <- rpart(Mileage ~ Weight, data = car.test.frame)
plot(car.tree, uniform = FALSE)
text(car.tree, digits = 3, use.n = TRUE)

## Sec 11.4: Predictive Accuracy, and the Cost-complexity Tradeoff

## ss 11.4.1: Cross-validation

## ss 11.4.2: The cost-complexity parameter

## ss 11.4.3: Prediction error versus tree size

## Sec 11.5: Data for female heart attack patients
summary(mifem)  # data frame mifem (DAAG)
mifem.rpart <- rpart(outcome ~ ., method = "class", data = mifem, cp = 0.0025)
plotcp(mifem.rpart)   # Cross-validated error vs cp
printcp(mifem.rpart)  # Tabular version of the same information
mifemb.rpart <- prune(mifem.rpart, cp = 0.03)
plot(mifemb.rpart)
## Allow drawing outside the plot region while labeling, then restore the
## previous setting instead of forcing xpd back to FALSE.
opar <- par(xpd = TRUE)  # May be needed so that labels appear
text(mifemb.rpart, use.n = TRUE, digits = 3)
par(opar)

## ss 11.5.1: The one-standard-deviation rule

## ss 11.5.2: Printed Information on Each Split
print(mifemb.rpart)

## Sec 11.6: Detecting Email Spam -- the Optimal Tree
spam7a.rpart <- rpart(
  formula = yesno ~ crl.tot + dollar + bang + money + n000 + make,
  method = "class",
  data = spam7,
  cp = 0.001
)
printcp(spam7a.rpart)

## Footnote code
## Use of prune.rpart() with cp between 0.00276 and 0.00386
## will prune back to nsplit=16. Specify:
spam7b.rpart <- prune(spam7a.rpart, cp = 0.003)
plot(spam7b.rpart, uniform = TRUE)
text(spam7b.rpart, cex = 0.75)

## How does the one standard error rule affect accuracy estimates?
## One row per repeat; each call to compareTreecalcs() fills a 6-column row.
acctree.mat <- matrix(0, nrow = 100, ncol = 6)
for (i in seq_len(nrow(acctree.mat))) {
  acctree.mat[i, ] <- compareTreecalcs(data = spam7, fun = "rpart")
}

## How is the standard error calculated?

## When are tree-based methods appropriate?
## Sec 11.7: The randomForest Package
## NOTE(review): spam7 and compareTreecalcs() are not defined in this script;
## presumably they come from the DAAG package, which must be attached before
## running this code -- confirm.
library(randomForest)
spam7.rf <- randomForest(yesno ~ ., data = spam7, importance = TRUE)
print(spam7.rf)
## Predictors are every column except the response, selected by name rather
## than by a fixed column position.
tuneRF(x = spam7[, names(spam7) != "yesno"], y = spam7$yesno, trace = FALSE)
importance(spam7.rf)

## ss 11.7.1: Comparison between rpart() and randomForest()

## Footnote code
## One row per repeat; the column count follows from the name vector so the
## two cannot drift apart.
acctree.cols <- c(
  "rpSEcvI", "rpcvI", "rpSEtest", "rptest",
  "n.SErule", "nre.min.12", "rfcvI", "rftest"
)
acctree.mat <- matrix(
  0,
  nrow = 100,
  ncol = length(acctree.cols),
  dimnames = list(NULL, acctree.cols)
)
for (i in seq_len(nrow(acctree.mat))) {
  acctree.mat[i, ] <- compareTreecalcs(
    data = spam7,
    fun = c("rpart", "randomForest")
  )
}
acctree.df <- data.frame(acctree.mat)

## Common axis limits over the three plotted columns, so that both panels
## share a scale and the y = x reference line is directly comparable.
lims <- range(acctree.mat[, c("rptest", "rfcvI", "rftest")], na.rm = TRUE)
plot(rfcvI ~ rftest, data = acctree.df, xlim = lims, ylim = lims)
abline(0, 1)  # Panel A
plot(rptest ~ rftest, data = acctree.df, xlim = lims, ylim = lims)
abline(0, 1)  # Panel B

## Efficient computation

## Differences between rpart() and randomForest()

## Sec 11.8: Additional Notes on Tree-Based Methods

## The combining of tree-based methods with other approaches

## Models with a complex error structure

## Pruning as variable selection

## Other types of tree

## Factors as predictors

## Summary of pluses and minuses of tree-based methods

## Sec 11.9: Further Reading