## Chapter 14 ## Sec 14.1: Working Directories, Workspaces and the Search List ## ss 14.1.1: *The search path search() ## ss 14.1.2: Workspace management xy <- matrix(rnorm(60000), nrow=600) xy.rowrange <- apply(xy, 1, range) save(xy, xy.rowrange, file="xy.RData") rm(xy, xy.rowrange) ## attach("c:/r/ch1/.RData") # Microsoft Windows systems ## ls(pos=2) # Check names of available objects ## Changing the Working Directory and/or Workspace ## ss 14.1.3: Utility functions dir() # List files in the working directory file.choose() # Choose a file interactively sessionInfo() # Print version numbers for R and for # attached packages R.home() # Give the path to the R home directory .Library # Path to the default library. Sys.getenv() # Get settings of environment variables dir() Sys.getenv("R_HOME") ## Sec 14.2: Data Input and Output library(DAAG) datafile("oneBadRow") datafile("scan-demo") readLines("oneBadRow.txt") # all lines of file readLines("oneBadRow.txt", n=3) # First 3 lines only ## ss 14.2.1: Input of data ## The function {\em\texttt{read.table()}} and its variants ## Tracking errors in input data nfields <- count.fields("oneBadRow.txt") nfields # Number of fields, for each row (1:length(nfields))[nfields == 4] ## The following (the default) inputs all lines of the file readLines("oneBadRow.txt", n=-1) ## Input of fixed format data ## Input of large rectangular files scan("oneBadRow.txt", skip = 1, quiet= TRUE) scan("scan-demo.txt", skip = 1, quiet= TRUE, what="") data.frame(scan("scan-demo.txt", skip = 1, quiet= TRUE, ## Example -- input of the Boston housing data readLines("bostonc.txt", n=11)[10:11] strsplit(readLines("bostonc.txt", n=11)[10:11], split="\t") boston <- scan("bostonc.txt", n=-1, sep="\t", skip=10, what=c(list(1,""), as.list(rep(1,19)))) colnams <- scan("bostonc.txt", skip=9, n=21, what="") boston <- data.frame(boston) names(boston) <- colnams boston <- read.table("bostonc.txt", sep="\t", skip=9, comment.char="", header=TRUE) ## Importing data from other statistical systems ## Database connections ## ss 14.2.2: Data output ## Output of data frames write.table(fossilfuel, file="fuel.txt") ## Redirection of screen output to a file fossilfuel # Below, this will be written to the file sink("fuel2.txt") fossilfuel # NB: No output on screen sink() ## Output to a file using {\em\texttt{cat()}} ## Sec 14.3: Functions and operators -- Some Further Details ## Why collect code into functions? ## Issues for the writing and use of functions ## ss 14.3.1: Function arguments ## The {\em\texttt{args()}} function args(write.table) # Version 2.2.1 of R ## The\texttt{...}argument x <- c(1,5,7); z <- c(4,9,10,NA); u <- 1:10 # Now do calculations that use x, z and u rm(x, z, u) ## ss 14.3.2: Character string and vector functions substring("abracadabra",3, 8) # Extract characters 3 to 8 nchar("abracadabra") # Count the number of characters strsplit("abracadabra", "r") # Split wherever "r" appears strsplit("abcd", split="") # Split into separate characters paste("ab","c","d", sep="") # Join together paste(c("a","b","c"), collapse="") # Join vector elements ## ss 14.3.3: Anonymous functions ssfun <- function(x)sum(x^2) sapply(elastic1, ssfun) # elastic1 is from the DAAG package sapply(elastic1, function(x)sum(x^2)) growthfun <- function(x)(x[9] - x[1])/x[1] sapply(austpop[, -c(1,10)], growthfun) sapply(austpop[, -c(1,10)], function(x)(x[9] - x[1])/x[1]) ## ss 14.3.4: Functions for working with dates (and times) # Electricity Billing Dates dd <- as.Date(c("2003-08-24","2003-11-23","2004-02-22","2004-05-03")) diff(dd) as.Date("1/1/1960", format="%d/%m/%Y") as.Date("1:12:1960",format="%d:%m:%Y") as.Date("1960-12-1")-as.Date("1960-1-1") as.Date("31/12/1960","%d/%m/%Y") as.integer(as.Date("1/1/1970","%d/%m/%Y")) as.integer(as.Date("1/1/2000","%d/%m/%Y")) dec1 <- as.Date("2004-12-1") format(dec1, format="%b %d %Y") format(dec1, format="%a %b %d %Y") ## Labeling of graph: data frame jobs (DAAG) Jobs <- stack(jobs, select=-7) startofmonth <- seq(from=as.Date("1Jan1990", format="%d%b%Y"), by="1 month", length=24) Jobs$Date <- rep(startofmonth, 6) names(Jobs) <- c("Number", "Province", "Date") atdates <- seq(from=as.Date("1Jan1990", format="%d%b%Y"), by="3 month", length=8) datelabs <- format(atdates, "%b%y") xyplot(Number ~ Date, groups=Province, data=Jobs, scale=list(x=list(at=atdates, labels=datelabs)), auto.key=list(columns=6, between=1)) ## ss 14.3.5: Creating Groups library(MASS) catBP <- cut(Pima.tr2$bp, breaks=4) table(catBP) sum(table(catBP)) table(is.na(catBP)) ## ss 14.3.6: Logical operators c(TRUE, TRUE, FALSE) & c(FALSE, TRUE, FALSE) c(TRUE, TRUE, FALSE) && c(FALSE, TRUE, FALSE) ## ss 14.3.7: A vector form of \texttt{if} statement ## ss 14.3.8: Operators are functions "+"(2, 3) ## Sec 14.4: Factors fac <- factor(c("c", "b", "a")) for (i in fac) print(i) ## Ordered factors stress.level <- rep(c("low","medium","high"), 2) ordf.stress <- ordered(stress.level, levels=c("low", "medium", "high")) ordf.stress ordf.stress < "medium" ordf.stress >= "medium" ## Factor contrasts contr.treatment(3) contr.sum(3) cities <- factor(c("Melbourne","Sydney","Adelaide")) contr.sum(cities) ## Contrasts -- implications for model fitting ## *Tests for main effects in the presence of interactions? ## Sec 14.5: Missing Values x <- c(1, 6, 2, NA) is.na(x) # TRUE for NAs, otherwise FALSE x == NA # All elements are set to NA NA == NA x <- c(1, NA, 6, 2, 10) x > 4 # The second element will be NA x[x>4] # NB: This generates a vector of length 3 x[x > 4] <- c(101, 102) x[!is.na(x) & x > 4] <- c(101, 102) x ## Counting and identifying {\rm NA}s -- the use of \texttt{table()} x <- factor(x, exclude=NULL) library(MASS) catBP <- cut(Pima.tr2$bp, breaks=4) table(factor(catBP, exclude=NULL)) sum(table(factor(catBP, exclude=NULL))) ## The removal of NAs ## {\em\texttt{NA}}s in modeling functions options()$na.action # Version 2.1.0, following startup ## Sorting and ordering, where there are NAs x <- c(1, 20, 2, NA, 22) order(x) # By default, na.last=TRUE x[order(x)] sort(x) # By default na.last=NA sort(x, na.last=TRUE) ## Sec 14.6: {\kern-4pt}$^*$ Matrices and Arrays xx <- matrix(1:6, ncol=3) # Equivalently, enter xx dim(xx) ## Use as.vector() x <- as.vector(xx) ## Alternatively, directly remove the dimension attribute x <- xx dim(x) <- NULL ## The extraction of submatrices x34 <- matrix(1:12, ncol=4) x34 x34[2:3, c(1,4)] # Extract rows 2 & 3 and columns 1 & 4 x34[-2, ] # Extract all rows except the second x34[-2, -3] # Omit row 2 and column 3 x34[2, ] # The dimension attribute is dropped x34[2, , drop=FALSE] # Retain the dimension attribute ## Conversion of data frames and tables into matrices ## ss 14.6.1: Matrix arithmetic ## Set up example matrices G, H and B G <- matrix(1:12, nrow=4); H <- matrix(112:101, nrow=4); B <- 1:3 G + H # Element-wise addition (X & Y both n by m) G * H # Element-wise multiplication G %*% B # Matrix multiplication (X is n by k; B is k by m) ## Set up example matrices X (square, full rank) and Y X <- matrix(c(1,1,1, -1,0,1, 2,4,2), nrow=3) Y <- matrix(1:6, nrow=3) solve(X, Y) # Solve XB = Y (X must be square) ## Computational Efficiency xy <- matrix(rnorm(500000),ncol=50) dim(xy) system.time(xy+1) xy.df <- data.frame(xy) system.time(xy.df+1) ## ss 14.6.2: Outer products outer(1:4, 1:10) rbgshades <- outer(c("red","blue","green"), c("",paste(1:4)), function(x,y)paste(x,y, sep="")) rbgshades # Display the matrix plot(rep(0:4, rep(3,5)), rep(1:3, 5), col=rbgshades, pch=15, cex=8) ## ss 14.6.3: Arrays x <- 1:24 dim(x) <- c(4, 6) x dim(x) <- c(3, 4, 2) x ## Sec 14.7: Manipulations with Lists, Data Frames and Matrices ## ss 14.7.1: Lists -- an extension of the notion of ``vector'' list1 <- list(1:3, 15, c("win","susan")) list1[c(3,1,2)] ## Return list whose only element is the vector c("win","susan") list1[3] ## Return the vector c("win","susan") list1[[3]] ## The dual identity of data frames xyz <- data.frame(x=1:4, y=11:14, z=I(letters[1:4])) xyz[1,] # Returns a data frame, i.e., a list) xyz[1,, drop=TRUE] unlist(xyz[1,]) # Returns, here, a vector of atomic mode unlist(xyz[1,, drop=TRUE]) ## ss 14.7.2: Changing the shape of data frames ## ss 14.7.3: The {\em \texttt{reshape()}} function Jobs <- reshape(jobs, varying=list(names(jobs)[1:6]), v.names="Number", times=names(jobs)[1:6], timevar="Region", direction="long") head(Jobs, 3) reshape(Jobs, v.names="Number", timevar="Region", direction="wide") ## ss 14.7.4: {\kern-4pt}$^*$ Merging data frames -- {\em\texttt{merge()}} new.Cars93 <- merge(x=Cars93, y=Cars93.summary[, "abbrev", drop=F], by.x="Type", by.y="row.names") ## ss 14.7.5: Joining data frames, matrices and vectors -- {\em\texttt{cbind()}} ## Conversion of tables and arrays into data frames ## ss 14.7.6: The {\em\texttt{apply}} family of functions ## The {\em\texttt{tapply()}} function ## Footnote code ## The need to attach kiwishade can be avoided by use of with() with(kiwishade, tapply(yield, INDEX=list(block, shade), FUN=mean)) with(kiwishade, aggregate(yield, by=list(block, shade), FUN=mean)) ## Compare tapply() with aggregate(): data frame kiwishade (DAAG) attach(kiwishade) kiwimeans <- tapply(yield, INDEX=list(block, shade), FUN=mean) kiwimeans <- aggregate(yield, by=list(block, shade), FUN=mean) detach(kiwishade) ## The {\em\texttt{apply()}} function ## The functions {\em\texttt{lapply()}} and {\em\texttt{sapply()}} ## Uses data frame rainforest (DAAG) sapply(rainforest[, -7], range, na.rm=TRUE) ## ss 14.7.7: Splitting vectors and data frames into lists -- {\em\texttt{split()}} ## Split data frame cabbage (MASS) into list library(MASS) split(cabbages$HeadWt, cabbages$Date) split(cabbages[, -1], cabbages$Date) # Split remaining columns # by levels of Date ## ss 14.7.8: Multivariate time series jobts <- ts(jobs[,1:6], start=1995, frequency=12) colnames(jobts) ## Subseries through to the third month of 1995 window(jobts, end=1995+2/12) # Rows are 1995+0/12, 1990+1/12, 1990+2/12 plot(jobts, plot.type="single") # Use one panel for all plot(jobts, plot.type="multiple") # Separate panels. ## Sec 14.8: Classes and Methods print ## ss 14.8.1: Printing and summarizing model objects elastic.lm <- lm(distance ~ stretch, data=elasticband) elastic.sum <- summary(elastic.lm) ## ss 14.8.2: Extracting information from model objects names(elastic.lm) elastic.lm$coefficients elastic.lm[["coefficients"]] elastic.lm[[1]] coef(elastic.lm) ## ss 14.8.3: S4 classes and methods x <- 1:5 class(x) <- "lm" # Inappropriate assignment of class print(x) library(lme4) hp.lmList <- lmList(o2 ~ wattsPerKg | id, data=humanpower1) slotNames(hp.lmList) slot(hp.lmList, "call") hp.lmList@call ## Sec 14.9: Manipulation of Language Constructs ## ss 14.9.1: Model and graphics formulae plot.mtcars <- function(xvar="disp", yvar="hp"){ attach(mtcars) mt.txt <- paste(yvar, "~", xvar) plot(formula(mt.txt)) } ## Plot using data frame mtcars (datasets) names(mtcars) plot.mtcars(xvar="disp", yvar="mpg") ## Extraction of Variable Names from Formula Objects all.vars(mpg ~ disp) plot.mtcars <- function(form = mpg~disp){ yvar <- all.vars(form)[1] xvar <- all.vars(form)[2] ## Include information that allows a meaningful label mtcars.info <- c(mpg= "Miles/(US) gallon", cyl= "Number of cylinders", disp= "Displacement (cu.in.)", hp= "Gross horsepower", drat= "Rear axle ratio", wt= "Weight (lb/1000)", qsec= "1/4 mile time", vs= "V/S", am= "Transmission (0 = automatic, 1 = manual)", gear= "Number of forward gears", carb= "Number of carburettors") xlab <- mtcars.info[xvar] ylab <- mtcars.info[yvar] plot(form, xlab=xlab, ylab=ylab) } ## ss 14.9.2: The use of a list to pass parameter values mean(rnorm(20)) do.call("mean", args=list(x=rnorm(20))) "simulate.distribution" <- function(distn="weibull", parms=list(n=10)){ ## Simulates one of: weibull, gaussian, logistic if(distn=="weibull" & !("shape" %in% names(parms))) parms <- c(shape=1, parms) ## weibull requires a default shape parameter. ## ===================================================== ## Choose the function that will generate the random sample rfun <- switch(distn, weibull = rweibull, normal = rnorm, logistic= rlogis) ## Call rfun(). Use of do.call() makes it possible to give ## the parameter list as a list of named values. ## Pass shape parameter (or NULL), plus n = # of numbers do.call("rfun", args=parms) } simulate.distribution() plot(density(simulate.distribution("normal", parms=list(n=100)))) plot(density(simulate.distribution("weibull", parms=list(n=100, scale=0.5)))) mean.call <- call("mean", x=rnorm(5)) eval(mean.call) eval(mean.call) mean.call ## ss 14.9.3: Expressions local(a+b*x+c*x^2, envir=list(x=1:4, a=3, b=5, c=1)) ## ss 14.9.4: Environments test <- function(){ fname <- as.character(sys.call()) fname } test() newtest <- test # Create a copy with a different name newtest() ## Automatic naming of a file that holds function output gf <- function(width=2.25, height=2.25, color=F, pointsize=8){ funtxt <- sys.call(1) fnam <- paste(funtxt, ".pdf", sep="") print(paste("Output will be to the file '", fnam, "'", sep="")) pdf(file=fnam, width=width, height=height, pointsize= pointsize) } fig1 <- function(){ gf() # Call with default parameters curve(sin, -pi, 2*pi) dev.off() } ## ss 14.9.5: Function environments, and lazy evaluation ## Lazy evaluation lazyfoo <- function(x=4, y = x^2)y lazyfoo() lazyfoo(x=3) lazyfoo(y=x^3) # We specify a value for y in the function call x <- 9 lazyfoo(y=x^3) ## Example -- a function that identifies objects added during asession dsetnames <- objects() additions <- function(objnames = dsetnames){ newnames <- objects(envir=.GlobalEnv) existing <- newnames %in% objnames newnames[!existing] } additions(dsetnames) ## Sec 14.10: Document Preparation --- \texttt{Sweave()} and \texttt{xtable()} Sweave("sec1-1.Rnw") # actually, Sweave("sec1-1") is sufficient ## Sec 14.11: Graphs in R ## ss 14.11.1: Hardcopy graphics devices pdf(file="fossilfuel.pdf", width=6.5, height=6.5, pointsize=18) plot(Carbon ~ Year, data=fossilfuel, pch=16) dev.off() plot(carbon ~ year, data=fossilfuel, pch=16) # Display graph ## Now open pdf device and copy graph to it dev.copy(pdf, file="fossilcopy.pdf") dev.off() # Close pdf device ## ss 14.11.2: Multiple graphs on a single graphics page ## ss 14.11.3: Plotting characters, symbols, line types and colors ## Font families ## Symbols ## Draw red triangle plot(0:3, 0:3, type="n") polygon(c(1,1.5,2), c(1,2,1), col="red", border="black") # It was sufficient to specify 2 sides (out of 3) only ## Plot normal density; show area under curve in red curve(dnorm, from=-3.25, to=3.25, yaxs="i", ylim=c(0, 1.025*dnorm(0)), bty="n") ## Fill each 2.5% tail in gray x <- seq(from=1.96, to=3.25, length.out=20) polygon(c(x[1], x, 3.25), c(0, dnorm(x), 0), col="gray") polygon(-c(x[1], x, 3.25), c(0, dnorm(x), 0), col="gray") ## Colors ## Contour and filled contour plots ## Formatting and plotting of text and equations ## Plot data from data frame rainforest (DAAG) Acmena <- subset(rainforest, species=="Acmena smithii") maintitle <- expression(italic("Acmena smithii") * "(Lilly Pilly)") plot(wood ~ dbh, data = Acmena, main = maintitle) ## Symbolic substitution of symbols in an expression ## Alternative, demonstrating symbolic substitution maintitle <- substitute(italic(tx) * ": " * "wood vs dbh", list(tx="Acmena smithii"))) ## Plotting expressions in parallel ## Sample from bivariate normal (x, y) with correlation rho rho <- 0.75; x <- rnorm(40); y <- rnorm(40) + rho/sqrt(1-rho^2)*x ## Now map all values of x onto 3 integer values x0 <- cut(x, breaks=c(-Inf,-1,1,Inf)) plot(unclass(x0), y, xaxt="n") axis(1, at=1:3, labels=expression("(-Inf,-1]", "(-1,1]", "(1, Inf]")) ## *Symbolic substitution in parallel ## Plot Acmena smithii data (subset of rainforest, from DAAG) Acmena <- subset(rainforest, species=="Acmena smithii") plot(wood~dbh, data=Acmena) b <- coef(lm(log(wood) ~ log(dbh), data=Acmena, subset=dbh<40)) curve(exp(b[1])*x^b[2], add=TRUE) b <- round(b,3) arg1 <- bquote(italic(y) == .(A) * italic(x)^.(B), list(A=b[1], B=b[2])) arg2 <- quote("where " * italic(y) * "=wood; " * italic(x) * "=dbh") legend("topleft", legend=do.call("expression", c(arg1, arg2)), bty="n") ## Footnote code plot(wood~dbh, data=Acmena, xlim=c(0, max(Acmena$dbh))) ## In fitting the curve, omit the point where dbh is largest b <- coef(lm(log(wood) ~ log(dbh), data=Acmena, subset=dbh < 40)) big2 <- sort(Acmena$dbh, decreasing=TRUE)[2:1] curve(exp(b[1])*x^b[2], from=min(Acmena$dbh), to=big2[1], add=TRUE) curve(exp(b[1])*x^b[2], from=big2[1], to=big2[2], lty=2, add=TRUE) ## Finally, use the code given above to add the legend and the title. ## A further use for \texttt{substitute()} testfun <- function(x, y){ xpar <- deparse(substitute(x)) xpar} testfun(x = AnyValidName) ## Sec 14.12: Lattice Graphics, and the \textit{grid} Package ## Default \textit{lattice} graphics parameter settings names(trellis.par.get()) xyplot(csoa ~ it, groups=agegp, data=tinting, type=c("p","smooth"), par.settings=list(superpose.symbol=list(pch=c(1, 16)), superpose.line=list(col=c(8, 1))), auto.key=list(points=TRUE, lines=TRUE)) ## Annotation -- the \texttt{auto.key}, \texttt{key} and\texttt{legend} arguments ## ss 14.12.1: Interaction with plots ## Use of xyplot(): data frame tinting (DAAG) library(lattice); library(grid) xyplot(it ~ csoa | sex, data=tinting) trellis.focus("panel", column=1, row=1) panel.identify(labels=as.character(tinting$target)) ## Now click near points as required. ## Terminate by right clicking inside the panel. ## Now interact with panel 2 trellis.focus("panel", row=1, column=2) panel.identify(labels=as.character(tinting$target)) ## Click, ..., and right click ## ss 14.12.2: *Use of \texttt{grid.text()} to label points ## Add labels to lattice plot: data frame primates (DAAG) library(grid) xyplot(Brainwt ~ Bodywt, data=primates) trellis.focus("panel", row=1, column=1, clip.off=TRUE) xyetc <- trellis.panelArgs() grid.text(label=row.names(primates), x=xyetc$x, y=xyetc$y, default.units="native", just="left") trellis.unfocus() ## ss 14.12.3: *Multiple lattice graphs on a graphics page ## Two lattice graphs on one page: data frame cuckoos (DAAG) library(grid) trellis.par.set(layout.heights=list(key.top=0.25, axis.top=0.5, bottom.padding=0.15)) cuckoos.strip <- stripplot(species ~ length, xlab="", data=cuckoos, legend=list(top=list(fun=textGrob, args=list(label="A", x=0)))) print(cuckoos.strip, position=c(0,.475,1,1)) # Left bottom corner is (0,.475); right upper corner is (1,1) cuckoos.bw <- bwplot(species~length, data=cuckoos, xlab="Egg length (mm)", legend=list(top=list(fun=textGrob, args=list(label="B", x=0)))) print(cuckoos.bw, newpage=FALSE, position=c(0,0,1,.525)) frame() par(fig=c(0, 1, .495, 1)) # (xl, xu, yl, yu) mtext(side=3, line=2.5, "A", adj=-0.075) par(fig=c(0, 1, 0, .505), new=TRUE) # Specify new=TRUE for a new plot on the existing page mtext(side=3, line=2.5, "B", adj=-0.075) cuckoos.strip <- stripplot(species ~ length, xlab="", data=cuckoos) print(cuckoos.strip, newpage=FALSE, position=c(0,.495,1,1)) # (xl, yl, xu, yu) cuckoos.bw <- bwplot(species~length, data=cuckoos, xlab="Egg length (mm)") print(cuckoos.bw, newpage=FALSE, position=c(0,0,1,.505)) frame() ## Sec 14.13: Further Reading citation() citation("DAAG") ## Vignettes vignette() # All vignettes in all installed packages vignette(package="grid") vignette("grid") # Equivalent to vignette(topic="grid") ## Use of split() and sapply(): data frame science (DAAG) with(science, sapply(split(school, PrivPub), function(x)length(unique(x)))) ### Thanks to Markus Hegland (ANU), cwho wrote the initial version ##1 Generate the data cat("generate data \n") n <- 800 # length of noise vector m <- 100 # length of signal vector xsignal <- runif(m) sig <- 0.01 enoise <- rnorm(m)*sig ysignal <- xsignal**2+enoise maxys <- max(ysignal) minys <- min(ysignal) x <- c(runif(n), xsignal) y <- c(runif(n)*(maxys-minys)+minys, ysignal) # random permutation of the data vectors iperm <- sample(seq(x)) x <- x[iperm] y <- y[iperm] # normalize the data, i.e., scale x & y values to # lie between 0 & 1 xn <- (x - min(x))/(max(x) - min(x)) yn <- (y - min(y))/(max(y) - min(y)) ##1 End ##2 determine number of neighbors within # a distance <= h = 1/sqrt(length(xn)) nx <- length(xn) # determine distance matrix d <- sqrt( (matrix(xn, nx, nx) - t(matrix(xn, nx, nx)) )**2 + (matrix(yn, nx, nx) - t(matrix(yn, nx, nx)) )**2 ) h <- 1/sqrt(nx) nnear <- apply(d <= h, 1, sum) ##2 End ##3 Plot data, with reconstructed signal overlaid. cat("produce plots \n") # identify the points which have many such neighbors ns <- 8 plot(x,y) points(x[nnear > ns], y[nnear > ns], col="red", pch=16) ##3 End n <- 10000; system.time(sd(rnorm(n))) ## Sec 14.14: References ## References for further reading ## Sec 14.15: References ## References for further reading