## Chapter 1 ## Sec 1.1: An Overview of R ## ss 1.1.1: A Short R Session ## ss 1.1.2: {\rm R} must be installed! ## ss 1.1.3: Using the console (or command line) window 2+2 2*3*4*5 # * denotes 'multiply' sqrt(10) # the square root of 10 pi # R knows about pi 2*pi*6378 # Circumference of earth at equator (km) ## ss 1.1.4: Entry of data at the command line Year <- c(1800, 1850, 1900, 1950, 2000) Carbon <- c(8, 54, 534, 1630, 6611) ## Now plot Carbon as a function of Year plot(Carbon ~ Year, pch=16) ## ss 1.1.5: Collection of vectors into a data frame fossilfuel <- data.frame(year=Year, carbon=Carbon) fossilfuel # Display the contents of the data frame. rm(Year, Carbon) # These are no longer required plot(carbon ~ year, data=fossilfuel, pch=16) ## ss 1.1.6: Checks on the working directory and the contents of the workspace getwd() ls() ## ss 1.1.7: Quitting {\rm R} q() ## ss 1.1.8: The Uses of R ## {\rm R} offers an extensive collection of functions range(fossilfuel$carbon) ## 4 cities fourcities <- c("Toronto", "Canberra", "New York", "London") ## display in alphabetical order sort(fourcities) ## Find the number of characters in "Toronto" nchar("Toronto") ## Find the number of characters in all four city names at once nchar(fourcities) ## {\rm R} will give numerical or graphical data summaries summary(cars) hist(cars$speed) ## {\rm R} is an interactive programming language celsius <- (0:4)*10 fahrenheit <- 9/5*celsius+32 conversion <- data.frame(Celsius=celsius, Fahrenheit=fahrenheit) print(conversion) ## ss 1.1.9: Online Help help(plot) apropos("sort") # Try, also, apropos ("sor") # This lists all functions whose names include the # character string "sort". help.search("sort") # Note that the argument is 'sort' # This lists all functions that have the word 'sort' as # an alias or in their title. example(image) # for a 2 by 2 layout of the last 4 plots, precede with # par(mfrow=c(2,2)) # To prompt for each new graph, precede with par(ask=TRUE) # When finished, turn off the prompts with par(ask=FALSE) ## Wide-ranging searches ## ss 1.1.10: Further steps in learning R ## Sec 1.2: Data Input, Packages and the Search List ## ss 1.2.1: Reading data from a file fossilfuel <- read.table("fuel.txt", header=TRUE) fossilfuel <- read.table("fuel.csv", header=TRUE, sep=",") ## ss 1.2.2: R Packages library(DAAG) sessionInfo() ## Data sets that accompany {\rm R} packages data(package="datasets") # Specify 'package', not 'library'. ## Installation of additional packages ## Sec 1.3: Vectors, Factors and Univariate Time Series ## ss 1.3.1: Vectors c(2, 3, 5, 2, 7, 1) 3:10 # The numbers 3, 4,..., 10 c(T, F, F, F, T, T, F) c("Canberra", "Sydney", "Canberra", "Sydney") ## ss 1.3.2: Concatenation -- joining vector objects x <- c(2, 3, 5, 2, 7, 1) x y <- c(10, 15, 12) y z <- c(x, y) z ## ss 1.3.3: Subsets of vectors x <- c(3, 11, 8, 15, 12) # Assign to x the values 3, x[c(2,4)] # Elements in positions 2 x[-c(2,3)] # Remove the elements in positions 2 and 3 x > 10 x[x > 10] heights <- c(Andreas=178, John=185, Jeff=183) heights[c("John","Jeff")] ## ss 1.3.4: Patterned data 5:15 seq(from=5, to=22, by=3) # The first value is 5. The final seq(5, 22, 3) rep(c(2,3,5), 4) c(rep("female", 3), rep("male", 2)) ## ss 1.3.5: Missing values seal.lung <- cfseal$lung # Weights (gm) seal.lung seal.lung <- cfseal$lung mean(seal.lung) mean(seal.lung, na.rm=TRUE) seal.lung == NA ## Replace all NAs by -999 (in general, a bad idea) seal.lung[is.na(seal.lung)] <- -999 seal.lung ## There is now no protection against use of the -999 values as ## if they were legitimate numeric values mean(seal.lung) # Illegitimate calculation ## ss 1.3.6: Factors gender <- c(rep("female",691), rep("male",692)) gender <- factor(gender) levels(gender) gender <- factor(gender, levels=c("male", "female")) ## ss 1.3.7: Time series ## Footnote code ## Alternatively, obtain from data frame jobs (DAAG) library(DAAG) numjobs <- jobs$Prairies numjobs <- c(982,981,984,982,981,983,983,983,983,979,973,979, 974,981,985,987,986,980,983,983,988,994,990,999) numjobs <- ts(numjobs, start=1995, frequency = 12) plot(numjobs) first15 <- window(numjobs, end=1996.25) ## Sec 1.4: Data Frames and Matrices Cars93.summary Cars93.summary <- edit(Cars93.summary) ## Display of the first few, or last few, rows of a data frame head(Cars93.summary, n=3) # . . . Cars93.summary[1:3, ] ## Data frames are a specialized type of list ## ss 1.4.1: The attaching of data frames Min.passengers attach(Cars93.summary) # Attach data frame Cars93.summary Min.passengers detach(Cars93.summary) # Detach data frame ## Column and row names ## Subsets of data frames Cars93.summary[1:3, 2:3] # Rows 1-3 and columns 2-3 Cars93.summary[, 2:3] # Columns 2-3 (all rows) Cars93.summary[, c("No.of.cars", "abbrev")] # Cols 2-3, by name Cars93.summary[, -c(2,3)] # omit columns 2 and 3 subset(Cars93.summary, subset=c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE)) # Note correction to code in the text ## Assignment of (new) names to the columns of a data frame names(Cars93.summary)[3] <- "numCars" names(Cars93.summary) <- c("minPass","maxPass","numCars","code") ## ss 1.4.2: Aggregation, stacking and unstacking library(DAAG) head(jobs,3) # . . . Jobs <- stack(jobs, select = 1:6) # stack() concatenates selected data frame columns into a # single column named "values", & adds a factor named "ind" # that has the names of the concatenated columns as levels. head(Jobs,3) # . . . ## ss 1.4.3: {\kern-4pt}$^*$~Data frames and matrices fossilfuelmat <- matrix(c(1800, 1850, 1900, 1950, 2000, 8, 54, 534, 1630, 6611), nrow=5) colnames(fossilfuel) <- c("year", "carbon") fossilfuelmat <- cbind(year=c(1800, 1850, 1900, 1950, 2000), carbon=c(8, 54, 534, 1630, 6611)) ## Sec 1.5: Functions, Operators and Loops ## ss 1.5.1: Built-In Functions all() # returns TRUE if all values are TRUE any() # returns TRUE if any values are TRUE args() # information on the arguments to a function cat() # prints multiple objects, one after the other cumprod() # cumulative product cumsum() # cumulative sum diff() # form vector of first differences # N. B. diff(x) has one less element than x history() # displays previous commands used is.factor() # returns TRUE if the argument is a factor is.na() # returns TRUE if the argument is an NA # NB also is.logical(), is.matrix(), etc. length() # number of elements in a vector or of a list ls() # list names of objects in the workspace mean() # mean of the elements of a vector median() # median of the elements of a vector order() # x[order(x)] sorts x (by default, NAs are last) print() # prints a single R object range() # minimum and maximum value elements of vector sort() # sort elements into order, by default omitting NAs rev() # reverse the order of vector elements str() # information on an R object unique() # form the vector of distinct values which() # locates 'TRUE' indices of logical vectors which.max() # locates (first) maximum of a numeric vector which.min() # locates (first) minimum of a numeric vector with() # do computation using columns of specified data frame ## Calculations in parallel across all elements of a vector ## The handling of missing values mean(c(1, NA, 3, 0, NA), na.rm=T) ## Data summary functions -- {\em\texttt{table()}}and {\em \texttt{sapply()}} table() # Form a table of counts xtabs() # Form a table of totals library(DAAG) # tinting is from DAAG table(Sex=tinting$sex, AgeGroup=tinting$agegp) sapply(jobs[, -7], range) ## The function \texttt{with()} with(tinting, table(Sex=sex, AgeGroup=agegp)) ## Utility functions ## Footnote code ## More generally, note ls(pattern="p") # List object names that include the letter "p" ls(pattern="^p") # List object names that start with "p" # The pattern-matching conventions are those used for grep(), # which is modeled on the Unix grep command. ## ss 1.5.2: Generic Functions, and the class of an object ## ss 1.5.3: User-written functions mean.and.sd <- function(x = rnorm(10)){ av <- mean(x) sdev <- sd(x) c(mean=av, SD=sdev) } mean.and.sd() mean.and.sd(x = c(148,182,173,166,109,141,166)) ## The structure of functions ## Footnote code ## Thus, to return the mean, SD and name of the input vector ## replace c(mean=av, SD=sdev) by list(mean=av, SD=sdev, dataset = deparse(substitute(x))) ## ss 1.5.4: Relational and logical operators and operations attach(fossilfuel) carbon[year==1900] carbon[year!=1900] if (mean(carbon) > median(carbon)) print("Mean > Median") else detach(fossilfuel) distance<- c(148, 182, 173, 166, 109, 141, 166) dist.sort <- if (distance[1] < 150) sort(distance, decreasing=TRUE) else sort(distance) dist.sort ## ss 1.5.5: Selection and matching x <- rep(1:5, rep(3,5)) x x[x %in% c(2,4)] match(x, c(2,4), nomatch=0) ## ss 1.5.6: Functions for working with missing values ## Identification of rows that include missing values ## Which rows have missing values: data frame science (DAAG) science[!complete.cases(science), ] dim(science) Science <- na.omit(science) dim(Science) ## ss 1.5.7: \kern-4pt$^*$ Looping for (i in 1:4) print(i) ## Relative population increase in Australian states: 1917-1997 ## Data frame austpop (DAAG) relGrowth <- numeric(8) # numeric(8) creates a numeric vector # with 8 elements, all set equal to 0 for (j in seq(2,9)) { relGrowth[j-1] <- (austpop[9, j]-austpop[1, j])/ austpop[1, j]} names(relGrowth) <- names(austpop[c(-1,-10)]) # We have used names() to name the elements of relGrowth relGrowth # Output is with options(digits=3) ## Sec 1.6: Graphics in R demo(graphics) ## ss 1.6.1: The function {\em\texttt{plot( )}} and allied functions plot(y ~ x) plot(x, y) ## Adding points, lines, text and axis annotation plot(Brainwt ~ Bodywt, data=primates) ## Place labels on points attach(primates) plot(Brainwt ~ Bodywt, xlim=c(0, 300)) # Specify xlim so that there is room for the labels text(Brainwt ~ Bodywt, labels=row.names(primates), pos=4) # pos=4 places text to the right of the points. Other # possibilities are: 1: below; 2: to the left; 3: above ## Footnote code plot(Brainwt ~ Bodywt, xlim=c(0, 300), ylim=c(0,1500)) text(x=Bodywt, y=Brainwt+c(-.125,0,0,.125,0)*par()$cxy[2], labels=row.names(primates), pos=4) detach(primates) ## Fine control -- parameter settings oldpar <- par(cex=1.25) # Use par(oldpar) to restore # previous settings ## ss 1.6.2: The use of color theta <- (1:50)*0.92 plot(theta, sin(theta), col=1:50, pch=16, cex=4) points(theta, cos(theta), col=51:100, pch=15, cex=4) palette() # Names of the colors in the current palette theta <- (1:50)*0.92 plot(theta, sin(theta), col=colors()[1:50], pch=16, cex=4) points(theta, cos(theta), col=colors()[51:100], pch=15, cex=4) ## ss 1.6.3: The importance of aspect ratio ## Plot sin(theta) vs theta, at regularly spaced values of theta ## sin() expects angles to be in radians # multiply angles in degrees by pi/180 to get radians plot((0:20)*pi/10, sin((0:20)*pi/10)) plot((1:50)*0.92, sin((1:50)*0.92)) par(mfrow=c(3,1)) # Gives a 3 by 1 layout of plots plot((1:50)*0.92, sin((1:50)*0.92)) par(mfrow=c(1,1)) ## ss 1.6.4: Dimensions and other settings for graphics devices ## ss 1.6.5: The plotting of expressions and mathematical symbols symbols(0, 0, circles=0.95, bg="gray", xlim=c(-1,2.25), ylim=c(-1,1), inches=FALSE) # inches=FALSE: radius is in x-axis units text(1.75, 0, expression("Area" == pi*phantom("'")*italic(r)^2)) # Use '==' to insert '='. # Notice the use of phantom("'") to insert a small space. ## Footnote code ## To add the double-headed arrow and associated label, specify: arrows(0, 0, -0.95, 0, length=.1, code=3) # code=3: arrows at both ends # length is the length of the arrow head (in inches!) text(-0.45, -strheight("R"), expression(italic(r) == 0.95)) ## ss 1.6.6: Identification and location on the figure region attach(primates) plot(Bodywt, Brainwt) identify(Bodywt, Brainwt, labels=row.names(primates), n=2) # Now click near 2 plotted points detach(primates) ## ss 1.6.7: Plot methods for objects other than vectors ## Use plot() with data frame trees (datasets) plot(trees) # Gives a 3 x 3 layout of pairwise # scatterplots among the three variables ## ss 1.6.8: Lattice graphics versus base graphics -- \texttt{xyplot()} versus \texttt{plot()} ## Plot Brainwt vs Bodywt, data frame primates (DAAG) plot(Brainwt ~ Bodywt, data=primates) # base graphics # 'base' graphics use the abilities of the graphics package library(lattice) xyplot(Brainwt ~ Bodywt, data=primates) # lattice ## ss 1.6.9: Further information on graphics ## ss 1.6.10: Good and bad graphs ## Footnote code ## Example of plotting with different shades of gray plot(1:4, 1:4, pch=16, col=c("gray20","gray40","gray60","gray80"), cex=3) ## Sec 1.7: Lattice (trellis) Graphics ## Panels of scatterplots -- the use of xyplot() trellis.device(color=FALSE) here <- ais$sport %in% c("Row","Swim") xyplot(ht ~ wt | sport, groups=sex, pch=c(4,1), aspect=1, auto.key=list(columns=2), subset=here, data=ais) dev.off() # Close device trellis.device() # Start new device, by default with color=TRUE ## Selected lattice functions dotplot(factor ~ numeric,..) # 1-dim. Display stripplot(factor ~ numeric,..) # 1-dim. Display barchart(character ~ numeric,..) histogram( ~ numeric,..) densityplot( ~ numeric,..) # Density plot bwplot(factor ~ numeric,..) # Box and whisker plot qqmath(factor ~ numeric,..) # normal probability plots splom( ~ dataframe,..) # Scatterplot matrix parallel( ~ dataframe,..) # Parallel coordinate plots cloud(numeric ~ numeric * numeric, ...) # 3D surface wireframe(numeric ~ numeric * numeric, ...) # 3D scatterplot ## Inclusion of lattice graphics functions in user functions xyplot(ht ~ wt | sport, data=ais) print(xyplot(ht ~ wt | sport, data=ais)) print(xyplot(ht ~ wt | sport, data=ais)) ais.trellis <- xyplot(ht ~ wt | sport, data=ais) print(ais.trellis) ## Interaction with lattice plots ## Sec 1.8: Additional Points on the Use of R ## *Workspace management strategies ## Forward slashes and backslashes ## Setting the number of decimal places in output sqrt(10) options(digits=2) # Change until further notice, sqrt(10) ## Other option settings ## Cosmetic issues ## \kern-4pt$^*$\, Common sources of difficulty ## Variable names in data sets ## Sec 1.9: Recap ## Sec 1.10: Further Reading attach(Manitoba.lakes) plot(log2(area) ~ elevation, pch=16, xlim=c(170,280)) # NB: Doubling the area increases log2(area) by 1.0 text(log2(area) ~ elevation, labels=row.names(Manitoba.lakes), pos=4) text(log2(area) ~ elevation, labels=area, pos=2) title("Manitoba's Largest Lakes") detach(Manitoba.lakes) rep(c(2,3,5), c(4,3,2)) rep(c(2,3,5), 4:2) 1000*((1+0.075)^5 - 1) # Interest on $1000, compounded # annually at 7.5% p.a. for five years gender <- factor(c(rep("female", 91), rep("male", 92))) table(gender) gender <- factor(gender, levels=c("male", "female")) table(gender) gender <- factor(gender, levels=c("Male", "female")) # Note the mistake # The level was "male", not "Male" table(gender) rm(gender) # Remove gender par(mfrow=c(2,2)) # 2 by 2 layout on the page library(MASS) # Animals is in the MASS package attach(Animals) plot(body, brain) plot(sqrt(body), sqrt(brain)) plot((body)^0.1, (brain)^0.1) plot(log(body), log(brain)) detach(Animals) par(mfrow=c(1,1)) # Restore to 1 figure per page plot(BDI ~ age, data=socsupport) plot(BDI ~ unclass(age), data=socsupport) attach(socsupport) gender1 <- abbreviate(gender, 1) table(gender1) # Examine the result country3 <- abbreviate(country, 3) table(country3) # Examine the result num <- seq(along=gender) # Generate row numbers lab <- paste(gender1, country3, num, sep=":") detach(socsupport) x <- c(8, 54, 534, 1630, 6611) seq(1, length(x)) seq(along=x) head(vlt, 4) # first few rows of vlt # . . . vltcv <- stack(vlt[, 1:3]) head(vltcv) # first few rows of vltcv table(vltcv$values, vltcv$ind) # More cryptically, table(vltcv) gives the same result. oldpar <- par(mfrow=c(2,4)) for (i in 2:9){ plot(austpop[, 1], log(austpop[, i]), xlab="Year", ylab=names(austpop)[i], pch=16, ylim=c(0,10))} par(oldpar)