## ---------------------------------------------------------------------
## Parse a LaTeX index file into three "by page" tables: functions,
## authors and terms.
##
## NOTE(review): `trim.String()` is not defined in this file; it is
## presumably a helper that strips leading/trailing whitespace -- confirm
## it is loaded before running.
## NOTE(review): the numeric positions used below (41, 1046, 2:495,
## 498:1067, 1070:1242, 352, 397, ..., the 11th long entry) are tied to
## one specific version of index.tex.
## ---------------------------------------------------------------------

## ---- Read the index and drop noise lines ----------------------------
entries <- readLines("~/r-book/ed2/index/index.tex")
entries <- entries[-(1:41)]                 # discard the first 41 lines
z <- regexpr("\\\\indexspace", entries)
entries <- entries[z < 0]                   # drop \indexspace lines
entries <- trim.String(entries)
entries <- entries[entries != ""]

## Merge continuation lines into the line that precedes them.
##
## ent  character vector of index lines (defaults to the global `entries`).
## z    integer positions of the continuation lines; by default every
##      line that starts with a digit.
##
## Each run of consecutive positions in {z - 1, z} is pasted together
## (separated by single spaces) into the first line of the run, and the
## remaining lines of the run are removed.  Returns the shortened vector.
makeone <- function(ent = entries, z = grep("^[0-9]", ent)) {
  idx <- sort(unique(c(z - 1, z)))
  ## A continuation flagged at position 1 has no predecessor; without this
  ## guard the 0 index would make the run start "nowhere" and line 1 would
  ## be dropped instead of kept.
  idx <- idx[idx >= 1]
  if (length(idx) == 0) {
    return(ent)
  }
  ## Consecutive positions share a run id; a gap > 1 starts a new run.
  run_id <- cumsum(c(TRUE, diff(idx) > 1))
  omit <- logical(length(ent))
  for (run in split(idx, run_id)) {
    ent[run[1]] <- paste(ent[run], collapse = " ")
    omit[run[-1]] <- TRUE
  }
  ent[!omit]
}

## ---- Manual repair and continuation merging -------------------------
entries[1046] <- paste0("\\", entries[1046])   # restore a missing backslash
st <- substring(entries, 1, 5)
table(st)                                   # diagnostic: line-prefix counts
entries <- entries[st != "\\clea"]
## Lines that do not start with a backslash are treated as continuations.
## which(!grepl()) is used rather than -grep(): a negative index built from
## an empty match set would select nothing at all.
nond <- which(!grepl("^\\\\", entries))
newe <- makeone(z = nond)
st <- substring(newe, 1, 5)
table(st)                                   # diagnostic
nond <- grep("^\\\\text", newe)
newe <- makeone(ent = newe, z = nond)
## NOTE(review): `newe` is not used by anything below; the slices that
## follow are taken from `entries`.  Confirm this is intended.
grep("\\\\end", entries)                    # diagnostic: section ends

## ---- Split into functions / terms / authors -------------------------
ifun <- entries[2:495]
ifun <- substring(ifun, 7)                  # drop the first 6 characters
iterms <- entries[498:1067]
iauthors <- entries[1070:1242]

## Functions: keep only lines with ", <digit>" and split name from pages.
z <- regexpr(", +[0-9]", ifun)
ifun <- ifun[z >= 0]
z <- regexpr(", +[0-9]", ifun)
pages <- substring(ifun, z + attr(z, "match.length") - 1)
fundex <- substring(ifun, 1, z - 1)
funmat <- cbind(item = fundex, pages = pages)

## Expand an (item, pages) table into one entry per page reference.
##
## tx  two-column character matrix: column 1 is the item text, column 2 a
##     comma-separated list of page references (defaults to `funmat`).
##
## Returns a character vector of page references, each named by its
## (trimmed) item, sorted by `sort()` on the page strings.  A zero-row
## `tx` yields character(0).
bypage <- function(tx = funmat) {
  per_item <- lapply(seq_len(nrow(tx)), function(i) {
    refs <- trim.String(strsplit(tx[i, 2], ",")[[1]])
    names(refs) <- rep(trim.String(tx[i, 1]), length(refs))
    refs
  })
  ## Concatenate once instead of growing a vector inside a loop.
  sort(do.call(c, c(list(character(0)), per_item)))
}
funpages <- bypage()

## ---- Authors --------------------------------------------------------
iauthors <- substring(iauthors, 7)          ## Omit \\item
z <- regexpr(", +[0-9ivx]", iauthors)
authdex <- substring(iauthors, 1, z - 3)    ## All end in \\
authdex <- sub("{", "", authdex, fixed = TRUE)
authdex <- sub("}", "", authdex, fixed = TRUE)
pages <- substring(iauthors, z + attr(z, "match.length") - 1)
authmat <- cbind(item = authdex, pages = pages)
authpages <- bypage(authmat)

## ---- Terms ----------------------------------------------------------
## Same slice as above, re-read from `entries` before the manual fixes.
iterms <- entries[498:1067]
## Position of the first "m " in each line.  The values 5, 8 and 11 are
## mapped to heading levels 1-3 further down (presumably the final "m" of
## \item, \subitem and \subsubitem -- confirm against the index file).
z <- regexpr("m ", iterms)
iterms <- trim.String(substring(iterms, z + 1))

## Hand-written replacements for entries that did not parse cleanly.
iterms[352] <- "mean, 5, 18, {\\itshape passim}"
iterms[397] <- "DAAG, 1, 9, {\\itshape passim}"
iterms[399] <- "dichromat, 460, 492"
iterms[407] <- "KernSmooth, 240"
iterms[441] <- "histogram, 6, 38, 44--46, 59, 74, 92, 121"
iterms[517] <- "data collection, 276"
iterms[520] <- "martingale, 282"
iterms[521] <- "Schoenfeld, 281"
iterms[522] <- "survival estimate, 277--278"
iterms[523] <- "tree-based, 372--373"
## NOTE(review): "115-119" uses a single hyphen while every other range
## uses "--"; left untouched, confirm whether it is a typo.
iterms[525] <- "of frequencies, 19--20, 60--63, 95--96, 115-119, 141--142"
iterms[526] <- "adding across tables (Simpson's paradox), 61, 63, 95--96, 141--142"

## Console output that had been pasted into the script (kept for reference):
##   [16,] "399" "dichromat 460"
##   [17,] "407" "KernSmooth 240"
##   [19,] "441" "histogram 6"
##   [20,] "517" "data collection 276"
##   [21,] "520" "martingale 282"
##   [22,] "521" "Schoenfeld 281"
##   [23,] "522" "survival estimate 277--278"
##   [24,] "523" "tree-based 372--373"
##   [25,] "525" "of frequencies 19--20"
##   [26,] "526" "adding across tables (Simpson's paradox) 61"

## Disabled: the original statement
##   iterms <- iterms[substring(iterms,1,3)!="see",]
## indexes a plain character vector with a trailing comma, which always
## fails ("incorrect number of dimensions").  It is not repaired into a
## working filter because the code below relies on `iterms`, `z` and `id`
## all still describing the same 570 lines.

## One row per term line: columns 1-3 receive the term text at its heading
## level, column 4 stores the raw "m " position.  Both must be computed
## before `z` is reused for the page split.
mterm <- matrix("", nrow = length(z), ncol = 4)
mterm[, 4] <- z
id <- c("5" = 1, "8" = 2, "11" = 3)[paste(z)]

## Split each line at the first ", <digit>"; lines without page numbers
## keep their whole text as the term.
z <- regexpr(", +[0-9]", iterms)
z[z == -1] <- nchar(iterms[z == -1]) + 1
termdex <- trim.String(substring(iterms, 1, z - 1))
which(substring(termdex, nchar(termdex)) %in% c("i", "x"))   # diagnostic
## Three entries start their page list with a roman numeral.
z[c(8, 291, 422)] <- regexpr(", +[ix]", iterms[c(8, 291, 422)])
termdex <- trim.String(substring(iterms, 1, z - 1))
## Diagnostics: terms that still end in a digit or roman-numeral letter.
## (The original wrote paste(0-9), which is "-9", not the digits 0..9.)
termdex[substring(termdex, nchar(termdex)) %in% c(paste(0:9), "i", "v", "x")]
table(substring(termdex, nchar(termdex)))
sum(substring(termdex, 1, 3) == "see")      # diagnostic: "see ..." entries
pages <- trim.String(substring(iterms, z + 1))

## Place every term in the column that matches its heading level and
## append the page lists as column 5.
mterm[cbind(seq_along(termdex), id)] <- termdex
mterm <- cbind(mterm, pages)

## Drop over-long (> 50 characters) terms, except the 11th such entry.
u <- which(nchar(termdex) > 50)
## Select the rows to keep positively: x[-integer(0)] would return nothing.
keep <- setdiff(seq_along(termdex), u[-11])
mterm <- mterm[keep, , drop = FALSE]
termdex <- termdex[keep]
pages <- pages[keep]
iterms <- iterms[keep]

## Fill blank parent headings in a term matrix.
##
## mterm        character matrix as built above.
## completecol  column k to complete (default 2).
##
## For every row whose column k + 1 is non-empty and whose column k is
## empty, column k is copied from the nearest earlier row with a non-empty
## column k.  Rows with no earlier donor are left blank.  Each visited row
## number is echoed with cat() as a progress trace.  Returns the completed
## matrix.
make.termindex <- function(mterm, completecol = 2) {
  k <- completecol
  targets <- which(mterm[, k + 1] != "")
  for (j in targets) {
    cat(" ", j)
    if (mterm[j, k] == "") {
      donors <- which(mterm[seq_len(j - 1), k] != "")
      if (length(donors) > 0) {
        mterm[j, k] <- mterm[max(donors), k]
      }
    }
  }
  mterm
}
## Build the term table: the first three columns of `mterm` are joined
## with "! " into a single key, paired with the page lists held in
## column 5.
termsmat <- cbind(
  do.call(paste, list(mterm[, 1], mterm[, 2], mterm[, 3], sep = "! ")),
  mterm[, 5]
)
## One named entry per page reference, as produced by bypage().
termspages <- bypage(tx = termsmat)