Tutorial 8: Advanced Topics

DPI R Bootcamp

Jared Knowles

Overview

In this lesson we hope to learn: - Coding Style - Write a `for’ loop - Write a basic function - Optimize R with parallelization - Mixed effect models - Data mining with R - Animations - Git, GitHub, and add-on packages

Basic Principles

-

Coding Style

For Looping

# Loop to calculate number of students per grade

nstudents <- rep(NA, 6)
for (i in unique(df$grade)) {
    nstudents[[i - 2]] <- length(df$stuid[df$grade == i])
}
nstudents
## [1] 500 400 500 400 500 400
summary(factor(df$grade))
##   3   4   5   6   7   8 
## 500 400 500 400 500 400

Why is a loop slow?

A = matrix(as.numeric(1:1e+05))

system.time({
    Sum = 0
    for (i in seq_along(A)) {
        Sum = Sum + A[[i]]
    }
    Sum
})
##    user  system elapsed 
##    0.11    0.00    0.11

system.time({
    sum(A)
})
##    user  system elapsed 
##       0       0       0
rm(A)

Write a simple function

print(mean)  #bytecode, we can't see it
## function (x, ...) 
## UseMethod("mean")
## <bytecode: 0x0ea57bd8>
## <environment: namespace:base>
print(order)
## function (..., na.last = TRUE, decreasing = FALSE) 
## {
##     z <- list(...)
##     if (any(unlist(lapply(z, is.object)))) {
##         z <- lapply(z, function(x) if (is.object(x)) 
##             xtfrm(x)
##         else x)
##         if (!is.na(na.last)) 
##             return(do.call("order", c(z, na.last = na.last, decreasing = decreasing)))
##     }
##     else if (!is.na(na.last)) 
##         return(.Internal(order(na.last, decreasing, ...)))
##     if (any(diff(l.z <- vapply(z, length, 1L)) != 0L)) 
##         stop("argument lengths differ")
##     ans <- vapply(z, is.na, rep.int(NA, l.z[1L]))
##     ok <- if (is.matrix(ans)) 
##         !apply(ans, 1, any)
##     else !any(ans)
##     if (all(!ok)) 
##         return(integer())
##     z[[1L]][!ok] <- NA
##     ans <- do.call("order", c(z, decreasing = decreasing))
##     keep <- seq_along(ok)[ok]
##     ans[ans %in% keep]
## }
## <bytecode: 0x0deed500>
## <environment: namespace:base>

Still, we can write a number of simple functions very quickly

defac <- function(x) {
    # assign function a name, and list its arguments
    x <- as.character(x)  # what does function do?
    x  # last line is output of function
}

a <- factor(letters)
summary(a)
summary(defac(a))
summary(as.character(a))

Simple Data Cleaning Function

extractN <- function(x) {
    x <- suppressWarnings(as.numeric(x))
    # ignore warnings because we don't care
    x <- x[!is.na(x)]
    x
}
extractN(foo)
A <- extractN(foo)

Complicating Functions

Mixed Effect Models

The Basics of lme4

mymod_me<-lmer(readSS~factor(grade)+factor(race)+female+disab+ell+(1|dist)+(1|stuid),data=df)

library(lme4)
mymod_me <- lmer(readSS ~ factor(grade) + factor(race) + female + disab + ell + 
    (1 | dist) + (1 | stuid), data = df)
print(mymod_me, correlation = FALSE)
## Linear mixed model fit by REML 
## Formula: readSS ~ factor(grade) + factor(race) + female + disab + ell +      (1 | dist) + (1 | stuid) 
##    Data: df 
##    AIC   BIC logLik deviance REMLdev
##  30731 30826 -15350    30774   30699
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  stuid    (Intercept) 10626    103.1   
##  dist     (Intercept)   487     22.1   
##  Residual              1564     39.5   
## Number of obs: 2700, groups: stuid, 1200; dist, 3
## 
## Fixed effects:
##                Estimate Std. Error t value
## (Intercept)      378.24      31.03    12.2
## factor(grade)4    62.72       3.05    20.6
## factor(grade)5   124.67       3.28    38.0
## factor(grade)6   182.67       3.99    45.8
## factor(grade)7   236.64       4.20    56.4
## factor(grade)8   301.41       4.74    63.5
## factor(race)B    -61.87      28.05    -2.2
## factor(race)H    -34.29      27.54    -1.2
## factor(race)I     -6.09      43.93    -0.1
## factor(race)W     10.37      28.00     0.4
## female             8.76       6.21     1.4
## disab             -5.26       8.50    -0.6
## ell              -18.81      13.10    -1.4

LMER vs. LM

mymod <- lm(readSS ~ factor(grade) + factor(race) + female + disab + ell, data = df)

qplot(readSS, predict(mymod), data = df, alpha = I(0.3), color = I("blue")) + 
    geom_point(aes(x = df$readSS, y = fitted(mymod_me)), alpha = I(0.4), color = "dark green") + 
    theme_dpi() + xlab("Observed") + ylab("Predicted") + geom_text(aes(x = 370, 
    y = 700), label = "Green is Results of the \n Mixed Model")

Data Mining with R

Using the caret

library(caret)
# Set aside test set
testset <- sample(unique(df$stuid), 500)
df$case <- 0
df$case[df$stuid %in% testset] <- 1
# Draw a training set of data (random subset of students)
training <- subset(df, case == 0)
testing <- subset(df, case == 1)

training <- training[, c(3, 6:16, 21, 22, 28, 29, 30)]  # subset vars
trainX <- training[, 1:15]
refac <- function(x) as.factor(as.character(x))
trainX$stuid <- refac(trainX$stuid)
trainX$dist <- refac(trainX$dist)
trainX$year <- refac(trainX$year)

# Parameters
ctrl <- trainControl(method = "repeatedcv", number = 7, repeats = 3, summaryFunction = defaultSummary)
# Search grid
grid <- expand.grid(.interaction.depth = seq(2, 6, by = 1), .n.trees = seq(200, 
    700, by = 100), .shrinkage = c(0.05, 0.1))
# Boosted tree search
gbmTune <- train(x = trainX, y = training$mathSS, method = "gbm", metric = "RMSE", 
    trControl = ctrl, tuneGrid = grid, verbose = FALSE)

# gbmPred<-predict(gbmTune,testing[,names(trainX)])

Plot GBM

plot(gbmTune)
plot of chunk gbmplot

Optimizing R

A Quick Windows Parallel Example

n <- 10000
rep <- 5
# tLoop <- replicate(rep, system.time( integLoop(func, xint, yint, n) ))
# summary(tLoop[3,])
tVec <- replicate(rep, system.time(integVec(func, xint, yint, n)))
summary(tVec[3, ])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.004   0.000   0.020
tApply <- replicate(rep, system.time(integApply(func, xint, yint, n)))
summary(tApply[3, ])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.96    5.97    5.98    6.01    6.06    6.08

# 2 Core Cluster
library(snow)
c1 <- makeCluster(c("localhost", "localhost"), type = "SOCK")
tSnow1 <- replicate(rep, system.time(integSnow(c1, func, xint, yint, n)))
summary(tSnow1[3])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.75    8.75    8.75    8.75    8.75    8.75
stopCluster(c1)

Animations

Animating a Random Normal Distribution

Git and GitHub

Exercises

  1. Enjoy R!!!!

References

  1. Technical Report on Parallel Computing in R

Session Info

It is good to include the session info, e.g. this document is produced with knitr version 0.8. Here is my session info:

print(sessionInfo(), locale = FALSE)
## R version 2.15.2 (2012-10-26)
## Platform: i386-w64-mingw32/i386 (32-bit)
## 
## attached base packages:
## [1] splines   grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] snow_0.3-10      gbm_1.6-3.2      survival_2.36-14 caret_5.15-044  
##  [5] foreach_1.4.0    cluster_1.14.3   reshape_0.8.4    lme4_0.999999-0 
##  [9] Matrix_1.0-10    lattice_0.20-10  xtable_1.7-0     gridExtra_0.9.1 
## [13] sandwich_2.2-9   quantreg_4.91    SparseM_0.96     mgcv_1.7-22     
## [17] eeptools_0.1     mapproj_1.1-8.3  maps_2.2-6       proto_0.3-9.2   
## [21] stringr_0.6.1    plyr_1.7.1       ggplot2_0.9.2.1  lmtest_0.9-30   
## [25] zoo_1.7-9        knitr_0.8       
## 
## loaded via a namespace (and not attached):
##  [1] codetools_0.2-8    colorspace_1.2-0   compiler_2.15.2   
##  [4] dichromat_1.2-4    digest_0.5.2       evaluate_0.4.2    
##  [7] formatR_0.6        gtable_0.1.1       iterators_1.0.6   
## [10] labeling_0.1       MASS_7.3-22        memoise_0.1       
## [13] munsell_0.4        nlme_3.1-105       RColorBrewer_1.0-5
## [16] reshape2_1.2.1     scales_0.2.2       stats4_2.15.1     
## [19] tools_2.15.1

Attribution and License

Public Domain Mark
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.