4. R code for the tutorial Section 1.3 Requirements: * Bioconductor package Biobase * a single-core computer (or multicore, but we only need one core) Code: data(geneData, package = 'Biobase') head(geneData, n=2) cor(geneData[12, ], geneData[13, ]) pair <- combn(1:nrow(geneData), 2, simplify = F) length(pair) head(pair, n = 3) tail(pair, n = 3) geneCor <- function(x, gene = geneData) { cor(gene[x[1], ], gene[x[2], ]) } geneCor(c(12, 13)) out <- lapply(pair[1:3], geneCor) out system.time(out <- lapply(pair, geneCor)) Section 2.1 Requirements: * package multicore * a multicore computer (otherwise we won't see a speed-up) Code: library(multicore) system.time(out <- mclapply(pair, geneCor)) Section 2.2 Requirements: * several computers with SSH access * all computers with R and snow installed in the same place (if not, a more complicated configuration is needed, which is not covered here) * public-key login for SSH (otherwise, you need to type a password for each node) Code: fakeData <- cbind(geneData, geneData, geneData, geneData) library(boot) geneCor2 <- function(x, gene = fakeData) { mydata <- cbind(gene[x[1], ], gene[x[2], ]) mycor <- function(x, i) cor(x[i,1], x[i,2]) boot.out <- boot(mydata, mycor, 1000) boot.ci(boot.out, type = 'bca')$bca[4:5] } geneCor2(c(12, 13)) system.time(out <- lapply(pair[1:10], geneCor2)) pair2 <- sample(pair, 300) system.time(out <- lapply(pair2, geneCor2)) library(multicore) system.time(out <- mclapply(pair2, geneCor2)) library(snow) hosts <- c( 'localhost', 'localhost', 'localhost', 'localhost', 'variome', 'variome', 'lams', 'lams', 'bug' ) cl <- makeCluster(hosts, type = 'SOCK') clusterExport(cl, 'fakeData') a <- clusterEvalQ(cl, library(boot)) system.time(out <- clusterApplyLB(cl, pair2, geneCor2)) stopCluster(cl) cl <- makeCluster(unique(hosts), type = 'SOCK') clusterCall(cl, date) clusterCall(cl, function(x) Sys.info()[c('nodename', 'machine')]) stopCluster(cl) Section 2.3 Requirements: * a cluster that supports MPI * a working R with package 
snow and Rmpi installed on the front-end Code: # bsub -o section.2.3.out -q atlas5_parallel -a openmpi -n 32 mpirun.lsf RMPISNOW -f ./section.2.3.r library(boot) library(snow) data(geneData, package = 'Biobase') pair <- combn(1:nrow(geneData), 2, simplify = F) fakeData <- cbind(geneData, geneData, geneData, geneData) geneCor2 <- function(x, gene = fakeData) { mydata <- cbind(gene[x[1], ], gene[x[2], ]) mycor <- function(x, i) cor(x[i,1], x[i,2]) boot.out <- boot(mydata, mycor, 1000) boot.ci(boot.out, type = 'bca')$bca[4:5] } geneCor2(c(12, 13)) system.time(out <- lapply(pair[1:10], geneCor2)) pair2 <- sample(pair, 300) # system.time(out1 <- lapply(pair2, geneCor2)) cl <- makeCluster() clusterExport(cl, 'fakeData') a <- clusterEvalQ(cl, library(boot)) system.time(out2 <- clusterApply(cl, pair2, geneCor2)) system.time(out2 <- clusterApplyLB(cl, pair2, geneCor2)) stopCluster(cl) mpi.quit() Section 2.4 Requirements: For 2.4.3 * package foreach * an ordinary single-core computer is enough For 2.4.4 * package doMC * a multicore computer For 2.4.5 * package doSNOW * the same requirements as Section 2.2 For 2.4.6 * a cluster that supports MPI * packages doMPI and Rmpi correctly installed 2.4.3 Sequential code Let's first solve our example in a sequential manner, using the foreach framework: #!/usr/bin/env Rscript library(foreach) data(geneData, package = 'Biobase') pair <- combn(1:nrow(geneData), 2, simplify = F) fakeData <- cbind(geneData, geneData, geneData, geneData) pair2 <- sample(pair, 300) print(system.time( out <- foreach(p = pair2, .packages = 'boot', .combine = 'rbind') %dopar% { mydata <- cbind(fakeData[p[1],], fakeData[p[2], ]) mycor <- function(x, i) cor(x[i,1], x[i,2]) boot.out <- boot(mydata, mycor, 1000) ci <- boot.ci(boot.out, type = 'bca')$bca[4:5] c(p, ci) } )) print(head(out)) # print the head of the result This script is named section.2.4-shared.r, and it works on its own as ordinary R code (with no parallel backend registered, %dopar% falls back to sequential execution and issues a warning): $ ./section.2.4-shared.r user system elapsed 65.920 0.440 66.355 [,1] 
[,2] [,3] [,4] result.1 288 422 -0.3357138 -0.1266808 result.2 243 303 -0.1529784 0.1017721 result.3 234 386 -0.1561772 0.2161708 result.4 5 183 0.2505966 0.4794503 result.5 275 440 0.2420215 0.5738407 result.6 62 457 0.1558987 0.4712927 The above code is the ONLY version that we need to write to solve our problem. To enable parallel computing for this piece of code, we simply register various parallel back-ends with it. 2.4.4 doMC Let's start with our multicore machine: #!/usr/bin/env Rscript library(doMC) registerDoMC() source("section.2.4-shared.r") # or paste the shared code here That's it! Apart from the code that registers the parallel backend, we do not need to make any modification to our original code, and we get multicore parallel code! $ ./section.2.4-doMC.r user system elapsed 56.410 56.290 19.114 [,1] [,2] [,3] [,4] result.1 164 172 -0.2668234 0.14122971 result.2 31 67 0.6607894 0.78171239 result.3 62 282 -0.3225335 -0.03370322 result.4 37 293 0.1073143 0.31629694 result.5 38 271 -0.1041704 0.28953027 result.6 211 287 -0.3495002 0.00608487 2.4.5 doSNOW Let's port our code to a SNOW network: #!/usr/bin/env Rscript library(doSNOW) hosts <- c( 'localhost', 'localhost', 'localhost', 'localhost', 'variome', 'variome', 'lams', 'lams', 'bug' ) cl <- makeCluster(hosts, type = 'SOCK') registerDoSNOW(cl) source("section.2.4-shared.r") # or paste the shared code here stopCluster(cl) So, the only things we did here are: 1) choose computing hosts 2) start and register our SNOW cluster 3) run our original code! 4) stop the cluster Let's test this code out: $ ./section.2.4-doSNOW.r user system elapsed 0.130 0.020 12.335 [,1] [,2] [,3] [,4] result.1 217 332 0.1224703 0.44764756 result.2 19 179 -0.6041616 -0.36109986 result.3 192 426 0.4824728 0.68836742 result.4 46 228 -0.2883579 -0.01718258 result.5 407 426 0.1998271 0.45557312 result.6 165 386 -0.6302590 -0.15638907 2.4.6 doMPI How about MPI cluster code? 
#!/usr/bin/env Rscript library(doMPI) cl <- startMPIcluster() registerDoMPI(cl) source("section.2.4-shared.r") # or paste the shared code here closeCluster(cl) Great, it is even simpler than the SNOW version. $ bsub -o section.2.4-doMPI.out -e error -q atlas5_parallel \ -a openmpi -n 32 mpirun.lsf ./section.2.4-doMPI.r $ (wait, and then) cat section.2.4-doMPI.out user system elapsed 2.742 0.243 3.015 [,1] [,2] [,3] [,4] result.1 202 231 0.003461775 0.3608233 result.2 174 372 0.009071919 0.3583579 result.3 118 186 0.601903906 0.7885814 result.4 186 362 -0.336281370 -0.1353606 result.5 270 485 -0.491912888 -0.1121098 result.6 168 413 -0.147680243 0.1079481