[R] simplified multicore by() function

ivo welch ivo.welch at gmail.com
Sat Oct 22 23:02:45 CEST 2011


dear R readers---I thought I would post the following snippet of R
code that makes by() like operations easier and faster on multicore
machines for R novices and amateurs.  I hope it helps some.  YMMV.
feel free to ignore.

PS: I wish R had a POD-like documentation system for end users that
are not writing full libraries.  because it does not, I did not
provide documentation ala  '?mc.by'.

/iaw


set.seed(0)
library(multicore)

################################################################
###
### these R functions are very type-limited wrappers for
### by()-like operations, using the multicore library.  this
### means effort-less multi-CPU calculations.
###
### the user functions MUST return a numeric scalar or a vector.
###
### to enhance speed, internally the user function is wrapped, too.
###
### the output is ONE matrix, whose row-names are the categories.
###
################################################################

add.by.names <- function( mc.rv ) {
  for (i in 1:length(mc.rv)) {
    stopifnot( is.matrix( mc.rv[[i]] ) )
    row.names(mc.rv[[i]]) <- rep( names(mc.rv)[i], nrow(mc.rv[[i]]) )
  }
  mc.rv
}

mc.by <- function(data, INDICES, FUN, ...) {
  FUN.i <- function(.index, ...) { cbind(FUN(data[.index,], ...)) }
## we cbind, so we always get an array back
  si <- split(1:nrow(data), INDICES)
  soln <- mclapply( si, FUN.i, ... )
  rv <- do.call("rbind", add.by.names(soln))
  if (ncol(rv)==1) {
    nm <- rownames(rv)
    rv <- as.vector(rv)
    names(rv) <- nm
  }
  rv
}

mc.byallrows <- function(data, INDICES, FUN, ...) {
  FUN.i <- function(.index, ...) { cbind(FUN(data[.index,], ...)) }
  si <- as.list(1:nrow(data))  ## a little faster than the split for
large data sets
  soln <- mclapply( si, FUN.i, ... )
  rv <- do.call("rbind", soln)  ## omits naming.
  if (ncol(rv)==1) rv <- as.vector(rv)
  rv
}


function.sample <- function(d) cbind(d$x+d$y, d$x, d$y)
function.sample.simpler <- function(d) (d$x+d$y)



d <- data.frame( i=c( rep(1,2), rep(2,3), rep(3,4) ), x=rnorm(9), y=rnorm(9) )

report <- function( text2print, f.output ) {
  cat("\n\n", text2print, ":\n"); print(f.output); cat("\n\n")
}

report( "the original R by() function", by( d, d$i, function.sample ))
report( "wrappled multicore by mc.by with user function returning
scalar", mc.by( d, d$i, function.sample.simpler ))
report( "wrappled multicore by mc.by with user function returning
vector", mc.by( d, d$i, function.sample ))
report( "wrappled multicore by mc.byallrows ", mc.byallrows( d, d$i,
function.sample ))



More information about the R-help mailing list