[R] mc.cores and computer settings on osx and linux

ivo welch ivo.welch at gmail.com
Fri Nov 11 18:50:49 CET 2011


for the googleable r-help archives, I thought I would post what I
wrote into my .Rprofile to automatically set some system information.
the most relevant aspect is the determination of mc.cores.  this is
useful when users want to use the parallel package

  ## Record basic host facts in options() so that later code (and anything
  ## that reads getOption("mc.cores")) can pick them up.
  options(uname = system("uname", intern = TRUE))

  ## Map the kernel name onto the two supported platforms.  Anything else is
  ## kept (lower-cased) so that the sanity check below can actually reject it;
  ## previously every non-Darwin system was silently labelled "linux".
  options(os = switch(getOption("uname"),
                      Darwin = "osx",
                      Linux = "linux",
                      tolower(getOption("uname"))))

  if (!(getOption("os") %in% c("osx", "linux"))) {
    stop("You need to set options yourself.  I only grok linux and osx\n")
  }

  ## Core count: `sysctl` on OS X, number of distinct 'core id' lines in
  ## /proc/cpuinfo on Linux.  The shell pipeline has to be a single line; a
  ## newline inside it splits it into two separate commands.
  ## local() keeps the helper variables out of the global environment.
  local({
    cmd <- if (getOption("os") == "osx") {
      "sysctl -n hw.ncpu"
    } else {
      "grep 'core id' /proc/cpuinfo | sort | uniq | wc -l"
    }
    raw <- system(cmd, intern = TRUE)
    ncores <- suppressWarnings(as.numeric(raw))
    ## Some /proc/cpuinfo layouts have no 'core id' lines (count of 0);
    ## fall back to R's own detection, and to 1 as a last resort.
    if (length(ncores) != 1L || is.na(ncores) || ncores < 1) {
      ncores <- parallel::detectCores()
      if (is.na(ncores)) ncores <- 1
    }
    options(mc.cores = ncores)
  })

  options(hostname = system("hostname", intern = TRUE))

[below, I am also posting my current wrapper for the parallel library.
 I know it is amateurish, but it may be useful for novices exploring
parallel calculations.  it is a friendlier face.]

[gripes: R is powerful, and the team that maintains it are saints.
But R is not friendly.  it lacks the ability to turn off recycling for
enhanced error detection.  it does not throw clear errors when one
accesses a non-existing column in a data frame.  it does not print out
the user program line number where an error occurred.  it lacks an
end-user documentation system [like POD], though it does have good
package documentation.  it does have some unexpected behavior:
mymatrix[1:2,] is a matrix, but mymatrix[1:1,] is a numeric.  huh?
data.table is necessary for reasonably fast data manipulation, but
data.table giveth and taketh.  it has some really strange unexpected
behavior---mydatatable[,1] is not the first column, as one would
expect it to be.  yes, it is documented, but syntax should be as
expected.]

/iaw

----
Ivo Welch (ivo.welch at gmail.com)



################################################################
###
### these R functions are very type-limited wrappers for
### by()-like operations, using the multicore library.  this
### means effort-less multi-CPU calculations.
###
### the user functions MUST return a numeric scalar, a vector, a matrix, or
### a data frame.  to enhance speed, internally the user function is
### wrapped, too.
###
### the output is ONE matrix, whose row-names are the categories.
################################################################

## Validate the list returned by an lapply/mclapply run before it is
## row-bound into one result.
##
## mc.rv: a list whose elements are each NULL, a matrix, or a data frame.
##
## NULL elements are skipped.  Stops with an error if an element is of any
## other type, or if its column count differs from that of the first
## non-NULL element.  Returns NULL invisibly when the list is consistent
## (including when it is empty).
check.output <- function(mc.rv) {
  numofcols <- NULL  # column count of the first non-NULL element seen
  for (i in seq_along(mc.rv)) {
    element <- mc.rv[[i]]
    if (is.null(element)) next
    if (!(is.matrix(element) || is.data.frame(element))) {
      stop(paste("iaw-mc.R:check.output: Element", i,
                 "is not a matrix/dataframe, but a", class(element)[1L]),
           call. = FALSE)
    }
    if (is.null(numofcols)) numofcols <- ncol(element)
    if (ncol(element) != numofcols) {
      print(head(element))  # show the offending piece to ease debugging
      stop(paste("iaw-mc.R:check.output: Element", i, "should have",
                 numofcols, "columns, but has", ncol(element),
                 "columns instead."),
           call. = FALSE)
    }
  }
  invisible(NULL)
}

## Label every row of each list element with that element's name, so the
## group label survives a later rbind().
##
## mc.rv: a (typically named) list of matrices / data frames; NULL elements
##        are left untouched.
##
## Returns the list with row names set.  Matrix rows all receive the bare
## group name.  Data frames cannot carry duplicated row names, so a
## multi-row data frame gets make.unique() labels ("g", "g.1", ...).
add.by.names <- function(mc.rv) {
  groups <- names(mc.rv)
  for (i in seq_along(mc.rv)) {
    piece <- mc.rv[[i]]
    if (is.null(piece)) next
    labels <- rep(groups[i], nrow(piece))
    if (is.data.frame(piece) && length(labels) > 0) {
      labels <- make.unique(as.character(labels))
    }
    row.names(piece) <- labels
    mc.rv[[i]] <- piece
  }
  mc.rv
}


## Shared engine behind mc.by() and oc.by(): split the row numbers of `data`
## by INDICES, apply FUN to each group of rows through `lcapplyversion`, and
## stack the per-group results with rbind().
##
## lcapplyversion: an lapply-compatible function (e.g. lapply or mclapply).
## data:           object with rows, indexed as data[rows, ].
## INDICES:        grouping passed to split(); one entry per row of `data`.
## FUN:            called as FUN(rows.of.data, ...); should return NULL, a
##                 vector, a matrix, or a data frame.
## ...:            forwarded to `lcapplyversion` and on to FUN.
##
## Returns the stacked result with the group label as row name; a result
## with a single column is collapsed to a vector named by group.  Stops if
## every group returned NULL.
.mc.by <- function(lcapplyversion, data, INDICES, FUN, ...) {
  si <- split(seq_len(nrow(data)), INDICES)

  ## input = set of row indexes ; output = one row in a matrix or data
  ## frame, that can be stacked up
  FUN.ON.ROWS <- function(.index, ...) {
    ## drop = FALSE keeps a single-column data frame from becoming a vector
    rv <- FUN(data[.index, , drop = FALSE], ...)
    if (is.null(rv)) rv else if (is.vector(rv)) matrix(rv, nrow = 1) else rv
  }
  soln <- lcapplyversion(si, FUN.ON.ROWS, ...)
  check.output(soln)

  rv <- do.call("rbind", add.by.names(soln))
  if (is.null(rv)) {
    print(head(soln))
    stop("Sorry, but in .mc.by, the rv is null!\n", call. = FALSE)
  }
  if (ncol(rv) == 1) {
    nm <- rownames(rv)
    ## a one-column data frame is a list; pull the column out directly
    rv <- if (is.data.frame(rv)) rv[[1L]] else as.vector(rv)
    names(rv) <- nm
  }
  rv
}

## Multi-process by(): apply FUN to each INDICES-group of rows of `data`
## using parallel::mclapply, returning whatever .mc.by() assembles.
## `...` is forwarded to mclapply and on to FUN.
## mclapply is namespace-qualified so no prior library(parallel) is needed.
mc.by <- function(data, INDICES, FUN, ...) {
  .mc.by(parallel::mclapply, data, INDICES, FUN, ...)
}
## Single-process counterpart of mc.by(): identical interface, but the
## groups are processed with plain lapply instead of mclapply.
oc.by <- function(data, INDICES, FUN, ...) {
  stacked <- .mc.by(lapply, data, INDICES, FUN, ...)
  stacked
}


## Apply FUN to every single row of `data` in parallel and stack the results.
##
## data:     object with rows, indexed as data[row, ].
## FUN:      called as FUN(one.row.of.data, ...); should return NULL, a
##           vector, a matrix, or a data frame.
## ...:      forwarded to mclapply and on to FUN.
## mc.cores: number of worker processes; defaults to the "mc.cores" option
##           and falls back to 4 when that option is unset.
##
## Returns the rbind() of all per-row results (no row naming); a result
## with a single column is passed through as.vector().  Stops if every row
## returned NULL.
mc.byallrows <- function(data, FUN, ..., mc.cores = getOption("mc.cores", 4L)) {
  ## a little faster than the split for large data sets
  si <- as.list(seq_len(nrow(data)))
  FUN.ON.ROWS <- function(.index, ...) {
    ## drop = FALSE keeps a single-column data frame from becoming a vector
    rv <- FUN(data[.index, , drop = FALSE], ...)
    if (is.null(rv)) rv else if (is.vector(rv)) matrix(rv, nrow = 1) else rv
  }

  soln <- parallel::mclapply(si, FUN.ON.ROWS, ..., mc.cores = mc.cores)
  check.output(soln)
  rv <- do.call("rbind", soln)  ## omits naming.
  if (is.null(rv)) {
    stop("Sorry, but in mc.byallrows, the rv is null!\n", call. = FALSE)
  }
  if (ncol(rv) == 1) rv <- as.vector(rv)
  rv
}


## Demo / smoke test of the wrappers.  Disabled by default; change the
## condition to TRUE to run it interactively.
if (FALSE) {
  ## per-group function returning a three-column matrix
  function.sample <- function(d) cbind(d$x + d$y, d$x, d$y)
  ## per-group function returning a plain numeric vector
  function.sample.simpler <- function(d) (d$x + d$y)

  d <- data.frame(i = c(rep(1, 2), rep(2, 3), rep(3, 4)),
                  x = rnorm(9), y = rnorm(9))

  ## print a heading followed by the given result
  report <- function(text2print, f.output) {
    cat("\n\n", text2print, ":\n")
    print(f.output)
    cat("\n\n")
  }

  report("the original R by() function", by(d, d$i, function.sample))
  report("wrappled multicore by mc.by with user function returning scalar",
         mc.by(d, d$i, function.sample.simpler))
  report("wrappled multicore by mc.by with user function returning vector",
         mc.by(d, d$i, function.sample))
  ## mc.byallrows() takes (data, FUN, ...): there is no INDICES argument
  report("wrappled multicore by mc.byallrows ",
         mc.byallrows(d, function.sample))
}



More information about the R-help mailing list