[BioC] segfault when using data.table package in conjunction with foreach

Matthew Keller mckellercran at gmail.com
Thu Feb 23 18:04:59 CET 2012


Hi all,

I'm trying to use the package data.table within a foreach loop. I'm
grabbing 500M rows of data at a time from two different files (with
read.table) and then doing an aggregate/tapply-like operation in
data.table after that. I had planned on running a foreach loop 39 times
at once for the 39 files I have, but obviously that won't work until I
figure out why the segfault is occurring. The sessionInfo, code, and
error are pasted below. If you have any ideas, I would love to hear
them. (I have no control over the version of R - 2.13.0 - being used). Best

Matt


SESSION INFO:

> sessionInfo()
R version 2.13.0 (2011-04-13)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=C
 [6] LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C
               LC_ADDRESS=C               LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

other attached packages:
[1] data.table_1.7.10 doMC_1.2.2        multicore_0.1-5
foreach_1.3.2     codetools_0.2-8   iterators_1.0.3



MY CODE:

# Sum the "sharing" column of a four-column text file for every (ID1, ID2)
# pair, reading the file in chunks so the whole file is never held in memory.
#
# Arguments:
#   filename       path of the input file. Each row is read as ID1 (integer),
#                  ID2 (integer), a third column that is dropped, and
#                  sharing (numeric).
#   nbindiv        number of rows and columns of the square result matrix.
#                  NOTE(review): assumes every ID1/ID2 value lies in
#                  1..nbindiv, since the IDs are used directly as matrix
#                  indices - confirm against the data.
#   nrows.to.read  maximum number of rows read from the file per chunk.
#
# Returns an nbindiv x nbindiv numeric matrix whose [ID1, ID2] entry is the
# total of "sharing" over all rows of the file with that ID pair.
computeAllPairSums <- function(filename, nbindiv,nrows.to.read)
{
   con <- file(filename, open="r")
   on.exit(close(con))
   ans <- matrix(numeric(nbindiv * nbindiv), nrow=nbindiv)
   chunk <- 0L
   while (TRUE) {
       #read.table faster than scan
       df0 <- read.table(con,
                col.names=c("ID1", "ID2", "ignored", "sharing"),
                colClasses=c("integer", "integer", "NULL", "numeric"),
                nrows=nrows.to.read, comment.char="")

       # A zero-row read means the input is exhausted. Test for it before
       # doing any data.table work, so an empty table is never keyed or
       # grouped.
       if (nrow(df0) == 0L)
           break

       # Collapse the chunk to one row per (ID1, ID2) pair.
       DT <- data.table(df0)
       setkey(DT,ID1,ID2)
       ss <- DT[,sum(sharing),by="ID1,ID2"]

       chunk <- chunk + 1L
       cat("Processing chunk", chunk, "... ")

       # Columns 1:2 of ss are the ID pair, column 3 the per-pair sum.
       # Each pair occurs once in ss, so a two-column index matrix can be
       # used for a single vectorised update of ans.
       idd <- as.matrix(subset(ss,select=1:2))
       newvec <- as.vector(as.matrix(subset(ss,select=3)))
       ans[idd] <- ans[idd] + newvec

       cat("OK\n")
   }
   ans
}



require(foreach)
require(doMC)
registerDoMC(cores=2)


num <- 8891
nr <-  500000000L   #500 million rows at a time


# Run the two readers in parallel (one foreach iteration each) and time them.
# GERMLINE (a path prefix) and computeAllPairSums.gz are not defined in this
# listing; they must exist in the session before this is run.
# NOTE(review): the value of each iteration is the value of its last `if`.
# For IT==1 the second `if` is not taken, so MMM[[1]] is NULL and the timing
# stored in x is not returned to the master process.
MMM  <-  foreach(IT = 1:2) %dopar% {
  require(data.table)
  #Run it on regular file PID 6489, 24 gb
  if (IT==1){ x <- system.time({computeAllPairSums(
    paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) }
  #Run it on gz file PID 6490, 24 gb
  if (IT==2){ z <- system.time({computeAllPairSums.gz(
    paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) }
}


MY R OUTPUT/ERROR:

MMM  <-  foreach(IT = 1:2) %dopar% {
+   require(data.table)
+   if (IT==1){ x <- system.time({computeAllPairSums(
paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) } #Run it on
regular file PID 6053, 5.9 gb
+   if (IT==2){ z <- system.time({computeAllPairSums.gz(
paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) } #Run it on gz
file PID 6054, 4 gb
+ }

Loading required package: data.table
Loading required package: data.table
data.table 1.7.10  For help type: help("data.table")
data.table 1.7.10  For help type: help("data.table")

 *** caught segfault ***
address 0x2ae93df90000, cause 'memory not mapped'

Traceback:
 1: .Call("dogroups", x, xcols, o__, f__, len__, jsub, SDenv, testj,
  byretn, byval, i, as.integer(icols), i[1, ivars, with = FALSE],
if (length(ivars)) paste("i.", ivars, sep = ""), is.na(nomatch),
verbose, PACKAGE = "data.table")
 2: `[.data.table`(DT, , sum(sharing), by = "ID1,ID2")
 3: DT[, sum(sharing), by = "ID1,ID2"]
 4: computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep =
""),     num, nr)
 5: system.time({    computeAllPairSums(paste(GERMLINE,
"bc.chr22.q.20.file",         sep = ""), num, nr)})
 6: eval(expr, envir, enclos)
 7: eval(c.expr, envir = args, enclos = envir)
 8: doTryCatch(return(expr), name, parentenv, handler)
 9: tryCatchOne(expr, names, parentenv, handlers[[1L]])
10: tryCatchList(expr, classes, parentenv, handlers)
11: tryCatch(eval(c.expr, envir = args, enclos = envir), error = function(e) e)
12: FUN(X[[1L]], ...)
13: lapply(S, FUN, ...)
14: doTryCatch(return(expr), name, parentenv, handler)
15: tryCatchOne(expr, names, parentenv, handlers[[1L]])
16: tryCatchList(expr, classes, parentenv, handlers)
17: tryCatch(expr, error = function(e) {    call <- conditionCall(e)
 if (!is.null(call)) {        if (identical(call[[1L]],
quote(doTryCatch)))             call <- sys.call(-4L)        dcall <-
deparse(call)[1L]        prefix <- paste("Error in", dcall, ": ")
  LONG <- 75L        msg <- conditionMessage(e)        sm <-
strsplit(msg, "\n")[[1L]]        w <- 14L + nchar(dcall, type = "w") +
nchar(sm[1L], type = "w")        if (is.na(w))             w <- 14L +
nchar(dcall, type = "b") + nchar(sm[1L],                 type = "b")
     if (w > LONG)             prefix <- paste(prefix, "\n  ", sep =
"")    }    else prefix <- "Error : "    msg <- paste(prefix,
conditionMessage(e), "\n", sep = "")
.Internal(seterrmessage(msg[1L]))    if (!silent &&
identical(getOption("show.error.messages"),         TRUE)) {
cat(msg, file = stderr())        .Internal(printDeferredWarnings())
}    invisible(structure(msg, class = "try-error"))})
18: try(lapply(S, FUN, ...), silent = TRUE)
19: sendMaster(try(lapply(S, FUN, ...), silent = TRUE))
20: FUN(1:2[[1L]], ...)
21: lapply(1:cores, inner.do)
22: mclapply(argsList, FUN, mc.preschedule = preschedule, mc.set.seed
= set.seed,     mc.silent = silent, mc.cores = cores)
23: e$fun(obj, substitute(ex), parent.frame(), e$data)
24: foreach(IT = 1:2) %dopar% {    require(data.table)    if (IT == 1)
{        x <- system.time({
computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file",
  sep = ""), num, nr)        })    }    if (IT == 2) {        z <-
system.time({            computeAllPairSums.gz(paste(GERMLINE,
"bc.chr22.q.20.gz",                 sep = ""), num, nr)        })
}}

Possible actions:
1: abort (with core dump, if enabled)
2: normal R exit
3: exit R without saving workspace
4: exit R saving workspace



-- 
Matthew C Keller
Asst. Professor of Psychology
University of Colorado at Boulder
www.matthewckeller.com



More information about the Bioconductor mailing list