[R] making code (loop) more efficient

Wed Dec 16 17:34:05 CET 2020

Indeed it was the issue with data.table. I converted it to data.frame
and it worked like a charm.
Thank you so much for your insight!

This is the code that worked:

library(parallel)
library(data.table)
library(doSNOW)

n <-  parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)
files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T)

lst_out <- foreach::foreach(i = seq_along(files),
                  .packages = c("data.table") ) %dopar% {

   tmp <- get(load(files[i]))
   a <- data.table::copy(tmp)
   a=as.data.frame(a)
   rm(tmp)
   gc()

   names <- rownames(a)
   if("blup" %in% colnames(a)) {
     data <- data.table(names, a["blup"])
     nm1 <- c("rsid", "ref_allele", "eff_allele")
     data[,  (nm1) := tstrsplit(names, ":")[-2]]
     out <- data[, .(rsid, weight = blup, ref_allele, eff_allele)][,
               WGT := files[i]][]
    } else {

     data <- data.table(names)
     nm1 <- c("rsid", "ref_allele", "eff_allele")
     data[,  (nm1) := tstrsplit(names, ":")[-2]]
     out <- data[, .(rsid,  ref_allele, eff_allele)][,
               WGT := files[i]][]
   }

    return(out)
   rm(data)
   gc()
 }
parallel::stopCluster(cl)

big_data <- rbindlist(lst_out, fill = TRUE)

On Wed, Dec 16, 2020 at 9:31 AM Ana Marija <sokovic.anamarija using gmail.com> wrote:
>
> HI Jim,
>
> this is what I as running:
>
> library(parallel)
> library(data.table)
> library(foreach)
> library(doSNOW)
>
> n <-  parallel::detectCores()
> cl <- parallel::makeCluster(n, type = "SOCK")
> doSNOW::registerDoSNOW(cl)
> files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T)
>
> lst_out <- foreach::foreach(i = seq_along(files),
>                   .packages = c("data.table") ) %dopar% {
>
>    a <- get(load(files[i]))
> namesplit<-strsplit(rownames(a),":")
> rsid<-unlist(lapply(namesplit,"[",1))
> ref_allele<-unlist(lapply(namesplit,"[",3))
> eff_allele<-unlist(lapply(namesplit,"[",4))
> WGT<-rep(files[i],length(rsid))
> data<-data.frame(rsid=rsid,weight=a$blup,     #weight is "blup" column
>  ref_allele=ref_allele,eff_allele,WGT=WGT)
>
>    return(data)
>  }
> parallel::stopCluster(cl)
>
> big_data <- rbindlist(lst_out)
>
> and i got:
> Error in { : task 4 failed - "$ operator is invalid for atomic vectors"
> > parallel::stopCluster(cl)
>
> I uploaded 3 RDat file here if you want to try it
> https://github.com/montenegrina/sample
>
> Thank you for looking into this
> Ana
>
> On Tue, Dec 15, 2020 at 11:45 PM Jim Lemon <drjimlemon using gmail.com> wrote:
> >
> > Hi Ana,
> > Back on the job. I'm not sure how this will work in your setup, but
> > here is a try:
> >
> > a<-read.table(text="top1 blup lasso enet
> > rs4980905:184404:C:A  0.07692622 -1.881795e-04     0    0
> > rs7978751:187541:G:C  0.62411425  9.934994e-04     0    0
> > rs2368831:188285:C:T  0.69529158  1.211028e-03     0    0
> > rs12830904:188335:T:A 0.92793158 -9.143555e-05     0    0
> > rs1500098:189093:G:C  0.42032471  9.001814e-04     0    0
> > rs79410690:190097:G:A 0.26194244  5.019037e-04     0    0",
> > header=TRUE,stringsAsFactors=FALSE)
> > namesplit<-strsplit(rownames(a),":")
> > rsid<-unlist(lapply(namesplit,"[",1))
> > ref_allele<-unlist(lapply(namesplit,"[",3))
> > eff_allele<-unlist(lapply(namesplit,"[",4))
> > # here I'm assuming that the filename
> > # is stored in files[i]
> > files<-"retina.ENSG00000135776.wgt.RDat"
> > i<-1
> > WGT<-rep(files[i],length(rsid))
> > data<-data.frame(rsid=rsid,weight=a$top1,
> >  ref_allele=ref_allele,eff_allele,WGT=WGT)
> > data
> >
> > Note that the output is a data frame, not a data table. I hope that
> > the function for creating a data table is close enough to that for a
> > data frame for you to work it out. If not I can probably have a look
> > at it a bit later.
> >
> > Jim
> >
> > On Wed, Dec 16, 2020 at 1:45 PM Ana Marija <sokovic.anamarija using gmail.com> wrote:
> > >
> > > Hi Jim,
> > >
> > > as always you're completely right, this is what is happening:
> > >
> > > > head(a)
> > >                             top1          blup lasso enet
> > > rs4980905:184404:C:A  0.07692622 -1.881795e-04     0    0
> > > rs7978751:187541:G:C  0.62411425  9.934994e-04     0    0
> > > rs2368831:188285:C:T  0.69529158  1.211028e-03     0    0
> > > rs12830904:188335:T:A 0.92793158 -9.143555e-05     0    0
> > > rs1500098:189093:G:C  0.42032471  9.001814e-04     0    0
> > > rs79410690:190097:G:A 0.26194244  5.019037e-04     0    0
> > > >    names <- rownames(a)
> > > >    data <- data.table(names, a["blup"])
> > > > head(data)
> > >                    names V2
> > > 1:  rs4980905:184404:C:A NA
> > > 2:  rs7978751:187541:G:C NA
> > > 3:  rs2368831:188285:C:T NA
> > > 4: rs12830904:188335:T:A NA
> > > 5:  rs1500098:189093:G:C NA
> > > 6: rs79410690:190097:G:A NA
> > >
> > > So my goal is to transform what is in "a" to this for every RDat file:
> > >
> > >           rsid        weight ref_allele eff_allele
> > > 1:  rs72763981  9.376766e-09          C          G
> > > 2: rs144383755 -2.093346e-09          A          G
> > > 3:   rs1925717  1.511376e-08          T          C
> > > 4:  rs61827307 -1.625302e-08          C          A
> > > 5:  rs61827308 -1.625302e-08          G          C
> > > 6: rs199623136 -9.128354e-10         GC          G
> > >                            WGT
> > > 1: retina.ENSG00000135776.wgt.RDat
> > > 2: retina.ENSG00000135776.wgt.RDat
> > > 3: retina.ENSG00000135776.wgt.RDat
> > > 4: retina.ENSG00000135776.wgt.RDat
> > > 5: retina.ENSG00000135776.wgt.RDat
> > > 6: retina.ENSG00000135776.wgt.RDat
> > >
> > > so from rs4980905:184404:C:A I would take rs4980905 to be in column
> > > "rsid", C in column "ref_allele" and A to be in column "eff_allele",
> > > WGT column would just be filled with a name of the particular RDat
> > > file.
> > >
> > > So the issue is in these lines:
> > >
> > > a <- get(load(files[i]))
> > > names <- rownames(a)
> > > data <- data.table(names, a["blup"])
> > > nm1 <- c("rsid", "ref_allele", "eff_allele")
> > >
> > > any idea how I can rewrite this?
> > >
> > >
> > >
> > > On Tue, Dec 15, 2020 at 8:30 PM Jim Lemon <drjimlemon using gmail.com> wrote:
> > > >
> > > > Hi Ana,
> > > > I would look at "data" in your second example and see if it contains a
> > > > column named "blup" or just the values that were extracted from
> > > > a$blup. Also, I assume that weight=blup looks for an object named
> > > > "blup", which may not be there.
> > > >
> > > > Jim
> > > >
> > > > On Wed, Dec 16, 2020 at 1:20 PM Ana Marija <sokovic.anamarija using gmail.com> wrote:
> > > > >
> > > > > Hi Jim,
> > > > >
> > > > > Maybe my post is confusing.
> > > > >
> > > > > so "dd" came from my slow code and I don't use it again in parallelized code.
> > > > >
> > > > > So for example for one of my files:
> > > > >
> > > > > if
> > > > > i="retina.ENSG00000120647.wgt.RDat"
> > > > > > a <- get(load(i))
> > > > > > head(a)
> > > > >                             top1          blup lasso enet
> > > > > rs4980905:184404:C:A  0.07692622 -1.881795e-04     0    0
> > > > > rs7978751:187541:G:C  0.62411425  9.934994e-04     0    0
> > > > > rs2368831:188285:C:T  0.69529158  1.211028e-03     0    0
> > > > > ...
> > > > >
> > > > > Slow code was posted just to show what was running very slow and it
> > > > > was running. I really need help fixing parallelized version.
> > > > >
> > > > > On Tue, Dec 15, 2020 at 7:35 PM Jim Lemon <drjimlemon using gmail.com> wrote:
> > > > > >
> > > > > > Hi Ana,
> > > > > > My guess is that in your second code fragment you are assigning the
> > > > > > rownames of "a" and the _values_ contained in a$blup to the data.table
> > > > > > "data". As I don't have much experience with data tables I may be
> > > > > > wrong, but I suspect that the column name "blup" may not be visible or
> > > > > > even present in "data". I don't see it in "dd" above this code
> > > > > > fragment.
> > > > > >
> > > > > > Jim
> > > > > >
> > > > > > On Wed, Dec 16, 2020 at 11:12 AM Ana Marija <sokovic.anamarija using gmail.com> wrote:
> > > > > > >
> > > > > > > Hello,
> > > > > > >
> > > > > > > I made a terribly inefficient code which runs forever but it does run.
> > > > > > >
> > > > > > > library(dplyr)
> > > > > > > library(splitstackshape)
> > > > > > >
> > > > > > > datalist = list()
> > > > > > > files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T)
> > > > > > >
> > > > > > > for(i in files)
> > > > > > > {
> > > > > > > a<-get(load(i))
> > > > > > > names <- rownames(a)
> > > > > > > data <- as.data.frame(cbind(names,a))
> > > > > > > rownames(data) <- NULL
> > > > > > > dd=na.omit(concat.split.multiple(data = data, split.cols = c("names"),
> > > > > > > seps = ":"))
> > > > > > > dd=select(dd,names_1,blup,names_3,names_4)
> > > > > > > colnames(dd)=c("rsid","weight","ref_allele","eff_allele")
> > > > > > > dd$WGT<-i
> > > > > > > datalist[[i]] <- dd # add it to your list
> > > > > > > }
> > > > > > >
> > > > > > > big_data = do.call(rbind, datalist)
> > > > > > >
> > > > > > > There is 17345 RDat files this loop has to go through. And each file
> > > > > > > has approximately 10,000 lines. All RDat files can be downloaded from
> > > > > > > here: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115828 and
> > > > > > > they are compressed in this file: GSE115828_retina_TWAS_wgts.tar.gz .
> > > > > > > And subset of 3 of those .RDat files is here:
> > > > > > > https://github.com/montenegrina/sample
> > > > > > >
> > > > > > > For one of those files, say i="retina.ENSG00000135776.wgt.RDat"
> > > > > > > dd looks like this:
> > > > > > >
> > > > > > > > head(dd)
> > > > > > >           rsid        weight ref_allele eff_allele
> > > > > > > 1:  rs72763981  9.376766e-09          C          G
> > > > > > > 2: rs144383755 -2.093346e-09          A          G
> > > > > > > 3:   rs1925717  1.511376e-08          T          C
> > > > > > > 4:  rs61827307 -1.625302e-08          C          A
> > > > > > > 5:  rs61827308 -1.625302e-08          G          C
> > > > > > > 6: rs199623136 -9.128354e-10         GC          G
> > > > > > >                            WGT
> > > > > > > 1: retina.ENSG00000135776.wgt.RDat
> > > > > > > 2: retina.ENSG00000135776.wgt.RDat
> > > > > > > 3: retina.ENSG00000135776.wgt.RDat
> > > > > > > 4: retina.ENSG00000135776.wgt.RDat
> > > > > > > 5: retina.ENSG00000135776.wgt.RDat
> > > > > > > 6: retina.ENSG00000135776.wgt.RDat
> > > > > > >
> > > > > > > so on attempt to parallelize this I did this:
> > > > > > >
> > > > > > > library(parallel)
> > > > > > > library(data.table)
> > > > > > > library(foreach)
> > > > > > > library(doSNOW)
> > > > > > >
> > > > > > > n <-  parallel::detectCores()
> > > > > > > cl <- parallel::makeCluster(n, type = "SOCK")
> > > > > > > doSNOW::registerDoSNOW(cl)
> > > > > > > files <- list.files("/WEIGHTS1/Retina", pattern=".RDat", ignore.case=T)
> > > > > > >
> > > > > > > lst_out <- foreach::foreach(i = seq_along(files),
> > > > > > >                   .packages = c("data.table") ) %dopar% {
> > > > > > >
> > > > > > >    a <- get(load(files[i]))
> > > > > > >    names <- rownames(a)
> > > > > > >    data <- data.table(names, a["blup"])
> > > > > > >    nm1 <- c("rsid", "ref_allele", "eff_allele")
> > > > > > >    data[,  (nm1) := tstrsplit(names, ":")[-2]]
> > > > > > >    return(data[, .(rsid, weight = blup, ref_allele, eff_allele)][,
> > > > > > >                WGT := files[i]][])
> > > > > > >  }
> > > > > > > parallel::stopCluster(cl)
> > > > > > >
> > > > > > > big_data <- rbindlist(lst_out)
> > > > > > >
> > > > > > > I am getting this Error:
> > > > > > >
> > > > > > > Error in { : task 7 failed - "object 'blup' not found"
> > > > > > > > parallel::stopCluster(cl)
> > > > > > >
> > > > > > > Can you please advise,
> > > > > > > Ana
> > > > > > >
> > > > > > > ______________________________________________
> > > > > > > R-help using r-project.org mailing list -- To UNSUBSCRIBE and more, see
> > > > > > > https://stat.ethz.ch/mailman/listinfo/r-help
> > > > > > > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> > > > > > > and provide commented, minimal, self-contained, reproducible code.