[R] vectorisation

Sun Feb 3 09:33:26 CET 2013

On 02-02-2013, at 17:38, Brett Robinson <brett.robinson at 7dials.com> wrote:

> Hi
> I'm trying to set up a simulation problem without resorting to (m)any loops. I want to set entries in a data frame of zeros ('starts' in the code below) to 1 at certain points and the points have been randomly generated and stored in a separate data.frame ('sl'), which has the same number of columns.
> 
> An example of the procedure is as follows:
> ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4))
> mm <- apply(ml, 2, cumsum)
> starts<- data.frame(matrix(0,600,4))
> 
> I can achieve the result I want with a loop:
> for (i in 1:4){
> lstarts[,i][mm[,i]] <-1
> }
> 
> But as I want to use a large number of columns I would like to do away with the loop
> 
> Can anyone suggest how this might be done?

Another way is this

f2 <- function(starts, mm) {
    mn <- cbind(as.vector(mm),rep(1:ncol(mm),each=nrow(mm)))
    x <- as.matrix(starts) 
    x[mn] <- 1              
    as.data.frame(x)
}

starts2 <- f2(starts,mm)
#> identical(starts2,starts1)
# [1] TRUE

Collect all the options presented so far in functions, use the compiler package to see if that helps
and do some speed tests with Arun's parameters.

# Brett
f1 <- function(starts, mm) {
    for (i in 1:ncol(mm)){
        starts[,i][mm[,i]] <-1
    }
    starts
}

# Berend
f2 <- function(starts, mm) {
    mn <- cbind(as.vector(mm),rep(1:ncol(mm),each=nrow(mm)))
    x <- as.matrix(starts) 
    x[mn] <- 1              
    as.data.frame(x)
}

# Rui
f3 <- function(s2,mm) {
    s2[] <- lapply(seq_len(ncol(mm)), function(i) {s2[,i][mm[,i]] <- 1; s2[,i]})
    s2
}

# Arun
f4 <- function(starts,mm) {
    starts2 <- as.data.frame(do.call(cbind,lapply(1:ncol(mm),function(i) {starts[,i][mm[,i]]<-1;starts[,i]})))
    colnames(starts2)<- colnames(starts)
    starts2
}

library(compiler)
f1c <- cmpfun(f1)
f2c <- cmpfun(f2)
f3c <- cmpfun(f3)
f4c <- cmpfun(f4)

library(rbenchmark)

# Arun's test
set.seed(11)
starts <- data.frame(matrix(0,1e6,4))
ml <- data.frame(matrix(sample(1:1e4,1e3, replace=TRUE),100,4))
mm <- apply(ml, 2, cumsum)

z1 <- f1(starts,mm)
z2 <- f2(starts,mm)
z3 <- f3(starts,mm)
z4 <- f4(starts,mm)
z1c <- f1c(starts,mm)
z2c <- f2c(starts,mm)
z3c <- f3c(starts,mm)
z4c <- f4c(starts,mm)

identical(z2,z1)
identical(z3,z1)
identical(z4,z1)
identical(z1c,z1)
identical(z2c,z1)
identical(z3c,z1)
identical(z4c,z1)

benchmark( f1(starts,mm) , f2(starts,mm),
           f1c(starts,mm), f2c(starts,mm),
           f3(starts,mm) , f4(starts,mm),
           f3c(starts,mm), f4c(starts,mm),
           replications=1,order="relative", columns=c("test","relative","elapsed","replications"))

Result:

# > identical(z2,z1)
# [1] TRUE
# > identical(z3,z1)
# [1] TRUE
# > identical(z4,z1)
# [1] TRUE
# > identical(z1c,z1)
# [1] TRUE
# > identical(z2c,z1)
# [1] TRUE
# > identical(z3c,z1)
# [1] TRUE
# > identical(z4c,z1)
# [1] TRUE
# > 
# > benchmark( f1(starts,mm) , f2(starts,mm),
# +            f1c(starts,mm), f2c(starts,mm),
# +            f3(starts,mm) , f4(starts,mm),
# +            f3c(starts,mm), f4c(starts,mm),
# +            replications=1,order="relative", columns=c("test","relative","elapsed","replications"))
#              test relative elapsed replications
# 2  f2(starts, mm)    1.000   0.195            1
# 4 f2c(starts, mm)    1.005   0.196            1
# 1  f1(starts, mm)    2.990   0.583            1
# 3 f1c(starts, mm)    3.082   0.601            1
# 7 f3c(starts, mm)    3.903   0.761            1
# 5  f3(starts, mm)    3.949   0.770            1
# 8 f4c(starts, mm)    4.436   0.865            1
# 6  f4(starts, mm)    4.462   0.870            1

Compiling doesn't deliver significant speed gains in this case.
Function f2 is the quickest.

Berend