[R] : Quantile and rowMean from multiple files in a folder

Tue Apr 15 05:48:17 CEST 2014

Hi,
Q1 solution already sent.

Regarding Q2, one of the files in the new Observed folder doesn't have any  data (just the Year column alone).

That may be the reason for the problem.

### Q1: working directory: Observed #Only one file per Site.  Assuming this is the
### case for the full dataset, then I guess there is no need to average
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(pattern = ".csv")))

lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
    lines1 <- readLines(x2)
    header1 <- lines1[1:2]
    dat1 <- read.table(text = lines1, header = FALSE, sep = ",", stringsAsFactors = FALSE, 
        skip = 2)
    colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
    dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

lst3 <- lst2[sapply(seq_along(lst2),function(i){lstN <- sapply(lst2[[i]],function(x) is.integer(ncol(x)))})]

#difference in column number
sapply(seq_along(lst3), function(i) {
    sapply(lst3[[i]], function(x) ncol(x))
})
# 
#[1] 157 258 258  98 157 258 256 258 250 258 258 147 157 250 250 256 249 240
# [19] 181 188 256 146 117 258 153 256 255 246 255 256 258 257 145 258 258 255
# [37] 258 157 164 144 265 258 254 258 258 157 258 176 258 256 257 258 258 258
# [55] 248 258 156 258 157 157 258 258 258 258 258 148 258 258 258 258 257 258
# [73] 258 258 157 154 153 258 248 255 257 256 258 258 157 256 256 257 257 250
# [91] 257 139 155 256 256 257 257 256 258 258 257 258 258 258 258 157 157 157
#[109] 258 258 258 258 256 258 157 258 258 256 258

library(plyr)
library(stringr)

lst4 <- setNames(lapply(seq_along(lst3), function(i) {
    lapply(lst3[[i]], function(x) {
        names(x)[-1] <- paste(names(x)[-1], names(lst1)[i], sep = "_")
        names(x) <- str_trim(names(x))
        x
    })[[1]]
}), names(lst3))

df1 <- join_all(lst4, by = "Year")
dim(df1)
# [1] 9 27311

sapply(split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1])), function(x) {
    df2 <- df1[, x]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), numcolwise(function(y) quantile(y, 
        seq(0, 1, by = 0.01), na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    ncol(df3)
})
# 
#G100 G101 G102 G103 G104 G105 G106 G107 G108 G109 G110 G111 G112 G113 G114 G115 
# 157  258  258   98  157  258  256  258  250  258  258  147  157  250  250  256 
#G116 G117 G118 G119 G120 GG10 GG11 GG12 GG13 GG14 GG15 GG16 GG17 GG18 GG19 GG20 
# 249  240  181  188  256  146  117  258  153  256  255  246  255  256  258  257 
#GG21 GG22 GG23 GG24 GG25 GG26 GG27 GG28 GG29 GG30 GG31 GG32 GG33 GG34 GG35 GG36 
# 145  258  258  255  258  157  164  144  265  258  254  258  258  157  258  176 
#GG37 GG38 GG39 GG40 GG41 GG42 GG43 GG44 GG45 GG46 GG47 GG48 GG49 GG50 GG51 GG52 
# 258  256  257  258  258  258  248  258  156  258  157  157  258  258  258  258 
#GG53 GG54 GG55 GG56 GG57 GG58 GG59 GG60 GG61 GG62 GG63 GG64 GG65 GG66 GG67 GG68 
# 258  148  258  258  258  258  257  258  258  258  157  154  153  258  248  255 
#GG69 GG70 GG71 GG72 GG73 GG74 GG75 GG76 GG77 GG78 GG79 GG80 GG81 GG82 GG83 GG84 
# 257  256  258  258  157  256  256  257  257  250  257  139  155  256  256  257 
#GG85 GG86 GG87 GG88 GG89 GG90 GG91 GG92 GG93 GG94 GG95 GG96 GG97 GG98 GG99 GGG1 
# 257  256  258  258  257  258  258  258  258  157  157  157  258  258  258  258 
#GGG2 GGG3 GGG4 GGG5 GGG6 GGG7 GGG8 
# 256  258  157  258  258  256  258 

lst5 <- split(names(df1)[-1], gsub(".*\\_", "", names(df1)[-1]))

lapply(seq_along(lst5), function(i) {
    df2 <- df1[, lst5[[i]]]
    df3 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"), numcolwise(function(y) quantile(y, 
        seq(0, 1, by = 0.01), na.rm = TRUE))(df2), stringsAsFactors = FALSE)
    df3[1:3, 1:3]
    write.csv(df3, paste0(paste(getwd(), "final", paste(names(lst4)[[i]], "Quantile", 
        sep = "_"), sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})

ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))

sapply(ReadOut1, dim)[,1:3]
#     [,1] [,2] [,3]
#[1,]  101  101  101
#[2,]  157  258  258

lapply(ReadOut1, function(x) x[1:2, 1:3])[1:3]
#[[1]]
#  Percentiles pav.DJF_G100 pav.MAM_G100
#1          0%            0     0.640500
#2          1%            0     0.664604
#
#[[2]]
#  Percentiles txav.DJF_G101 txav.MAM_G101
#1          0%      -13.8756      4.742400
#2          1%      -13.8140      4.817184
#
#[[3]]
#  Percentiles txav.DJF_G102 txav.MAM_G102
#1          0%     -15.05000      4.520700
#2          1%     -14.96833      4.543828

### Q2: Observed data

dir.create("Indices")

names1 <- unlist(lapply(ReadOut1, function(x) names(x)[-1]))
names2 <- gsub("\\_.*", "", names1)
names3 <- unique(gsub("[.]", " ", names2))

res <- do.call(rbind, lapply(seq_along(lst5), function(i) {
    df2 <- df1[, lst5[[i]]]
    vec1 <- colMeans(df2, na.rm = TRUE)
    vec2 <- rep(NA, length(names3))
    names(vec2) <- paste(names3, names(lst5)[[i]], sep = "_")
    vec2[names(vec2) %in% names(vec1)] <- vec1
    names(vec2) <- gsub("\\_.*", "", names(vec2))
    vec2
}))
dim(res)
#[1] 119 264

lapply(seq_len(ncol(res)), function(i) {
    mat1 <- t(res[, i, drop = FALSE])
    colnames(mat1) <- names(lst4)
    write.csv(mat1, paste0(paste(getwd(), "Indices", gsub(" ", "_", rownames(mat1)), 
        sep = "/"), ".csv"), row.names = FALSE, quote = FALSE)
})

## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
# [1]  264

list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))][1]
# [1] 'Indices/pav_ANN.csv'
res[, "pav ANN", drop = FALSE]

ReadOut2[[1]]

Attached is the updated Quantilecode2.txt.

A.K.

On Monday, April 14, 2014 10:41 PM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:
Hi AK,
Q1) Please try to correct the error using the larger data set (Sample.zip). The issue is that once you write the codes and restrict it to smaller data sets, I find it difficult to generalize it to larger data sets.

Q2) From the Quantilecode2.txt you just sent, you forgot to do the following section using the Observed.zip file. I tried to run the code to section Q1 in Quantilecode2.txt using a larger data set and received the same error :Error in 2:nrow(lstNew) : argument of length 0. I have attached a larger data set too for you to generalize the code to suit the larger data set. Please do not forget to include the code below in the final code of Q2.

Once you fix these two, I should be able to fix the rest following these examples.

Thanks AK. Sorry for overloading you with much work.
Atem.

#==============================================================================================================
dir.create("Indices") 
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1) lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i],  length(lst1)), sep = "_")) 
write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"),  ".csv"), row.names = FALSE, quote = FALSE)
})  
## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))],  function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
# [1] 257
head(ReadOut2[[1]], 2) 

#==============================================================================================================

=================================================
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: Quantilecode2.txt
URL: <https://stat.ethz.ch/pipermail/r-help/attachments/20140414/c97a58e5/attachment-0002.txt>