[R] : Quantile and rowMean from multiple files in a folder

Tue Apr 15 05:04:20 CEST 2014

Hi,
It is because of different dimensions of Simulation data  within each Site.
Try:
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(pattern = ".csv")))
sapply(lst1,length)
#G100 G101 G102 G103 G104 G105 G106 G107 G108 G109 G110 G111 G112 G113 G114 G115 
# 100  100  100  100  100  100  100  100  100  100  100  100  100  100  100  100 
#G116 G117 G118 G119 G120 GG10 GG11 GG12 GG13 GG14 GG15 GG16 GG17 GG18 GG19 GG20 
# 100  100  100  100  100  100  100  100  100  100  100  100  100  100  100  100 
#GG21 GG22 GG23 GG24 GG25 GG26 GG27 GG28 
# 100  100  100  100  100  100  100  100 

lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) {
    lines1 <- readLines(x2)
    header1 <- lines1[1:2]
    dat1 <- read.table(text = lines1, header = FALSE, sep = ",", stringsAsFactors = FALSE, 
        skip = 2)
    colnames(dat1) <- Reduce(paste, strsplit(header1, ","))
    dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

##dimensions differ within each Site
sapply(lst2,function(x) sapply(x,ncol))[1:6,5:8]
#     G104 G105 G106 G107
#[1,]  258  257  258  258
#[2,]  258  258  258  258
#[3,]  258  258  258  258
#[4,]  258  257  258  258
#[5,]  258  258  258  258
#[6,]  258  258  258  258

##number of rows are consistent
sapply(lst2,function(x) any(sapply(x,nrow)!=9))
# G100  G101  G102  G103  G104  G105  G106  G107  G108  G109  G110  G111  G112 
#FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
# G113  G114  G115  G116  G117  G118  G119  G120  GG10  GG11  GG12  GG13  GG14 
#FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
# GG15  GG16  GG17  GG18  GG19  GG20  GG21  GG22  GG23  GG24  GG25  GG26  GG27 
#FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
# GG28 
#FALSE 
names1 <- unique(unlist(lapply(lst2,function(x) unlist(lapply(x,function(y) names(y)[-1])))))
length(names1)
#[1] 257

# lstYear <- lapply(lst2,function(x) lapply(x, function(y)
# y[,1,drop=FALSE])[[1]])

library(plyr)

lapply(seq_along(lst2),function(i) {lstN <- lapply(lst2[[i]],function(x) {datN <- as.data.frame(matrix(NA, nrow=9, ncol=length(names1),dimnames=list(NULL,names1)));datN[,names1] <- x[,-1]; datN }); lstQ1 <- lapply(lstN,function(x) numcolwise(function(y) quantile(y,seq(0,1,by=0.01), na.rm=TRUE))(x)); arr1 <- array(unlist(lstQ1), dim=c(dim(lstQ1[[1]]),length(lstQ1)),dimnames=list(NULL,lapply(lstQ1,names)[[1]])); res <- rowMeans(arr1, dims=2, na.rm=TRUE); colnames(res) <- gsub(" ", "_", colnames(res)); res1 <- data.frame(Percentiles=paste0(seq(0,100, by=1),"%"),res, stringsAsFactors=FALSE); write.csv(res1,paste0(paste(getwd(), "final", paste(names(lst1)[[i]], "Quantile", sep="_"), sep= "/"), ".csv"), row.names=FALSE, quote=FALSE)})

## output files
list.files(recursive = TRUE)[grep("Quantile", list.files(recursive = TRUE))]
#[1] "final/G100_Quantile.csv" "final/G101_Quantile.csv"
#[3] "final/G102_Quantile.csv" "final/G103_Quantile.csv"
#[5] "final/G104_Quantile.csv" "final/G105_Quantile.csv"
#[7] "final/G106_Quantile.csv" "final/G107_Quantile.csv"
#[9] "final/G108_Quantile.csv" "final/G109_Quantile.csv"
#[11] "final/G110_Quantile.csv" "final/G111_Quantile.csv"
#[13] "final/G112_Quantile.csv" "final/G113_Quantile.csv"
#[15] "final/G114_Quantile.csv" "final/G115_Quantile.csv"
#[17] "final/G116_Quantile.csv" "final/G117_Quantile.csv"
#[19] "final/G118_Quantile.csv" "final/G119_Quantile.csv"
#[21] "final/G120_Quantile.csv" "final/GG10_Quantile.csv"
#[23] "final/GG11_Quantile.csv" "final/GG12_Quantile.csv"
#[25] "final/GG13_Quantile.csv" "final/GG14_Quantile.csv"
#[27] "final/GG15_Quantile.csv" "final/GG16_Quantile.csv"
#[29] "final/GG17_Quantile.csv" "final/GG18_Quantile.csv"
#[31] "final/GG19_Quantile.csv" "final/GG20_Quantile.csv"
#[33] "final/GG21_Quantile.csv" "final/GG22_Quantile.csv"
#[35] "final/GG23_Quantile.csv" "final/GG24_Quantile.csv"
#[37] "final/GG25_Quantile.csv" "final/GG26_Quantile.csv"
#[39] "final/GG27_Quantile.csv" "final/GG28_Quantile.csv"

ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
sapply(ReadOut1,function(x) dim(x))
#     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
#[1,]  101  101  101  101  101  101  101  101  101   101   101   101   101   101
#[2,]  258  258  258  258  258  258  258  258  258   258   258   258   258   258
#     [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
#[1,]   101   101   101   101   101   101   101   101   101   101   101   101
#[2,]   258   258   258   258   258   258   258   258   258   258   258   258
#     [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38]
#[1,]   101   101   101   101   101   101   101   101   101   101   101   101
#[2,]   258   258   258   258   258   258   258   258   258   258   258   258
#     [,39] [,40]
#[1,]   101   101
#[2,]   258   258

ReadOut1[[1]][1:3,1:3]
#  Percentiles  txav_DJF txav_MAM
#1          0% -12.56619 6.795429
#2          1% -12.45888 6.864886
#3          2% -12.35157 6.934344

### Q2:
dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)
nrow(lstNew)
#[1] 258

lapply(2:nrow(lstNew), function(i) {
    dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE)
    colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i], 
        length(lst1)), sep = "_"))
    write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"), 
        ".csv"), row.names = FALSE, quote = FALSE)
})

## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recursive = TRUE))], 
    function(x) read.csv(x, header = TRUE, stringsAsFactors = FALSE))
length(ReadOut2)
#[1] 257

head(ReadOut2[[1]], 2)
#Percentiles G100_pav_ANN G101_pav_ANN G102_pav_ANN G103_pav_ANN G104_pav_ANN
#1          0%     0.978451    0.9517680    0.9383280    0.8519280    0.9438790
#2          1%     0.992648    0.9638816    0.9480754    0.8625262    0.9548512
#  G105_pav_ANN G106_pav_ANN G107_pav_ANN G108_pav_ANN G109_pav_ANN G110_pav_ANN
#1    0.9303260    0.7484670    0.9757010     1.049533    0.9841290    0.7778830
#2    0.9417438    0.7594563    0.9868968     1.063668    0.9968095    0.7882509
#  G111_pav_ANN G112_pav_ANN G113_pav_ANN G114_pav_ANN G115_pav_ANN G116_pav_ANN
#1     0.737651    0.8813010    0.9155330     0.829001    0.6778760    0.5463310
#2     0.746934    0.8924871    0.9265448     0.838534    0.6880397    0.5527359
#  G117_pav_ANN G118_pav_ANN G119_pav_ANN G120_pav_ANN GG10_pav_ANN GG11_pav_ANN
#1    0.7191360    0.7470170    0.7859380    0.7774590    0.6303150    0.5200200
#2    0.7278231    0.7556053    0.7975213    0.7852408    0.6381671    0.5258248
#  GG12_pav_ANN GG13_pav_ANN GG14_pav_ANN GG15_pav_ANN GG16_pav_ANN GG17_pav_ANN
#1    0.6672890     0.851834    0.5209710    0.6445290    0.5874320    0.7263650
#2    0.6761913     0.861177    0.5282514    0.6520456    0.5948674    0.7365299
#  GG18_pav_ANN GG19_pav_ANN GG20_pav_ANN GG21_pav_ANN GG22_pav_ANN GG23_pav_ANN
#1    0.6642220    0.5385440    0.5043320    0.7484140    0.6436940     0.541165
#2    0.6729234    0.5454527    0.5120815    0.7575216    0.6502167     0.549040
#  GG24_pav_ANN GG25_pav_ANN GG26_pav_ANN GG27_pav_ANN GG28_pav_ANN
#1    0.5067010    0.7082260    0.6447260    0.6197480    0.9163480
#2    0.5136588    0.7160864    0.6545266    0.6278891    0.9284303 

Also, atttached is the script in case the email mangles the code.

A.K.

On Monday, April 14, 2014 6:26 PM, Zilefac Elvis <zilefacelvis at yahoo.com> wrote:

Hi AK,
I have another request for help.
Attached is a larger file (~27MB) for sample.zip. All files are same as previous except that I am using more sites to do the same thing that you did with sample.zip.

When generalizing Quantilecode.R to many sites, I receive an error when I run:

dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)

lapply(2:nrow(lstNew), function(i) {
  dat1 <- data.frame(lstNew[1], do.call(cbind, lstNew[i, ]), stringsAsFactors = FALSE)
  colnames(dat1) <- c(rownames(lstNew)[1], paste(names(lst1), rep(rownames(lstNew)[i], 
                                                                  length(lst1)), sep = "_"))
  write.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"), 
                         ".csv"), row.names = FALSE, quote = FALSE)
})

and I get this:
Error in 2:nrow(lstNew) : argument of length 0

I have tried a few tricks but could not overcome the error message.

Please help!
Atem.
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: Quantilecode.txt
URL: <https://stat.ethz.ch/pipermail/r-help/attachments/20140414/2628a0b7/attachment-0002.txt>