[R] Quantile and rowMean from multiple files in a folder

Tue Apr 15 03:13:22 CEST 2014

   Hi AK,

   Thanks very much.

   I  did  send  you  another  email  with  a larger Sample.zip file. The
   Quantilecode.R which you initially developed for a smaller sample.zip did
   not complete the task when I used it for a larger data set. Please check to
   rectify the error message.

   Thanks,

   Atem.
   ------ Original Message ------

     From : arun
     To : R. Help;
     Cc : Zilefac Elvis;
     Sent : 14-04-2014 18:57
     Subject : Re: Quantile and rowMean from multiple files in a folder

Hi Atem,

I guess this is what you wanted.

###Q1: 
###
###working directory: Observed
 #Only one file per Site.  Assuming this is the case for the full dataset, then
 I guess there is no need to average

dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(patter
n = ".csv")))

lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(
x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",
",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(head
er1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))

#different number of rows
 sapply(seq_along(lst2),function(i){lstN <- lapply(lst2[[i]],function(x) x[,-1]
);sapply(lstN,function(x) nrow(x))})
 #[1] 9 9 9 8 2 9

#difference in number of columns
sapply(seq_along(lst2),function(i) {sapply(lst2[[i]],function(x) ncol(x))})
 #[1] 157 258 258  98 157 258

library(plyr)
library(stringr)

lst3 <- setNames(lapply(seq_along(lst2),function(i) {lapply(lst2[[i]],function(
x) {names(x)[-1] <- paste(names(x)[-1], names(lst1)[i],sep="_"); names(x) <- st
r_trim(names(x)); x})[[1]]}), names(lst1)) 

df1 <- join_all(lst3,by="Year")
dim(df1)
 #[1]    9 1181 

sapply(split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1])),function(x) {df2 
<- df1[,x];df3 <- data.frame(Percentiles=paste0(seq(0,100, by=1) ,"%"), numcolw
ise(function(y) quantile(y,seq(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=
FALSE);ncol(df3) })
 #G100 G101 G102 G103 G104 G105 
# 157  258  258   98  157  258 

lst4 <- split(names(df1)[-1] ,gsub(".*\\_","",names(df1)[-1]))

lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]]; df3 <- data.frame(P
ercentiles=paste0(seq(0,100, by=1) ,"%"), numcolwise(function(y) quantile(y,seq
(0,1,by=0.01),na.rm=TRUE))(df2),stringsAsFactors=FALSE);df3[1:3,1:3]; write.csv
(df3,paste0(paste(getwd(), "final",paste(names(lst1)[[i]],"Quantile",sep="_"),s
ep="/"),".csv"),row.names=FALSE,quote=FALSE)}) 

ReadOut1 <- lapply(list.files(recursive=TRUE)[grep("Quantile",list.files(recurs
ive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

sapply(ReadOut1,dim)
#     [,1] [,2] [,3] [,4] [,5] [,6]
 #[1,]  101  101  101  101  101  101 
#[2,]  157  258  258   98  157  258

lapply(ReadOut1,function(x) x[1:2,1:3])[1:3]
 #[[1]] 
#  Percentiles pav.DJF_G100 pav.MAM_G100 
#1          0%            0     0.640500 
#2          1%            0     0.664604 
# 
#[[2]] 
#  Percentiles txav.DJF_G101 txav.MAM_G101
 #1          0%      -13.8756      4.742400 
#2          1%      -13.8140      4.817184
 #
 #[[3]] 
#  Percentiles txav.DJF_G102 txav.MAM_G102
 #1          0%     -15.05000      4.520700
 #2          1%     -14.96833      4.543828 
#####
###Q2: 
###Observed data 

dir.create("Indices")
 names1 <- unlist(lapply(ReadOut1,function(x)
 names(x)[-1])) 
names2 <-  gsub("\\_.*","",names1)
 names3 <- unique(gsub("[.]", " ", names2)) 

res <- do.call(rbind,lapply(seq_along(lst4),function(i) {df2 <- df1[,lst4[[i]]]
;vec1 <- colMeans(df2,na.rm=TRUE); vec2 <- rep(NA,length(names3));names(vec2) <
- paste(names3,names(lst4)[[i]],sep="_"); vec2[names(vec2) %in% names(vec1)] <-
 vec1; names(vec2) <- gsub("\\_.*","",names(vec2)); vec2  }))

lapply(seq_len(ncol(res)),function(i) {mat1 <- t(res[,i,drop=FALSE]);colnames(m
at1) <- names(lst4); write.csv(mat1,paste0(paste(getwd(),"Indices", gsub(" ","_
",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})

##Output2:
ReadOut2 <- lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recursi
ve=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

length(ReadOut2) 

#[1] 257

list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1]
#[1] "Indices/pav_ANN.csv" 

res[,"pav ANN",drop=FALSE] 

#      pav ANN
#[1,] 1.298811
#[2,] 7.642922 

#[3,] 6.740011 

#[4,]       NA
#[5,] 1.296650 

#[6,] 6.887622 

ReadOut2[[1]]
#      G100     G101     G102 G103    G104     G105
#1 1.298811 7.642922 6.740011   NA 1.29665 6.887622 

###Sample data 

###Working directory changed to "sample" 

dir.create("Indices_colMeans")

lst1 <- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".c
sv"))) 

lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- readLines(
x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE,sep=",
",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strsplit(head
er1,","));dat1[-c(nrow(dat1),nrow(dat1)-1),]}))
res1 <- do.call(rbind,lapply(seq_along(lst2),function(i) {rowMeans(do.call(cbin
d,lapply(lst2[[i]],function(x) colMeans(x[,-1],na.rm=TRUE))),na.rm=TRUE) })) 

lapply(seq_len(ncol(res1)),function(i){mat1 <- t(res1[,i,drop=FALSE]); colnames
(mat1) <- names(lst2);write.csv(mat1,paste0(paste(getwd(),"Indices_colMeans",gs
ub(" ","_",rownames(mat1)),sep="/"),".csv"),row.names=FALSE,quote=FALSE)})

##Output2 Sample
ReadOut2S <- lapply(list.files(recursive=TRUE)[grep("Indices",list.files(recurs
ive=TRUE))],function(x) read.csv(x,header=TRUE,stringsAsFactors=FALSE)) 

length(ReadOut2S)
#[1] 257

list.files(recursive=TRUE)[grep("Indices",list.files(recursive=TRUE))][1] 

#[1] "Indices_colMeans/pav_ANN.csv" 

res1[,"pav ANN",drop=FALSE] 

#      pav ANN
#[1,] 1.545620
#[2,] 1.518553 

ReadOut2S[[1]] 

#     G100     G101 

#1 1.54562 1.518553 

A.K.

On Monday, April 14, 2014 1:05 AM, Zilefac Elvis  wrote:

Hi AK,

Q1) Please apply the Quantilecode.R to Observed.zip (attached). I tried but rec
eived an error which was self-explanatory but I could not change the dimensions
 in the code.

Q2) Please apply Quantilecode.R to both sample.zip and observed.zip. Here, inst
ead of doing quantile(y, seq(0, 1, by = 0.01), take colMeans of the indices. 

I have tried to solve both Q1 and Q2 but still unable to control the dimensions
.

Thanks,
Atem.
On Sunday, April 13, 2014 9:05 AM, arun  wrote:

Hi Atem,

On my end, the codes are not formatted in the email as seen in the screen of fo
rmatR GUI.

I am attaching the .R file in case there is some difficulty for you.
Arun

On Sunday, April 13, 2014 10:54 AM, arun  wrote:
Hi,

I am formatting the codes using library(formatR).  Hopefully, it will not be ma
ngled in the email.
dir.create("final")
lst1 <- split(list.files(pattern = ".csv"), gsub("\\_.*", "", list.files(patter
n = ".csv")))

lst2 <- lapply(lst1, function(x1) lapply(x1, function(x2) { lines1 <- readLines
(x2) header1 <- lines1[1:2] dat1 <- read.table(text = lines1, header = FALSE, s
ep = ",", stringsAsFactors = FALSE,  skip = 2) colnames(dat1) <- Reduce(paste, 
strsplit(header1, ",")) dat1[-c(nrow(dat1), nrow(dat1) - 1), ]
}))

library(plyr) 

lapply(seq_along(lst2), function(i) { lstN <- lapply(lst2[[i]], function(x) x[,
 -1]) lstQ1 <- lapply(lstN, function(x) numcolwise(function(y) quantile(y, seq(
0, 1,  by = 0.01), na.rm = TRUE))(x)) arr1 <- array(unlist(lstQ1), dim = c(dim(
lstQ1[[1]]), length(lstQ1)), dimnames = list(NULL,  lapply(lstQ1, names)[[1]]))
 res <- rowMeans(arr1, dims = 2, na.rm = TRUE) colnames(res) <- gsub(" ", "_", 
colnames(res)) res1 <- data.frame(Percentiles = paste0(seq(0, 100, by = 1), "%"
), res, stringsAsFactors = FALSE) write.csv(res1, paste0(paste(getwd(), "final"
, paste(names(lst1)[[i]], "Quantile",  sep = "_"), sep = "/"), ".csv"), row.nam
es = FALSE, quote = FALSE)
})

ReadOut1 <- lapply(list.files(recursive = TRUE)[grep("Quantile", list.files(rec
ursive = TRUE))],  function(x) read.csv(x, header = TRUE, stringsAsFactors = FA
LSE))
sapply(ReadOut1,
dim)
#     [,1] [,2]
#[1,]  101  101
#[2,]  258  258

lapply(ReadOut1,function(x) x[1:2,1:3])
#[[1]]
#  Percentiles  txav_DJF txav_MAM
#1          0% -12.68566  7.09702
#2          1% -12.59062  7.15338
#
#[[2]]
#  Percentiles  txav_DJF txav_MAM
#1          0% -12.75516 6.841840
#2          1% -12.68244 6.910664 

###Q2:

dir.create("Indices")
names1 <- lapply(ReadOut1, function(x) names(x))[[1]]
lstNew <- simplify2array(ReadOut1)
lapply(2:nrow(lstNew), function(i) { dat1 <- data.frame(lstNew[1], do.call(cbin
d, lstNew[i, ]), stringsAsFactors = FALSE) colnames(dat1) <- c(rownames(lstNew)
[1], paste(names(lst1), rep(rownames(lstNew)[i],  length(lst1)), sep = "_")) wr
ite.csv(dat1, paste0(paste(getwd(), "Indices", rownames(lstNew)[i], sep = "/"),
  ".csv"), row.names = FALSE, quote = FALSE)
}) ## Output2:
ReadOut2 <- lapply(list.files(recursive = TRUE)[grep("Indices", list.files(recu
rsive = TRUE))],  function(x) read.csv(x, header = TRUE, stringsAsFactors = FAL
SE))
length(ReadOut2)
# [1] 257 

head(ReadOut2[[1]], 2)
#  Percentiles G100_pav_ANN G101_pav_ANN
#1          0%     1.054380     1.032740
#2          1% 
   1.069457     1.045689 

A.K.

On Sunday, April 13, 2014 2:46 AM, Zilefac Elvis  wrote:

Hi AK,
Q1) I need your help again. Using the previous data (attached) and the previous
 code below,instead of taking rowMeans, let's do quantile(x,seq(0,1,by=0.01)). 

Delete the last 2 rows (Trend and p<) in each file before doing quantile(x,seq(
0,1,by=0.01)).

For example, assume that I want to
calculate quantile(x,seq(0,1,by=0.01)) for each column of Site G100. I will do 
so for the 5 sims of site G100 and then take their average. This will be approx
imately close to the true value than just calculating quantile(x,seq(0,1,by=0.0
1)) from one sim. Please do this same thing for all the files.

So, when you do rowMeans, it should be the mean of quantile(x,seq(0,1,by=0.01))
 calculated from all sims in that Site.

Output

The number of files in "final" remains the same (2 files). The "Year" column(wi
ll be replaced) will contain  the names of quantile(x,seq(0,1,by=0.01)) such as
  0%           1%           2%           3%           4%           5%          
 6%, ..., 98%    
     99%         100% . You can give this column any name such as "Percentiles"
.

Q2)  From the folder "final", please go to each file identified by site name, t
ake a column, say col1 of txav  from each file, create a dataframe whose colnam
es are site codes (names of files in "final"). Create a folder called "Indices"
 and place this dataframe in it. The filename for the dataframe is txav, say. S
o, in "Indices", you will have one file having 3 columns [, c(Percentiles, G100
,G101)]. The idea is that I want to be able to pick any column from files in "f
inal" and form a dataframe from which I will generate my qqplot or boxplot.

Thanks very much AK.
Atem
This should be the final step of this my drama, at least for now.
#==============================================================================
================================

dir.create("final")
lst1 <- split(list.files(pattern=".csv"),gsub("\\_.*","",list.files(pattern=".c
sv"))) lst2 <-  lapply(lst1,function(x1) lapply(x1, function(x2) {lines1 <- rea
dLines(x2); header1 <- lines1[1:2]; dat1 <- read.table(text=lines1,header=FALSE
,sep=",",stringsAsFactors=FALSE, skip=2); colnames(dat1) <- Reduce(paste,strspl
it(header1,","));dat1}))

lstYear <- lapply(lst2,function(x) lapply(x, function(y) y[,1,drop=FALSE])[[1]]
) 

lapply(seq_along(lst2),function(i) {lstN <-lapply(lst2[[i]],function(x) x[,-1])
; arr1 <- array(unlist(lstN),dim=c(dim(lstN[[1]]),length(lstN)),dimnames=list(N
ULL,lapply(lstN,names)[[1]]));res <-
cbind(lstYear[[i]],rowMeans(arr1,dims=2,na.rm=TRUE)); names(res) <- gsub("\\_$"
,"",gsub(" ", "_",names(res))); res[,1] <- gsub(" <", "",res[,1]); write.csv(re
s,paste0(paste(getwd(),"final",names(lst1)
[[i]],sep="/"),".csv"),row.names=FALSE,quote=FALSE)  }) 

#==============================================================================
======================