[R] How to extract last value in each group

arun smartpink111 at yahoo.com
Thu Aug 15 21:40:30 CEST 2013


Speed comparison:


dat1<-structure(list(Date = c("06/01/2010", "06/01/2010", "06/01/2010", 
"06/01/2010", "06/02/2010", "06/02/2010", "06/02/2010", "06/02/2010", 
"06/02/2010", "06/02/2010", "06/02/2010"), Time = c(1358L, 1359L, 
1400L, 1700L, 331L, 332L, 334L, 335L, 336L, 337L, 338L), O = c(136.4, 
136.4, 136.45, 136.55, 136.55, 136.7, 136.75, 136.8, 136.8, 136.75, 
136.8), H = c(136.4, 136.5, 136.55, 136.55, 136.7, 136.7, 136.75, 
136.8, 136.8, 136.8, 136.8), L = c(136.35, 136.35, 136.35, 136.55, 
136.5, 136.65, 136.75, 136.8, 136.8, 136.75, 136.8), C = c(136.35, 
136.5, 136.4, 136.55, 136.7, 136.65, 136.75, 136.8, 136.8, 136.8, 
136.8), U = c(2L, 9L, 8L, 1L, 36L, 3L, 1L, 4L, 8L, 1L, 3L), D = c(12L, 
6L, 7L, 0L, 6L, 1L, 0L, 0L, 0L, 2L, 0L)), .Names = c("Date", 
"Time", "O", "H", "L", "C", "U", "D"), class = "data.frame", row.names = c(NA, 
-11L))


indx<- rep(1:nrow(dat1),1e5)
dat2<- dat1[indx,]

dat2[-c(1:11),1]<-format(rep(seq(as.Date("1080-01-01"),by=1,length.out=99999),each=11),"%m/%d/%Y")
 dat2<- dat2[order(dat2[,1],dat2[,2]),]
row.names(dat2)<-1:nrow(dat2)

library(data.table)
library(plyr)

## Functions


isLastInRun <- function(x) c(x[-1] != x[-length(x)], TRUE)
  f3 <- function(dataFrame) {
      dataFrame[ isLastInRun(dataFrame$Date), ]
  }

  f1 <- function (dataFrame) {
      dataFrame[unlist(with(dataFrame, tapply(Time, list(Date), FUN = function(x) x == max(x)))), ]
  }
  f2 <- function (dataFrame) {
      dataFrame[cumsum(with(dataFrame, tapply(Time, list(Date), FUN = which.max))), ]
  }
  
f4<- function(dataFrame){
    dataFrame[as.logical(with(dataFrame,ave(Time,Date,FUN=function(x) x==max(x)))),]
}



#Comparison

system.time(res1<-dat2[c(diff(as.numeric(as.factor(dat2$Date))),1)>0,])
 # user  system elapsed 
 # 0.500   0.000   0.501 


system.time(res2<-f3(dat2))
#  user  system elapsed 
#  0.316   0.000   0.318 

identical(res1,res2)
#[1] TRUE

system.time(res3<-f1(dat2))
 #user  system elapsed 
 # 2.272   0.000   2.278 

system.time(res4<-f2(dat2))
 # user  system elapsed 
 # 0.932   0.000   0.935 

 identical(res1,res3)
#[1] TRUE
 identical(res1,res4)
#[1] TRUE

system.time(res5<-aggregate(dat2[-1], dat2[1], tail, 1))
# user  system elapsed 
# 26.784   0.008  26.840 

row.names(res5)<- row.names(res1)
attr(res5,"row.names")<- attr(res1,"row.names")
 identical(res5,res1)
#[1] TRUE

system.time(res6<- dat2[ tapply(rownames(dat2), dat2$Date, tail, 1) , ] )
#   user  system elapsed 
#392.124   0.008 392.880 
identical(res1,res6)
#[1] TRUE

system.time(res7<- dat2[cumsum(rle(dat2[,1])$lengths),]) #shortest time
# user  system elapsed 
#  0.152   0.000   0.153 
identical(res1,res7)
#[1] TRUE
system.time(res8<-ddply(dat2, .(Date), function(df) df[which.max(df$Time),]))
#  user  system elapsed 
#195.580   1.988 197.995 
row.names(res8)<- row.names(res1)
attr(res8,"row.names")<- attr(res1,"row.names")
identical(res1,res8)
#[1] TRUE
system.time(res9<- f4(dat2))
# user  system elapsed 
#  0.764   0.000   0.767 
 identical(res1,res9)
#[1] TRUE

system.time({
dt1 <- data.table(dat2, key=c('Date', 'Time'))
 ans <- dt1[, .SD[.N], by='Date']})
#  user  system elapsed 
# 37.384   0.000  37.454 
#separate the data.table creation step:
 dt1 <- data.table(dat2, key=c('Date', 'Time'))
system.time(ans <- dt1[, .SD[.N], by='Date'])
# user  system elapsed 
# 38.500   0.000  38.566 
ans1<- as.data.frame(ans)
row.names(ans1)<- row.names(res1)
attr(ans1,"row.names")<- attr(res1,"row.names")
 identical(ans1,res1)
#[1] TRUE


A.K.





----- Original Message -----
From: Steve Lianoglou <lianoglou.steve at gene.com>
To: William Dunlap <wdunlap at tibco.com>
Cc: arun <smartpink111 at yahoo.com>; Noah Silverman <noahsilverman at ucla.edu>; R help <r-help at r-project.org>
Sent: Wednesday, August 14, 2013 5:22 PM
Subject: Re: [R] How to extract last value in each group

Or with plyr:

R> library(plyr)
R> ans <- ddply(x, .(Date), function(df) df[which.max(df$Time),])

-steve

On Wed, Aug 14, 2013 at 2:18 PM, Steve Lianoglou
<lianoglou.steve at gene.com> wrote:
> While we're playing code golf, likely faster still could be to use
> data.table. Assume your data is in a data.frame named "x":
>
> R> library(data.table)
> R> x <- data.table(x, key=c('Date', 'Time'))
> R> ans <- x[, .SD[.N], by='Date']
>
> -steve
>
> On Wed, Aug 14, 2013 at 2:01 PM, William Dunlap <wdunlap at tibco.com> wrote:
>> A somewhat faster version (for datasets with lots of dates, assuming it is sorted by date and time) is
>>   isLastInRun <- function(x) c(x[-1] != x[-length(x)], TRUE)
>>   f3 <- function(dataFrame) {
>>       dataFrame[ isLastInRun(dataFrame$Date), ]
>>   }
>> where your two suggestions, as functions, are
>>   f1 <- function (dataFrame) {
>>       dataFrame[unlist(with(dataFrame, tapply(Time, list(Date), FUN = function(x) x == max(x)))), ]
>>   }
>>   f2 <- function (dataFrame) {
>>       dataFrame[cumsum(with(dataFrame, tapply(Time, list(Date), FUN = which.max))), ]
>>   }
>>
>> Bill Dunlap
>> Spotfire, TIBCO Software
>> wdunlap tibco.com
>>
>>
>>> -----Original Message-----
>>> From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On Behalf
>>> Of arun
>>> Sent: Wednesday, August 14, 2013 1:08 PM
>>> To: Noah Silverman
>>> Cc: R help
>>> Subject: Re: [R] How to extract last value in each group
>>>
>>> Hi,
>>> Try:
>>> dat1<- read.table(text="
>>>         Date Time      O      H      L      C  U  D
>>> 06/01/2010 1358 136.40 136.40 136.35 136.35  2  12
>>> 06/01/2010 1359 136.40 136.50 136.35 136.50  9  6
>>> 06/01/2010 1400 136.45 136.55 136.35 136.40  8  7
>>> 06/01/2010 1700 136.55 136.55 136.55 136.55  1  0
>>> 06/02/2010  331 136.55 136.70 136.50 136.70  36  6
>>> 06/02/2010  332 136.70 136.70 136.65 136.65  3  1
>>> 06/02/2010  334 136.75 136.75 136.75 136.75  1  0
>>> 06/02/2010  335 136.80 136.80 136.80 136.80  4  0
>>> 06/02/2010  336 136.80 136.80 136.80 136.80  8  0
>>> 06/02/2010  337 136.75 136.80 136.75 136.80  1  2
>>> 06/02/2010  338 136.80 136.80 136.80 136.80  3  0
>>> ",sep="",header=TRUE,stringsAsFactors=FALSE)
>>>
>>>  dat1[unlist(with(dat1,tapply(Time,list(Date),FUN=function(x) x==max(x)))),]
>>> #         Date Time      O      H      L      C U D
>>> #4  06/01/2010 1700 136.55 136.55 136.55 136.55 1 0
>>> #11 06/02/2010  338 136.80 136.80 136.80 136.80 3 0
>>> #or
>>>  dat1[cumsum(with(dat1,tapply(Time,list(Date),FUN=which.max))),]
>>>          Date Time      O      H      L      C U D
>>> 4  06/01/2010 1700 136.55 136.55 136.55 136.55 1 0
>>> 11 06/02/2010  338 136.80 136.80 136.80 136.80 3 0
>>>
>>> #or
>>> dat1[as.logical(with(dat1,ave(Time,Date,FUN=function(x) x==max(x)))),]
>>>  #        Date Time      O      H      L      C U D
>>> #4  06/01/2010 1700 136.55 136.55 136.55 136.55 1 0
>>> #11 06/02/2010  338 136.80 136.80 136.80 136.80 3 0
>>> A.K.
>>>
>>>
>>>
>>>
>>> ----- Original Message -----
>>> From: Noah Silverman <noahsilverman at ucla.edu>
>>> To: "R-help at r-project.org" <r-help at r-project.org>
>>> Cc:
>>> Sent: Wednesday, August 14, 2013 3:56 PM
>>> Subject: [R] How to extract last value in each group
>>>
>>> Hello,
>>>
>>> I have some stock pricing data for one minute intervals.
>>>
>>> The delivery format is a bit odd.  The date column is easily parsed and used as an index
>>> for an its object.  However, the time column is just an integer (1:1807)
>>>
>>> I just need to extract the *last* entry for each day.  Don't actually care what time it was,
>>> as long as it was the last one.
>>>
>>> Sure, writing a big nasty loop would work, but I was hoping that someone would be able
>>> to suggest a faster way.
>>>
>>> Small snippet of data below my sig.
>>>
>>> Thanks!
>>>
>>>
>>> --
>>> Noah Silverman, M.S., C.Phil
>>> UCLA Department of Statistics
>>> 8117 Math Sciences Building
>>> Los Angeles, CA 90095
>>>
>>> --------------------------------------------------------------------------
>>>
>>>         Date Time      O      H      L      C  U  D
>>> 06/01/2010 1358 136.40 136.40 136.35 136.35   2  12
>>> 06/01/2010 1359 136.40 136.50 136.35 136.50   9   6
>>> 06/01/2010 1400 136.45 136.55 136.35 136.40   8   7
>>> 06/01/2010 1700 136.55 136.55 136.55 136.55   1   0
>>> 06/02/2010  331 136.55 136.70 136.50 136.70  36   6
>>> 06/02/2010  332 136.70 136.70 136.65 136.65   3   1
>>> 06/02/2010  334 136.75 136.75 136.75 136.75   1   0
>>> 06/02/2010  335 136.80 136.80 136.80 136.80   4   0
>>> 06/02/2010  336 136.80 136.80 136.80 136.80   8   0
>>> 06/02/2010  337 136.75 136.80 136.75 136.80   1   2
>>> 06/02/2010  338 136.80 136.80 136.80 136.80   3   0
>>> ______________________________________________
>>> R-help at r-project.org mailing list
>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>>> and provide commented, minimal, self-contained, reproducible code.
>>>
>>>
>>> ______________________________________________
>>> R-help at r-project.org mailing list
>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>>> and provide commented, minimal, self-contained, reproducible code.
>>
>> ______________________________________________
>> R-help at r-project.org mailing list
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>
>
>
> --
> Steve Lianoglou
> Computational Biologist
> Bioinformatics and Computational Biology
> Genentech



-- 
Steve Lianoglou
Computational Biologist
Bioinformatics and Computational Biology
Genentech




More information about the R-help mailing list