# [R] Loop for taking sum of rows based on proximity to other non-NA rows

Jeff Newmiller jdnewmil at dcn.davis.ca.us
Mon Oct 21 01:49:32 CEST 2013

```Looks like a right parenthesis was dropped. Corrected:

removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
N1 <- N+1
rx <- rle( !is.na( dat\$Count ) )
# indexes of the ends of each run of NAs or non-NAs
cs <- cumsum( rx\$lengths )
# indexes of the ends of runs of NAs or non-NAs
cs2 <- cs[ !rx\$values ]
# If the first Count is NA, then drop first run of NAs
if ( !rx\$values ) {
cs2 <- cs2[ -1 ]
}
# If the last Count is NA, then drop last run of NAs
if ( !rx\$values[ length( rx\$values ) ] ) {
cs2 <- cs2[ -length( cs2 ) ]
}
# cs2 is indexes of rows to potentially receive deleted Counts
# after collapse
cs2 <- cs2 + 1
# cs1 is indexes of non-NA Counts to be deleted
cs1 <- cs[ rx\$values ][ seq.int( length( cs2 ) ) ]
# identify the indexes of the Count values before the strings
# of NAs that meet the criteria
if ( lessOrEqual ) {
idx0 <- N1 >= ( cs2 - cs1 )
} else {
idx0 <- N1 == ( cs2 - cs1 )
}
idx1 <- cs1[ idx0 ]
# identify the indexes of the Count values after the strings of
# NAs that meet the criteria
idx2 <- cs2[ idx0 ]
# Identify which indexes are both sources and destinations
idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
# identify groups of indexes that need to be merged
idx1g <- rev( cumsum( rev( !idx1c ) ) )
# find which elements of idx1 represent the beginning of a
# sequence of indexes to be replaced (meta-indexes)
srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
# find which elements of idx2 represent the end of a sequence
# to be  replaced (meta-indexes)
destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
# add counts from before NAs to destination rows
result <- dat
srcidxList <- vector( mode="list", length=length( destmidxs ) )
for ( i in seq.int( length( destmidxs ) ) ) {
# row to which data will be copied
destidx <- idx2[ destmidxs[ i ] ]
# sequence of indexes of source rows
srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
+ sum( dat[ srcidxss, "Count" ], na.rm=TRUE ) )
# keep a list of indexes to be removed
srcidxList[ i ] <- list( srcidxss )
}
# remove source rows
result <- result[ -unlist( srcidxList ), ]
result
}

On Sun, 20 Oct 2013, Jeff Newmiller wrote:

> I thought this question looked interesting enough to make my own stab at it,
> but in hindsight I think this business of combining the counts seems quite
> unlikely to be necessary... it would be simpler and less damaging to the
> original data pattern to just remove groups of rows having fewer than "N"
> NAs.
>
> removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
>  N1 <- N+1
>  rx <- rle( !is.na( dat\$Count ) )
>  # indexes of the ends of each run of NAs or non-NAs
>  cs <- cumsum( rx\$lengths )
>  # indexes of the ends of runs of NAs or non-NAs
>  cs2 <- cs[ !rx\$values ]
>  # If the first Count is NA, then drop first run of NAs
>  if ( !rx\$values ) {
>    cs2 <- cs2[ -1 ]
>  }
>  # If the last Count is NA, then drop last run of NAs
>  if ( !rx\$values[ length( rx\$values ) ] ) {
>    cs2 <- cs2[ -length( cs2 ) ]
>  }
>  # cs2 is indexes of rows to potentially receive deleted Counts
>  # after collapse
>  cs2 <- cs2 + 1
>  # cs1 is indexes of non-NA Counts to be deleted
>  cs1 <- cs[ rx\$values ][ seq.int( length( cs2 ) ) ]
>  # identify the indexes of the Count values before the strings
>  # of NAs that meet the criteria
>  if ( lessOrEqual ) {
>    idx0 <- N1 >= ( cs2 - cs1 )
>  } else {
>    idx0 <- N1 == ( cs2 - cs1 )
>  }
>  idx1 <- cs1[ idx0 ]
>  # identify the indexes of the Count values after the strings of
>  # NAs that meet the criteria
>  idx2 <- cs2[ idx0 ]
>  # Identify which indexes are both sources and destinations
>  idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
>  # identify groups of indexes that need to be merged
>  idx1g <- rev( cumsum( rev( !idx1c ) ) )
>  # find which elements of idx1 represent the beginning of a
>  # sequence of indexes to be replaced (meta-indexes)
>  srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
>  # find which elements of idx2 represent the end of a sequence
>  # to be  replaced (meta-indexes)
>  destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
>  # add counts from before NAs to destination rows
>  result <- dat
>  srcidxList <- vector( mode="list", length=length( destmidxs ) )
>  for ( i in seq.int( length( destmidxs ) ) ) {
>    # row to which data will be copied
>    destidx <- idx2[ destmidxs[ i ] ]
>    # sequence of indexes of source rows
>    srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
>    result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
>                       + sum( dat[ srcidxss, "Count" ], na.rm=TRUE )
>    # keep a list of indexes to be removed
>    srcidxList[ i ] <- list( srcidxss )
>  }
>  # remove source rows
>  result <- result[ -unlist( srcidxList ), ]
>  result
> }
>
>
> On Fri, 18 Oct 2013, arun wrote:
>
>>
>>
>> Hi,
>>
>> Found a bug in the function when tested.  So, try this (added one more
>> line):
>>
>> #Modified function
>> fun1 <- function(dat,n) {
>>  rl <- rle(is.na(dat[,"Count"]))
>> indx <-
>> which(is.na(dat[,"Count"]))[rep(rl\$lengths[rl\$values],rl\$lengths[rl\$values])==n]
>>  lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
>>                          x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
>>                      x2 <- x1[!is.na(x1\$Count),]
>>                      datN <-
>> data.frame(Position=max(x2\$Position),Count=sum(x2\$Count))
>>                      rowN <- row.names(x2)[x2\$Position %in%
>> max(x2\$Position)]
>>                      row.names(datN) <- if(length(rowN)>1) rowN else
>> rowN
>>                      datN
>>                     })
>> names(lst1) <- NULL
>> dat2 <- do.call(rbind,lst1)
>> indx2 <-
>> sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x)
>> c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
>>
>> dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
>> dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
>> row.names(dat1New) <- 1:nrow(dat1New)
>> dat1New
>> }
>>
>>
>>
>> #Another function
>> fun2 <- function(dat,n){
>>  indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
>>  indx1 <- indx[is.na(dat[,"Count"])]
>>  names(indx1) <- which(is.na(dat[,"Count"]))
>> indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
>> lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
>>                             x1 <- indx2[i]
>>                              x2 <- as.numeric(names(x1))
>>                              x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
>>                              x4 <- subset(x3, !is.na(Count))
>>                              x5 <-
>> data.frame(Position=max(x4\$Position),Count=sum(x4\$Count))
>>                             ind <- x4\$Position %in% max(x4\$Position)
>>                              row.names(x5) <- if(sum(ind)>1)
>> row.names(x4)[ind] else row.names(x4)[ind]
>>                             x5
>>                         })
>> attr(lst1,"dimnames") <- NULL
>>  dat2 <- do.call(rbind,lst1)
>> indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i)
>> {x1 <- indx2[i]
>>                                      x2 <- as.numeric(names(x1))
>>                                      c(min(x2)-1L, x2,
>> max(x2)+1L)}),use.names=FALSE))
>>
>> dat\$id <- 1:nrow(dat)
>> dat2\$id <- as.numeric(row.names(dat2))
>> library(plyr)
>> res <- join(dat,dat2[,-1],by="id",type="left")
>> res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
>> res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
>> res2 <- res1[,1:2]
>> row.names(res2) <- 1:nrow(res2)
>> res2
>> }
>>
>>
>> identical(fun1(dat1,1),fun2(dat1,1))
>> # TRUE
>> identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
>> # TRUE
>>
>> identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
>> # TRUE
>>
>>
>> #Speed
>> set.seed(185)
>> datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count=
>> sample(c(NA, 10:100),1e5, replace=TRUE))
>>  system.time(res <- fun1(datT,1))
>>  #  user  system elapsed
>>  # 0.676   0.000   0.676
>>  system.time(res2 <- fun2(datT,1))
>> #   user  system elapsed
>> #  1.240   0.000   1.237
>>  identical(res,res2)
>> # TRUE
>>
>> A.K.
>>
>>
>>
>>
>>
>>
>>
>>
>>
>> On Friday, October 18, 2013 4:19 PM, arun <smartpink111 at yahoo.com> wrote:
>> Hi,
>>
>> May be this helps:
>>
>> dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L,
>> 14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L,
>> 29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA,
>> NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
>> )), .Names = c("Position", "Count"), class = "data.frame", row.names =
>> c(NA,
>> -23L))
>>
>>
>> #There might be simple solutions.
>>
>> fun1 <- function(dat,n) {
>>  rl <- rle(is.na(dat[,"Count"]))
>> indx <-
>> which(is.na(dat[,"Count"]))[rep(rl\$lengths[rl\$values],rl\$lengths[rl\$values])==n]
>>  lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
>>                          x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
>>                      x2 <- x1[!is.na(x1\$Count),]
>>                      datN <-
>> data.frame(Position=max(x2\$Position),Count=sum(x2\$Count))
>>                      rowN <- row.names(x2)[x2\$Position %in%
>> max(x2\$Position)]
>>                      row.names(datN) <- if(length(rowN)>1) rowN else
>> rowN
>>                      datN
>>                     })
>> names(lst1) <- NULL
>> dat2 <- do.call(rbind,lst1)
>> indx2 <-
>> sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x)
>> c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
>>
>> dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
>> dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
>> row.names(dat1New) <- 1:nrow(dat1New)
>> dat1New
>> }
>>
>> dat1N <- fun1(dat1,1)
>> dat1N
>>    Position Count
>> 1        15    15
>> 2        22    NA
>> 3        38    NA
>> 4        61    22
>> 5        62    18
>> 6        14    NA
>> 7        29    NA
>> 8        63    NA
>> 9        46    28
>> 10       24    NA
>> 11       22    NA
>> 12       49    16
>> 13       42    19
>> 14       38    NA
>> 15       29    NA
>> 16       22    NA
>> 17       42    46
>>
>> dat2N <- fun1(dat1N,2)
>> dat2N
>>    Position Count
>> 1        61    37
>> 2        62    18
>> 3        14    NA
>> 4        29    NA
>> 5        63    NA
>> 6        49    44
>> 7        42    19
>> 8        38    NA
>> 9        29    NA
>> 10       22    NA
>> 11       42    46
>> dat3N <- fun1(dat2N,3)
>> dat3N
>>   Position Count
>> 1       61    37
>> 2       62    62
>> 3       42    65
>>
>> A.K.
>>
>>
>>
>>
>>
>>
>>
>>
>>
>> Hi all, I have a dataset with 2 important columns, "Position" and
>> "Count". There are a total of 34,532 rows, but only 457 non-NA values in
>> the "Count" column (every cell in "Position" column has a value). I
>> need to write a loop to march down the rows, and if there are 2 rows in
>> "Count" where there is only 1 NA row between them, sum the two values up
>> and print only one row with the summed Count value and the Position
>> value that corresponds to the larger Count value, thus making the three
>> rows into one. For example:
>>
>> Position Count
>> 15 15
>> 22 NA
>> 38 NA
>> 49 5
>> 55 NA
>> 61 17
>>
>> would become
>>
>> Position Count
>> 15 15
>> 22 NA
>> 38 NA
>> 61 22
>>
>> After this step, I also need to write another script to march
>> down the rows and look for rows with only two NA's between non-NA rows
>> in Count. This would make the previous data become
>>
>> Position Count
>> 61 37
>>
>> Ideally I would like a loop that can be flexibly adjusted to the
>> number of NA's in between adjacent non-NA values that can be freely
>> changed. I would greatly appreciate any insight for this.
>>
>> ______________________________________________
>> R-help at r-project.org mailing list
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>>
>
> ---------------------------------------------------------------------------
> Jeff Newmiller                        The     .....       .....  Go Live...
> DCN:<jdnewmil at dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
>                                      Live:   OO#.. Dead: OO#..  Playing
> Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
> /Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k
> ---------------------------------------------------------------------------

---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<jdnewmil at dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...