[R] Filtering using multiple rows in dplyr

Thu May 31 19:55:02 CEST 2018

Yes this can be done using dplyr. One of the main advantages of doing so 
is that it is possible to develop and modify code to handle fairly 
complicated requirements easily, but it may not always be best from a 
performance or memory usage perspective. The example below walks you 
through some instructive examples using both base R and dplyr and shows 
that base R seems faster for this particular example.

For reference: your failure to send your request in plain text lead to 
some minor corruption when passed through the mailing list. (See the 
included text below.) I have seen cases where what was received was 
essentially unreadable, so please figure out your email program's plain 
text setting for next time. The mailing list posting guide warns you about
this and other pitfalls, though due to the wide variety of operating 
systems and email programs the details of how to circumvent these pitfalls 
are mostly left to the user to track down.

#####################################################
dta0 <- read.table( text=
"   subject ageGrp ear hearingGrp sex freq L2       Ldp     Phidp        NF       SNR
1 HALAF032      A   L          A   F    2  0 -23.54459  55.56005 -43.08282 19.538232
2 HALAF032      A   L          A   F    2  2 -32.64881  86.22040 -23.31558 -9.333224
3 HALAF032      A   L          A   F    2  4 -18.91058  42.12168 -35.60250 16.691919
4 HALAF032      A   L          A   F    2  6 -23.85937 297.94499 -20.70452 -3.154846
5 HALAF032      A   L          A   F    2  8 -14.45381 181.75329 -24.17094  9.717128
6 HALAF032      A   L          A   F    2 10 -20.42384  67.12998 -35.77357 15.349728
", header=TRUE )

set.seed( 21 )
dta <- expand.grid( subject = sprintf( "HALAF%s", 30:35 )
                   , freq = 2:4
                   , L2 = seq( 0, 10, by=2 )
                   )
dta$Ldp <- rnorm( nrow( dta ), -20, 5 )
dta$SNR <- rnorm( nrow( dta ), 5, 10 )

dta1 <- dta[ order( dta$subject, dta$freq, dta$L2 ), ]

# verify whether subset of data meets criteria
# assumes DF has L2 in sorted order (and only one subject and freq)
testSNR <- function( SNR, SNRlimit, limitCount ) {
   qual <- SNR > SNRlimit
   consec <- cumsum( qual )
   any( limitCount < consec )
}

dta1[ 1:6, "SNR" ]
#> [1] 15.720845 -3.846874 -1.650308  3.353974  5.533430  8.709555
testSNR( dta1[ 1:6, "SNR" ], SNRlimit = 3, limitCount = 3 )
#> [1] TRUE

# one row output per group
aggregate( list( isclean = dta1$SNR )
          , dta1[ , c( "subject", "freq" ) ]
          , FUN=testSNR
          , SNRlimit=3
          , limitCount=3
          )
#>    subject freq isclean
#> 1  HALAF30    2    TRUE
#> 2  HALAF31    2   FALSE
#> 3  HALAF32    2   FALSE
#> 4  HALAF33    2   FALSE
#> 5  HALAF34    2   FALSE
#> 6  HALAF35    2    TRUE
#> 7  HALAF30    3   FALSE
#> 8  HALAF31    3    TRUE
#> 9  HALAF32    3   FALSE
#> 10 HALAF33    3    TRUE
#> 11 HALAF34    3   FALSE
#> 12 HALAF35    3    TRUE
#> 13 HALAF30    4   FALSE
#> 14 HALAF31    4   FALSE
#> 15 HALAF32    4    TRUE
#> 16 HALAF33    4   FALSE
#> 17 HALAF34    4    TRUE
#> 18 HALAF35    4    TRUE

# results are repeated to the length of their group
# result of ave function is always numeric
ave( dta1$SNR
    , dta1$subject
    , dta1$freq
    , FUN=function(x) { testSNR( x, 3, 3 ) }
    )
#>   [1] 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0
#>  [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0
#>  [71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#> [106] 1 1 1

# this is the calculation you asked for
dta1$clean <- ifelse( ave( dta1$SNR
                          , dta1$subject
                          , dta1$freq
                          , FUN=function(x) { testSNR( x, 3, 3 ) }
                          )
                     , "Y"
                     , "N"
                     )
dta2 <- dta1[ "Y" == dta1$clean, ]

# but yes, all this can also be accomplished using dplyr operations
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#>     filter, lag
#> The following objects are masked from 'package:base':
#>
#>     intersect, setdiff, setequal, union

# create sample data set using dplyr
set.seed( 21 )
dtaT <- (   expand.grid( subject = sprintf( "HALAF%s", 30:35 )
                        , freq = 2:4
                        , L2 = seq( 0, 10, by=2 )
                        )
         %>% mutate( Ldp = rnorm( n(), -20, 5 )
                   , SNR = rnorm( n(), 5, 10 )
                   )
         )
# sort sample dataset using dplyr
dta1T <- (   dtaT
          %>% arrange( subject, freq, L2 )
          %>% group_by( subject, freq )
          %>% mutate( clean = ifelse( testSNR( SNR
                                             , SNRlimit = 3
                                             , limitCount = 3
                                             )
                                    , "Y"
                                    , "N"
                                    )
                    )
          %>% ungroup
          )
# per your request
dta2T <- (   dta1T
          %>% filter( "Y" == clean )
          )

# or avoid unnecessary comparisons
dta1T2 <- (   dtaT
           %>% arrange( subject, freq, L2 )
           %>% group_by( subject, freq )
           %>% mutate( isclean = testSNR( SNR, SNRlimit = 3, limitCount = 3 ) )
           %>% ungroup
           )
# per your request
dta2T2 <- (   dta1T2
           %>% filter( isclean )
           )

# confirm that results are equivalent
all.equal( dta2$Ldp, dta2T$Ldp )
#> [1] TRUE
all.equal( dta2$Ldp, dta2T2$Ldp )
#> [1] TRUE

library(microbenchmark)
fn_base <- function( dta1 ) {
   dta1$clean <- ifelse( ave( dta1$SNR
                            , dta1$subject
                            , dta1$freq
                            , FUN=function(x) { testSNR( x, 3, 3 ) }
                            )
                       , "Y"
                       , "N"
                       )
   dta1[ "Y" == dta1$clean, ]
}
fn_dplyr1 <- function( dta1 ) {
   (   dta1
   %>% group_by( subject, freq )
   %>% mutate( clean = ifelse( testSNR( SNR, SNRlimit = 3, limitCount = 3 )
                             , "Y"
                             , "N"
                             )
             )
   %>% ungroup
   %>% filter( "Y" == clean )
   )
}
fn_dplyr2 <- function( dta1 ) {
   (   dta1
   %>% group_by( subject, freq )
   %>% mutate( isclean = testSNR( SNR, SNRlimit = 3, limitCount = 3 ) )
   %>% ungroup
   %>% filter( isclean )
   )
}

microbenchmark( fn_base( dta1 ), fn_dplyr1( dta1 ), fn_dplyr2( dta1 ) )
#> Unit: microseconds
#>             expr      min       lq      mean   median       uq      max
#>    fn_base(dta1)  756.604  797.154  932.1924  943.983  960.054 3928.349
#>  fn_dplyr1(dta1) 4106.170 4180.185 4386.9290 4212.907 4253.640 7855.908
#>  fn_dplyr2(dta1) 4148.434 4221.252 4421.7373 4249.389 4288.841 7519.803
#>  neval
#>    100
#>    100
#>    100

#' <details><summary>Session info</summary>

devtools::session_info()
#> Session info 
-------------------------------------------------------------
#>  setting  value
#>  version  R version 3.4.4 (2018-03-15)
#>  system   x86_64, linux-gnu
#>  ui       X11
#>  language en_US:en
#>  collate  en_US.UTF-8
#>  tz       America/Los_Angeles
#>  date     2018-05-31
#> Packages 
-----------------------------------------------------------------
#>  package        * version date       source
#>  assertthat       0.2.0   2017-04-11 CRAN (R 3.4.0)
#>  backports        1.1.2   2017-12-13 CRAN (R 3.4.4)
#>  base           * 3.4.4   2018-03-16 local
#>  bindr            0.1.1   2018-03-13 CRAN (R 3.4.4)
#>  bindrcpp       * 0.2.2   2018-03-29 CRAN (R 3.4.4)
#>  compiler         3.4.4   2018-03-16 local
#>  datasets       * 3.4.4   2018-03-16 local
#>  devtools         1.13.5  2018-02-18 CRAN (R 3.4.4)
#>  digest           0.6.15  2018-01-28 CRAN (R 3.4.4)
#>  dplyr          * 0.7.5   2018-05-19 CRAN (R 3.4.4)
#>  evaluate         0.10.1  2017-06-24 CRAN (R 3.4.1)
#>  glue             1.2.0   2017-10-29 CRAN (R 3.4.4)
#>  graphics       * 3.4.4   2018-03-16 local
#>  grDevices      * 3.4.4   2018-03-16 local
#>  htmltools        0.3.6   2017-04-28 CRAN (R 3.4.1)
#>  knitr            1.20    2018-02-20 CRAN (R 3.4.4)
#>  magrittr         1.5     2014-11-22 CRAN (R 3.4.0)
#>  memoise          1.1.0   2017-04-21 CRAN (R 3.4.1)
#>  methods        * 3.4.4   2018-03-16 local
#>  microbenchmark * 1.4-4   2018-01-24 CRAN (R 3.4.4)
#>  pillar           1.2.3   2018-05-25 CRAN (R 3.4.4)
#>  pkgconfig        2.0.1   2017-03-21 CRAN (R 3.4.1)
#>  purrr            0.2.5   2018-05-29 CRAN (R 3.4.4)
#>  R6               2.2.2   2017-06-17 CRAN (R 3.4.1)
#>  Rcpp             0.12.17 2018-05-18 CRAN (R 3.4.4)
#>  rlang            0.2.1   2018-05-30 CRAN (R 3.4.4)
#>  rmarkdown        1.9     2018-03-01 CRAN (R 3.4.4)
#>  rprojroot        1.3-2   2018-01-03 CRAN (R 3.4.4)
#>  stats          * 3.4.4   2018-03-16 local
#>  stringi          1.2.2   2018-05-02 CRAN (R 3.4.4)
#>  stringr          1.3.1   2018-05-10 CRAN (R 3.4.4)
#>  tibble           1.4.2   2018-01-22 CRAN (R 3.4.4)
#>  tidyselect       0.2.4   2018-02-26 CRAN (R 3.4.4)
#>  tools            3.4.4   2018-03-16 local
#>  utils          * 3.4.4   2018-03-16 local
#>  withr            2.1.2   2018-03-15 CRAN (R 3.4.4)
#>  yaml             2.1.19  2018-05-01 CRAN (R 3.4.4)

#' </details>
#####################################################

On Wed, 30 May 2018, Sumitrajit Dhar wrote:

> Hi Folks,
>
> I have just started using dplyr and could use some help getting unstuck. 
> It could well be that dplyr is not the package to be using, but let me 
> just pose the question and seek your advice.
>
> Here is my basic data frame.
>
> head(h)
>   subject ageGrp ear hearingGrp sex freq L2       Ldp     Phidp        NF       SNR
> 1 HALAF032      A   L          A   F    2  0 -23.54459  55.56005 -43.08282 19.538232
> 2 HALAF032      A   L          A   F    2  2 -32.64881  86.22040 -23.31558 -9.333224
> 3 HALAF032      A   L          A   F    2  4 -18.91058  42.12168 -35.60250 16.691919
> 4 HALAF032      A   L          A   F    2  6 -23.85937 297.94499 -20.70452 -3.154846
> 5 HALAF032      A   L          A   F    2  8 -14.45381 181.75329 -24.17094  9.717128
> 6 HALAF032      A   L          A   F    2 10 -20.42384  67.12998 -35.77357 15.349728
>
> ?subject? and ?freq? together make a set of data and I am interested in 
> how the last four columns vary as a function of L2. So I grouped by 
> ?subject? and ?freq? and can look at basic summaries.
>
>
> h_byFunc <- h %>% group_by(subject, freq)
>
>> h_byFunc %>% summarize(l = mean(Ldp), s = sd(Ldp) )
>
> # A tibble: 1,175 x 4
> # Groups:   subject [?]
>   subject   freq       l     s
>   <fct>    <int>   <dbl> <dbl>
> 1 HALAF032     2 -13.8    8.39
> 2 HALAF032     4 -15.8   11.0
> 3 HALAF032     8 -23.4    6.51
> 4 HALAF033     2 -14.2    9.64
> 5 HALAF033     4 -12.3    8.92
> 6 HALAF033     8  -6.55  12.3
> 7 HALAF036     2 -14.9   12.6
> 8 HALAF036     4 -16.7   11.2
> 9 HALAF036     8 -21.7    6.56
> 10 HALAF039     2   0.242 12.4
> # ... with 1,165 more rows
>
> What I would like to do is filter some groups out based on various 
> criteria. For example, if SNR > 3 in three consecutive L2 within a 
> group, that group qualifies and I would add a column, say ?clean? and 
> assign it a value ?Y.? Is there a way to do this in dplyr or should I be 
> looking at a different way.
>
> Thanks in advance for your help.
>
> Regards,
> Sumit
>
>
>
>

---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<jdnewmil using dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
                                       Live:   OO#.. Dead: OO#..  Playing
Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
/Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k