[R] uniq -c

cberry at tajo.ucsd.edu cberry at tajo.ucsd.edu
Tue Oct 16 17:46:27 CEST 2012

Sam Steingold <sds at gnu.org> writes:

> I need an analogue of "uniq -c" for a data frame.

The count.rows() function is the R analogue.



No need to install the package - just copy and paste the function into an
R session.

On cases I've tried that are big enough to matter, it is a good deal
faster than the table( do.call( paste, x )) idiom.



> xtabs(), although dog slow, would have fit the bill nicely:
> --8<---------------cut here---------------start------------->8---
>> x <- data.frame(a=1:32,b=1:32,c=1:32,d=1:32,e=1:32)
>> system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 ))
>    user  system elapsed 
>  12.788   4.288  17.224
> --8<---------------cut here---------------end--------------->8---
> but, alas, it fails on larger data:
> system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 ))
> Error in table(a = 1:32, b = 1:32, c = 1:32, d = 1:32, e = 1:32, f = 1:32,  : 
>   attempt to make a table with >= 2^31 elements
> (apparently, because the product of the numbers of all the possible
> values of all the columns is too large).
> rle() seems to be what I really need, but I cannot figure out what it
> returns for a simple example:
> --8<---------------cut here---------------start------------->8---
>> x <- data.frame(a=1:32,b=1:32,c=1:32,d=1:32,e=1:32,f=1:32,g=1:32,h=1:32)
>> rle(x)
> Run Length Encoding
>   lengths: int 8
>   values :'data.frame':	32 obs. of  1 variable:
>  $ h: int  1 2 3 4 5 6 7 8 9 10 ...
> --8<---------------cut here---------------end--------------->8---
> (where are all the other columns?)
> and it fails on my actual data (3 columns of factors):
> Error in Ops.factor(left, right) : level sets of factors are different
> when I replace factors with strings, I get:
> Error in `[.data.frame`(x, i) : undefined columns selected
> dput:
> --8<---------------cut here---------------start------------->8---
> structure(list(user = c("45ff768774777593", "45ff768774777593", 
> "45ff768774777593", "45ff768774777593", "45ff768774777593", "4bbf9e94cbceb70c", 
> "4bbf9e94cbceb70c", "4fbbf2c67e0fb867", "4fbbf2c67e0fb867", "5038d46739f9f516", 
> "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", 
> "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4fe9e0496ecfc55e", 
> "4fe9e0496ecfc55e", "4fe9e0496ecfc55e", "506b92707a3aa65f", "502c0a9ba9ce5019", 
> "502c0a9ba9ce5019", "502c0a9ba9ce5019", "502c0a9ba9ce5019", "501b52fe24d88162", 
> "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", 
> "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", 
> "4fd4852ed504b160", "4fd4852ed504b160", "4e717c219268b736", "4e717c219268b736", 
> "4e717c219268b736", "506bb429eeab2af4", "506bb429eeab2af4", "506bb429eeab2af4", 
> "4f6f91cb83e1a7ef", "506bb8b62bde3c48", "506bb8b62bde3c48", "506bb8b62bde3c48", 
> "506bb8b62bde3c48", "4edff2aeb4df7613", "4edff2aeb4df7613", "506bba652fa6bf78", 
> "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", 
> "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", 
> "506a4941b50ca422", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", 
> "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", 
> "5036993a16b323d1", "506bb525ffce3add", "506bb6cf52819b5f", "506bb6cf52819b5f", 
> "4fe02e08a6a2ce64", "4fe02e08a6a2ce64", "5056d247bc7dae0b", "5056d247bc7dae0b", 
> "5056d247bc7dae0b", "506abfecf61bf4e6", "4ec00aa243745ee5", "4ec00aa243745ee5", 
> "4ec00aa243745ee5", "4ec00aa243745ee5", "506c69d7fb598afa", "5065a0c59c59c111", 
> "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
> "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
> "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
> "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", 
> "5065a0c59c59c111", "5065a0c59c59c111"), country = c("AR", "AR", 
> "AR", "AR", "AR", "BG", "BG", "SK", "SK", "US", "VE", "VE", "VE", 
> "VE", "VE", "VE", "VE", "TH", "TH", "TH", "MY", "US", "US", "US", 
> "US", "MX", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", 
> "JP", "US", "US", "US", "US", "US", "US", "NP", "US", "US", "US", 
> "US", "MX", "MX", "US", "AR", "AR", "AR", "AR", "AR", "AR", "AR", 
> "AR", "AR", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "US", 
> "MM", "MM", "US", "US", "IN", "IN", "IN", "IN", "CA", "CA", "CA", 
> "CA", "US", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", 
> "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE"), 
>     language = c("es", "es", "es", "es", "es", "bg", "bg", "sk", 
>     "sk", "en", "es", "es", "es", "es", "es", "es", "es", "th", 
>     "th", "th", "en", "en", "en", "en", "en", "es", "en", "en", 
>     "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", 
>     "en", "en", "en", "en", "en", "en", "en", "en", "en", "es", 
>     "es", "en", "es", "es", "es", "es", "es", "es", "es", "es", 
>     "es", "es", "es", "es", "es", "es", "es", "es", "es", "en", 
>     "en", "en", "en", "en", "en", "en", "en", "en", "fr", "fr", 
>     "fr", "fr", "en", "de", "de", "de", "de", "de", "de", "de", 
>     "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", 
>     "de", "de")), .Names = c("user", "country", "language"), row.names = c(1L, 
> 2L, 3229330L, 3229331L, 3229332L, 3L, 9504492L, 4L, 9504493L, 
> 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 4989444L, 
> 4989445L, 4989446L, 18L, 19L, 20L, 21L, 22L, 95114L, 95115L, 
> 95116L, 95117L, 9504509L, 9599604L, 23L, 3319263L, 3319264L, 
> 24L, 25L, 9504513L, 26L, 27L, 28L, 7548719L, 7548720L, 29L, 30L, 
> 31L, 32L, 2498862L, 2498863L, 2498864L, 2498865L, 4560918L, 4560919L, 
> 5938642L, 14065408L, 33L, 4348151L, 4348152L, 5627634L, 5627635L, 
> 9504522L, 13852641L, 15132124L, 34L, 35L, 6400711L, 36L, 2173763L, 
> 37L, 38L, 39L, 40L, 41L, 10210641L, 10672811L, 16158334L, 42L, 
> 43L, 44L, 45L, 1974646L, 4032952L, 4032953L, 4032954L, 4032955L, 
> 4032956L, 4032957L, 4032958L, 4475376L, 4475377L, 4475378L, 4475379L, 
> 5500564L, 7871329L, 7871330L, 8670694L), class = "data.frame")
> --8<---------------cut here---------------end--------------->8---
> thanks!

More information about the R-help mailing list