# [R] uniq -c

Sam Steingold sds at gnu.org
Wed Oct 17 21:57:56 CEST 2012

> * Sam Steingold <fqf at tah.bet> [2012-10-16 11:03:27 -0400]:
>
> I need an analogue of "uniq -c" for a data frame.

Summary of options:

1. William:

# Flag the positions that start a run of consecutive identical values.
#
# Generic: dispatches on the class of `x`. Returns a logical vector with one
# entry per element (default method) or per row (data.frame method); TRUE
# marks the first element/row of each run.
isFirstInRun <- function(x) UseMethod("isFirstInRun")

# Default method: an element starts a run when it differs from its
# predecessor; the first element always starts a run.
# Comparison uses `!=`, so an NA neighbour yields NA rather than TRUE/FALSE.
isFirstInRun.default <- function(x) {
  n <- length(x)
  # Without this guard the result would be a lone TRUE for empty input.
  if (n == 0L) {
    return(logical(0))
  }
  c(TRUE, x[-1L] != x[-n])
}

# data.frame method: a row starts a run when ANY column starts a run there,
# i.e. the row differs from the previous row in at least one column.
# Errors (via stopifnot) when `x` has no columns.
isFirstInRun.data.frame <- function(x) {
  stopifnot(ncol(x) > 0)
  # Element-wise OR of the per-column flags.
  Reduce(`|`, lapply(x, isFirstInRun))
}
# `uniq -c` for a data frame: collapse each run of consecutive identical rows
# into its first row and append a `count` column with the run length.
#
# x: a data frame; runs are detected with isFirstInRun(x).
# Returns a data frame holding the first row of every run plus an integer
# `count` column.
row.count.1 <- function (x) {
  # An empty input has no runs; return the same columns with an empty count.
  if (nrow(x) == 0L) {
    return(data.frame(x, count = integer(0)))
  }
  i <- which(isFirstInRun(x))
  # Run length = distance from one run start to the next; nrow(x) + 1 acts as
  # the sentinel start after the last run.
  # drop = FALSE keeps a one-column data frame from collapsing to a vector
  # (which would lose the column name).
  data.frame(x[i, , drop = FALSE], count = diff(c(i, 1L + nrow(x))))
}

147 seconds

2. http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1
# `uniq -c` for a data frame: keep the first row of every run of consecutive
# identical rows and append a `counts` column with the run length.
#
# x: a data frame whose columns can be compared with `!=`.
# Returns the run-start rows of `x` with an extra numeric column `counts`.
row.count.2 <- function (x) {
  n <- nrow(x)
  # No rows means no runs: same columns plus an empty counts column.
  if (n == 0L) {
    return(cbind(x, counts = numeric(0)))
  }
  if (n > 1L) {
    # Row k (k >= 2) equals row k - 1 when no column differs.
    # drop = FALSE keeps one-column input two-dimensional for rowSums().
    equal.to.previous <- unname(
      rowSums(x[-1L, , drop = FALSE] != x[-n, , drop = FALSE]) == 0)
  } else {
    # A single row has no predecessor to compare against.
    equal.to.previous <- logical(0)
  }
  # The first row always starts a run; later rows do when they differ.
  unique.rows <- which(c(TRUE, !equal.to.previous))
  # Run length = gap between consecutive run starts, with n + 1 as sentinel.
  # Kept as double to match the type the original computation produced.
  counts <- as.numeric(diff(c(unique.rows, n + 1L)))
  cbind(x[unique.rows, , drop = FALSE], counts = counts)
}

136 seconds

3. Micael: paste/strsplit

# `uniq -c` for a three-column data frame via paste/rle/strsplit.
#
# Each row is pasted into one string, runs of identical strings are counted
# with rle(), and the strings are split back into their fields.
#
# x:   a data frame whose first three columns are user, country and language
#      (columns are used by position; output names are fixed).
# sep: separator used to join and re-split the fields; it must not occur in
#      any value, otherwise fields are split in the wrong place.
# Returns a data frame with columns user, country, language and an integer
# `count` column holding the run lengths.
row.count.3 <- function (x, sep = " ") {
  # unname() so that a column called "sep" or "collapse" is not mistaken for
  # an argument of paste().
  pa <- do.call(paste, c(unname(as.list(x)), sep = sep))
  rl <- rle(pa)
  sp <- strsplit(as.character(rl$values), sep, fixed = TRUE)
  # vapply() guarantees character vectors even when there are no runs.
  field <- function(k) vapply(sp, `[`, character(1), k)
  data.frame(user = field(1L),
             country = field(2L),
             language = field(3L),
             count = rl$lengths)
}

Here I know the columns and rely on the absence of spaces in the values.

27 seconds.