[R] Find "undirected" duplicates in a tibble

Greg Minshall m|n@h@|| @end|ng |rom um|ch@edu
Sat Aug 21 12:06:53 CEST 2021


Bert,

> Turns out that there's an even faster alternative.

hah, but i'm *not* surprised.  thanks for sharing the (current)
low-price leader!

and, thanks, again, to Kimmo for posting such a productive question!

cheers, Greg

----
my apply
   user  system elapsed 
  8.465   0.103   8.578 
Bert's !duplicated
   user  system elapsed 
  2.397   0.000   2.399 
Bert's x[,2]>x[,1]
   user  system elapsed 
  1.068   0.000   1.069 
Bert's table()-based   user  system elapsed 
  0.235   0.000   0.235 
my a.d.f(unique(cbind(do.call)))
   user  system elapsed 
  4.470   0.000   4.475 
Eric Berger's unique(...pmin...pmax)
   user  system elapsed 
  0.820   0.017   0.837 
Eric Berger's transmuting tibble...
   user  system elapsed 
  0.936   0.000   0.938 
Kimmo Elo's [OP] mutating paste
   user  system elapsed 
 50.769   0.000  50.821 
Rui Barradas' sort-based
   user  system elapsed 
 42.001   0.053  42.112 
----

----
n <- 1000
x <- expand.grid(Source = 1:n, Target = 1:n)

cat("my apply\n")
system.time({
 y <- apply(x, 1, function(y) return (c(A=min(y), B=max(y))))
 unique(t(y))})
#   user  system elapsed
#  5.075   0.034   5.109

cat("Bert's !duplicated\n")
system.time({
 x[!duplicated(cbind(do.call(pmin, x), do.call(pmax, x))), ]
 })
#   user  system elapsed
#  1.340   0.013   1.353

# Still more efficient and still returning a data frame is:
cat("Bert's x[,2]>x[,1]\n")
system.time({
 w <- x[, 2] > x[,1]
 x[w, ] <- x[w, 2:1]
 unique(x)})
#   user  system elapsed
#  0.693   0.011   0.703

cat("Bert's table()-based")
f <- function(x){
   w <- x[,2] > x[,1]
   x[w, ] <- x[w, 2:1]
   x$counts <- as.vector(table(x)) ## drop the dim
   x[x$counts>0, ]
}
system.time({
  f(x)
})

cat("my a.d.f(unique(cbind(do.call)))\n")
system.time({
  as.data.frame(unique(cbind(A=do.call(pmin,x), B=do.call(pmax,x))))
})

cat("Eric Berger's unique(...pmin...pmax)\n")
system.time({
  unique(data.frame(V1=pmin(x$Source,x$Target), V2=pmax(x$Source,x$Target)))
})

cat("Eric Berger's transmuting tibble...\n")
require(dplyr)
xt<-tibble(x)
system.time({
  xt %>% transmute( a=pmin(Source,Target), b=pmax(Source,Target)) %>%
    unique() %>% rename(Source=a, Target=b)
})

cat("Kimmo Elo's [OP] mutating paste\n")
system.time({
  xt %>%
    mutate(pair=mapply(function(x,y)
      paste0(sort(c(x,y)),collapse="-"), Source, Target)) %>%
    distinct(pair,
             .keep_all = T) %>%
    mutate(Source=sapply(pair, function(x)
      unlist(strsplit(x, split="-"))[1]), Target=sapply(pair, function(x)
        unlist(strsplit(x, split="-"))[2])) %>%
    select(-pair)
})

cat("Rui Barradas' sort-based\n")
system.time({
  apply(x, 1, sort) |> t() |> unique()
})



More information about the R-help mailing list