# [R] Find "undirected" duplicates in a tibble

Greg Minshall m|n@h@|| @end|ng |rom um|ch@edu
Sat Aug 21 12:06:53 CEST 2021

```Bert,

> Turns out that there's an even faster alternative.

hah, but i'm *not* surprised.  thanks for sharing the (current)

and, thanks, again, to Kimmo for posting such a productive question!

cheers, Greg

----
my apply
user  system elapsed
8.465   0.103   8.578
Bert's !duplicated
user  system elapsed
2.397   0.000   2.399
Bert's x[,2]>x[,1]
user  system elapsed
1.068   0.000   1.069
Bert's table()-based   user  system elapsed
0.235   0.000   0.235
my a.d.f(unique(cbind(do.call)))
user  system elapsed
4.470   0.000   4.475
Eric Berger's unique(...pmin...pmax)
user  system elapsed
0.820   0.017   0.837
Eric Berger's transmuting tibble...
user  system elapsed
0.936   0.000   0.938
Kimmo Elo's [OP] mutating paste
user  system elapsed
50.769   0.000  50.821
user  system elapsed
42.001   0.053  42.112
----

----
n <- 1000
x <- expand.grid(Source = 1:n, Target = 1:n)

cat("my apply\n")
system.time({
y <- apply(x, 1, function(y) return (c(A=min(y), B=max(y))))
unique(t(y))})
#   user  system elapsed
#  5.075   0.034   5.109

cat("Bert's !duplicated\n")
system.time({
x[!duplicated(cbind(do.call(pmin, x), do.call(pmax, x))), ]
})
#   user  system elapsed
#  1.340   0.013   1.353

# Still more efficient and still returning a data frame is:
cat("Bert's x[,2]>x[,1]\n")
system.time({
w <- x[, 2] > x[,1]
x[w, ] <- x[w, 2:1]
unique(x)})
#   user  system elapsed
#  0.693   0.011   0.703

cat("Bert's table()-based")
f <- function(x){
w <- x[,2] > x[,1]
x[w, ] <- x[w, 2:1]
x\$counts <- as.vector(table(x)) ## drop the dim
x[x\$counts>0, ]
}
system.time({
f(x)
})

cat("my a.d.f(unique(cbind(do.call)))\n")
system.time({
as.data.frame(unique(cbind(A=do.call(pmin,x), B=do.call(pmax,x))))
})

cat("Eric Berger's unique(...pmin...pmax)\n")
system.time({
unique(data.frame(V1=pmin(x\$Source,x\$Target), V2=pmax(x\$Source,x\$Target)))
})

cat("Eric Berger's transmuting tibble...\n")
require(dplyr)
xt<-tibble(x)
system.time({
xt %>% transmute( a=pmin(Source,Target), b=pmax(Source,Target)) %>%
unique() %>% rename(Source=a, Target=b)
})

cat("Kimmo Elo's [OP] mutating paste\n")
system.time({
xt %>%
mutate(pair=mapply(function(x,y)
paste0(sort(c(x,y)),collapse="-"), Source, Target)) %>%
distinct(pair,
.keep_all = T) %>%
mutate(Source=sapply(pair, function(x)
unlist(strsplit(x, split="-"))[1]), Target=sapply(pair, function(x)
unlist(strsplit(x, split="-"))[2])) %>%
select(-pair)
})