[R] finding duplicates in a data frame

Rui Barradas ruipbarradas at sapo.pt
Wed Jun 13 17:36:59 CEST 2012


Hello,

Thanks! Now we can see that your df1 has only one column; your dataset 
was messed up in some previous step. So it first needs to be transformed 
into a three-column data.frame. Then use merge().



# df1 arrives as a single factor column whose values look like
# "name ppm freq". Recover the text, split each value on the space
# separator, and stack the pieces into one row per observation.
x <- as.character(df1$new.col.ppm.p..freq.p.)
x <- do.call(rbind, strsplit(x, " ", fixed = TRUE))
x <- data.frame(x, stringsAsFactors = FALSE)
# Reuse df2's column names so the two data frames line up for merge().
names(x) <- names(df2)
# The split pieces are character; convert the two measurement columns
# to numeric so they are the same type as the corresponding df2 columns.
x[c("ppm.p.", "freq.p.")] <- lapply(x[c("ppm.p.", "freq.p.")], as.numeric)
str(x)
'data.frame':   30 obs. of  3 variables:
  $ new.col: chr  "1_3_diaminopropane" "1_3_diaminopropane" 
"1_3_diaminopropane" "1_3_diaminopropane" ...
  $ ppm.p. : num  3.14 3.14 3.14 3.13 3.12 ...
  $ freq.p.: num  5.68 6.65 8.01 9.64 298.24 ...

# If str(x) shows one character column and two numeric columns, as above,
# the two data frames can be merged on the shared measurement columns.
merge(x, df2, by = c("ppm.p.", "freq.p."))


If you want, it's also OK to rename 'x' to 'df1', with: df1 <- x

If this doesn't do it, say something.

Rui Barradas
Em 13-06-2012 13:45, Sathya Priya escreveu:
> *>dput(head(df1, 30))*
> structure(list(new.col.ppm.p..freq.p. = structure(c(22L, 21L,
> 20L, 19L, 18L, 17L, 16L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L,
> 7L, 6L, 5L, 4L, 3L, 2L, 1L, 24L, 23L, 26L, 25L, 90L, 89L, 88L,
> 87L), .Label = c("1_3_diaminopropane -0.00533455 7.32726",
> "1_3_diaminopropane 0.00021588 103.234",
> "1_3_diaminopropane 0.00655923 7.6155",
> "1_3_diaminopropane 2.04634 32.1115",
> "1_3_diaminopropane 2.0618 72.4135", "1_3_diaminopropane 2.06497 49.6692",
> "1_3_diaminopropane 2.07251 47.5652", "1_3_diaminopropane 2.07727 135.629",
> "1_3_diaminopropane 2.08242 51.5292", "1_3_diaminopropane 2.09312 79.8969",
> "1_3_diaminopropane 2.10859 37.0658", "1_3_diaminopropane 3.07912 5.6996",
> "1_3_diaminopropane 3.0807 7.47272", "1_3_diaminopropane 3.08943 308.2",
> "1_3_diaminopropane 3.09617 44.1467", "1_3_diaminopropane 3.10528 337.852",
> "1_3_diaminopropane 3.1152 44.6212", "1_3_diaminopropane 3.12075 298.243",
> "1_3_diaminopropane 3.13383 9.64184", "1_3_diaminopropane 3.13541 8.0142",
> "1_3_diaminopropane 3.137 6.65388", "1_3_diaminopropane 3.13859 5.67516",
> "1_3_dimethylurea 2.66393 39.238", "1_3_dimethylurea 4.80643 88.4026",
> "1_amino_1_phenylmethyl_phosphonic_acid 4.81412 105.11",
> "1_amino_1_phenylmethyl_phosphonic_acid 7.44687 7.1684",
> "2_3_diphospho_D_glyceric_acid 0.000156828 127.55",
> "2_3_diphospho_D_glyceric_acid 1.12848 35.0817",
> "2_3_diphospho_D_glyceric_acid 1.13681 45.9263",
> "2_3_diphospho_D_glyceric_acid 1.14513 64.8014",
> "2_3_diphospho_D_glyceric_acid 1.15306 116.661",
> "2_3_diphospho_D_glyceric_acid 1.1598 115.19",
> "2_3_diphospho_D_glyceric_acid 1.1709 102.384",
> "2_3_diphospho_D_glyceric_acid 1.17764 140.963",
> "2_3_diphospho_D_glyceric_acid 1.18478 115.727",
> "2_3_diphospho_D_glyceric_acid 1.19548 62.2725",
> "2_3_diphospho_D_glyceric_acid 1.20262 74.4761",
> "2_3_diphospho_D_glyceric_acid 1.20936 48.6332",
> "2_3_diphospho_D_glyceric_acid 1.21808 10.5804",
> "2_3_diphospho_D_glyceric_acid 1.22482 6.26529",
> "2_3_diphospho_D_glyceric_acid 1.27239 64.2389",
> "2_3_diphospho_D_glyceric_acid 1.27834 93.7896",
> "2_3_diphospho_D_glyceric_acid 1.29737 211.264",
> "2_3_diphospho_D_glyceric_acid 1.30371 298.131",
> "2_3_diphospho_D_glyceric_acid 1.32076 616.226",
> "2_3_diphospho_D_glyceric_acid 1.32671 503.942",
> "2_3_diphospho_D_glyceric_acid 1.33107 662.735",
> "2_3_diphospho_D_glyceric_acid 1.33741 593.161",
> "2_3_diphospho_D_glyceric_acid 1.35208 290.138",
> "2_3_diphospho_D_glyceric_acid 1.35724 310.132",
> "2_3_diphospho_D_glyceric_acid 1.36992 38.1639",
> "2_3_diphospho_D_glyceric_acid 1.38182 93.7426",
> "2_3_diphospho_D_glyceric_acid 1.62604 191.929",
> "2_3_diphospho_D_glyceric_acid 1.6522 179.805",
> "2_3_diphospho_D_glyceric_acid 1.77788 274.143",
> "2_3_diphospho_D_glyceric_acid 1.78462 294.731",
> "2_3_diphospho_D_glyceric_acid 1.79572 410.836",
> "2_3_diphospho_D_glyceric_acid 1.80167 361.596",
> "2_3_diphospho_D_glyceric_acid 1.82585 30.7865",
> "2_3_diphospho_D_glyceric_acid 1.95311 34.917",
> "2_3_diphospho_D_glyceric_acid 1.95787 30.7119",
> "2_3_diphospho_D_glyceric_acid 1.97849 465.916",
> "2_3_diphospho_D_glyceric_acid 1.99395 300.192",
> "2_3_diphospho_D_glyceric_acid 3.10007 55.6859",
> "2_3_diphospho_D_glyceric_acid 3.108 92.3526",
> "2_3_diphospho_D_glyceric_acid 3.12148 159.122",
> "2_3_diphospho_D_glyceric_acid 3.12981 193.855",
> "2_3_diphospho_D_glyceric_acid 3.13813 121.311",
> "2_3_diphospho_D_glyceric_acid 3.14289 114.087",
> "2_3_diphospho_D_glyceric_acid 3.15121 92.128",
> "2_3_diphospho_D_glyceric_acid 3.96911 17.9823",
> "2_3_diphospho_D_glyceric_acid 3.97823 36.9309",
> "2_3_diphospho_D_glyceric_acid 3.99052 57.8175",
> "2_3_diphospho_D_glyceric_acid 3.99964 100.939",
> "2_3_diphospho_D_glyceric_acid 4.00876 61.2461",
> "2_3_diphospho_D_glyceric_acid 4.01232 60.9695",
> "2_3_diphospho_D_glyceric_acid 4.01906 70.3493",
> "2_3_diphospho_D_glyceric_acid 4.02144 66.5612",
> "2_3_diphospho_D_glyceric_acid 4.02858 58.5852",
> "2_3_diphospho_D_glyceric_acid 4.03373 24.3897",
> "2_3_diphospho_D_glyceric_acid 4.04047 22.4084",
> "2_3_diphospho_D_glyceric_acid 4.04325 23.3606",
> "2_3_diphospho_D_glyceric_acid 4.04999 17.279",
> "2_3_diphospho_D_glyceric_acid 4.4853 22.1747",
> "2_3_diphospho_D_glyceric_acid 4.49164 44.5942",
> "2_3_diphospho_D_glyceric_acid 4.49799 25.3894",
> "2_3_diphospho_D_glyceric_acid 4.50076 35.5218",
> "2_3_diphospho_D_glyceric_acid 4.51028 44.1328",
> "2_3_diphospho_D_glyceric_acid 4.51622 21.4894",
> "2_3_diphospho_D_glyceric_acid 4.5186 18.3831",
> "2_amino_5_ethyl_1_3_4_thiadiazole 0.000156828 127.55",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.12848 35.0817",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.13681 45.9263",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.14513 64.8014",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.15306 116.661",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.1598 115.19",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.1709 102.384",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.17764 140.963",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.18478 115.727",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.19548 62.2725",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.20262 74.4761",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.20936 48.6332",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.21808 10.5804",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.22482 6.26529",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.27239 64.2389",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.27834 93.7896",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.29737 211.264",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.30371 298.131",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.32076 616.226",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.32671 503.942",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.33107 662.735",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.33741 593.161",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.35208 290.138",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.35724 310.132",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.36992 38.1639",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.38182 93.7426",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.62604 191.929",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.6522 179.805",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.77788 274.143",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.78462 294.731",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.79572 410.836",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.80167 361.596",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.82585 30.7865",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.95311 34.917",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.95787 30.7119",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.97849 465.916",
> "2_amino_5_ethyl_1_3_4_thiadiazole 1.99395 300.192",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.10007 55.6859",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.108 92.3526",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.12148 159.122",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.12981 193.855",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.13813 121.311",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.14289 114.087",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.15121 92.128",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.96911 17.9823",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.97823 36.9309",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.99052 57.8175",
> "2_amino_5_ethyl_1_3_4_thiadiazole 3.99964 100.939",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.00876 61.2461",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.01232 60.9695",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.01906 70.3493",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.02144 66.5612",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.02858 58.5852",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.03373 24.3897",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.04047 22.4084",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.04325 23.3606",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.04999 17.279",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.4853 22.1747",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.49164 44.5942",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.49799 25.3894",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.50076 35.5218",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.51028 44.1328",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.51622 21.4894",
> "2_amino_5_ethyl_1_3_4_thiadiazole 4.5186 18.3831"
> ), class = "factor")), .Names = "new.col.ppm.p..freq.p.", row.names = c(NA,
> 30L), class = "data.frame")
>>   dput(head(df2, 30))
> structure(list(new.col = structure(c(1L, 1L), .Label = "unknown", class =
> "factor"),
>      ppm.p. = c(7.44687, 4.81412), freq.p. = c(7.1684, 105.11)), .Names =
> c("new.col",
> "ppm.p.", "freq.p."), row.names = 1:2, class = "data.frame")
>



More information about the R-help mailing list