[R] How to identify outliers with values five times the 99th percentile

Kuma Raj pollaroid at gmail.com
Tue Sep 9 11:21:40 CEST 2014


I have a data frame containing some extreme values. I would like to
identify these values and then repeat an analysis without them. How can
I identify, in several columns, the values that are more than 5 times
the 99th percentile?

Sample data is pasted below.

> dput(df)

structure(list(ad1 = c(98, 6.9, 8.1, 56, 3.9, 6.9, 6.9, 5.8,

7.2, 20.5, 9.4, 7.6, 5.3, 7.9, 62.2, 9.2, 11.9, 8.8, 23.1, 5.4,

9.4, 56, 8.6, 20.7, 21, 10.5, 5.5, 4.3, 15.8, 6.8, 10.4, 5.1),

    ad2 = c(14.9, 19.7, 1, 17.7, 14.9, 13.6, 18.8, 20.9, 46,

    16.5, 11.7, 1, 9.2, 23.6, 19.7, 1, 11.4, 11, 23.1, 1, 1,

    8.9, 11.3, 6.4, 15.2, 1, 17.3, 10.1, 13.3, 21.3, 12.3, 15.4

    ), ad3 = c(0.91, 0.95, 10.7, 4.4, 0.43, 0.8, 3.1, 1.9, 2.3,

    5.6, 3.9, 7.3, 0.37, 4.1, 15.1, 21.8, 3, 0.79, 1, 4.6, 0.61,

    0.46, 0.87, 23.5, 3.8, 3.1, 0.33, 1.9, 3.2, 1.7, 0.53, 62.5

    ), ad4 = c(225.5, 269.7, 326, 485.4, 193.2, 274.1, 553.2,

    166.8, 435.9, 433.2, 187.1, 660.4, 235.4, 356.5, 378.8, 500.5,

    323.5, 327.1, 289.5, 301.2, 291.7, 333.5, 351.7, 384.1, 347,

    1354, 440.4, 189.2, 381, 252.7, 391.1, 255.1), ad5 = c(337.9,

    355.6, 419.5, 798.5, 225, 355.9, 394.4, 340.6, 463.9, 291.9,

    312.3, 491, 290.5, 231.9, 358, 386.4, 306.7, 440.6, 297.9,

    339.3, 341.1, 366.2, 325.4, 357, 412.2, 370.2, 421.3, 346.3,

    289.1, 257.4, 368, 322.6), ad6 = c(64.5, 130.6, 76, 167.8,

    47.3, 117, 60.7, 91.9, 221.9, 91.1, 105.1, 110.8, 64.5, 184.5,

    191.6, 259.4, 879.5, 142.1, 55.3, 123.1, 62.2, 75.2, 154.6,

    100.7, 93.1, 136.7, 74.3, 41.8, 110.1, 109.1, 172.5, 87.7

    ), ad7 = c(128L, 987L, 158L, 124L, 137L, 215L, 141L, 98L,

    291L, 261L, 106L, 137L, 141L, 159L, 221L, 108L, 123L, 107L,

    137L, 175L, 257L, 97L, 168L, 145L, 147L, 188L, 145L, 128L,

    153L, 187L, 123L, 354L), ad8 = c(3.26, 3.98, 2.88, 2.85,

    4.17, 3.16, 3.09, 4.35, 3.46, 3.81, 3.78, 3.81, 4.17, 4.27,

    4.27, 2.97, 3.43, 3.48, 3.78, 3.86, 3.11, 3.12, 3.16, 4.24,

    3.81, 3.11, 5.31, 3.75, 3.78, 3.55, 4.08, 3.5), ad9 = c(433L,

    211L, 66L, 173L, 224L, 466L, 224L, 273L, 94L, 321L, 160L,

    107L, 121L, 186L, 455L, 80L, 897L, 186L, 285L, 134L, 107L,

    355L, 261L, 249L, 332L, 107L, 273L, 107L, 160L, 535L, 160L,

    121L)), .Names = c("ad1", "ad2", "ad3", "ad4", "ad5", "ad6",

"ad7", "ad8", "ad9"), class = "data.frame", row.names = c(NA,

-32L))



More information about the R-help mailing list