[R] Using subset to filter data table

Santosh Srinivas santosh.srinivas at gmail.com
Wed Jan 19 09:27:14 CET 2011


tcc.new2 <- droplevels(tcc.new)

-----Original Message-----
From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On
Behalf Of Ben Harrison
Sent: 19 January 2011 12:24
To: r-help at r-project.org
Subject: [R] Using subset to filter data table

I am having difficulty understanding how I would constrain a data set by
filtering out 'records' based on certain criteria.
Using SQL I could query using 'select * from my.data where LithClass in
('sand', 'clay')' or some such.

Using subset, there seem to be ghosts left behind (that is, all of the
LithClass *.Labels* remain after subset)

> dput(tcc)

structure(list(Density = c(1.84, 1.91, 2, NA, 1.95, 2.04, 2.11,
1.85, 1.97, 2, 2.27, NA, NA, 2.18, NA, NA, 1.9, 2.68, 2.69, NA,

-- snip --

1.45, 1.49, 2.11), LithClass = structure(c(6L, 6L, 5L, 6L, 7L,
7L, 6L, 6L, 5L, 5L, 5L, 2L, 3L, 2L, 3L, 7L, 7L, 4L, 4L, 2L, 7L,
3L, 2L, 2L, 3L, 7L, 2L, 2L, 3L, 2L, 6L, 5L, 5L, 5L, 6L, 6L, 7L,
2L, 3L, 7L, 7L, 7L, 7L, 2L, 7L, 7L, 2L, 7L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 7L, 3L, 2L, 2L, 3L, 2L, 7L, 3L, 5L, 6L, 6L, 6L, 3L, 6L,
3L, 7L, 2L, 3L, 3L, 3L, 7L, 3L, 2L, 2L, 7L, 5L, 5L, 5L, 5L, 5L,
5L, 3L, 2L, 3L, 3L, 3L, 7L, 7L, 2L, 6L, 6L, 6L, 7L, 7L, 7L, 1L,
2L, 2L, 7L, 7L, 2L, 7L, 7L, 7L, 3L, 3L, 2L, 5L, 5L), .Label = c("basalt",
"clay", "coal", "dolomite", "limestone", "marl", "sand"), class =
"factor")), .Names = c("Density",
"Depth", "Porosity", "Conductivity", "LithClass"), row.names = c(NA,
114L), class = "data.frame")

> tcc.new = subset(tcc, LithClass %in% c('clay','sand','marl'))

>dput(tcc.new)
structure(list(Density = c(1.84, 1.91, NA, 1.95, 2.04, 2.11,

-- snip--

    1.63, 2.84, 1.93, 1.67, 2.22, 2.42, 1.84, 2.24, 2.38, 2.33,
    1.45), LithClass = structure(c(6L, 6L, 6L, 7L, 7L, 6L, 6L,
    2L, 2L, 7L, 7L, 2L, 7L, 2L, 2L, 7L, 2L, 2L, 2L, 6L, 6L, 6L,
    7L, 2L, 7L, 7L, 7L, 7L, 2L, 7L, 7L, 2L, 7L, 6L, 7L, 2L, 2L,
    2L, 7L, 6L, 6L, 6L, 6L, 7L, 2L, 7L, 2L, 2L, 7L, 2L, 7L, 7L,
    2L, 6L, 6L, 6L, 7L, 7L, 7L, 2L, 2L, 7L, 7L, 2L, 7L, 7L, 7L,
    2L), .Label = c("basalt", "clay", "coal", "dolomite", "limestone",
    "marl", "sand"), class = "factor")), .Names = c("Density",
"Depth", "Porosity", "Conductivity", "LithClass"), row.names = c(1L,
2L, 4L, 5L, 6L, 7L, 8L, 12L, 14L, 16L, 17L, 20L, 21L, 23L, 24L,
26L, 27L, 28L, 30L, 31L, 35L, 36L, 37L, 38L, 40L, 41L, 42L, 43L,
44L, 45L, 46L, 47L, 48L, 55L, 56L, 58L, 59L, 61L, 62L, 65L, 66L,
67L, 69L, 71L, 72L, 76L, 78L, 79L, 80L, 88L, 92L, 93L, 94L, 95L,
96L, 97L, 98L, 99L, 100L, 102L, 103L, 104L, 105L, 106L, 107L,
108L, 109L, 112L), class = "data.frame")

The data (in the "structure") is removed, but the labels for all LithClass
remain.
If I then produce a boxplot or some other graphic where LithClass is
involved, blank plots are produced for the deleted LithClass labels, which
is not what I want.
i.e.:
> with(tcc.new, boxplot(Conductivity~LithClass))

Here is the full data:
1> dput(tcc)
structure(list(Density = c(1.84, 1.91, 2, NA, 1.95, 2.04, 2.11,
1.85, 1.97, 2, 2.27, NA, NA, 2.18, NA, NA, 1.9, 2.68, 2.69, NA,
2.41, 1.16, 1.92, 1.96, 0.87, NA, NA, NA, 1.4, NA, 1.83, 1.93,
2.14, 2.18, NA, 2.12, 2.05, NA, 1.53, 2.23, 2.2, NA, 2.42, 2.54,
2.48, 2.58, 1.91, 2.09, 1.85, 2.48, 2.34, 2.19, 2.23, 2.09, 2.57,
2.12, 1.88, 2.21, 2.06, 1.21, 2.08, 1.84, 1.71, 2.11, 1.9, 1.84,
1.97, 1.57, NA, NA, 1.92, 2.29, NA, 1.17, NA, NA, 1.77, 2.03,
2.27, 2.22, 1.84, 2.09, 1.94, 2.3, 2.59, NA, 1.52, 2.15, 1.19,
NA, 1.14, 1.99, 1.94, 2.01, 2.02, 1.91, 2.45, 2.09, NA, NA, 2.14,
2.4, 2.47, 2.48, 2.49, 2.51, 2.49, 2.51, 2.5, 1.18, 1.23, 1.94,
2.04, 2.39), Depth = c(151, 198, 284, 480, 33.4, 75, 142, 186,
258, 420.6, 679.2, 884.1, 916, 943, 972, 998, 1064.4, 1076.7,
1125.6, 1176.5, 1225.6, 255.8, 272, 205.6, 319.3, 352.6, 666.1,
667.6, 918, 105, 156.5, 213, 305.1, 498, 591.75, 726, 765.4,
793.4, 829.1, 861, 896, 929, 957.9, 989.7, 1017, 1018, 96, 229,
355, 376, 456, 496, 515, 537, 557, 627, 512, 551, 615, 381.8,
585.3, 587.2, 591.25, 114, 302.2, 496.4, 597, 596, 616.7, 834.3,
980.3, 1064.6, 251.9, 253.75, 284.8, 318.3, 526.5, 627.5, 768,
770.3, 229.58, 323.8, 376.3, 470.85, 611.7, 797.38, 963.6, 1047.25,
334, 404.7, 517, 459, 338, 545.5, 189, 378, 515, 593, 670, 736,
931, 1003, 1325, 1391, 1457, 1523, 1590, 1664, 1769, 124.3, 176.5,
201.5, 502.1, 602.6), Porosity = c(50.22, 44.26, 39.31, NA, 40.38,
39.65, 35.3, 40.48, 40.13, 34.8, 18.52, NA, NA, 32.1, NA, NA,
39.79, 1.62, 1.71, NA, 14.55, 36.87, 39.08, 37.67, 16.9, NA,
NA, NA, 30.5, NA, 46.52, 43.18, 33.33, 31.35, NA, 26.74, 26.4,
NA, 36.19, 26.88, 27.2, NA, 11.9, 7.65, 9.25, 5.74, 40.71, 27.98,
47.6, 14.03, 20.62, 30.27, 27.51, 33.83, 9.16, 32.2, 26.71, 24.99,
20.1, 47.76, 28.54, 33.54, 23.74, 29.04, 45.38, 37.09, 43.14,
30.46, NA, NA, 30.7, 22.51, NA, 42.94, NA, NA, 29.39, 24.72,
22.36, 25.39, 45.26, 36.23, 41.38, 22, 7.6, NA, 24.09, 30.53,
36.18, NA, 25.19, 37.93, 42.95, 37.84, 47.21, 41.69, 13.64, 34.81,
NA, NA, 38.39, 12.31, 9.46, 12.09, 10.3, 11.02, 10.36, 9.27,
10.17, 29.38, 35.67, 42.38, 37.54, 15.44), Conductivity = c(1.28,
1.38, 1.47, 1.13, 0.98, 1.83, 2.2, 1.92, 2.04, 1.44, 2.08, 1.06,
1.19, 1.6, 0.36, 2.81, 2.14, 4.2, 4.41, 1.64, 1.93, 0.56, 1.71,
1.52, 0.22, 2.38, 2.06, 1.95, 0.52, 1.94, 2.42, 2.29, 1.62, 1.58,
1.01, 1.43, 2.41, 2.14, 0.59, 2.94, 2.6, 1.58, 1.52, 1.94, 1.64,
1.77, 1.49, 2.5, 1.16, 1.87, 1.81, 1.58, 1.62, 1.74, 2.14, 2.55,
1.08, 2.27, 1.28, 0.43, 1.79, 1.26, 0.8, 2.44, 1.39, 1.16, 1.17,
0.52, 2.05, 0.4, 2.7, 2.06, 0.64, 0.39, 0.4, 2.61, 1.54, 1.71,
2.3, 2.84, 1.32, 1.5, 1.43, 1.84, 2.3, 1.69, 0.45, 1.47, 0.36,
0.65, 0.91, 2.59, 1.64, 1.73, 1.97, 1.4, 2, 2.11, 1.63, 2.84,
1.56, 1.93, 1.67, 2.22, 2.42, 1.84, 2.24, 2.38, 2.33, 0.29, 0.41,
1.45, 1.49, 2.11), LithClass = structure(c(6L, 6L, 5L, 6L, 7L,
7L, 6L, 6L, 5L, 5L, 5L, 2L, 3L, 2L, 3L, 7L, 7L, 4L, 4L, 2L, 7L,
3L, 2L, 2L, 3L, 7L, 2L, 2L, 3L, 2L, 6L, 5L, 5L, 5L, 6L, 6L, 7L,
2L, 3L, 7L, 7L, 7L, 7L, 2L, 7L, 7L, 2L, 7L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 7L, 3L, 2L, 2L, 3L, 2L, 7L, 3L, 5L, 6L, 6L, 6L, 3L, 6L,
3L, 7L, 2L, 3L, 3L, 3L, 7L, 3L, 2L, 2L, 7L, 5L, 5L, 5L, 5L, 5L,
5L, 3L, 2L, 3L, 3L, 3L, 7L, 7L, 2L, 6L, 6L, 6L, 7L, 7L, 7L, 1L,
2L, 2L, 7L, 7L, 2L, 7L, 7L, 7L, 3L, 3L, 2L, 5L, 5L), .Label = c("basalt",
"clay", "coal", "dolomite", "limestone", "marl", "sand"), class =
"factor")), .Names = c("Density",
"Depth", "Porosity", "Conductivity", "LithClass"), row.names = c(NA,
114L), class = "data.frame")

	[[alternative HTML version deleted]]

______________________________________________
R-help at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.



More information about the R-help mailing list