[R] R in the NY Times

Marc Schwartz marc_schwartz at comcast.net
Thu Jan 8 03:47:51 CET 2009


> Here's a couple of similar plots created with ggplot2.  I chose to
> turn the data into a data frame with an explicit date column.  Using a
> log scale somewhat stabilises the variability.
> 
> ## SAS-L traffic
> sas <- structure(list(Jan = c(NA, 546L, 548L, 853L, 1007L, 894L, 514L,
> 1720L, 1826L, 1941L, 1832L, 1636L, 2122L, 2722L, 2750L, 2305L,
> 357L), Feb = c(NA, 511L, 734L, 1024L, 1150L, 1068L, 493L, 1519L,
> 1537L, 1845L, 1846L, 1652L, 1960L, 1645L, 926L, 2255L, NA), Mar = c(NA,
> 658L, 963L, 805L, 1108L, 945L, 659L, 1177L, 1915L, 2010L, 1755L,
> 2188L, 629L, 1711L, 1728L, 2712L, NA), Apr = c(NA, 681L, 792L,
> 1052L, 1315L, 784L, 1077L, 1163L, 1467L, 2199L, 1757L, 1826L,
> 2169L, 2796L, 2766L, 2789L, NA), May = c(NA, 712L, 945L, 1163L,
> 1212L, 448L, 778L, 1963L, 1735L, 2373L, 1863L, 1836L, 2283L,
> 3147L, 2974L, 2025L, NA), Jun = c(NA, 751L, 1002L, 999L, 1127L,
> 813L, 540L, 1615L, 1905L, 2133L, 1701L, 2606L, 2407L, 2723L,
> 2691L, 2368L, NA), Jul = c(15L, 763L, 775L, 1184L, 1074L, 896L,
> 476L, 1572L, 2027L, 2445L, 1926L, 1843L, 2061L, 761L, 2435L,
> 2607L, NA), Aug = c(458L, 975L, 969L, 1053L, 692L, 823L, 612L,
> 1696L, 1976L, 1492L, 1689L, 2143L, 1793L, 2027L, 2592L, 2584L,
> NA), Sep = c(330L, 703L, 745L, 1176L, 947L, 894L, 1351L, 1491L,
> 1439L, 1864L, 1646L, 1784L, 1365L, 2714L, 1868L, 2554L, NA),
> Oct = c(219L, 805L, 691L, 1197L, 900L, 1129L, 1708L, 1669L,
> 1592L, 2133L, 1832L, 1712L, 1427L, 2983L, 2320L, 2434L, NA
> ), Nov = c(472L, 752L, 773L, 911L, 853L, 733L, 1720L, 1490L,
> 1636L, 1663L, 1545L, 1786L, 1518L, 2848L, 2112L, 1984L, NA
> ), Dec = c(517L, 666L, 765L, 844L, 677L, 492L, 1595L, 1298L,
> 1424L, 1520L, 1445L, 2148L, 1524L, 2374L, 1948L, 1921L, NA
> )), .Names = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
> "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), class = "data.frame",
> row.names = c("1993",
> "1994", "1995", "1996", "1997", "1998", "1999", "2000", "2001",
> "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009"
> ))
> 
> ## s-news traffic
> s <- structure(c(NA, 210, 264, 246, 230, 189, 197, 174, 109, 51, 48,
> 5, 273, 173, 313, 232, 255, 179, 230, 161, 87, 59, 63, NA, 378,
> 313, 285, 252, 242, 218, 257, 193, 99, 74, 58, NA, 293, 300,
> 264, 300, 228, 196, 151, 182, 123, 48, 47, NA, 330, 334, 306,
> 331, 219, 189, 164, 174, 107, 46, 31, NA, 243, 254, 247, 282,
> 248, 217, 175, 109, 96, 34, 27, NA, 219, 284, 245, 258, 230,
> 221, 154, 159, 84, 47, 40, NA, 209, 270, 302, 260, 207, 187,
> 187, 144, 97, 39, 28, NA, 191, 300, 204, 260, 221, 186, 195,
> 107, 68, 35, 41, NA, 241, 253, 251, 229, 280, 295, 150, 98, 73,
> 70, 30, NA, 181, 300, 261, 232, 228, 197, 176, 82, 53, 56, 27,
> NA, 141, 194, 176, 194, 177, 142, 176, 84, 20, 41, 36, NA), .Dim = c(12L,
> 12L), .Dimnames = list(c("1998", "1999", "2000", "2001", "2002",
> "2003", "2004", "2005", "2006", "2007", "2008", "2009"), c("Jan",
> "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
> "Nov", "Dec")))
> 
> r <- structure(c(NA, 135, 226, 205, 558, 884, 1017, 1116, 1746,
> 2075, 1714, 2490, 462, NA, 79, 145, 355, 583, 697, 1137, 1580, 1724,
> 1920, 1907, 2583, NA, NA, 114, 195, 377, 651, 880, 1203, 1946,
> 1703, 2270, 2191, 2740, NA, 92, 101, 189, 377, 470, 965, 1488,
> 1657, 2057, 1818, 2145, 2487, NA, 36, 90, 161, 504, 552, 1057,
> 1268, 1561, 1887, 2029, 2210, 2517, NA, 47, 105, 186, 418, 550,
> 926, 1319, 1714, 2056, 1811, 2307, 2774, NA, 41, 110, 184, 293,
> 615, 918, 1344, 1618, 1872, 1785, 2138, 3268, NA, 37, 64, 148,
> 356, 562, 824, 1210, 1493, 1777, 1898, 2241, 2813, NA, 40, 94,
> 203, 434, 678, 705, 1443, 1534, 1709, 1902, 2028, 2990, NA, 76,
> 96, 231, 418, 657, 1055, 1567, 1712, 1810, 2328, 2708, 3037,
> NA, 61, 184, 318, 433, 825, 1038, 1605, 1895, 1907, 2127, 2594,
> 2730, NA, 57, 105, 221, 422, 530, 742, 1158, 1481, 1508, 1450,
> 2028, 2399, NA), .Dim = c(13L, 12L), .Dimnames = list(c("1997",
> "1998", "1999", "2000", "2001", "2002", "2003", "2004", "2005",
> "2006", "2007", "2008", "2009"), c("Jan", "Feb", "Mar", "Apr",
> "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")))
> 
> library(reshape)
> sas <- melt(as.matrix(sas), na.rm = TRUE)
> r <- melt(r, na.rm = TRUE)
> s <- melt(s, na.rm = TRUE)
> names(r) <- names(s) <- names(sas) <- c("year", "month", "count")
> 
> sas$software <- "sas"
> s$software <- "s"
> r$software <- "r"
> all <- rbind(sas, s, r)
> all$date <- with(all,
>   as.Date(paste(year, month, 15, sep = "-"), "%Y-%b-%d"))
> 
> 
> library(ggplot2)
> qplot(date, count, data = all, geom = "line", colour = software) +
>    geom_smooth(se = F, size = 1)
> last_plot() + scale_y_log10(breaks = 10^(1:3), labels = 10^(1:3))
> 
> yearly <- ddply(all, .(year, software), function(df) c(count = sum(df$count)))
> qplot(year, count, data = yearly, geom = "line", colour = software)


Hadley,

You might want to remove the 2009 data from each of the three lists
given that the January data is not yet complete.

The result of including the January 2009 data in your plots is that the
growth trajectories of the smoothed curves for SAS-L and R-Help appear to
be leveling or even declining when, at least for R-Help, that is not the
case. The S-News curve is not affected significantly, given the already
declining counts.

The effect of the 2009 data is most noticeable in the log scale plot.

Thus:

# Drop the 2009 rows: the January 2009 counts were not yet complete, so
# keeping them makes the smoothed curves appear to level off or decline.
all <- subset(all, year < 2009)

# Linear scale
# Spell out FALSE rather than F: F is an ordinary variable that can be
# reassigned, FALSE is a reserved word.
qplot(date, count, data = all, geom = "line", colour = software) +
  geom_smooth(se = FALSE, size = 1)


# Log scale
# last_plot() picks up the most recent ggplot2 plot, so run this right after
# the linear-scale plot above.
last_plot() + scale_y_log10(breaks = 10^(1:3), labels = 10^(1:3))


HTH,

Marc Schwartz




More information about the R-help mailing list