# [Rd] stats::line() does not produce correct Tukey line when n mod 6 is 2 or 3

Martin Maechler maechler at stat.math.ethz.ch
Tue May 30 09:33:10 CEST 2017

```>>>>> Serguei Sokol <sokol at insa-toulouse.fr>
>>>>>     on Mon, 29 May 2017 15:28:12 +0200 writes:

> Sorry, I have seen it too late that we had different tab
> width in the original file and my editor.  Here is the
> patch with all white spaces instead of mixing tabs and
> white spaces.

thank you - it still gives quite a few unnecessary (i.e. "white space")
diffs with the source code, but that's not the problem :

The patch leads to correct results for the simple
(1:k, 1:k)  data sets (for all k).

However, even after the patch,
The example from the SO post differs from the result of Richie
Cotton's function...(even though that function had a silly bug
in step 1, the bug has not been "kicking" for the example):

Here's a fixed-up version of the pure R function and
the example and some comments :

## From Stackoverflow
## http://stackoverflow.com/questions/3224731/john-tukey-median-median-or-resistant-line-statistical-test-for-r-and-line
## median_median_line by Richie Cotton (July 12 2010, last edited at 13:49)
##
## Shorter variable names, fixed bug in step 1, added 'plot.' option: Martin Maechler, May 2017
MMline <- function(x, y, data, plot. = FALSE)
{
if(!missing(data))
{
x <- eval(substitute(x), data)
y <- eval(substitute(y), data)
}
stopifnot((n <- length(x)) == length(y), n >= 2)

## Step 1
n.3 <- n %/% 3L
groups <- rep(1:3, times = switch((n %% 3) + 1L,
n.3,
c(n.3, n.3 + 1L, n.3),
c(n.3 + 1L, n.3, n.3 + 1L)
))
groups <- lapply(list(groups), as.factor) # (gain a bit in tapply())

## Step 2
## sort *both*  x & y "along x":
x <- sort.int(x, index.return = TRUE)
y <- y[x\$ix]
x <- x\$x
if(plot.) plot(x,y)

## Step 3
m_x <- tapply(x, groups, median)
m_y <- tapply(y, groups, median)
if(plot.) {
points(m_x, m_y, cex=2, pch=3, col="red")
segments(m_x[1],m_y[1], m_x[3],m_y[3], col="red")
}
## Step 4
R <- if(n == 2) 2L else 3L
slope <- (m_y[[R]] - m_y[[1]]) / (m_x[[R]] - m_x[[1]])
intercept <- m_y[[1]] - slope * m_x[[1]]

## Step 5
mid_y <- intercept + slope * m_x[[2]]
intercept <- intercept + (m_y[[2]] - mid_y) / 3
if(plot.) abline(a = intercept, b = slope, col=adjustcolor("midnight blue", .5), lwd=2)
c(intercept = intercept, slope = slope)
}

## To test it, here's the second example from that page:

dfr <- data.frame(
time = c( .16,  .24,  .25,  .30,  .30,    .32,  .36,  .36,  .50,  .50,
.57,  .61,  .61,  .68,  .72,    .72,  .83,  .88,  .89),
distance = c(12.1, 29.8, 32.7, 42.8, 44.2,   55.8, 63.5, 65.1,124.6,129.7,
150.2,182.2,189.4,220.4,250.4,  261.0,334.5,375.5,399.1))

MMline(time, distance, dfr, plot.=TRUE)
## intercept    slope
##   -113.6     520.0
par(new=TRUE)# should plot identically!
MMline(time, distance, dfr[sample.int(nrow(dfr)), ], plot.=TRUE)

## Note the slightly odd way of specifying the groups. The instructions are
## quite picky about how you define group sizes, so the more obvious method
## of cut(x, quantile(x, seq.int(0, 1, 1/3))) doesn't work.

## edited Jul 12 '10 at 13:49 / answered Jul 12 '10 at 13:36
## Richie Cotton

## And someone remarked  that R's  line() did not give the same:

with(dfr, line(time, distance))
## ...
## Coefficients:
## [1]  -108.8   505.2
abline(-108.8, 505.2, col="blue") ##=> this one is wrong

## MM:
(cfs <- t(sapply(setNames(,2:50), function(k) {x <- 1:k; MMline(x,x)})))
##    intercept slope
## 2          0     1 # (special case fixed)
## 3          0     1
## 4          0     1
## 5          0     1
##   ............
## 49         0     1
## 50         0     1

## "Harder": (sort ing!)
cf2 <- t(sapply(setNames(,2:50),
function(k) {x <- sample.int(k); MMline(x, -10*x)}))
stopifnot(abs(cf2[,"intercept"] -   0) < 1e-12,
abs(cf2[,    "slope"] - -10) < 1e-12)

```