[Rd] stats::line() does not produce correct Tukey line when n mod 6 is 2 or 3

Serguei Sokol sokol at insa-toulouse.fr
Wed May 31 18:46:34 CEST 2017


Le 31/05/2017 à 17:30, Serguei Sokol a écrit :
>
> More thorough reading revealed that I have overlooked this phrase in the
> line's doc: "left and right /thirds/ of the data" (emphasis is mine).
Oops. I have read the first ref returned by google and it happened to be
tibco's doc, not the R's one. The layout is very similar hence my mistake.
The latter does not mention "thirds" but ...
Anyway, here is a new line's patch which still gives a result slightly different
form MMline(). The slope is the same but not the intercept.
What are the exact terms for intercept calculation that should be implemented?

Serguei.
-------------- next part --------------
--- line.c.orig	2016-03-17 00:03:03.000000000 +0100
+++ line.c	2017-05-31 18:24:55.330030280 +0200
@@ -17,7 +17,7 @@
  *  https://www.R-project.org/Licenses/
  */
 
-#include <R_ext/Utils.h>	/* R_rsort() */
+#include <R_ext/Utils.h>    /* R_rsort() */
 #include <math.h>
 
 #include <Rinternals.h>
@@ -25,8 +25,8 @@
 
 /* Speed up by `inlining' these (as macros) [since R version 1.2] : */
 #if 1
-#define il(n,x)	(int)floor((n - 1) * x)
-#define iu(n,x)	(int) ceil((n - 1) * x)
+#define il(n,x) (int)floor(((n) - 1) * (x))
+#define iu(n,x) (int) ceil(((n) - 1) * (x))
 
 #else
 static int il(int n, double x)
@@ -41,71 +41,68 @@
 #endif
 
 static void line(double *x, double *y, /* input (x[i],y[i])s */
-		 double *z, double *w, /* work and output: resid. & fitted */
-		 /* all the above of length */ int n,
-		 double coef[2])
+         double *z, double *w, /* work and output: resid. & fitted */
+         int *indx, /* to get thirds of points independently of repeated values */
+         /* all the above of length */ int n,
+         double coef[2])
 {
     int i, j, k;
     double xb, x1, x2, xt, yt, yb, tmp1, tmp2;
     double slope, yint;
 
     for(i = 0 ; i < n ; i++) {
-	z[i] = x[i];
-	w[i] = y[i];
+        z[i] = x[i];
+        w[i] = y[i];
+        indx[i] = i;
     }
-    R_rsort(z, n);/* z = ordered abscissae */
+    rsort_with_index(z, indx, n);/* z = ordered abscissae */
 
-    tmp1 = z[il(n, 1./6.)];
-    tmp2 = z[iu(n, 1./6.)];	xb = 0.5*(tmp1+tmp2);
-
-    tmp1 = z[il(n, 2./6.)];
-    tmp2 = z[iu(n, 2./6.)];	x1 = 0.5*(tmp1+tmp2);
-
-    tmp1 = z[il(n, 4./6.)];
-    tmp2 = z[iu(n, 4./6.)];	x2 = 0.5*(tmp1+tmp2);
-
-    tmp1 = z[il(n, 5./6.)];
-    tmp2 = z[iu(n, 5./6.)];	xt = 0.5*(tmp1+tmp2);
+    k=(n+1)/3;
+    tmp1 = z[il(k, 0.5)];
+    tmp2 = z[iu(k, 0.5)];
+    /* xb := Median(first third of x) */
+    xb = 0.5*(tmp1+tmp2);
+
+    tmp1 = z[n-k+il(k, 0.5)];
+    tmp2 = z[n-k+iu(k, 0.5)];
+    /* xt := Median(last third of x) */
+    xt = 0.5*(tmp1+tmp2);
 
     slope = 0.;
 
     for(j = 1 ; j <= 1 ; j++) {
-	/* yb := Median(y[i]; x[i] <= quantile(x, 1/3) */
-	k = 0;
-	for(i = 0 ; i < n ; i++)
-	    if(x[i] <= x1)
-		z[k++] = w[i];
-	R_rsort(z, k);
-	yb = 0.5 * (z[il(k, 0.5)] + z[iu(k, 0.5)]);
-
-	/* yt := Median(y[i]; x[i] >= quantile(x, 2/3) */
-	k = 0;
-	for(i = 0 ; i < n ; i++)
-	    if(x[i] >= x2)
-		z[k++] = w[i];
-	R_rsort(z,k);
-	yt = 0.5 * (z[il(k, 0.5)] + z[iu(k, 0.5)]);
-
-	slope += (yt - yb)/(xt - xb);
-	for(i = 0 ; i < n ; i++) {
-	    z[i] = y[i] - slope*x[i];
-	    /* never used: w[i] = z[i]; */
-	}
-	R_rsort(z,n);
-	yint = 0.5 * (z[il(n, 0.5)] + z[iu(n, 0.5)]);
+        /* yb := Median(y[i]; x[i] in first third of x) */
+        for(i = 0 ; i < k ; i++)
+            z[i] = w[indx[i]];
+        R_rsort(z, k);
+        yb = 0.5 * (z[il(k, 0.5)] + z[iu(k, 0.5)]);
+
+        /* yt := Median(y[i]; x[i] in last third of x */
+        for(i = 0 ; i < k ; i++)
+            z[i] = w[indx[n-k+i]];
+        R_rsort(z,k);
+        yt = 0.5 * (z[il(k, 0.5)] + z[iu(k, 0.5)]);
+
+        slope += (yt - yb)/(xt - xb);
+        for(i = 0 ; i < n ; i++) {
+            z[i] = y[i] - slope*x[i];
+            /* never used: w[i] = z[i]; */
+        }
+        R_rsort(z,n);
+        yint = 0.5 * (z[il(n, 0.5)] + z[iu(n, 0.5)]);
     }
     for( i = 0 ; i < n ; i++ ) {
-	w[i] = yint + slope*x[i];
-	z[i] = y[i] - w[i];
+        w[i] = yint + slope*x[i];
+        z[i] = y[i] - w[i];
     }
     coef[0] = yint;
     coef[1] = slope;
 }
 
-void tukeyline0(double *x, double *y, double *z, double *w, int *n,
-	       double *coef)
+void tukeyline0(double *x, double *y, double *z, double *w, int *indx, int *n,
+           double *coef)
 {
-    line(x, y, z, w, *n, coef);
+    line(x, y, z, w, indx, *n, coef);
 }
 
 SEXP tukeyline(SEXP x, SEXP y, SEXP call)
@@ -127,7 +124,8 @@
     SET_VECTOR_ELT(ans, 2, res);
     SEXP fit = allocVector(REALSXP, n);
     SET_VECTOR_ELT(ans, 3, fit);
-    line(REAL(x), REAL(y), REAL(res), REAL(fit), n, REAL(coef));
+    SEXP indx = allocVector(INTSXP, n);
+    line(REAL(x), REAL(y), REAL(res), REAL(fit), INTEGER(indx), n, REAL(coef));
     UNPROTECT(1);
     return ans;
 }


More information about the R-devel mailing list