[Rd] choose(n,k) when n is almost integer

Tue Dec 22 14:25:15 CET 2009

On Sat, Dec 19, 2009 at 10:02:49PM +0100, Martin Maechler wrote:
> {I've changed the subject;  it's really no longer about
>  lchoose()'s definition}
> 
[.......................]
> 
> Thanks a lot, Petr, for your explorations.
> I agree that at least something like your conservative change
> should happen.
> 
> Personally, I'd agree with you and prefer the "progressive"
> (non-conservative) change,
> basically getting rid of  all  R_IS_INT(n)
> tests, which seem kludgey.
> 
> OTOH, the R core members who did start using R_IS_INT(n)
> must have had good reasons with another set examples.
> 
> Have you tried 'make check-devel' (or 'check-all') also with the 
> progressive change?

After some tests, i changed the two suggested patches. The new versions
will be called A and B.

Let us consider computing choose(n, k) with an integer k and a general n.

Patch A is a minimal one. It rounds n to be an exact integer, if we assume
this after the test R_IS_INT(n).

Patch B corresponds to the progressive older suggestion. It tries to avoid
rounding of n, if possible. Rounding is not needed, is k < 30 or if none
of the factors in the product n(n-1)...(n-k+1) is close to zero.

Patch A (without indentation in attachment):

  --- R-devel-orig-sse/src/nmath/choose.c	2009-12-17 17:52:39.000000000 +0100
  +++ R-devel-work-copy-1-sse/src/nmath/choose.c	2009-12-22 09:06:32.000000000 +0100
  @@ -109,6 +109,7 @@
   	MATHLIB_WARNING2(_("'k' (%.2f) must be integer, rounded to %.0f"), k0, k);
       if (k < k_small_max) {
   	int j;
  +	if (R_IS_INT(n)) n = floor(n + 0.5);
   	if(n-k < k && n >= 0 && R_IS_INT(n)) k = n-k; /* <- Symmetry */
   	if (k <	 0) return 0.;
   	if (k == 0) return 1.;
  @@ -126,6 +127,7 @@
   	return r;
       }
       else if (R_IS_INT(n)) {
  +	n = floor(n + 0.5);
   	if(n < k) return 0.;
   	if(n - k < k_small_max) return choose(n, n-k); /* <- Symmetry */
   	return floor(exp(lfastchoose(n, k)) + 0.5);

Path B (without indentation in attachment):

  --- R-devel-orig-sse/src/nmath/choose.c	2009-12-17 17:52:39.000000000 +0100
  +++ R-devel-work-copy-2-sse/src/nmath/choose.c	2009-12-22 13:49:33.000000000 +0100
  @@ -93,13 +93,14 @@
       return lfastchoose(n, k);
   }
   
  +#define IS_INT(x)	  ((x) == floor(x))
   #define k_small_max 30
   /* 30 is somewhat arbitrary: it is on the *safe* side:
    * both speed and precision are clearly improved for k < 30.
   */
   double choose(double n, double k)
   {
  -    double r, k0 = k;
  +    double r, k0 = k, n1;
       k = floor(k + 0.5);
   #ifdef IEEE_754
       /* NaNs propagated correctly */
  @@ -109,14 +110,14 @@
   	MATHLIB_WARNING2(_("'k' (%.2f) must be integer, rounded to %.0f"), k0, k);
       if (k < k_small_max) {
   	int j;
  -	if(n-k < k && n >= 0 && R_IS_INT(n)) k = n-k; /* <- Symmetry */
  +	if(n-k < k && n >= 0 && IS_INT(n)) k = n-k; /* <- Symmetry */
   	if (k <	 0) return 0.;
   	if (k == 0) return 1.;
   	/* else: k >= 1 */
   	r = n;
   	for(j = 2; j <= k; j++)
  -	    r *= (n-j+1)/j;
  -	return R_IS_INT(n) ? floor(r + 0.5) : r;
  +	    r *= (n-(j-1))/j;
  +	return IS_INT(n) ? floor(r + 0.5) : r;
   	/* might have got rounding errors */
       }
       /* else: k >= k_small_max */
  @@ -125,12 +126,15 @@
   	if (ODD(k)) r = -r;
   	return r;
       }
  -    else if (R_IS_INT(n)) {
  +    else if (IS_INT(n)) {
   	if(n < k) return 0.;
   	if(n - k < k_small_max) return choose(n, n-k); /* <- Symmetry */
   	return floor(exp(lfastchoose(n, k)) + 0.5);
       }
       /* else non-integer n >= 0 : */
  +    n1 = floor(n + 0.5);
  +    if (n1 <= k-1 && fabs(n1 - n) <= 1e-7)
  +	return 0.;
       if (n < k-1) {
   	int s_choose;
   	r = lfastchoose2(n, k, /* -> */ &s_choose);
  @@ -140,5 +144,6 @@
   }
   
   #undef ODD
  +#undef IS_INT
   #undef R_IS_INT
   #undef k_small_max

Both patches solve the main problems of the original implementation, namely

  # for k < k_small_max
  choose(5 - 1e-9, 5) # [1] 0
  choose(5 + 1e-9, 5) # [1] 5
  # for k >= k_small_max
  choose(30 - 1e-9, 30) # [1] 0
  choose(60 - 1e-9, 30) # Error: segfault from C stack overflow

and we get in 7 digit precision

  # for k < k_small_max
  choose(5 - 1e-9, 5) # [1] 1
  choose(5 + 1e-9, 5) # [1] 1
  # for k >= k_small_max
  choose(30 - 1e-9, 30) # [1] 1
  choose(60 - 1e-9, 30) # [1] 1.182646e+17

The patches do not modify lchoose(). I think, this should be done after
a decision how to modify choose().

Both patches go through make check-devel until the test of Tcl/Tk, which fails,
since i do not have Tcl/Tk on the machine, which i have now available. I will
perform the whole test later.

Both patches and also the original implementation sometimes generate a warning
of the following type

  > choose(19 - 2e-7, 30)
  [1] -3.328339e-16
  Warning message:
  In choose(19 - 2e-07, 30) :
    full precision may not have been achieved in 'lgamma'

These warnings are generated in functions called from choose() and the intention
of this patch is not to change this. Most of these warnings are eliminated
in the original implementation by treating numbers, which differ from an
integer by less than 1e-7, as integers. However, there are some such cases
even if the difference is slightly larger than 1e-7 like the above 2e-7.

The cases, when the above warning is generated, are the same for both patches A and B
in the tests, which i did.

Patch B is significantly more accurate in cases, when the result is more than 1.

In order to verify these two claims, i used the test script at
  http://www.cs.cas.cz/~savicky/R-devel/test_choose_0.R

Its outputs with A and B are presented below. The list of cases with warning 
   Cases with warning
        [,1] [,2]
   [1,] 6715   31
   [2,]  152   33
   [3,]  152   34
   [4,] 6393   37
   [5,] 9388   39
   [6,] 6059   40
   [7,] 8773   40
   [8,] 5058   44
   [9,] 5531   44
  [10,] 6670   46
  [11,]  652   47
  [12,] 2172   49
  [13,] 3371   49
  [14,] 1913   50
is the same in all four tests. So, i only include the tables of errors, which
are different. The table contains columns "bound" and "error". The error is
the maximum error achieved in cases, where the compared values are at most the
bound on the same row and more than the bounds on the previous rows.

Patch A on Intel extended arithmetic

  minimum log2() of a wrong result for integer n : 53.95423 
  maximum error for real n :
       bound        error
  [1,] 0e+00 1.206026e-08
  [2,] 1e-20 2.699943e-08
  [3,] 1e-15 3.444711e-08
  [4,] 1e-10 2.806709e-08
  [5,] 1e-05 8.395916e-09
  [6,] 1e+00 3.695634e-07
  [7,]   Inf 2.990987e-07

Patch A on SSE arithmetic

  minimum log2() of a wrong result for integer n : 49.94785 
  maximum error for real n :
       bound        error
  [1,] 0e+00 1.206026e-08
  [2,] 1e-20 2.699943e-08
  [3,] 1e-15 3.444709e-08
  [4,] 1e-10 2.806707e-08
  [5,] 1e-05 8.395951e-09
  [6,] 1e+00 3.695634e-07
  [7,]   Inf 2.990987e-07

Patch B on Intel extended arithmetic

  minimum log2() of a wrong result for integer n : 53.95423 
  maximum error for real n :
       bound        error
  [1,] 0e+00 2.047988e-09
  [2,] 1e-20 2.699943e-08
  [3,] 1e-15 3.444711e-08
  [4,] 1e-10 2.806709e-08
  [5,] 1e-05 8.395916e-09
  [6,] 1e+00 1.207856e-11
  [7,]   Inf 1.509903e-14

Patch B on SSE arithmetic

  minimum log2() of a wrong result for integer n : 49.94785 
  maximum error for real n :
       bound        error
  [1,] 0e+00 2.047988e-09
  [2,] 1e-20 2.699943e-08
  [3,] 1e-15 3.444709e-08
  [4,] 1e-10 2.806707e-08
  [5,] 1e-05 8.395951e-09
  [6,] 1e+00 1.205369e-11
  [7,]   Inf 2.731149e-14

We can see that B has smaller error, if the output is more than 1.

For integer n, we can see that the smallest output values, where we
do not get the exact result is the same for A and B. There is a difference
between Intel extended arithmetic and SSE for clear reasons.

I suggest to use B and i appreciate comments.

Petr Savicky.

-------------- next part --------------

--- R-devel-orig-sse/src/nmath/choose.c	2009-12-17 17:52:39.000000000 +0100
+++ R-devel-work-copy-1-sse/src/nmath/choose.c	2009-12-22 09:06:32.000000000 +0100
@@ -109,6 +109,7 @@
 	MATHLIB_WARNING2(_("'k' (%.2f) must be integer, rounded to %.0f"), k0, k);
     if (k < k_small_max) {
 	int j;
+	if (R_IS_INT(n)) n = floor(n + 0.5);
 	if(n-k < k && n >= 0 && R_IS_INT(n)) k = n-k; /* <- Symmetry */
 	if (k <	 0) return 0.;
 	if (k == 0) return 1.;
@@ -126,6 +127,7 @@
 	return r;
     }
     else if (R_IS_INT(n)) {
+	n = floor(n + 0.5);
 	if(n < k) return 0.;
 	if(n - k < k_small_max) return choose(n, n-k); /* <- Symmetry */
 	return floor(exp(lfastchoose(n, k)) + 0.5);
-------------- next part --------------
--- R-devel-orig-sse/src/nmath/choose.c	2009-12-17 17:52:39.000000000 +0100
+++ R-devel-work-copy-2-sse/src/nmath/choose.c	2009-12-22 13:49:33.000000000 +0100
@@ -93,13 +93,14 @@
     return lfastchoose(n, k);
 }
 
+#define IS_INT(x)	  ((x) == floor(x))
 #define k_small_max 30
 /* 30 is somewhat arbitrary: it is on the *safe* side:
  * both speed and precision are clearly improved for k < 30.
 */
 double choose(double n, double k)
 {
-    double r, k0 = k;
+    double r, k0 = k, n1;
     k = floor(k + 0.5);
 #ifdef IEEE_754
     /* NaNs propagated correctly */
@@ -109,14 +110,14 @@
 	MATHLIB_WARNING2(_("'k' (%.2f) must be integer, rounded to %.0f"), k0, k);
     if (k < k_small_max) {
 	int j;
-	if(n-k < k && n >= 0 && R_IS_INT(n)) k = n-k; /* <- Symmetry */
+	if(n-k < k && n >= 0 && IS_INT(n)) k = n-k; /* <- Symmetry */
 	if (k <	 0) return 0.;
 	if (k == 0) return 1.;
 	/* else: k >= 1 */
 	r = n;
 	for(j = 2; j <= k; j++)
-	    r *= (n-j+1)/j;
-	return R_IS_INT(n) ? floor(r + 0.5) : r;
+	    r *= (n-(j-1))/j;
+	return IS_INT(n) ? floor(r + 0.5) : r;
 	/* might have got rounding errors */
     }
     /* else: k >= k_small_max */
@@ -125,12 +126,15 @@
 	if (ODD(k)) r = -r;
 	return r;
     }
-    else if (R_IS_INT(n)) {
+    else if (IS_INT(n)) {
 	if(n < k) return 0.;
 	if(n - k < k_small_max) return choose(n, n-k); /* <- Symmetry */
 	return floor(exp(lfastchoose(n, k)) + 0.5);
     }
     /* else non-integer n >= 0 : */
+    n1 = floor(n + 0.5);
+    if (n1 <= k-1 && fabs(n1 - n) <= 1e-7)
+	return 0.;
     if (n < k-1) {
 	int s_choose;
 	r = lfastchoose2(n, k, /* -> */ &s_choose);
@@ -140,5 +144,6 @@
 }
 
 #undef ODD
+#undef IS_INT
 #undef R_IS_INT
 #undef k_small_max