[Rd] 2 bugs in max.col() (PR#9542)

oehl_list at gmx.de oehl_list at gmx.de
Sat Mar 3 18:48:59 CET 2007


Dear R-Developers,

I think I found two bugs in max.col(). Ties between zeros are not broken, which might affect simulations. -Inf and Zero can be treated the same, which can give completely wrong results, e.g. when the second max is sought by replacing all maxs by -Inf. 

To fix max.col I do offer the C-code behind my function rowMax(), which also handles NAs and seems to be faster. However, since max.col() is widely used and since I only occasionally program C, I'd appreciate if someone would do code-review and (again) thorough testing before publishing it. Please note that the code was developed under R 1.6.2 and might need porting to R 2.4.1 .

Best regards


Jens Oehlschlägel



# -- output of max.col under 2.4.1. ------------------------

> x <- rbind(c(1,1), c(0,0), c(-Inf, 0), c(0, Inf), c(0, NA), c(NA, 0))
> rownames(x) <- paste("c(", apply(x, 1, paste, collapse=","), ")")
> x
            [,1] [,2]
c( 1,1 )       1    1
c( 0,0 )       0    0
c( -Inf,0 ) -Inf    0
c( 0,Inf )     0  Inf
c( 0,NA )      0   NA
c( NA,0 )     NA    0
> 
> cat("ties not broken for c(0,0)\n")
ties not broken for c(0,0)
> cat("erroneously ties broken for c(-Inf, 0)\n")
erroneously ties broken for c(-Inf, 0)
> tmp <- sapply(1:10, function(i)max.col(x, ties.method="random"));rownames(tmp)<-rownames(x);tmp
            [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 )       2    1    2    2    1    1    1    2    1     1
c( 0,0 )       2    2    2    2    2    2    2    2    2     2
c( -Inf,0 )    2    1    1    2    2    1    2    1    1     1
c( 0,Inf )     2    2    2    2    2    2    2    2    2     2
c( 0,NA )     NA   NA   NA   NA   NA   NA   NA   NA   NA    NA
c( NA,0 )     NA   NA   NA   NA   NA   NA   NA   NA   NA    NA
> cat("The following look good\n")
The following look good
> tmp <- sapply(1:10, function(i)max.col(x, ties.method="first"));rownames(tmp)<-rownames(x);tmp
            [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 )       1    1    1    1    1    1    1    1    1     1
c( 0,0 )       1    1    1    1    1    1    1    1    1     1
c( -Inf,0 )    2    2    2    2    2    2    2    2    2     2
c( 0,Inf )     2    2    2    2    2    2    2    2    2     2
c( 0,NA )     NA   NA   NA   NA   NA   NA   NA   NA   NA    NA
c( NA,0 )     NA   NA   NA   NA   NA   NA   NA   NA   NA    NA
> tmp <- sapply(1:10, function(i)max.col(x, ties.method="last"));rownames(tmp)<-rownames(x);tmp
            [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 )       2    2    2    2    2    2    2    2    2     2
c( 0,0 )       2    2    2    2    2    2    2    2    2     2
c( -Inf,0 )    2    2    2    2    2    2    2    2    2     2
c( 0,Inf )     2    2    2    2    2    2    2    2    2     2
c( 0,NA )     NA   NA   NA   NA   NA   NA   NA   NA   NA    NA
c( NA,0 )     NA   NA   NA   NA   NA   NA   NA   NA   NA    NA

> version
               _                           
platform       i386-pc-mingw32             
arch           i386                        
os             mingw32                     
system         i386, mingw32               
status                                     
major          2                           
minor          4.1                         
year           2006                        
month          12                          
day            18                          
svn rev        40228                       
language       R                           
version.string R version 2.4.1 (2006-12-18)


# -- output of rowMax under 1.6.2 --------

> tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="random"));rownames(tmp)<-rownames(x);tmp
            [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 )       2    2    2    2    1    2    2    1    1     1
c( 0,0 )       1    1    2    2    2    2    1    1    1     2
c( -Inf,0 )    2    2    2    2    2    2    2    2    2     2
c( 0,Inf )     2    2    2    2    2    2    2    2    2     2
c( 0,NA )      1    1    1    1    1    1    1    1    1     1
c( NA,0 )      2    2    2    2    2    2    2    2    2     2
> tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="first"));rownames(tmp)<-rownames(x);tmp
            [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 )       1    1    1    1    1    1    1    1    1     1
c( 0,0 )       1    1    1    1    1    1    1    1    1     1
c( -Inf,0 )    2    2    2    2    2    2    2    2    2     2
c( 0,Inf )     2    2    2    2    2    2    2    2    2     2
c( 0,NA )      1    1    1    1    1    1    1    1    1     1
c( NA,0 )      2    2    2    2    2    2    2    2    2     2
> tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="last"));rownames(tmp)<-rownames(x);tmp
            [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
c( 1,1 )       2    2    2    2    2    2    2    2    2     2
c( 0,0 )       2    2    2    2    2    2    2    2    2     2
c( -Inf,0 )    2    2    2    2    2    2    2    2    2     2
c( 0,Inf )     2    2    2    2    2    2    2    2    2     2
c( 0,NA )      1    1    1    1    1    1    1    1    1     1
c( NA,0 )      2    2    2    2    2    2    2    2    2     2



# -- replication ----------

x <- rbind(c(1,1), c(0,0), c(-Inf, 0), c(0, Inf), c(0, NA), c(NA, 0))
rownames(x) <- paste("c(", apply(x, 1, paste, collapse=","), ")")
x

cat("ties not broken for c(0,0)\n")
cat("erroneously ties broken for c(-Inf, 0)\n")
tmp <- sapply(1:10, function(i)max.col(x, ties.method="random"));rownames(tmp)<-rownames(x);tmp
cat("The following look good\n")
tmp <- sapply(1:10, function(i)max.col(x, ties.method="first"));rownames(tmp)<-rownames(x);tmp
tmp <- sapply(1:10, function(i)max.col(x, ties.method="last"));rownames(tmp)<-rownames(x);tmp

# rowMax under R 1.6.2
tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="random"));rownames(tmp)<-rownames(x);tmp
tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="first"));rownames(tmp)<-rownames(x);tmp
tmp <- sapply(1:10, function(i)rowMax(x, value=FALSE, na.rm=TRUE, ties.method="last"));rownames(tmp)<-rownames(x);tmp


# -- R 1.6.2 -----------

## Copyright 2003-2007 Jens Oehlschlägel
## May be used (possibly modified) with name "max.col" under GPL
## Provided "as is"
## Use at own risk

rowMax <- function(x, value=TRUE, index=NULL, na.rm=FALSE, ties.method = c("random", "first", "last"), use.names=FALSE){
  if (!length(x))
    stop("x has no length")
  d <- dim(x)
  if (!is.null(index) && !identical(d, dim(index)))
      stop("index has wrong dimension")
  if (length(d)!=2)
    stop("x must be a matrix")
  if (is.na(value) || !value){
    ties.method <- pmatch(ties.method, c("random", "first", "last"))
    if (is.na(ties.method))
      stop("illegal ties method")
    ret <- .C("R_row_maxwithindex_double"
    , x=as.double(x)
    , nr=d[1]
    , nc=d[2]
    , narm = as.integer(na.rm)
    , index=integer(d[1])
    , tiesmethod=as.integer(ties.method)
    , NAOK = TRUE, DUP = FALSE, PACKAGE = "base")[c("index")]
    i <- cbind(seq(length=d[1]), ret$index)
    if (is.na(value))
      ret$value <- x[i]
    if (!is.null(index))
      ret$index <- index[i]
    if (is.na(value)){
      if (use.names){
        names(ret$index) <- dimnames(x)[1]
        names(ret$values) <- dimnames(x)[1]
      }
      return(ret)
    }else{
      if (use.names){
        names(ret$index) <- dimnames(x)[1]
      }
      return(ret$index)
    }
  }else{
    ret <- .C("R_row_max_double"
    , x=as.double(x)
    , nr=d[1]
    , nc=d[2]
    , narm = as.integer(na.rm)
    , values=double(d[1])
    , NAOK = TRUE, DUP = FALSE, PACKAGE = "base")["values"]
    if (use.names){
      names(ret$values) <- dimnames(x)[1]
    }
    return(ret$values)
  }
}


# -- C --------------

// Author: Jens Oehlschlägel
// Date:   3.3.2007
// Provided 'as is' under GPL
// Use at own risk

#include <R_ext/Arith.h>        /* NA handling */
#include <Rmath.h>              /* probably not needed */
#include <R_ext/Random.h>       /* ..RNGstate */
#include <R_ext/Applic.h>       /* NA handling */
#include <R_ext/Utils.h>      /* probably not needed */

#define RELTOL 1e-5


void R_row_maxwithindex_double(
  double *x
, int *nr
, int *nc
, int *narm
, int *index
, int *tiesmethod
)
{
    int   r, c, m, ntie, n_r = *nr, n_c = *nc, na_rm = *narm, ties_method = *tiesmethod;
    double a = 0; // to avoid uninitialized compiler warning
    double b, tol, small, absa;
    Rboolean isna;
    if (ties_method==1)
      GetRNGstate();

    for (r = 0; r < n_r; r++) {
        /* first check row for any NAs and find the smallst entry */
        small = R_PosInf;
        if (na_rm){
          isna = TRUE;
          for (c = 0; c < n_c; c++){
              a = x[r + c * n_r];
              if (!ISNAN(a)) {
                isna = FALSE;
                absa = fabs(a);
                if (absa<small && absa!=0)
                  small = absa;
              }
          }
        }else{
          isna = FALSE;
          for (c = 0; c < n_c; c++) {
              a = x[r + c * n_r];
              if (ISNAN(a)) {
                 isna = TRUE;
                 break;
              }else{
                absa = fabs(a);
                if (absa<small && absa!=0)
                  small = absa;
              }
          }
        }
        if (isna) {
          index[r] = NA_INTEGER;
        }else{
          if (R_FINITE(small))
            tol = RELTOL * small;
          else
            tol = 0;
          //Rprintf("small=%g tol=%g\n", small, tol);
          // the following loop has two parts
          // 1) find non-NA
          // 2) find extreme (and don't ask  ISNAN anymore)
          m = NA_INTEGER;
          ntie = 1;
          for (c = 0; c < n_c; c++) {
            a = x[r + c * n_r];
            if (!ISNAN(a)){
              m = c;
              break;
            }
          }
          for (c++; c < n_c; c++) {
              b = x[r + c * n_r];
              //Rprintf("a-tol=%g a=%g a+tol=%g b=%g\n", a-tol, a, a+tol, b);
              if (b > a + tol){   // MASS had >= here, which does not tie zeros and Inf
                  ntie = 1;
                  a = b;
                  m = c;
              } else if (b >= a - tol) {
                  if (ties_method==1){
                    ntie++;
                    if (ntie * unif_rand() < 1.) m = c;
                  }else if (ties_method==3){
                    m=c;
                  }
              }
          }
          index[r] = m + 1;
        }
    }
    if (ties_method==1)
      PutRNGstate();
}


void R_row_max_double(double *x, int *nr, int *nc, int *narm, double *values)
{
    int   r, c, n_r = *nr, n_c = *nc, na_rm = *narm;
    double e, extreme;
    Rboolean isna;

    for (r = 0; r < n_r; r++) {
        /* first check row for any NAs and find the largest entry */
        extreme = R_NegInf;
        isna = FALSE;
        for (c = 0; c < n_c; c++) {
            e = x[r + c * n_r];
            if (ISNAN(e)){
                if (!na_rm){
                    isna = TRUE;
                    break;
                }
            }else if (e>extreme){
                extreme = e;
            }

        }
        if (isna)
            values[r] = NA_REAL;
        else
            values[r] = extreme;
    }
}

-- 
"Feel free" - 10 GB Mailbox, 100 FreeSMS/Monat ...
Jetzt GMX TopMail testen: www.gmx.net/de/go/mailfooter/topmail-out



More information about the R-devel mailing list