[R] G-test : log-likelihood ratio test

Peter L. Hurd phurd at ualberta.ca
Thu Sep 20 18:48:34 CEST 2001


I've written a g.test() aka log-likelihood ratio test function for my
opwn use.  It's something I've seen requested (and looked to find
myself) on this list a few times.

It has the same basic syntax as chisq.test().
It does both goodness of fit tests and tests of independence.
Yates' and Williams' corrections are implemented.

I've put some examples from Sokal & Rohlf up at
http://www.psych.ualberta.ca/~phurd/cruft

Feedback welcomed,
-P.

# Kludgy log-likelihood test of independence & goodness of fit
# Does Williams' and Yates' correction
#
# G (TOI) calculation from Zar (2000) Biostatistical Analysis 4th ed.
# G (GOF) calculation from Sokal & Rohlf (1995) Biometry 3rd ed.
# q calculation from Sokal & Rohlf (1995) Biometry 3rd ed.
# TOI Yates' correction taken from Mike Camann's 2x2 G-test fn.
# GOF Yates' correction as described in Zar (2000)
# more stuff taken from ctest's chisq.test()
#
# ToDo:
# 1) Beautify
# 2) Add warnings for violations
# 3) Make appropriate corrections happen by default
#
# V2.1 Pete Hurd Sept 20 2001.

g.test <- function(x, y = NULL, correct="none", p = rep(1/length(x), length(x)))
{
  DNAME <- deparse(substitute(x))
  if (is.data.frame(x)) x <- as.matrix(x)
  if (is.matrix(x)) {
    if (min(dim(x)) == 1) 
      x <- as.vector(x)
  }
  if (!is.matrix(x) && !is.null(y)) {
    if (length(x) != length(y)) 
      stop("x and y must have the same length")
    DNAME <- paste(DNAME, "and", deparse(substitute(y)))
    OK <- complete.cases(x, y)
    x <- as.factor(x[OK])
    y <- as.factor(y[OK])
    if ((nlevels(x) < 2) || (nlevels(y) < 2)) 
      stop("x and y must have at least 2 levels")
    x <- table(x, y)
  }
  if (any(x < 0) || any(is.na(x))) 
    stop("all entries of x must be nonnegative and finite")
  if ((n <- sum(x)) == 0) 
    stop("at least one entry of x must be positive")
  #If x is matrix, do test of independence
  if (is.matrix(x)) {
    #this block was the separate g.stat function
    cell.tot <- row.tot <- col.tot <- grand.tot <- 0
    nrows<-nrow(x)
    ncols<-ncol(x)
    if (correct=="yates"){ # Do Yates' correction
      if(dim(x)[1]!=2 || dim(x)[2]!=2) # check for 2x2 matrix
        stop("Yates' correction requires a 2 x 2 matrix")
      if((x[1,1]*x[2,2])-(x[1,2]*x[2,1]) > 0)
        {
          x[1,1] <- x[1,1] - 0.5
          x[2,2] <- x[2,2] - 0.5
          x[1,2] <- x[1,2] + 0.5
          x[2,1] <- x[2,1] + 0.5
        }
      else
        {
          x[1,1] <- x[1,1] + 0.5
          x[2,2] <- x[2,2] + 0.5
          x[1,2] <- x[1,2] - 0.5
          x[2,1] <- x[2,1] - 0.5
        }
    }
    # calculate G (Zar, 2000)
    for (i in 1:nrows){
      for (j in 1:ncols){
        if (x[i,j] != 0) cell.tot <- cell.tot + x[i,j] * log10(x[i,j])
      }
    }
    for (i in 1:nrows){ row.tot <- row.tot + (sum(x[i,])) * log10(sum(x[i,])) }
    for (j in 1:ncols){ col.tot <- col.tot + (sum(x[,j])) * log10(sum(x[,j])) }
    grand.tot <- sum(x)*log10(sum(x))
    total <- cell.tot - row.tot - col.tot + grand.tot
    G <- 4.60517 * total
    q <- 1
    if (correct=="williams"){ # Do Williams' correction
      row.tot <- col.tot <- 0    
      for (i in 1:nrows){ row.tot <- row.tot + 1/(sum(x[i,])) }
      for (j in 1:ncols){ col.tot <- col.tot + 1/(sum(x[,j])) }
      q <- 1+ ((n*row.tot-1)*(n*col.tot-1))/(6*n*(ncols-1)*(nrows-1))
    }
    G <- (G/q)
    # end of old g.stat function
    
    STATISTIC <- G
    PARAMETER <- (nrow(x)-1)*(ncol(x)-1)
    PVAL <- 1-pchisq(STATISTIC,df=PARAMETER)
    if(correct=="none")
      METHOD <- "Log likelihood ratio (G-test) test of independence without correction"
    if(correct=="williams")
      METHOD <- "Log likelihood ratio (G-test) test of independence with Williams' correction"
    if(correct=="yates")
      METHOD <- "Log likelihood ratio (G-test) test of independence with Yates' correction"
  }
  else {
    # x is not a matrix, so we do Goodness of Fit
    METHOD <- "Log likelihood ratio (G-test) goodness of fit test"
    if (length(x) == 1) 
      stop("x must at least have 2 elements")
    if (length(x) != length(p)) 
      stop("x and p must have the same number of elements")
    E <- n * p
    
    if (correct=="yates"){ # Do Yates' correction
      if(length(x)!=2)
        stop("Yates' correction requires 2 data values")
      if ( (x[1]-E[1]) > 0.25) {
        x[1] <- x[1]-0.5
        x[2] <- x[2]+0.5
      }
      else if ( (E[1]-x[1]) > 0.25){
        x[1] <- x[1]+0.5
        x[2] <- x[2]-0.5
      }
    }
    names(E) <- names(x)
    tot <- 0
    for (i in 1:length(x)){
      if (x[i] != 0) tot <- tot + x[i] * log(x[i]/E[i])
    }
    G <- (2*tot)
    if (correct=="williams"){ # Do Williams' correction
      q <- 1+(length(x)+1)/(6*n)
      G <- (G/q)
    }
    STATISTIC <- (G)
    PARAMETER <- length(x) - 1
    PVAL <- pchisq(STATISTIC, PARAMETER, lower = FALSE)
  }
  names(STATISTIC) <- "Log likelihood ratio statistic (G)"
  names(PARAMETER) <- "X-squared df"
  names(PVAL) <- "p.vlaue"
  structure(list(statistic=STATISTIC,parameter=PARAMETER,p.value=PVAL,
                 method=METHOD,data.name=DNAME),class="htest")
}

-- 
Peter L. Hurd                                  Department of Psychology
Assistant Professor                               University of Alberta
phurd at ualberta.ca                                     Edmonton, Alberta
http://www.psych.ualberta.ca/~phurd                      T6G 2E9 Canada


-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
r-help mailing list -- Read http://www.ci.tuwien.ac.at/~hornik/R/R-FAQ.html
Send "info", "help", or "[un]subscribe"
(in the "body", not the subject !)  To: r-help-request at stat.math.ethz.ch
_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._



More information about the R-help mailing list