[Rd] Enhanced version of plot.lm()

Sat Apr 23 13:30:17 CEST 2005

I propose the following enhancements and changes to plot.lm(),
the most important of which is the addition of a Residuals vs
Leverage plot.

(1) A residual versus leverage plot has been added, available
by specifying which = 5, and not included as one of the default
plots.  Contours of Cook's distance are included, by default at
values of 0.5 and 1.0.  The labeled points, if any, are those with
the largest Cook's distances.  The parameter cook.levels can be
changed as required, to control what contours appear.

(2) Remove the word "plot" from the captions for which=2, 3, 4.
It is redundant.

(3) Now that the pos argument to text() is vectorized, use that
in preference to an offset.

(4) For which!=4 or 5, by default use pos=4 on the left half
of the panel, and pos=2 on the right half of the panel.
This prevents labels from appearing outside the plot area,
where they can overlap other graphical features.
The parameter label.pos allows users to change this default.

The modified code that I propose is below.   This, a modified .Rd
file, and files from diff used with the April 20 development version,
are in my directory

http://wwwmaths.anu.edu.au/~johnm/r/plot-lm/

I believe the Residual-Leverage plot is given in Krause & Olsen,
whether with Cook's distance contours I do not recall.  I do not
have access to a copy of this book.  Martin Maechler drew my
attention to it in 2003, as superior to the Cook's distance plot.
I have finally got around to coding it up!

John Maindonald.

"plot.lm" <-
function (x, which = 1:4, caption = c("Residuals vs Fitted",
                               "Normal Q-Q", "Scale-Location",
                               "Cook's distance", "Residuals vs 
Leverage"),
             panel = points, sub.caption = deparse(x$call), main = "",
             ask = prod(par("mfcol")) < length(which) && 
dev.interactive(),
             ..., id.n = 3, labels.id = names(residuals(x)), cex.id = 
0.75,
             cook.levels=c(0.5, 1.0), label.pos=c(4,2))
{
   if (!inherits(x, "lm"))
     stop("Use only with 'lm' objects")
   if (!is.numeric(which) || any(which < 1) || any(which > 5))
     stop("`which' must be in 1:5")
   isGlm <- inherits(x, "glm")
   show <- rep(FALSE, 5)
   show[which] <- TRUE
   r <- residuals(x)
   yh <- predict(x)
   w <- weights(x)
   if (!is.null(w)) {
     wind <- w != 0
     r <- r[wind]
     yh <- yh[wind]
     w <- w[wind]
     labels.id <- labels.id[wind]
   }
   n <- length(r)
   if (any(show[2:5])) {
     s <- if (inherits(x, "rlm"))
       x$s
     else sqrt(deviance(x)/df.residual(x))
     hii <- lm.influence(x, do.coef = FALSE)$hat
     if (any(show[4:5])) {
       cook <- if (isGlm)cooks.distance(x)
       else cooks.distance(x, sd = s, res = r)
     }
   }
   if (any(show[c(2:3,5)])) {
     ylab23 <- if (isGlm)
       "Std. deviance resid."
     else "Standardized residuals"
     r.w <- if (is.null(w))
       r
     else sqrt(w) * r
     rs <- r.w/(s * sqrt(1 - hii))
   }
   if (any(show[c(1, 3)]))
     l.fit <- if (isGlm)
       "Predicted values"
     else "Fitted values"
   if (is.null(id.n))
     id.n <- 0
   else {
     id.n <- as.integer(id.n)
     if (id.n < 0 || id.n > n)
       stop("`id.n' must be in {1,..,", n, "}")
   }
   if (id.n > 0) {
     if (is.null(labels.id))
       labels.id <- paste(1:n)
     iid <- 1:id.n
     show.r <- sort.list(abs(r), decreasing = TRUE)[iid]
     if (any(show[2:3]))
       show.rs <- sort.list(abs(rs), decreasing = TRUE)[iid]
     text.id <- function(x, y, ind, adj.x = FALSE){
       midx <- mean(range(x))
       labpos <- if (!adj.x) 3
       else
         label.pos[1+as.numeric(x>midx)]
       text(x, y, labels.id[ind], cex = cex.id, xpd = TRUE, pos=labpos,
            offset=0.25)
     }
   }
   one.fig <- prod(par("mfcol")) == 1
   if (ask) {
     op <- par(ask = TRUE)
     on.exit(par(op))
   }
   if (show[1]) {
     ylim <- range(r, na.rm = TRUE)
     if (id.n > 0)
       ylim <- ylim + c(-1, 1) * 0.08 * diff(ylim)
     plot(yh, r, xlab = l.fit, ylab = "Residuals", main = main,
          ylim = ylim, type = "n", ...)
     panel(yh, r, ...)
     if (one.fig)
       title(sub = sub.caption, ...)
     mtext(caption[1], 3, 0.25)
     if (id.n > 0) {
       y.id <- r[show.r]
       y.id[y.id < 0] <- y.id[y.id < 0] - strheight(" ")/3
       text.id(yh[show.r], y.id, show.r, adj.x = TRUE)
     }
     abline(h = 0, lty = 3, col = "gray")
   }
   if (show[2]) {
     ylim <- range(rs, na.rm = TRUE)
     ylim[2] <- ylim[2] + diff(ylim) * 0.075
     qq <- qqnorm(rs, main = main, ylab = ylab23, ylim = ylim,
                  ...)
     if (one.fig)
       title(sub = sub.caption, ...)
     mtext(caption[2], 3, 0.25)
     if (id.n > 0)
       text.id(qq$x[show.rs], qq$y[show.rs], show.rs, adj.x = TRUE)
   }
   if (show[3]) {
     sqrtabsr <- sqrt(abs(rs))
     ylim <- c(0, max(sqrtabsr, na.rm = TRUE))
     yl <- as.expression(substitute(sqrt(abs(YL)), list(YL = 
as.name(ylab23))))
     yhn0 <- if (is.null(w))
       yh
     else yh[w != 0]
     plot(yhn0, sqrtabsr, xlab = l.fit, ylab = yl, main = main,
          ylim = ylim, type = "n", ...)
     panel(yhn0, sqrtabsr, ...)
     if (one.fig)
       title(sub = sub.caption, ...)
     mtext(caption[3], 3, 0.25)
     if (id.n > 0)
       text.id(yhn0[show.rs], sqrtabsr[show.rs], show.rs,
               adj.x = TRUE)
   }
   if (show[4]) {
     if (id.n > 0) {
       show.r <- order(-cook)[iid]
       ymx <- cook[show.r[1]] * 1.075
     }
     else ymx <- max(cook)
     plot(cook, type = "h", ylim = c(0, ymx), main = main,
          xlab = "Obs. number", ylab = "Cook's distance", ...)
     if (one.fig)
       title(sub = sub.caption, ...)
     mtext(caption[4], 3, 0.25)
     if (id.n > 0)
       text.id(show.r, cook[show.r], show.r, adj.x=FALSE)
   }
   if (show[5]) {
     ylim <- range(rs, na.rm = TRUE)
     hatval <- hatvalues(x)
     if (id.n > 0) {
       ylim <- ylim + c(-1, 1) * 0.08 * diff(ylim)
       show.r <- order(-cook)[iid]
     }
     plot(hatval, rs, ylim = ylim, main = main,
          xlab = "Leverages", ylab = ylab23,
          type="n", ...)
     panel(hatval, rs, ...)
     if (one.fig)
       title(sub = sub.caption, ...)
     p <- length(coef(x))
     for(crit in cook.levels){
       curve(sqrt(crit*p*(1-x)/x), lty=2, add=T)
       curve(-sqrt(crit*p*(1-x)/x), lty=2, add=T)
     }
     xmax <- par()$usr[2]
     ymult <- sqrt(p*(1-xmax)/xmax)
     aty <- c(-sqrt(rev(cook.levels))*ymult, sqrt(cook.levels)*ymult)
     axis(4, at=aty, labels=paste(c(rev(cook.levels), cook.levels)),
          mgp=c(.25,.25,0), las=2, tck=0, cex.axis=cex.id)
     mtext(caption[5], 3, 0.25)
     if (id.n > 0) {
       y.id <- rs[show.r]
       y.id[y.id < 0] <- y.id[y.id < 0] - strheight(" ")/3
       text(hatval[show.r], y.id, paste(show.r), pos=2, cex=cex.id, 
offset=0.25)
   }
   }
   if (!one.fig && par("oma")[3] >= 1)
     mtext(sub.caption, outer = TRUE, cex = 1.25)
   invisible()
}

John Maindonald             email: john.maindonald at anu.edu.au
phone : +61 2 (6125)3473    fax  : +61 2(6125)5549
Centre for Bioinformation Science, Room 1194,
John Dedman Mathematical Sciences Building (Building 27)
Australian National University, Canberra ACT 0200.