[BioC] merge SMD data from different print batch

Wed Apr 5 16:05:14 CEST 2006

Dear Dr. Gollub,

Sorry for the delay in responding as I was debugging my functions. 

Thank you for your detailed explanation which made many things clearer. 
And thank you for developing SMD which is an earlier pioneers of open
repositories and set the standards for many later repositories.

The reason why I used LUID is that read.maimages( source="smd" )
function in LIMMA stores Luid information instead of SUID by default. 
I will email Dr. Symth to inform him of this.

Please ignore the codes in my previous email as the order of merge and
normalizeBetween arrays was incorrect. The codes below are correct and
slightly optimised for memory consumption.

-----------------------------------------------------------------------

normalize.SMD <- function(files, source=c("smd", "smd.old"), 
                 annotation=c("SUID", "Clone ID", "Gene Symbol",
                               "Cluster ID", "Accession")){

  ## 1. Uses LOESS normalization within arrays 
  ## 2. Merges different print batches into a coherent set
  ## 3. Uses scale normalization between arrays for the set as above

  require(limma)
  for(i in 1:length(files)){

    cat(i, "\t")
    fn   <- files[i]
    tmp1 <- read.maimages( files=fn, source=source,
                           annotation=annotation )
    tmp2 <- normalizeWithinArrays( tmp1, method="loess" )

    ## Average M and A with the same SUID
    suid <- as.character(tmp2$genes[ , "SUID"])
    Mval <- tapply( as.numeric(tmp2$M), suid, mean, na.rm=TRUE )
    Aval <- tapply( as.numeric(tmp2$A), suid, mean, na.rm=TRUE )
    stopifnot( identical( names(Mval), names(Aval) ) )

    df <- data.frame( names(Mval), Mval, Aval )
    colnames(df) <- c("SUID", paste(fn, "M", sep="_"), 
                              paste(fn, "A", sep="_"))

    if(i==1){

      expr <- df                 ## initialize the expression matrix
      ann  <- unique(tmp2$genes) ## initialize database of annotations

    } else {

      expr <- merge(expr, df, by="SUID", all=TRUE)
      ann  <- unique( rbind(ann, tmp2$genes) )
    }
    rm(fn, tmp1, tmp2, suid, Mval, Aval, df); gc()
  }

  expr <- expr[ order(as.character(expr$SUID)), ]
  rownames(expr) <- as.character(expr$SUID); expr$SUID <- NULL

  ann  <- ann [ order(as.character(ann$SUID)), ]
  rownames(ann)  <- as.character(ann$SUID);  ann$SUID <- NULL

  stopifnot( identical( rownames(expr), rownames(ann) ) )
  Mmat <- as.matrix( expr[ , grep( "_M$", colnames(expr) ) ] )
  Amat <- as.matrix( expr[ , grep( "_A$", colnames(expr) ) ] )
  obj  <- new( "MAList", list(M=Mmat, A=Amat, genes=ann) )

  out <- normalizeBetweenArrays( obj, method="scale" )
  return(out)
}

## sample usage to normalize

inputs <- list.files(pattern="xls$", full.names=TRUE)
output <- normalize.SMD( inputs, source="smd" ) 

## might also want remove probes with less than 75% presence
good   <- which( rowMeans( !is.na( output ) ) > 0.75 )
output <- output[ good, ]

-----------------------------------------------------------------------

Regards, Adai