[BioC] AnnBuilder script buildDPkg.R

John Zhang John Zhang <jzhang@jimmy.harvard.edu>
Wed, 22 Jan 2003 16:50:17 -0500 (EST)


--Drey_of_Squirrels_842_000
Content-Type: TEXT/plain; charset=us-ascii
Content-MD5: fRwXNAsOzYzHtK+veM9YRw==

I have fixed the bug in the script (AnnBuilder/inst/scripts/buildDPkg.R) and the 
revised version of the script is attached. Make sure pkgPath is a valid existing 
directory.

A bug in makeGOByNum.R was also fixed and the change will be reflected by 
tomorrow.
 
>From: Reinhold Koch <reinhold.koch@unibas.ch>
>To: bioconductor <bioconductor@stat.math.ethz.ch>
>MIME-version: 1.0
>Content-transfer-encoding: 7bit
>X-Virus-Scanned: by amavisd-milter (http://amavis.org/)
>X-Virus-Scanned: by amavisd-milter (http://amavis.org/)
>X-Spam-Status: No, hits=0.5 required=5.0 
tests=SPAM_PHRASE_00_01,TO_LOCALPART_EQ_REAL version=2.43
>X-Spam-Level: 
>Subject: [BioC] AnnBuilder script buildDPkg.R
>X-BeenThere: bioconductor@stat.math.ethz.ch
>X-Mailman-Version: 2.0.13
>List-Help: <mailto:bioconductor-request@stat.math.ethz.ch?subject=help>
>List-Post: <mailto:bioconductor@stat.math.ethz.ch>
>List-Subscribe: <http://www.stat.math.ethz.ch/mailman/listinfo/bioconductor>, 
<mailto:bioconductor-request@stat.math.ethz.ch?subject=subscribe>
>List-Id: The Bioconductor Project Mailing List <bioconductor.stat.math.ethz.ch>
>List-Unsubscribe: <http://www.stat.math.ethz.ch/mailman/listinfo/bioconductor>, 
<mailto:bioconductor-request@stat.math.ethz.ch?subject=unsubscribe>
>List-Archive: <http://www.stat.math.ethz.ch/pipermail/bioconductor/>
>Date: Tue, 21 Jan 2003 17:08:52 +0100
>
>Hello,
>
>I tried to update the annotations for Affymetrix' rgu34a chip using the
>buildDPkg.R. After some modifications I get as far as the flat file
>"rgu34aLL", but then fileToXML bails out with 
>
>"Error in lapply(fileRead[, i], doMultValue, multSep, i) :
>        subscript out of bounds
>Execution halted"
>
>Has anybody a version around that performs better?
>
>Cheers
>Reinhold Koch
>
>_______________________________________________
>Bioconductor mailing list
>Bioconductor@stat.math.ethz.ch
>http://www.stat.math.ethz.ch/mailman/listinfo/bioconductor

--Drey_of_Squirrels_842_000
Content-Type: TEXT/plain; name="buildDPkg.R"; charset=us-ascii; x-unix-mode=0640
Content-Description: buildDPkg.R
Content-MD5: DF87skt6NzDOsWm6gjzsPA==

# A script to build annotation data packages.
# pkgName - a character string for the name of the data package to be
#           built (e. g. hgu95a, rgu34a)
# organism - a character string for the name of the organism of
#            concern (now can only be "human", "mouse", or "rat")
# pkgPath - a character string for the full path of an existing
#           directory where the built backage will be stored
# version - a character string for the version number
#
# Write to jzhang@jimmy.harvard.edu for questions about this script.

buildDPkg <- function(pkgName, organism = "human",
pkgPath = "temp", version = "1.1.0"){

library(AnnBuilder)
path <- .path.package("AnnBuilder")

# Download data files from sources
llSource <- fileFetcher(getSrcUrl(src = "ll", organism = organism))
ugSource <- fileFetcher(getSrcUrl(src = "ug", organism = organism))
if(tolower(organism) == "human" || tolower(organism) == "mouse"){
    linkSource <- fileFetcher( paste(getSrcUrl(src = "UCSC", organism
                                               = organism),
                     "refLink.txt.gz", sep = ""))
    geneSource <- fileFetcher(paste(getSrcUrl(src = "UCSC", organism =
                                              organism),
                     "refGene.txt.gz", sep = ""))
}
goSource <- fileFetcher(getSrcUrl(src = "go", organism = organism))

# Get the unified mapping between a given id to LocusLink id
llParser <- file.path(path, "data", "llAffyLLParser")
ugParser <- file.path(path, "data", "ugAffyLLParser")
umichMapping <- file.path(path, "data", paste(pkgName, "_UMich", sep =
                                              ""))
dchipMapping <- file.path(path, "data", paste(pkgName, "_Cheng", sep =                                         ""))
toMap <- list(c(llSource, llParser), c(ugSource, ugParser),
              c(umichMapping, NA), c(dchipMapping, NA))
baseF <- file.path(path, "data", paste(pkgName, "id", sep = ""))
unified <- file.path(path, "temp", paste(pkgName, "unified", sep = ""))
unifier(baseFile = baseF, sources = toMap, trusted = 0, sep = "\t",
                    header = FALSE, baseCols = c("id", "acc"), byID = "id",
                    asFile = TRUE, outName = unified, verbose = TRUE)
# Parse the LocusLink data file using the unified mapping and a base file
ll <- file.path(path, "temp", paste(pkgName, "LL", sep = ""))
llParser <- file.path(path, "data", "llParser")
fileMuncher(outName = ll, baseFile = unified, dataFile =
           llSource, parser =  llParser, isDir = FALSE)
# Parse chromosomal location and orientation data
if(tolower(organism) == "human" || tolower(organism) == "mouse"){
    testLink <- file.path(path, "temp","buildLink")
    linkParser <- file.path(path, "data", "refLinkParser")
    fileMuncher(outName = testLink, baseFile = unified, dataFile =
           linkSource, parser = linkParser, isDir = FALSE)
    testGene <- file.path(path, "temp", "buildGene")
    geneParser <- file.path(path, "data", "refGeneParser")
    fileMuncher(outName = testGene, baseFile = testLink, dataFile =
           geneSource, parser = geneParser, isDir = FALSE)
    ## Some gene location data may end up with no chromosome number if
    ## only chromosome number data from LocusLink are used. Get the
    ## from the refGene file also.
    geneChr <-  file.path(path, "temp", "geneChr")
    geneParser <- file.path(path, "data", "refGeneChrParser")
    fileMuncher(outName = geneChr, baseFile = testLink, dataFile =
           geneSource, parser = geneParser, isDir = FALSE)
    # merge the two parsed files
    f1Col <- c("affy", "acc", "locusid", "unigeneid", "name", "symbol",
           "chrom", "cyto", "pmid", "grif","sumfun", "go")
    f2Col <- c("affy", "chrlocation", "chrorientation")
    testmerged <- file.path(path, "temp", paste(pkgName, "Merged", sep
                                                = ""))
    mergeFiles(file1 = ll, file2 = testGene, file1Col = f1Col,
           file2Col = f2Col, idCol = "affy", outName = testmerged,
           sep = "\t", header = FALSE, isFile = TRUE)
}else{
    testmerged <- ll
}

pathData <- file.path(path, "temp", "humanPath")
getPathway(file.path(path, "temp", "humanPath"), organism = organism)
mappedPath <- file.path(path, "temp", "buildPath")
mapPathway(mappedPath, geneFile = unified, pathFile = pathData,
geneColNames = c("affy", "acc", "ll", "counts"), pathColNames = c("ll", "path","enzyme"), mapId = "ll", colToKeep = c("affy", "path", "enzyme"),
geneSep = "\t", pathSep = "\t")
# Merge pathway data with previously parsed file
if(tolower(organism) == "human" || tolower(organism) == "mouse"){
    f1Col <- c("affy", "acc", "locusid", "unigeneid", "name",
               "symbol","chrom", "cyto", "pmid", "grif","sumfun", "go",
               "chrolocation","chroorientation")
}else{
    f1Col <- c("affy", "acc", "locusid", "unigeneid", "name", "symbol",
           "chrom", "cyto", "pmid", "grif","sumfun", "go")
}
f2Col <- c("affy", "path", "enzyme")
mergeFiles(file1 = testmerged, file2 =  mappedPath, file1Col = f1Col,
           file2Col = f2Col, idCol = "affy", outName = testmerged,
           sep = "\t", header = FALSE, isFile = TRUE)
# Produce the XM file (testXML1) containing gene information
testXML1 <- file.path(path, "temp", paste(pkgName, ".xml", sep = ""))
if(tolower(organism) == "human" || tolower(organism) == "mouse")
    colNames <- c("AFFY", "ACCNUM", "LOCUSID", "UNIGENE", "GENENAME",
                  "SYMBOL","CHR", "MAP", "PMID", "GRIF", "SUMFUNC",
                  "GO", "CHRLOC", "CHRORI", "PATH", "ENZYME")
else
    colNames <- c("AFFY", "ACCNUM", "LOCUSID", "UNIGENE", "GENENAME",
                  "SYMBOL","CHR", "MAP", "PMID", "GRIF", "SUMFUNC",
                  "GO", "PATH", "ENZYME")
if(tolower(organism) == "human" || tolower(organism) == "mouse"){
    multC <- c("PMID", "GO", "CHRLOC", "CHRORI", "PATH", "ENZYME",
                                                           "CHR","MAP")
}else{
    multC <- c("PMID", "GO", "PATH", "ENZYME", "CHR","MAP")
}
typeC <- c("GENENAME", "SYMBOL")
multS <- ";"
typeS <- ";"
fileToXML(targetName = pkgName, outName = testXML1, inName =testmerged,
       idColName = "AFFY", multColNames = multC, typeColNames = typeC,
       colNames  = colNames)
xml2DataPkg(fileName = testXML1, pkgName = pkgName, path = pkgPath,
            organism = organism)
# Get the reverse mapping between PubMed, pathway, enzyme and Affy ids
# pmid <- file2Env(testmerged, colNames, keyColName = "PMID",
# valueColName = "AFFY")
pathway <- file2Env(testmerged, colNames, keyColName =
                 "PATH", valueColName = "AFFY")
enzyme <- file2Env(testmerged, colNames, keyColName =
                   "ENZYME", valueColName = "AFFY")
# assign(paste(pkgName, "PMID2AFFY", sep = ""), pmid, parent.frame())
assign(paste(pkgName, "PATH2AFFY", sep = ""), pathway, parent.frame())
assign(paste(pkgName, "ENZYME2AFFY", sep = ""), enzyme, parent.frame())
# makeRdaFile("PMID2AFFY", pkgName, pkgPath, envir = parent.frame())
makeRdaFile("PATH2AFFY", pkgName, pkgPath, envir = parent.frame())
makeRdaFile("ENZYME2AFFY", pkgName, pkgPath, envir = parent.frame())
# Process the GO data file
GOXMLParser(dbName = "annbuilder", tName = "abtestgoorig", fileName =
            goSource)
goNAffy <- file.path(path, "temp", "testgoNaffy")
goParser <- file.path(path, "data", "affyGOParser")
fileMuncher(outName = goNAffy, baseFile=unified, dataFile =
llSource, parser = goParser, isDir = FALSE)
go2GeneMapper(dbName = "annbuilder", tName = "abtestgo", fileName = goNAffy,
              goTName = "abtestgoorig", exclude = "GO:0003673")
testXMLByNum <- file.path(path, "temp", paste(pkgName, "ByNum.xml",
                                              sep = ""))
makeGOByNum(dbName = "annbuilder", tName = "abtestgo", outName =
            testXMLByNum,tColNames = c("go",
                         "geneid","genes","total"), fNames =
            c("GO","GO2AFFY", "GO2ALLAFFY", "AFFYCOUNTS"),multCol =
            c("GO2AFFY","GO2ALLAFFY"))
xml2DataPkg(fileName = testXMLByNum, pkgName = pkgName, path =
            pkgPath, organism = organism, rdaOnly = TRUE)

--Drey_of_Squirrels_842_000--