[BioC] EasyRNAseq - error in building count table - Error in unlist

Felix Frey [guest] guest at bioconductor.org
Thu Sep 12 16:36:08 CEST 2013


Hello,
I have sequencing data of maize which I aligned to the reference sequence (The maize sequence ZmB73_RefGen_v2.tar.gz, downloaded from:
# http://ftp.maizesequence.org/current/assembly/) using tophat (v2.0.3), bowtie2 (2.0.0.6) and samtools (0.1.18.0).
I want to make a count table with easyRNAseq (1.4.2) to use for EdgeR.
Some months ago, I did following analysis with easyRNASeq:

 chr.map <- data.frame(  from=c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10"),
  to=c("1","2","3","4","5","6","7","8","9","10"))
 ensembl <- useMart("ENSEMBL_MART_PLANT",dataset="zmays_eg_gene")
 exon.annotation <- getBM(c("ensembl_gene_id",
                            "ensembl_transcript_id",
                            "chromosome_name",
                            "ensembl_gene_id",
                            "ensembl_exon_id",
                            "exon_chrom_start",
                            "exon_chrom_end"),
                            mart=ensembl,
                            filters="chromosome_name",
                            values=chrm)
 exon.annotation$chromosome <- paste("chr",exon.annotation$chromosome_name,sep="")
 exon.range <- RangedData(IRanges(
    start=exon.annotation$exon_chrom_start,
    end=exon.annotation$exon_chrom_end),
     space=exon.annotation$chromosome,
     strand=exon.annotation$strand,
     transcript=exon.annotation$ensembl_transcript_id,
     gene=exon.annotation$ensembl_gene_id,
     exon=exon.annotation$ensembl_exon_id,
     universe = "NULL")

 files <- c(	"accepted_hits_12a_sorted.bam",
		"accepted_hits_13a_sorted.bam",...)

 rnaSeq <- easyRNASeq(filesDirectory="/EASYRNASEQ/countdata",
                          gapped=F,
                          validity.check=TRUE,
                          chr.map=chr.map,
                          organism="custom",
                          annotationMethod="env",
                          annotationObject=exon.range,
                          count="genes",
			  filenames=files,
			  summarization="geneModels",
			  outputFormat="RNAseq")

I got an output gene count table.
Now, I wanted to repeat the analysis, but leaving out the non-protein-coding genes, which I wanted to achieve by introducing the "biotype" and filtering by "protein-coding" in the exon.annotation object.
To check the analysis i performed it again like some months before and I got an error message, which I do not understand completely:


OUTPUT:

> head( exon.annotation)
  ensembl_gene_id strand ensembl_transcript_id chromosome_name
1   GRMZM2G439951      1     GRMZM2G439951_T01               5
2   GRMZM2G094632      1     GRMZM2G094632_T02               5
3   GRMZM2G094632      1     GRMZM2G094632_T01               5
4   GRMZM2G094632      1     GRMZM2G094632_T01               5
5   GRMZM2G095090      1     GRMZM2G095090_T01               5
6   GRMZM2G095094      1     GRMZM2G095094_T01               5
  ensembl_gene_id.1   ensembl_exon_id exon_chrom_start exon_chrom_end
1     GRMZM2G439951 GRMZM2G439951_E01         39542856       39544325
2     GRMZM2G094632 GRMZM2G094632_E01         39435878       39436448
3     GRMZM2G094632 GRMZM2G094632_E03         39435878       39436317
4     GRMZM2G094632 GRMZM2G094632_E02         39436730       39437134
5     GRMZM2G095090 GRMZM2G095090_E01         39427195       39427706
6     GRMZM2G095094 GRMZM2G095094_E01         39424253       39427105
    gene_biotype chromosome
1 protein_coding       chr5
2 protein_coding       chr5
3 protein_coding       chr5
4 protein_coding       chr5
5 protein_coding       chr5
6 protein_coding       chr5

> head( exon.range)
RangedData with 6 rows and 5 value columns across 10 spaces
     space                 ranges |    strand        transcript
  <factor>              <IRanges> | <integer>       <character>
1     chr1 [301409811, 301409969] |        -1 AC185612.3_FGT006
2     chr1 [301362751, 301363304] |        -1 GRMZM5G884466_T03
3     chr1 [301353831, 301354022] |        -1 GRMZM5G884466_T03
4     chr1 [301353664, 301353745] |        -1 GRMZM5G884466_T03
5     chr1 [301353366, 301353557] |        -1 GRMZM5G884466_T03
6     chr1 [301353147, 301353260] |        -1 GRMZM5G884466_T03
              gene                   exon        biotype
       <character>            <character>    <character>
1 AC185612.3_FG006 AC185612.3_FG006.exon1 protein_coding
2    GRMZM5G884466      GRMZM5G884466_E12 protein_coding
3    GRMZM5G884466      GRMZM5G884466_E03 protein_coding
4    GRMZM5G884466      GRMZM5G884466_E04 protein_coding
5    GRMZM5G884466      GRMZM5G884466_E08 protein_coding
6    GRMZM5G884466      GRMZM5G884466_E10 protein_coding
>

>  rnaSeq_new <- easyRNASeq(filesDirectory="/projects/irg/grp_stich/personal_folders/Felix/TranscriptomeProfiling/RNASeq/RNASeq/tuxedo/tophat/EASYRNASEQ/countdata",
+                           gapped=F,
+                           validity.check=TRUE,
+                           chr.map=chr.map,
+                           organism="custom",
+                           annotationMethod="env",
+                           annotationObject=exon.range,
+                           count="genes",
+                           filenames=files,
+                           summarization="geneModels",
+                           outputFormat="RNAseq")
Checking arguments...
Fetching annotations...
Computing gene models...
Summarizing counts...
Processing accepted_hits_12a_sorted.bam
Updating the read length information.
The reads are of 95 bp.
Warning message:
In easyRNASeq(filesDirectory = "/projects/irg/grp_stich/personal_folders/Felix/TranscriptomeProfiling/RNASeq/RNASeq/tuxedo/tophat/EASYRNASEQ/countdata",  :
  There are 8770 synthetic exons as determined from your annotation that overlap! This implies that some reads will be counted more than once! Is that really what you want?
Error in unlist(aggregate(readCoverage(obj)[names(geneModel(obj))[gm.sel]],  :
  error in evaluating the argument 'x' in selecting a method for function 'unlist': Error in .Call2("Rle_getStartEndRunAndOffset", x, start, end, PACKAGE = "IRanges") :
  'x' values larger than vector length 'sum(width)'
>





Could you help me with this problem?

Best,
Felix

--
Felix Frey

Max Planck Institut für Pflanzenzüchtungsforschung
Carl-von-Linné-Weg 10
D-50829 Köln
+49 (0) 221-5062 405

 -- output of sessionInfo(): 

> sessionInfo()
R version 3.0.1 (2013-05-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
[1] C

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods
[8] base

other attached packages:
 [1] easyRNASeq_1.6.0       ShortRead_1.18.0       latticeExtra_0.6-26
 [4] RColorBrewer_1.0-5     Rsamtools_1.12.4       DESeq_1.12.1
 [7] lattice_0.20-15        locfit_1.5-9.1         BSgenome_1.28.0
[10] GenomicRanges_1.12.5   Biostrings_2.28.0      IRanges_1.18.3
[13] edgeR_3.2.4            limma_3.16.7           biomaRt_2.16.0
[16] Biobase_2.20.1         genomeIntervals_1.16.0 BiocGenerics_0.6.0
[19] intervals_0.14.0

loaded via a namespace (and not attached):
 [1] AnnotationDbi_1.22.6 DBI_0.2-7            RCurl_1.95-4.1
 [4] RSQLite_0.11.4       XML_3.98-1.1         annotate_1.38.0
 [7] bitops_1.0-6         genefilter_1.42.0    geneplotter_1.38.0
[10] grid_3.0.1           hwriter_1.3          splines_3.0.1
[13] stats4_3.0.1         survival_2.37-4      tools_3.0.1
[16] xtable_1.7-1         zlibbioc_1.6.0
>


--
Sent via the guest posting facility at bioconductor.org.



More information about the Bioconductor mailing list