[BioC] Low number of replicates DESeq

Federico Gaiti f.gaiti at uq.edu.au
Tue Feb 25 22:26:49 CET 2014


Hi MiKe,

thanks for useful suggestion.

As you and Steve suggested, I followed the multifactorial design section in both DESeq and DESeq2 vignette and then checked the quality by sample clustering and visualization.

In DESeq 1.14:

CountTable=read.table("DGE.txt",header=TRUE,row.names=1)
head(CountTable)
Design=data.frame(row.names=colnames(CountTable),condition=c("ADULT","ADULT","ADULT","ADULT","JUV","JUV","JUV","JUV","COMP","COMP","COMP","COMP","PRECOMP","PRECOMP","PRECOMP","PRECOMP"),libType=c("stranded","unstranded","unstranded","unstranded","stranded","unstranded","unstranded","unstranded","stranded","unstranded","unstranded","unstranded","stranded","unstranded","unstranded","unstranded"))
cdsFull=newCountDataSet(CountTable,Design)
head(cdsFull)
cdsFull=estimateSizeFactors(cdsFull)
sizeFactors(cdsFull)
cdsFull=estimateDispersions(cdsFull)
plotDispEsts(cdsFull)
fit1=fitNbinomGLMs(cdsFull,count ~ libType + condition)
str(fit1)
fit0=fitNbinomGLMs(cdsFull,count ~ libType )
pvalsGLM = nbinomGLMTest( fit1, fit0 )
padjGLM = p.adjust( pvalsGLM, method="BH" )
head(fit1)
cdsFullB=newCountDataSet(CountTable,Design$condition)
cdsFullB=estimateSizeFactors(cdsFullB)
cdsFullB=estimateDispersions(cdsFullB)
rs = rowSums ( counts ( cdsFull ))
theta = 0.4
use = (rs > quantile(rs, probs=theta))
table(use)
cdsFilt = cdsFull[ use, ]
fitFilt1  = fitNbinomGLMs( cdsFilt, count ~ libType + condition )
fitFilt0  = fitNbinomGLMs( cdsFilt, count ~ libType  )
pvalsFilt = nbinomGLMTest( fitFilt1, fitFilt0 )
padjFilt  = p.adjust(pvalsFilt, method="BH" )
padjFiltForComparison = rep(+Inf, length(padjGLM))
padjFiltForComparison[use] = padjFilt
tab3 = table( `no filtering`   = padjGLM < .05,`with filtering` = padjFiltForComparison < .05 )
addmargins(tab3)
res=cbind(fit1,pvalsGLM=pvalsGLM,padjGLM=padjGLM)
resSig=res[which(res$padjGLM<0.05),]
table(res$padjGLM<0.05)
write.csv(res,file='res_Multifactor_DESeq.csv')
hist(res$pvalsGLM,breaks=100)
resFilt=cbind(fitFilt1,pvalsFilt=pvalsFilt,padjFilt=padjFilt)
resSigFilt=resFilt[which(resFilt$padjFilt<0.05),]
table(resFilt$padjFilt<0.05)
write.csv(resFilt,file='resFilt_Multifactor_DESeq.csv')
plot(rank(rs)/length(rs), -log10(pvalsGLM), pch=16, cex=0.45)
h1 = hist(pvalsGLM[!use], breaks=50, plot=FALSE)
h2 = hist(pvalsGLM[use], breaks=50, plot=FALSE)
colori = c(`do not pass`="khaki", `pass`="powderblue")
barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori,space = 0, main = "", ylab="frequency")
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))
cdsFullBlind = estimateDispersions( cdsFull, method = "blind" )
vsdFull = varianceStabilizingTransformation( cdsFullBlind )
select = order(rowMeans(counts(cdsFull)), decreasing=TRUE)[1:100]
hmcol = colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(exprs(vsdFull)[select,], col = hmcol, trace="none", margin=c(10, 6))
heatmap.2(counts(cdsFull)[select,], col = hmcol, trace="none", margin=c(10,6))
dists = dist( t( exprs(vsdFull) ) )
mat = as.matrix( dists )
rownames(mat) = colnames(mat) = with(pData(cdsFullBlind), paste(condition, libType, sep=" : "))
heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13))
print(plotPCA(vsdFull, intgroup=c("condition", "libType")))

In DESeq2 1.3.41:

CountTable=read.table("DGE.txt",header=TRUE,row.names=1)
head(CountTable)
Design=data.frame(row.names=colnames(CountTable),condition=c("ADULT","ADULT","ADULT","ADULT","JUV","JUV","JUV","JUV","COMP","COMP","COMP","COMP","PRECOMP","PRECOMP","PRECOMP","PRECOMP"),libType=c("stranded","unstranded","unstranded","unstranded","stranded","unstranded","unstranded","unstranded","stranded","unstranded","unstranded","unstranded","stranded","unstranded","unstranded","unstranded"))
dds<-DESeqDataSetFromMatrix(countData=CountTable,colData=Design,design=~condition + libType)
colData(dds)$condition<-factor(colData(dds)$condition,levels=c("PRECOMP","COMP","JUV","ADULT"))
design(dds)<-formula(~libType + condition )
dds<-DESeq(dds)
rld <- rlogTransformation(dds, blind=TRUE)
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vstMat <- assay(vsd)
rlogMat <- assay(rld)
px     <- counts(dds)[,1] / sizeFactors(dds)[1]
ord    <- order(px)
ord    <- ord[px[ord] < 150]
ord    <- ord[seq(1, length(ord), length=50)]
last   <- ord[length(ord)]
vstcol <- c("blue", "black")
matplot(px[ord], cbind(assay(vsd)[, 1], log2(px))[ord, ], type="l", lty=1, col=vstcol, xlab="n", ylab="f(n)")
legend("bottomright", legend = c(expression("variance stabilizing transformation"), expression(log[2](n/s[1]))),fill=vstcol)
library("vsn")
par(mfrow=c(1,3))
notAllZero <- (rowSums(counts(dds))>0)
meanSdPlot(log2(counts(dds,normalized=TRUE)[notAllZero,] + 1), ylim = c(0,2.5))
meanSdPlot(assay(rld[notAllZero,]), ylim = c(0,2.5))
meanSdPlot(assay(vsd[notAllZero,]), ylim = c(0,2.5))
select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:100]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(counts(dds,normalized=TRUE)[select,], col = hmcol, Rowv = FALSE, Colv = FALSE, scale="none", dendrogram="none", trace="none", margin=c(10,6))
heatmap.2(assay(rld)[select,], col = hmcol, Rowv = FALSE, Colv = FALSE, scale="none", dendrogram="none", trace="none", margin=c(10, 6))
heatmap.2(assay(vsd)[select,], col = hmcol, Rowv = FALSE, Colv = FALSE, scale="none", dendrogram="none", trace="none", margin=c(10, 6))
distsRL <- dist(t(assay(rld)))
select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(counts(dds,normalized=TRUE)[select,], col = hmcol, Rowv = FALSE, Colv = FALSE, scale="none", dendrogram="none", trace="none", margin=c(10,6))
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
rownames(mat) <- colnames(mat) <- with(colData(dds),
paste(condition))
rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition, libType, sep=" : "))
heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13))
print(plotPCA(rld, intgroup=c("condition", "libType")))

See attached the plots I obtained (*_MULTIFACTORIAL.jpeg)

Let me know what you think

I wouldn't bother using the stranded data if I was doing a vanilla DGE but I am investigating long non-coding RNAs and so I need to have anti-sense transcription quantification. This is reason that brought me to include the unstranded data in the analysis, to increase the number of replicates and test for DE to obtain a set of lncRNAs I can handle for downstream analysis.
I really need to have replicates!

Let me know if you have any further idea

Thanks a lot for help

Federico


6 February 2014 1:03 AM
To: Federico Gaiti
Cc: Steve Lianoglou; bioconductor at r-project.org
Subject: Re: [BioC] Low number of replicates DESeq

hi Federico,

I would second Steve's recommendation to use DESeq2 and to take a look at the vignette section on sample clustering and visualization. For instance, it appears you are looking at correlations in the raw count scale, while we recommend some kind of transformation into the log scale (we offer two such transformations in DESeq2 which are described in the vignette). Even log(x + 1) would be better than the raw count scale, where inter-sample distances will be dominated by a few large counts.

As far as DGE analysis, you can add covariates to the design formula -- as discussed in the vignette -- which would account for differences due to the change in protocol. This would look like :

design(dds) <- ~ protocol + condition

This setup will look for consistent log fold changes due to condition, while accounting for the stranded/unstranded differences.

However, I would not recommend using the dispersions estimated from the unstranded protocol to perform inference on the single replicate stranded dataset, because it might be the case that the stranded protocol results in more highly dispersed counts. Then large differences would appear to be unlikely due to chance, when they really are due to chance, i.e. generating false positives. You really need replicates to perform statistical inference on the stranded experiment.

Mike



On Mon, Feb 24, 2014 at 8:18 PM, Federico Gaiti <f.gaiti at uq.edu.au<mailto:f.gaiti at uq.edu.au>> wrote:
Thanks Steve -- I'll have a look at the DESeq2 vignette.

Here is what I've done, let me know if it's still unclear

samtools view TOPHAT_STRANDED_sorted.bam | python -m HTSeq.scripts.count -s no - gtfFile > stranded_counts_noS.txt

head(Counts)

               1.1   1.2    1.3    1.4    2.1   2.2    2.3    2.4    3.1    3.2   3.3    3.4    4.1    4.2   4.3   4.4

XXXX    9       0       24      48      30      5       1       1       21      15      8       6       28      28      27      47

XXXX    0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0

XXXX    16      0       0       0       19      4       0       0       40      1       2       3       78      5       5       7

XXXX    0       8       0       7       5       5       19      7       14      4       4       7       9       4       12      1
......

and then in R:

data<-read.table("Counts",header=TRUE,row.names=1)
head(data)
Design=data.frame(
   row.names=colnames(data),
   condition=c('1','1','1','1','2','2','2','2','3','3','3','3','4','4','4','4'))
cds=newCountDataSet(data,Design)
cds=estimateSizeFactors(cds)
sizeFactors(cds)
All_samples_normalized<-counts(cds,normalized=TRUE)
table(rowSums(All_samples_normalized)==0)
data<-All_samples_normalized[rowSums(All_samples_normalized)>0,]
data_subset_matrix<-as.matrix(data)
Spearman_normalized<-rcorr(data_subset_matrix,type="spearman")
A<-Spearman_normalized$r
write.table(A,file="Spearman_normalized_allsamples.txt")
Pearson_normalized<-rcorr(data_subset_matrix,type="pearson")
B<-Pearson_normalized$r
write.table(B,file="Pearson_normalized_allsamples.txt")
A_matrix<-as.matrix(A)
B_matrix<-as.matrix(B)
corrgram(B_matrix,order="PCA")
corrgram(A_matrix,order="PCA")
pca3=PCA(A_matrix,graph=TRUE)
pca3=PCA(B_matrix,graph=TRUE)

This gave me a nice correlation but I'd still like to to use the stranded counts for the DGE.
The issue is that if I only use stranded data I don't have replicates. If I include the 3 unstranded replicates I need to use the option -s no for the stranded data because otherwise stranded and unstranded do not correlate.

So my ideas was to use the unstranded data to estimate the level of variation to get a threshold for DE detection but still use the stranded data as expression values.

Would it be possible? Or would it be better to stick to the -s no option for the DGE?

Thanks
Federico




________________________________________
From: mailinglist.honeypot at gmail.com<mailto:mailinglist.honeypot at gmail.com> [mailinglist.honeypot at gmail.com<mailto:mailinglist.honeypot at gmail.com>] on behalf of Steve Lianoglou [lianoglou.steve at gene.com<mailto:lianoglou.steve at gene.com>]
Sent: Tuesday, 25 February 2014 10:34 AM
To: Federico Gaiti
Cc: bioconductor at r-project.org<mailto:bioconductor at r-project.org>
Subject: Re: [BioC] Low number of replicates DESeq

Hi,

Since you are just starting your analysis and are in the world of
DESeq, you should probably switch to DESeq2.

You mention things about "low correlation" but it's not clear what
conditions you are comparing where. Instead of describing your
analysis at a high level, showing the code that you used would be more
helpful.

That having been said, the first thing I would do is to perform the
steps outlined in the DESeq2 vignette under the section "Data quality
assessment by sample clustering and visualization" to see if your
replicate data cluster closely together in meaningful ways using the
heatmaps and PCA plots outlined there.

HTH,
-steve

On Mon, Feb 24, 2014 at 3:31 PM, Federico Gaiti <f.gaiti at uq.edu.au<mailto:f.gaiti at uq.edu.au>> wrote:
> Hi all,
>
> I am using DESEq for a DGE analysis.
>
> I have STRANDED RNA-Seq data for 4 developmental stages with no replicates but I know that to have a more reliable DGE I should have replicates. So I got (from another lab member) UNSTRANDED RNA-Seq data with 3 replicates per stage.
>
> So my data situation at the moment is:
>
> STAGE 1     stranded
> STAGE 1.1  unstranded
> STAGE 1.2  unstranded
> STAGE 1.3  unstranded
> STAGE 2     stranded
> STAGE 2.1  unstranded
> STAGE 2.2  unstranded
> STAGE 2.3  unstranded
> STAGE 3     stranded
> STAGE 3.1  unstranded
> STAGE 3.2  unstranded
> STAGE 3.3  unstranded
> STAGE 4     stranded
> STAGE 4.1  unstranded
> STAGE 4.2  unstranded
> STAGE 4.3  unstranded
>
> Before doing a DGE, I thought to test the correlation between these samples, just to show that similar samples "cluster" together. If so, I thought to use the unstranded data for my DGE analysis to reach the final number of 4 replicates per stage.
>
> I mapped the raw reads to the genome using TOPHAT (v2.0.9) (fr-unstranded for unstranded data and fr-secondstrand for stranded data), used htseq-count (HTSeq 0.5.4p5) to get the raw reads counts for both the data. For the stranded data I used the option -s yes and for the unstranded data I used -s no. I then used DESeq (v1.14.0) to include metadata and for normalization, and I removed the genes that always have a 0 value. I then calcualted the correlation which was really low.
>
> I tried to use the option -s reverse for the stranded data and still got really low correlation. So I reran htseq-count on the stranded data selecting the option -s no and in this way I got a very similar number of total counts between the unstranded and stranded data, around 4-5M counts each stage (while both cases before the stranded ones were double in number).
>
> I included the metadata
>
>
>> Design
>             condition
> ADULT        ADULT
> ADULT1       ADULT
> ADULT2       ADULT
> ADULT3       ADULT
> JUV            JUV
> JUV1           JUV
> JUV2           JUV
> JUV3           JUV
> COMP          COMP
> COMP1         COMP
> COMP2         COMP
> COMP3         COMP
> PRECOMP    PRECOMP
> PRECOMP1   PRECOMP
> PRECOMP2   PRECOMP
> PRECOMP3   PRECOMP
>
> and estimated the new size factors, normalized and calculated the new correlation. Pearson performed pretty well, confirmed by both a PCA and correlogram. So my initial idea was to do a DGE "treating" the stranded data as unstranded, having 4 replicates per stage. Though, I'd still like to figure out a way to use the stranded counts since I am not sure if I am losing some information running htseq-count using -s no on the stranded data.
>
>
> What I had in mind was using unstranded data to estimate the level of variation to get a threshold for DE detection but still use the stranded data as expression values. Not sure if I can do that though given one is stranded and the other is not.
>
>
> I would like to hear from you if you have any thoughts about this.
>
>
> Let me know if you need any further details to better understand the issue.
>
>
> Thanks in advance,
>
> Federico
>
> Federico Gaiti
> Ph.D. Candidate
> School of Biological Sciences
> University of Queensland
> St Lucia QLD 4072
> Australia
> f.gaiti at uq.edu.au<mailto:f.gaiti at uq.edu.au>
>
>
>         [[alternative HTML version deleted]]
>
>
> _______________________________________________
> Bioconductor mailing list
> Bioconductor at r-project.org<mailto:Bioconductor at r-project.org>
> https://stat.ethz.ch/mailman/listinfo/bioconductor
> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor



--
Steve Lianoglou
Computational Biologist
Genentech

_______________________________________________
Bioconductor mailing list
Bioconductor at r-project.org<mailto:Bioconductor at r-project.org>
https://stat.ethz.ch/mailman/listinfo/bioconductor
Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor



More information about the Bioconductor mailing list