#### Filtering, smoothing and parameter estimation in a #### model for CG-islands in DNA #### States: 1,2,3,4 = A,C,G,T in a CG-island, 5,6,7,8 = A,C,G,T outside #Input of the transition matrix pr <- matrix(0,nrow=8,ncol=8) eps <- rep(0.05,4) eta <- rep(0.05,4) #transition probabilities starting in CG-islands pr[1,1:4] <- (1-eps[1])*c(0.180,0.274,0.426,0.120) pr[2,1:4] <- (1-eps[2])*c(0.170,0.368,0.274,0.188) pr[3,1:4] <- (1-eps[3])*c(0.161,0.339,0.375,0.125) pr[4,1:4] <- (1-eps[4])*c(0.079,0.355,0.384,0.182) pr[1:4,5:8] <- 0.25*eps #transition probabilities starting outside a CG-island pr[5:8,1:4] <- 0.25*eta pr[5,5:8] <- (1-eta[1])*c(0.300,0.205,0.285,0.210) pr[6,5:8] <- (1-eta[2])*c(0.322,0.298,0.078,0.302) pr[7,5:8] <- (1-eta[3])*c(0.248,0.246,0.298,0.208) pr[8,5:8] <- (1-eta[4])*c(0.177,0.239,0.292,0.292) mimage(pr,zlim=c(0,1)) #graphical representation of pr p0 <- rep(0.125,8) #initial distribution #----------------------------------------------------------- # Simulation Tt <- 2000 # Length of series # (should be >= 2000 for parameter estimation in this example) x <- f.simul(Tt=Tt+1,pr=pr,p0=p0)[-1] x1 <- (x-1)%/%4 #Indicator of CG-Islands y <- x-4*x1 #observations par(mfrow=c(2,1)) plot(1:500,y[1:500],ylim=c(0,5),type="l") points(1:500,5*x1[1:500],col=2) plot(501:1000,y[501:1000],ylim=c(0,5),type="l") points(501:1000,5*x1[501:1000],col=2) par(mfrow=c(1,1)) #Observation probabilities #b[k,t]=P(y_t|x_t=k) = 1 if y_t=k or y_t=k-4; = 0 otherwise b <- matrix(0,nrow=8,ncol=Tt) b[sort((0:(Tt-1))*8+c(y,y+4))] <- 1 #-------------------------------------------------------------- #Filtering erg <- f.filter(b=b,pr=pr,p0=rep(0.125,8)) p.cg <- apply(erg$fx[5:8,],2,sum) #Filtering probability of being in a CG-island sum(x1==(p.cg > 0.5)) #number of correct classifications mean((x1-p.cg)^2) #MSE error for filtering #Smoothing erg.sm <- f.smooth(b=b,fx=erg$fx,py=erg$py,pr=pr) p2.cg <- apply(erg.sm$margin[5:8,],2,sum) sum(x1==(p2.cg > 0.5)) #number of correct classifications mean((x1-p2.cg)^2) #MSE error for smoothing #Plotting of posterior probabilities for a CG-Island plot(1:Tt,x1) lines(1:Tt,p.cg,col=2) abline(h=0.5) points(1:Tt,(p.cg > 0.5)+0.02,col=3) lines(1:Tt,p2.cg,col=4) points(1:Tt,(p2.cg > 0.5)-0.02,col=5) #-------------------------------------------------------------- #Parameter estimation by EM: probabilities of regime changes are #assumed to be known #Initialisation n.obs <- table(y[-Tt],y[-1])/(Tt-1) #observed frequencies of pairs n.obs <- n.obs/apply(n.obs,1,sum) #empirical transition matrix m <- matrix(c(rep(1,4),rep(-1,8),rep(1,4)),nrow=4,ncol=4) pr1 <- n.obs - 0.1*m #modification to make transitions to C,G more likely pr2 <- n.obs + 0.1*m #modification to make transitions to C,G less likely #pr.est <- pr #starting with the true values (not good practice) pr.est <- matrix(0,nrow=8,ncol=8) pr.est[1:4,1:4] <- (1-eps)*pr1 #pr.est := current estimate of pr pr.est[1:4,5:8] <- eps*0.25 pr.est[5:8,1:4] <- eta*0.25 pr.est[5:8,5:8] <- (1-eta)*pr2 niter <- 100 pr1.save <- array(pr1,dim=c(4,4,niter)) pr2.save <- array(pr2,dim=c(4,4,niter)) loglik <- rep(0,niter-1) for (i.rep in (2:niter)) { #E-step erg.f <- f.filter(b,pr.est,p0) loglik[i.rep-1] <- sum(log(erg.f$py)) erg.s <- f.smooth(b,erg.f$fx,erg.f$py,pr.est) margin <- erg.s$margin pairs <- erg.s$pairs #M-step pr.est <- apply(pairs,c(1,2),mean) pr1 <- pr.est[1:4,1:4] pr2 <- pr.est[5:8,5:8] pr1 <- pr1/apply(pr1,1,sum) pr2 <- pr2/apply(pr2,1,sum) pr.est[1:4,1:4] <- (1-eps)*pr1 pr.est[1:4,5:8] <- eps*0.25 pr.est[5:8,1:4] <- eta*0.25 pr.est[5:8,5:8] <- (1-eta)*pr2 pr1.save[,,i.rep] <- pr1 pr2.save[,,i.rep] <- pr2 } plot(loglik,type="l") round(pr.est,3) round(pr,3) #graphical comparison of true and estimated transition probabilites par(mfrow=c(1,2)) mimage(pr,zlim=c(0,1)) mimage(pr.est,zlim=c(0,1)) #Filtering with estimated parameters erg <- f.filter(b=b,pr=pr.est,p0=rep(0.125,8)) p3.cg <- apply(erg$fx[5:8,],2,sum) #Filtering probability of being in a CG-island sum(x1==(p3.cg > 0.5)) #number of correct classifications mean((x1-p3.cg)^2) #MSE error for smoothing #Smoothing with estimated parameters erg.sm <- f.smooth(b=b,fx=erg$fx,py=erg$py,pr=pr.est) p4.cg <- apply(erg.sm$margin[5:8,],2,sum) sum(x1==(p4.cg > 0.5)) #number of correct classifications mean((x1-p4.cg)^2) #MSE error for smoothing #Comparing posterior probabilities for a CG-Island with true and #estimated parameters plot(1:Tt,p2.cg,type="l") lines(1:Tt,p4.cg,type="l",col=2) abline(h=0.5) #---------------------------------------------------------------- ##simulating from the smoothing distribution erg <- f.filter(b=b,pr=pr,p0=rep(0.125,8)) x.smooth <- f.backsimul(erg$fx,erg$px,y,pr) for (i in (1:100)) x.smooth <- cbind(x.smooth,f.backsimul(erg$fx,erg$px,y,pr)) p3.cg <- 1-apply(x.smooth==y,1,sum)/101 plot(1:Tt,p2.cg,type="l") lines(1:Tt,p3.cg,type="l",col=2) # Monte Carlo estimate almost correct #------------------------------------------------------------------ # MCMC algorithm to estimate parameters and states at the same time # probabilities of regime changes are assumed to be known #Initialisation pr.obs <- table(y[-Tt],y[-1])/(Tt-1) #observed frequencies of pairs pr.obs <- pr.obs/apply(pr.obs,1,sum) #empirical transition matrix m <- matrix(c(rep(1,4),rep(-1,8),rep(1,4)),nrow=4,ncol=4) #modification to make transitions to C,G less likely pr1 <- pr.obs - 0.1*m pr2 <- pr.obs + 0.1*m #pr.est <- pr #starting with the true values (not good practice) pr.est <- matrix(0,nrow=8,ncol=8) pr.est[1:4,1:4] <- (1-eps)*pr1 pr.est[1:4,5:8] <- eps*0.25 pr.est[5:8,1:4] <- eta*0.25 pr.est[5:8,5:8] <- (1-eta)*pr2 niter <- 1000 pr1.save <- array(pr1,dim=c(4,4,floor(0.1*niter))) pr2.save <- array(pr2,dim=c(4,4,floor(0.1*niter))) x.save <- matrix(0,nrow=Tt,ncol=floor(0.1*niter)) for (i in (1:niter)) { erg <- f.filter(b,pr.est,p0) x.act <- f.backsimul(erg$fx,erg$px,y,pr.est) #updating the states n.obs <- table(x.act[-Tt],x.act[-1]) #observed frequencies of pairs for (k in (1:4)) { z <- rgamma(4,n.obs[k,1:4]+1) pr1[k,] <- z/sum(z) z <- rgamma(4,n.obs[k+4,5:8]+1) pr2[k,] <- z/sum(z) } pr.est[1:4,1:4] <- (1-eps)*pr1 pr.est[1:4,5:8] <- eps*0.25 pr.est[5:8,1:4] <- eta*0.25 pr.est[5:8,5:8] <- (1-eta)*pr2 if ((i %% 10)==0) { pr1.save[,,i %/% 10] <- pr1 pr2.save[,,i %/% 10] <- pr2 x.save[,i %/% 10] <- x.act } } #posterior probability for being in a CG-island #(unknown parameters are integrated out) p4.cg <- 1-apply(x.save==y,1,mean) plot(1:Tt,p2.cg,type="l") lines(1:Tt,p4.cg,type="l",col=2) abline(h=0.5) sum(x1==(p4.cg > 0.5)) mean((x1-p4.cg)^2) #MSE error for smoothing ## Functions used ################################# mimage <- function(mat, ncol = 40, zlim=range(mat)) { ## Purpose: Color representation of a matrix ## ---------------------------------------------------------------------- ## Arguments: ## ---------------------------------------------------------------------- ## Author: Hans-Rudolf Kuensch, Date: 3 Dec 2007, 16:47 ## Interpolating a 'sequential' ColorBrewer palette YlOrBr <- c("#FFFFD4", "#FED98E", "#FE9929", "#D95F0E", "#993404") stopifnot(length(d <- dim(mat)) == 2) image(x=1:d[2], y=1:d[1], z=t(mat)[,d[1]:1], axes = FALSE, xlab = "j", ylab = "i",zlim=zlim, col = colorRampPalette(YlOrBr, space = "Lab")(ncol)) axis(1) axis(2, at= 1:d[2], labels= as.character(d[2]:1)) } #All functions except the last one can be used for an arbitray #hidden Markov model with discrete states f.simul <- function(Tt,pr,p0) { ## Purpose: Simulating from a Markov model ## ---------------------------------------------------------------------- ## Arguments: p0=initial distribution, pr=transition probability matrix ## ---------------------------------------------------------------------- ## Author: Hans-Ruedi Kuensch, Date: 2 Apr 2003, 17:49 x <- rep(1,Tt) m <- nrow(pr) pc <- pr for (i in (1:m)) pc[i,] <- cumsum(pr[i,]) x[1] <- sample(1:m,1,prob=p0) for (tt in (2:Tt)) { x[tt] <- min((1:m)[pc[x[tt-1],]>runif(1)]) } x } #Filtering of an arbitrary Hidden Markov Model f.filter <- function(b,pr,p0) { ## Purpose: Computing the prediction probabilities px, the filter ## probabilities fx and the predictions py in a HMM ## ---------------------------------------------------------------------- ## Arguments: b=conditional observation densities (b(k,t)=p(y_t|x_t=k)) ## pr=transition probabilites, p0=initial probability ## ---------------------------------------------------------------------- ## Author: Hans-Ruedi Kuensch, Date: 2.4.03/12.9.03 Tt <- ncol(b) m <- nrow(b) px <- matrix(p0,nrow=m,ncol=Tt) fx <- matrix(p0,nrow=m,ncol=Tt) py <- rep(1,Tt) fx[,1] <- b[,1]*fx[,1] py[1] <- sum(fx[,1]) fx[,1] <- fx[,1]/py[1] for (ti in (2:Tt)) { px[,ti] <- fx[,ti-1]%*%pr fx[,ti] <- b[,ti]*px[,ti] py[ti] <- sum(fx[,ti]) fx[,ti] <- fx[,ti]/py[ti] } list(fx=fx,px=px,py=py) } #Smoothing f.smooth <- function(b,fx,py,pr) { ## Purpose: Smoothing in a HMM ## ---------------------------------------------------------------------- ## Arguments: b=observation densities, fx=filter distributions, ## py=predictions of observations, ## pr=transition probabilities ## Output: margin=marginal smoother probabilities ## pairs=smoother probabilities for successive pairs ## ---------------------------------------------------------------------- ## Author: Hans-Ruedi Kuensch, Date: 3 Apr 2003, 14:30 Tt <- ncol(b) m <- nrow(b) r <- matrix(1,nrow=m,ncol=Tt) sm2 <- array(1,dim=c(m,m,Tt-1)) for (ti in (Tt:2)) { mat <- t(matrix(r[,ti]*b[,ti],m,m)) mat <- pr*mat/py[ti] r[,ti-1] <- apply(mat,1,sum) sm2[,,ti-1] <- matrix(fx[,ti-1],m,m)*mat } list(margin=r*fx,pairs=sm2) } f.backsimul <- function(fx,px,y,pr) { ## Purpose: backward simulation from the conditional distribution of ## states given observations in the CG-island model ## ---------------------------------------------------------------------- ## Arguments: ## ---------------------------------------------------------------------- ## Author: Hans-Ruedi Kuensch, Date: 5 May 2009, 20:48 Tt <- length(y) x <- rep(0,Tt) x[Tt] <- y[Tt] + 4*(runif(1) > fx[y[Tt],Tt]) for (t in ((Tt-1):1)) { x[t] <- y[t] + 4*(runif(1)*px[x[t+1],t+1] > pr[y[t],x[t+1]]*fx[y[t],t]) } x }