#### Filtering, smoothing and parameter estimation in a
#### model for CG-islands in DNA

#### States: 1,2,3,4 = A,C,G,T in a CG-island, 5,6,7,8 = A,C,G,T outside

#Input of the transition matrix
pr <- matrix(0,nrow=8,ncol=8)
eps <- rep(0.05,4)
eta <- rep(0.05,4)
#transition probabilities starting in CG-islands
pr[1,1:4] <- (1-eps[1])*c(0.180,0.274,0.426,0.120)
pr[2,1:4] <- (1-eps[2])*c(0.170,0.368,0.274,0.188)
pr[3,1:4] <- (1-eps[3])*c(0.161,0.339,0.375,0.125)
pr[4,1:4] <- (1-eps[4])*c(0.079,0.355,0.384,0.182)
pr[1:4,5:8] <- 0.25*eps
#transition probabilities starting outside a CG-island
pr[5:8,1:4] <- 0.25*eta
pr[5,5:8] <- (1-eta[1])*c(0.300,0.205,0.285,0.210)
pr[6,5:8] <- (1-eta[2])*c(0.322,0.298,0.078,0.302)
pr[7,5:8] <- (1-eta[3])*c(0.248,0.246,0.298,0.208)
pr[8,5:8] <- (1-eta[4])*c(0.177,0.239,0.292,0.292)


mimage(pr,zlim=c(0,1)) #graphical representation of pr

p0 <- rep(0.125,8)  #initial distribution

#-----------------------------------------------------------

# Simulation
Tt <- 2000 # Length of series
           # (should be >= 2000 for parameter estimation in this example)
x <- f.simul(Tt=Tt+1,pr=pr,p0=p0)[-1]
x1 <- (x-1)%/%4 #Indicator of CG-Islands
y <- x-4*x1 #observations
par(mfrow=c(2,1))
plot(1:500,y[1:500],ylim=c(0,5),type="l")
points(1:500,5*x1[1:500],col=2)
plot(501:1000,y[501:1000],ylim=c(0,5),type="l")
points(501:1000,5*x1[501:1000],col=2)
par(mfrow=c(1,1))

#Observation probabilities  
#b[k,t]=P(y_t|x_t=k) = 1 if y_t=k or y_t=k-4; = 0 otherwise

b <- matrix(0,nrow=8,ncol=Tt) 
b[sort((0:(Tt-1))*8+c(y,y+4))] <- 1


#--------------------------------------------------------------


#Filtering
erg <- f.filter(b=b,pr=pr,p0=rep(0.125,8))
p.cg <- apply(erg$fx[5:8,],2,sum) #Filtering probability of being in a CG-island
sum(x1==(p.cg > 0.5)) #number of correct classifications
mean((x1-p.cg)^2) #MSE error for filtering

#Smoothing
erg.sm <- f.smooth(b=b,fx=erg$fx,py=erg$py,pr=pr)
p2.cg <- apply(erg.sm$margin[5:8,],2,sum)
sum(x1==(p2.cg > 0.5)) #number of correct classifications
mean((x1-p2.cg)^2) #MSE error for smoothing

#Plotting of posterior probabilities for a CG-Island
plot(1:Tt,x1)
lines(1:Tt,p.cg,col=2)
abline(h=0.5)
points(1:Tt,(p.cg > 0.5)+0.02,col=3)
lines(1:Tt,p2.cg,col=4)
points(1:Tt,(p2.cg > 0.5)-0.02,col=5)

#--------------------------------------------------------------

#Parameter estimation by EM: probabilities of regime changes are
#assumed to be known

#Initialisation
n.obs <- table(y[-Tt],y[-1])/(Tt-1) #observed frequencies of pairs
n.obs <- n.obs/apply(n.obs,1,sum) #empirical transition matrix
m <- matrix(c(rep(1,4),rep(-1,8),rep(1,4)),nrow=4,ncol=4)
pr1 <- n.obs - 0.1*m #modification to make transitions to C,G more likely
pr2 <- n.obs + 0.1*m #modification to make transitions to C,G less likely
#pr.est <- pr #starting with the true values (not good practice)
pr.est <- matrix(0,nrow=8,ncol=8)
pr.est[1:4,1:4] <- (1-eps)*pr1 #pr.est := current estimate of pr
pr.est[1:4,5:8] <- eps*0.25
pr.est[5:8,1:4] <- eta*0.25
pr.est[5:8,5:8] <- (1-eta)*pr2
                                        
niter <- 100
pr1.save <- array(pr1,dim=c(4,4,niter))
pr2.save <- array(pr2,dim=c(4,4,niter))
loglik <- rep(0,niter-1)

for (i.rep in (2:niter)) {
#E-step
erg.f <- f.filter(b,pr.est,p0)
loglik[i.rep-1] <- sum(log(erg.f$py))
erg.s <- f.smooth(b,erg.f$fx,erg.f$py,pr.est)
margin <- erg.s$margin
pairs <- erg.s$pairs
#M-step
pr.est <- apply(pairs,c(1,2),mean)
pr1 <- pr.est[1:4,1:4]
pr2 <- pr.est[5:8,5:8]
pr1 <- pr1/apply(pr1,1,sum)
pr2 <- pr2/apply(pr2,1,sum)
pr.est[1:4,1:4] <- (1-eps)*pr1
pr.est[1:4,5:8] <- eps*0.25
pr.est[5:8,1:4] <- eta*0.25
pr.est[5:8,5:8] <- (1-eta)*pr2
pr1.save[,,i.rep] <- pr1
pr2.save[,,i.rep] <- pr2
}

plot(loglik,type="l")
round(pr.est,3)
round(pr,3)
#graphical comparison of true and estimated transition probabilites
par(mfrow=c(1,2))
mimage(pr,zlim=c(0,1))
mimage(pr.est,zlim=c(0,1))

#Filtering with estimated parameters
erg <- f.filter(b=b,pr=pr.est,p0=rep(0.125,8))
p3.cg <- apply(erg$fx[5:8,],2,sum) #Filtering probability of being in a CG-island
sum(x1==(p3.cg > 0.5)) #number of correct classifications
mean((x1-p3.cg)^2) #MSE error for smoothing

#Smoothing with estimated parameters
erg.sm <- f.smooth(b=b,fx=erg$fx,py=erg$py,pr=pr.est)
p4.cg <- apply(erg.sm$margin[5:8,],2,sum)
sum(x1==(p4.cg > 0.5)) #number of correct classifications
mean((x1-p4.cg)^2) #MSE error for smoothing

#Comparing posterior probabilities for a CG-Island with true and
#estimated parameters
plot(1:Tt,p2.cg,type="l")
lines(1:Tt,p4.cg,type="l",col=2)
abline(h=0.5)


#----------------------------------------------------------------

##simulating from the smoothing distribution
erg <- f.filter(b=b,pr=pr,p0=rep(0.125,8))
x.smooth <- f.backsimul(erg$fx,erg$px,y,pr)
for (i in (1:100)) x.smooth <- cbind(x.smooth,f.backsimul(erg$fx,erg$px,y,pr))
p3.cg <- 1-apply(x.smooth==y,1,sum)/101
plot(1:Tt,p2.cg,type="l")
lines(1:Tt,p3.cg,type="l",col=2) # Monte Carlo estimate almost correct


#------------------------------------------------------------------

# MCMC algorithm to estimate parameters and states at the same time
# probabilities of regime changes are assumed to be known

#Initialisation 
pr.obs <- table(y[-Tt],y[-1])/(Tt-1) #observed frequencies of pairs
pr.obs <- pr.obs/apply(pr.obs,1,sum) #empirical transition matrix
m <- matrix(c(rep(1,4),rep(-1,8),rep(1,4)),nrow=4,ncol=4)
     #modification to make transitions to C,G less likely
pr1 <- pr.obs - 0.1*m
pr2 <- pr.obs + 0.1*m
#pr.est <- pr #starting with the true values (not good practice)
pr.est <- matrix(0,nrow=8,ncol=8)
pr.est[1:4,1:4] <- (1-eps)*pr1
pr.est[1:4,5:8] <- eps*0.25
pr.est[5:8,1:4] <- eta*0.25
pr.est[5:8,5:8] <- (1-eta)*pr2

niter <- 1000
pr1.save <- array(pr1,dim=c(4,4,floor(0.1*niter)))
pr2.save <- array(pr2,dim=c(4,4,floor(0.1*niter)))
x.save <- matrix(0,nrow=Tt,ncol=floor(0.1*niter))

for (i in (1:niter)) {
  erg <- f.filter(b,pr.est,p0)
  x.act <- f.backsimul(erg$fx,erg$px,y,pr.est) #updating the states
  n.obs <- table(x.act[-Tt],x.act[-1])  #observed frequencies of pairs
  for (k in (1:4)) {
    z <- rgamma(4,n.obs[k,1:4]+1)
    pr1[k,] <- z/sum(z)
    z <- rgamma(4,n.obs[k+4,5:8]+1)
    pr2[k,] <- z/sum(z)
  }
  pr.est[1:4,1:4] <- (1-eps)*pr1
  pr.est[1:4,5:8] <- eps*0.25
  pr.est[5:8,1:4] <- eta*0.25
  pr.est[5:8,5:8] <- (1-eta)*pr2
  if ((i %% 10)==0) {
    pr1.save[,,i %/% 10] <- pr1
    pr2.save[,,i %/% 10] <- pr2
    x.save[,i %/% 10] <- x.act
  }
}

#posterior probability for being in a CG-island
#(unknown parameters are integrated out)
p4.cg <- 1-apply(x.save==y,1,mean)
plot(1:Tt,p2.cg,type="l") 
lines(1:Tt,p4.cg,type="l",col=2)
abline(h=0.5)
sum(x1==(p4.cg > 0.5))
mean((x1-p4.cg)^2) #MSE error for smoothing


## Functions used #################################

mimage <- function(mat, ncol = 40, zlim=range(mat))
{
  ## Purpose: Color representation of a matrix 
  ## ----------------------------------------------------------------------
  ## Arguments:
  ## ----------------------------------------------------------------------
  ## Author: Hans-Rudolf Kuensch, Date:  3 Dec 2007, 16:47

  ## Interpolating a 'sequential' ColorBrewer palette
  YlOrBr <- c("#FFFFD4", "#FED98E", "#FE9929", "#D95F0E", "#993404")

  stopifnot(length(d <- dim(mat)) == 2)
  image(x=1:d[2], y=1:d[1], z=t(mat)[,d[1]:1], axes = FALSE,
        xlab = "j", ylab  = "i",zlim=zlim,
        col = colorRampPalette(YlOrBr, space = "Lab")(ncol))
  axis(1)
  axis(2, at= 1:d[2], labels= as.character(d[2]:1))
}

#All functions except the last one can be used for an arbitray
#hidden Markov model with discrete states

f.simul <- function(Tt,pr,p0)
{
  ## Purpose: Simulating from a Markov model
  ## ----------------------------------------------------------------------
  ## Arguments: p0=initial distribution, pr=transition probability matrix
  ## ----------------------------------------------------------------------
  ## Author: Hans-Ruedi Kuensch, Date:  2 Apr 2003, 17:49
  x <- rep(1,Tt)
  m <- nrow(pr)
  pc <- pr
  for (i in (1:m)) pc[i,] <- cumsum(pr[i,]) 
  x[1] <- sample(1:m,1,prob=p0)
  for (tt in (2:Tt)) {
    x[tt] <- min((1:m)[pc[x[tt-1],]>runif(1)])
  }
  x
}

#Filtering of an arbitrary Hidden Markov Model

f.filter <- function(b,pr,p0)
{
  ## Purpose: Computing the prediction probabilities px, the filter
  ##          probabilities fx and the predictions py in a HMM
  ## ----------------------------------------------------------------------
  ## Arguments: b=conditional observation densities (b(k,t)=p(y_t|x_t=k))
  ##            pr=transition probabilites, p0=initial probability
  ## ----------------------------------------------------------------------
  ## Author: Hans-Ruedi Kuensch, Date:  2.4.03/12.9.03
  Tt <- ncol(b)
  m <- nrow(b)
  px <- matrix(p0,nrow=m,ncol=Tt)
  fx <- matrix(p0,nrow=m,ncol=Tt)
  py <- rep(1,Tt)
  fx[,1] <- b[,1]*fx[,1]
  py[1] <- sum(fx[,1])
  fx[,1] <- fx[,1]/py[1]
  for (ti in (2:Tt)) {
    px[,ti] <- fx[,ti-1]%*%pr
    fx[,ti] <- b[,ti]*px[,ti]
    py[ti] <- sum(fx[,ti])
    fx[,ti] <- fx[,ti]/py[ti]
  }
  list(fx=fx,px=px,py=py)
}

#Smoothing

f.smooth <- function(b,fx,py,pr)
{
  ## Purpose: Smoothing in a HMM
  ## ----------------------------------------------------------------------
  ## Arguments: b=observation densities, fx=filter distributions,
  ##            py=predictions of observations,
  ##            pr=transition probabilities
  ## Output: margin=marginal smoother probabilities
  ##         pairs=smoother probabilities for successive pairs
  ## ----------------------------------------------------------------------
  ## Author: Hans-Ruedi Kuensch, Date:  3 Apr 2003, 14:30
  Tt <- ncol(b)
  m <- nrow(b)
  r <- matrix(1,nrow=m,ncol=Tt)
  sm2 <- array(1,dim=c(m,m,Tt-1))
  for (ti in (Tt:2)) {
    mat <- t(matrix(r[,ti]*b[,ti],m,m))
    mat <- pr*mat/py[ti]
    r[,ti-1] <- apply(mat,1,sum)
    sm2[,,ti-1] <- matrix(fx[,ti-1],m,m)*mat
  }
  list(margin=r*fx,pairs=sm2)
}

f.backsimul <- function(fx,px,y,pr)
{
  ## Purpose: backward simulation from the conditional distribution of
  ##          states given observations in the CG-island model
  ## ----------------------------------------------------------------------
  ## Arguments:
  ## ----------------------------------------------------------------------
  ## Author: Hans-Ruedi Kuensch, Date:  5 May 2009, 20:48
  Tt <- length(y)
  x <- rep(0,Tt)
  x[Tt] <- y[Tt] + 4*(runif(1) > fx[y[Tt],Tt])
  for (t in ((Tt-1):1)) {
    x[t] <- y[t] + 4*(runif(1)*px[x[t+1],t+1] > pr[y[t],x[t+1]]*fx[y[t],t])
  }
  x
}