[R] hurdle, simulated power

Fri May 6 00:41:01 CEST 2011

Dave:

> We are planning an intervention study for adolescent alcohol use, and I 
> am planning to use simulations based on a hurdle model (using the 
> hurdle() function in package pscl) for sample size estimation.
>
> The simulation code and power code are below -- note that at the moment 
> the "power" code is just returning the coefficients, as something isn't 
> working quite right.

Hmmm, strange. I didn't see the problem either. Some parameters did not 
seem to be passed correctly but I did not see it.

But as I saw a few things that could be done more elegantly or in a way 
that they are easier to read (for me), I simply rewrote the code almost 
from scratch. For example, I now use plogis() for the inverse logit, 
rbinom() for drawing binomial random numbers, colMeans() for column means, 
and coeftest()/waldtest() for computing the p-values from the Wald tests.

The code is included below. Both the simulation of the coefficients and 
the simulation of associated power values seem to yield plausible results.

Hope that helps,
Z

dgp <- function(nobs = 1000, beta0 = 2.5, beta1 = -0.2,
   alpha0 = -0.9, alpha1 = 0.2, theta = 2.2)
{
   trt <- rep(0:1, length.out = nobs)
   pr <- plogis(alpha0 + alpha1 * trt)
   y1 <- rbinom(nobs, size = 1, prob = pr)
   mu <- exp(beta0 + beta1 * trt)
   y2 <- rnegbin(nobs, mu = mu, theta = theta)
   while(any(y2_0 <- y2 < 1L)) y2[y2_0] <-
     rnegbin(sum(y2_0), mu = mu[y2_0], theta = theta)
   data.frame(trt = trt, y = y1 * y2)
}

coefsim <- function(nrep = 100, ...) {
   nam <- c("beta0", "beta1", "alpha0", "alpha1", "theta")
   rval <- matrix(NA, nrow = nrep, ncol = length(nam))
   for(i in 1:nrep) {
     dat <- dgp(...)
     m <- hurdle(y ~ trt, data = dat, dist = "negbin")
     rval[i,] <- c(coef(m), m$theta)
   }
   colnames(rval) <- nam
   colMeans(rval)
}

powersim <- function(nrep = 100, level = 0.05, ...) {
   nam <- c("beta1", "alpha1", "both")
   rval <- matrix(NA, nrow = nrep, ncol = length(nam))
   for(i in 1:nrep) {
     dat <- dgp(...)
     m <- hurdle(y ~ trt, data = dat, dist = "negbin")
     m0 <- hurdle(y ~ 1, data = dat, dist = "negbin")
     rval[i,] <- c(coeftest(m)[c(2, 4), 4], waldtest(m0, m)[2, 4])
   }
   colnames(rval) <- nam
   colMeans(rval < level)
}

library("pscl")
library("lmtest")

set.seed(1)
out1 <- coefsim()
round(out1, digits = 3)

set.seed(1)
out2 <- powersim()
round(out2, digits = 3)

> The average estimates from code below are:
>
> count_(Intercept)         count_trt  zero_(Intercept)
>      2.498327128      -0.000321315       0.910293501
>         zero_trt
>     -0.200134813
>
> Three of the four look right (ie, converging to population values), but the 
> count_trt is stuck at zero, regardless of sample size (when it should be ~ 
> -0.20).
>
> Does anyone see what's wrong?
>
> Thanks for any input.
>
> cheers, Dave
>
>
>
> mysim <- function(n, beta0, beta1, alpha0, alpha1, theta){
> 	trt <- c(rep(0,n), rep(1,n))
> 	### mean function logit model
> 	p0 <- exp(alpha0 + alpha1*trt)/(1 + exp(alpha0 + alpha1*trt))
> 	### 0 / 1 based on p0
> 	y1 <- as.numeric(runif(n)>p0)
> 	### mean function count portion
> 	mu <- exp(beta0 + beta1*trt)
> 	### estimate counts using NB dist
> 	require(MASS, quietly = TRUE)
> 	y2 <- rnegbin(n, mu = mu, theta = theta)
> 	### if y2 = 0, draw new value
> 	while(sum(y2==0)>0){
> 		y2[which(y2==0)] <- rnegbin(length(which(y2==0)), 
> mu=mu[which(y2==0)], theta = theta)
> 	}
> 	y<-y1*y2
> 	data.frame(trt=trt,y=y)
> }
> #alpha0, alpha1 is the parameter for zero part
> #beta0,beta1 is the parameter for negative binomial
> #theta is dispersion parameter for negative binomial, infinity correspond to 
> poisson
> #
>
> #example power analysis
> #return three power, power1 for zero part, power2 for negative binomial part
> #power3 for joint test,significance level can be set, default is 0.05
> #M is simulation time
> #require pscl package
> #library(pscl)
>
> mypower <- function(n, beta0, beta1, alpha0, alpha1, theta, siglevel=0.05, 
> M=1000){
> 	myfun <- function(n,beta0,beta1,alpha0,alpha1,theta,siglevel){
> 		data <- mysim(n,beta0,beta1,alpha0,alpha1,theta)
> 		require(pscl, quietly = TRUE)
> 		res <- hurdle(y ~ trt, data = data, dist = "negbin", trace = 
> FALSE)
> 		est <- coef(res)#[c(2,4)]
> 		#v<-res$vcov[c(2,4),c(2,4)]
> 		#power1<-as.numeric(2*pnorm(-abs(est)[2]/sqrt(v[2,2]))<siglevel)
> 		#power2<-as.numeric(2*pnorm(-abs(est)[1]/sqrt(v[1,1]))<siglevel)
> 		#power3<-as.numeric((1-pchisq(t(est)%*%solve(v)%*%est,df=2))<siglevel)
> 		#c(power1,power2,power3)
> 	}
> 	r <- replicate(M, myfun(n,beta0,beta1,alpha0,alpha1,theta,siglevel), 
> simplify=TRUE)
> 	apply(r, 1, mean)
> }
>
> out <- mypower(n = 1000, beta0 = 2.5, beta1 = -0.20,
> 						 alpha0 = -0.90, alpha1 = 
> 0.20,
> 						 theta = 2.2, M = 100)
> out
>
>
> -- 
> Dave Atkins, PhD
> Research Associate Professor
> Department of Psychiatry and Behavioral Science
> University of Washington
> datkins at u.washington.edu
>
> Center for the Study of Health and Risk Behaviors (CSHRB) 
> 1100 NE 45th Street, Suite 300 
> Seattle, WA  98105 
> 206-616-3879 
> http://depts.washington.edu/cshrb/
> (Mon-Wed) 
>
> Center for Healthcare Improvement, for Addictions, Mental Illness,
>  Medically Vulnerable Populations (CHAMMP)
> 325 9th Avenue, 2HH-15
> Box 359911
> Seattle, WA 98104
> http://www.chammp.org
> (Thurs)
>
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>