postpredexample.R

Jim — Oct 16, 2013, 7:13 PM

# demonstrating overdispersion of data in Table 5.1

library(npmlreg)

# if y_i are indep binomial(n_i, p)
# and if p is given prior p^(-1)(1-p)^(-1)
# then [p | y] is beta(sum(y), sum(n) - sum(y))

data(missouri)  # contains full dataset

y <- missouri$Deaths
n <- missouri$Size
a <- sum(y)
b <- sum(n) - sum(y)
N <- length(y)

# function simulates one replicated dataset

one.sim <- function(){
  p <- rbeta(1, a, b)
  ys <- rbinom(N, size=n, prob=p)
  rates <- ys / n
  data.frame(N = n, Rate = rates)
}

# generate 8 replicated datasets

data <- NULL
for (j in 1:8)
  data <- rbind(data, data.frame(Rep=j, one.sim()))

# add the observed data 

data <- rbind(data, data.frame(Rep=9, N=n, Rate=y/n))
data$Type <- ifelse(data$Rep==9, "Observed", "Predicted")

# display the eight posterior predictive datasets and the observed

library(ggplot2)
ggplot(data, aes(log(N), Rate, color=Type )) + geom_point(size=3) +
  facet_wrap(~ Rep, ncol=3)

plot of chunk unnamed-chunk-1


# clear that a good checking function would be the
# standard deviation of the rates for the six largest counties
# (counties where log n > 8)

# write a new posterior pred sim, storing the sd of the rates
# for thre six largest counties

one.sim.new <- function(){
  p <- rbeta(1, a, b)
  ys <- rbinom(N, size=n, prob=p)
  rates <- ys / n
  sd(rates[log(n) > 8])
}

# simulate 1000 of the posterior pred sd

T.pred <- replicate(1000, one.sim.new())

# construct a histogram of the sds and overlay
# the observed sd

library(MASS)
truehist(T.pred, xlim=c(0, 0.0032))
obs.sd <- sd((y/n)[log(n) > 8])
abline(v=obs.sd, lwd=3, col="red")
text(.0025, 1000, "OBSERVED", cex=1.5, col="red")

plot of chunk unnamed-chunk-1