Jim — Oct 16, 2013, 7:13 PM
# demonstrating overdispersion of data in Table 5.1
library(npmlreg)
# if y_i are indep binomial(n_i, p)
# and if p is given prior p^(-1)(1-p)^(-1)
# then [p | y] is beta(sum(y), sum(n) - sum(y))
data(missouri) # contains full dataset
y <- missouri$Deaths
n <- missouri$Size
a <- sum(y)
b <- sum(n) - sum(y)
N <- length(y)
# function simulates one replicated dataset
one.sim <- function(){
p <- rbeta(1, a, b)
ys <- rbinom(N, size=n, prob=p)
rates <- ys / n
data.frame(N = n, Rate = rates)
}
# generate 8 replicated datasets
data <- NULL
for (j in 1:8)
data <- rbind(data, data.frame(Rep=j, one.sim()))
# add the observed data
data <- rbind(data, data.frame(Rep=9, N=n, Rate=y/n))
data$Type <- ifelse(data$Rep==9, "Observed", "Predicted")
# display the eight posterior predictive datasets and the observed
library(ggplot2)
ggplot(data, aes(log(N), Rate, color=Type )) + geom_point(size=3) +
facet_wrap(~ Rep, ncol=3)
# clear that a good checking function would be the
# standard deviation of the rates for the six largest counties
# (counties where log n > 8)
# write a new posterior pred sim, storing the sd of the rates
# for thre six largest counties
one.sim.new <- function(){
p <- rbeta(1, a, b)
ys <- rbinom(N, size=n, prob=p)
rates <- ys / n
sd(rates[log(n) > 8])
}
# simulate 1000 of the posterior pred sd
T.pred <- replicate(1000, one.sim.new())
# construct a histogram of the sds and overlay
# the observed sd
library(MASS)
truehist(T.pred, xlim=c(0, 0.0032))
obs.sd <- sd((y/n)[log(n) > 8])
abline(v=obs.sd, lwd=3, col="red")
text(.0025, 1000, "OBSERVED", cex=1.5, col="red")