regression.example.R

Jim — Nov 19, 2013, 4:06 PM

# regression example
# use Prestige example from Fox and Weisberg's R book

library(car)

# Consider dataset Prestige of Canadian Occupations
###################################################################
# The Prestige data frame has 102 rows and 6 columns. The observations 
# are occupations.
# This data frame contains the following columns:
# education:
#    Average education of occupational incumbents, years, in 1971.
# income:
#    Average income of incumbents, dollars, in 1971.
# women:
#    Percentage of incumbents who are women.
####################################################################

# standard multiple regression fit 

# Fit the standard least-squares multiple regression of prestige on
# education, log2(income), and percent women (Fox & Weisberg example).
fit <- lm(
  prestige ~ education + log2(income) + women,
  data = Prestige
)
print(fit)

Call:
lm(formula = prestige ~ education + log2(income) + women, data = Prestige)

Coefficients:
 (Intercept)     education  log2(income)         women  
   -110.9658        3.7305        9.3147        0.0469  

# show some graphs

plot(fit)

[figure: the four standard regression diagnostic plots produced by plot(fit) — Residuals vs Fitted, Normal Q-Q, Scale-Location, and Residuals vs Leverage]


# get data in right format for blinreg
# define response vector y and design matrix X

# Response vector and design matrix (explicit intercept column) in the
# form LearnBayes::blinreg expects.
y <- Prestige$prestige
X <- with(Prestige, cbind(1, education, log2(income), women))
# cbind() does not name the log2(income) column (it is an expression,
# not a symbol), so give column 3 a syntactic name
colnames(X)[3] <- "log2.income"

library(LearnBayes)

# Draw 50000 samples from the joint posterior of (beta, sigma)
# assuming the standard noninformative prior on (beta, sigma^2).
bfit <- blinreg(y, X, m = 50000)

# look at plots of marginal posteriors of beta2, beta3, beta4

# Density plots of the marginal posteriors of the slope coefficients.
# Column 1 of bfit$beta is the intercept, so plot columns 2 through 4.
par(mfrow = c(3, 1))
for (j in 2:4) {
  plot(density(bfit$beta[, j]), main = paste("Beta", j))
}

[figure: posterior density plots for Beta 2 (education), Beta 3 (log2 income), and Beta 4 (women)]

# restore the plotting device to a single panel
par(mfrow=c(1, 1))

# compute posterior means and sds of regression coef

# Posterior means of the regression coefficients. colMeans() is the
# idiomatic (and faster) equivalent of apply(x, 2, mean) on a matrix,
# and returns the same named numeric vector.
colMeans(bfit$beta)
           X   Xeducation Xlog2.income       Xwomen 
  -110.93108      3.73298      9.31025      0.04679 
# posterior standard deviations of the regression coefficients
# (no base colSds exists, so apply over columns is appropriate here)
apply(bfit$beta, 2, sd)
           X   Xeducation Xlog2.income       Xwomen 
    14.97011      0.35772      1.33790      0.03009 

# estimate of sigma

# posterior mean of the error standard deviation
mean(bfit$sigma)
[1] 7.145
# posterior standard deviation (uncertainty) of sigma
sd(bfit$sigma)
[1] 0.5147

# illustrate estimation and prediction of a linear combo

# suppose we are interested in the prestige for an occupation
# where the average education is 10 years, the average income 
# (in log2 dollars) is 13, and the percentage of women is 20

# One hypothetical occupation as a single-row design matrix:
# intercept 1, education = 10 years, log2(income) = 13, women = 20%.
covariates <- matrix(c(1, 10, 13, 20), nrow = 1, ncol = 4)

# function blinregexpected will simulate draws of the
# expected prestige

# simulate draws from the posterior of E(y), the expected prestige
# at the covariate values defined above
expected.lp <- blinregexpected(covariates, bfit)

library(coda)
Loading required package: lattice
# wrap the draws as a coda mcmc object to get posterior summaries
summary(mcmc(expected.lp))

Iterations = 1:50000
Thinning interval = 1 
Number of chains = 1 
Sample size per chain = 50000 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

          Mean             SD       Naive SE Time-series SE 
      48.36790        1.01001        0.00452        0.00452 

2. Quantiles for each variable:

 2.5%   25%   50%   75% 97.5% 
 46.4  47.7  48.4  49.0  50.3 

# suppose we are interested in predicting the prestige for a new
# occupation with same set of covariates

# simulate draws of a future observed prestige (posterior predictive)
# at the same covariate values
predicted.lp <- blinregpred(covariates, bfit)

# NOTE(review): coda was already attached earlier in the script; this
# repeated library() call is a harmless no-op
library(coda)
summary(mcmc(predicted.lp))

Iterations = 1:50000
Thinning interval = 1 
Number of chains = 1 
Sample size per chain = 50000 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

          Mean             SD       Naive SE Time-series SE 
       48.3286         7.2099         0.0322         0.0320 

2. Quantiles for each variable:

 2.5%   25%   50%   75% 97.5% 
 34.2  43.5  48.3  53.1  62.4 

# note the differences between the 95% interval estimate for E(y)
# and the 95% prediction interval for future y

# model selection
# suppose we are interested in selecting the best model
# function bayes.model.selection will compute the estimate of log f
# and the posterior model probability for each of 2^3 = 8 possible
# models

# enumerate all 2^3 = 8 models built from the three predictors and
# report each model's estimated log marginal likelihood (log.m) and
# posterior model probability (Prob)
# NOTE(review): 100 is the third argument of bayes.model.selection,
# a prior precision constant — confirm its exact role in
# ?bayes.model.selection before changing it
bayes.model.selection(y, X, 100)
$mod.prob
  education log2.income women  log.m   Prob
1     FALSE       FALSE FALSE -442.1 0.0000
2      TRUE       FALSE FALSE -388.5 0.0000
3     FALSE        TRUE FALSE -408.5 0.0000
4      TRUE        TRUE FALSE -372.7 0.8163
5     FALSE       FALSE  TRUE -443.8 0.0000
6      TRUE       FALSE  TRUE -386.5 0.0000
7     FALSE        TRUE  TRUE -400.3 0.0000
8      TRUE        TRUE  TRUE -374.2 0.1837

$converge
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

# this indicates that the best model is (education, log2 income)