Status

Estimated and checked against the book:

Warning: there are small numerical differences, probably due to the difference between quap and Stan.

Libraries

library(tidyverse)
library(tidybayes)
library(rstan)
library(patchwork)
options(mc.cores = 4)

Section 5.1: Spurious association

Plausible regression lines implied by the priors:

We will estimate a series of regression models with a constant \(\alpha\) and regression coefficients \(\beta_k\), and these priors:

\[\alpha \sim N(0, .2)\] \[\beta_k \sim N(0, .5)\]

To see if these priors make sense, we can plot a few of the regression lines implied by these priors. To do this, we draw random numbers from the distributions above, and we plot the corresponding regression lines:

a = rnorm(50, 0, .2)
b = rnorm(50, 0, .5)
p <- ggplot()
for (i in 1:50) {
    p <- p + geom_abline(slope = b[i], intercept = a[i])
}
p + xlim(-3, 3) + ylim(-3, 3) +
    labs(x = 'Median age marriage (std)',
         y = 'Divorce rate (std)')

stan_program <- '
data {
  int<lower=1> n;   // number of observations
  int<lower=1> K;   // number of regressors (including constant)
  vector[n] Divorce;      // outcome
  matrix[n, K] X;   // regressors
}
parameters {
  real<lower=0,upper=50> sigma;    // scale
  vector[K] b;                     // coefficients (including constant)
}
transformed parameters {
  vector[n] mu;                    // location
  mu = X * b;
}
model {
  Divorce ~ normal(mu, sigma);    // probability model
  sigma ~ exponential(1);   // prior for scale
  b[1] ~ normal(0, 0.2);    // prior for intercept
  for (i in 2:K) {          // priors for coefficients
    b[i] ~ normal(0, 0.5);
  }
}
generated quantities {
  vector[n] yhat;           // predicted outcome
  for (i in 1:n) yhat[i] = normal_rng(mu[i], sigma);
}
'

standardize <- function(x) as.vector(scale(x))
WaffleDivorce <- read.csv('data/WaffleDivorce.csv', sep = ';') %>%
                 mutate(across(c(Divorce, Marriage, MedianAgeMarriage), standardize))
 
stan_data <- WaffleDivorce %>%
             compose_data(K = 2, 
                          y = Divorce,
                          X = model.matrix(~MedianAgeMarriage, .)) 

m5.1 <- stan(model_code = stan_program, data = stan_data)

stan_data <- WaffleDivorce %>%
             compose_data(K = 2, 
                          y = Divorce,
                          X = model.matrix(~Marriage, .)) 
m5.2 <- stan(model_code = stan_program, data = stan_data)

stan_data <- WaffleDivorce %>%
             compose_data(K = 3, 
                          y = Divorce,
                          X = model.matrix(~MedianAgeMarriage + Marriage, .)) 
m5.3 <- stan(model_code = stan_program, data = stan_data)
res5.1 <- m5.1 %>%
          spread_draws(b[i]) %>%
          mean_qi %>%
          mutate(i = c('Intercept', 'bA'),
                 Model = 'm5.1')
res5.2 <- m5.2 %>%
          spread_draws(b[i]) %>%
          mean_qi %>%
          mutate(i = c('Intercept', 'bM'),
                 Model = 'm5.2')
res5.3 <- m5.3 %>%
          spread_draws(b[i]) %>%
          mean_qi %>%
          mutate(i = c('Intercept', 'bA', 'bM'),
                 Model = 'm5.3')
res <- bind_rows(res5.1, res5.2, res5.3) %>%
       filter(i != 'Intercept')

ggplot(res, aes(x = b, y = Model, xmin = .lower, xmax = .upper)) +
    geom_vline(xintercept = 0, linetype = 'dotted') +
    geom_pointrange() +
    facet_grid(i ~ .) + 
    vincent::theme_vab()

Figure 5.5:

datplot <- m5.3 %>%
           gather_draws(mu[i]) %>% 
           mean_qi %>%
           rename(Predicted = .value)
datplot$Observed <- WaffleDivorce$Divorce

ggplot(datplot, aes(Observed, Predicted,
                    ymin = .lower, ymax = .upper)) +
    geom_pointrange() +
    geom_abline(intercept = 0, slope = 1, linetype = 'dashed') +
    labs(x = 'Observed divorce', y = 'Predicted divorce',
         title = 'Posterior predictive plot') +
    coord_fixed()