Chapter 3 Sampling the Imaginary
knitr::opts_chunk$set(echo = TRUE, cache = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(purrr)
3.1 Figures and text
grid_of_globe_toss <- data_frame(
p_grid = seq(from = 0, to = 1, length.out = 1000),
prior = 1,
likelihood = dbinom(6, size = 9, prob = p_grid),
posterior_unstand = likelihood * prior,
posterior = posterior_unstand / sum(posterior_unstand)
)
samples <- with(grid_of_globe_toss,
sample(p_grid, prob = posterior, size = 1e4, replace = TRUE)
)
samples %>%
data_frame(resampled = .) %>%
ggplot(aes(x = resampled)) + geom_density()
Here we have taken a very simple grid (that is, a simple list of 1000 numbers) and resampled it into 10,000 numbers (10× more). This is not necessary, but it illustrates the point: you can use samples from the posterior to estimate its shape, and it works just as well as using the real thing.
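Just to check that claim, here is a quick sketch (reusing the objects above) that overlays the exact grid posterior, rescaled to a density by dividing by the grid spacing, on the density estimated from the 10,000 samples:
## overlay the exact grid posterior (as a density) on the sample-based density
grid_of_globe_toss %>%
  mutate(posterior_density = posterior / (p_grid[2] - p_grid[1])) %>%
  ggplot(aes(x = p_grid)) +
  geom_density(data = data_frame(resampled = samples),
               aes(x = resampled), colour = "grey50") +
  geom_line(aes(y = posterior_density), colour = "red")
The two curves should sit almost on top of each other.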
sum(grid_of_globe_toss$posterior)
## [1] 1
with(grid_of_globe_toss, sum(posterior[ p_grid < 0.5 ]))
## [1] 0.1718746
It’s that last bit that we normally can’t do – picking out values of p_grid
directly. For bigger models the posterior is too complex to interrogate that way, which is why we work with samples instead.
sum(samples < 0.5) / length(samples)
## [1] 0.1729
Lo and behold, the value is nearly the same.
intervals <- list(
c(-Inf, 0.5),
c(0.5, 0.7)
)
sample_dens <- samples %>%
density() %>%
broom::tidy(.)
## keep only the density rows that fall inside each interval
interval_density <- intervals %>%
map(~ sample_dens$x > .x[1] & sample_dens$x < .x[2]) %>%
map(~ sample_dens[.x, ])
## shade the region below p = 0.5; the extra (0.5, 0) row closes the polygon on the x-axis
sample_dens %>%
ggplot(aes(x = x, y = y)) + geom_line() +
geom_polygon(data = rbind(interval_density[[1]], c(0.5, 0)))
rethinking::HPDI(samples = samples, prob = 0.8)
## |0.8 0.8|
## 0.4704705 0.8298298
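HPDI() looks for the narrowest interval containing the requested posterior mass. Here is a brute-force sketch of the same idea (my own variable names; the window rounding is a simplification, so the endpoints may differ slightly from the call above): slide a window covering 80% of the sorted samples and keep the narrowest one.
## brute-force "narrowest 80% interval" from the sorted samples
sorted <- sort(samples)
n_in   <- floor(0.8 * length(sorted))
widths <- sorted[(n_in + 1):length(sorted)] - sorted[1:(length(sorted) - n_in)]
lowest <- which.min(widths)
c(sorted[lowest], sorted[lowest + n_in])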
3.2 Simulation for prediction
rbinom(1e5, size = 9, prob = 0.7) %>%
data_frame(dummy = .) %>%
ggplot(aes(x = dummy)) +
geom_histogram() +
scale_x_continuous(breaks = 1:9)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
That shows the expected distribution of counts for a single probability, 0.7. We need predictions averaged over all plausible probabilities, which we get by passing the vector of posterior samples straight to prob:
predicted_samples <- rbinom(length(samples), 9, prob = samples)
predicted_samples %>%
data_frame(dummy = .) %>%
ggplot(aes(x = dummy)) +
geom_histogram() +
scale_x_continuous(breaks = 1:9)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
3.3 Making and breaking predictions
## simulate 9 individual tosses (not just the total count) for each sampled value of p
repeated_single_samples <- map(samples, rbinom, n = 9, size = 1)
# should be able to get the histogram back
repeated_single_samples %>%
map_dbl(sum) %>%
data_frame(dummy = .) %>%
ggplot(aes(x = dummy)) +
geom_histogram() +
scale_x_continuous(breaks = 1:9)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## we can also calculate statistics about the temporal sequence, which is
## absent from the model
repeated_single <- repeated_single_samples %>%
map(rle) %>%
map_df(~ data_frame(n_switches = length(.x$values) - 1, # changes of value = runs - 1
max_run = max(.x$lengths))) # longest run of the same value
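To make the rle() step less opaque, here is what run-length encoding gives for a small made-up sequence:
## rle() collapses a sequence into runs of identical values: for 1,1,0,1,1,1,0
## it gives lengths = 2,1,3,1 and values = 1,0,1,0, so the longest run is
## max(lengths) = 3 and the number of switches is length(values) - 1 = 3
rle(c(1, 1, 0, 1, 1, 1, 0))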
repeated_single %>%
tidyr::gather(measure, value) %>%
ggplot(aes(x = value)) +
geom_histogram() + facet_grid(~measure)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
3.4 Exercises
set.seed(100)
samples_exercise <- with(grid_of_globe_toss,
sample(p_grid, prob = posterior, size = 1e4, replace = TRUE)
)
3.4.1 How much lies below p = 0.2?
below <- sum(samples_exercise < 0.2) / length(samples_exercise)
About 0.05% of the samples lie below 0.2.
3.4.2 How much lies above 0.8?
abv <- sum(samples_exercise > 0.8) / length(samples_exercise)
About 11.17% lies above 0.8.
3.4.3 How much lies between 0.2 and 0.8?
sum(samples_exercise < 0.8 & samples_exercise > 0.2) / length(samples_exercise)
## [1] 0.8878
3.4.4 20% is below which value?
quantile(samples_exercise, 0.2)
## 20%
## 0.5195195
3.4.5 20% is above which value?
quantile(samples_exercise, 0.8)
## 80%
## 0.7567568
3.4.6 Which values contain the narrowest interval with 66% of the posterior?
rethinking::HPDI(samples, 0.66)
## |0.66 0.66|
## 0.5225225 0.7927928
## just out of curiosity
plot(ecdf(samples))
abline(v = c(0.52, 0.8))
3.4.7 Which values contain 66% of the posterior, with equal amounts in the tails?
quantile(samples, c(0.16, 0.84))
## 16% 84%
## 0.4924925 0.7787788
3.4.8 Medium
grid_of_globe_toss_15 <- data_frame(
p_grid = seq(from = 0, to = 1, length.out = 1000) %>% rep(2),
prior = c(rep(1, 1000), rep(c(0,1), each = 500)),
prior_is = c("flat", "informative") %>% rep(each = 1000),
likelihood = dbinom(8, size = 15, prob = p_grid),
like_x_pri = likelihood * prior,
posterior = like_x_pri / sum(like_x_pri) # normalised over both priors together; sample() below renormalises within each prior
)
grid_of_globe_toss_15 %>%
ggplot(aes(x = seq_along(posterior), y = posterior)) +
geom_line() +
facet_wrap(~prior_is, scales = "free")
## sample from the posterior
posterior_samples <- grid_of_globe_toss_15 %>%
split(., .$prior_is) %>%
map(~ sample(.x$p_grid, size = 1e4,
prob = .x$posterior, replace = TRUE)
)
posterior_samples %>% map(rethinking::HPDI)
## $flat
## |0.89 0.89|
## 0.3373373 0.7207207
##
## $informative
## |0.89 0.89|
## 0.5005005 0.7077077
3.4.9 Posterior predictive check
posterior_samples %>%
map(~ rbinom(length(.x), size = 15, prob = .x)) %>%
map_df(~ data_frame(posterior_pred = .), .id = "prior_is") %>%
ggplot(aes(x = posterior_pred)) +
geom_histogram(binwidth = 1) +
geom_vline(xintercept = 8)+
facet_grid(~ prior_is)
3.4.10 What is the probability of 6/9?
posterior_samples %>%
map(~ rbinom(length(.x), size = 9, prob = .x)) %>% # 9 tosses, as the question asks
map(~ sum(.x == 6) / length(.x))
## $flat
## [1] 0.1506
##
## $informative
## [1] 0.2048
3.5 Hard
Load in the data:
library(rethinking)
homeworkch3 <- data(homeworkch3)
detach("package:rethinking", unload=TRUE)
library(purrr)
.. this does not seem to work? All I get is the word “homeworkch3”. That is actually just how data() behaves: it loads the objects (here birth1 and birth2) into the workspace as a side effect and returns only the dataset’s name, so assigning its result captures that string rather than the data.
To be safe, I will copy and paste the data for this example straight from GitHub instead.
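For reference, the intended pattern is to call data() purely for its side effect; assuming the rethinking package really does ship birth1 and birth2 inside homeworkch3, it would look like this:
## data() loads birth1 and birth2 into the workspace and returns only the
## dataset's name, so there is nothing useful to assign
library(rethinking)
data(homeworkch3)
exists("birth1") # should be TRUE once the data are loaded
detach("package:rethinking", unload = TRUE)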
birth1 <- c(1,0,0,0,1,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,1,0,0,0,1,0,
0,0,0,1,1,1,0,1,0,1,1,1,0,1,0,1,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,
1,1,0,1,0,0,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,0,0,1,0,1,1,0,
1,0,1,1,1,0,1,1,1,1)
birth2 <- c(0,1,0,1,0,1,1,1,0,0,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,
1,1,1,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,0,1,1,0,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,1,
0,0,0,1,1,1,0,0,0,0)
Test it to make sure that it works the same as the text:
sum(birth1) + sum(birth2)
## [1] 111
ok.
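Since the total checks out, the raw proportion of boys is worth a quick look before building the grid:
## observed proportion of boys across all 200 births: 111/200 = 0.555,
## so the posterior below should peak near 0.55
(sum(birth1) + sum(birth2)) / (length(birth1) + length(birth2))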
3.5.1 Use grid approximation to estimate the posterior distribution of the probability of a male birth
male_babies <- data_frame(
p_grid = seq(from = 0, to = 1, length.out = 1000),
prior = 1,
likelihood = dbinom(x = sum(birth1) + sum(birth2),
size = length(birth1) + length(birth2),
prob = p_grid),
like_x_pri = likelihood * prior,
posterior = like_x_pri / sum(like_x_pri)
)
male_babies %>%
ggplot(aes(x = p_grid, y = posterior)) + geom_point()
3.5.2 sample from the posterior distribution and get HPDI
male_babies_samples <- with(male_babies,
sample(p_grid, size = 1e4, replace = TRUE, prob = posterior))
list(0.5, 0.89, 0.97) %>%
map(rethinking::HPDI, samples = male_babies_samples)
## [[1]]
## |0.5 0.5|
## 0.5255255 0.5725726
##
## [[2]]
## |0.89 0.89|
## 0.5015015 0.6116116
##
## [[3]]
## |0.97 0.97|
## 0.4764765 0.6286286
3.5.3 Simulate predictions
data_frame(sim_babies = rbinom(n = 1e4,
size = 200,
prob = male_babies_samples)) %>%
ggplot(aes(x = sim_babies)) + geom_histogram(binwidth = 1) +
geom_vline(xintercept = 111)
3.5.4 Now, using the same posterior for all births, consider the first born only!
data_frame(sim_babies = rbinom(n = 1e4,
size = length(birth1), #100
prob = male_babies_samples)) %>%
ggplot(aes(x = sim_babies)) + geom_histogram(binwidth = 1) +
geom_vline(xintercept = sum(birth1))
The model overestimates first-born boys? I wonder if these data come from a society that really values boys, such that if you have a daughter first you try again, hoping for a boy, but if you start with a boy you stop, and then you wouldn’t be in this sample (which contains two-child families only).
3.5.5 Simulate boys born after girls
If I was right above, the model should underestimate the number of boys here!
OK, so first we need to know which of the first births were girls.
## how many births after girls?
length(birth2[birth1 == 0])
## [1] 49
## how many of those were boys?
sum(birth2[birth1 == 0])
## [1] 39
data_frame(sim_babies = rbinom(n = 1e4,
size = 49, #total births after girls
prob = male_babies_samples)) %>%
ggplot(aes(x = sim_babies)) + geom_histogram(binwidth = 1) +
geom_vline(xintercept = sum(birth2[birth1 == 0]))
Wow, it definitely does underestimate it! The model does a terrible job of guessing the sex of a second birth.
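A rough numeric check of that gap, reusing male_babies_samples from above:
## expected boys among the 49 births that followed a girl, under the model,
## versus what was actually observed
mean(male_babies_samples) * 49 # roughly 0.55 * 49, i.e. about 27 boys expected
sum(birth2[birth1 == 0]) # 39 boys observed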