Note that the answers here might not be exactly in line with what is in the book, since the samples there come from R code.
We use some of the code from the pymc-devs Python/pymc3 port of the Statistical Rethinking code examples.
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
np.random.seed(100)
In the book a seed is set so the samples can be replicated exactly. Setting a NumPy seed will not reproduce the book's numbers, though, since R and NumPy use different random number generators; to match them exactly we would have to run the R code itself.
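As an aside (my own note, not from the book): NumPy's newer Generator API keeps the seed local rather than global, which is a little cleaner for reproducibility.
rng = np.random.default_rng(100)
# rng.choice(p_grid, p=posterior, size=num_samples) could then replace the np.random.choice calls below.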
def posterior_grid_approx_3m1(grid_points=1000, successes=6, tosses=9):
    # Grid of candidate values for p, with a flat prior
    p_grid = np.linspace(0, 1, grid_points)
    prior = np.repeat(1, grid_points)
    # Binomial likelihood of the observed successes at each grid point
    likelihood = stats.binom.pmf(successes, tosses, p_grid)
    unstd_posterior = likelihood * prior
    # Normalize so the posterior sums to 1
    posterior = unstd_posterior / unstd_posterior.sum()
    return p_grid, posterior
num_samples = int(1e4)
p_grid, posterior = posterior_grid_approx_3m1(grid_points=num_samples)
samples = np.random.choice(p_grid, p=posterior, size=num_samples, replace=True)
_, (ax0, ax1) = plt.subplots(1,2, figsize=(12,6))
ax0.plot(samples, 'o', alpha=0.2)
ax0.set_xlabel('sample number', fontsize=14)
ax0.set_ylabel('proportion water (p)', fontsize=14)
az.plot_kde(samples, ax=ax1)
ax1.set_xlabel('proportion water (p)', fontsize=14)
ax1.set_ylabel('density', fontsize=14);
sum(samples < 0.2) / num_samples
0.0003
sum(samples > 0.8) / num_samples
0.1176
sum((samples > 0.2) & (samples <= 0.8)) / num_samples
0.8821
np.percentile(samples, 20)
0.5188318831883189
np.percentile(samples, 80)
0.7595759575957596
az.hdi(samples, hdi_prob=0.66)
array([0.51605161, 0.78517852])
lower = (1-.66)/2
np.percentile(samples, [lower*100, 66+lower*100])
array([0.50063306, 0.77337734])
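As a quick sanity check (my own addition): the HDI is by definition the narrowest interval containing the requested mass, so its width should not exceed that of the central percentile interval.
hdi_66 = az.hdi(samples, hdi_prob=0.66)
pi_66 = np.percentile(samples, [lower * 100, 66 + lower * 100])
hdi_66[1] - hdi_66[0], pi_66[1] - pi_66[0]  # HDI width vs. percentile-interval width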
p_grid, posterior = posterior_grid_approx_3m1(grid_points=num_samples, successes=8, tosses=15)
plt.plot(p_grid, posterior)
plt.xlabel("proportion water (p)")
plt.ylabel("Density");
samples = np.random.choice(p_grid, p=posterior, size=num_samples, replace=True)
az.hdi(samples, hdi_prob=0.9)
array([0.3320332, 0.7189719])
w = stats.binom.rvs(n=15, p=samples, size=int(1e4))
bar_width = 0.1
# Bins must cover every possible count 0..15, since we simulated n=15 tosses
plt.hist(w, bins=np.arange(0, 17) - bar_width / 2, width=bar_width)
plt.xlim(0, 15.5)
plt.xlabel("dummy water count")
plt.ylabel("Frequency");
(w == 8).mean()
0.1436
w = stats.binom.rvs(n=9, p=samples, size=int(1e4))
(w == 6).mean()
0.1731
def posterior_grid_approx_3m5(grid_points=1000, successes=6, tosses=9):
    p_grid = np.linspace(0, 1, grid_points)
    # Step prior: zero below 0.5, constant above
    prior = (p_grid >= 0.5) * 1
    likelihood = stats.binom.pmf(successes, tosses, p_grid)
    unstd_posterior = likelihood * prior
    posterior = unstd_posterior / unstd_posterior.sum()
    return p_grid, posterior
p_grid, posterior = posterior_grid_approx_3m5(grid_points=num_samples, successes=8, tosses=15)
plt.plot(p_grid, posterior)
plt.xlabel("proportion water (p)")
plt.ylabel("Density");
samples = np.random.choice(p_grid, p=posterior, size=num_samples, replace=True)
az.hdi(samples, hdi_prob=0.9)
array([0.50005001, 0.71377138])
w = stats.binom.rvs(n=15, p=samples, size=int(1e4))
bar_width = 0.1
# As above, bins must cover every possible count 0..15
plt.hist(w, bins=np.arange(0, 17) - bar_width / 2, width=bar_width)
plt.xlim(0, 15.5)
plt.xlabel("dummy water count")
plt.ylabel("Frequency");
(w == 8).mean()
0.1603
w = stats.binom.rvs(n=9, p=samples, size=int(1e4))
(w == 6).mean()
0.231
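Compared with the flat prior, the truncated prior concentrates the posterior nearer the true value p = 0.7 that the book posits for the globe. A rough check of this (my own addition; names like samples_flat are mine):
p_grid_flat, posterior_flat = posterior_grid_approx_3m1(grid_points=num_samples, successes=8, tosses=15)
samples_flat = np.random.choice(p_grid_flat, p=posterior_flat, size=num_samples, replace=True)
# Posterior mass within 0.1 of the true value, flat prior vs. truncated prior
((samples_flat > 0.6) & (samples_flat < 0.8)).mean(), ((samples > 0.6) & (samples < 0.8)).mean()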
def tosses2width(n):
    # Simulate tossing the globe n times with the 'true' ratio p = 0.7
    obs = stats.binom.rvs(n=n, p=0.7)
    # Then do our inference
    p_grid, posterior = posterior_grid_approx_3m1(grid_points=num_samples, successes=obs, tosses=n)
    # Take samples from the posterior
    samples = np.random.choice(p_grid, p=posterior, size=int(1e5), replace=True)
    # Find the 99% percentile interval and its width
    interval = np.percentile(samples, [0.5, 99.5])
    interval_width = interval[1] - interval[0]
    return interval_width
y = np.array([tosses2width(n) for n in range(2000, 3000)])
x = range(2000,3000)
plt.plot(x, y)
plt.xlabel("number of tosses")
plt.ylabel("interval width");
Looks like somewhere between 2200 and 2400 tosses will result in an interval width of 0.05.
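To pin this down (my own addition; the estimate is rough, since each width comes from a single simulated dataset):
first_idx = int(np.argmax(y < 0.05))  # index of the first width below 0.05 (only meaningful if one exists)
x[first_idx]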
birth1 = (1,0,0,0,1,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,1,0,0,0,1,0,
0,0,0,1,1,1,0,1,0,1,1,1,0,1,0,1,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,
1,1,0,1,0,0,1,0,0,0,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,0,0,1,0,1,1,0,
1,0,1,1,1,0,1,1,1,1)
birth2 = (0,1,0,1,0,1,1,1,0,0,1,1,1,1,1,0,0,1,1,1,0,0,1,1,1,0,
1,1,1,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,0,1,1,0,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,1,
0,0,0,1,1,1,0,0,0,0)
p_grid = np.linspace(0,1,num=1000)
prior = np.repeat(1,1000)
num_boys = sum(birth1) + sum(birth2)
total = len(birth1) + len(birth2)
likelihood = stats.binom.pmf(num_boys, total, p_grid)
unstd_posterior = likelihood*prior
posterior = unstd_posterior / unstd_posterior.sum()
plt.plot(p_grid, posterior)
plt.xlabel('P(boy)')
plt.ylabel('density');
p_grid[np.argmax(posterior)]  # grid value that maximizes the posterior
0.5545545545545546
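Other point summaries fall straight out of the grid (my own addition, not asked for by the exercise):
np.sum(p_grid * posterior)  # posterior mean
p_grid[np.searchsorted(np.cumsum(posterior), 0.5)]  # posterior median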
samples = np.random.choice(p_grid, p=posterior, size=int(1e4))
az.hdi(samples, hdi_prob=.5)
array([0.52552553, 0.57257257])
az.hdi(samples, hdi_prob=.89)
array([0.5005005 , 0.61061061])
az.hdi(samples, hdi_prob=.97)
array([0.47347347, 0.62762763])
w = stats.binom.rvs(n=200, p=samples, size=int(1e4))
az.plot_kde(w)
plt.axvline(x=num_boys);
Yes: the observed number of boys sits near the center of the posterior predictive distribution, so the model makes the data it was fit to look quite likely.
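To put a number on that (my own addition): the fraction of simulated datasets with at least as many boys as observed should be near 0.5 if the observation is central.
(w >= num_boys).mean()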
w = stats.binom.rvs(n=100, p=samples, size=int(1e4))
az.plot_kde(w)
plt.axvline(x=sum(birth1));
The count actually observed in birth1 alone is not a central, likely outcome under the model estimated from all the births. This suggests that first and second births do not follow the same distribution.
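Quantifying that (my own addition): the fraction of simulated first-birth counts at or below the observed number of boys in birth1.
(w <= sum(birth1)).mean()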
num_girls_birth1 = len(birth1) - sum(birth1)
w = stats.binom.rvs(n=num_girls_birth1, p=samples, size=int(1e4))
num_boys_following_girls = sum((np.array(birth1) == 0) & (np.array(birth2) == 1))
num_girls_birth1
49
num_boys_following_girls
39
plt.hist(w,bins=15)
plt.axvline(x=num_boys_following_girls);
Surprisingly, of the 49 cases where the first birth was a girl, 39 were followed by a boy! The model's independence assumption can't be right: a first-born girl appears to make it much more likely that the next birth is a boy.
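How extreme is this under the model? A tail check (my own addition): the fraction of simulations with at least as many boys as we actually observed following first-born girls.
(w >= num_boys_following_girls).mean()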
We're seeing that among first births the numbers of boys and girls are roughly equal, but among second births boys are far more common.
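The raw proportions make this concrete (my own addition):
np.mean(birth1), np.mean(birth2)  # proportion of boys among first and second births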
My hypothesis is that the first child is accepted by the parents regardless of sex, but cultural pressure to have boys leads to frequent infanticide when a second girl is born. The infanticide is kept secret, so those latent girl births (which are in fact IID) never make it into the record.