import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt
import scipy.stats as stats
import seaborn as sns
from scipy.special import expit
from utils import draw_causal_graph, standardize
# Reproducibility and plotting defaults
RANDOM_SEED = 73
np.random.seed(RANDOM_SEED)
sns.set_context("talk")
sns.set_palette("colorblind")
cb_palette = sns.color_palette()  # colorblind palette, reused for manual color choices below
Notes
We’ll return to the wages and education example from Chapter 14 of Statistical Rethinking. However, instead of using wages as our outcome, we’ll tweak this by creating a binary outcome from wages, which I’ll refer to as $R$ for “rich”. The causal diagram will look the same as a traditional IV setup.
draw_causal_graph(
edge_list=[("Q", "E"), ("U", "E"), ("U", "R"), ("E", "R")],
node_props={"U": {"style": "dashed"}},
graph_direction="TD",
)
Per usual, we’ll use this to generate a simulated dataset. We’ll use the same code but this time derive R
from W
. The rationale is that those with higher wages are more likely to be rich. We’re still making the influence of education on wages (bEW_sim
) equal to 0. We’ll want to get this value back in our statistical models, even though we’ll be using R
as our ultimate outcome variable.
A key conceptual point is use of a logit link function to produce R
from W
. Additionally, we’ll make use of a binomial generalized linear model.
def generate_data(num_subjects, n_binomial_param) -> pd.DataFrame:
    """Generate simulated IV data with a binomial ("rich") outcome.

    Simulates the causal graph Q -> E <- U -> W -> R, with the direct
    effect of education on wages (``bEW_sim``) fixed at 0 so inferential
    models should recover a null effect.

    Parameters
    ----------
    num_subjects
        Number of subjects/rows in data
    n_binomial_param
        Number of "observations" for each subject;
        a parameter in the binomial GLM

    Returns
    -------
    sim_df
        pd.DataFrame with standardized W, E, Q columns plus R (binomial
        count of "rich" observations) and R_size (the binomial n).
    """
    bEW_sim = 0  # true effect of education on wages; models should recover ~0
    U_sim = np.random.normal(size=num_subjects)  # unobserved confounder
    Q_sim = np.random.randint(1, 5, size=num_subjects)  # instrument: birth quarter (1-4)
    E_sim = np.random.normal(loc=U_sim + Q_sim, size=num_subjects)
    W_sim = np.random.normal(loc=U_sim + bEW_sim * E_sim, size=num_subjects)
    sim_df = pd.DataFrame.from_dict(
        {"W": standardize(W_sim), "E": standardize(E_sim), "Q": standardize(Q_sim)}
    )
    # Use of link function (inverse logit) to generate R: higher
    # unstandardized wages -> higher probability of being "rich".
    # (Removed an unused `index_val = sim_df.index.values` local.)
    sim_df["R"] = stats.binom.rvs(n=n_binomial_param, p=expit(W_sim))
    sim_df["R_size"] = n_binomial_param
    return sim_df
# Simulate the working dataset: 500 subjects, binomial n of 2
dat_sim = generate_data(num_subjects=500, n_binomial_param=2)
dat_sim.head()
W | E | Q | R | R_size | |
---|---|---|---|---|---|
0 | -1.327032 | -0.600469 | 0.453638 | 1 | 2 |
1 | -0.108554 | -0.159673 | -1.318387 | 0 | 2 |
2 | -0.908108 | 1.066808 | 0.453638 | 2 | 2 |
3 | -2.187143 | -2.856829 | -1.318387 | 0 | 2 |
4 | 0.044873 | -0.024763 | -0.432374 | 1 | 2 |
As usual, visualizing the data can give us some insight into how the data appears and where confounds may mislead. We know that Q
is a cause of E
and therefore the association we see is reflective of causation. The bottom-left figure W
vs. R
is a logit transformation of the former into the latter. However, both the plots in the right column are a result of the confound U
. It is driving the relationship between E
and W
and therefore we also see an association between E
and R
.
def plot_variable_relationships(sim_df, title):
    """Plot pairwise relationships among Q, E, W, and R.

    Left column shows causal relationships (Q -> E, and W -> R via the
    logit link); right column shows associations driven by the
    unobserved confound U.

    Parameters
    ----------
    sim_df
        Simulated dataset from ``generate_data``.
    title
        Figure-level title.
    """
    f, ((ax0, ax1), (ax2, ax3)) = plt.subplots(2, 2, figsize=(12, 12))
    sns.scatterplot(data=sim_df, x="Q", y="E", marker=r"$\circ$", ax=ax0)
    ax0.set_title("Q vs. E")
    sns.scatterplot(data=sim_df, x="E", y="W", marker=r"$\circ$", ax=ax1)
    ax1.set_title("E vs. W\n(confounded by U)")
    sns.boxplot(data=sim_df, x="W", y="R", orient="h", color=cb_palette[0], ax=ax2)
    # BUG FIX: this panel originally plotted the module-level `dat_sim_n1`
    # instead of the `sim_df` argument, so it ignored the data passed in.
    sns.scatterplot(data=sim_df, x="W", y="R", marker=r"$\circ$", ax=ax2)
    ax2.invert_yaxis()
    ax2.set_title("W vs. R")
    sns.boxplot(data=sim_df, x="E", y="R", orient="h", color=cb_palette[0], ax=ax3)
    sns.scatterplot(data=sim_df, x="E", y="R", marker=r"$\circ$", ax=ax3)
    ax3.invert_yaxis()
    ax3.set_title("E vs. R\n(confounded by U)")
    f.suptitle(title)
    f.tight_layout()
plot_variable_relationships(dat_sim, "number of subjects=500, n binom param=2")
We’ll use the Bayesian approach to run our inferential model. Again, the important thing is the link function to get our count output.
\[R_i \sim \text{Binomial}(n_i, p_i)\] \[\text{logit}(p_i) = W_i\] \[\left( \begin{array}{c} W_i \\ {E_i} \end{array} \right) \sim \text{MVNormal} \left( \begin{array}{c}{\mu_{W_i}} \\ {\mu_{E_i} } \end{array} , \textbf{S} \right)\] \[\mu_{W_i} = \alpha_W + \beta_{EW} W_i\] \[\mu_{E_i} = \alpha_E + \beta_{QE} E_i\] \[\alpha_W, \alpha_E \sim \text{Normal}(0, 0.2)\] \[\beta_{EW}, \beta_{QE} \sim \text{Normal}(0, 1.5)\] \[\textbf{S} = \begin{pmatrix} \sigma_{W}^2 & \rho\sigma_{W}\sigma_{E} \\ \rho\sigma_{W}\sigma_{E} & \sigma_{E}^2 \end{pmatrix} = \begin{pmatrix} \sigma_{P} & 0 \\ 0 & \sigma_{\beta} \end{pmatrix} \textbf{R} \begin{pmatrix} \sigma_{W} & 0 \\ 0 & \sigma_{E} \end{pmatrix}\] \[\textbf{R} \sim \text{LKJCorr}(2)\]

def run_bayesian_iv_model_binary(data_df):
    """Bayesian IV model for the education/rich binary (binomial) outcome.

    Parameters
    ----------
    data_df
        Generated dataset (from ``generate_data``); must contain columns
        E, Q, R, and R_size.

    Returns
    -------
    :
        pymc idata object
    """
    index_vals = data_df.index.values
    with pm.Model() as model:
        # Intercept and slope priors (data are standardized, hence tight scales)
        aW = pm.Normal("aW", 0.0, 0.2)
        aE = pm.Normal("aE", 0.0, 0.2)
        bEW = pm.Normal("bEW", 0.0, 0.5)
        bQE = pm.Normal("bQE", 0.0, 0.5)
        muW = pm.Deterministic("muW", aW + bEW * data_df.E.values)
        muE = pm.Deterministic("muE", aE + bQE * data_df.Q.values)
        # Cholesky-factored covariance with an LKJ prior on the correlation;
        # the off-diagonal term is what absorbs the unobserved confound U.
        chol, _, _ = pm.LKJCholeskyCov(
            "chol_cov", n=2, eta=2, sd_dist=pm.Exponential.dist(1.0), compute_corr=True
        )
        # multivariate regression
        MU = pt.stack([muW, muE]).T
        # NOTE(review): the first column of the observed pair is R (the
        # binomial count), not W — confirm this is intentional, since `p`
        # below becomes the inverse-logit of the observed R values.
        YY_obs = pm.Data("YY_obs", data_df[["R", "E"]].values)
        YY = pm.MvNormal("YY", mu=MU, chol=chol, observed=YY_obs)
        # link function
        p = pm.Deterministic("p", pm.math.invlogit(YY[index_vals, 0]))
        R = pm.Binomial("R", n=data_df["R_size"], p=p, observed=data_df["R"])
        idata = pm.sample(1000, random_seed=RANDOM_SEED, target_accept=0.95)
    idata.rename({"chol_cov_corr": "Rho", "chol_cov_stds": "Sigma"}, inplace=True)
    return idata
idata_14_6_logit = run_bayesian_iv_model_binary(dat_sim)
Sampling 4 chains, 0 divergences ━━━━━━━━━━━━╸━━━━━━━━━━━━━━━━━━━━━━━━━━━ 32% 0:00:17 / 0:00:10
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.
Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.
Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.
Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.
Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 48 seconds.
f, ax0 = plt.subplots(1, 1, figsize=(6, 5), sharex=True)
az.plot_forest(idata_14_6_logit, var_names=["bEW", "bQE"], combined=True, ax=ax0)
ax0.set_title("m14.6 with logistic link")
Text(0.5, 1.0, 'm14.6 with logistic link')
As you can see, we get similar coefficient estimates as in the original wages and education example.
A practical question we might have in a real world scenario is how many subjects we might need and how many observations we might want per person. Generally, more people will give us more power for our estimate. We can run experiments to help us understand how these parameters influence the credible interval of our estimate.
num_subjects_list = [25, 100, 200, 500]
n_binomial_param_list = [1, 2, 5]
def format_summary_table(
    idata, num_subjects, n_binomial_param, var_names=("bEW", "bQE")
):
    """Summarize posterior coefficients and tag the experiment settings.

    Parameters
    ----------
    idata
        pymc InferenceData from a fitted model.
    num_subjects
        Number of subjects used in the experiment; recorded as a column.
    n_binomial_param
        Binomial n used in the experiment; recorded as a column.
    var_names
        Coefficients to include in the summary (default: bEW and bQE).

    Returns
    -------
    pd.DataFrame
        az.summary table with num_subjects / n_binomial_param columns added.
    """
    # Tuple default avoids the shared-mutable-default-argument pitfall of
    # the original `var_names=["bEW", "bQE"]`.
    df = (
        az.summary(idata, var_names=list(var_names))
        .assign(num_subjects=num_subjects)
        .assign(n_binomial_param=n_binomial_param)
    )
    return df
# Run the experiment grid: fit the model on fresh simulated data for each
# (num_subjects, n_binomial_param) combination and collect summaries.
df_summary = list()
for num_subjects in num_subjects_list:
    for n_binomial_param in n_binomial_param_list:
        print(
            f"Running {num_subjects} num_subjects, {n_binomial_param} n_binomial_param..."
        )
        # Fresh simulated dataset for this experimental cell
        dat_sim_expt = generate_data(
            num_subjects=num_subjects, n_binomial_param=n_binomial_param
        )
        idata_14_6_logit_expt = run_bayesian_iv_model_binary(dat_sim_expt)
        df_expt_summary = format_summary_table(
            idata_14_6_logit_expt,
            num_subjects=num_subjects,
            n_binomial_param=n_binomial_param,
        )
        df_summary.append(df_expt_summary)
        print("Done with experiment")
# Stack the per-experiment summaries into one table
df_summary = pd.concat(df_summary)
df_summary
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | num_subjects | n_binomial_param | |
---|---|---|---|---|---|---|---|---|---|---|---|
bEW | 0.254 | 0.163 | -0.067 | 0.556 | 0.005 | 0.003 | 1330.0 | 1345.0 | 1.0 | 25 | 1 |
bQE | 0.430 | 0.174 | 0.087 | 0.741 | 0.003 | 0.002 | 3144.0 | 2492.0 | 1.0 | 25 | 1 |
bEW | 0.075 | 0.290 | -0.491 | 0.606 | 0.007 | 0.005 | 1558.0 | 1779.0 | 1.0 | 25 | 2 |
bQE | 0.572 | 0.158 | 0.256 | 0.841 | 0.003 | 0.002 | 2945.0 | 2536.0 | 1.0 | 25 | 2 |
bEW | -0.118 | 0.447 | -1.018 | 0.664 | 0.009 | 0.007 | 2226.0 | 2858.0 | 1.0 | 25 | 5 |
bQE | 0.503 | 0.153 | 0.203 | 0.777 | 0.003 | 0.002 | 2942.0 | 2430.0 | 1.0 | 25 | 5 |
bEW | 0.069 | 0.077 | -0.078 | 0.212 | 0.002 | 0.001 | 2062.0 | 1784.0 | 1.0 | 100 | 1 |
bQE | 0.631 | 0.078 | 0.491 | 0.780 | 0.001 | 0.001 | 4154.0 | 3071.0 | 1.0 | 100 | 1 |
bEW | 0.097 | 0.134 | -0.157 | 0.339 | 0.003 | 0.002 | 1835.0 | 1786.0 | 1.0 | 100 | 2 |
bQE | 0.589 | 0.079 | 0.442 | 0.736 | 0.001 | 0.001 | 2903.0 | 2622.0 | 1.0 | 100 | 2 |
bEW | -0.723 | 0.394 | -1.434 | 0.028 | 0.012 | 0.009 | 1025.0 | 1450.0 | 1.0 | 100 | 5 |
bQE | 0.555 | 0.087 | 0.389 | 0.718 | 0.002 | 0.002 | 1328.0 | 2054.0 | 1.0 | 100 | 5 |
bEW | -0.026 | 0.055 | -0.129 | 0.077 | 0.001 | 0.001 | 2076.0 | 2150.0 | 1.0 | 200 | 1 |
bQE | 0.637 | 0.055 | 0.533 | 0.738 | 0.001 | 0.001 | 3645.0 | 2818.0 | 1.0 | 200 | 1 |
bEW | 0.241 | 0.086 | 0.083 | 0.405 | 0.002 | 0.001 | 2431.0 | 2040.0 | 1.0 | 200 | 2 |
bQE | 0.607 | 0.055 | 0.509 | 0.715 | 0.001 | 0.001 | 3814.0 | 2899.0 | 1.0 | 200 | 2 |
bEW | -0.232 | 0.233 | -0.659 | 0.200 | 0.007 | 0.005 | 1234.0 | 1331.0 | 1.0 | 200 | 5 |
bQE | 0.589 | 0.060 | 0.479 | 0.704 | 0.001 | 0.001 | 1758.0 | 2294.0 | 1.0 | 200 | 5 |
bEW | -0.013 | 0.035 | -0.077 | 0.054 | 0.001 | 0.001 | 2187.0 | 2619.0 | 1.0 | 500 | 1 |
bQE | 0.640 | 0.034 | 0.581 | 0.707 | 0.001 | 0.000 | 3164.0 | 3009.0 | 1.0 | 500 | 1 |
bEW | 0.109 | 0.057 | -0.001 | 0.212 | 0.001 | 0.001 | 2186.0 | 2422.0 | 1.0 | 500 | 2 |
bQE | 0.617 | 0.035 | 0.548 | 0.682 | 0.001 | 0.000 | 3594.0 | 2525.0 | 1.0 | 500 | 2 |
bEW | 0.026 | 0.120 | -0.202 | 0.240 | 0.003 | 0.002 | 1838.0 | 2026.0 | 1.0 | 500 | 5 |
bQE | 0.605 | 0.036 | 0.538 | 0.673 | 0.001 | 0.001 | 2578.0 | 2687.0 | 1.0 | 500 | 5 |
f, (ax0, ax1) = plt.subplots(1, 2, figsize=(12, 5), sharey=True)
def plot_mean_and_ci(coefficient, ax):
    """Plot posterior means and HDIs of a coefficient vs. number of subjects.

    Draws one point-and-interval per (num_subjects, n_binomial_param)
    cell of the global ``df_summary``, horizontally jittered so the three
    binomial settings don't overlap.

    Parameters
    ----------
    coefficient
        Coefficient name in df_summary's index ("bEW" or "bQE").
    ax
        Matplotlib axes to draw on.
    """
    x_shifts = [-15, 0, 15]  # horizontal jitter per binomial setting
    n_binomial_param_list = [1, 2, 5]
    colors = cb_palette[0:3]
    for n_binom, x_shift, color in zip(n_binomial_param_list, x_shifts, colors):
        df = (
            df_summary.reset_index(names="coefficient")
            .query("coefficient==@coefficient")
            .query("n_binomial_param==@n_binom")
        )
        # BUG FIX: `facecolors=None` falls back to the default (filled)
        # markers; the string "none" is required for open circles.
        ax.scatter(
            df["num_subjects"] + x_shift,
            df["mean"],
            facecolors="none",
            edgecolor=color,
            s=2**4,
            label=n_binom,
        )
        ax.vlines(
            x=df["num_subjects"] + x_shift,
            ymin=df["hdi_3%"],
            ymax=df["hdi_97%"],
            linewidth=0.75,
            color=color,
        )
    ax.set_title(coefficient)
    ax.set(
        xlabel="Number of subjects",
        ylabel="Estimate"
    )
    if coefficient == "bQE":
        ax.legend(title='n_binomial_param', title_fontsize=12, fontsize=12, loc='lower right')
plot_mean_and_ci("bEW", ax0)
plot_mean_and_ci("bQE", ax1)
g = sns.relplot(
data=df_summary.reset_index(names="coefficient"),
x="num_subjects",
y="sd",
hue="n_binomial_param",
hue_order = [1,2,5],
col="coefficient",
palette=cb_palette[0:3],
kind='line'
)
g.set_titles(col_template="{col_name}")
g.set_axis_labels(x_var="Number of subjects", y_var="SD of coefficient")
plt.setp(g._legend.get_texts(), fontsize='12')
plt.setp(g._legend.get_title(), fontsize='12');
First, it makes sense that the variability of bQE
is not affected by the binomial parameter since this coefficient is not dependent on the logit transformation. The variability of the bEW
decreases with the number of subjects, as one might intuit. But I didn’t necessarily expect that a higher $n$ as a binomial parameter would lead to higher variability.
My motivation for this post was understanding how to implement IV analysis with a binary outcome. I also ran an experiment to determine appropriate sample sizes. Some of the coding implementation was tricky for me initially. Thanks to the man himself for providing some help.
%load_ext watermark
%watermark -n -u -v -iv -w
The watermark extension is already loaded. To reload it, use:
%reload_ext watermark
Last updated: Wed Jun 12 2024
Python implementation: CPython
Python version : 3.12.3
IPython version : 8.24.0
pymc : 5.15.0
matplotlib: 3.8.4
scipy : 1.13.0
seaborn : 0.13.2
numpy : 1.26.4
pandas : 2.2.2
arviz : 0.18.0
pytensor : 2.20.0
Watermark: 2.4.3
statsmodels
. Both the Bayesian approach and the 2SLS method rely on minimizing the influence of confounds between $X$ and $Y$ to uncover the true, causal relationship between them. The key is understanding how confounds lurk in the error terms of respective models and then see how the instrument addresses them.
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.special import expit
from statsmodels.sandbox.regression.gmm import IV2SLS
from utils import draw_causal_graph, standardize
RANDOM_SEED = 73
np.random.seed(RANDOM_SEED)
sns.set_context("talk")
sns.set_palette("colorblind")
I will use the example of IV analysis described in Chapter 14 of Statistical Rethinking by Richard McElreath. This example is an attempt to ask the question, how much does education E
influence someone’s wages W
? Does more schooling lead to more income? We cannot simply regress W
on E
because of variables (confounds) that can influence both. One confound could be work ethic. Someone who works hard is motivated to seek more education and be driven at their job to earn more. How would IV analysis then be used to remove the influence of such a confound? By identifying another variable (the instrument) that would influence only education and not wages. In McElreath’s example, the quarter that someone was born (instrumental variable Q
) can be used to deconfound E
on W
. Why can this work? Per the book: “… people born earlier in the year tend to get less schooling. This is both because they are biologically older when they start school and because they become eligible to drop out of school earlier.” Here is our causal graph, where the variable U
represents unobserved confounders.
draw_causal_graph(
edge_list=[("Q", "E"), ("U", "E"), ("U", "W"), ("E", "W")],
node_props={"U": {"style": "dashed"}},
graph_direction="TD",
)
This causal structure can be used to inform data that we’ll simulate for this exercise. Note that we’re explicitly making the influence of education on wages (bEW_sim
) equal to 0. This is the estimate we want to recover in all inferential
statistical models. We’ll standardize all variables to facilitate model building and interpretation of results.
# Simulate N subjects under the IV causal graph; the true effect of
# education on wages (bEW_sim) is 0, which the models should recover.
N = 500
bEW_sim = 0
U_sim = np.random.normal(size=N)  # unobserved confounder
Q_sim = np.random.randint(1, 5, N)  # instrument: birth quarter (1-4)
E_sim = np.random.normal(loc=U_sim + Q_sim, size=N)
W_sim = np.random.normal(loc=U_sim + bEW_sim * E_sim, size=N)
# Standardize all variables to ease model building and interpretation
dat_sim = pd.DataFrame.from_dict(
    {"W": standardize(W_sim), "E": standardize(E_sim), "Q": standardize(Q_sim)}
)
To enhance our understanding, we can visualize the data but being conscious that looking at the data only tells us nothing about causal relationships. The figure on the left is showing a relationship between Q
and E
but so does E
and W
on the right. Only the left figure is a causal relationship. We set bEW_sim
to 0 so we know the correlation we’re seeing is a result of the confound U
.
f, (ax0, ax1) = plt.subplots(1, 2, figsize=(12, 5))
sns.scatterplot(data=dat_sim, x="Q", y="E", marker=r"$\circ$", ax=ax0)
ax0.set_title("Q vs. E")
sns.scatterplot(data=dat_sim, x="E", y="W", marker=r"$\circ$", ax=ax1)
ax1.set_title("E vs. W\n(confounded by U)")
f.tight_layout()
As the book details, if we create simple linear regression models (since we’re pretending U
is unobserved) we will get incorrect, biased estimates. Including Q
will result in bias amplification, making the estimate worse. But with this causal graph structure, IV analysis is applicable for proper estimation of E
on W
. We’ll start with a Bayesian approach for IV analysis before doing 2SLS.
The variable U
is a confound that acts as a fork creating a correlation between E
and W
. The Bayesian approach involves use of a multivariate linear model (e.g. multiple outcome variables) that acknowledges this covariance. By embedding the correlation structure into the model, we can recover a proper coefficient for education on wages $\beta_{EW}$. Here is how we can include the observed variables in one model for the Bayesian approach, as shown on page 458 of Statistical Rethinking.
\(\mu_{W_i} = \alpha_W + \beta_{EW} E_i\) \(\mu_{E_i} = \alpha_E + \beta_{QE} Q_i\) \(\alpha_W, \alpha_E \sim \text{Normal}(0, 0.2)\) \(\beta_{EW}, \beta_{QE} \sim \text{Normal}(0, 0.5)\)
\[\textbf{S} = \begin{pmatrix} \sigma_{W}^2 & \rho\sigma_{W}\sigma_{E} \\ \rho\sigma_{W}\sigma_{E} & \sigma_{E}^2 \end{pmatrix} = \begin{pmatrix} \sigma_{P} & 0 \\ 0 & \sigma_{\beta} \end{pmatrix} \textbf{R} \begin{pmatrix} \sigma_{W} & 0 \\ 0 & \sigma_{E} \end{pmatrix}\] \[\textbf{R} \sim \text{LKJCorr}(2)\]We can implement these equations and run the Bayesian statistical model. This is the book’s description of model 14.6 and I am using the pymc translation of R code 14.26.
# Statistical Rethinking model 14.6: a multivariate-normal outcome (W, E)
# whose covariance structure absorbs the unobserved confound U.
with pm.Model() as m14_6:
    # Intercept and slope priors (data are standardized, hence tight scales)
    aW = pm.Normal("aW", 0.0, 0.2)
    aE = pm.Normal("aE", 0.0, 0.2)
    bEW = pm.Normal("bEW", 0.0, 0.5)
    bQE = pm.Normal("bQE", 0.0, 0.5)
    muW = aW + bEW * dat_sim.E.values
    muE = aE + bQE * dat_sim.Q.values
    # Cholesky-factored covariance with an LKJ prior on the correlation
    chol, _, _ = pm.LKJCholeskyCov(
        "chol_cov", n=2, eta=2, sd_dist=pm.Exponential.dist(1.0), compute_corr=True
    )
    WE_obs = pm.Data("WE_obs", dat_sim[["W", "E"]].values, mutable=True)
    WE = pm.MvNormal("WE", mu=pt.stack([muW, muE]).T, chol=chol, observed=WE_obs)
    trace_14_6 = pm.sample(1000, random_seed=RANDOM_SEED)
# Rename for readability: correlation matrix -> Rho, std deviations -> Sigma
trace_14_6.rename({"chol_cov_corr": "Rho", "chol_cov_stds": "Sigma"}, inplace=True)
df_trace_14_6_summary = az.summary(
trace_14_6, var_names=["aW", "aE", "bEW", "bQE", "Rho", "Sigma"], round_to=2
)
df_trace_14_6_summary
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
aW | -0.00 | 0.04 | -0.08 | 0.08 | 0.0 | 0.0 | 4150.99 | 3688.87 | 1.0 |
aE | -0.00 | 0.03 | -0.06 | 0.06 | 0.0 | 0.0 | 3692.50 | 2853.40 | 1.0 |
bEW | 0.05 | 0.06 | -0.08 | 0.16 | 0.0 | 0.0 | 2131.69 | 2631.97 | 1.0 |
bQE | 0.68 | 0.03 | 0.62 | 0.74 | 0.0 | 0.0 | 2963.05 | 3214.42 | 1.0 |
Rho[0, 0] | 1.00 | 0.00 | 1.00 | 1.00 | 0.0 | 0.0 | 4000.00 | 4000.00 | NaN |
Rho[0, 1] | 0.46 | 0.05 | 0.36 | 0.56 | 0.0 | 0.0 | 2179.65 | 2729.02 | 1.0 |
Rho[1, 0] | 0.46 | 0.05 | 0.36 | 0.56 | 0.0 | 0.0 | 2179.65 | 2729.02 | 1.0 |
Rho[1, 1] | 1.00 | 0.00 | 1.00 | 1.00 | 0.0 | 0.0 | 3641.00 | 3807.84 | 1.0 |
Sigma[0] | 0.99 | 0.04 | 0.92 | 1.06 | 0.0 | 0.0 | 2727.13 | 2943.37 | 1.0 |
Sigma[1] | 0.73 | 0.02 | 0.69 | 0.78 | 0.0 | 0.0 | 4500.07 | 3156.00 | 1.0 |
The influence of education on wages in our statistical model bEW
captures 0 (ranges from -0.08 to 0.16), making it consistent with what we had used for bEW_sim
to generate our data. This is because we were able to account for the correlation between E
and W
. As we can see the off-diagonal terms for Rho
is positive.
Excellent! We’ve done the first objective of this post. Now let’s see how we would do it with two-stage least squares.
As the name implies, here we have two models using ordinary least squares.
In the first stage, we use our instrument Q
as our predictor variable and E
will be the outcome. We can completely ignore W
here. But we know from our generated data that U
is influencing E
. If we were to acknowledge U
, then the linear equation for E
would look like this.
$ E = \alpha + \beta_{QE}Q + \beta_{UE}U$
But since we’re pretending that we don’t know about U
in our inferential models, the influence of U
would be noise which I denote as $\epsilon$ here.
$ E = \alpha + \beta_{QE}Q + \epsilon$
This is our “first stage” equation. Using the fitted model of this first-stage equation, we run Q
back through the model, ignoring noise, and get predicted values of E
which we’ll call E_hat
. The values of E_hat
are now free from the influence of U
.
$ \hat{E} = \alpha + \beta_{QE}Q$
From a causal perspective, this results in cutting the backdoor from W
to E
. In the second-stage model, we can then use E_hat
as the predictor variable for W
. We can then see we get a proper estimate for the coefficient.
draw_causal_graph(
edge_list=[("Q", "E_hat"), ("U", "W"), ("E_hat", "W")],
node_props={"U": {"style": "dashed"}},
edge_props={
("Q", "E_hat"): {"label": "1st stage"},
("E_hat", "W"): {"label": "2nd stage"},
},
graph_direction="TD",
)
Let’s do these steps manually using OLS before trying with the statsmodels IV2SLS
function.
OLS
# First stage: Regress education on Q
first_stage = sm.OLS(dat_sim["E"], sm.add_constant(dat_sim["Q"])).fit()
# Predicted education (E_hat) added to df; E_hat is purged of U's influence
dat_sim["E_hat"] = first_stage.predict(sm.add_constant(dat_sim["Q"]))
dat_sim.head()
W | E | Q | E_hat | |
---|---|---|---|---|
0 | 1.147366 | 1.442103 | 0.428599 | 0.292607 |
1 | 1.153508 | 0.386141 | -1.372236 | -0.936835 |
2 | 0.948497 | 0.413627 | -0.471819 | -0.322114 |
3 | -0.137755 | 0.193661 | -0.471819 | -0.322114 |
4 | -1.653446 | -2.076577 | -0.471819 | -0.322114 |
# Second stage: Regress wages on predicted education (instrumented)
second_stage = sm.OLS(dat_sim["W"], sm.add_constant(dat_sim["E_hat"])).fit()
# Summary of the second stage regression
second_stage.summary()
Dep. Variable: | W | R-squared: | 0.001 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | -0.001 |
Method: | Least Squares | F-statistic: | 0.5562 |
Date: | Tue, 11 Jun 2024 | Prob (F-statistic): | 0.456 |
Time: | 12:20:05 | Log-Likelihood: | -709.19 |
No. Observations: | 500 | AIC: | 1422. |
Df Residuals: | 498 | BIC: | 1431. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 2.776e-17 | 0.045 | 6.2e-16 | 1.000 | -0.088 | 0.088 |
E_hat | 0.0489 | 0.066 | 0.746 | 0.456 | -0.080 | 0.178 |
Omnibus: | 0.243 | Durbin-Watson: | 1.908 |
---|---|---|---|
Prob(Omnibus): | 0.885 | Jarque-Bera (JB): | 0.357 |
Skew: | 0.014 | Prob(JB): | 0.836 |
Kurtosis: | 2.872 | Cond. No. | 1.46 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
While the question of interest is not identifying the $\beta_{QE}$ coefficient, we can get this value.
first_stage.summary()
Dep. Variable: | E | R-squared: | 0.466 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.465 |
Method: | Least Squares | F-statistic: | 434.7 |
Date: | Tue, 11 Jun 2024 | Prob (F-statistic): | 7.20e-70 |
Time: | 12:20:05 | Log-Likelihood: | -552.59 |
No. Observations: | 500 | AIC: | 1109. |
Df Residuals: | 498 | BIC: | 1118. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -1.665e-16 | 0.033 | -5.09e-15 | 1.000 | -0.064 | 0.064 |
Q | 0.6827 | 0.033 | 20.850 | 0.000 | 0.618 | 0.747 |
Omnibus: | 0.118 | Durbin-Watson: | 2.053 |
---|---|---|---|
Prob(Omnibus): | 0.943 | Jarque-Bera (JB): | 0.036 |
Skew: | -0.007 | Prob(JB): | 0.982 |
Kurtosis: | 3.039 | Cond. No. | 1.00 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Now let’s implement this with statsmodels IV2SLS
. The coding is a lot simpler but it only provides the values for the second stage.
IV2SLS
# NOTE(review): the original comment said a constant (intercept) is added
# here, but no constant is actually included — IV2SLS is fit on E alone
# with Q as the instrument. Confirm whether an intercept was intended.
df_iv2sls = dat_sim.copy()
fit_iv2sls = IV2SLS(
    endog=df_iv2sls["W"], exog=df_iv2sls["E"], instrument=df_iv2sls["Q"]
).fit()
fit_iv2sls.summary()
Dep. Variable: | W | R-squared: | 0.035 |
---|---|---|---|
Model: | IV2SLS | Adj. R-squared: | 0.033 |
Method: | Two Stage | F-statistic: | nan |
Least Squares | Prob (F-statistic): | nan | |
Date: | Tue, 11 Jun 2024 | ||
Time: | 12:20:05 | ||
No. Observations: | 500 | ||
Df Residuals: | 499 | ||
Df Model: | 1 |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
E | 0.0489 | 0.064 | 0.760 | 0.448 | -0.078 | 0.175 |
Omnibus: | 0.308 | Durbin-Watson: | 1.901 |
---|---|---|---|
Prob(Omnibus): | 0.857 | Jarque-Bera (JB): | 0.424 |
Skew: | 0.013 | Prob(JB): | 0.809 |
Kurtosis: | 2.860 | Cond. No. | 1.00 |
Let’s plot the point estimates and the 95% credible/confidence intervals.
# bayesian values
df_bayesian = (
df_trace_14_6_summary.loc[["bQE", "bEW"], ["mean", "hdi_3%", "hdi_97%"]]
.reset_index(names="coefficient")
.rename(columns=dict(zip(["hdi_3%", "hdi_97%"], ["lower", "upper"])))
.assign(approach="Bayesian")
)
# OLS values
df_ols = pd.concat(
[
pd.concat([first_stage.params, first_stage.conf_int()], axis=1),
pd.concat([second_stage.params, second_stage.conf_int()], axis=1),
]
)
df_ols.columns = ["mean", "lower", "upper"]
df_ols = (
df_ols.drop("const")
.rename(index={"Q": "bQE", "E_hat": "bEW"})
.reset_index(names="coefficient")
.assign(approach="OLS")
)
# IV2SLS values
df_iv2sls = pd.concat(
[
pd.concat([fit_iv2sls.params, fit_iv2sls.conf_int()], axis=1),
]
)
df_iv2sls.columns = ["mean", "lower", "upper"]
df_iv2sls = (
df_iv2sls.rename(index={"E": "bEW"})
.reset_index(names="coefficient")
.assign(approach="IV2SLS")
)
df_estimates = pd.concat([df_bayesian, df_ols, df_iv2sls], ignore_index=True)
df_estimates
coefficient | mean | lower | upper | approach | |
---|---|---|---|---|---|
0 | bQE | 0.680000 | 0.620000 | 0.740000 | Bayesian |
1 | bEW | 0.050000 | -0.080000 | 0.160000 | Bayesian |
2 | bQE | 0.682707 | 0.618376 | 0.747039 | OLS |
3 | bEW | 0.048923 | -0.079966 | 0.177811 | OLS |
4 | bEW | 0.048923 | -0.077613 | 0.175458 | IV2SLS |
def plot_coef_estimates(ax, coefficient):
    """Draw point estimates and interval bars for one coefficient.

    Approaches missing an estimate for this coefficient (e.g. bQE has no
    IV2SLS row) are padded with None so all three y positions stay aligned.
    """
    subset = df_estimates.query("coefficient==@coefficient")

    def _padded(column):
        # Pad to three entries when only two approaches estimated this
        # coefficient; matplotlib skips None values.
        values = subset[column].tolist()
        return values + [None] if len(subset) == 2 else values

    y_positions = range(3)
    ax.scatter(
        x=_padded("mean"),
        y=y_positions,
    )
    ax.hlines(
        xmin=_padded("lower"),
        xmax=_padded("upper"),
        y=y_positions,
    )
    ax.set_ylim([-1, 3])
    ax.set_yticks(y_positions)
    ax.set_yticklabels(["Bayesian", "OLS", "IV2SLS"])
    ax.invert_yaxis()
    ax.set_title(coefficient)
    return ax
f, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 3), sharey=True)
plot_coef_estimates(ax0, "bQE")
plot_coef_estimates(ax1, "bEW")
f.tight_layout()
The Bayesian approach credible interval spans 0 like the statsmodels
approaches, which reflects the coefficient value we used to generate our data. The epiphany for me is understanding how each model accounts for the confound (U
). The Bayesian approach parameterizes the unobserved confound in the covariance matrix so proper credit gets assigned to bEW (as 0). 2SLS essentially cuts the backdoor from W < U > E
. This shows we can recover the correct estimate of a confounded variable in this situation with different methods of instrumental variable analysis.
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Tue Jun 11 2024
Python implementation: CPython
Python version : 3.12.3
IPython version : 8.24.0
scipy : 1.13.0
pymc : 5.15.0
pandas : 2.2.2
matplotlib : 3.8.4
numpy : 1.26.4
arviz : 0.18.0
seaborn : 0.13.2
statsmodels: 0.14.2
pytensor : 2.20.0
Watermark: 2.4.3
import graphviz as gr
def draw_causal_graph(
    edge_list, node_props=None, edge_props=None, graph_direction="UD"
):
    """Utility to draw a causal (directed) graph

    Taken from: https://github.com/dustinstansbury/statistical-rethinking-2023/blob/a0f4f2d15a06b33355cf3065597dcb43ef829991/utils.py#L52-L66

    Parameters
    ----------
    edge_list
        Iterable of (source, target) node-name pairs.
    node_props
        Optional mapping of node name -> graphviz node attributes.
    edge_props
        Optional mapping of (source, target) -> graphviz edge attributes.
    graph_direction
        Passed to graphviz as ``rankdir``.
        NOTE(review): valid graphviz rankdir values are TB/LR/BT/RL; the
        default "UD" (and the "TD" used by callers) are not among them —
        confirm graphviz silently falls back to its default layout.
    """
    g = gr.Digraph(graph_attr={"rankdir": graph_direction})
    edge_props = {} if edge_props is None else edge_props
    for e in edge_list:
        # Per-edge attributes are optional; missing edges get no extras
        props = edge_props[e] if e in edge_props else {}
        g.edge(e[0], e[1], **props)
    if node_props is not None:
        for name, props in node_props.items():
            g.node(name=name, **props)
    return g
Example: A training program
Cross-sectional
Longitudinal
Types of longitudinal data
Statistical Framework
OLS with longitudinal data
Time manipulation: trends
Time manipulation: lags
Termed finite distributed lag models of order N
X1 and X2 are measured in the present. X3 is measured at three timepoints (present, lag of 1 and lag of 2.)
$\beta_3, \beta_4, \beta_5$ are independent; they are often summed to estimate a long-run effect of X on Y
Powerful model for estimating cause and effect of a variable
Advantages
Disadvantages
Conclusion
Fixed effect: A DAG approach
draw_causal_graph(
edge_list=[
("u", "X1_1"),
("u", "X1_2"),
("u", "X1_3"),
("u", "Y_1"),
("u", "Y_2"),
("u", "Y_3"),
("X2", "X1_1"),
("X2", "X1_2"),
("X2", "X1_3"),
("X2", "Y_1"),
("X2", "Y_2"),
("X2", "Y_3"),
("X1_1", "Y_1"),
("X1_2", "Y_2"),
("X1_3", "Y_3"),
("X1_1", "X1_2"),
("X1_2", "X1_3"),
],
edge_props={
("u", "X1_1"): {"style": "dashed"},
("u", "X1_2"): {"style": "dashed"},
("u", "X1_3"): {"style": "dashed"},
("u", "Y1_1"): {"style": "dashed"},
("u", "Y1_2"): {"style": "dashed"},
("u", "Y1_3"): {"style": "dashed"},
},
)
Implementing FE: LSDV
Z is a large vector of dummy variables
Implementing FE: Within-transformation
The within-transformation subtracts the average panel unit value from each measured data point
The within-transformation subtracts the average panel unit value from each measured data point
Advantages
Disadvantages
Conclusion
Basic DiD: Minimum Wages
“Treated”:
“Untreated control”:
Basic DiD: Minimum Wages (fast food restaurants)
Average Employment FTE | New Jersey | Penn | Difference |
---|---|---|---|
Before | 20.44 | 23.33 | -2.89 |
After | 21.03 | 21.17 | -0.14 |
Difference | 0.59 | -2.16 | 2.76 |
To get the full effect of the minimum wage law on employment, in NJ, use the counterfactual: assume NJ would have been on the same trajectory as Penn. That’s why you take the difference of the differences ($ 0.59 - (-2.16) $).
Basic DiD: Visualization
Here is one example but this doesn’t apply to the NJ/Penn example.
Regression DiD
\(Y_{it} = \alpha + \beta_1 Treatment_i + \beta_2 Time_i + (\beta_3 Treatment_i \times Time_t) + \beta_4 X_{it} + \epsilon_{it}\) \(Treatment = \text{1 if treated, 0 if control}\) \(Time = \text{1 if post-period, 0 if pre-period}\) \(Treatment_i \times Time_t = \text{Interaction}\)
Assumptions
Advantages
Disadvantages
Conclusions
Creating the synthetic control
Estimating the weights
Control variables
Evaluating the intervention
Cross-Contamination is allowed
Example: If CA implements a new health policy, it’s likely neighboring states follow CA and implement some but not all policies. Neighboring control states are not clean. An example is Prop 99 for tobacco legislation.
Advantages
Disadvantages
Conclusion
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Tue May 28 2024
Python implementation: CPython
Python version : 3.12.3
IPython version : 8.24.0
graphviz: 0.20.3
Watermark: 2.4.3
Robust analysis
Specification robustness
Examine the stability of model estimates when changing model specifications
Data robustness
Examine the consistency of estimates across different datasets or subsamples
Method robustness
How to present robustness analysis Tables are common.
Another advanced visualization example
Conclusion
How do you get this information across?
%load_ext watermark
%watermark -n -u -v -iv -w
The watermark extension is already loaded. To reload it, use:
%reload_ext watermark
Last updated: Tue May 28 2024
Python implementation: CPython
Python version : 3.12.3
IPython version : 8.24.0
Watermark: 2.4.3
In this post, we’ll focus on varying intercepts, first from a Bayesian approach using pymc, followed by an example with statsmodels. In later posts, we’ll increase the complexity such as incorporation of varying slopes.
import arviz as az
import graphviz as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
sns.set_context("talk")
sns.set_palette("colorblind")
def draw_causal_graph(
    edge_list, node_props=None, edge_props=None, graph_direction="UD"
):
    """Render a directed (causal) graph with graphviz.

    Adapted from: https://github.com/dustinstansbury/statistical-rethinking-2023/blob/a0f4f2d15a06b33355cf3065597dcb43ef829991/utils.py#L52-L66
    """
    graph = gr.Digraph(graph_attr={"rankdir": graph_direction})
    if edge_props is None:
        edge_props = {}
    for edge in edge_list:
        # per-edge styling is optional; default to no attributes
        graph.edge(edge[0], edge[1], **edge_props.get(edge, {}))
    if node_props is not None:
        for node_name, attrs in node_props.items():
            graph.node(name=node_name, **attrs)
    return graph
def standardize(x):
    """Return x as z-scores: centered at 0, scaled to unit standard deviation."""
    return (x - np.mean(x)) / np.std(x)
# Generate synthetic data
# Longitudinal design: each of `n_patients` subjects is measured at
# `n_timepoints` visits, giving a long-format frame of
# n_patients * n_timepoints rows.
n_patients = 30
n_timepoints = 5
# Create patient IDs
# repeat -> [0,0,0,0,0, 1,1,1,1,1, ...]: one run of visits per patient
patient_ids = np.repeat(np.arange(n_patients), n_timepoints)
# Create time points
# tile -> [0,1,2,3,4, 0,1,2,3,4, ...]: visit index within each patient
time = np.tile(np.arange(n_timepoints), n_patients)
# Create patient-specific attributes (age and treatment)
# One draw per patient: these covariates are time-invariant
age = np.random.randint(40, 70, n_patients)
treatment = np.random.binomial(1, 0.5, n_patients)
# Repeat age and treatment to match the longitudinal measurements
age_repeated = np.repeat(age, n_timepoints)
treatment_repeated = np.repeat(treatment, n_timepoints)
# Combine into a DataFrame
df_data = pd.DataFrame(
    {
        "patient_id": patient_ids,
        "time": time,
        "age": age_repeated,
        "treatment": treatment_repeated,
    }
)
df_data.head(10)
patient_id | time | age | treatment | |
---|---|---|---|---|
0 | 0 | 0 | 66 | 1 |
1 | 0 | 1 | 66 | 1 |
2 | 0 | 2 | 66 | 1 |
3 | 0 | 3 | 66 | 1 |
4 | 0 | 4 | 66 | 1 |
5 | 1 | 0 | 53 | 1 |
6 | 1 | 1 | 53 | 1 |
7 | 1 | 2 | 53 | 1 |
8 | 1 | 3 | 53 | 1 |
9 | 1 | 4 | 53 | 1 |
Here’s the fun part. We’ll simulate the outcome variable tumor size, using some mild assumptions and domain knowledge. First, we’ll assume that all participants have been identified as having a solid tumor cancer. Therefore:
This will be a simple linear model that we’ll use to create data:
\[s_i \sim \text{Normal}(\mu_i, \sigma)\] \[\mu_i = \alpha + \beta_T T_i + \beta_A A_i + \beta_R R_i\]However, to be clear, we’re using this model to generate our data but in this post, we’ll focus on varying intercepts and ignore predictors time, age, and treatment.
# Use a generative model to create tumor size with some randomness
# Linear model: mu_i = alpha + bT*time + bA*age + bR*treatment,
# then s_i ~ Normal(mu_i, sigma).
alpha_tumor_size = 50  # intercept term
bT = 1  # positive association for time
bA = 0.25  # positive association for age
bR = -5  # negative association for treatment
mu_tumor_size = (
    alpha_tumor_size
    + bT * df_data["time"]
    + bA * df_data["age"]
    + bR * df_data["treatment"]
)
sigma_tumor_size = 2
# Add observation noise around each row's expected tumor size
df_data["tumor_size"] = np.random.normal(mu_tumor_size, sigma_tumor_size)
df_data.head()
patient_id | time | age | treatment | tumor_size | |
---|---|---|---|---|---|
0 | 0 | 0 | 66 | 1 | 62.813967 |
1 | 0 | 1 | 66 | 1 | 61.505909 |
2 | 0 | 2 | 66 | 1 | 64.283770 |
3 | 0 | 3 | 66 | 1 | 65.343314 |
4 | 0 | 4 | 66 | 1 | 64.127617 |
Before doing any modeling, we’ll transform the data since in theory we shouldn’t peek. But we can’t pick priors unless we have some idea of what the data is like. An easy thing to do is standardize the data and therefore we can use a 0 mean, 2 SD prior to capture most of the data.
# Standardize the outcome so a Normal(0, ~1) prior covers most of the data
df_data["tumor_size_std"] = standardize(df_data["tumor_size"])
# how to represent patient_specific random effect?
# DAG of the assumed data-generating process (patient clustering not drawn)
draw_causal_graph(
    edge_list=[("age", "tumor"), ("treatment", "tumor"), ("time", "tumor")],
    graph_direction="LR",
)
df_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 patient_id 150 non-null int64
1 time 150 non-null int64
2 age 150 non-null int64
3 treatment 150 non-null int64
4 tumor_size 150 non-null float64
5 tumor_size_std 150 non-null float64
dtypes: float64(2), int64(4)
memory usage: 7.2 KB
# One panel per patient: individual tumor-size trajectories over time,
# colored by treatment arm
sns.relplot(
    data=df_data,
    x="time",
    y="tumor_size",
    col="patient_id",
    col_wrap=6,
    hue="treatment",
    kind="line",
)
<seaborn.axisgrid.FacetGrid at 0x16d7634a0>
# Treatment-arm comparison, pooling over patients and timepoints
ax = sns.boxplot(
    data=df_data,
    x="treatment",
    y="tumor_size",
)
ax.set_title("Effect of Treatment on tumor size")
Text(0.5, 1.0, 'Effect of Treatment on tumor size')
Let’s define the equation. We’re going to assume the tumor size is Gaussian distributed.
It will be a linear combination of independent variables for time, age, and treatment. How will we represent the patient_id
?
There will be a term for average tumor size and the patient-specific tumor size will be the “random effect”.
s = tumor size
t = time
a = age
r = treatment
After reading McElreath, for now, I will ignore time, age, and treatment and just think of patient as a cluster and just do varying intercepts.
\[\mu_i = \alpha_{\text{pt[i]}}\]Let’s do this step-by-step and work our way from the most naive, simplest models to more complex and informative.
The patient_id
variable is completely ignored. A subscript to denote the patient is not relevant here?
# complete pooling, intercepts only
# All observations share one intercept; patient structure is ignored entirely.
with pm.Model() as m0:
    # priors
    a = pm.Normal("a_bar", 0.0, 1)
    sigma = pm.Exponential("sigma", 1.0)
    # linear model
    # mu is a single scalar shared by every observation
    mu = a
    # likelihood
    s = pm.Normal("s", mu=mu, sigma=sigma, observed=df_data["tumor_size_std"])
    trace_m0 = pm.sample(
        draws=1000, random_seed=19, return_inferencedata=True, progressbar=True
    )
Sampling 4 chains, 0 divergences ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╺━━━ 90% 0:00:02 / 0:00:09
az.summary(trace_m0)
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
a_bar | -0.002 | 0.081 | -0.161 | 0.141 | 0.001 | 0.001 | 4201.0 | 2889.0 | 1.0 |
sigma | 1.009 | 0.059 | 0.892 | 1.114 | 0.001 | 0.001 | 4028.0 | 2539.0 | 1.0 |
# Side-by-side histograms: raw tumor size (left) vs standardized (right)
f, (ax0, ax1) = plt.subplots(1, 2, figsize=(12, 5))
df_data["tumor_size"].hist(ax=ax0)
df_data["tumor_size_std"].hist(ax=ax1)
<Axes: >
Acknowledge that there are patient clusters but do not share any information across them. In other words have a prior but no adaptive regularization.
\[s_i \sim \text{Normal}(\mu_i, \sigma)\] \[\mu_i = \alpha_{\text{pt[i]}}\] \[\alpha_j \sim \text{Normal}(0, 1)\] \[\sigma \sim \text{Exponential}(1)\]# no pooling, intercepts only
with pm.Model() as m1:
    # priors
    # One independent intercept per patient; nothing is shared across
    # patients (no pooling, fixed prior rather than a learned one).
    a = pm.Normal("a", 0.0, 1, shape=df_data["patient_id"].nunique())
    sigma = pm.Exponential("sigma", 1.0)
    # linear model... # initialize with pymc data?... represent patient as its own cluster
    # fancy indexing maps each row to its patient's intercept
    mu = a[df_data["patient_id"]]
    # likelihood
    s = pm.Normal("s", mu=mu, sigma=sigma, observed=df_data["tumor_size_std"])
    trace_m1 = pm.sample(
        draws=1000, random_seed=19, return_inferencedata=True, progressbar=True
    )
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [a, sigma]
Output()
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.
Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)
Sampling 4 chains, 0 divergences ━━━━━━━━━━━━━━━━━━━━━━━╸━━━━━━━━━━━━━━━━ 60% 0:00:05 / 0:00:06
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.
Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)
az.summary(trace_m1).head()
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
a[0] | 0.249 | 0.296 | -0.324 | 0.794 | 0.003 | 0.004 | 7399.0 | 2915.0 | 1.0 |
a[1] | -0.644 | 0.296 | -1.198 | -0.087 | 0.003 | 0.003 | 8740.0 | 3331.0 | 1.0 |
a[2] | 0.381 | 0.300 | -0.178 | 0.942 | 0.003 | 0.003 | 9641.0 | 2892.0 | 1.0 |
a[3] | 0.831 | 0.297 | 0.259 | 1.383 | 0.003 | 0.003 | 7541.0 | 2610.0 | 1.0 |
a[4] | 0.426 | 0.295 | -0.128 | 0.968 | 0.003 | 0.003 | 9279.0 | 2689.0 | 1.0 |
# Sanity check: the no-pooling intercepts should track the standardized
# per-patient raw means (points near the identity line).
f, ax = plt.subplots()
ax.scatter(
    az.summary(trace_m1, var_names=["a"])["mean"],
    standardize(df_data.groupby("patient_id")["tumor_size"].mean()),
)
# identity line drawn in axes (not data) coordinates
ax.plot([0, 1], [0, 1], transform=ax.transAxes, linestyle="dashed", color="gray")
# NOTE: the original labels were swapped — x carries the model estimates
# and y carries the standardized raw data.
ax.set(xlabel="model parameter of intercepts", ylabel="raw data");
Question
# multilevel model, random intercepts
# Partial pooling: patient intercepts share a learned population prior
# Normal(a_bar, sigma_pt), which adaptively regularizes (shrinks) them.
with pm.Model() as m2:
    # prior for average patient
    a_bar = pm.Normal("a_bar", 0.0, 1)
    sigma = pm.Exponential("sigma", 1.0)
    # prior for SD of patients
    sigma_pt = pm.Exponential("sigma_pt", 1.0)
    # alpha priors for each patient
    a = pm.Normal("a", a_bar, sigma_pt, shape=len(df_data["patient_id"].unique()))
    # linear model
    mu = a[df_data["patient_id"]]
    # likelihood
    s = pm.Normal("s", mu=mu, sigma=sigma, observed=df_data["tumor_size_std"])
    trace_m2 = pm.sample(
        draws=1000, random_seed=19, return_inferencedata=True, progressbar=True
    )
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [a_bar, sigma, sigma_pt, a]
Output()
az.summary(trace_m2, var_names=["a"]).head()
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
a[0] | 0.236 | 0.287 | -0.290 | 0.773 | 0.003 | 0.004 | 9775.0 | 3129.0 | 1.0 |
a[1] | -0.603 | 0.293 | -1.152 | -0.035 | 0.003 | 0.003 | 10061.0 | 3014.0 | 1.0 |
a[2] | 0.359 | 0.287 | -0.179 | 0.897 | 0.003 | 0.003 | 7390.0 | 2768.0 | 1.0 |
a[3] | 0.773 | 0.288 | 0.254 | 1.322 | 0.003 | 0.003 | 8147.0 | 2850.0 | 1.0 |
a[4] | 0.398 | 0.292 | -0.171 | 0.928 | 0.003 | 0.003 | 8075.0 | 2811.0 | 1.0 |
az.summary(trace_m2, var_names=["a_bar", "sigma"]).head()
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
a_bar | 0.003 | 0.150 | -0.286 | 0.284 | 0.002 | 0.002 | 6844.0 | 3590.0 | 1.0 |
sigma | 0.695 | 0.045 | 0.611 | 0.778 | 0.001 | 0.000 | 4631.0 | 2655.0 | 1.0 |
While there isn’t an appreciable difference, the multilevel model has a lower standard deviation for each cluster. This is the partial pooling effect.
# Compare per-patient intercept posteriors: no pooling (m1) vs partial
# pooling (m2). Summaries are loop-invariant, so compute each once.
summary_m1 = az.summary(trace_m1, var_names=["a"])
summary_m2 = az.summary(trace_m2, var_names=["a"])

f, (ax0, ax1) = plt.subplots(1, 2, figsize=(12, 5))

# means: partial pooling barely moves the point estimates
ax0.scatter(
    summary_m1["mean"],
    summary_m2["mean"],
    facecolors="none",
    edgecolors="k",
)
# identity line in axes coordinates (panel diagonal)
ax0.plot([0, 1], [0, 1], transform=ax0.transAxes, linestyle="dashed", color="gray")
ax0.set(
    xlabel="no pooling",
    ylabel="partial pooling",
    title="Intercepts\n(mean)",
)

# SD: partial pooling shrinks the posterior standard deviations
ax1.scatter(
    summary_m1["sd"],
    summary_m2["sd"],
    facecolors="none",
    edgecolors="k",
)
# NOTE: the original drew this identity line twice, once with ax0's
# transform by mistake; a single line on ax1's own transform is correct.
ax1.plot([0, 1], [0, 1], transform=ax1.transAxes, linestyle="dashed", color="gray")
ax1.set(
    xlabel="no pooling",
    ylabel="partial pooling",
    title="intercepts\n(standard deviation)",
)

# Use identical x/y limits so the identity line is a fair reference
data_min = min(summary_m1["sd"].min(), summary_m2["sd"].min())
data_max = max(summary_m1["sd"].max(), summary_m2["sd"].max())
ax1.set_xlim(data_min * 0.95, data_max * 1.05)
ax1.set_ylim(data_min * 0.95, data_max * 1.05)
f.tight_layout()
You can see that partial pooling decreases the standard error of the intercept parameter in most cases, even though the mean estimate does not really change. Let’s see how to implement this in statsmodels.
Using probabilistic programming provides a nice framework to get the random intercepts with probability distributions. But it may not scale as well. Let’s explore varying intercepts using statsmodels.
# Define the mixed-effects model formula with only varying intercepts
# "tumor_size_std ~ 1" fits a single fixed population intercept; `groups=`
# adds a random intercept per patient.
model = smf.mixedlm("tumor_size_std ~ 1", df_data, groups=df_data["patient_id"])
# Fit the model
result = model.fit()
# Print the summary of the model
print(result.summary())
Mixed Linear Model Regression Results
============================================================
Model: MixedLM Dependent Variable: tumor_size_std
No. Observations: 150 Method: REML
No. Groups: 30 Scale: 0.4754
Min. group size: 5 Log-Likelihood: -186.1971
Max. group size: 5 Converged: Yes
Mean group size: 5.0
-------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-------------------------------------------------------------
Intercept -0.000 0.146 -0.000 1.000 -0.287 0.287
Group Var 0.546 0.272
============================================================
The main thing we want to look at is the bottom of the table. Intercept
refers to the population’s average ($\bar{\alpha}$) while Group Var
refers to the variance of the random intercepts associated with the grouping variable (patient_id
) which is sigma_pt
in the pymc
model. We can see that these are largely in alignment with the pymc
results even if Group Var
/sigma_pt
differ in their means.
az.summary(trace_m2, var_names=["a_bar", "sigma_pt"])
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
a_bar | 0.003 | 0.150 | -0.286 | 0.284 | 0.002 | 0.002 | 6844.0 | 3590.0 | 1.0 |
sigma_pt | 0.759 | 0.121 | 0.538 | 0.981 | 0.002 | 0.001 | 5584.0 | 3368.0 | 1.0 |
pymc
and statsmodels
outputNow let’s see how each individual patient’s estimates look between pymc
and statsmodels
. Statsmodels doesn’t provide the SD directly. It may be derived by bootstrapping but we’ll ignore this for now.
# Extract the random effects
# result.random_effects maps each group label to its estimated effect;
# transpose so each row is a patient, then rename for clarity
df_smf_random_effects = (
    pd.DataFrame(result.random_effects)
    .T.reset_index()
    .rename(columns={"index": "Patient", "Group": "random_effect_mean"})
)
df_smf_random_effects.head()
Patient | random_effect_mean | |
---|---|---|
0 | 0 | 0.237706 |
1 | 1 | -0.603279 |
2 | 2 | 0.358260 |
3 | 3 | 0.775414 |
4 | 4 | 0.398535 |
# Compare per-patient intercepts: pymc posterior means (x) vs statsmodels
# random effects (y); agreement puts points on the identity line
f = plt.figure(figsize=(12, 5))
ax = f.add_subplot(1, 2, 1)
ax.scatter(
    az.summary(trace_m2, var_names=["a"])["mean"],
    df_smf_random_effects['random_effect_mean'],
    facecolors="none",
    edgecolors="k",
)
# identity line in axes coordinates
ax.plot([0, 1], [0, 1], transform=ax.transAxes, linestyle="dashed", color="gray");
ax.set(
    xlabel="pymc",
    ylabel="statsmodels",
    title="Varying intercepts estimate\nby package",
);
As we can see, the two packages give essentially the same results for varying intercepts.
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Tue May 28 2024
Python implementation: CPython
Python version : 3.12.3
IPython version : 8.24.0
pymc : 5.15.0
graphviz : 0.20.3
seaborn : 0.13.2
matplotlib : 3.8.4
statsmodels: 0.14.2
scipy : 1.13.0
numpy : 1.26.4
pandas : 2.2.2
arviz : 0.18.0
Watermark: 2.4.3
import graphviz as gr
def draw_causal_graph(
    edge_list, node_props=None, edge_props=None, graph_direction="UD"
):
    """Render a directed (causal) graph with graphviz.

    Adapted from: https://github.com/dustinstansbury/statistical-rethinking-2023/blob/a0f4f2d15a06b33355cf3065597dcb43ef829991/utils.py#L52-L66
    """
    graph = gr.Digraph(graph_attr={"rankdir": graph_direction})
    if edge_props is None:
        edge_props = {}
    for edge in edge_list:
        # per-edge styling is optional; default to no attributes
        graph.edge(edge[0], edge[1], **edge_props.get(edge, {}))
    if node_props is not None:
        for node_name, attrs in node_props.items():
            graph.node(name=node_name, **attrs)
    return graph
Examples
Advantages
Disadvantages
Statistical Framework
Conclusion
DAG: Controlling for Observable Factors
# Outcome Yi driven by observed X1i plus unobserved factors lumped into ε
draw_causal_graph(
    edge_list=[("X1i", "Yi"), ("ε", "Yi")],
    edge_props={("ε", "Yi"): {"style": "dashed", "label": "β"}},
    graph_direction="LR",
)
Other factors that are not seen in the survey data are summed up in the hidden error term.
$ Y_i = \beta_0 + \beta_1X1_i + \epsilon_i $
# `ε` is the unicode epsilon character; the LaTeX-style epsilon fails to render
# The dashed ε -> X1i edge is the backdoor path that biases the β1 estimate
draw_causal_graph(
    edge_list=[("X1i", "Yi"), ("ε", "Yi"), ("ε", "X1i")],
    edge_props={
        ("X1i", "Yi"): {"label": "β1"},
        ("ε", "Yi"): {"style": "dashed"},
        ("ε", "X1i"): {"style": "dashed", "label": "backdoor"},
    },
    graph_direction="LR",
)
If the backdoor is present, then the estimate of $\beta_1$ will not be correct.
But imagine that $X2i$ in the error term can be observed. A new DAG might look like this.
# X2i pulled out of the error term: an observed confounder of X1i and Yi
# that can now be controlled for directly.
# NOTE: the original also styled an ("ε", "X1i") edge, but that edge is not
# in edge_list here (X2 replaced it), so the entry was silently ignored and
# has been removed.
draw_causal_graph(
    edge_list=[("X1i", "Yi"), ("ε", "Yi"), ("X2i", "X1i"), ("X2i", "Yi")],
    edge_props={
        ("X1i", "Yi"): {"label": "β1"},
        ("X2i", "Yi"): {"label": "β2"},
        ("ε", "Yi"): {"style": "dashed"},
    },
    graph_direction="LR",
)
$ Y_i = \beta_0 + \beta_1X1_i + \beta_2X2_i + \epsilon_i $
$X2$ is now specifically controlled for.
Triangular Tables: A way to observe the effect on a regression model of incrementally adding more variables but be careful of overfitting. Knowing what variables to include requires some domain knowledge.
Advantages
Disadvantages
Conclusion
Introduction
How does matching work?
Matching assumptions (need both)
Propensity score matching
Advantages
Disadvantages
Conclusion
Introduction
Visual technique
Example: raising of the school leaving age (1972)
Then what? How do you use discontinuity if you find one? Look at another outcome.
Why causal?
How to perform RDD?
Example: election You can also run multiple RDD lines.
Advantages
Disadvantages
Conclusion
Use the predicted X variable in the second equation, not the actual X variable.
It’s tricky to do this manually since some correction needs to take place.
Always use a dedicated program.
Examples of instruments in studies:
Advantages
Disadvantages
Conclusion
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Fri May 24 2024
Python implementation: CPython
Python version : 3.11.7
IPython version : 8.21.0
graphviz: 0.20.1
Watermark: 2.4.3
Randomization Methods
Method | Description | Key Points |
---|---|---|
Simple randomization | Assigns equal probability to treatment and control groups | - Easy to implement - Can lead to unequal group sizes in small trials |
Block randomization | Get similar group sizes by dividing subjects into predetermined number of subjects (often a multiple of the number of groups, like 4, 8, etc. for two groups). Within each block, participants are then randomly assigned to treatment groups. | - Requires choosing the total number of subjects in each block - Can often avoid imbalance in small trials seen with simple randomization |
Stratified randomization | Balance based on characteristics/covariates (like age, sex, etc.) before randomizing within these strata | - Ensures balance in important covariates between groups |
Cluster randomization | Randomizes entire groups (like schools or hospitals) | - Suitable for group-level interventions or when individual assignment is impractical |
Covariate adaptive randomization | Increases the probability of being assigned to a group to address a deficit of a particular characteristic within the group | - Effective in trials with small sample sizes or multiple important covariates |
With this approach, causality can be assessed.
Most critical data in bold.
Analysis Type | Description |
---|---|
Intention-to-Treat (ITT) | Analyze groups as they were assigned, regardless of whether they completed the intervention. Maintains the benefits of randomization |
Per-Protocol | Includes only participants who completed the study. Useful in medical trials but may introduce bias due to loss of randomization (e.g. treatment failure may not be random). |
As-Treated | Analyzed according to treatment they actually received, regardless of original assignment. This can happen when there can be emerging issues for someone in a control group. |
Analysis Type | Applicability | Statistical test/technique |
---|---|---|
Continuous outcomes | For normally distributed continuous outcomes | T-tests or ANOVA; linear regression if adjustment is needed |
Non-normally distributed outcomes | Non-normal or ordinal outcomes | Wilcoxon rank-sum, Mann-Whitney |
Categorical outcomes | Compare proportions between groups | Chi-square test; logistic regression if adjustment is needed |
Time-to-event | Time until event occurs | Kaplan-Meier curves, log-rank tests; Cox proportional hazard if adjustment is needed |
Regression techniques can be used when randomization doesn’t go as expected.
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Fri May 17 2024
Python implementation: CPython
Python version : 3.11.7
IPython version : 8.21.0
Watermark: 2.4.3
import graphviz as gr
import numpy as np
import statsmodels.api as sm
from scipy import stats
def draw_causal_graph(
    edge_list, node_props=None, edge_props=None, graph_direction="UD"
):
    """Render a directed (causal) graph with graphviz.

    Adapted from: https://github.com/dustinstansbury/statistical-rethinking-2023/blob/a0f4f2d15a06b33355cf3065597dcb43ef829991/utils.py#L52-L66
    """
    graph = gr.Digraph(graph_attr={"rankdir": graph_direction})
    if edge_props is None:
        edge_props = {}
    for edge in edge_list:
        # per-edge styling is optional; default to no attributes
        graph.edge(edge[0], edge[1], **edge_props.get(edge, {}))
    if node_props is not None:
        for node_name, attrs in node_props.items():
            graph.node(name=node_name, **attrs)
    return graph
# X2 is a confounder: X1 <- X2 -> Y opens a backdoor path from X1 to Y.
# NOTE: the original edge_props dict repeated the ("X2", "Y") key, so the
# dashed "backdoor" styling was silently discarded; it belongs on the
# X2 -> X1 edge of the backdoor path.
draw_causal_graph(
    edge_list=[("X1", "Y"), ("X2", "Y"), ("X2", "X1")],
    edge_props={
        ("X2", "X1"): {"label": "backdoor", "style": "dashed"},
        ("X2", "Y"): {"style": "solid"},
    },
    graph_direction="LR",
)
# Simulate a confounded system: x2 causes both x1 and y
n_obs = 200
x2 = stats.norm.rvs(loc=0, scale=1, size=n_obs)
x1 = stats.norm.rvs(loc=0, scale=1, size=n_obs) + x2
y = x1 + x2
# Regress y on x1 alone: the backdoor through x2 is left open
model0 = sm.OLS(y, x1)
# Control for x2 by passing it as a second COLUMN of the design matrix.
# NOTE: the original used `sm.OLS(y, x1 + x2)`, which is the elementwise
# sum — a single regressor equal to x1+x2 — not two separate predictors.
model1 = sm.OLS(y, np.column_stack([x1, x2]))
results0 = model0.fit()
results1 = model1.fit()
# with x1 only gives a biased estimate
results0.params
array([1.46420051])
# including x2 recovers the right parameter given the data generating process
results1.params
array([1.])
But what if x2 is unobserved? Unfortunately the backdoor path cannot be closed. This is where basic regression analysis fails us. But this is where more advanced methods can help us in the case of unobserved confounds.
DAGs and Methods
Methodology | Controls for Unobserved Confounds |
---|---|
Regression analysis | No (controls for observed confounds only) |
Propensity score matching | No (controls for observed confounds only) |
Instrumental variables | Yes (if a valid instrument is used) |
Regression discontinuity designs | Partially (near the cutoff point) |
Regression with time effects | Partially (for time-varying confounders) |
Fixed effects regression | Yes (for time-invariant confounders within entities) |
Difference-in-Differences Models | Yes (if trends are parallel without treatment) |
Synthetic control methods | Partially (for observed and unobserved pre-treatment confounders) |
Average Treatment Effect (ATE)
The average effect of the treatment across the entire population. \(ATE = E[Y^1_i - Y^0_i]\) where $Y^1$ is the outcome when treatment is given, $Y^0$ is the outcome when treatment is not given, $i$ is for individual, and $E$ is the expected, which means the effect is averaged out over all individuals.
Interpretation: If you impose a treatment on everyone, then this is the change the average individual will see. But it literally means everyone and so if a drug is sex-specific, using the ATE wouldn’t make sense.
Average Treatment Effect on the Treated (ATT)
The average effect of the treatment for those treated. \(ATT = E[Y^1_i - Y^0_i | \text{Treated}=1 ]\)
Interpretation: Shows effect of intervention only on those that received the intervention (treatment). ATT is usually different from ATE due to selection. (Unsure about $Y^0$ since by definition it shouldn’t exist here?)
Non-random treatment will likely lead to ATT and not ATE since people often self-select expecting benefits.
Average Treatment Effect on the Untreated (ATU)
The average effect of the treatment for those in the control group. \(ATU = E[Y^1_i - Y^0_i | \text{Treated}=0 ]\)
But this can’t be estimated. It’s still useful to think about what would have happened to those who were not reached by an intervention.
Local Average Treatment Effect (LATE)
The average effect of the treatment for those who complied \(LATE = E[Y^1_i - Y^0_i | \text{Compliers}=1 ]\) Treatment conditions only received under certain conditons; conditions influenced by another “instrumental variable”. Example: Study the impact of receiving a scholarship. The instrumental variable might be living in a particular region, LATE would measure the effect of the scholarship on just those students who received it due to their location. Many compliers means LATE approaches ATE. But few compliers limits external validity.
Conclusion
How different are nonexperimental methods compared to experimental methods? Focus of seminal study from Robert Lalonde in 1986.
Lalonde compared the randomized experiment data with CPS and PSID surveys. The experimental data recovered the ATE. When looking at experimental data, there was a net positive in income suggesting the program worked. He got similar values using causal inference methods. However, survey data varied wildly. Depending on which one you choose can lead to very different policies.
Conclusion
%load_ext watermark
%watermark -n -u -v -iv -w
The watermark extension is already loaded. To reload it, use:
%reload_ext watermark
Last updated: Thu May 09 2024
Python implementation: CPython
Python version : 3.11.7
IPython version : 8.21.0
seaborn : 0.13.2
statsmodels: 0.14.1
scipy : 1.12.0
matplotlib : 3.8.2
pandas : 2.2.0
numpy : 1.25.2
Watermark: 2.4.3
You might be asking yourself, “Ben, why don’t you just request RSVPs and get a more definitive number?” There are several reasons for this:
Here are some interesting aspects of the problem.
This problem was addressed by a 1993 paper by Ken Butler and Michael Stephens called The Distribution of a Sum of Binomial Random Variables . However, we can re-derive some of the work through reasoning with just a few fundamental probability rules:
Let’s get started!
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import binom
from scipy.stats._distn_infrastructure import rv_discrete_frozen
Here is a visualization of the problem. We can have probability distributions for different groups on an invite list. For explanation purposes, let’s imagine that we’re considering inviting three groups: a set of grandparents, neighbors, and a busy co-worker with their family with probability of attending as 90%, 75% for neighbors, and 50%, respectively. These groups would have binomial distributions that look like this.
# Build the invite-list table: one row per group, with headcount (n) and
# each member's probability of attending (p)
df_data = pd.DataFrame.from_records(
    [
        ("Grandparents", 2, 0.9),
        ("Neighbors", 4, 0.5),
        ("Co-worker's family", 5, 0.2),
    ],
    columns=["group", "n", "p"],
)
df_data
group | n | p | |
---|---|---|---|
0 | Grandparents | 2 | 0.9 |
1 | Neighbors | 4 | 0.5 |
2 | Co-worker's family | 5 | 0.2 |
# Visualize the probability distribution for each group
# One Binomial(n, p) PMF bar chart per invite group
fig, axes = plt.subplots(1, 3, figsize=(16, 4), sharey=True)
for (i, row), ax in zip(df_data.iterrows(), axes.flat):
    rv = binom(row['n'], row['p'])
    # support is 0..n attendees inclusive
    x = np.arange(row['n']+1)
    ax.bar(x, rv.pmf(x))
    ax.set(
        title = row['group'],
        ylim = (0, 1),
        xticks = x,
        xlabel = "Number attending",
        ylabel = "Probability"
    )
As you can see, it’s straightforward to get a distribution for each group. But how do we get a distribution for the total number of people attending?
We can look at the Butler and Stephens paper and see that they came up with both exact and approximate solutions. For our purposes, we’ll focus on the exact distribution, especially since there was this line from their paper: “With modern computing facilities, it is possible to calculate the exact distribution of S.” Remember this was written in 1993 when computers looked like this. Nevertheless, when you see the calculations carried out, you can appreciate why such a statement was warranted and why they explored approximations, especially as the number of samples (possible attendees in this case) might scale.
The heart of the algorithm is the following equation.
\[P(Y + Z=j) = \sum_{i=0}^{j} P(Y=i) P(Z = j-i)\]$Y$ and $Z$ are two discrete random variables (e.g. integers) which makes sense since we don’t want a fraction of a person attending (unless it was Halloween). While they described $Y$ and $Z$ as variables with a binomial distribution, we can see later why this is not strictly necessary. Additionally, the equation only has $Y$ and $Z$ since they start with only two groups. Once that has been calculated the new distribution is the “new” $Y$ and the next group would be the “new” $Z$. This continues recursively until all groups have been accounted for.
We can derive the above formula with our putative invite example. We’ll go back to our dataframe where each row contains a group, the number of people ($n$), and the probability of attendance ($p$) but limit it to the first two rows where Grandparents
is the $Y$ variable and Neighbors
is the $Z$ variable.
df_data.head(2)
group | n | p | |
---|---|---|---|
0 | Grandparents | 2 | 0.9 |
1 | Neighbors | 4 | 0.5 |
And let’s look at each group’s probability distribution in table form.
# Grandparents: PMF over how many of the 2 attend (p = 0.9 each)
n_gp = df_data.loc[0, "n"]
Y = binom(n_gp, df_data.loc[0, "p"])
i_vals_gp = range(n_gp + 1)
df_Y = pd.DataFrame({"x": i_vals_gp, "probability": Y.pmf(i_vals_gp)})
df_Y
x | probability | |
---|---|---|
0 | 0 | 0.01 |
1 | 1 | 0.18 |
2 | 2 | 0.81 |
# Neighbors: PMF over how many of the 4 attend (p = 0.5 each)
n_nb = df_data.loc[1, "n"]
Z = binom(n_nb, df_data.loc[1, "p"])
i_vals_nb = range(n_nb + 1)
df_Z = pd.DataFrame({"x": i_vals_nb, "probability": Z.pmf(i_vals_nb)})
df_Z
x | probability | |
---|---|---|
0 | 0 | 0.0625 |
1 | 1 | 0.2500 |
2 | 2 | 0.3750 |
3 | 3 | 0.2500 |
4 | 4 | 0.0625 |
Now, we can consider the probabilities for each possibility of total attendance $j$. That means, $j$ is bounded by 0 (no one comes, wah wah) to 6 (everyone shows up). The probability for each value of $j$ can be deduced with probability rules. I’m italicizing some keywords below since we can link and statements to multiplication and or statements to addition.
You can see the pattern and figure out the logic for the remaining values of $j$. Hopefully you can see now how this leads to the above equation. Let’s flesh this out with code.
def convert_binom_pmf(rv: rv_discrete_frozen) -> dict:
    """Convert a frozen discrete random variable's PMF to a dictionary.

    Works for any discrete distribution with finite support, not just the
    binomial — the convolution algorithm only requires discreteness.

    Parameters
    ----------
    rv
        Frozen discrete random variable (e.g. ``scipy.stats.binom(n, p)``)

    Returns
    -------
    :
        Mapping from each support value x to P(X = x)
    """
    # Use both support endpoints so distributions whose support does not
    # start at 0 are also handled (the original assumed a lower bound of 0).
    low, high = rv.support()
    return {x: rv.pmf(x) for x in range(int(low), int(high) + 1)}
def probability_for_j_total_people(
    j: int, rv_Y_prob: dict, rv_Z_prob: dict
) -> float:
    """Determine the probability of j people attending, given two random variables.

    Implements one term of the discrete convolution
    P(Y + Z = j) = sum_i P(Y = i) * P(Z = j - i).

    Parameters
    ----------
    j
        The number of total attendees.
    rv_Y_prob
        Probability distribution of random variable Y as a dictionary
        mapping outcome -> probability.
    rv_Z_prob
        Probability distribution of random variable Z as a dictionary
        mapping outcome -> probability.

    Returns
    -------
    :
        Total probability that Y + Z equals j.
    """
    # Each split of j as y + z is an "and" (multiply the independent
    # probabilities) and the splits combine with "or" (add them up).
    # Enumerating y directly with z = j - y is O(j), unlike the
    # product-then-filter approach which is O(j^2). Outcomes missing from
    # a dictionary have probability 0 and are simply skipped.
    prob = 0.0
    for y in range(j + 1):
        z = j - y
        if y in rv_Y_prob and z in rv_Z_prob:
            prob += rv_Y_prob[y] * rv_Z_prob[z]
    return prob
total_prob = 0  # sanity check that the total probability adds up to 1
j_prob = dict()  # cache the probabilities in a dictionary
max_attendees = df_data.head(2)['n'].sum()
# The PMF dictionaries do not depend on j, so build them once outside the
# loop instead of reconverting the same distributions on every iteration.
rv_Y_prob = convert_binom_pmf(Y)
rv_Z_prob = convert_binom_pmf(Z)
for j in range(max_attendees + 1):
    prob = probability_for_j_total_people(j, rv_Y_prob, rv_Z_prob)
    print(f"Probability for {j} total people: {prob:0.5f}")
    j_prob[j] = prob
    total_prob += prob
print(f"\nTotal probability after accounting for all cases: {total_prob:0.4f}")
Probability for 0 total people: 0.00062
Probability for 1 total people: 0.01375
Probability for 2 total people: 0.09938
Probability for 3 total people: 0.27250
Probability for 4 total people: 0.34937
Probability for 5 total people: 0.21375
Probability for 6 total people: 0.05063
Total probability after accounting for all cases: 1.0000
Awesome! It looks like we’ve successfully carried out the equation. But remember we’ve only done the first two groups. The next step is to add in the remaining group (co-worker's family
) using the same process. But the probability distribution of our new $Y$ variable is what we just calculated, which is now stored in a dictionary j_prob
. As I indicated above, there’s really no requirement that the distribution be binomial. It just has to be discrete.
# Let's recall the n and p of the remaining group
df_data.tail(1)
group | n | p | |
---|---|---|---|
2 | Co-worker's family | 5 | 0.2 |
We can carry out the same steps.
# coworkers
# Support of the co-worker group is 0..n attendees.
i_vals_cw = range(df_data.loc[2, "n"] + 1)
# Frozen binomial for the co-worker's family row.
new_Z = binom(df_data.loc[2, "n"], df_data.loc[2, "p"])
rv_new_Z_prob = convert_binom_pmf(new_Z) # put in dictionary form for our function
rv_new_Z_prob
{0: 0.3276799999999998,
1: 0.4095999999999999,
2: 0.20479999999999987,
3: 0.051200000000000016,
4: 0.0064,
5: 0.0003200000000000001}
df_data
group | n | p | |
---|---|---|---|
0 | Grandparents | 2 | 0.9 |
1 | Neighbors | 4 | 0.5 |
2 | Co-worker's family | 5 | 0.2 |
Recall that the putative total invite list was 11 people. Therefore, our probability distribution should give probabilities for each value between 0 and 11, inclusive.
j_prob_new = dict() # cache the probabilities in a dictionary
# Convolve the two-group distribution (j_prob) with the co-worker PMF;
# j now ranges over 0..11, the full invite list.
for j in range(df_data["n"].sum() + 1):
    prob = probability_for_j_total_people(
        j, j_prob, rv_new_Z_prob
    )  # j_prob is what we calculated for the first two groups
    j_prob_new[j] = prob
# Plot the exact distribution of total attendance across all three groups.
f, ax = plt.subplots(figsize=(8, 5))
ax.bar(j_prob_new.keys(), j_prob_new.values())
ax.set(
    title="Probability distribution of total attendance",
    xticks=list(j_prob_new.keys()),
    xlabel="Number attending",
    ylabel="Probability",
);
OK! So it now looks like we have our final, exact distribution for the sum of all variables, what Butler and Stephens called $S$. We can make the production of this distribution much more user friendly with another couple of functions. Passing in our dataframe, we will produce a list of dictionaries, where each dictionary is a group’s probability distribution. Then this list will be passed into a second function to give our final answer. You can see at the assert
statement that gives us the same answer that we derived above, step-by-step.
df_data
group | n | p | |
---|---|---|---|
0 | Grandparents | 2 | 0.9 |
1 | Neighbors | 4 | 0.5 |
2 | Co-worker's family | 5 | 0.2 |
def sum_of_discrete_rvs_exact_calculation(pmf_list: list) -> dict:
    """Determine the probability distribution of a sum of discrete RVs.

    Perform the exact convolution on the first two entries (random
    variables). If more than two entries exist, treat the resulting
    probability distribution as a new random variable and fold in the next
    entry. Repeat recursively until all entries have been accounted for.

    Parameters
    ----------
    pmf_list
        A list of dictionaries containing the probability mass functions of
        different groups attending. Must contain at least one PMF.

    Returns
    -------
    :
        Final probability distribution as a dictionary mapping the total
        count j to P(S = j). (The original annotation claimed pd.Series,
        but a dict is what has always been returned.)
    """
    pmf_list = pmf_list.copy()  # prevent the original list from being altered
    # Trivial case: a single PMF is already the distribution of the "sum".
    if len(pmf_list) == 1:
        return dict(pmf_list[0])
    # Convolve the first two PMFs. The largest attainable total is the sum
    # of each PMF's largest outcome.
    max_attendees = max(pmf_list[0]) + max(pmf_list[1])
    s_prob = dict()
    for j in range(max_attendees + 1):
        s_prob[j] = probability_for_j_total_people(
            j, pmf_list[0], pmf_list[1]
        )
    # base case: nothing left to fold in
    if len(pmf_list) == 2:
        return s_prob
    # apply recursion:
    # remove the first element and then replace the remaining first element
    # with the freshly convolved distribution
    pmf_list.pop(0)
    pmf_list[0] = s_prob
    return sum_of_discrete_rvs_exact_calculation(pmf_list)
# put each group in a probability distribution
# One PMF dictionary per row of df_data (grandparents, neighbors, co-workers).
pmf_list_party = [convert_binom_pmf(binom(df_data.loc[i, "n"], df_data.loc[i, "p"])) for i in df_data.index]
# Sanity check: the general function reproduces the step-by-step result.
assert j_prob_new == sum_of_discrete_rvs_exact_calculation(pmf_list_party)
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Fri Apr 12 2024
Python implementation: CPython
Python version : 3.11.7
IPython version : 8.21.0
numpy : 1.25.2
seaborn : 0.13.2
matplotlib: 3.8.2
pandas : 2.2.0
Watermark: 2.4.3
I decided to take a deep dive to resolve my confusion, with much help from numerous sources. Please check out the Acknowledgments and references section!
In this post, I’ll be comparing an example of mixed effects modeling across statistical philosophies and across statistical languages. As a bonus, a meme awaits.
method | approach | language | package |
---|---|---|---|
1 | frequentist | R | lme4 |
2 | Bayesian | Python | pymc |
Note that the default language in the code blocks is Python. A cell running R will have %%R
designated at the top. A variable can be inputted (-i
) or outputted (-o
) on that same line if it is used between the two languages.
Special thanks to Patrick Robotham for providing a lot of feedback.
from aesara import tensor as at
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pymc as pm
import xarray as xr
%config InlineBackend.figure_format = 'retina'
az.style.use("arviz-darkgrid")
rng = np.random.default_rng(1234)
# Report 95% highest-density intervals in ArviZ summaries by default.
az.rcParams["stats.hdi_prob"] = 0.95
def standardize(x):
    """Center x at 0 and scale it to unit (population) standard deviation."""
    centered = x - np.mean(x)
    return centered / np.std(x)
# Enable running of R code
%load_ext rpy2.ipython
%%R
suppressMessages(library(tidyverse))
suppressMessages(library(lme4))
suppressMessages(library(arm))
suppressMessages(library(merTools))
The `cafe` dataset
The dataset I am using is created from a scenario described in Statistical Rethinking.
Here are a few more details of the dataset from Dr. McElreath’s book:
Begin by defining the population of cafés that the robot might visit. This means we’ll define the average wait time in the morning and the afternoon, as well as the correlation between them. These numbers are sufficient to define the average properties of the cafés. Let’s define these properties, then we’ll sample cafés from them.
Nearly all Python code is taken from the Statistical Rethinking pymc repo with some minor alterations.
a = 3.5  # average morning wait time
b = -1.0  # average difference afternoon wait time
sigma_a = 1.0  # std dev in intercepts
sigma_b = 0.5  # std dev in slopes
rho = -0.7  # correlation between intercepts and slopes
Mu = [a, b]
sigmas = [sigma_a, sigma_b]
# np.matrix is deprecated; use a plain ndarray and the @ (matmul) operator.
# Covariance is S = D R D with D = diag(sigmas), R the correlation matrix.
Rho = np.array([[1, rho], [rho, 1]])
Sigma = np.diag(sigmas) @ Rho @ np.diag(sigmas)  # covariance matrix
N_cafes = 20
# Draw each cafe's (intercept, slope) pair from the correlated 2-D normal.
vary_effects = np.random.multivariate_normal(mean=Mu, cov=Sigma, size=N_cafes)
a_cafe = vary_effects[:, 0]
b_cafe = vary_effects[:, 1]
Now simulate the observations.
N_visits = 10
sigma = 0.5  # std dev within cafes
# One row per visit: cafes repeat in blocks of N_visits, and morning (0) /
# afternoon (1) visits alternate within each block.
cafe_id = np.repeat(np.arange(N_cafes), N_visits)
afternoon = np.tile([0, 1], (N_visits * N_cafes) // 2)
# Expected wait for each visit from that cafe's intercept and slope.
mu = a_cafe[cafe_id] + b_cafe[cafe_id] * afternoon
wait = np.random.normal(loc=mu, scale=sigma, size=N_visits * N_cafes)
df_cafes = pd.DataFrame({"cafe": cafe_id, "afternoon": afternoon, "wait": wait})
To get a sense of the data structure we just created, let’s take a look at the first and last 5 rows.
df_cafes.head()
cafe | afternoon | wait | |
---|---|---|---|
0 | 0 | 0 | 2.724888 |
1 | 0 | 1 | 1.951626 |
2 | 0 | 0 | 2.488389 |
3 | 0 | 1 | 1.188077 |
4 | 0 | 0 | 2.026425 |
df_cafes.tail()
cafe | afternoon | wait | |
---|---|---|---|
195 | 19 | 1 | 3.394933 |
196 | 19 | 0 | 4.544430 |
197 | 19 | 1 | 2.719524 |
198 | 19 | 0 | 3.379111 |
199 | 19 | 1 | 2.459750 |
Note that this dataset is balanced, meaning that each group (cafe) has the same number of observations. Mixed effects / multilevel models shine with unbalanced data where it can leverage partial pooling.
Let’s plot the raw data and see how the effect of afternoon influences wait time. Instead of plotting in order of the arbitrarily named cafes (0 to 19), I’ll show in order of increasing average morning wait time so that we can appreciate the variability across the dataset.
df_cafes.head()
cafe | afternoon | wait | |
---|---|---|---|
0 | 0 | 0 | 2.644592 |
1 | 0 | 1 | 2.126485 |
2 | 0 | 0 | 2.596465 |
3 | 0 | 1 | 2.250297 |
4 | 0 | 0 | 3.310709 |
%%R -i df_cafes
# credit to TJ Mahr for a template of this code
xlab <- "Afternoon"
ylab <- "Wait time"
titlelab <- "Wait times for each cafe (ordered by increasing average time)"
# order by increasing average morning wait time (intercept only)
cafe_ordered_by_avgwaittime <- df_cafes %>%
filter(afternoon==0) %>%
group_by(cafe) %>%
summarize(mean = mean(wait)) %>%
arrange(mean)
# Turn the cafe column from a numeric into a factor ordered by morning wait time
df_cafes$cafe <- factor(df_cafes$cafe, levels=cafe_ordered_by_avgwaittime$cafe)
# NOTE(review): stat_summary(fun.y=...) is deprecated in ggplot2 >= 3.3 in
# favor of `fun` -- confirm the ggplot2 version before changing.
ggplot(df_cafes) +
aes(x = afternoon, y = wait) +
geom_boxplot(aes(fill=factor(afternoon))) +
stat_summary(fun.y="mean", geom="line") +
facet_wrap("cafe") +
labs(x = xlab, y = ylab, title=titlelab)
One pattern is that as we increase morning wait time (e.g. the intercept) the difference in wait time in the afternoon (the slope) gets bigger. In other words, when we simulated this dataset, we included a co-variance structure between the intercept and slope. When we develop an inferential model with this data, we want to be able to reveal this co-variance.
Galecki and Burzykowski, Wikipedia, and this page from UCLA all describe a linear mixed model with an equation similar to equation 1 below.
I rely heavily on the UCLA page since it is the one that helped me the most. In fact, if you don’t care about how it connects to the Bayesian approach, stop reading this and check that out instead!
In contrast to the Bayesian set of equations, the fixed effects and random effects are in the same equation here.
\[\textbf{y} = \textbf{X} \boldsymbol{\beta} + \textbf{Z} \textbf{u} + \boldsymbol{\epsilon} \tag{1}\]The left side of the equation $\textbf{y}$ represents all of our observations (or the wait time in the cafe example). The $\boldsymbol{\beta}$ in the first term of the equation represents a vector of coefficients across the population of cafes. These are the fixed effects. The $\textbf{u}$ in the second term of equation 1 represents a matrix of coefficients for each individual cafe. These are the random effects. Both $\textbf{X}$ and $\textbf{Z}$ are the design matrix of covariates. Finally, there’s a residual error term $\boldsymbol{\epsilon}$.
When relating this equation all back to the cafe dataset we just created, I needed to dig deeper to how terms represented an individual observation versus the group (cafe) level. Doing a dimensional analysis helped.
Equation 1 variable | Dimensions | Effects type | Comment |
---|---|---|---|
$\textbf{y}$ | 200 x 1 | n/a | This vector represents the wait time for all 200 observations. I’ll refer to this as $w_i$ later in equation 2. |
$\textbf{X}$ | 200 x 2 | associated with fixed | The first column of each observation is 1 since it is multiplied by the intercept term. The second column is $A$, which will be 0 or 1 for afternoon . |
$\boldsymbol{\beta}$ | 2 x 1 | fixed | The two elements in the $\boldsymbol{\beta}$ (bold font beta) are what I’ll refer to as the intercept $\alpha$ and the slope $\beta$ (unbolded beta) across all cafes in equation 2. |
$\textbf{Z}$ | 200 x (2x20) | associated with random | The first 20 columns representing intercepts for each cafe and the second 20 for the covariate (afternoon ). See visual below. |
$\textbf{u}$ | (2x20) x 1 | random | $\textbf{u}$ holds each of the 20 cafes’ intercept $a_\text{cafe}$ and slope $b_\text{cafe}$. There’s an implied correlation structure between them. |
$\boldsymbol{\epsilon}$ | 200 x 1 | n/a | Normally distributed residual error. |
To better understand what $\textbf{Z}$ looks like we can create an alternate representation of df_cafes
. Each row of the matrix $\textbf{Z}$ is for an individual observation. The first 20 columns of a row are the 20 intercepts of a cafe (column 1 is cafe 1, column 2 is cafe 2, etc.) All of the first 20 columns will contain a 0 except for the column that represents the cafe that observation is associated with which will be a 1. The next 20 columns (columns 21-40) will represent afternoon
. All of this second group of columns will be 0 except for the column that represents the cafe the observation belongs to, and only if that observation was made in the afternoon.
To be clear, the structure of df_cafes
, where each row is an observation with the cafe, afternoon status, and wait time, is already in a form to be understood by the lmer
and pymc
packages. What I’m showing below is to help understand what the matrix $\textbf{Z}$ looks like in the above equations.
# Build the random-effects design matrix Z: one row per observation.
# Columns 0-19 are cafe intercept indicators; columns 20-39 carry the
# afternoon covariate in the matching cafe's column.
Z = np.zeros((200, 40))
for row in df_cafes.index:
    # Distinct local names: the original loop reused `cafe` and `afternoon`,
    # silently clobbering the module-level `afternoon` array from the
    # simulation cell above.
    cafe_idx = df_cafes.loc[row, 'cafe']
    is_afternoon = df_cafes.loc[row, 'afternoon']
    Z[row, cafe_idx] = 1
    Z[row, 20 + cafe_idx] = is_afternoon
We can take a look at the first 12 rows of Z. The first 10 are for the first cafe and observations alternate morning and afternoon, hence what’s displayed in column 20. I included the first two rows of the second cafe to show how the 1
moves over a row after the first 10 rows. I’ll use pandas
to better display the values.
pd.set_option('display.max_columns', 40)  # show all 40 columns of Z
# Highlight the 1s (row maxima) so the block structure of Z is easy to see;
# the 0s (row minima) get a dark background.
(
    pd.DataFrame(Z[0:12, :])
    .astype(int)
    .style
    .highlight_max(axis=1, props='color:navy; background-color:yellow;')
    .highlight_min(axis=1, props='color:white; background-color:#3E0B51;')
)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
10 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
11 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
We can visualize all of $\textbf{Z}$ here.
# Visualize all of Z: the left half marks each observation's cafe
# (intercept indicator); the right half is nonzero only for afternoon visits.
plt.imshow(Z, aspect='auto')
plt.text(10, 220, s='intercept (cafe)', ha='center', fontsize=14)
plt.text(30, 220, s='covariate (afternoon)', ha='center', fontsize=14)
plt.ylabel('observations')
plt.title('Visual representation of Z')
Text(0.5, 1.0, 'Visual representation of Z')
The vector in $\textbf{u}$ is really where the mixed effects model takes advantage of the covariance structure of the data. In our dataset, the first 20 elements of the vector represent the random intercepts of the cafes and the next 20 are the random slopes. A cafe’s random effects can be thought of as an offset from the populations (the fixed effects). Accordingly, a random effect will be multivariate normally distributed, with mean 0 and a co-variance matrix S.
\[\textbf{u} \sim \text{Normal}(0, \textbf{S}) \tag{2}\]Remember that the $\textbf{u}$ is a (2x20) x 1 matrix, where each cafe’s intercept $a_\text{cafe}$ and slope $b_\text{cafe}$ are contained. Therefore, we can also write this as.
\[\textbf{u} = \begin{bmatrix} a_{\text{cafe}} \\ b_{\text{cafe}} \end{bmatrix} \sim \text{MVNormal} \left( \begin{bmatrix} 0 \\ 0 \end{bmatrix} , \textbf{S} \right) \tag{3}\]In other words, in Equation 1, both the random intercept and random slope are both expected to lie at 0. With regards to $\textbf{S}$, my prior post talked about covariance matrixes so I won’t elaborate here. The key conceptual point of relevance in this problem is that the covariance matrix $\textbf{S}$ can reflect the correlation ($\rho$) that the intercept (average morning wait time) and slope (difference between morning and afternoon wait time).
\[\textbf{S} = \begin{pmatrix} \sigma_{\alpha}^2 & \rho\sigma_{\alpha}\sigma_{\beta} \\ \rho\sigma_{\alpha}\sigma_{\beta} & \sigma_{\beta}^2 \end{pmatrix} \tag{4}\]We know there is a correlation because (a) we generated the data that way and (b) we can directly observe this when we visualized the data.
Finally, the role of $\boldsymbol{\epsilon}$ is to capture any residual variance. Between observations, it is assumed to be homogenous and independent.
Equation 1 is written concisely in linear algebra form. However, since our dataset is relatively simple (only one predictor variable), equation 1 can be written in an expanded, alternative form as equation 2. This might make it easier to understand (at least it did for me). The notation will start to get hairy with subscripts and so I will explicitly rename some variables for this explanation. It will also better match with the Bayesian set of equations described in the McElreath text. Equation 2 is written at the level of a single observation $i$. I’ll repeat Equation 1 here so it’s easier to see the conversion.
\[\textbf{y} = \textbf{X} \boldsymbol{\beta} + \textbf{Z} \textbf{u} + \boldsymbol{\epsilon} \tag{5}\] \[W_i = (\alpha + \beta \times A_i) + (a_{\text{cafe}[i]} + b_{\text{cafe}[i]} \times A_i) + \epsilon_i \tag{6}\]Let’s start off with the left side where we can see that $\textbf{y}$ will now be $W_i$ for wait time. On the right side, I have segmented the fixed and random effects with parentheses. For both, I’ve deconstructed the linear algebra expression form to a simpler form. After re-arrangement, we can obtain the following form in equation 3.
\[W_i = (\alpha + a_{\text{cafe}[i]}) + (\beta + b_{\text{cafe}[i]}) \times A_i + \epsilon_{\text{cafe}} \tag{7}\]Here, we can better appreciate how a cafe’s random effects intercept can be thought of as an offset from the population intercept. The same logic of an offset can be applied to its slope. We will come back to equation 3 after covering Equation set 2, the Bayesian approach.
The following equations are taken from Chapter 14 in Statistical Rethinking. This set of equations looks like a beast, but to be honest, they're more intuitive to me, probably because I learned this approach initially. I'll state the equations before comparing them directly with Equation set 1 but you may already start seeing the relationship. Essentially what is going on is a re-writing of the above equations in a Bayesian way such that the fixed effects can act as an adaptive prior.
\(W_i \sim \text{Normal}(\mu_i, \sigma) \tag{8}\) \(\mu_i = \alpha_{\text{cafe}[i]} + \beta_{\text{cafe}[i]} \times A_{i} \tag{9}\) \(\sigma \sim \text{Exp}(1) \tag{10}\)
Equation 8 is stating how wait time is normally distributed around $\mu$ and $\sigma$. By making $W_i$ stochastic instead of deterministic (using a ~ instead of =), the $\sigma$ replaces $\epsilon_i$. In equation 10, the prior for $\sigma$ is exponentially distributed and parameterized with 1. The expected value parameter $\mu$ comes from the linear model in equation 9. You can start to see the similarities with equation 7 above.
\[\begin{bmatrix}\alpha_{\text{cafe}} \\ \beta_{\text{cafe}} \end{bmatrix} \sim \text{MVNormal} \left( \begin{bmatrix}{\alpha} \\ {\beta} \end{bmatrix} , \textbf{S} \right) \tag{11}\]The $\alpha_{\text{cafe}}$ and $\beta_{\text{cafe}}$ terms come from sampling of a multivariate normal distribution as shown in equation 11. Note the very subtle difference in placement of the subscript cafe
when compared to equation 6 and 7. This is an important point I’ll discuss later. On the right side, the two-dimensional normal distribution’s expected values are $\alpha$ and $\beta$. The rest of the equations shown below are our priors for each parameter we’re trying to estimate.
\(\sigma, \sigma_{\alpha}, \sigma_{\beta} \sim \text{Exp}(1) \tag{15}\) \(\textbf{R} \sim \text{LKJCorr}(2) \tag{16}\)
To recap, the first equation set has an explicit fixed effects term and varying effects term in the linear model. In the second equation, the linear model is already “mixed”. It contains both the fixed and varying effects terms implicitly. The fixed effects estimates can be seen in equation 5.
I think you can think of these $\alpha_{\text{cafe}}$ and $\beta_{\text{cafe}}$ terms as already incorporating the information from the fixed and random effects simultaneously.
Now that we have the dataset, we can run the two models, one with lmer
and one with pymc
. Here are the equations that these packages run.
lmer
(frequentist)The lmer
and by extension (brms
) syntax was initially confusing to me.
lmer(wait ~ 1 + afternoon + (1 + afternoon | cafe), df_cafes)
The 1
corresponds to inclusion of the intercept term. A 0
would exclude it. The 1 +
afternoon
corresponds to the “fixed effects” portion of the model ($\alpha + \beta \times A_i$) while the (1 + afternoon | cafe)
is the “varying effects” ($a_{\text{cafe}} + b_{\text{cafe}} \times A_i$).
%%R -i df_cafes -o m -o df_fe_estimates -o df_fe_ci -o df_fe_summary
# Fit the mixed model: fixed intercept + afternoon slope, plus correlated
# random intercepts and slopes per cafe.
m <- lmer(wait ~ 1 + afternoon + (1 + afternoon | cafe), df_cafes)
arm::display(m)
# get fixed effects coefficients
df_fe_estimates <- data.frame(summary(m)$coefficients)
# get fixed effects coefficient CIs (profile likelihood)
df_fe_ci <- data.frame(confint(m))
# merge the estimates with their confidence intervals by row name (by.*=0)
df_fe_summary <- merge(
df_fe_estimates,
df_fe_ci[c('(Intercept)', 'afternoon'), ],
by.x=0,
by.y=0
)
rownames(df_fe_summary) <- df_fe_summary[, 1]
lmer(formula = wait ~ 1 + afternoon + (1 + afternoon | cafe),
data = df_cafes)
coef.est coef.se
(Intercept) 3.64 0.23
afternoon -1.04 0.11
Error terms:
Groups Name Std.Dev. Corr
cafe (Intercept) 0.99
afternoon 0.39 -0.74
Residual 0.48
---
number of obs: 200, groups: cafe, 20
AIC = 369.9, DIC = 349.2
deviance = 353.5
R[write to console]: Computing profile confidence intervals ...
Can we get the partial pooling results from the lmer
output and see how it compares with the unpooled estimates? Let’s export it for use later.
%%R -i m -o df_partial_pooling -o random_sims
# Make a dataframe with the fitted (partially pooled) per-cafe effects
df_partial_pooling <- coef(m)[["cafe"]] %>%
rownames_to_column("cafe") %>%
as_tibble() %>%
rename(Intercept = `(Intercept)`, Slope_Days = afternoon) %>%
add_column(Model = "Partial pooling")
# estimate confidence intervals by simulating the random effects (merTools)
random_sims <- REsim(m, n.sims = 1000)
#plotREsim(random_sims)
random_sims
groupFctr | groupID | term | mean | median | sd | |
---|---|---|---|---|---|---|
1 | cafe | 0 | (Intercept) | -1.277651 | -1.283341 | 0.379761 |
2 | cafe | 1 | (Intercept) | 0.164935 | 0.162715 | 0.420411 |
3 | cafe | 2 | (Intercept) | -1.047076 | -1.043646 | 0.387153 |
4 | cafe | 3 | (Intercept) | 0.474320 | 0.500552 | 0.400053 |
5 | cafe | 4 | (Intercept) | -1.473647 | -1.468940 | 0.394707 |
6 | cafe | 5 | (Intercept) | 0.086072 | 0.082010 | 0.408971 |
7 | cafe | 6 | (Intercept) | -0.640217 | -0.628944 | 0.412642 |
8 | cafe | 7 | (Intercept) | 1.507154 | 1.516430 | 0.391119 |
9 | cafe | 8 | (Intercept) | -0.657831 | -0.659448 | 0.394984 |
10 | cafe | 9 | (Intercept) | 0.332758 | 0.331037 | 0.388295 |
11 | cafe | 10 | (Intercept) | -1.018611 | -1.025387 | 0.389930 |
12 | cafe | 11 | (Intercept) | 0.925071 | 0.913997 | 0.397095 |
13 | cafe | 12 | (Intercept) | -1.407149 | -1.403259 | 0.384820 |
14 | cafe | 13 | (Intercept) | -0.412975 | -0.414958 | 0.412863 |
15 | cafe | 14 | (Intercept) | 1.346380 | 1.343109 | 0.403694 |
16 | cafe | 15 | (Intercept) | 0.336807 | 0.346523 | 0.390567 |
17 | cafe | 16 | (Intercept) | 0.747439 | 0.735906 | 0.413094 |
18 | cafe | 17 | (Intercept) | -0.046579 | -0.035018 | 0.396795 |
19 | cafe | 18 | (Intercept) | 1.659019 | 1.646634 | 0.393909 |
20 | cafe | 19 | (Intercept) | 0.323375 | 0.327348 | 0.392401 |
21 | cafe | 0 | afternoon | 0.498557 | 0.501401 | 0.182594 |
22 | cafe | 1 | afternoon | -0.336036 | -0.337360 | 0.193462 |
23 | cafe | 2 | afternoon | 0.395379 | 0.391621 | 0.189140 |
24 | cafe | 3 | afternoon | 0.296956 | 0.293144 | 0.191710 |
25 | cafe | 4 | afternoon | 0.059611 | 0.055121 | 0.189680 |
26 | cafe | 5 | afternoon | -0.033068 | -0.036143 | 0.194723 |
27 | cafe | 6 | afternoon | 0.236107 | 0.237904 | 0.192575 |
28 | cafe | 7 | afternoon | -0.473485 | -0.479199 | 0.185549 |
29 | cafe | 8 | afternoon | 0.408039 | 0.411507 | 0.194145 |
30 | cafe | 9 | afternoon | -0.402131 | -0.393931 | 0.186868 |
31 | cafe | 10 | afternoon | 0.316072 | 0.309198 | 0.189218 |
32 | cafe | 11 | afternoon | -0.335749 | -0.340427 | 0.186644 |
33 | cafe | 12 | afternoon | 0.521558 | 0.519243 | 0.184606 |
34 | cafe | 13 | afternoon | -0.006800 | -0.014344 | 0.199548 |
35 | cafe | 14 | afternoon | -0.277165 | -0.281127 | 0.188748 |
36 | cafe | 15 | afternoon | -0.234501 | -0.235683 | 0.192804 |
37 | cafe | 16 | afternoon | -0.182673 | -0.185997 | 0.194017 |
38 | cafe | 17 | afternoon | -0.017126 | -0.023784 | 0.187302 |
39 | cafe | 18 | afternoon | -0.364424 | -0.364049 | 0.187532 |
40 | cafe | 19 | afternoon | -0.028883 | -0.032691 | 0.185824 |
OK, now let’s try the Bayesian approach and compare answers.
pymc
(Bayesian)n_cafes = df_cafes['cafe'].nunique()
cafe_idx = pd.Categorical(df_cafes["cafe"]).codes
with pm.Model() as m14_1:
    # Correlated varying effects: Cholesky-factored covariance with an
    # LKJ(eta=2) prior on the correlation matrix and Exponential(1) priors
    # on both standard deviations.
    # (can't specify a separate sigma_a and sigma_b for sd_dist but they're equivalent here)
    chol, Rho_, sigma_cafe = pm.LKJCholeskyCov(
        "chol_cov", n=2, eta=2, sd_dist=pm.Exponential.dist(1.0), compute_corr=True
    )
    a_bar = pm.Normal("a_bar", mu=5, sigma=2.0)  # prior for average intercept
    b_bar = pm.Normal("b_bar", mu=-1, sigma=0.5)  # prior for average slope
    # Population of varying effects. NOTE: the original used `at.stack`, but
    # `at` is never imported in this file (the header imports pytensor.tensor
    # as `pt`); pm.math.stack is backend-agnostic and works in either case.
    # shape needs to be (n_cafes, 2) because we're getting back both a and b for each cafe
    ab_subject = pm.MvNormal(
        "ab_subject", mu=pm.math.stack([a_bar, b_bar]), chol=chol, shape=(n_cafes, 2)
    )
    # linear model: cafe-specific intercept + cafe-specific slope * afternoon indicator
    mu = ab_subject[cafe_idx, 0] + ab_subject[cafe_idx, 1] * df_cafes["afternoon"].values
    # prior stddev within cafes (the residual/observation noise)
    sigma_within = pm.Exponential("sigma_within", 1.0)
    # likelihood
    wait = pm.Normal("wait", mu=mu, sigma=sigma_within, observed=df_cafes["wait"].values)
    idata_m14_1 = pm.sample(1000, target_accept=0.9)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [chol_cov, a_bar, b_bar, ab_subject, sigma_within]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 140 seconds.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
# Take a glimpse at the head and tail of the posterior summary table.
# Compute az.summary once (it is expensive — ESS/r_hat diagnostics over all
# parameters) instead of calling it twice as in the original.
summary_m14_1 = az.summary(idata_m14_1)
pd.concat(
    [
        summary_m14_1.head(10),
        summary_m14_1.tail(10),
    ]
)
/Users/blacar/opt/anaconda3/envs/pymc_env2/lib/python3.10/site-packages/arviz/stats/diagnostics.py:586: RuntimeWarning: invalid value encountered in double_scalars
(between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
/Users/blacar/opt/anaconda3/envs/pymc_env2/lib/python3.10/site-packages/arviz/stats/diagnostics.py:586: RuntimeWarning: invalid value encountered in double_scalars
(between_chain_variance / within_chain_variance + num_samples - 1) / (num_samples)
mean | sd | hdi_2.5% | hdi_97.5% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
a_bar | 3.654 | 0.223 | 3.203 | 4.074 | 0.003 | 0.002 | 4802.0 | 3140.0 | 1.0 |
b_bar | -1.049 | 0.109 | -1.265 | -0.844 | 0.002 | 0.001 | 3446.0 | 3200.0 | 1.0 |
ab_subject[0, 0] | 2.380 | 0.200 | 1.996 | 2.785 | 0.003 | 0.002 | 4271.0 | 2783.0 | 1.0 |
ab_subject[0, 1] | -0.587 | 0.245 | -1.071 | -0.119 | 0.004 | 0.003 | 3077.0 | 2833.0 | 1.0 |
ab_subject[1, 0] | 3.820 | 0.199 | 3.442 | 4.220 | 0.003 | 0.002 | 3988.0 | 3167.0 | 1.0 |
ab_subject[1, 1] | -1.402 | 0.248 | -1.897 | -0.945 | 0.004 | 0.003 | 3165.0 | 3182.0 | 1.0 |
ab_subject[2, 0] | 2.606 | 0.199 | 2.210 | 2.988 | 0.003 | 0.002 | 4702.0 | 3450.0 | 1.0 |
ab_subject[2, 1] | -0.681 | 0.240 | -1.156 | -0.218 | 0.004 | 0.003 | 3696.0 | 3014.0 | 1.0 |
ab_subject[3, 0] | 4.120 | 0.203 | 3.739 | 4.532 | 0.003 | 0.002 | 3475.0 | 2800.0 | 1.0 |
ab_subject[3, 1] | -0.707 | 0.266 | -1.213 | -0.184 | 0.005 | 0.004 | 2482.0 | 2921.0 | 1.0 |
chol_cov[0] | 0.988 | 0.163 | 0.710 | 1.328 | 0.002 | 0.002 | 5207.0 | 3263.0 | 1.0 |
chol_cov[1] | -0.226 | 0.105 | -0.442 | -0.033 | 0.002 | 0.001 | 2769.0 | 3178.0 | 1.0 |
chol_cov[2] | 0.299 | 0.093 | 0.120 | 0.481 | 0.002 | 0.002 | 1379.0 | 1308.0 | 1.0 |
sigma_within | 0.482 | 0.027 | 0.431 | 0.534 | 0.000 | 0.000 | 3773.0 | 2542.0 | 1.0 |
chol_cov_corr[0, 0] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000.0 | 4000.0 | NaN |
chol_cov_corr[0, 1] | -0.579 | 0.192 | -0.898 | -0.196 | 0.003 | 0.002 | 3196.0 | 2983.0 | 1.0 |
chol_cov_corr[1, 0] | -0.579 | 0.192 | -0.898 | -0.196 | 0.003 | 0.002 | 3196.0 | 2983.0 | 1.0 |
chol_cov_corr[1, 1] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4087.0 | 4000.0 | 1.0 |
chol_cov_stds[0] | 0.988 | 0.163 | 0.710 | 1.328 | 0.002 | 0.002 | 5207.0 | 3263.0 | 1.0 |
chol_cov_stds[1] | 0.386 | 0.107 | 0.182 | 0.605 | 0.003 | 0.002 | 1541.0 | 1201.0 | 1.0 |
lmer
and pymc
outputsWhile pymc
returns posterior estimates for each parameter, including $\rho$, for this post, we are interested in comparing the output comparable to the “fixed effects” and “varying effects” from lmer
. Having the equations above can help us piece together the relevant bits of information. The fixed intercept and slope are easy because we’ve used the same characters $\alpha$ and $\beta$ in equation set 2 as we did in Equation set 1.
However, when identifying the “varying effects”, we’ll have to do some arithmetic with the pymc
output. In contrast with the lmer
output, the pymc
outputs have the estimate for each cafe with “baked in” varying effects. In other words, the “offset” that we see in equation 7 ($a_{\text{cafe}[i]}$ and $b_{\text{cafe}[i]}$)
are already embedded in ($\alpha_{\text{cafe}[i]}$ and $\beta_{\text{cafe}[i]}$) in equation 9. We’ll have to therefore subtract out the fixed effects in the pymc
output before we can compare with the lmer
output. First, let’s get fixed effects from pymc
.
# Posterior summary of only the population-level ("fixed") intercept and slope,
# for direct comparison with the lmer fixed-effect estimates.
df_summary_int_and_slope = az.summary(idata_m14_1, var_names=['a_bar', 'b_bar'])
df_summary_int_and_slope
mean | sd | hdi_2.5% | hdi_97.5% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
a_bar | 3.654 | 0.223 | 3.203 | 4.074 | 0.003 | 0.002 | 4802.0 | 3140.0 | 1.0 |
b_bar | -1.049 | 0.109 | -1.265 | -0.844 | 0.002 | 0.001 | 3446.0 | 3200.0 | 1.0 |
These estimates and uncertainties compare well with the fixed estimates lmer
.
# Compare the fixed-effect estimates (intercept on the left axis, slope on the
# right) from pymc and lmer against the true values used to simulate the data.
f, (ax0, ax1) = plt.subplots(1, 2, figsize=(12,4))
# value to generate data
# a, average morning wait time was defined above
# NOTE(review): `a` and `b` are the simulation ground-truth values, presumably
# defined in the data-generation cell earlier in the post — not visible here.
ax0.vlines(x=a, ymin=0.8, ymax=1.2, linestyle='dashed', color='red')
ax1.vlines(x=b, ymin=0.8, ymax=1.2, linestyle='dashed', color='red', label='simulated value')
# pymc fixed effects value: posterior mean (dot) with 95% HDI (line) at y=1.1
ax0.scatter(df_summary_int_and_slope.loc['a_bar', 'mean'], 1.1, color='navy')
ax0.hlines(xmin=df_summary_int_and_slope.loc['a_bar', 'hdi_2.5%'], xmax=df_summary_int_and_slope.loc['a_bar', 'hdi_97.5%'], y=1.1, color='navy')
ax1.scatter(df_summary_int_and_slope.loc['b_bar', 'mean'], 1.1, color='navy')
ax1.hlines(xmin=df_summary_int_and_slope.loc['b_bar', 'hdi_2.5%'], xmax=df_summary_int_and_slope.loc['b_bar', 'hdi_97.5%'], y=1.1, color='navy', label='pymc estimate')
# lmer fixed effects estimate with 95% CI at y=0.9
# ('X2.5..'/'X97.5..' are R's confint column names as mangled by data.frame)
ax0.scatter(df_fe_summary.loc['(Intercept)', 'Estimate'], 0.9, color='darkgreen')
ax0.hlines(xmin=df_fe_summary.loc['(Intercept)', 'X2.5..'], xmax=df_fe_summary.loc['(Intercept)', 'X97.5..'], y=0.9, color='darkgreen')
ax1.scatter(df_fe_summary.loc['afternoon', 'Estimate'], 0.9, color='darkgreen')
ax1.hlines(xmin=df_fe_summary.loc['afternoon', 'X2.5..'], xmax=df_fe_summary.loc['afternoon', 'X97.5..'], y=0.9, color='darkgreen', label='lmer estimate')
# plot formatting
f.suptitle('Fixed effect estimates')
ax0.set_yticks([0.9, 1.1])
ax0.set_yticklabels(['lmer', 'pymc'])
ax1.set_yticks([0.9, 1.1])
ax1.set_yticklabels(['', ''])
ax0.set(xlabel='intercept')
ax1.set(xlabel='slope')
ax1.legend(fontsize=10)
plt.tight_layout()
/var/folders/tw/b9j0wcdj6_9cyljwt364lx7c0000gn/T/ipykernel_5516/1253574855.py:30: UserWarning: This figure was using constrained_layout, but that is incompatible with subplots_adjust and/or tight_layout; disabling constrained_layout.
plt.tight_layout()
As promised, here is the meme that rewards you for paying attention this far!
Now to get the varying effects from pymc
output, we’ll take each sample’s intercept and slope and subtract the fixed estimate.
# Flatten the InferenceData into one wide pandas DataFrame — columns are
# (group, variable, *coords) tuples — and take a glimpse at the first few rows
idata_m14_1_df = idata_m14_1.to_dataframe()
idata_m14_1_df.head()
chain | draw | (posterior, a_bar) | (posterior, b_bar) | (posterior, ab_subject[0,0], 0, 0) | (posterior, ab_subject[0,1], 0, 1) | (posterior, ab_subject[1,0], 1, 0) | (posterior, ab_subject[1,1], 1, 1) | (posterior, ab_subject[10,0], 10, 0) | (posterior, ab_subject[10,1], 10, 1) | (posterior, ab_subject[11,0], 11, 0) | (posterior, ab_subject[11,1], 11, 1) | (posterior, ab_subject[12,0], 12, 0) | (posterior, ab_subject[12,1], 12, 1) | (posterior, ab_subject[13,0], 13, 0) | (posterior, ab_subject[13,1], 13, 1) | (posterior, ab_subject[14,0], 14, 0) | (posterior, ab_subject[14,1], 14, 1) | (posterior, ab_subject[15,0], 15, 0) | (posterior, ab_subject[15,1], 15, 1) | ... | (log_likelihood, wait[97], 97) | (log_likelihood, wait[98], 98) | (log_likelihood, wait[99], 99) | (log_likelihood, wait[9], 9) | (sample_stats, tree_depth) | (sample_stats, max_energy_error) | (sample_stats, process_time_diff) | (sample_stats, perf_counter_diff) | (sample_stats, energy) | (sample_stats, step_size_bar) | (sample_stats, diverging) | (sample_stats, energy_error) | (sample_stats, lp) | (sample_stats, acceptance_rate) | (sample_stats, n_steps) | (sample_stats, largest_eigval) | (sample_stats, smallest_eigval) | (sample_stats, index_in_trajectory) | (sample_stats, step_size) | (sample_stats, perf_counter_start) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 3.397744 | -0.993140 | 2.353823 | -0.712216 | 3.936642 | -1.328451 | 2.497521 | -0.990675 | 4.589760 | -1.271864 | 2.272038 | -0.780358 | 3.400074 | -1.307487 | 4.660517 | -0.920542 | 3.967868 | -1.339014 | ... | -0.592594 | -0.280869 | -1.783441 | -0.404212 | 5 | -0.452539 | 0.234946 | 0.067260 | 194.679539 | 0.246795 | False | -0.226605 | -167.432037 | 0.975607 | 31.0 | NaN | NaN | -17 | 0.284311 | 192.355518 |
1 | 0 | 1 | 3.227032 | -1.105823 | 2.486742 | -0.657790 | 3.890044 | -1.788579 | 2.894867 | -0.741011 | 4.346072 | -1.048541 | 2.446301 | -0.678041 | 3.564795 | -1.520221 | 5.013627 | -1.128684 | 3.793134 | -1.084814 | ... | -0.581570 | -0.708670 | -1.709776 | -0.741664 | 4 | 0.498338 | 0.123327 | 0.033713 | 196.867266 | 0.246795 | False | 0.273832 | -177.694232 | 0.809115 | 15.0 | NaN | NaN | -8 | 0.284311 | 192.423125 |
2 | 0 | 2 | 3.393307 | -0.926431 | 2.348434 | -0.604619 | 3.905778 | -1.355137 | 2.712834 | -1.124770 | 4.409195 | -1.291088 | 2.324233 | -0.754508 | 3.586107 | -1.562165 | 5.050191 | -1.556993 | 4.122478 | -1.718417 | ... | -0.452885 | -0.109849 | -2.293094 | -0.559207 | 5 | -0.382814 | 0.236803 | 0.063232 | 207.926089 | 0.246795 | False | -0.347905 | -176.112370 | 0.968229 | 31.0 | NaN | NaN | 6 | 0.284311 | 192.457135 |
3 | 0 | 3 | 3.750943 | -1.109148 | 2.613325 | -0.667234 | 3.682009 | -1.293790 | 2.558511 | -0.362557 | 4.548968 | -1.266139 | 2.264383 | -0.445725 | 3.102086 | -0.903726 | 4.589499 | -0.409875 | 4.063760 | -1.249921 | ... | -1.239451 | -0.574010 | -0.906557 | -1.015460 | 4 | -0.530897 | 0.116930 | 0.037484 | 198.279760 | 0.246795 | False | -0.024171 | -180.489888 | 0.987683 | 15.0 | NaN | NaN | -9 | 0.284311 | 192.520656 |
4 | 0 | 4 | 3.416951 | -1.152993 | 2.478859 | -0.812085 | 3.773041 | -1.423143 | 2.136978 | -0.465100 | 4.385045 | -1.180823 | 2.160109 | -0.395771 | 3.459758 | -1.300131 | 5.527213 | -2.107117 | 3.906480 | -1.388326 | ... | -0.400278 | -0.240346 | -2.188396 | -0.471960 | 5 | -0.382498 | 0.241781 | 0.072736 | 207.993298 | 0.246795 | False | -0.041904 | -183.942618 | 0.999986 | 31.0 | NaN | NaN | -24 | 0.284311 | 192.558443 |
5 rows × 270 columns
# Get the "unbaked in" varying intercept and slope: subtract each posterior
# sample's fixed effect (a_bar / b_bar) from the cafe-specific draws, then
# average over samples. This makes the pymc output directly comparable to
# lmer's random-effect (offset) estimates.
bayesian_int = list()
bayesian_slope = list()
# iterate over all cafes; use n_cafes (defined when building the model)
# rather than the hard-coded 20 so this tracks the data
for i in range(n_cafes):
    varying_int = (
        idata_m14_1_df[('posterior', f'ab_subject[{i},0]', i, 0)]
        - idata_m14_1_df[('posterior', 'a_bar')]
    )
    idata_m14_1_df[f'varying_int_{i}'] = varying_int
    bayesian_int.append(varying_int.mean())

    varying_slope = (
        idata_m14_1_df[('posterior', f'ab_subject[{i},1]', i, 1)]
        - idata_m14_1_df[('posterior', 'b_bar')]
    )
    idata_m14_1_df[f'varying_slope_{i}'] = varying_slope
    bayesian_slope.append(varying_slope.mean())
We can now make a direct comparison between the lmer
and pymc
outputs. I’ll ignore the uncertainties for the sake of a cleaner plot.
# Per-cafe lmer means, split by term, for plotting against the pymc estimates
random_sims_int = random_sims.loc[random_sims['term']=='(Intercept)', 'mean'].copy()
random_sims_slope = random_sims.loc[random_sims['term']=='afternoon', 'mean'].copy()
f, (ax0, ax1) = plt.subplots(1, 2, figsize=(12, 4))
# shared min/max so the dashed y=x reference line spans all plotted points
min_max_int = [min(list(random_sims_int) + bayesian_int), max(list(random_sims_int) + bayesian_int)]
min_max_slope = [min(list(random_sims_slope) + bayesian_slope), max(list(random_sims_slope) + bayesian_slope)]
# intercepts: one point per cafe; points on the y=x line mean perfect agreement
ax0.scatter(random_sims_int, bayesian_int, facecolors='none', edgecolors='navy')
ax0.plot(min_max_int, min_max_int, linestyle='dashed', color='gray')
ax0.set(xlabel='lmer intercept estimates', ylabel='pymc intercept estimates', title='Comparison of varying intercepts')
# slopes: same comparison for the afternoon effect
ax1.scatter(random_sims_slope, bayesian_slope, facecolors='none', edgecolors='navy')
ax1.plot(min_max_slope, min_max_slope, linestyle='dashed', color='gray')
ax1.set(xlabel='lmer slope estimates', ylabel='pymc slope estimates', title='Comparison of varying slopes')
[Text(0.5, 0, 'lmer slope estimates'),
Text(0, 0.5, 'pymc slope estimates'),
Text(0.5, 1.0, 'Comparison of varying slopes')]
As you can see we get very similar intercepts and slopes for the cafe-specific estimates (varying effects) for the intercept and slope between the lmer
and pymc
approaches.
Here in this post, I set out to compare different mixed model approaches. I looked at the equations and the programmatic implementations. I concluded by showing how the two methods can arrive at the same answer. It required a careful understanding of the differences in equations and coding language- and package-specific implementations. There were various points of writing this post that confused me but it provided opportunities for deeper understanding.
Acknowledgements
References
lme4
package. Dr. McElreath referenced this package as a non-Bayesian alternative in his book.%load_ext watermark
%watermark -n -u -v -iv -w -p aesara,aeppl
The watermark extension is already loaded. To reload it, use:
%reload_ext watermark
Last updated: Tue Sep 13 2022
Python implementation: CPython
Python version : 3.10.6
IPython version : 8.4.0
aesara: 2.8.2
aeppl : 0.0.35
pymc : 4.1.7
xarray : 2022.6.0
pandas : 1.4.3
sys : 3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:43:44) [Clang 13.0.1 ]
arviz : 0.12.1
matplotlib: 3.5.3
aesara : 2.8.2
numpy : 1.23.2
Watermark: 2.3.1
%%R
sessionInfo()
R version 4.1.3 (2022-03-10)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Monterey 12.5.1
Matrix products: default
LAPACK: /Users/blacar/opt/anaconda3/envs/pymc_env2/lib/libopenblasp-r0.3.21.dylib
locale:
[1] C/UTF-8/C/C/C/C
attached base packages:
[1] tools stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] merTools_0.5.2 arm_1.13-1 MASS_7.3-58.1 lme4_1.1-30
[5] Matrix_1.4-1 forcats_0.5.2 stringr_1.4.1 dplyr_1.0.9
[9] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.8
[13] ggplot2_3.3.6 tidyverse_1.3.2
loaded via a namespace (and not attached):
[1] httr_1.4.4 jsonlite_1.8.0 splines_4.1.3
[4] foreach_1.5.2 modelr_0.1.9 shiny_1.7.2
[7] assertthat_0.2.1 broom.mixed_0.2.9.4 googlesheets4_1.0.1
[10] cellranger_1.1.0 globals_0.16.1 pillar_1.8.1
[13] backports_1.4.1 lattice_0.20-45 glue_1.6.2
[16] digest_0.6.29 promises_1.2.0.1 rvest_1.0.3
[19] minqa_1.2.4 colorspace_2.0-3 httpuv_1.6.5
[22] htmltools_0.5.3 pkgconfig_2.0.3 broom_1.0.0
[25] listenv_0.8.0 haven_2.5.1 xtable_1.8-4
[28] mvtnorm_1.1-3 scales_1.2.1 later_1.3.0
[31] tzdb_0.3.0 googledrive_2.0.0 farver_2.1.1
[34] generics_0.1.3 ellipsis_0.3.2 withr_2.5.0
[37] furrr_0.3.1 cli_3.3.0 mime_0.12
[40] magrittr_2.0.3 crayon_1.5.1 readxl_1.4.1
[43] fs_1.5.2 future_1.27.0 fansi_1.0.3
[46] parallelly_1.32.1 nlme_3.1-159 xml2_1.3.3
[49] hms_1.1.2 gargle_1.2.0 lifecycle_1.0.1
[52] munsell_0.5.0 reprex_2.0.2 compiler_4.1.3
[55] rlang_1.0.4 blme_1.0-5 grid_4.1.3
[58] nloptr_2.0.3 iterators_1.0.14 labeling_0.4.2
[61] boot_1.3-28 gtable_0.3.0 codetools_0.2-18
[64] abind_1.4-5 DBI_1.1.3 R6_2.5.1
[67] lubridate_1.8.0 fastmap_1.1.0 utf8_1.2.2
[70] stringi_1.7.8 parallel_4.1.3 Rcpp_1.0.9
[73] vctrs_0.4.1 dbplyr_2.2.1 tidyselect_1.1.2
[76] coda_0.19-4