Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ See the [python-environment skill](.github/skills/python-environment/SKILL.md) f
- **Model-agnostic design**: Experiment classes should work with both PyMC and scikit-learn models. Use `isinstance(self.model, PyMCModel)` vs `isinstance(self.model, RegressorMixin)` to dispatch to appropriate implementations
- **Model classes**: PyMC models inherit from `PyMCModel` (extends `pm.Model`). Scikit-learn models use `RegressorMixin` and are made compatible via `create_causalpy_compatible_class()`. Common interface: `fit()`, `predict()`, `score()`, `calculate_impact()`, `print_coefficients()`
- **Data handling**: PyMC models use `xarray.DataArray` with coords (keys like "coeffs", "obs_ind", "treated_units"). Scikit-learn models use numpy arrays. Data index should be named "obs_ind"
- **Formulas**: Use patsy for formula parsing (via `dmatrices()`)
- **Formulas**: Use formulaic for formula parsing (via `model_matrix()`)
- **Custom exceptions**: Use project-specific exceptions from `causalpy.custom_exceptions`: `FormulaException`, `DataException`, `BadIndexException`
- **File organization**: Experiments in `causalpy/experiments/`, PyMC models in `causalpy/pymc_models.py`, scikit-learn models in `causalpy/skl_models.py`
- **Backwards compatibility**: Avoid preserving backwards compatibility for API elements introduced within the same PR; only maintain compatibility for previously released APIs.
Expand Down
37 changes: 20 additions & 17 deletions causalpy/experiments/diff_in_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
import pandas as pd
import seaborn as sns
import xarray as xr
from formulaic import model_matrix
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.constants import LEGEND_FONT_SIZE
Expand Down Expand Up @@ -124,13 +124,12 @@ def __init__(
self.algorithm()

def _build_design_matrices(self) -> None:
"""Build design matrices from formula and data using patsy."""
y, X = dmatrices(self.formula, self.data)
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
"""Build design matrices from formula and data using formulaic."""
dm = model_matrix(self.formula, self.data)
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

def _prepare_data(self) -> None:
"""Convert design matrices to xarray DataArrays."""
Expand Down Expand Up @@ -182,8 +181,10 @@ def algorithm(self) -> None:
)
if self.x_pred_control.empty:
raise ValueError("x_pred_control is empty")
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred_control)
self.y_pred_control = self.model.predict(np.asarray(new_x))
new_x = model_matrix(
spec=self.rhs_matrix_spec, data=self.x_pred_control
).to_numpy()
self.y_pred_control = self.model.predict(new_x)

# predicted outcome for treatment group
self.x_pred_treatment = (
Expand All @@ -199,8 +200,10 @@ def algorithm(self) -> None:
)
if self.x_pred_treatment.empty:
raise ValueError("x_pred_treatment is empty")
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred_treatment)
self.y_pred_treatment = self.model.predict(np.asarray(new_x))
new_x = model_matrix(
spec=self.rhs_matrix_spec, data=self.x_pred_treatment
).to_numpy()
self.y_pred_treatment = self.model.predict(new_x)

# predicted outcome for counterfactual. This is given by removing the influence
# of the interaction term between the group and the post_treatment variable
Expand All @@ -219,18 +222,18 @@ def algorithm(self) -> None:
)
if self.x_pred_counterfactual.empty:
raise ValueError("x_pred_counterfactual is empty")
(new_x,) = build_design_matrices(
[self._x_design_info], self.x_pred_counterfactual, return_type="dataframe"
)
new_x = model_matrix(
spec=self.rhs_matrix_spec, data=self.x_pred_counterfactual
).to_numpy()
# INTERVENTION: set the interaction term between the group and the
# post_treatment variable to zero. This is the counterfactual.
for i, label in enumerate(self.labels):
if (
self.post_treatment_variable_name in label
and self.group_variable_name in label
):
new_x.iloc[:, i] = 0
self.y_pred_counterfactual = self.model.predict(np.asarray(new_x))
new_x[:, i] = 0
self.y_pred_counterfactual = self.model.predict(new_x)

# calculate causal impact
if isinstance(self.model, PyMCModel):
Expand Down
30 changes: 13 additions & 17 deletions causalpy/experiments/instrumental_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import numpy as np
import pandas as pd
from patsy import dmatrices
from formulaic import model_matrix
from sklearn.linear_model import LinearRegression as sk_lin_reg

from causalpy.custom_exceptions import DataException
Expand Down Expand Up @@ -148,19 +148,17 @@ def __init__(

def _build_design_matrices(self) -> None:
"""Build design matrices for outcome and instrument formulas."""
y, X = dmatrices(self.formula, self.data)
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
dm = model_matrix(self.formula, self.data)
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

t, Z = dmatrices(self.instruments_formula, self.instruments_data)
self._t_design_info = t.design_info
self._z_design_info = Z.design_info
self.labels_instruments = Z.design_info.column_names
self.t, self.Z = np.asarray(t), np.asarray(Z)
self.instrument_variable_name = t.design_info.column_names[0]
dm = model_matrix(self.instruments_formula, self.instruments_data)
self.labels_instruments = list(dm.rhs.columns)
self.t, self.Z = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.instrument_rhs_matrix_spec = dm.rhs.model_spec
self.instrument_variable_name = dm.lhs.columns[0]

def algorithm(self) -> None:
"""Run the experiment algorithm: fit OLS, 2SLS, and Bayesian IV model."""
Expand Down Expand Up @@ -234,7 +232,7 @@ def get_2SLS_fit(self) -> None:
fitted_Z_values = first_stage_reg.predict(self.Z)
X2 = self.data.copy(deep=True)
X2[self.instrument_variable_name] = fitted_Z_values
_, X2 = dmatrices(self.formula, X2)
X2 = model_matrix(self.formula, X2).rhs.to_numpy()
second_stage_reg = sk_lin_reg().fit(X=X2, y=self.y)
betas_first = list(first_stage_reg.coef_[0][1:])
betas_first.insert(0, first_stage_reg.intercept_[0])
Expand All @@ -254,9 +252,7 @@ def get_naive_OLS_fit(self) -> None:
ols_reg = sk_lin_reg().fit(self.X, self.y)
beta_params = list(ols_reg.coef_[0][1:])
beta_params.insert(0, ols_reg.intercept_[0])
self.ols_beta_params = dict(
zip(self._x_design_info.column_names, beta_params, strict=False)
)
self.ols_beta_params = dict(zip(self.labels, beta_params, strict=False))
self.ols_reg = ols_reg

def plot(self, *args, **kwargs) -> None: # type: ignore[override]
Expand Down
25 changes: 11 additions & 14 deletions causalpy/experiments/interrupted_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
import numpy as np
import pandas as pd
import xarray as xr
from formulaic import model_matrix
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.constants import HDI_PROB, LEGEND_FONT_SIZE
Expand Down Expand Up @@ -56,7 +56,7 @@ class InterruptedTimeSeries(BaseExperiment):
**INCLUSIVE**: Observations at exactly ``treatment_time`` are included in the
post-intervention period (uses ``>=`` comparison).
formula : str
A statistical model formula using patsy syntax (e.g., "y ~ 1 + t + C(month)").
A statistical model formula using formulaic syntax (e.g., "y ~ 1 + t + C(month)").
model : Union[PyMCModel, RegressorMixin], optional
A PyMC (Bayesian) or sklearn (OLS) model. If None, defaults to a PyMC
LinearRegression model.
Expand Down Expand Up @@ -154,23 +154,20 @@ def __init__(
self.algorithm()

def _build_design_matrices(self) -> None:
"""Build design matrices for pre and post intervention periods using patsy."""
"""Build design matrices for pre and post intervention periods using formulaic."""
# set things up with pre-intervention data
y, X = dmatrices(self.formula, self.datapre)
self.outcome_variable_name = y.design_info.column_names[0]
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
dm = model_matrix(self.formula, self.datapre)
Comment thread
ksolarski marked this conversation as resolved.
self.labels = list(dm.rhs.columns)
self.matrix_spec = dm.model_spec
self.outcome_variable_name = dm.lhs.columns[0]
self.pre_y, self.pre_X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
# process post-intervention data
(new_y, new_x) = build_design_matrices(
[self._y_design_info, self._x_design_info], self.datapost
)
self.post_X = np.asarray(new_x)
self.post_y = np.asarray(new_y)
dm_post = model_matrix(self.matrix_spec, self.datapost)
self.post_y, self.post_X = (dm_post.lhs.to_numpy(), dm_post.rhs.to_numpy())

def _prepare_data(self) -> None:
"""Convert design matrices to xarray DataArrays for pre and post periods."""
# turn into xarray.DataArray's
self.pre_X = xr.DataArray(
self.pre_X,
dims=["obs_ind", "coeffs"],
Expand Down
20 changes: 10 additions & 10 deletions causalpy/experiments/inverse_propensity_weighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from formulaic import model_matrix
from matplotlib.lines import Line2D
from patsy import dmatrices
from sklearn.linear_model import LinearRegression as sk_lin_reg

from causalpy.custom_exceptions import DataException
Expand Down Expand Up @@ -99,16 +99,16 @@ def __init__(
def _build_design_matrices(self) -> None:
"""Build design matrices for the propensity score model.

Uses the patsy formula stored in ``self.formula`` to construct treatment
and covariate matrices from ``self.data``. Also creates an augmented
design matrix (``self.X_outcome``) that includes the treatment indicator,
for use with doubly-robust outcome modelling.
Uses the formulaic formula stored in ``self.formula`` to construct
treatment and covariate matrices from ``self.data``. Also creates an
augmented design matrix (``self.X_outcome``) that includes the treatment
indicator, for use with doubly-robust outcome modelling.
"""
t, X = dmatrices(self.formula, self.data)
self._t_design_info = t.design_info
self._t_design_info = X.design_info
self.labels = X.design_info.column_names
self.t, self.X = np.asarray(t), np.asarray(X)
dm = model_matrix(self.formula, self.data)
self.labels = list(dm.rhs.columns)
self.t, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.treatment_variable_name = dm.lhs.columns[0]
self.y = self.data[self.outcome_variable]

COORDS = {"obs_ind": list(range(self.X.shape[0])), "coeffs": self.labels}
Expand Down
4 changes: 2 additions & 2 deletions causalpy/experiments/panel_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ def get_plot_data_bayesian(self, **kwargs: Any) -> pd.DataFrame:
"""
# Get posterior predictions
if isinstance(self.model, PyMCModel):
mu = self.model.idata.posterior["mu"] # type: ignore[attr-defined]
mu = self.model.idata.posterior["mu"] # type: ignore[union-attr]
pred_mean = mu.mean(dim=["chain", "draw"]).values.flatten()
pred_lower = mu.quantile(0.025, dim=["chain", "draw"]).values.flatten()
pred_upper = mu.quantile(0.975, dim=["chain", "draw"]).values.flatten()
Expand Down Expand Up @@ -673,7 +673,7 @@ def plot_unit_effects(

if isinstance(self.model, PyMCModel):
# Bayesian: get posterior means
beta = self.model.idata.posterior["beta"] # type: ignore[attr-defined]
beta = self.model.idata.posterior["beta"] # type: ignore[union-attr]
unit_fe_indices = [self.labels.index(name) for name in unit_fe_names]

# Get mean and std for each unit FE
Expand Down
29 changes: 15 additions & 14 deletions causalpy/experiments/prepostnegd.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
import pandas as pd
import seaborn as sns
import xarray as xr
from formulaic import model_matrix
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from sklearn.base import RegressorMixin

from causalpy.constants import HDI_PROB, LEGEND_FONT_SIZE
Expand Down Expand Up @@ -114,13 +114,12 @@ def __init__(
self.algorithm()

def _build_design_matrices(self) -> None:
"""Build design matrices from formula and data using patsy."""
y, X = dmatrices(self.formula, self.data)
self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
"""Build design matrices from formula and data using formulaic."""
dm = model_matrix(self.formula, self.data)
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

def _prepare_data(self) -> None:
"""Convert design matrices to xarray DataArrays."""
Expand Down Expand Up @@ -169,19 +168,21 @@ def algorithm(self) -> None:
self.group_variable_name: np.zeros(self.pred_xi.shape),
}
)
(new_x_untreated,) = build_design_matrices(
[self._x_design_info], x_pred_untreated
)
self.pred_untreated = self.model.predict(X=np.asarray(new_x_untreated))
new_x_untreated = model_matrix(
spec=self.rhs_matrix_spec, data=x_pred_untreated
).to_numpy()
self.pred_untreated = self.model.predict(X=new_x_untreated)
# treated
x_pred_treated = pd.DataFrame(
{
self.pretreatment_variable_name: self.pred_xi,
self.group_variable_name: np.ones(self.pred_xi.shape),
}
)
(new_x_treated,) = build_design_matrices([self._x_design_info], x_pred_treated)
self.pred_treated = self.model.predict(X=np.asarray(new_x_treated))
new_x_treated = model_matrix(
spec=self.rhs_matrix_spec, data=x_pred_treated
).to_numpy()
self.pred_treated = self.model.predict(X=new_x_treated)

# Evaluate causal impact as equal to the treatment effect
self.causal_impact = self.model.idata.posterior["beta"].sel(
Expand Down
21 changes: 10 additions & 11 deletions causalpy/experiments/regression_discontinuity.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from patsy import build_design_matrices, dmatrices
from formulaic import model_matrix
from sklearn.base import RegressorMixin
import xarray as xr
from causalpy.custom_exceptions import (
Expand Down Expand Up @@ -149,13 +149,12 @@ def _build_design_matrices(self) -> None:
msg = f"Only {len(self.fit_data)} datapoints in the dataset."
warnings.warn(msg, UserWarning, stacklevel=2)

y, X = dmatrices(self.formula, self.fit_data)
dm = model_matrix(self.formula, self.fit_data)

self._y_design_info = y.design_info
self._x_design_info = X.design_info
self.labels = X.design_info.column_names
self.y, self.X = np.asarray(y), np.asarray(X)
self.outcome_variable_name = y.design_info.column_names[0]
self.labels = list(dm.rhs.columns)
self.y, self.X = (dm.lhs.to_numpy(), dm.rhs.to_numpy())
self.rhs_matrix_spec = dm.rhs.model_spec
self.outcome_variable_name = dm.lhs.columns[0]

def _prepare_data(self) -> None:
"""Convert design matrices to xarray DataArrays."""
Expand Down Expand Up @@ -206,8 +205,8 @@ def algorithm(self) -> None:
self.x_pred = pd.DataFrame(
{self.running_variable_name: xi, "treated": self._is_treated(xi)}
)
(new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
self.pred = self.model.predict(X=np.asarray(new_x))
new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_pred).to_numpy()
self.pred = self.model.predict(X=new_x)

# calculate discontinuity by evaluating the difference in model expectation on
# either side of the discontinuity
Expand All @@ -224,8 +223,8 @@ def algorithm(self) -> None:
"treated": np.array([0, 1]),
}
)
(new_x,) = build_design_matrices([self._x_design_info], self.x_discon)
self.pred_discon = self.model.predict(X=np.asarray(new_x))
new_x = model_matrix(spec=self.rhs_matrix_spec, data=self.x_discon).to_numpy()
self.pred_discon = self.model.predict(X=new_x)

# ******** THIS IS SUBOPTIMAL AT THE MOMENT ************************************
if isinstance(self.model, PyMCModel):
Expand Down
Loading
Loading