py-econometrics · leostimpfle · Dec 28, 2025 · Dec 28, 2025 · Dec 28, 2025 · Dec 28, 2025
diff --git a/.gitignore b/.gitignore
@@ -42,3 +42,5 @@ coverage.xml
 # pixi environments
 .pixi/*
 !.pixi/config.toml
+SKILL.md
+CLAUDE.md
diff --git a/docs/_quarto.yml b/docs/_quarto.yml
@@ -119,6 +119,13 @@ quartodoc:
         - report.coefplot
         - report.iplot
         - did.visualize.panelview
+    - title: Formula Parsing & Model Matrix
+      desc: |
+        Internal APIs for formula parsing and model matrix construction
+      contents:
+        - estimation.formula.parse.Formula
+        - estimation.formula.model_matrix.ModelMatrix
+        - estimation.formula.factor_interaction.factor_interaction
     - title: Misc / Utilities
       desc: |
         PyFixest internals and utilities

diff --git a/docs/_sidebar.yml b/docs/_sidebar.yml
@@ -34,6 +34,12 @@ website:
       - reference/report.iplot.qmd
       - reference/did.visualize.panelview.qmd
       section: Summarize and Visualize
+    - contents:
+      - reference/estimation.formula.parse.Formula.qmd
+      - reference/estimation.formula.parse.parse.qmd
+      - reference/estimation.formula.model_matrix.ModelMatrix.qmd
+      - reference/estimation.formula.factor_interaction.factor_interaction.qmd
+      section: Formula Parsing & Model Matrix
     - contents:
       - reference/estimation.demean.qmd
       - reference/estimation.detect_singletons.qmd

diff --git a/docs/quickstart.qmd b/docs/quickstart.qmd
@@ -507,7 +507,7 @@ multi_fit.etable()
 You can access an individual model by its name - i.e. a formula - via the `all_fitted_models` attribute.
 
 ```{python}
-multi_fit.all_fitted_models["Y~X1"].tidy()
+multi_fit.all_fitted_models["Y ~ X1"].tidy()
 ```
 
 or equivalently via the `fetch_model` method:

diff --git a/pyfixest/did/did2s.py b/pyfixest/did/did2s.py
@@ -8,8 +8,8 @@
 from pyfixest.did.did import DID
 from pyfixest.estimation import feols
 from pyfixest.estimation.feols_ import Feols
-from pyfixest.estimation.FormulaParser import FixestFormulaParser
-from pyfixest.estimation.model_matrix_fixest_ import model_matrix_fixest
+from pyfixest.estimation.formula import model_matrix
+from pyfixest.estimation.formula.parse import Formula
 
 
 class DID2S(DID):
@@ -304,37 +304,48 @@ def _did2s_vcov(
 
     # some formula parsing to get the correct formula for the first and second stage model matrix
     first_stage_x, first_stage_fe = first_stage.split("|")
-    first_stage_fe_list = [f"C({i})" for i in first_stage_fe.split("+")]
+    first_stage_fe_list = [f"C({i.strip()})" for i in first_stage_fe.split("+")]
     first_stage_fe_fml = "+".join(first_stage_fe_list)
-    first_stage = f"{first_stage_x}+{first_stage_fe_fml}"
-
-    second_stage = f"{second_stage}"
+    first_stage_fml = f"{first_stage_x}+{first_stage_fe_fml}"
 
     # note for future Alex: intercept needs to be dropped! it is not as fixed
     # effects are converted to dummies, hence has_fixed checks are False
 
-    FML1 = FixestFormulaParser(f"{yname} {first_stage}")
-    FML2 = FixestFormulaParser(f"{yname} {second_stage}")
-    FixestFormulaDict1 = FML1.FixestFormulaDict
-    FixestFormulaDict2 = FML2.FixestFormulaDict
+    # Create Formula objects for the new model_matrix system.
+    # First stage: use `- 1` so that C() dummy encoding keeps all levels,
+    # matching the feols demeaning approach (which implicitly includes all
+    # fixed-effect levels). Removing `- 1` would cause formulaic to drop
+    # reference levels, changing the GMM vcov standard errors.
+    FML1 = Formula(
+        _second_stage=f"{yname} ~ {first_stage_fml.replace('~', '').strip()} - 1",
+    )
+    # Second stage: do NOT use `- 1`. Formulaic needs the intercept present
+    # for full-rank encoding (dropping a reference level for factors like
+    # i(treat)). The intercept column is then removed by drop_intercept=True
+    # below, matching what feols does in _did2s_estimate.
+    FML2 = Formula(
+        _second_stage=f"{yname} ~ {second_stage.replace('~', '').strip()}",
+    )
 
-    mm_dict_first_stage = model_matrix_fixest(
-        FixestFormula=next(iter(FixestFormulaDict1.values()))[0],
+    mm_first_stage = model_matrix.create_model_matrix(
+        formula=FML1,
         data=data,
         weights=None,
         drop_singletons=False,
-        drop_intercept=False,
+        ensure_full_rank=True,
+        drop_intercept=True,
     )
-    X1 = cast(pd.DataFrame, mm_dict_first_stage.get("X"))
+    X1 = mm_first_stage.independent
 
-    mm_second_stage = model_matrix_fixest(
-        FixestFormula=next(iter(FixestFormulaDict2.values()))[0],
+    mm_second_stage = model_matrix.create_model_matrix(
+        formula=FML2,
         data=data,
         weights=None,
         drop_singletons=False,
+        ensure_full_rank=True,
         drop_intercept=True,
-    )  # reference values not dropped, multicollinearity error
-    X2 = cast(pd.DataFrame, mm_second_stage.get("X"))
+    )
+    X2 = mm_second_stage.independent
 
     X1 = csr_matrix(X1.to_numpy() * weights_array[:, None])
     X2 = csr_matrix(X2.to_numpy() * weights_array[:, None])
@@ -359,10 +370,7 @@ def _did2s_vcov(
     X10 = X10.tocsr()
     X2 = X2.tocsr()  # type: ignore
 
-    for (
-        _,
-        g,
-    ) in enumerate(clustid):
+    for _, g in enumerate(clustid):
         idx_g: np.ndarray = cluster_col.values == g
         X10g = X10[idx_g, :]
         X2g = X2[idx_g, :]

diff --git a/pyfixest/did/saturated_twfe.py b/pyfixest/did/saturated_twfe.py
@@ -203,15 +203,14 @@ def aggregate(
         treated_periods = list(period_set)
 
         df_agg = pd.DataFrame(
-            index=treated_periods,
+            index=pd.Index(treated_periods, name="period"),
             columns=["Estimate", "Std. Error", "t value", "Pr(>|t|)", "2.5%", "97.5%"],
         )
-        df_agg.index.name = "period"
 
         for period in treated_periods:
             R = np.zeros(len(coefs))
             for cohort in cohort_list:
-                cohort_pattern = rf"\[{re.escape(str(period))}\]:.*{re.escape(cohort)}$"
+                cohort_pattern = rf"^(?:.+)::{period}:(?:.+)::{cohort}$"
                 match_idx = [
                     i
                     for i, name in enumerate(coefnames)
@@ -319,28 +318,20 @@ def _saturated_event_study(
     unit_id: str,
     cluster: Optional[str] = None,
 ):
-    cohort_dummies = pd.get_dummies(
-        df.first_treated_period, drop_first=True, prefix="cohort_dummy"
+    ff = f"{outcome} ~ i(rel_time, first_treated_period, ref = -1.0, ref2=0.0) | {unit_id} + {time_id}"
+    m = feols(fml=ff, data=df, vcov={"CRV1": cluster})  # type: ignore
+    res = m.tidy().reset_index()
+    res = res.join(
+        res["Coefficient"].str.extract(
+            r".+::(?P<time>.+):.+::(?P<cohort>.+)", expand=True
+        )
     )
-    df_int = pd.concat([df, cohort_dummies], axis=1)
-
-    ff = f"""
-                {outcome} ~
-                {"+".join([f"i(rel_time, {x}, ref = -1.0)" for x in cohort_dummies.columns.tolist()])}
-                | {unit_id} + {time_id}
-                """
-    m = feols(fml=ff, data=df_int, vcov={"CRV1": cluster})  # type: ignore
-    res = m.tidy()
+    res["time"] = res["time"].astype(float)
     # create a dict with cohort specific effect curves
     res_cohort_eventtime_dict: dict[str, dict[str, pd.DataFrame | np.ndarray]] = {}
-    for cohort in cohort_dummies.columns:
-        res_cohort = res.filter(like=cohort, axis=0)
-        event_time = (
-            res_cohort.index.str.extract(r"\[(?:T\.)?(-?\d+(?:\.\d+)?)\]")
-            .astype(float)
-            .values.flatten()
-        )
-        res_cohort_eventtime_dict[cohort] = {"est": res_cohort, "time": event_time}
+    for cohort, res_cohort in res.groupby("cohort"):
+        event_time = res_cohort["time"].to_numpy()
+        res_cohort_eventtime_dict[str(cohort)] = {"est": res_cohort, "time": event_time}
 
     return m, res_cohort_eventtime_dict
 
@@ -366,11 +357,10 @@ def _test_treatment_heterogeneity(
     """
     mmres = model.tidy().reset_index()
     P = mmres.shape[0]
-    mmres[["time", "cohort"]] = mmres.Coefficient.str.split(":", expand=True)
-    mmres["time"] = mmres.time.str.extract(r"\[(?:T\.)?(-?\d+(?:\.\d+)?)\]").astype(
-        float
+    mmres[["time", "cohort"]] = mmres["Coefficient"].str.extract(
+        r".+::(?P<time>.+):.+::(?P<cohort>.+)", expand=True
     )
-    mmres["cohort"] = mmres.cohort.str.extract(r"(\d+)")
+    mmres["time"] = mmres["time"].astype(float)
     # indices of coefficients that are deviations from common event study coefs
     event_study_coefs = mmres.loc[~(mmres.cohort.isna()) & (mmres.time > 0)].index
     # Method 2 (K x P) - more efficient

diff --git a/pyfixest/errors/__init__.py b/pyfixest/errors/__init__.py
@@ -58,6 +58,10 @@ class EmptyVcovError(Exception):  # noqa: D101
     pass
 
 
+class FormulaSyntaxError(Exception):  # noqa: D101
+    pass
+
+
 __all__ = [
     "CovariateInteractionError",
     "DepvarIsNotNumericError",
@@ -67,6 +71,7 @@ class EmptyVcovError(Exception):  # noqa: D101
     "EndogVarsAsCovarsError",
     "FeatureDeprecationError",
     "FixedEffectInteractionError",
+    "FormulaSyntaxError",
     "InstrumentsAsCovarsError",
     "MatrixNotFullRankError",
     "NanInClusterVarError",

diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py
@@ -12,7 +12,7 @@
 from pyfixest.estimation.feols_compressed_ import FeolsCompressed
 from pyfixest.estimation.fepois_ import Fepois
 from pyfixest.estimation.feprobit_ import Feprobit
-from pyfixest.estimation.FormulaParser import FixestFormulaParser
+from pyfixest.estimation.formula.parse import Formula
 from pyfixest.estimation.literals import (
     DemeanerBackendOptions,
     QuantregMethodOptions,
@@ -214,7 +214,6 @@ def _prepare_estimation(
         self._ssc_dict: dict[str, Union[str, bool]] = {}
         self._drop_singletons = False
         self._is_multiple_estimation = False
-        self._drop_intercept = False
         self._weights = weights
         self._has_weights = False
         if weights is not None:
@@ -225,16 +224,19 @@ def _prepare_estimation(
         self._quantile_tol = quantile_tol
         self._quantile_maxiter = quantile_maxiter
 
-        FML = FixestFormulaParser(fml)
-        FML.set_fixest_multi_flag()
+        formula_dictionary = Formula.parse_to_dict(fml)
         self._is_multiple_estimation = (
-            FML._is_multiple_estimation
+            sum(len(v) for v in formula_dictionary.values()) > 1
             or self._run_split
             or (isinstance(quantile, list) and len(quantile) > 1)
         )
-        self.FixestFormulaDict = FML.FixestFormulaDict
+        self.FixestFormulaDict = formula_dictionary
         self._method = estimation
-        self._is_iv = FML.is_iv
+        self._is_iv = any(
+            formula.first_stage is not None
+            for _, formulas in formula_dictionary.items()
+            for formula in formulas
+        )
         # self._fml_dict = fxst_fml.condensed_fml_dict
         # self._fml_dict_iv = fxst_fml.condensed_fml_dict_iv
         self._ssc_dict = ssc if ssc is not None else {}
@@ -299,9 +301,9 @@ def _estimate_all_models(
             for _, fval in enumerate(_fixef_keys):
                 fixef_key_models = FixestFormulaDict.get(fval)
 
-                # dictionary to cache demeaned data with index: na_index_str,
+                # cache of demeaned data, keyed by na_index (the dropped row indices),
                 # only relevant for `.feols()`
-                lookup_demeaned_data: dict[str, pd.DataFrame] = {}
+                lookup_demeaned_data: dict[frozenset[int], pd.DataFrame] = {}
 
                 for FixestFormula in fixef_key_models:  # type: ignore
                     # loop over both dictfe and dictfe_iv (if the latter is not None)
@@ -430,7 +432,7 @@ def _estimate_all_models(
                     # if X is empty: no inference (empty X only as shorthand for demeaning)
                     if not FIT._X_is_empty:
                         # inference
-                        vcov_type = _get_vcov_type(vcov, fval)
+                        vcov_type = _get_vcov_type(vcov)
                         FIT.vcov(
                             vcov=vcov_type,
                             vcov_kwargs=vcov_kwargs,

diff --git a/pyfixest/estimation/FormulaParser.py b/pyfixest/estimation/FormulaParser.py
@@ -1,4 +1,5 @@
 import re
+import warnings
 from itertools import product
 from typing import Optional, Union
 
@@ -41,6 +42,14 @@ def __init__(self, fml: str):
             None
 
         """
+        warnings.warn(
+            "FixestFormulaParser is deprecated and will be removed in a future version. "
+            "Use `pyfixest.estimation.formula.parse.parse()` instead. "
+            "See https://py-econometrics.github.io/pyfixest/reference/estimation.formula.parse.parse.html",
+            FutureWarning,
+            stacklevel=2,
+        )
+
         depvars, covars, fevars, endogvars, instruments = _deparse_fml(fml)
 
         # Parse all individual formula components that allow for

diff --git a/pyfixest/estimation/demean_.py b/pyfixest/estimation/demean_.py
@@ -12,8 +12,8 @@ def demean_model(
     X: pd.DataFrame,
     fe: Optional[pd.DataFrame],
     weights: Optional[np.ndarray],
-    lookup_demeaned_data: dict[str, Any],
-    na_index_str: str,
+    lookup_demeaned_data: dict[frozenset[int], Any],
+    na_index: frozenset[int],
     fixef_tol: float,
     fixef_maxiter: int,
     demean_func: Callable,
@@ -42,9 +42,9 @@ def demean_model(
         A dictionary with keys for each fixed effects combination and potentially
         values of demeaned data frames. The function checks this dictionary to
         see if some of the variables have already been demeaned.
-    na_index_str : str
-        A string with indices of dropped columns. Used for caching of demeaned
-        variables.
+    na_index : frozenset[int]
+        A frozenset of indices of dropped rows. Used as a hashable cache key
+        for demeaned variables.
     fixef_tol: float
         The tolerance for the demeaning algorithm.
     fixef_maxiter: int
@@ -79,9 +79,9 @@ def demean_model(
     if fe is not None:
         fe_array = fe.to_numpy()
         # check if looked dict has data for na_index
-        if lookup_demeaned_data.get(na_index_str) is not None:
+        if lookup_demeaned_data.get(na_index) is not None:
             # get data out of lookup table: list of [algo, data]
-            value = lookup_demeaned_data.get(na_index_str)
+            value = lookup_demeaned_data.get(na_index)
             if value is not None:
                 try:
                     _, YX_demeaned_old = value
@@ -146,7 +146,7 @@ def demean_model(
             YX_demeaned = pd.DataFrame(YX_demeaned)
             YX_demeaned.columns = yx_names
 
-        lookup_demeaned_data[na_index_str] = [None, YX_demeaned]
+        lookup_demeaned_data[na_index] = [None, YX_demeaned]
 
     else:
         # nothing to demean here

diff --git a/pyfixest/estimation/fegaussian_.py b/pyfixest/estimation/fegaussian_.py
@@ -5,7 +5,7 @@
 import pandas as pd
 
 from pyfixest.estimation.feglm_ import Feglm
-from pyfixest.estimation.FormulaParser import FixestFormula
+from pyfixest.estimation.formula.parse import Formula as FixestFormula
 from pyfixest.estimation.literals import DemeanerBackendOptions
 
 
@@ -24,7 +24,7 @@ def __init__(
         collin_tol: float,
         fixef_tol: float,
         fixef_maxiter: int,
-        lookup_demeaned_data: dict[str, pd.DataFrame],
+        lookup_demeaned_data: dict[frozenset[int], pd.DataFrame],
         tol: float,
         maxiter: int,
         solver: Literal[

diff --git a/pyfixest/estimation/feglm_.py b/pyfixest/estimation/feglm_.py
@@ -16,7 +16,7 @@
     _drop_multicollinear_variables,
 )
 from pyfixest.estimation.fepois_ import _check_for_separation
-from pyfixest.estimation.FormulaParser import FixestFormula
+from pyfixest.estimation.formula.parse import Formula as FixestFormula
 from pyfixest.estimation.literals import DemeanerBackendOptions
 from pyfixest.estimation.solvers import solve_ols
 from pyfixest.utils.dev_utils import DataFrameType
@@ -37,7 +37,7 @@ def __init__(
         collin_tol: float,
         fixef_tol: float,
         fixef_maxiter: int,
-        lookup_demeaned_data: dict[str, pd.DataFrame],
+        lookup_demeaned_data: dict[frozenset[int], pd.DataFrame],
         tol: float,
         maxiter: int,
         solver: Literal[