From e9dc8f436c50c37b4a7b8adaa08257bded8ea900 Mon Sep 17 00:00:00 2001 From: Brad Hackinen Date: Mon, 3 Feb 2025 16:03:03 -0800 Subject: [PATCH 1/5] Added offset option for fepois --- pyfixest/estimation/FixestMulti_.py | 12 ++++++++++++ .../estimation/deprecated/model_matrix_fixest_.py | 11 +++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 1522ceea8..933c8ffd6 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -165,6 +165,7 @@ def _prepare_estimation( quantile: float | None = None, quantile_tol: float = 1e-06, quantile_maxiter: int | None = None, + offset: Optional[Union[None, str]] = None, ) -> None: """ Prepare model for estimation. @@ -189,6 +190,9 @@ def _prepare_estimation( weights : Union[None, np.ndarray], optional An array of weights. Either None or a 1D array of length N. Default is None. + offset : Union[None, str], optional + Default is None. Offset variable for Poisson regression. If None, no offset. + If a string, the name of the column in `data` that contains the offset. ssc : dict[str, str], optional A dictionary specifying the type of standard errors to use for inference. See `feols()` or `fepois()`. @@ -220,6 +224,7 @@ def _prepare_estimation( self._drop_singletons = False self._is_multiple_estimation = False self._weights = weights + self._offset = offset self._has_weights = False if weights is not None: self._has_weights = True @@ -380,6 +385,13 @@ def _estimate_all_models( } ) + if self._method == "fepois": + model_kwargs.update( + { + "offset": self._offset, + } + ) + if self._method in { "feglm-logit", "feglm-probit", diff --git a/pyfixest/estimation/deprecated/model_matrix_fixest_.py b/pyfixest/estimation/deprecated/model_matrix_fixest_.py index aef0a387b..1c58f936b 100644 --- a/pyfixest/estimation/deprecated/model_matrix_fixest_.py +++ b/pyfixest/estimation/deprecated/model_matrix_fixest_.py @@ -17,6 +17,7 @@ def model_matrix_fixest( data: pd.DataFrame, drop_singletons: bool = False, weights: str | None = None, + offset: str | None = None, drop_intercept=False, context: int | Mapping[str, Any] = 0, ) -> dict: @@ -143,6 +144,7 @@ def model_matrix_fixest( **({"fml_first_stage": fml_first_stage} if _is_iv else {}), **({"fe": wrap_factorize(fval)} if fval is not None else {}), **({"weights": weights} if weights is not None else {}), + **({"offset": offset} if offset is not None else {}), } FML = Formula(**fml_kwargs) @@ -150,7 +152,7 @@ def model_matrix_fixest( mm = FML.get_model_matrix( data, output="pandas", context={"factorize": factorize, **_context} ) - endogvar = Z = weights_df = fe = None + endogvar = Z = weights_df = offset_df = fe = None model_spec = mm.model_spec @@ -164,6 +166,8 @@ def model_matrix_fixest( fe = mm["fe"] if weights is not None: weights_df = mm["weights"] + if offset is not None: + offset_df = mm["offset"] # drop infinite values inf_idx_list = [] @@ -192,7 +196,7 @@ def model_matrix_fixest( weights_df=weights_df, ) - for df in [Y, X, Z, endogvar, weights_df]: + for df in [Y, X, Z, endogvar, weights_df, offset_df]: if df is not None: cols_to_convert = df.select_dtypes(exclude=["int64", "float64"]).columns if cols_to_convert.size > 0: @@ -245,6 +249,8 @@ def model_matrix_fixest( endogvar=endogvar, weights_df=weights_df, ) + if offset is not None: + offset_df = offset_df[keep_idx] na_index = _get_na_index(data.shape[0], Y.index) na_index_str = ",".join(str(x) for x in na_index) @@ -262,6 +268,7 @@ def model_matrix_fixest( "endogvar": endogvar, "Z": Z, "weights_df": weights_df, + "offset_df": offset_df, "na_index": na_index, "na_index_str": na_index_str, "icovars": _icovars, From 91020adb6a720c72f9bc703006726bfacceaab88 Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Tue, 4 Feb 2025 21:36:49 +0100 Subject: [PATCH 2/5] add tests --- tests/test_vs_fixest.py | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py index 57eb79f1f..3f4522b60 100644 --- a/tests/test_vs_fixest.py +++ b/tests/test_vs_fixest.py @@ -520,8 +520,9 @@ def test_single_fit_feols_empty( @pytest.mark.parametrize("k_adj", [True]) @pytest.mark.parametrize("G_adj", [True]) @pytest.mark.parametrize("weights", [None, "weights"]) +@pytest.mark.parametrize("offset", [False, True]) def test_single_fit_fepois( - data_fepois, dropna, inference, f3_type, fml, k_adj, G_adj, weights + data_fepois, dropna, inference, f3_type, fml, k_adj, G_adj, weights, offset ): global test_counter_fepois test_counter_fepois += 1 @@ -531,6 +532,13 @@ def test_single_fit_fepois( ssc_ = ssc(k_adj=k_adj, G_adj=G_adj) + data = data_fepois + if offset: + data["offset_var"] = np.ones(data.shape[0]) * 5 + offset_var = "offset_var" + else: + offset_var = None + data_fepois = data_fepois.copy() if dropna: data_fepois.dropna(inplace=True) @@ -552,27 +560,22 @@ def test_single_fit_fepois( iwls_tol=1e-10, iwls_maxiter=100, weights=weights, + offset=offset_var if offset else None, ) + r_kwargs = { + "vcov": r_inference, + "data": data_r, + "ssc": fixest.ssc(k_adj, "nonnested", False, G_adj, "min", "min"), + "glm_tol": 1e-10, + "glm_maxiter": 100, + } if weights is not None: - r_fixest = fixest.fepois( - ro.Formula(r_fml), - vcov=r_inference, - data=data_r, - ssc=fixest.ssc(k_adj, "nonnested", False, G_adj, "min", "min"), - glm_tol=1e-10, - glm_maxiter=100, - weights=ro.Formula("~" + weights), - ) - else: - r_fixest = fixest.fepois( - ro.Formula(r_fml), - vcov=r_inference, - data=data_r, - ssc=fixest.ssc(k_adj, "nonnested", False, G_adj, "min", "min"), - glm_tol=1e-10, - glm_maxiter=100, - ) + r_kwargs["weights"] = ro.Formula("~" + weights) + if offset: + r_kwargs["offset"] = ro.Formula("~" + offset_var) + + r_fixest = fixest.fepois(ro.Formula(r_fml), **r_kwargs) py_coef = mod.coef().xs("X1") py_se = mod.se().xs("X1") From 6981c8210aa21a05cf048fa381bc2b5e22312eda Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Tue, 24 Mar 2026 23:15:20 +0100 Subject: [PATCH 3/5] bring back offsets --- pyfixest/estimation/FixestMulti_.py | 2 +- pyfixest/estimation/api/fepois.py | 9 ++++++++ .../deprecated/model_matrix_fixest_.py | 2 +- pyfixest/estimation/models/fepois_.py | 23 ++++++++++++++++--- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py index 933c8ffd6..55459a4ed 100644 --- a/pyfixest/estimation/FixestMulti_.py +++ b/pyfixest/estimation/FixestMulti_.py @@ -165,7 +165,7 @@ def _prepare_estimation( quantile: float | None = None, quantile_tol: float = 1e-06, quantile_maxiter: int | None = None, - offset: Optional[Union[None, str]] = None, + offset: str | None = None, ) -> None: """ Prepare model for estimation. diff --git a/pyfixest/estimation/api/fepois.py b/pyfixest/estimation/api/fepois.py index f005376d5..f78040f3f 100644 --- a/pyfixest/estimation/api/fepois.py +++ b/pyfixest/estimation/api/fepois.py @@ -24,6 +24,7 @@ def fepois( vcov_kwargs: dict[str, str | int] | None = None, weights: None | str = None, weights_type: WeightsTypeOptions = "aweights", + offset: str | None = None, ssc: dict[str, str | bool] | None = None, fixef_rm: FixedRmOptions = "singleton", fixef_tol: float = 1e-06, @@ -88,6 +89,13 @@ def fepois( are useful for compressed count data where identical observations are aggregated. For details see this blog post: https://notstatschat.rbind.io/2020/08/04/weights-in-statistics/. + offset : str, optional + Default is None. The name of a column in `data` to use as an offset in the + Poisson regression. An offset is added to the linear predictor, which is + equivalent to constraining its coefficient to 1. This is useful for modeling + rates when the exposure variable differs across observations (e.g. + `offset = "log_population"`). + ssc : str A ssc object specifying the small sample correction for inference. @@ -257,6 +265,7 @@ def fepois( ssc=ssc, fixef_rm=fixef_rm, drop_intercept=drop_intercept, + offset=offset, ) if fixest._is_iv: raise NotImplementedError( diff --git a/pyfixest/estimation/deprecated/model_matrix_fixest_.py b/pyfixest/estimation/deprecated/model_matrix_fixest_.py index 1c58f936b..cd46c298a 100644 --- a/pyfixest/estimation/deprecated/model_matrix_fixest_.py +++ b/pyfixest/estimation/deprecated/model_matrix_fixest_.py @@ -249,7 +249,7 @@ def model_matrix_fixest( endogvar=endogvar, weights_df=weights_df, ) - if offset is not None: + if offset_df is not None: offset_df = offset_df[keep_idx] na_index = _get_na_index(data.shape[0], Y.index) diff --git a/pyfixest/estimation/models/fepois_.py b/pyfixest/estimation/models/fepois_.py index be6f1545d..de5ee9361 100644 --- a/pyfixest/estimation/models/fepois_.py +++ b/pyfixest/estimation/models/fepois_.py @@ -106,6 +106,7 @@ def __init__( sample_split_var: str | None = None, sample_split_value: str | int | None = None, separation_check: list[str] | None = None, + offset: str | None = None, ): super().__init__( FixestFormula=FixestFormula, @@ -141,6 +142,7 @@ def __init__( self._method = "fepois" self.convergence = False self.separation_check = separation_check + self._offset_name = offset self._support_crv3_inference = True self._support_iid_inference = True @@ -155,6 +157,20 @@ def prepare_model_matrix(self): "Prepare model inputs for estimation." super().prepare_model_matrix() + # Extract offset from data or default to zeros + if self._offset_name is not None: + if self._offset_name not in self._data.columns: + raise ValueError( + f"Offset variable '{self._offset_name}' not found in data." + ) + self._offset = ( + self._data.loc[self._Y.index, self._offset_name] + .to_numpy() + .reshape((-1, 1)) + ) + else: + self._offset = np.zeros((self._N, 1)) + # check that self._Y is a pandas Series or DataFrame self._Y = _check_series_or_dataframe(self._Y) @@ -185,6 +201,7 @@ def prepare_model_matrix(self): self._data.drop(na_separation, axis=0, inplace=True) if self._weights_df is not None: self._weights_df.drop(na_separation, axis=0, inplace=True) + self._offset = np.delete(self._offset, na_separation, axis=0) self._N = self._Y.shape[0] self._N_rows = self._N # Re-set weights after dropping rows (handles both weighted and unweighted) @@ -288,12 +305,12 @@ def get_fit(self) -> None: _mean = np.mean(self._Y) mu = (self._Y + _mean) / 2 eta = np.log(mu) - Z = eta + self._Y / mu - 1 + Z = eta - self._offset + self._Y / mu - 1 reg_Z = Z.copy() last = self._compute_deviance(self._Y, mu) else: # update w and Z - Z = eta + self._Y / mu - 1 # eq (8) + Z = eta - self._offset + self._Y / mu - 1 # eq (8) reg_Z = Z.copy() # eq (9) # tighten HDFE tolerance - currently not possible with PyHDFE @@ -349,7 +366,7 @@ def get_fit(self) -> None: resid = Z_resid - X_resid @ delta_new # more updating - eta = Z - resid + eta = Z - resid + self._offset mu = np.exp(eta) # same criterion as fixest From 3b0953d8b482ae01408c6f695e1072f3fcf2a603 Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Tue, 24 Mar 2026 23:35:36 +0100 Subject: [PATCH 4/5] adjust tests --- tests/test_vs_fixest.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py index 3f4522b60..791ba57ae 100644 --- a/tests/test_vs_fixest.py +++ b/tests/test_vs_fixest.py @@ -532,14 +532,12 @@ def test_single_fit_fepois( ssc_ = ssc(k_adj=k_adj, G_adj=G_adj) - data = data_fepois + data_fepois = data_fepois.copy() if offset: - data["offset_var"] = np.ones(data.shape[0]) * 5 + data_fepois["offset_var"] = np.log(np.ones(data_fepois.shape[0]) * 2) offset_var = "offset_var" else: offset_var = None - - data_fepois = data_fepois.copy() if dropna: data_fepois.dropna(inplace=True) # long story, but categories need to be strings to be converted to R factors, @@ -641,8 +639,9 @@ def test_single_fit_fepois( py_tstat, r_tstat, 1e-06 if weights is None else 1e-05, "py_tstat != r_tstat" ) check_absolute_diff(py_confint, r_confint, 1e-06, "py_confint != r_confint") - check_absolute_diff(py_deviance, r_deviance, 1e-08, "py_deviance != r_deviance") - check_absolute_diff(py_loglik, r_loglik, 1e-08, "py_ll != r_loglik") + _dev_tol = 1e-07 if offset else 1e-08 + check_absolute_diff(py_deviance, r_deviance, _dev_tol, "py_deviance != r_deviance") + check_absolute_diff(py_loglik, r_loglik, _dev_tol, "py_ll != r_loglik") # cant match fixest yet if weights is None: From 85a8cef99011caea01eff3007b839d8e1058019e Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Wed, 25 Mar 2026 00:01:12 +0100 Subject: [PATCH 5/5] move offset creation after separation check to fix test error --- pyfixest/estimation/models/fepois_.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/pyfixest/estimation/models/fepois_.py b/pyfixest/estimation/models/fepois_.py index de5ee9361..7c9ca3cc4 100644 --- a/pyfixest/estimation/models/fepois_.py +++ b/pyfixest/estimation/models/fepois_.py @@ -157,20 +157,6 @@ def prepare_model_matrix(self): "Prepare model inputs for estimation." super().prepare_model_matrix() - # Extract offset from data or default to zeros - if self._offset_name is not None: - if self._offset_name not in self._data.columns: - raise ValueError( - f"Offset variable '{self._offset_name}' not found in data." - ) - self._offset = ( - self._data.loc[self._Y.index, self._offset_name] - .to_numpy() - .reshape((-1, 1)) - ) - else: - self._offset = np.zeros((self._N, 1)) - # check that self._Y is a pandas Series or DataFrame self._Y = _check_series_or_dataframe(self._Y) @@ -201,7 +187,6 @@ def prepare_model_matrix(self): self._data.drop(na_separation, axis=0, inplace=True) if self._weights_df is not None: self._weights_df.drop(na_separation, axis=0, inplace=True) - self._offset = np.delete(self._offset, na_separation, axis=0) self._N = self._Y.shape[0] self._N_rows = self._N # Re-set weights after dropping rows (handles both weighted and unweighted) @@ -213,6 +198,16 @@ def prepare_model_matrix(self): self._k_fe = self._fe.nunique(axis=0) if self._has_fixef else None self._n_fe = np.sum(self._k_fe > 1) if self._has_fixef else 0 + # Extract offset after all drops (singleton + separation) so indices are aligned + if self._offset_name is not None: + if self._offset_name not in self._data.columns: + raise ValueError( + f"Offset variable '{self._offset_name}' not found in data." + ) + self._offset = self._data[self._offset_name].to_numpy().reshape((-1, 1)) + else: + self._offset = np.zeros((self._N, 1)) + def to_array(self): "Turn estimation DataFrames to np arrays." self._Y, self._X, self._Z = (