diff --git a/docs/changelog.qmd b/docs/changelog.qmd index 4375724ff..a37c98200 100644 --- a/docs/changelog.qmd +++ b/docs/changelog.qmd @@ -15,7 +15,45 @@ fit3 = pf.feols("Y ~ X1 + X2 | f1", data = df) - Adds the following statistics to the `Fepois` class: `_loglik`, `_loglik_null`, `_pseudo_r2` for regression without weights. - Set the default parameters for the MAP algorithm to a tolerance of `1e-06` and maximum number of iterations of `10_000`. -- Adds support for weights for poisson regression via `feopis()`. +- Adds support for weights for Poisson regression via `fepois()`, including frequency weights (`fweights`). +- Fixes a bug in heteroskedastic-robust standard errors (HC1/HC2/HC3) when using frequency weights with both `feols()` and `fepois()`. Previously, the meat matrix was incorrectly scaled by `w²` instead of `w`. See [issue #367](https://github.com/py-econometrics/pyfixest/issues/367). + +### Frequency Weights Example + +Frequency weights allow you to work with aggregated data where each row represents multiple identical observations: + +```{python} +import pyfixest as pf + +data = pf.get_data(model="Fepois") + +# aggregate data by (Y, X1) and count occurrences +data_agg = ( + data[["Y", "X1"]] + .groupby(["Y", "X1"]) + .size() + .reset_index() + .rename(columns={0: "count"}) +) + +# fit poisson without on long data +fit = pf.fepois( + "Y ~ X1", + data=data, + vcov="hetero" +) + +# fit poisson on compressed data +fit_wls = pf.fepois( + "Y ~ X1", + data=data_agg, + weights="count", + weights_type="fweights", + vcov="hetero" +) + +pf.etable([fit, fit_wls]) +``` ## PyFixest 0.40.1 diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py index c172d879b..e24abfc70 100644 --- a/pyfixest/estimation/feols_.py +++ b/pyfixest/estimation/feols_.py @@ -824,12 +824,22 @@ def _vcov_hetero(self): transformed_scores = self._scores elif self._vcov_type_detail in ["HC2", "HC3"]: leverage = np.sum(self._X * (self._X @ np.linalg.inv(self._tZX)), axis=1) + if self._weights_type == "fweights": + leverage = leverage / self._weights.flatten() transformed_scores = ( self._scores / np.sqrt(1 - leverage)[:, None] if self._vcov_type_detail == "HC2" else self._scores / (1 - leverage)[:, None] ) + # For fweights, the scores are X_w * u_w = (sqrt(w)*x) * (sqrt(w)*u) = w*x*u. + # The meat matrix should be sum_i(w_i * u_i^2 * x_i @ x_i'), not + # sum_i(w_i^2 * u_i^2 * x_i @ x_i'). We correct by dividing by sqrt(w). + # For Poisson, use _user_weights (original weights before IRLS adjustment). + if self._weights_type == "fweights": + fweights = getattr(self, "_user_weights", self._weights) + transformed_scores = transformed_scores / np.sqrt(fweights) + Omega = transformed_scores.T @ transformed_scores meat = ( diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py index 631b785a7..4ad254238 100644 --- a/pyfixest/estimation/fepois_.py +++ b/pyfixest/estimation/fepois_.py @@ -202,7 +202,7 @@ def to_array(self): if self._fe is not None: self._fe = self._fe.to_numpy() if self._fe.ndim == 1: - self._fe = self._fe.reshape((self._N, 1)) + self._fe = self._fe.reshape((-1, 1)) def _compute_deviance( self, Y: np.ndarray, mu: np.ndarray, weights: Optional[np.ndarray] = None @@ -313,7 +313,7 @@ def get_fit(self) -> None: else: ZX_resid = ZX - Z_resid = ZX_resid[:, 0].reshape((self._N, 1)) # z_resid + Z_resid = ZX_resid[:, 0].reshape((-1, 1)) # z_resid X_resid = ZX_resid[:, 1:] # x_resid WX = np.sqrt(combined_weights) * X_resid @@ -346,7 +346,9 @@ def get_fit(self) -> None: # needed for the calculation of the vcov # update for inference + # Save original user weights for fweights correction in _vcov_hetero user_weights = self._weights.flatten() + self._user_weights = self._weights.copy() self._irls_weights = combined_weights.flatten() self._u_hat = (WZ - WX @ delta_new).flatten() diff --git a/tests/test_wls_types.py b/tests/test_wls_types.py index 9887d18dc..7174025f5 100644 --- a/tests/test_wls_types.py +++ b/tests/test_wls_types.py @@ -4,55 +4,85 @@ import pyfixest as pf -# @pytest.mark.skip(reason="Bug for fweights and heteroskedastic errors.") -def test_fweights_ols(): - "Test that the fweights are correctly implemented for OLS models." - # Fepois model for discrete Y - data = pf.get_data(model="Fepois") - data2_w = ( - data[["Y", "X1"]] - .groupby(["Y", "X1"]) - .size() - .reset_index() - .rename(columns={0: "count"}) - ) - data3_w = ( - data[["Y", "X1", "f1"]] - .groupby(["Y", "X1", "f1"]) - .size() - .reset_index() - .rename(columns={0: "count"}) - ) - - fit1 = pf.feols("Y ~ X1", data=data, ssc=pf.ssc(k_adj=False, G_adj=False)) - fit2 = pf.feols( - "Y ~ X1", - data=data2_w, - weights="count", - weights_type="fweights", - ssc=pf.ssc(k_adj=False, G_adj=False), - ) +def _assert_fit_equal(fit1, fit2, vcov_types, rtol=1e-5): + "Assert that two regression fits have identical coefficients, vcov, and SEs." assert fit1._N == fit2._N, "Number of observations is not the same." - if False: - np.testing.assert_allclose(fit1.tidy().values, fit2.tidy().values) + np.testing.assert_allclose( + fit1.coef().values, fit2.coef().values, rtol=rtol, err_msg="Coefficients differ" + ) - np.testing.assert_allclose(fit1.vcov("HC1")._vcov, fit2.vcov("HC1")._vcov) - np.testing.assert_allclose(fit1.vcov("HC2")._vcov, fit2.vcov("HC2")._vcov) - np.testing.assert_allclose(fit1.vcov("HC3")._vcov, fit2.vcov("HC3")._vcov) + for vcov_type in vcov_types: + fit1_vcov = fit1.vcov(vcov_type) + fit2_vcov = fit2.vcov(vcov_type) - fit3 = pf.feols("Y ~ X1 | f1", data=data) - fit4 = pf.feols( - "Y ~ X1 | f1", data=data3_w, weights="count", weights_type="fweights" + np.testing.assert_allclose( + fit1_vcov._vcov, + fit2_vcov._vcov, + rtol=rtol, + err_msg=f"Vcov differs for {vcov_type}", ) - np.testing.assert_allclose(fit3.tidy().values, fit4.tidy().values) np.testing.assert_allclose( - fit3.vcov({"CRV3": "f1"})._vcov, fit4.vcov({"CRV3": "f1"})._vcov + fit1_vcov.se().values, + fit2_vcov.se().values, + rtol=rtol, + err_msg=f"SEs differ for {vcov_type}", ) - np.testing.assert_allclose(fit1.vcov("HC1")._vcov, fit2.vcov("HC1")._vcov) - np.testing.assert_allclose(fit1.vcov("HC2")._vcov, fit2.vcov("HC2")._vcov) - np.testing.assert_allclose(fit1.vcov("HC3")._vcov, fit2.vcov("HC3")._vcov) + + +@pytest.mark.parametrize( + "model,fml,cols,vcov_types,rtol", + [ + # OLS without fixed effects - test hetero vcov types + ("ols", "Y ~ X1", ["Y", "X1"], ["iid", "HC1", "HC2", "HC3"], 1e-5), + # OLS without fixed effects - test CRV + ("ols", "Y ~ X1", ["Y", "X1", "f1"], [{"CRV1": "f1"}, {"CRV3": "f1"}], 1e-5), + # OLS with fixed effects - HC2/HC3 not supported + ( + "ols", + "Y ~ X1 | f1", + ["Y", "X1", "f1"], + ["iid", "HC1", {"CRV1": "f1"}, {"CRV3": "f1"}], + 1e-5, + ), + # Poisson without fixed effects + ("poisson", "Y ~ X1", ["Y", "X1"], ["iid", "hetero"], 1e-4), + # Poisson without fixed effects - test CRV + ("poisson", "Y ~ X1", ["Y", "X1", "f1"], [{"CRV1": "f1"}, {"CRV3": "f1"}], 1e-4), + # Poisson with fixed effects + ("poisson", "Y ~ X1 | f1", ["Y", "X1", "f1"], ["iid", "hetero"], 1e-4), + # Poisson with fixed effects - test CRV + ( + "poisson", + "Y ~ X1 | f1", + ["Y", "X1", "f1"], + [{"CRV1": "f1"}, {"CRV3": "f1"}], + 1e-4, + ), + ], +) +def test_fweights(model, fml, cols, vcov_types, rtol): + """Test that fweights are correctly implemented for OLS and Poisson models.""" + data = pf.get_data(model="Fepois") + data = data.dropna(subset=cols) + + data_agg = ( + data[cols].groupby(cols).size().reset_index().rename(columns={0: "count"}) + ) + + fit_func = pf.feols if model == "ols" else pf.fepois + + fit_raw = fit_func(fml, data=data, vcov="iid") + fit_agg = fit_func( + fml, + data=data_agg, + weights="count", + weights_type="fweights", + vcov="iid", + ) + + _assert_fit_equal(fit_raw, fit_agg, vcov_types, rtol=rtol) @pytest.mark.skip(reason="Not implemented yet.")