From d5f2b9e55a6ef165a48ccab861c47f1bd66b5136 Mon Sep 17 00:00:00 2001 From: Dirk Sliwka <49401450+dsliwka@users.noreply.github.com> Date: Mon, 18 Aug 2025 10:24:48 +0200 Subject: [PATCH 1/4] Allow that user can specify the model statistics to be displayed in the table and their order. --- docs/table-layout.qmd | 14 ++- pyfixest/report/summarize.py | 230 +++++++++++++++++++++-------------- 2 files changed, 153 insertions(+), 91 deletions(-) diff --git a/docs/table-layout.qmd b/docs/table-layout.qmd index 31186d29c..72d3e9c9a 100644 --- a/docs/table-layout.qmd +++ b/docs/table-layout.qmd @@ -71,11 +71,19 @@ We can also easily **drop** variables via the `drop` argument: pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], drop=["X1"]) ``` -## Hide fixed effects or SE-type rows -We can hide the rows showing the relevant fixed effects and those showing the S.E. type by setting `show_fe=False` and `show_se_type=False` (for instance when the set of fixed effects or the estimation method for the std. errors is the same for all models and you want to describe this in the text or table notes rather than displaying it in the table). +## Specify displayed model statistics +The user can specify the model statistics to be displayed and their order by passing a list of strings to `model_stats`. Names of the statistics must match the model's respective attribute names such as "r2", "adj_r2", "N", "r2_within" (see the respective function reference for attributes and omit the leading "_"). The type of standard error estimated can be shown by adding "se_type" to the list. ```{python} -pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], show_fe=False, show_se_type=False) +pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], model_stats=['N','r2','se_type']) +``` + + +## Hide fixed effects +We can hide the rows showing the relevant fixed effects by setting `show_fe=False` (for instance when the set of fixed effects is the same for all models and you want to describe this in the text or table notes rather than displaying it in the table). + +```{python} +pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], model_stats=['r2','N'],show_fe=False) ``` diff --git a/pyfixest/report/summarize.py b/pyfixest/report/summarize.py index 2575a9284..53b9a3e61 100644 --- a/pyfixest/report/summarize.py +++ b/pyfixest/report/summarize.py @@ -1,5 +1,7 @@ import re import warnings +import math + from collections import Counter from collections.abc import ValuesView from typing import Optional, Union @@ -26,6 +28,8 @@ def etable( type: str = "gt", signif_code: Optional[list] = None, coef_fmt: str = "b \n (se)", + model_stats: Optional[list[str]] = None, + model_stats_labels: Optional[dict[str, str]] = None, custom_stats: Optional[dict] = None, custom_model_stats: Optional[dict] = None, keep: Optional[Union[list, str]] = None, @@ -34,7 +38,7 @@ def etable( labels: Optional[dict] = None, cat_template: Optional[str] = None, show_fe: Optional[bool] = True, - show_se_type: Optional[bool] = True, + show_se_type: Optional[bool] = True, # legacy (ignored when model_stats provided) felabels: Optional[dict] = None, notes: str = "", model_heads: Optional[list] = None, @@ -61,6 +65,10 @@ def etable( The format of the coefficient (b), standard error (se), t-stats (t), and p-value (p). Default is `"b \n (se)"`. Spaces ` `, parentheses `()`, brackets `[]`, newlines `\n` are supported. + model_stats: Optional[list[str]] = None, + A list of model statistics to include in the table which will be displayed in the determined order. Names must match the model's respective attribute names (without leading "_") such as "r2", "adj_r2", "N", ... + model_stats_labels: Optional[dict[str, str]] = None, + A dictionary mapping model statistic names to display labels. If None, the default names are used. custom_stats: dict, optional A dictionary of custom statistics that can be used in the coef_fmt string to be displayed in the coefficuent cells analogously to "b", "se" etc. The keys are the names of the custom @@ -225,116 +233,157 @@ def etable( "lists in custom_model_stats values must have the same length as models" ) - dep_var_list = [] - nobs_list = [] + # Collect info needed for coefficients & fixed effects + dep_var_list: list[str] = [] fixef_list: list[str] = [] - n_coefs = [] - se_type_list = [] - r2_list = [] - adj_r2_list = [] - r2_within_list = [] - # Define code for R2, interaction & line break depending on output type + # Output-type dependent symbols if type in ["gt", "html"]: interactionSymbol = " × " - R2code = "R2" - adj_R2_code = "Adj. R2" - R2_within_code = "R2 Within" lbcode = "
" elif type == "tex": interactionSymbol = " $\\times$ " - R2code = "$R^2$" - adj_R2_code = "Adj. $R^2$" - R2_within_code = "$R^2$ Within" lbcode = r"\\" else: interactionSymbol = " x " - R2code = "R2" - adj_R2_code = "Adj. R2" - R2_within_code = "R2 Within" lbcode = "\n" + # Pre-scan models (only once) for model in models: dep_var_list.append(model._depvar) - n_coefs.append(len(model._coefnames)) - - _nobs_kwargs = kwargs.copy() - _nobs_kwargs["integer"] = True - _nobs_kwargs["scientific_notation"] = False - nobs_list.append(_number_formatter(model._N, **_nobs_kwargs)) - - if not np.isnan(model._r2): - r2_list.append(_number_formatter(model._r2, **kwargs)) - else: - r2_list.append("-") - - if not np.isnan(model._adj_r2): - adj_r2_list.append(_number_formatter(model._adj_r2, **kwargs)) - else: - adj_r2_list.append("-") - - if not np.isnan(model._r2_within): - r2_within_list.append(_number_formatter(model._r2_within, **kwargs)) - else: - r2_within_list.append("-") - - if model._vcov_type == "CRV": - se_type_list.append("by: " + "+".join(model._clustervar)) - else: - se_type_list.append(model._vcov_type) - if model._fixef is not None and model._fixef != "0": fixef_list += model._fixef.split("+") - # find all fixef variables when the user does not want to hide the FE rows + # Fixed effects set if show_fe: - # drop "" from fixef_list fixef_list = [x for x in fixef_list if x] - # keep only unique values fixef_list = list(set(fixef_list)) n_fixef = len(fixef_list) else: fixef_list = [] n_fixef = 0 - # First create a dataframe for the model stats such as R2, nobs, etc. - model_stats_df = pd.DataFrame() - if custom_model_stats is not None: - for stat, values in custom_model_stats.items(): - model_stats_df[stat] = values - model_stats_df["Observations"] = nobs_list - if show_se_type: - model_stats_df["S.E. type"] = se_type_list - model_stats_df[R2code] = r2_list - n_model_stats = model_stats_df.shape[1] - if any(x != "-" for x in r2_within_list): - model_stats_df[R2_within_code] = r2_within_list + # Determine default model stats (legacy emulation) if user did not provide any + if model_stats is None: + any_within = any( + hasattr(m, "_r2_within") and not math.isnan(getattr(m, "_r2_within", float("nan"))) + for m in models + ) + # Legacy order + model_stats = ["N"] + if show_se_type: + model_stats.append("se_type") + model_stats += ["r2", "r2_within" if any_within else "adj_r2"] + + assert isinstance(model_stats, (list, tuple)), "model_stats must be list-like" + model_stats = list(model_stats) + assert all(isinstance(s, str) for s in model_stats), "model_stats entries must be strings" + # Preserve order but drop duplicates + seen = set() + model_stats = [s for s in model_stats if not (s in seen or seen.add(s))] + + # Default labels by output type + def _default_label(stat: str) -> str: + if type in ("gt", "html"): + mapping = { + "N": "Observations", + "se_type": "S.E. type", + "r2": "R2", + "adj_r2": "Adj. R2", + "r2_within": "R2 Within", + } + elif type == "tex": + mapping = { + "N": "Observations", + "se_type": "S.E. type", + "r2": "$R^2$", + "adj_r2": "Adj. $R^2$", + "r2_within": "$R^2$ Within", + } + else: + mapping = { + "N": "Observations", + "se_type": "S.E. type", + "r2": "R2", + "adj_r2": "Adj. R2", + "r2_within": "R2 Within", + } + return mapping.get(stat, stat) + + # Value extractors / formatters + def _extract(model, key: str): + if key == "se_type": + if getattr(model, "_vcov_type", "") == "CRV": + return "by: " + "+".join(getattr(model, "_clustervar", [])) + return getattr(model, "_vcov_type", None) + attr_name = f"_{key}" + return getattr(model, attr_name, None) + + def _format_value(val, key: str): + if val is None: + return "-" + if isinstance(val, (int, np.integer)): + return _number_formatter(int(val), integer=True, **kwargs) + if isinstance(val, (float, np.floating)): + if math.isnan(val): + return "-" + # N treated as integer + if key.lower() in ("n", "nobs"): + return _number_formatter(val, integer=True, **kwargs) + return _number_formatter(val, **kwargs) + return str(val) + + model_stats_rows: dict[str, list[str]] = {} + for stat in model_stats: + values = [_format_value(_extract(m, stat), stat) for m in models] + label = _default_label(stat) + if model_stats_labels and stat in model_stats_labels: + label = model_stats_labels[stat] + model_stats_rows[label] = values + + # Build custom model stats first (if any) + if custom_model_stats is not None and len(custom_model_stats) > 0: + # Values already validated for correct length earlier + custom_df = pd.DataFrame.from_dict(custom_model_stats, orient="index") + else: + custom_df = pd.DataFrame() + + # Builtin / attribute stats + if model_stats_rows: + builtin_df = pd.DataFrame.from_dict(model_stats_rows, orient="index") else: - model_stats_df[adj_R2_code] = adj_r2_list - # Transpose - model_stats_df = model_stats_df.T - - # Create a dataframe for the Fixed Effects markers - fe_df = pd.DataFrame() - # when at least one model has a fixed effect & the user wants to show them - if fixef_list: + builtin_df = pd.DataFrame() + + # Combine (custom first) + if not custom_df.empty and not builtin_df.empty: + model_stats_df = pd.concat([custom_df, builtin_df], axis=0) + elif not custom_df.empty: + model_stats_df = custom_df + else: + model_stats_df = builtin_df + + # Ensure index name consistency + if model_stats_df.index.name is None: + model_stats_df.index.name = None + + n_model_stats = model_stats_df.shape[0] + + # Create a dataframe for the Fixed Effects markers (fixed implementation) + if show_fe and fixef_list: + fe_rows = {} for fixef in fixef_list: - # check if not empty string - if fixef: - for i, model in enumerate(models): - if ( - model._fixef is not None - and fixef in model._fixef.split("+") - and not model._use_mundlak - ): - fe_df.loc[i, fixef] = "x" - else: - fe_df.loc[i, fixef] = "-" - # Sort by model - fe_df.sort_index(inplace=True) - # Transpose - fe_df = fe_df.T + row = [] + for model in models: + has = ( + model._fixef is not None + and fixef in model._fixef.split("+") + and not model._use_mundlak + ) + row.append("x" if has else "-") + fe_rows[fixef] = row + fe_df = pd.DataFrame.from_dict(fe_rows, orient="index") else: + fe_df = pd.DataFrame() show_fe = False # Finally, collect & format estimated coefficients and standard errors etc. @@ -446,15 +495,20 @@ def etable( felabels = dict() if labels is None: labels = dict() - # When the user provides a dictionary for fixed effects, then use it - # When a corresponsing variable is not in the felabel dictionary, then use the labels dictionary - # When in neither then just use the original variable name fe_index = fe_df.index.to_series() fe_index = fe_index.apply(lambda x: felabels.get(x, labels.get(x, x))) fe_df.set_index(fe_index, inplace=True) - model_stats_df.columns = res.columns - if show_fe: + # Ensure model_stats_df columns align after coefficient construction: + # Allow user to pass model_stats = [] (no model stats displayed). + # In that case model_stats_df is (0, 0) and assigning columns would raise a length mismatch. + if model_stats_df.shape[1] == 0: + # Create an empty frame with the correct columns so later concatenation works. + model_stats_df = pd.DataFrame(index=pd.Index([], name=res.index.name), columns=res.columns) + else: + model_stats_df.columns = res.columns + # Also align fixed effects dataframe columns + if show_fe and not fe_df.empty: fe_df.columns = res.columns depvars = pd.DataFrame({"depvar": dep_var_list}).T From 2e257f2f292e9cd096bcc5a5cf53664800888ccb Mon Sep 17 00:00:00 2001 From: Dirk Sliwka <49401450+dsliwka@users.noreply.github.com> Date: Mon, 18 Aug 2025 11:06:44 +0200 Subject: [PATCH 2/4] Updates + add tests --- docs/table-layout.qmd | 10 ++++++++-- pyfixest/report/summarize.py | 3 +-- tests/test_summarise.py | 12 +++++++++++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/docs/table-layout.qmd b/docs/table-layout.qmd index 72d3e9c9a..f3c9466a0 100644 --- a/docs/table-layout.qmd +++ b/docs/table-layout.qmd @@ -72,18 +72,24 @@ pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], drop=["X1"]) ``` ## Specify displayed model statistics -The user can specify the model statistics to be displayed and their order by passing a list of strings to `model_stats`. Names of the statistics must match the model's respective attribute names such as "r2", "adj_r2", "N", "r2_within" (see the respective function reference for attributes and omit the leading "_"). The type of standard error estimated can be shown by adding "se_type" to the list. +The user can specify the model statistics to be displayed and their order by passing a list of strings to `model_stats`. Names of the statistics must match the model's respective attribute names such as "r2", "adj_r2", "N", "r2_within" (see the respective function reference for attributes and omit the leading "_"). The type of standard error estimated can be shown by adding "se_type" to the list. When passing an empty list no model statistics are displayed. ```{python} pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], model_stats=['N','r2','se_type']) ``` +Model statistics can also be relabeled by passing a dictionary `model_stats_labels`. + +```{python} +pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], model_stats=['N','r2'], model_stats_labels={"N": "Number of firms", "r2": "R squared"}) +``` + ## Hide fixed effects We can hide the rows showing the relevant fixed effects by setting `show_fe=False` (for instance when the set of fixed effects is the same for all models and you want to describe this in the text or table notes rather than displaying it in the table). ```{python} -pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], model_stats=['r2','N'],show_fe=False) +pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], model_stats=['r2','N'], show_fe=False) ``` diff --git a/pyfixest/report/summarize.py b/pyfixest/report/summarize.py index 53b9a3e61..beb362693 100644 --- a/pyfixest/report/summarize.py +++ b/pyfixest/report/summarize.py @@ -1,7 +1,6 @@ +import math import re import warnings -import math - from collections import Counter from collections.abc import ValuesView from typing import Optional, Union diff --git a/tests/test_summarise.py b/tests/test_summarise.py index 5b06ab987..8149e148b 100644 --- a/tests/test_summarise.py +++ b/tests/test_summarise.py @@ -112,7 +112,6 @@ def test_summary(): ] # API tests for new tex args - etable([fit1, fit2], type="tex") etable([fit1, fit2], type="tex", print_tex=True) @@ -134,6 +133,17 @@ def test_summary(): summary(fit_qreg) etable(fit_qreg) + # Tests for model_stats argument + # Basic explicit stats + etable([fit1, fit3], model_stats=["N", "r2"]) + # Empty list (no built-in stats) + etable([fit1, fit3], model_stats=[]) + # With custom labels + etable( + [fit1, fit3], + model_stats=["N", "r2"], + model_stats_labels={"N": "Obs.", "r2": "R²"}, + ) @pytest.mark.skip("Pyfixest PR is not yet merged into stargazer.") def test_stargazer(): From e19f5579d12524f9df1f461dc76ff7ef6796294b Mon Sep 17 00:00:00 2001 From: Dirk Sliwka <49401450+dsliwka@users.noreply.github.com> Date: Mon, 18 Aug 2025 16:39:11 +0200 Subject: [PATCH 3/4] Corrections --- pyfixest/report/summarize.py | 64 +++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/pyfixest/report/summarize.py b/pyfixest/report/summarize.py index beb362693..57c54d187 100644 --- a/pyfixest/report/summarize.py +++ b/pyfixest/report/summarize.py @@ -277,9 +277,8 @@ def etable( assert isinstance(model_stats, (list, tuple)), "model_stats must be list-like" model_stats = list(model_stats) assert all(isinstance(s, str) for s in model_stats), "model_stats entries must be strings" - # Preserve order but drop duplicates - seen = set() - model_stats = [s for s in model_stats if not (s in seen or seen.add(s))] + # Assert that there are no duplicates in model_stats + assert len(model_stats) == len(set(model_stats)), "model_stats contains duplicate entries" # Default labels by output type def _default_label(stat: str) -> str: @@ -309,32 +308,10 @@ def _default_label(stat: str) -> str: } return mapping.get(stat, stat) - # Value extractors / formatters - def _extract(model, key: str): - if key == "se_type": - if getattr(model, "_vcov_type", "") == "CRV": - return "by: " + "+".join(getattr(model, "_clustervar", [])) - return getattr(model, "_vcov_type", None) - attr_name = f"_{key}" - return getattr(model, attr_name, None) - - def _format_value(val, key: str): - if val is None: - return "-" - if isinstance(val, (int, np.integer)): - return _number_formatter(int(val), integer=True, **kwargs) - if isinstance(val, (float, np.floating)): - if math.isnan(val): - return "-" - # N treated as integer - if key.lower() in ("n", "nobs"): - return _number_formatter(val, integer=True, **kwargs) - return _number_formatter(val, **kwargs) - return str(val) model_stats_rows: dict[str, list[str]] = {} for stat in model_stats: - values = [_format_value(_extract(m, stat), stat) for m in models] + values = [_extract(m, stat) for m in models] label = _default_label(stat) if model_stats_labels and stat in model_stats_labels: label = model_stats_labels[stat] @@ -891,6 +868,41 @@ def _number_formatter(x: float, **kwargs) -> str: return _int if digits == 0 else f"{_int}.{_float}" +def _extract(model, key: str, **kwargs): + """ + Extract the value of a model statistics from a model. + + Parameters + ---------- + model: Any + The model from which to extract the value. + key: str + The name of the statistic to extract. The method adds _ to the key and calls getattr on the model. + + Returns + ------- + value: Any + The extracted and formatted value. + """ + if key == "se_type": + if getattr(model, "_vcov_type", "") == "CRV": + return "by: " + "+".join(getattr(model, "_clustervar", [])) + return getattr(model, "_vcov_type", None) + attr_name = f"_{key}" + val = getattr(model, attr_name, None) + if val is None: + return "-" + if isinstance(val, (int, np.integer)): + return _number_formatter(float(val), integer=True, **kwargs) + if isinstance(val, (float, np.floating)): + if math.isnan(val): + return "-" + return _number_formatter(float(val), **kwargs) + if isinstance(val, bool): + return str(val) + return str(val) + + def _relabel_index(index, labels=None, stats_labels=None): if stats_labels is None: if isinstance(index, pd.MultiIndex): From 3617f0ff332d60aed743162fb61d178e2b4633da Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Aug 2025 15:54:56 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/table-layout.qmd | 2 +- pyfixest/report/summarize.py | 16 +++++++++++----- tests/test_summarise.py | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/table-layout.qmd b/docs/table-layout.qmd index f3c9466a0..c3f2a0046 100644 --- a/docs/table-layout.qmd +++ b/docs/table-layout.qmd @@ -85,7 +85,7 @@ pf.etable([fit1, fit2, fit3, fit4, fit5, fit6], model_stats=['N','r2'], model_st ``` -## Hide fixed effects +## Hide fixed effects We can hide the rows showing the relevant fixed effects by setting `show_fe=False` (for instance when the set of fixed effects is the same for all models and you want to describe this in the text or table notes rather than displaying it in the table). ```{python} diff --git a/pyfixest/report/summarize.py b/pyfixest/report/summarize.py index 57c54d187..9d3c2f367 100644 --- a/pyfixest/report/summarize.py +++ b/pyfixest/report/summarize.py @@ -265,7 +265,8 @@ def etable( # Determine default model stats (legacy emulation) if user did not provide any if model_stats is None: any_within = any( - hasattr(m, "_r2_within") and not math.isnan(getattr(m, "_r2_within", float("nan"))) + hasattr(m, "_r2_within") + and not math.isnan(getattr(m, "_r2_within", float("nan"))) for m in models ) # Legacy order @@ -276,9 +277,13 @@ def etable( assert isinstance(model_stats, (list, tuple)), "model_stats must be list-like" model_stats = list(model_stats) - assert all(isinstance(s, str) for s in model_stats), "model_stats entries must be strings" + assert all(isinstance(s, str) for s in model_stats), ( + "model_stats entries must be strings" + ) # Assert that there are no duplicates in model_stats - assert len(model_stats) == len(set(model_stats)), "model_stats contains duplicate entries" + assert len(model_stats) == len(set(model_stats)), ( + "model_stats contains duplicate entries" + ) # Default labels by output type def _default_label(stat: str) -> str: @@ -308,7 +313,6 @@ def _default_label(stat: str) -> str: } return mapping.get(stat, stat) - model_stats_rows: dict[str, list[str]] = {} for stat in model_stats: values = [_extract(m, stat) for m in models] @@ -480,7 +484,9 @@ def _default_label(stat: str) -> str: # In that case model_stats_df is (0, 0) and assigning columns would raise a length mismatch. if model_stats_df.shape[1] == 0: # Create an empty frame with the correct columns so later concatenation works. - model_stats_df = pd.DataFrame(index=pd.Index([], name=res.index.name), columns=res.columns) + model_stats_df = pd.DataFrame( + index=pd.Index([], name=res.index.name), columns=res.columns + ) else: model_stats_df.columns = res.columns # Also align fixed effects dataframe columns diff --git a/tests/test_summarise.py b/tests/test_summarise.py index 8149e148b..e778e05a4 100644 --- a/tests/test_summarise.py +++ b/tests/test_summarise.py @@ -145,6 +145,7 @@ def test_summary(): model_stats_labels={"N": "Obs.", "r2": "R²"}, ) + @pytest.mark.skip("Pyfixest PR is not yet merged into stargazer.") def test_stargazer(): data = pf.get_data()