Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/acknowledgements.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ standard errors, t-statistics, p-values, confidence intervals, etc for OLS, IV,
|---|---|---|
| [**lfe**](https://cran.r-project.org/web/packages/lfe/vignettes/lfehow.pdf) | R | We based our first implementation of the MAP algorithm on the description in the "how lfe works" vignette. |
| [**pyhdfe**](https://github.com/jeffgortmaker/pyhdfe) | Python | PyFixest's demeaning results are tested against Jeff Gortmaker's `pyhdfe`. `pyfixest`'s first MVP was built on `pyhdfe`: it ran its demeaning step via `pyhdfe`'s MAP algorithm. |
| [**FixedEffects.jl**](https://github.com/FixedEffects/FixedEffects.jl) | Julia | Matthieu Gomez's Julia package for high-dimensional fixed effects. PyFixest's variance-ratio collinearity check for LSMR backends (`collin_tol_var`) and the default LSMR convergence tolerance (`1e-6`) are informed by the defaults and approach in FixedEffects.jl. |

---

Expand Down
5 changes: 5 additions & 0 deletions pyfixest/estimation/FixestMulti_.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ def _estimate_all_models(
vcov_kwargs: Optional[dict[str, Any]] = None,
demeaner_backend: DemeanerBackendOptions = "numba",
collin_tol: float = 1e-6,
collin_tol_var: Optional[float] = None,
iwls_maxiter: int = 25,
iwls_tol: float = 1e-08,
separation_check: Optional[list[str]] = None,
Expand All @@ -272,6 +273,9 @@ def _estimate_all_models(
Defaults to "numba".
collin_tol : float, optional
The tolerance level for the multicollinearity check. Default is 1e-6.
collin_tol_var : float, optional
Tolerance for the variance ratio collinearity check. Default is None
(auto-enable for LSMR backends with threshold 1e-6).
iwls_maxiter : int, optional
The maximum number of iterations for the IWLS algorithm. Default is 25.
Only relevant for non-linear estimation strategies.
Expand Down Expand Up @@ -350,6 +354,7 @@ def _estimate_all_models(
model_kwargs.update(
{
"demeaner_backend": demeaner_backend,
"collin_tol_var": collin_tol_var,
}
)

Expand Down
13 changes: 11 additions & 2 deletions pyfixest/estimation/api/feglm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ def feglm(
ssc: Optional[dict[str, Union[str, bool]]] = None,
fixef_rm: FixedRmOptions = "singleton",
fixef_tol: float = 1e-06,
fixef_maxiter: int = 100_000,
fixef_maxiter: int = 10_000,
iwls_tol: float = 1e-08,
iwls_maxiter: int = 25,
collin_tol: float = 1e-09,
collin_tol_var: Optional[float] = None,
separation_check: Optional[list[str]] = None,
solver: SolverOptions = "scipy.linalg.solve",
demeaner_backend: DemeanerBackendOptions = "numba",
Expand Down Expand Up @@ -107,7 +108,8 @@ def feglm(

fixef_tol: float, optional
Tolerance for the fixed effects demeaning algorithm. Defaults to 1e-06.
Currently does not do anything, as fixed effects are not supported for GLMs.
For LSMR-based backends (cupy, cupy32, cupy64, scipy), the tolerance is
passed directly as LSMR's atol and btol parameters.

fixef_maxiter: int, optional
Maximum iterations for the demeaning algorithm.
Expand All @@ -122,6 +124,12 @@ def feglm(
collin_tol : float, optional
Tolerance for collinearity check, by default 1e-09.

collin_tol_var : float, optional
Tolerance for the variance ratio collinearity check.
Default is None: auto-enabled with threshold 1e-6 for LSMR
backends (cupy, cupy32, cupy64, scipy), disabled for
MAP backends (numba, rust, jax). Set to 0 to disable explicitly.

separation_check: list[str], optional
Methods to identify and drop separated observations.
Either "fe" or "ir". Executes "fe" by default (when None).
Expand Down Expand Up @@ -323,6 +331,7 @@ class [FixestMulti](/reference/estimation.FixestMulti_.FixestMulti.qmd) for mult
separation_check=separation_check,
demeaner_backend=demeaner_backend,
accelerate=accelerate,
collin_tol_var=collin_tol_var,
)

if fixest._is_multiple_estimation:
Expand Down
10 changes: 10 additions & 0 deletions pyfixest/estimation/api/feols.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def feols(
fixef_tol=1e-06,
fixef_maxiter: int = 10_000,
collin_tol: float = 1e-09,
collin_tol_var: Optional[float] = None,
drop_intercept: bool = False,
copy_data: bool = True,
store_data: bool = True,
Expand Down Expand Up @@ -89,8 +90,16 @@ def feols(
collin_tol : float, optional
Tolerance for collinearity check, by default 1e-09.

collin_tol_var : float, optional
Tolerance for the variance ratio collinearity check.
Default is None: auto-enabled with threshold 1e-6 for LSMR
backends (cupy, cupy32, cupy64, scipy), disabled for
MAP backends (numba, rust, jax). Set to 0 to disable explicitly.

fixef_tol: float, optional
Tolerance for the fixed effects demeaning algorithm. Defaults to 1e-06.
For LSMR-based backends (cupy, cupy32, cupy64, scipy), the tolerance is
passed directly as LSMR's atol and btol parameters.

fixef_maxiter: int, optional
Maximum number of iterations for the demeaning algorithm. Defaults to 10,000.
Expand Down Expand Up @@ -514,6 +523,7 @@ def _lspline(series: pd.Series, knots: list[float]) -> np.array:
vcov_kwargs=vcov_kwargs,
collin_tol=collin_tol,
demeaner_backend=demeaner_backend,
collin_tol_var=collin_tol_var,
)

if fixest._is_multiple_estimation:
Expand Down
10 changes: 10 additions & 0 deletions pyfixest/estimation/api/fepois.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def fepois(
iwls_tol: float = 1e-08,
iwls_maxiter: int = 25,
collin_tol: float = 1e-09,
collin_tol_var: Optional[float] = None,
separation_check: Optional[list[str]] = None,
solver: SolverOptions = "scipy.linalg.solve",
demeaner_backend: DemeanerBackendOptions = "numba",
Expand Down Expand Up @@ -99,6 +100,8 @@ def fepois(

fixef_tol: float, optional
Tolerance for the fixed effects demeaning algorithm. Defaults to 1e-06.
For LSMR-based backends (cupy, cupy32, cupy64, scipy), the tolerance is
passed directly as LSMR's atol and btol parameters.

fixef_maxiter: int, optional
Maximum number of iterations for the demeaning algorithm. Defaults to 100,000.
Expand All @@ -112,6 +115,12 @@ def fepois(
collin_tol : float, optional
Tolerance for collinearity check, by default 1e-09.

collin_tol_var : float, optional
Tolerance for the variance ratio collinearity check.
Default is None: auto-enabled with threshold 1e-6 for LSMR
backends (cupy, cupy32, cupy64, scipy), disabled for
MAP backends (numba, rust, jax). Set to 0 to disable explicitly.

separation_check: list[str], optional
Methods to identify and drop separated observations.
Either "fe" or "ir". Executes "fe" by default (when None).
Expand Down Expand Up @@ -267,6 +276,7 @@ def fepois(
collin_tol=collin_tol,
separation_check=separation_check,
demeaner_backend=demeaner_backend,
collin_tol_var=collin_tol_var,
)

if fixest._is_multiple_estimation:
Expand Down
2 changes: 1 addition & 1 deletion pyfixest/estimation/api/quantreg.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def quantreg(

fixef_rm = "none"
fixef_tol = 1e-06
fixef_maxiter = 100_000
fixef_maxiter = 10_000
iwls_tol = 1e-08
iwls_maxiter = 25

Expand Down
216 changes: 216 additions & 0 deletions pyfixest/estimation/collinearity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
"""Multicollinearity detection utilities."""

import warnings
from typing import Callable, Optional

import numpy as np


def _drop_multicollinear_variables_chol(
    X_demeaned: np.ndarray,
    coefnames: list[str],
    collin_tol: float,
    backend_func: Callable,
) -> tuple[np.ndarray, list[str], list[str], list[int]]:
    """
    Drop multicollinear columns from a demeaned design matrix.

    The Gram matrix ``X_demeaned.T @ X_demeaned`` is handed to
    ``backend_func`` together with ``collin_tol``. Columns the backend marks
    for exclusion are removed from ``X_demeaned`` and ``coefnames``, and a
    warning naming them is emitted.

    Parameters
    ----------
    X_demeaned : numpy.ndarray
        A demeaned matrix.
    coefnames : list[str]
        The names of the coefficients, one per column of ``X_demeaned``.
    collin_tol : float
        The tolerance level for the multicollinearity check. Passed through
        to ``backend_func`` unchanged.
    backend_func : Callable
        Which backend function to use for the multicollinearity check. It is
        called as ``backend_func(tXX, collin_tol)`` and must return a tuple
        ``(id_excl, n_excl, all_removed)``: a NumPy array selecting the
        excluded columns, the number of excluded columns, and a flag that is
        truthy when every column was excluded.

    Returns
    -------
    X_demeaned : numpy.ndarray
        X_demeaned excluding multicollinear variables. The input object is
        returned unchanged when nothing is excluded.
    coefnames : list[str]
        The names of the coefficients, excluding those identified as
        collinear. Always plain Python ``str`` objects.
    collin_vars : list[str]
        The collinear variables identified during the check.
    collin_index : list
        ``id_excl.tolist()`` when at least one variable was excluded,
        otherwise an empty list.
        NOTE(review): whether the entries are integer positions or boolean
        flags depends on what ``backend_func`` returns -- confirm against
        the backend.

    Raises
    ------
    ValueError
        If the backend reports that all variables were removed, or if no
        column is left after dropping the excluded ones.
    """
    # TODO: avoid doing this computation twice, e.g. compute tXXinv here as fixest does

    tXX = X_demeaned.T @ X_demeaned
    id_excl, n_excl, all_removed = backend_func(tXX, collin_tol)

    collin_vars: list[str] = []
    collin_index: list[int] = []

    if all_removed:
        raise ValueError(
            "\nAll variables are collinear. Maybe your model specification "
            "introduces multicollinearity? If not, please reach out to the "
            "package authors!.\n"
        )

    names_array = np.array(coefnames)
    if n_excl > 0:
        collin_vars = names_array[id_excl].tolist()

        # Keep the warning readable: show at most five names, one per line.
        if len(collin_vars) > 5:
            indent = " "
            formatted_collinear_vars = (
                f"\n{indent}" + f"\n{indent}".join(collin_vars[:5]) + f"\n{indent}..."
            )
        else:
            formatted_collinear_vars = str(collin_vars)

        warnings.warn(
            f"\n{len(collin_vars)} variables dropped due to multicollinearity."
            f"\nThe following variables are dropped: {formatted_collinear_vars}.\n"
        )

        X_demeaned = np.delete(X_demeaned, id_excl, axis=1)
        if X_demeaned.ndim == 2 and X_demeaned.shape[1] == 0:
            raise ValueError(
                "\nAll variables are collinear. Please check your model specification.\n"
            )

        names_array = np.delete(names_array, id_excl)
        collin_index = id_excl.tolist()

    # ``.tolist()`` converts numpy.str_ scalars to built-in ``str`` so callers
    # get the same element type as from the variance-ratio helper.
    return X_demeaned, names_array.tolist(), collin_vars, collin_index


def _drop_multicollinear_variables_var(
    X_demeaned: np.ndarray,
    coefnames: list[str],
    X_raw_sumsq: Optional[np.ndarray],
    collin_tol_var: float,
) -> tuple[np.ndarray, list[str], list[str], list[int]]:
    """
    Detect variables absorbed by fixed effects via variance ratio.

    Computes rho_i = ||x_tilde_i||^2 / ||x_i||^2 for each column, where the
    numerator is the column sum of squares of ``X_demeaned`` and the
    denominator is the matching entry of ``X_raw_sumsq``.
    Columns with rho_i < collin_tol_var are flagged as absorbed, dropped, and
    reported in a warning.

    Parameters
    ----------
    X_demeaned : numpy.ndarray
        The demeaned design matrix.
    coefnames : list[str]
        The names of the coefficients, one per column of ``X_demeaned``.
    X_raw_sumsq : numpy.ndarray or None
        Squared column norms of X before demeaning. When None, the check is
        skipped and the inputs are returned unchanged.
    collin_tol_var : float
        Tolerance for the variance ratio check.

    Returns
    -------
    X_demeaned : numpy.ndarray
        The design matrix after removing absorbed variables.
    coefnames : list[str]
        Coefficient names after removing absorbed variables.
    collin_vars : list[str]
        Names of absorbed variables.
    collin_index : list[int]
        Indices of absorbed variables (positions in the input columns).
    """
    if X_raw_sumsq is None or X_demeaned.shape[1] == 0:
        return X_demeaned, coefnames, [], []

    demeaned_norms = (X_demeaned**2).sum(axis=0)

    # A zero entry in X_raw_sumsq turns the ratio into 0/0 (NaN) or x/0 (inf).
    # Neither compares below the tolerance, so such columns are never flagged;
    # silence NumPy's floating-point warnings so the only warning this helper
    # can emit is the "dropped" one below.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratios = demeaned_norms / X_raw_sumsq
        absorbed_mask = ratios < collin_tol_var

    if not absorbed_mask.any():
        return X_demeaned, coefnames, [], []

    collin_index = np.where(absorbed_mask)[0]
    names_array = np.array(coefnames)
    collin_vars = names_array[collin_index].tolist()

    warnings.warn(
        f"\n{len(collin_vars)} variables dropped (absorbed by fixed effects)."
        f"\nThe following variables are dropped: {collin_vars}.\n"
    )

    X_demeaned = np.delete(X_demeaned, collin_index, axis=1)
    coefnames = np.delete(names_array, collin_index).tolist()

    return X_demeaned, coefnames, collin_vars, collin_index.tolist()


def drop_multicollinear_variables(
    X_demeaned: np.ndarray,
    coefnames: list[str],
    collin_tol: float,
    backend_func: Callable,
    X_raw_sumsq: Optional[np.ndarray],
    collin_tol_var: float,
    has_fixef: bool,
) -> tuple[np.ndarray, list[str], list[str], list[int]]:
    """
    Apply the Cholesky check, then the variance ratio check, to the design matrix.

    The Cholesky-based check runs whenever the matrix has at least one
    column. The variance ratio check runs afterwards, on the surviving
    columns, only if the model has fixed effects, ``collin_tol_var`` is
    positive, ``X_raw_sumsq`` is available, and at least one column is left.

    Parameters
    ----------
    X_demeaned : numpy.ndarray
        The demeaned design matrix.
    coefnames : list[str]
        The names of the coefficients.
    collin_tol : float
        Tolerance for the Cholesky multicollinearity check.
    backend_func : Callable
        Backend function for the Cholesky check.
    X_raw_sumsq : numpy.ndarray or None
        Squared column norms of X before demeaning.
    collin_tol_var : float
        Tolerance for the variance ratio check.
    has_fixef : bool
        Whether the model has fixed effects.

    Returns
    -------
    X_demeaned : numpy.ndarray
        The design matrix after removing collinear variables.
    coefnames : list[str]
        Coefficient names after removing collinear variables.
    collin_vars : list[str]
        Names of all removed variables (Cholesky drops first, then
        variance ratio drops).
    collin_index : list[int]
        Indices of removed variables (relative to the original input columns).
    """
    n_cols = X_demeaned.shape[1]
    dropped_names: list[str] = []
    dropped_index: list[int] = []

    # Stage 1: Cholesky-based check.
    if n_cols > 0:
        X_demeaned, coefnames, chol_vars, chol_idx = (
            _drop_multicollinear_variables_chol(
                X_demeaned, coefnames, collin_tol, backend_func
            )
        )
        dropped_names += chol_vars
        dropped_index += chol_idx

    # Stage 2 is skipped unless every precondition holds. The conditions are
    # evaluated in this order on purpose, so ``collin_tol_var`` is only
    # compared when the model has fixed effects.
    if (
        not has_fixef
        or not collin_tol_var > 0
        or X_raw_sumsq is None
        or X_demeaned.shape[1] == 0
    ):
        return X_demeaned, coefnames, dropped_names, dropped_index

    # Align the raw column norms with the columns that survived stage 1.
    if chol_idx:
        X_raw_sumsq = np.delete(X_raw_sumsq, chol_idx)

    X_demeaned, coefnames, var_vars, var_idx = _drop_multicollinear_variables_var(
        X_demeaned, coefnames, X_raw_sumsq, collin_tol_var
    )
    dropped_names += var_vars

    if var_idx:
        # ``var_idx`` is relative to the stage-1 survivors; translate it back
        # to positions in the original input columns.
        # NOTE(review): these are integer positions, whereas ``chol_idx`` is
        # whatever list the Cholesky helper returned -- if that is a list of
        # boolean flags the combined index mixes two forms; confirm against
        # the backend.
        surviving = np.delete(np.arange(n_cols), chol_idx)
        dropped_index += surviving[var_idx].tolist()

    return X_demeaned, coefnames, dropped_names, dropped_index
Loading
Loading