From 4ab88d2c614112785ca58ef7b362757b422752cb Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Tue, 10 Mar 2026 17:51:18 +0100 Subject: [PATCH 01/11] Add `train_deviance_path_` to `GeneralizedLinearRegressorCV` Expose training-set deviance alongside validation deviance so users can diagnose over-/under-fitting as regularization changes. Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.rst | 8 ++++++++ src/glum/_glm_cv.py | 36 +++++++++++++++++++++++++++++------- tests/glm/test_glm_cv.py | 26 ++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 150d9b90..14d10fb4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ Changelog ========= +3.3.0 - unreleased +------------------ + +**New features:** + +- :class:`~glum.GeneralizedLinearRegressorCV` now exposes ``train_deviance_path_``, an array of shape ``(n_folds, n_l1_ratios, n_alphas)`` with the training-set deviance for each fold and alpha. Comparing it with ``deviance_path_`` (validation deviance) helps diagnose over-/under-fitting as regularization changes. + + 3.2.0 - 2026-03-10 ------------------ diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 745232eb..a7032f02 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -298,9 +298,12 @@ class GeneralizedLinearRegressorCV(GeneralizedLinearRegressorBase): Estimated intercepts at every point along the regularization path, per fold and l1_ratio. - deviance_path_: array, shape(n_folds, n_alphas) + deviance_path_: array, shape(n_folds, n_l1_ratios, n_alphas) Deviance for the test set on each fold, varying alpha. + train_deviance_path_: array, shape(n_folds, n_l1_ratios, n_alphas) + Deviance for the training set on each fold, varying alpha. + robust : bool, optional (default = False) If true, then robust standard errors are computed by default. @@ -668,12 +671,22 @@ def _fit_path( else: offset_train, offset_test = None, None + x_train_raw = x_train + def _get_deviance(coef): mu = self._link_instance.inverse( _safe_lin_pred(x_test, coef, offset_test) ) return self._family_instance.deviance(y_test, mu, sample_weight=w_test) + def _get_train_deviance(coef): + mu = self._link_instance.inverse( + _safe_lin_pred(x_train_raw, coef, offset_train) + ) + return self._family_instance.deviance( + y_train, mu, sample_weight=w_train + ) + if ( hasattr(self._family_instance, "_power") and self._family_instance._power == 1.5 @@ -744,11 +757,12 @@ def _get_deviance(coef): self.col_means_, self.col_stds_, coef[:, 0], coef[:, 1:] ) assert isinstance(intercept_path_, np.ndarray) # make mypy happy - deviance_path_ = [ - _get_deviance(_coef) - for _coef in np.concatenate( - [intercept_path_[:, np.newaxis], coef_path_], axis=1 - ) + full_coef_path = np.concatenate( + [intercept_path_[:, np.newaxis], coef_path_], axis=1 + ) + deviance_path_ = [_get_deviance(_coef) for _coef in full_coef_path] + train_deviance_path_ = [ + _get_train_deviance(_coef) for _coef in full_coef_path ] else: # set intercept to zero as the other linear models do @@ -756,8 +770,11 @@ def _get_deviance(coef): self.col_means_, self.col_stds_, np.zeros(coef.shape[0]), coef ) deviance_path_ = [_get_deviance(_coef) for _coef in coef_path_] + train_deviance_path_ = [ + _get_train_deviance(_coef) for _coef in coef_path_ + ] - return intercept_path_, coef_path_, deviance_path_ + return intercept_path_, coef_path_, deviance_path_, train_deviance_path_ jobs = ( joblib.delayed(_fit_path)( @@ -796,6 +813,11 @@ def _get_deviance(coef): (cv.get_n_splits(), len(l1_ratio), len(alphas[0])), ) + self.train_deviance_path_ = np.reshape( + [elmt[3] for elmt in paths_data], + (cv.get_n_splits(), len(l1_ratio), len(alphas[0])), + ) + avg_deviance = self.deviance_path_.mean(axis=0) # type: ignore best_l1, best_alpha = np.unravel_index( diff --git a/tests/glm/test_glm_cv.py b/tests/glm/test_glm_cv.py index 10a8ba6b..2458633d 100644 --- a/tests/glm/test_glm_cv.py +++ b/tests/glm/test_glm_cv.py @@ -185,6 +185,7 @@ def _assert_all_close(x, y): _assert_all_close(est_2.l1_ratio_, est_ref.l1_ratio_) _assert_all_close(est_2.coef_path_, est_ref.coef_path_) _assert_all_close(est_2.deviance_path_, est_ref.deviance_path_) + _assert_all_close(est_2.train_deviance_path_, est_ref.train_deviance_path_) _assert_all_close(est_2.intercept_, est_ref.intercept_) _assert_all_close(est_2.coef_, est_ref.coef_) _assert_all_close( @@ -272,6 +273,31 @@ def test_cv_predict_with_alpha_index(l1_ratio): np.testing.assert_allclose(pred_alpha, pred_default) +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_train_deviance_path(fit_intercept): + """train_deviance_path_ should have the correct shape and train deviance + should generally be lower than test deviance.""" + np.random.seed(42) + n_samples, n_features = 200, 5 + n_alphas = 5 + X = np.random.randn(n_samples, n_features) + y = X @ np.array([1, 0.5, -0.5, 0, 0]) + np.random.randn(n_samples) * 0.1 + + model = GeneralizedLinearRegressorCV( + l1_ratio=0.5, + n_alphas=n_alphas, + min_alpha_ratio=1e-2, + fit_intercept=fit_intercept, + ).fit(X, y) + + assert hasattr(model, "train_deviance_path_") + assert model.train_deviance_path_.shape == model.deviance_path_.shape + + avg_train = model.train_deviance_path_.mean(axis=0) + avg_test = model.deviance_path_.mean(axis=0) + assert np.all(avg_train <= avg_test) + + @pytest.mark.parametrize("scale_factor", [1.0, 1000.0]) @pytest.mark.parametrize("l1_ratio", [0.0, 0.5, 1.0]) def test_match_with_base_class(l1_ratio, scale_factor): From 8a5e0d4d52923986afb287aa59e322cd331fca80 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Tue, 10 Mar 2026 18:05:09 +0100 Subject: [PATCH 02/11] Relax train deviance assertion to avoid flaky test Compare overall means instead of per-alpha to avoid failures from unlucky CV splits or high-regularization alphas. Co-Authored-By: Claude Opus 4.6 --- tests/glm/test_glm_cv.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/glm/test_glm_cv.py b/tests/glm/test_glm_cv.py index 2458633d..aefcc226 100644 --- a/tests/glm/test_glm_cv.py +++ b/tests/glm/test_glm_cv.py @@ -293,9 +293,8 @@ def test_train_deviance_path(fit_intercept): assert hasattr(model, "train_deviance_path_") assert model.train_deviance_path_.shape == model.deviance_path_.shape - avg_train = model.train_deviance_path_.mean(axis=0) - avg_test = model.deviance_path_.mean(axis=0) - assert np.all(avg_train <= avg_test) + # On average, train deviance should be lower than test deviance. + assert model.train_deviance_path_.mean() < model.deviance_path_.mean() @pytest.mark.parametrize("scale_factor", [1.0, 1000.0]) From 6a974a24fedcb6476859e44c72beff1d97e6951c Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Tue, 10 Mar 2026 18:06:54 +0100 Subject: [PATCH 03/11] small changes to changelog --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 14d10fb4..0455d1a1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,9 +10,9 @@ Changelog 3.3.0 - unreleased ------------------ -**New features:** +**New feature:** -- :class:`~glum.GeneralizedLinearRegressorCV` now exposes ``train_deviance_path_``, an array of shape ``(n_folds, n_l1_ratios, n_alphas)`` with the training-set deviance for each fold and alpha. Comparing it with ``deviance_path_`` (validation deviance) helps diagnose over-/under-fitting as regularization changes. +- :class:`~glum.GeneralizedLinearRegressorCV` now exposes ``train_deviance_path_``, an array of shape ``(n_folds, n_l1_ratios, n_alphas)`` with the training-set deviance for each fold and alpha. 3.2.0 - 2026-03-10 From 38a08cc13f999397c57851f2ab1e1eacadcd3efe Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 12 Mar 2026 17:18:06 +0100 Subject: [PATCH 04/11] Compute train deviance with standardized x_train --- src/glum/_glm_cv.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index a7032f02..57a142cf 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -671,22 +671,12 @@ def _fit_path( else: offset_train, offset_test = None, None - x_train_raw = x_train - def _get_deviance(coef): mu = self._link_instance.inverse( _safe_lin_pred(x_test, coef, offset_test) ) return self._family_instance.deviance(y_test, mu, sample_weight=w_test) - def _get_train_deviance(coef): - mu = self._link_instance.inverse( - _safe_lin_pred(x_train_raw, coef, offset_train) - ) - return self._family_instance.deviance( - y_train, mu, sample_weight=w_train - ) - if ( hasattr(self._family_instance, "_power") and self._family_instance._power == 1.5 @@ -717,6 +707,14 @@ def _get_train_deviance(coef): P2_no_alpha, ) + def _get_train_deviance(coef): + mu = self._link_instance.inverse( + _safe_lin_pred(x_train, coef, offset_train) + ) + return self._family_instance.deviance( + y_train, mu, sample_weight=w_train + ) + coef = self._get_start_coef( x_train, y_train, @@ -752,6 +750,11 @@ def _get_train_deviance(coef): b_ineq=b_ineq, ) + # Compute train deviance with standardized x_train + raw coef + # (before unstandardize). StandardizedMatrix ensures the linear + # predictor is identical to using unstandardized data. + train_deviance_path_ = [_get_train_deviance(_coef) for _coef in coef] + if self.fit_intercept: intercept_path_, coef_path_ = unstandardize( self.col_means_, self.col_stds_, coef[:, 0], coef[:, 1:] @@ -761,18 +764,12 @@ def _get_train_deviance(coef): [intercept_path_[:, np.newaxis], coef_path_], axis=1 ) deviance_path_ = [_get_deviance(_coef) for _coef in full_coef_path] - train_deviance_path_ = [ - _get_train_deviance(_coef) for _coef in full_coef_path - ] else: # set intercept to zero as the other linear models do intercept_path_, coef_path_ = unstandardize( self.col_means_, self.col_stds_, np.zeros(coef.shape[0]), coef ) deviance_path_ = [_get_deviance(_coef) for _coef in coef_path_] - train_deviance_path_ = [ - _get_train_deviance(_coef) for _coef in coef_path_ - ] return intercept_path_, coef_path_, deviance_path_, train_deviance_path_ From d3c2ccecd9215e34959fdfd69bccfdccb5b5deae Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 12 Mar 2026 17:19:49 +0100 Subject: [PATCH 05/11] remove comment --- src/glum/_glm_cv.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 57a142cf..62a56801 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -750,9 +750,6 @@ def _get_train_deviance(coef): b_ineq=b_ineq, ) - # Compute train deviance with standardized x_train + raw coef - # (before unstandardize). StandardizedMatrix ensures the linear - # predictor is identical to using unstandardized data. train_deviance_path_ = [_get_train_deviance(_coef) for _coef in coef] if self.fit_intercept: From 5d41656c12b994ea212b84984193b9621d1b0c9e Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 12 Mar 2026 17:36:16 +0100 Subject: [PATCH 06/11] test that train deviance is at correct scale; set to severe overfitting --- tests/glm/test_glm_cv.py | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/tests/glm/test_glm_cv.py b/tests/glm/test_glm_cv.py index aefcc226..7aae0032 100644 --- a/tests/glm/test_glm_cv.py +++ b/tests/glm/test_glm_cv.py @@ -274,26 +274,49 @@ def test_cv_predict_with_alpha_index(l1_ratio): @pytest.mark.parametrize("fit_intercept", [False, True]) -def test_train_deviance_path(fit_intercept): - """train_deviance_path_ should have the correct shape and train deviance - should generally be lower than test deviance.""" +@pytest.mark.parametrize("scale", [1.0, 1e4]) +def test_train_deviance_path(fit_intercept, scale): + """train_deviance_path_ should match manually computed train deviance. + Severe overfitting should be visible by comparing train and test deviance. + """ np.random.seed(42) - n_samples, n_features = 200, 5 + n_samples, n_features = 10, 5 n_alphas = 5 - X = np.random.randn(n_samples, n_features) - y = X @ np.array([1, 0.5, -0.5, 0, 0]) + np.random.randn(n_samples) * 0.1 + X = np.random.randn(n_samples, n_features) * scale + y = np.random.randn(n_samples) + cv = skl.model_selection.KFold(n_splits=3) model = GeneralizedLinearRegressorCV( l1_ratio=0.5, n_alphas=n_alphas, min_alpha_ratio=1e-2, fit_intercept=fit_intercept, + cv=cv, ).fit(X, y) - assert hasattr(model, "train_deviance_path_") assert model.train_deviance_path_.shape == model.deviance_path_.shape - # On average, train deviance should be lower than test deviance. + # Manually recompute train deviance from coef_path_ / intercept_path_ + family = model._family_instance + link = model._link_instance + for fold_idx, (train_idx, _) in enumerate(cv.split(X)): + X_train = X[train_idx] + y_train = y[train_idx] + w_train = np.ones(len(train_idx)) / len(train_idx) + for alpha_idx in range(n_alphas): + coef = model.coef_path_[fold_idx, 0, alpha_idx] + intercept = model.intercept_path_[fold_idx, 0, alpha_idx] + lin_pred = X_train @ coef + intercept + mu = link.inverse(lin_pred) + expected = family.deviance(y_train, mu, sample_weight=w_train) + np.testing.assert_allclose( + model.train_deviance_path_[fold_idx, 0, alpha_idx], + expected, + rtol=1e-5, + ) + + # In this severely overfitted example, average train deviance should be lower than + # average test deviance. assert model.train_deviance_path_.mean() < model.deviance_path_.mean() From 2759ef122f0ca1ab711f1007f74ef5ca0376265f Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 13 Mar 2026 09:04:41 +0100 Subject: [PATCH 07/11] remove scaling test because it is too obvious --- tests/glm/test_glm_cv.py | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/tests/glm/test_glm_cv.py b/tests/glm/test_glm_cv.py index 7aae0032..f10c36a7 100644 --- a/tests/glm/test_glm_cv.py +++ b/tests/glm/test_glm_cv.py @@ -273,50 +273,22 @@ def test_cv_predict_with_alpha_index(l1_ratio): np.testing.assert_allclose(pred_alpha, pred_default) -@pytest.mark.parametrize("fit_intercept", [False, True]) -@pytest.mark.parametrize("scale", [1.0, 1e4]) -def test_train_deviance_path(fit_intercept, scale): - """train_deviance_path_ should match manually computed train deviance. - Severe overfitting should be visible by comparing train and test deviance. - """ +def test_train_deviance_path(): + """train_deviance_path_ should have correct shape and train deviance + should be lower than test deviance in a severely overfitted example.""" np.random.seed(42) n_samples, n_features = 10, 5 n_alphas = 5 - X = np.random.randn(n_samples, n_features) * scale + X = np.random.randn(n_samples, n_features) * 1e4 y = np.random.randn(n_samples) - cv = skl.model_selection.KFold(n_splits=3) model = GeneralizedLinearRegressorCV( l1_ratio=0.5, n_alphas=n_alphas, min_alpha_ratio=1e-2, - fit_intercept=fit_intercept, - cv=cv, ).fit(X, y) assert model.train_deviance_path_.shape == model.deviance_path_.shape - - # Manually recompute train deviance from coef_path_ / intercept_path_ - family = model._family_instance - link = model._link_instance - for fold_idx, (train_idx, _) in enumerate(cv.split(X)): - X_train = X[train_idx] - y_train = y[train_idx] - w_train = np.ones(len(train_idx)) / len(train_idx) - for alpha_idx in range(n_alphas): - coef = model.coef_path_[fold_idx, 0, alpha_idx] - intercept = model.intercept_path_[fold_idx, 0, alpha_idx] - lin_pred = X_train @ coef + intercept - mu = link.inverse(lin_pred) - expected = family.deviance(y_train, mu, sample_weight=w_train) - np.testing.assert_allclose( - model.train_deviance_path_[fold_idx, 0, alpha_idx], - expected, - rtol=1e-5, - ) - - # In this severely overfitted example, average train deviance should be lower than - # average test deviance. assert model.train_deviance_path_.mean() < model.deviance_path_.mean() From 701bde79272639cda1a26366c41edcfaad8baa50 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 13 Mar 2026 09:20:36 +0100 Subject: [PATCH 08/11] simplify slightly --- tests/glm/test_glm_cv.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/glm/test_glm_cv.py b/tests/glm/test_glm_cv.py index f10c36a7..0bc3282f 100644 --- a/tests/glm/test_glm_cv.py +++ b/tests/glm/test_glm_cv.py @@ -279,11 +279,10 @@ def test_train_deviance_path(): np.random.seed(42) n_samples, n_features = 10, 5 n_alphas = 5 - X = np.random.randn(n_samples, n_features) * 1e4 + X = np.random.randn(n_samples, n_features) y = np.random.randn(n_samples) model = GeneralizedLinearRegressorCV( - l1_ratio=0.5, n_alphas=n_alphas, min_alpha_ratio=1e-2, ).fit(X, y) From a8a0f21e493bacd62912dcc599181e5bca9e0a97 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Fri, 13 Mar 2026 09:53:48 +0100 Subject: [PATCH 09/11] do save file --- CHANGELOG.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c4c2a899..c61bb112 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,21 +7,17 @@ Changelog ========= -<<<<<<< HEAD + 3.3.0 - unreleased ------------------ **New feature:** - :class:`~glum.GeneralizedLinearRegressorCV` now exposes ``train_deviance_path_``, an array of shape ``(n_folds, n_l1_ratios, n_alphas)`` with the training-set deviance for each fold and alpha. -======= -3.2.1 - unreleased ------------------- **Other changes:** - Downgraded log messages in ``align_df_categories`` and ``add_missing_categories`` from INFO to DEBUG, and deduplicated them so they are emitted only once per column per fitted model. ->>>>>>> main 3.2.0 - 2026-03-10 From 056f54ac45798fb2383679b3f8d6ddef3f8d5225 Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Tue, 17 Mar 2026 18:31:50 +0100 Subject: [PATCH 10/11] small comment why test deviance requires branching by fit_intercept --- src/glum/_glm_cv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index 55324ee5..7ec5b880 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -753,6 +753,8 @@ def _get_train_deviance(coef): train_deviance_path_ = [_get_train_deviance(_coef) for _coef in coef] + # Unlike train deviance, test deviance is computed on unstandardized + # x and coefficient rescaling differs by self.fit_intercept. if self.fit_intercept: intercept_path_, coef_path_ = unstandardize( self.col_means_, self.col_stds_, coef[:, 0], coef[:, 1:] From fcb677b50091b80a8fb0bc10829f0f93250fbbfc Mon Sep 17 00:00:00 2001 From: Matthias Schmidtblaicher Date: Thu, 19 Mar 2026 18:42:09 +0100 Subject: [PATCH 11/11] small overhang from #990 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ca28a444..8934e66a 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ We believe that for GLM development, broad support for distributions, regulariza * Built-in formula-based model specification using `formulaic` * Classical statistical inference for unregularized models * Box constraints, linear inequality constraints, sample weights, offsets -* Support for multiple dataframe backends (pandas, polars, and more) via `narwhals` +* Multiple dataframe backends (pandas, polars, and more) via `narwhals` Performance also matters, so we conducted extensive benchmarks against other modern libraries. Although performance depends on the specific problem, we find that when N >> K (there are more observations than predictors), `glum` is consistently much faster for a wide range of problems. This repo includes the benchmarking tools in the `glum_benchmarks` module. For details, [see here](glum_benchmarks/README.md).