From fe55919e25c260bbf44c48d2e47d796596256bb4 Mon Sep 17 00:00:00 2001 From: alanchalk Date: Thu, 22 May 2025 11:53:08 +0100 Subject: [PATCH] Expose convert_from_pandas, fix test weight scaling, and store train indices in CV - Make convert_from_pandas public in _glm.py to allow external predictions - Scale test weights to sum to 1 in _glm_cv.py to ensure correct test deviance - Store train indices from each fold as self.train_indices_ in _glm_cv.py --- src/glum/_glm.py | 3 +++ src/glum/_glm_cv.py | 10 +++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/glum/_glm.py b/src/glum/_glm.py index 20c5b305..66196684 100644 --- a/src/glum/_glm.py +++ b/src/glum/_glm.py @@ -285,6 +285,9 @@ def _convert_from_pandas( return X + # expose the method publicly so predictions can be created outside of the class + convert_from_pandas = _convert_from_pandas + def _set_up_for_fit(self, y: np.ndarray) -> None: ####################################################################### # 1. input validation # diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py index f4e15f8c..fa542b83 100644 --- a/src/glum/_glm_cv.py +++ b/src/glum/_glm_cv.py @@ -543,7 +543,7 @@ def fit( _stype = ["csc"] else: _stype = ["csc", "csr"] - + def _fit_path( self, train_idx, @@ -571,6 +571,8 @@ def _fit_path( y[test_idx], sample_weight[test_idx], ) + # test weights must also sum to 1; otherwise the test deviance is not properly scaled + w_test /= w_test.sum() if offset is not None: offset_train = offset[train_idx] @@ -667,8 +669,8 @@ def _get_deviance(coef): ) deviance_path_ = [_get_deviance(_coef) for _coef in coef_path_] - return intercept_path_, coef_path_, deviance_path_ - + return intercept_path_, coef_path_, deviance_path_, train_idx + jobs = ( joblib.delayed(_fit_path)( self, @@ -706,6 +708,8 @@ def _get_deviance(coef): (cv.get_n_splits(), len(l1_ratio), len(alphas[0])), ) + self.train_indices_ = [elmt[3] for elmt in paths_data] + avg_deviance = self.deviance_path_.mean(axis=0) # type: ignore 
best_l1, best_alpha = np.unravel_index(