From fe55919e25c260bbf44c48d2e47d796596256bb4 Mon Sep 17 00:00:00 2001
From: alanchalk <alanchalk@gmail.com>
Date: Thu, 22 May 2025 11:53:08 +0100
Subject: [PATCH] Expose convert_from_pandas, fix test weight scaling, and
 store train indices in CV

- Make convert_from_pandas public in _glm.py to allow external predictions
- Scale test weights to sum to 1 in _glm_cv.py to ensure correct test deviance
- Store train indices from each fold as self.train_indices_ in _glm_cv.py

---
 src/glum/_glm.py    |  3 +++
 src/glum/_glm_cv.py | 10 +++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/glum/_glm.py b/src/glum/_glm.py
index 20c5b305..66196684 100644
--- a/src/glum/_glm.py
+++ b/src/glum/_glm.py
@@ -285,6 +285,9 @@ def _convert_from_pandas(
 
         return X
 
+    # Expose the method publicly so that predictions can be created outside the class.
+    convert_from_pandas = _convert_from_pandas
+    
     def _set_up_for_fit(self, y: np.ndarray) -> None:
         #######################################################################
         # 1. input validation                                                 #
diff --git a/src/glum/_glm_cv.py b/src/glum/_glm_cv.py
index f4e15f8c..fa542b83 100644
--- a/src/glum/_glm_cv.py
+++ b/src/glum/_glm_cv.py
@@ -543,7 +543,7 @@ def fit(
             _stype = ["csc"]
         else:
             _stype = ["csc", "csr"]
-
+        
         def _fit_path(
             self,
             train_idx,
@@ -571,6 +571,8 @@ def _fit_path(
                 y[test_idx],
                 sample_weight[test_idx],
             )
+            # test weights need to sum to 1 too, else deviance is not properly scaled
+            w_test /= w_test.sum()
 
             if offset is not None:
                 offset_train = offset[train_idx]
@@ -667,8 +669,8 @@ def _get_deviance(coef):
                 )
                 deviance_path_ = [_get_deviance(_coef) for _coef in coef_path_]
 
-            return intercept_path_, coef_path_, deviance_path_
-
+            return intercept_path_, coef_path_, deviance_path_, train_idx
+        
         jobs = (
             joblib.delayed(_fit_path)(
                 self,
@@ -706,6 +708,8 @@ def _get_deviance(coef):
             (cv.get_n_splits(), len(l1_ratio), len(alphas[0])),
         )
 
+        self.train_indices_ = [elmt[3] for elmt in paths_data]
+
         avg_deviance = self.deviance_path_.mean(axis=0)  # type: ignore
 
         best_l1, best_alpha = np.unravel_index(