|
| 1 | +""" |
| 2 | +Example: Using LGBScorecardConstructor |
| 3 | +
|
| 4 | +This example demonstrates how to use the LightGBM scorecard constructor |
| 5 | +and inspect its outputs. Currently implements extract_leaf_weights() and get_leafs(). |
| 6 | +
|
| 7 | +Status: Alpha - Partial implementation |
| 8 | +""" |
| 9 | + |
| 10 | +import numpy as np |
| 11 | +import pandas as pd |
| 12 | +from lightgbm import LGBMClassifier |
| 13 | +from sklearn.model_selection import train_test_split |
| 14 | + |
| 15 | +from xbooster.lgb_constructor import LGBScorecardConstructor |
| 16 | + |
# Reproducibility: fix the global NumPy RNG state up front.
np.random.seed(42)

# ============================================================================
# 1. Create Sample Dataset
# ============================================================================
print("=" * 80)
print("1. Creating Sample Dataset")
print("=" * 80)

n_samples = 1000

# Synthetic credit-style features. NOTE: the draw order below is part of the
# reproducibility contract — reordering calls changes the generated data.
feature_data = {
    "age": np.random.randint(18, 70, n_samples),
    "income": np.random.randint(20000, 150000, n_samples),
    "debt_ratio": np.random.uniform(0, 1, n_samples),
    "credit_history": np.random.uniform(0, 30, n_samples),
    "employment_years": np.random.randint(0, 40, n_samples),
}
X = pd.DataFrame(feature_data)

# Target: event fires for high debt load, low income, or a thin credit history.
risky = (X["debt_ratio"] > 0.5) | (X["income"] < 40000) | (X["credit_history"] < 5)
y = pd.Series(risky.astype(int))

# Flip roughly 10% of labels to inject label noise.
noise_flips = (np.random.random(n_samples) < 0.1).astype(int)
y = y ^ noise_flips

print(f"Dataset shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")
print(f"Event rate: {y.mean():.2%}")
print()

# Stratified 70/30 split keeps the event rate comparable across partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print()
| 59 | + |
# ============================================================================
# 2. Train LightGBM Model
# ============================================================================
print("=" * 80)
print("2. Training LightGBM Model")
print("=" * 80)

# A tiny, shallow ensemble keeps the resulting scorecard easy to inspect.
lgbm_params = dict(
    n_estimators=5,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
    verbose=-1,
)
model = LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

# Quick sanity check of fit quality on both partitions.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train accuracy: {train_score:.4f}")
print(f"Test accuracy: {test_score:.4f}")
print(f"Number of trees: {model.n_estimators}")
print(f"Max depth: {model.max_depth}")
print(f"Learning rate: {model.learning_rate}")
print()
| 87 | + |
# ============================================================================
# 3. Initialize LGBScorecardConstructor
# ============================================================================
print("=" * 80)
print("3. Initializing LGBScorecardConstructor")
print("=" * 80)

constructor = LGBScorecardConstructor(model, X_train, y_train)

print(f"Base score (log-odds): {constructor.base_score:.6f}")
print(f"Number of estimators: {constructor.n_estimators}")
print(f"Learning rate: {constructor.learning_rate}")
print(f"Max depth: {constructor.max_depth}")
print()

# ============================================================================
# 4. Extract Leaf Weights
# ============================================================================
print("=" * 80)
print("4. Extracting Leaf Weights")
print("=" * 80)

leaf_weights = constructor.extract_leaf_weights()

print(f"Leaf weights shape: {leaf_weights.shape}")
print(f"Columns: {leaf_weights.columns.tolist()}")
print()

print("First 10 leaf weights:")
print(leaf_weights.head(10))
print()

print("Summary statistics:")
print(leaf_weights[["Tree", "Split", "XAddEvidence"]].describe())
print()

# How many leaf rows each boosting round contributed.
print("Leaf weights per tree:")
print(leaf_weights.groupby("Tree").size())
print()

# Which features the trees split on, most frequent first.
print("Leaf weights per feature:")
print(leaf_weights.groupby("Feature").size().sort_values(ascending=False))
print()

# Spread of leaf values (XAddEvidence) within each tree.
print("XAddEvidence range by tree:")
print(leaf_weights.groupby("Tree")["XAddEvidence"].agg(["min", "max", "mean"]))
print()
| 141 | + |
# ============================================================================
# 5. Get Leaf Indices for Test Data
# ============================================================================
print("=" * 80)
print("5. Getting Leaf Indices for Test Data")
print("=" * 80)

leaf_indices = constructor.get_leafs(X_test, output_type="leaf_index")

print(f"Leaf indices shape: {leaf_indices.shape}")
print(f"Columns: {leaf_indices.columns.tolist()}")
print()

print("First 10 observations:")
print(leaf_indices.head(10))
print()

# One column per tree; report how many distinct leaves each tree routes to.
print("Unique leaf indices per tree:")
for tree_col in leaf_indices.columns:
    print(f"  {tree_col}: {leaf_indices[tree_col].nunique()} unique leaves")
print()

# ============================================================================
# 6. Get Margins for Test Data
# ============================================================================
print("=" * 80)
print("6. Getting Margins (Raw Scores) for Test Data")
print("=" * 80)

margins = constructor.get_leafs(X_test, output_type="margin")

print(f"Margins shape: {margins.shape}")
print(f"Columns: {margins.columns.tolist()}")
print()

print("First 10 observations:")
print(margins.head(10))
print()

print("Margin statistics per tree:")
print(margins.describe())
print()

# Per-tree margins plus the intercept should reproduce LightGBM's raw output.
print("Verification: Margins + Base Score = Raw Predictions")
reconstructed = margins.sum(axis=1) + constructor.base_score
lgb_raw = model.predict(X_test, raw_score=True)

print(f"Margin sum (first 5): {reconstructed.head().values}")
print(f"Raw prediction (first 5): {lgb_raw[:5]}")
print(f"Match (all close): {np.allclose(reconstructed, lgb_raw)}")
print()
| 195 | + |
# ============================================================================
# 7. Detailed Analysis of a Single Tree
# ============================================================================
print("=" * 80)
print("7. Detailed Analysis of Tree 0")
print("=" * 80)

print("Tree 0 leaf weights:")
print(leaf_weights[leaf_weights["Tree"] == 0])
print()

# Pull the raw tree structure straight from the trained booster.
all_trees = model.booster_.trees_to_dataframe()
first_tree = all_trees[all_trees["tree_index"] == 0]

print("Tree 0 structure:")
print()

# In trees_to_dataframe(), internal nodes carry a split_feature; leaves do not.
internal_nodes = first_tree[first_tree["split_feature"].notna()]
split_columns = [
    "node_index",
    "split_feature",
    "threshold",
    "decision_type",
    "left_child",
    "right_child",
]
print("Split/Decision Nodes (internal nodes):")
print(internal_nodes[split_columns])
print()

terminal_nodes = first_tree[first_tree["split_feature"].isna()]
print("Leaf Nodes (terminal nodes):")
print(terminal_nodes[["node_index", "value"]])
print()

print(f"Total nodes: {len(first_tree)} ({len(internal_nodes)} split + {len(terminal_nodes)} leaf)")
print()

# ============================================================================
# 8. Feature Importance Analysis
# ============================================================================
print("=" * 80)
print("8. Feature Importance from Leaf Weights")
print("=" * 80)

# Frequency-based importance: how often each feature appears across leaf rows,
# plus summary statistics of the associated leaf values.
per_feature = leaf_weights.groupby("Feature").agg(
    {"XAddEvidence": ["count", "mean", "std", "min", "max"]}
)
feature_importance = per_feature.round(4)
feature_importance.columns = ["Count", "Mean_Value", "Std_Value", "Min_Value", "Max_Value"]
feature_importance = feature_importance.sort_values("Count", ascending=False)

print("Feature importance based on split frequency:")
print(feature_importance)
print()
| 261 | + |
# ============================================================================
# 9. Compare with Native LightGBM Predictions
# ============================================================================
print("=" * 80)
print("9. Comparing with Native LightGBM Predictions")
print("=" * 80)

# Probability of the positive class and the raw (log-odds) score from LightGBM.
proba_pos = model.predict_proba(X_test)[:, 1]
raw_scores = model.predict(X_test, raw_score=True)

# Side-by-side view for the first few samples; BaseScore is a scalar and is
# broadcast by pandas across all rows.
n_show = 5
comparison_df = pd.DataFrame(
    {
        "BaseScore": constructor.base_score,
        "MarginSum": margins.sum(axis=1).values[:n_show],
        "RawScore": raw_scores[:n_show],
        "Probability": proba_pos[:n_show],
    }
)

print("Prediction comparison (first 5 samples):")
print(comparison_df)
print()

# For a binary objective, sigmoid(raw score) must equal predict_proba.
sigmoid_of_raw = 1 / (1 + np.exp(-raw_scores))
print(f"Manual probability calculation matches: {np.allclose(sigmoid_of_raw, proba_pos)}")
print()
| 291 | + |
# ============================================================================
# 10. Summary
# ============================================================================
print("=" * 80)
print("10. Summary")
print("=" * 80)

# What this example demonstrated end to end.
for accomplished in (
    "✓ Successfully created LGBScorecardConstructor",
    "✓ Extracted leaf weights from LightGBM model",
    "✓ Retrieved leaf indices for test data",
    "✓ Retrieved margins (raw scores) for test data",
    "✓ Verified margin calculations match LightGBM's raw predictions",
):
    print(accomplished)
print()

print("Currently implemented methods:")
for implemented in (
    " - extract_leaf_weights(): Extract tree structure and leaf values",
    " - get_leafs(): Get leaf indices or margins for new data",
):
    print(implemented)
print()

print("Methods still to be implemented:")
for pending in (
    " - construct_scorecard(): Combine leaf weights with event statistics",
    " - create_points(): Apply PDO (Points to Double Odds) scaling",
    " - predict_score(): Score new data using the scorecard",
):
    print(pending)
print()

print("Next steps for full implementation:")
for step in (
    " 1. Implement construct_scorecard() to calculate WOE/IV",
    " 2. Implement create_points() for credit score calculation",
    " 3. Implement predict_score() for inference",
    " 4. Add SQL query generation for deployment",
):
    print(step)
print()

print("=" * 80)
print("Example completed successfully!")
print("=" * 80)
0 commit comments