Skip to content

Commit 5f5ff5f

Browse files
committed
feat: add LightGBM scorecard constructor (alpha)
- Implement LGBScorecardConstructor with extract_leaf_weights() and get_leafs()
- Add comprehensive test suite with 9 tests (104 total tests passing)
- Include working example demonstrating implemented functionality
- Use XAddEvidence column name for consistency with XGBoost
- Methods implemented:
  * extract_leaf_weights(): Parse LightGBM tree structure
  * get_leafs(): Get leaf indices or margins for predictions
- Stub methods for community contribution:
  * construct_scorecard(): WOE/IV calculations (TODO)
  * create_points(): PDO scaling (TODO)
  * predict_score(): Inference (TODO)

Status: Alpha - Reference implementation
Related: issue #7 (@RektPunk)
Pattern: Follows xgb_constructor.py methodology
1 parent b1ca94f commit 5f5ff5f

File tree

3 files changed

+850
-0
lines changed

3 files changed

+850
-0
lines changed
Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
"""
Example: Using LGBScorecardConstructor

This example demonstrates how to use the LightGBM scorecard constructor
and inspect its outputs. Currently implements extract_leaf_weights() and get_leafs().

Status: Alpha - Partial implementation
"""

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

from xbooster.lgb_constructor import LGBScorecardConstructor

# Set random seed for reproducibility
np.random.seed(42)

# ============================================================================
# 1. Create Sample Dataset
# ============================================================================
print("=" * 80)
print("1. Creating Sample Dataset")
print("=" * 80)

n_samples = 1000
X = pd.DataFrame(
    {
        "age": np.random.randint(18, 70, n_samples),
        "income": np.random.randint(20000, 150000, n_samples),
        "debt_ratio": np.random.uniform(0, 1, n_samples),
        "credit_history": np.random.uniform(0, 30, n_samples),
        "employment_years": np.random.randint(0, 40, n_samples),
    }
)

# Create target variable with some relationship to features
y = pd.Series(
    ((X["debt_ratio"] > 0.5) | (X["income"] < 40000) | (X["credit_history"] < 5)).astype(int)
)

# Add some noise: flip ~10% of the labels via XOR with a random mask
y = y ^ (np.random.random(n_samples) < 0.1).astype(int)

print(f"Dataset shape: {X.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")
print(f"Event rate: {y.mean():.2%}")
print()

# Split data (stratified so train/test keep the same event rate)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print()

# ============================================================================
# 2. Train LightGBM Model
# ============================================================================
print("=" * 80)
print("2. Training LightGBM Model")
print("=" * 80)

# Small, shallow ensemble keeps the scorecard inspection readable
model = LGBMClassifier(
    n_estimators=5,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
    verbose=-1,
)

model.fit(X_train, y_train)

# Check model performance
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f"Train accuracy: {train_score:.4f}")
print(f"Test accuracy: {test_score:.4f}")
print(f"Number of trees: {model.n_estimators}")
print(f"Max depth: {model.max_depth}")
print(f"Learning rate: {model.learning_rate}")
print()

# ============================================================================
# 3. Initialize LGBScorecardConstructor
# ============================================================================
print("=" * 80)
print("3. Initializing LGBScorecardConstructor")
print("=" * 80)

constructor = LGBScorecardConstructor(model, X_train, y_train)

print(f"Base score (log-odds): {constructor.base_score:.6f}")
print(f"Number of estimators: {constructor.n_estimators}")
print(f"Learning rate: {constructor.learning_rate}")
print(f"Max depth: {constructor.max_depth}")
print()

# ============================================================================
# 4. Extract Leaf Weights
# ============================================================================
print("=" * 80)
print("4. Extracting Leaf Weights")
print("=" * 80)

leaf_weights = constructor.extract_leaf_weights()

print(f"Leaf weights shape: {leaf_weights.shape}")
print(f"Columns: {leaf_weights.columns.tolist()}")
print()

print("First 10 leaf weights:")
print(leaf_weights.head(10))
print()

print("Summary statistics:")
print(leaf_weights[["Tree", "Split", "XAddEvidence"]].describe())
print()

# Group by tree
print("Leaf weights per tree:")
tree_counts = leaf_weights.groupby("Tree").size()
print(tree_counts)
print()

# Group by feature
print("Leaf weights per feature:")
feature_counts = leaf_weights.groupby("Feature").size().sort_values(ascending=False)
print(feature_counts)
print()

# Check value ranges
print("XAddEvidence range by tree:")
value_ranges = leaf_weights.groupby("Tree")["XAddEvidence"].agg(["min", "max", "mean"])
print(value_ranges)
print()

# ============================================================================
# 5. Get Leaf Indices for Test Data
# ============================================================================
print("=" * 80)
print("5. Getting Leaf Indices for Test Data")
print("=" * 80)

leaf_indices = constructor.get_leafs(X_test, output_type="leaf_index")

print(f"Leaf indices shape: {leaf_indices.shape}")
print(f"Columns: {leaf_indices.columns.tolist()}")
print()

print("First 10 observations:")
print(leaf_indices.head(10))
print()

print("Unique leaf indices per tree:")
for col in leaf_indices.columns:
    unique_leaves = leaf_indices[col].nunique()
    print(f" {col}: {unique_leaves} unique leaves")
print()

# ============================================================================
# 6. Get Margins for Test Data
# ============================================================================
print("=" * 80)
print("6. Getting Margins (Raw Scores) for Test Data")
print("=" * 80)

margins = constructor.get_leafs(X_test, output_type="margin")

print(f"Margins shape: {margins.shape}")
print(f"Columns: {margins.columns.tolist()}")
print()

print("First 10 observations:")
print(margins.head(10))
print()

print("Margin statistics per tree:")
print(margins.describe())
print()

# Verify margins sum to raw predictions
print("Verification: Margins + Base Score = Raw Predictions")
margin_sum = margins.sum(axis=1) + constructor.base_score
raw_pred = model.predict(X_test, raw_score=True)

print(f"Margin sum (first 5): {margin_sum.head().values}")
print(f"Raw prediction (first 5): {raw_pred[:5]}")
print(f"Match (all close): {np.allclose(margin_sum, raw_pred)}")
print()

# ============================================================================
# 7. Detailed Analysis of a Single Tree
# ============================================================================
print("=" * 80)
print("7. Detailed Analysis of Tree 0")
print("=" * 80)

tree_0_weights = leaf_weights[leaf_weights["Tree"] == 0]
print("Tree 0 leaf weights:")
print(tree_0_weights)
print()

# Get the tree structure from LightGBM
tree_df = model.booster_.trees_to_dataframe()
tree_0_structure = tree_df[tree_df["tree_index"] == 0]

print("Tree 0 structure:")
print()

# Split nodes (decision nodes): rows with a non-null split_feature
split_nodes = tree_0_structure[tree_0_structure["split_feature"].notna()]
print("Split/Decision Nodes (internal nodes):")
print(
    split_nodes[
        [
            "node_index",
            "split_feature",
            "threshold",
            "decision_type",
            "left_child",
            "right_child",
        ]
    ]
)
print()

# Leaf nodes: rows with a null split_feature
leaf_nodes = tree_0_structure[tree_0_structure["split_feature"].isna()]
print("Leaf Nodes (terminal nodes):")
print(leaf_nodes[["node_index", "value"]])
print()

print(f"Total nodes: {len(tree_0_structure)} ({len(split_nodes)} split + {len(leaf_nodes)} leaf)")
print()

# ============================================================================
# 8. Feature Importance Analysis
# ============================================================================
print("=" * 80)
print("8. Feature Importance from Leaf Weights")
print("=" * 80)

# Count how many times each feature appears in splits
feature_importance = (
    leaf_weights.groupby("Feature")
    .agg({"XAddEvidence": ["count", "mean", "std", "min", "max"]})
    .round(4)
)

feature_importance.columns = ["Count", "Mean_Value", "Std_Value", "Min_Value", "Max_Value"]
feature_importance = feature_importance.sort_values("Count", ascending=False)

print("Feature importance based on split frequency:")
print(feature_importance)
print()

# ============================================================================
# 9. Compare with Native LightGBM Predictions
# ============================================================================
print("=" * 80)
print("9. Comparing with Native LightGBM Predictions")
print("=" * 80)

# Get predictions
pred_proba = model.predict_proba(X_test)[:, 1]
pred_raw = model.predict(X_test, raw_score=True)

# Show comparison for first 5 samples
# (scalar BaseScore broadcasts across the 5-row frame)
comparison_df = pd.DataFrame(
    {
        "BaseScore": constructor.base_score,
        "MarginSum": margins.sum(axis=1).values[:5],
        "RawScore": pred_raw[:5],
        "Probability": pred_proba[:5],
    }
)

print("Prediction comparison (first 5 samples):")
print(comparison_df)
print()

# Calculate probability from raw score manually (sigmoid of log-odds)
manual_proba = 1 / (1 + np.exp(-pred_raw))
print(f"Manual probability calculation matches: {np.allclose(manual_proba, pred_proba)}")
print()

# ============================================================================
# 10. Summary
# ============================================================================
print("=" * 80)
print("10. Summary")
print("=" * 80)

print("✓ Successfully created LGBScorecardConstructor")
print("✓ Extracted leaf weights from LightGBM model")
print("✓ Retrieved leaf indices for test data")
print("✓ Retrieved margins (raw scores) for test data")
print("✓ Verified margin calculations match LightGBM's raw predictions")
print()

print("Currently implemented methods:")
print(" - extract_leaf_weights(): Extract tree structure and leaf values")
print(" - get_leafs(): Get leaf indices or margins for new data")
print()

print("Methods still to be implemented:")
print(" - construct_scorecard(): Combine leaf weights with event statistics")
print(" - create_points(): Apply PDO (Points to Double Odds) scaling")
print(" - predict_score(): Score new data using the scorecard")
print()

print("Next steps for full implementation:")
print(" 1. Implement construct_scorecard() to calculate WOE/IV")
print(" 2. Implement create_points() for credit score calculation")
print(" 3. Implement predict_score() for inference")
print(" 4. Add SQL query generation for deployment")
print()

print("=" * 80)
print("Example completed successfully!")
print("=" * 80)

0 commit comments

Comments
 (0)