Skip to content

Commit 665d129

Browse files
committed
Release v0.1.5rc1: Clean API refactoring and Pythonic input handling
1 parent a6b3d92 commit 665d129

10 files changed

Lines changed: 833 additions & 434 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,6 @@ dist/
2323
# Development notes with sensitive info
2424
.dev_notes.md
2525
.env
26+
27+
# UV lock file (auto-generated, large)
28+
uv.lock

CHANGELOG.md

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,41 @@
11
# Changelog
22

3-
## Version 0.1.5a1 (Current)
3+
## Version 0.1.5rc1 (2025-10-26)
4+
5+
**Clean API Refactoring & Pythonic Input Handling**: Release candidate with major UX improvements
6+
7+
- **Clean API Architecture**:
8+
- **Eliminated Conversion Logic**: Replaced 30+ lines of repetitive numpy/pandas conversion code with clean helper methods
9+
- Introduced `_ensure_dataframe()`, `_ensure_series()`, and `_validate_constraints()` helper methods
10+
- Clean, single-line conversion: `X = self._ensure_dataframe(X); y = self._ensure_series(y)`
11+
- Dramatically improved code maintainability and readability
12+
- **Smart Feature Naming**: Automatic detection of meaningful feature names from monotonic constraints
13+
- When numpy arrays are passed with monotonic constraints, uses constraint keys as column names
14+
- Example: `monotonic_cst={"Application_Score": -1}` automatically names the feature "Application_Score"
15+
- Eliminates need for manual DataFrame conversion in most cases
16+
- **1D Array Handling**: Properly handles 1D numpy arrays by auto-reshaping to 2D
17+
- Fixes `IndexError: tuple index out of range` when accessing `X.shape[1]` on 1D arrays
18+
- Seamless handling: `encoder.fit(x, y)` where `x` is 1D numpy array now works perfectly
19+
- **Consistent Transform Behavior**: Uses fitted column names in transform method for consistency
20+
- `_ensure_dataframe(X, use_fitted_names=True)` preserves names from fit phase
21+
22+
- **User Experience Improvements**:
23+
- **No More Warnings for Standard Usage**: Removed annoying warnings for common numpy array inputs
24+
- **Intuitive API**: Users can pass 1D arrays with meaningful constraint names seamlessly
25+
- **Backward Compatibility**: All existing functionality preserved, new features are additive
26+
- **Pythonic Design**: Follows Python principles of "it should just work" for common use cases
27+
28+
- **Example Usage** (now works seamlessly):
29+
```python
30+
# Clean, intuitive usage - no more ugly conversion warnings!
31+
encoder = FastWoe(
32+
binning_method="tree",
33+
monotonic_cst={"Application_Score": -1}
34+
)
35+
encoder.fit(x, y) # x can be 1D numpy array - works perfectly!
36+
```
37+
38+
## Version 0.1.5a1 (Previous)
439

540
**Multiclass WOE Refactoring & Bug Fixes**: Major code organization improvements and multiclass prediction fixes
641

examples/fastwoe_monotonic.ipynb

Lines changed: 63 additions & 102 deletions
Large diffs are not rendered by default.

examples/woe_density.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
"""
2+
Weight of Evidence Density Plot
3+
Author: https://github.com/deburky
4+
"""
5+
6+
import os
7+
from pathlib import Path
8+
from tempfile import TemporaryDirectory
9+
10+
import matplotlib.pyplot as plt
11+
import numpy as np
12+
import pandas as pd
13+
from pygam import LogisticGAM, s
14+
from scipy.stats import norm
15+
from sklearn.mixture import GaussianMixture
16+
from sklearn.neighbors import KernelDensity
17+
18+
import fastwoe
19+
from fastwoe import FastWoe
20+
21+
print(f"fastwoe version: {fastwoe.__version__}")
22+
23+
# ---------------------------
24+
# 1. Load data
25+
# ---------------------------
26+
ROOT_DIR = Path(__file__).parent.parent
27+
data_path = ROOT_DIR / "data" / "BankCaseStudyData.csv"
28+
df = pd.read_csv(data_path)
29+
30+
y = df["Final_Decision"].map({"Accept": 0, "Decline": 1}).values
31+
x = df["Application_Score"].values
32+
33+
x_good = x[y == 0]
34+
x_bad = x[y == 1]
35+
36+
# ---------------------------
37+
# 2. Setup
38+
# ---------------------------
39+
x_grid = np.linspace(x.min() - 1, x.max() + 1, 500).reshape(-1, 1)
40+
eps = 1e-6
41+
42+
colors = [
43+
"#d3aa3d", # Normal parametric
44+
"#97d2f1", # KDE
45+
"#1e9575", # GMM
46+
"#e3e162", # Histogram
47+
"#ef7b7b", # GAM
48+
"#6a4c93", # Isotonic
49+
"#4a90e2", # FastWoe (tree)
50+
"#6a4c93", # FastWoe (faiss_kmeans)
51+
]
52+
53+
# ---------------------------
54+
# 3. WOE Calculations
55+
# ---------------------------
56+
57+
# Normal parametric fit
58+
mu_good, std_good = np.mean(x_good), np.std(x_good)
59+
mu_bad, std_bad = np.mean(x_bad), np.std(x_bad)
60+
f_good_norm = norm.pdf(x_grid, mu_good, std_good)
61+
f_bad_norm = norm.pdf(x_grid, mu_bad, std_bad)
62+
woe_norm = np.log((f_bad_norm + eps) / (f_good_norm + eps))
63+
64+
# KDE
65+
kde_good = KernelDensity(kernel="gaussian", bandwidth=0.4).fit(x_good.reshape(-1, 1))
66+
kde_bad = KernelDensity(kernel="gaussian", bandwidth=0.4).fit(x_bad.reshape(-1, 1))
67+
f_good_kde = np.exp(kde_good.score_samples(x_grid))
68+
f_bad_kde = np.exp(kde_bad.score_samples(x_grid))
69+
woe_kde = np.log((f_bad_kde + eps) / (f_good_kde + eps))
70+
71+
# GMM
72+
gmm_good = GaussianMixture(n_components=2, random_state=42).fit(x_good.reshape(-1, 1))
73+
gmm_bad = GaussianMixture(n_components=2, random_state=42).fit(x_bad.reshape(-1, 1))
74+
f_good_gmm = np.exp(gmm_good.score_samples(x_grid))
75+
f_bad_gmm = np.exp(gmm_bad.score_samples(x_grid))
76+
woe_gmm = np.log((f_bad_gmm + eps) / (f_good_gmm + eps))
77+
78+
# Histogram / binned WOE
79+
bins = np.histogram_bin_edges(x, bins="fd")
80+
good_hist, _ = np.histogram(x_good, bins=bins, density=True)
81+
bad_hist, _ = np.histogram(x_bad, bins=bins, density=True)
82+
woe_hist = np.log((bad_hist + eps) / (good_hist + eps))
83+
bin_centers = 0.5 * (bins[1:] + bins[:-1])
84+
85+
# GAM WOE
86+
gam = LogisticGAM(s(0, n_splines=20)).fit(x.reshape(-1, 1), y)
87+
log_odds = gam._modelmat(x_grid) @ gam.coef_ # pylint: disable=protected-access
88+
prior_odds = np.log(y.mean() / (1 - y.mean()))
89+
woe_gam = log_odds - prior_odds
90+
91+
# FastWoe WOE (tree)
92+
encoder = FastWoe(binning_method="tree")
93+
encoder.fit(x, y)
94+
woe_fastwoe = encoder.transform(x_grid.reshape(-1, 1))
95+
96+
# FastWoe WOE (faiss_kmeans)
97+
encoder = FastWoe(binning_method="faiss_kmeans")
98+
encoder.fit(x, y)
99+
woe_fastwoe_faiss = encoder.transform(x_grid.reshape(-1, 1))
100+
101+
# FastWoe WOE (Tree with monotonic constraints)
102+
encoder = FastWoe(
103+
binning_method="tree", monotonic_cst={"Application_Score": -1}
104+
) # Use meaningful feature name - should work automatically now!
105+
encoder.fit(x, y)
106+
woe_fastwoe_mono = encoder.transform(x_grid.reshape(-1, 1))
107+
108+
# Debug: Check if monotonic and non-monotonic results are different
109+
print(
110+
f"Tree without constraints - first 5 WOE values: {woe_fastwoe.values.flatten()[:5]}"
111+
)
112+
print(
113+
f"Tree with monotonic constraints - first 5 WOE values: {woe_fastwoe_mono.values.flatten()[:5]}"
114+
)
115+
print(f"Are they identical? {np.allclose(woe_fastwoe.values, woe_fastwoe_mono.values)}")
116+
117+
# ---------------------------
118+
# 4. Plot
119+
# ---------------------------
120+
fig, ax1 = plt.subplots(figsize=(11, 6))
121+
122+
# WOE curves
123+
# ax1.plot(x_grid, woe_norm, color=colors[0], label="Normal Parametric", linewidth=2)
124+
# ax1.plot(x_grid, woe_kde, color=colors[1], label="KDE", linewidth=2)
125+
ax1.plot(
126+
x_grid,
127+
woe_fastwoe_mono,
128+
color=colors[5],
129+
label="FastWoe (tree) - Monotonic",
130+
linewidth=2,
131+
)
132+
ax1.step(
133+
bin_centers, woe_hist, color=colors[1], label="Histogram", where="mid", linewidth=2
134+
)
135+
ax1.plot(x_grid, woe_gam, color=colors[2], label="GAM", linewidth=2)
136+
ax1.plot(x_grid, woe_fastwoe, color=colors[3], label="FastWoe (tree)", linewidth=2)
137+
ax1.plot(
138+
x_grid,
139+
woe_fastwoe_faiss,
140+
color=colors[4],
141+
label="FastWoe (faiss k-means)",
142+
linewidth=2,
143+
)
144+
145+
ax1.axhline(0, color="black", linestyle="--", linewidth=1)
146+
147+
# Secondary axis for counts (hidden)
148+
ax2 = ax1.twinx()
149+
counts, _ = np.histogram(x, bins=bins)
150+
ax2.bar(
151+
bin_centers,
152+
counts,
153+
width=(bins[1] - bins[0]) * 0.8,
154+
alpha=0.2,
155+
color="gray",
156+
align="center",
157+
)
158+
ax2.get_yaxis().set_visible(False)
159+
160+
# Title and labels
161+
plt.suptitle(
162+
"Weight of Evidence (WOE) by Method — Application Score",
163+
fontsize=22,
164+
y=0.9,
165+
)
166+
ax1.set_xlabel("Application Score", fontsize=14)
167+
ax1.set_ylabel("WOE(x)", fontsize=14)
168+
ax1.tick_params(axis="both", which="major", labelsize=14)
169+
170+
# Hide the top and right spines
171+
ax1.spines["top"].set_visible(False)
172+
ax1.spines["right"].set_visible(False)
173+
174+
# Also hide the top and right spines of the secondary axis (ax2)
175+
ax2.spines["top"].set_visible(False)
176+
ax2.spines["right"].set_visible(False)
177+
178+
# Legend above plot, 3 columns, no frame
179+
ax1.legend(
180+
loc="lower center",
181+
bbox_to_anchor=(0.5, 1.04), # Centers horizontally, positions above plot
182+
ncol=3,
183+
fontsize=11,
184+
frameon=False, # No frame like the first example
185+
)
186+
187+
ax1.grid(False)
188+
fig.tight_layout(rect=(0, 0, 1, 0.91))
189+
190+
# Save the image inside a temporary directory; the directory and the image are deleted when the `with` block exits
191+
with TemporaryDirectory() as temp_dir:
192+
image_path = os.path.join(temp_dir, "woe_density.png")
193+
plt.savefig(image_path)
194+
plt.show()
195+
print(f"WOE density plot saved to: {image_path}")

fastwoe/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from .fastwoe import FastWoe, WoePreprocessor
1414
from .interpret_fastwoe import WeightOfEvidence
1515

16-
__version__ = "0.1.5a1"
16+
__version__ = "0.1.5rc1"
1717
__author__ = "xRiskLab"
1818
__email__ = "contact@xrisklab.ai"
1919

0 commit comments

Comments
 (0)