|
| 1 | +""" |
| 2 | +Weight of Evidence Density Plot |
| 3 | +Author: https://github.com/deburky |
| 4 | +""" |
| 5 | + |
| 6 | +import os |
| 7 | +from pathlib import Path |
| 8 | +from tempfile import TemporaryDirectory |
| 9 | + |
| 10 | +import matplotlib.pyplot as plt |
| 11 | +import numpy as np |
| 12 | +import pandas as pd |
| 13 | +from pygam import LogisticGAM, s |
| 14 | +from scipy.stats import norm |
| 15 | +from sklearn.mixture import GaussianMixture |
| 16 | +from sklearn.neighbors import KernelDensity |
| 17 | + |
| 18 | +import fastwoe |
| 19 | +from fastwoe import FastWoe |
| 20 | + |
| 21 | +print(f"fastwoe version: {fastwoe.__version__}") |
| 22 | + |
# ---------------------------
# 1. Load data
# ---------------------------
# Resolve the data file relative to this script so the cwd doesn't matter.
ROOT_DIR = Path(__file__).parent.parent
data_path = ROOT_DIR / "data" / "BankCaseStudyData.csv"
df = pd.read_csv(data_path)

# Binary target: Decline = 1 ("bad"), Accept = 0 ("good").
# NOTE(review): any other Final_Decision value maps to NaN — verify upstream data.
# .to_numpy() is the pandas-recommended replacement for .values.
y = df["Final_Decision"].map({"Accept": 0, "Decline": 1}).to_numpy()
x = df["Application_Score"].to_numpy()

x_good = x[y == 0]
x_bad = x[y == 1]
| 35 | + |
# ---------------------------
# 2. Setup
# ---------------------------
# Dense evaluation grid as a 2-D column vector (sklearn-style APIs expect 2-D).
x_grid = np.linspace(x.min() - 1, x.max() + 1, 500).reshape(-1, 1)
eps = 1e-6  # guards against log(0) / division by zero in the WOE ratios

# Palette indexed by the plotting code in section 4; comments reflect actual use.
colors = [
    "#d3aa3d",  # unused (Normal parametric curve is not plotted)
    "#97d2f1",  # Histogram
    "#1e9575",  # GAM
    "#e3e162",  # FastWoe (tree)
    "#ef7b7b",  # FastWoe (faiss k-means)
    "#6a4c93",  # FastWoe (tree, monotonic)
    "#4a90e2",  # unused
    "#6a4c93",  # unused
]
| 52 | + |
# ---------------------------
# 3. WOE Calculations
# ---------------------------

# Normal parametric fit: model each class with a single Gaussian and take
# the log density ratio bad/good on the evaluation grid.
mu_good, std_good = x_good.mean(), x_good.std()
mu_bad, std_bad = x_bad.mean(), x_bad.std()
f_good_norm = norm.pdf(x_grid, mu_good, std_good)
f_bad_norm = norm.pdf(x_grid, mu_bad, std_bad)
woe_norm = np.log((f_bad_norm + eps) / (f_good_norm + eps))
| 63 | + |
# KDE: nonparametric density per class (Gaussian kernel, bandwidth 0.4).
kde_good = KernelDensity(kernel="gaussian", bandwidth=0.4)
kde_good.fit(x_good.reshape(-1, 1))
kde_bad = KernelDensity(kernel="gaussian", bandwidth=0.4)
kde_bad.fit(x_bad.reshape(-1, 1))
# score_samples returns log-density; exponentiate back to densities.
f_good_kde = np.exp(kde_good.score_samples(x_grid))
f_bad_kde = np.exp(kde_bad.score_samples(x_grid))
woe_kde = np.log((f_bad_kde + eps) / (f_good_kde + eps))
| 70 | + |
# GMM: two-component Gaussian mixture per class (fixed seed for determinism).
gmm_good = GaussianMixture(n_components=2, random_state=42)
gmm_good.fit(x_good.reshape(-1, 1))
gmm_bad = GaussianMixture(n_components=2, random_state=42)
gmm_bad.fit(x_bad.reshape(-1, 1))
# score_samples is log-density; convert back before taking the ratio.
f_good_gmm = np.exp(gmm_good.score_samples(x_grid))
f_bad_gmm = np.exp(gmm_bad.score_samples(x_grid))
woe_gmm = np.log((f_bad_gmm + eps) / (f_good_gmm + eps))
| 77 | + |
# Histogram / binned WOE: Freedman–Diaconis bin edges computed on the pooled
# sample so both classes share the same bins.
bins = np.histogram_bin_edges(x, bins="fd")
good_hist = np.histogram(x_good, bins=bins, density=True)[0]
bad_hist = np.histogram(x_bad, bins=bins, density=True)[0]
woe_hist = np.log((bad_hist + eps) / (good_hist + eps))
bin_centers = (bins[:-1] + bins[1:]) / 2.0
| 84 | + |
# GAM WOE: smooth log-odds from a spline logistic GAM, centred by the sample
# log prior odds so the curve reads as WOE rather than raw log-odds.
gam = LogisticGAM(s(0, n_splines=20)).fit(x.reshape(-1, 1), y)
# NOTE(review): _modelmat is a private pygam API and may break across versions.
log_odds = gam._modelmat(x_grid) @ gam.coef_  # pylint: disable=protected-access
# This is the *log* of the prior odds, not the odds themselves.
log_prior_odds = np.log(y.mean() / (1 - y.mean()))
woe_gam = log_odds - log_prior_odds
| 90 | + |
# FastWoe WOE, three binning strategies. Each encoder gets its own name so a
# fitted encoder isn't silently discarded by rebinding a single variable.
# x_grid is already shaped (-1, 1), so no extra reshape is needed.

# FastWoe WOE (tree binning)
tree_encoder = FastWoe(binning_method="tree")
tree_encoder.fit(x, y)
woe_fastwoe = tree_encoder.transform(x_grid)

# FastWoe WOE (faiss k-means binning)
faiss_encoder = FastWoe(binning_method="faiss_kmeans")
faiss_encoder.fit(x, y)
woe_fastwoe_faiss = faiss_encoder.transform(x_grid)

# FastWoe WOE (tree binning with a decreasing monotonic constraint)
mono_encoder = FastWoe(
    binning_method="tree", monotonic_cst={"Application_Score": -1}
)  # Use meaningful feature name - should work automatically now!
mono_encoder.fit(x, y)
woe_fastwoe_mono = mono_encoder.transform(x_grid)
| 107 | + |
# Debug: confirm the monotonic constraint actually changed the WOE curve.
first_plain = woe_fastwoe.values.flatten()[:5]
first_mono = woe_fastwoe_mono.values.flatten()[:5]
print(f"Tree without constraints - first 5 WOE values: {first_plain}")
print(f"Tree with monotonic constraints - first 5 WOE values: {first_mono}")
print(f"Are they identical? {np.allclose(woe_fastwoe.values, woe_fastwoe_mono.values)}")
| 116 | + |
# ---------------------------
# 4. Plot
# ---------------------------
fig, ax1 = plt.subplots(figsize=(11, 6))

# WOE curves. The Normal-parametric and KDE curves are computed above but
# intentionally left off the plot to reduce clutter.
ax1.plot(
    x_grid,
    woe_fastwoe_mono,
    color=colors[5],
    label="FastWoe (tree) - Monotonic",
    linewidth=2,
)
ax1.step(
    bin_centers, woe_hist, color=colors[1], label="Histogram", where="mid", linewidth=2
)
ax1.plot(x_grid, woe_gam, color=colors[2], label="GAM", linewidth=2)
ax1.plot(x_grid, woe_fastwoe, color=colors[3], label="FastWoe (tree)", linewidth=2)
ax1.plot(
    x_grid,
    woe_fastwoe_faiss,
    color=colors[4],
    label="FastWoe (faiss k-means)",
    linewidth=2,
)

# Reference line: WOE = 0 means equal good/bad density.
ax1.axhline(0, color="black", linestyle="--", linewidth=1)

# Secondary axis: faint score-count histogram for context (axis itself hidden).
ax2 = ax1.twinx()
counts, _ = np.histogram(x, bins=bins)
ax2.bar(
    bin_centers,
    counts,
    width=(bins[1] - bins[0]) * 0.8,
    alpha=0.2,
    color="gray",
    align="center",
)
ax2.get_yaxis().set_visible(False)

# Title and labels
plt.suptitle(
    "Weight of Evidence (WOE) by Method — Application Score",
    fontsize=22,
    y=0.9,
)
ax1.set_xlabel("Application Score", fontsize=14)
ax1.set_ylabel("WOE(x)", fontsize=14)
ax1.tick_params(axis="both", which="major", labelsize=14)

# Hide the top/right spines on both axes for a cleaner look.
ax1.spines["top"].set_visible(False)
ax1.spines["right"].set_visible(False)
ax2.spines["top"].set_visible(False)
ax2.spines["right"].set_visible(False)

# Legend above the plot, 3 columns, no frame.
ax1.legend(
    loc="lower center",
    bbox_to_anchor=(0.5, 1.04),  # centered horizontally, just above the axes
    ncol=3,
    fontsize=11,
    frameon=False,
)

ax1.grid(False)
fig.tight_layout(rect=(0, 0, 1, 0.91))

# Save to a temporary directory, then display.
# NOTE(review): the temp directory (and the PNG) is removed when the context
# manager exits, so the printed path is stale once the script finishes —
# confirm this transient save is intended.
with TemporaryDirectory() as temp_dir:
    image_path = os.path.join(temp_dir, "woe_density.png")
    plt.savefig(image_path)
    plt.show()
    print(f"WOE density plot saved to: {image_path}")
0 commit comments