Release v0.2.7: CatBoost standardization and PyPI workflow

xRiskLab · xRiskLab · commit 160883a53bd3 · 2025-12-04T18:16:12.000+01:00
- Standardize CatBoost to use XAddEvidence (matching XGBoost/LightGBM)
- Fix README depth=1 documentation (recommended, not required)
- Add PyPI publish workflow with trusted publishing
- Update version from 0.2.7rc2 to 0.2.7 (stable)
- All 106 tests passing
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,103 @@
+name: Build and Publish
+
+on:  # two triggers: a published GitHub release, or a manual run
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      publish_to_pypi:  # manual runs only build unless this is set to true
+        description: 'Publish to PyPI'
+        required: true
+        type: boolean
+        default: false
+
+jobs:
+  build:  # builds the distributions once; later jobs reuse the artifact
+    name: Build distribution packages
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.11'
+
+    - name: Install build dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build hatch-autorun twine
+
+    - name: Build package  # the following steps read the result from dist/
+      run: python -m build
+
+    - name: Check distribution files  # validate the built files before upload
+      run: twine check dist/*
+
+    - name: List distribution files  # record what was built in the job log
+      run: ls -lh dist/
+
+    - name: Upload artifacts  # downstream jobs download it by this name
+      uses: actions/upload-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+        retention-days: 30
+
+  publish-pypi:
+    name: Publish to PyPI  # opted-in manual runs and non-prerelease releases
+    needs: build  # consumes the artifact uploaded by the build job
+    runs-on: ubuntu-latest
+    if: |
+      (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true') ||
+      (github.event_name == 'release' && !github.event.release.prerelease)
+
+    environment:
+      name: pypi
+      url: https://pypi.org/p/xbooster
+
+    permissions:
+      id-token: write  # OIDC token used for PyPI trusted publishing
+
+    steps:
+    - name: Download artifacts
+      uses: actions/download-artifact@v4
+      with:
+        name: python-package-distributions  # must match the build job's upload
+        path: dist/
+
+    - name: Publish to PyPI  # no API token passed: relies on trusted publishing
+      uses: pypa/gh-action-pypi-publish@release/v1
+
+  create-github-release-notes:
+    name: Create GitHub Release Notes
+    needs: build
+    runs-on: ubuntu-latest
+    # Assets can only be attached when a release triggered this run.
+    if: github.event_name == 'release'
+
+    permissions:
+      # Lets the job token attach files to the release.
+      contents: write
+
+    steps:
+    - name: Download artifacts
+      uses: actions/download-artifact@v4
+      with:
+        name: python-package-distributions
+        path: dist/
+
+    # The GitHub CLI (gh) is preinstalled on GitHub-hosted runners, so no
+    # install step is needed. The former apt-based install ran without sudo
+    # and could not write to /usr/share/keyrings as the non-root runner user.
+    - name: Upload to GitHub Release
+      env:
+        GITHUB_TOKEN: ${{ github.token }}
+        # Pass context values through the environment rather than expanding
+        # them inside the script, so they are never parsed as shell code.
+        RELEASE_TAG: ${{ github.ref_name }}
+        REPOSITORY: ${{ github.repository }}
+      run: |
+        gh release upload "$RELEASE_TAG" dist/* --repo "$REPOSITORY"
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,30 @@
 # Changelog
 
+## [0.2.7] - 2025-12-04
+
+### Changed
+- **CatBoost Naming Standardization**: Replaced `LeafValue` with `XAddEvidence` throughout CatBoost implementation
+  - Standardized naming to match XGBoost and LightGBM implementations
+  - Updated all CatBoost-related files: `catboost_scorecard.py`, `catboost_wrapper.py`, `cb_constructor.py`
+  - Updated all tests and documentation
+
+### Fixed
+- **README Documentation**: Corrected CatBoost depth requirement statement
+  - Changed from "Only supports depth=1" to "depth=1 is recommended for better interpretability"
+  - Code actually supports any tree depth (as long as the trees are complete binary trees)
+  - Updated code examples to use `XAddEvidence` instead of `LeafValue`
+
+### Added
+- **PyPI Publish Workflow**: Added automated PyPI publishing workflow (`.github/workflows/publish.yml`)
+  - Supports both release events and manual workflow dispatch
+  - Uses trusted publishing (OpenID Connect) for secure PyPI uploads
+  - Automatically uploads distribution files to GitHub releases
+
+### Technical Details
+- All 106 tests passing
+- Version updated from 0.2.7rc2 to 0.2.7 (stable release)
+- LightGBM support is now stable (previously release candidate)
+
 ## [0.2.7rc2] - 2025-11-23 (Release Candidate)
 
 ### Fixed
diff --git a/README.md b/README.md
@@ -218,7 +218,7 @@ The `DataPreprocessor` provides:
 3. Generation of interaction constraints for XGBoost
 4. Consistent feature naming for scorecard generation
 
-### LightGBM Support 💡 (Release Candidate)
+### LightGBM Usage
 
 xbooster provides support for LightGBM models with scorecard functionality. Here's how to use it:
 
@@ -296,12 +296,11 @@ print(f"Model Gini: {model_gini:.4f}")
 - **Flexible**: `use_base_score` parameter for optional base score normalization
 
 **Important Notes:**
-- **Release Candidate**: This feature is in testing phase - feedback welcome!
 - LightGBM's sklearn API handles base_score differently than XGBoost
 - The `use_base_score=True` parameter (default) ensures proper normalization
 - Only `XAddEvidence` score type is supported (WOE not applicable)
 
-### CatBoost Support 🐱 (Beta)
+### CatBoost Usage
 
 xbooster provides experimental support for CatBoost models with reduced functionality compared to XGBoost. Here's how to use it:
 
@@ -340,19 +339,19 @@ model = CatBoostClassifier(
 model.fit(pool)
 
 # Create and fit the scorecard constructor
-constructor = CatBoostScorecardConstructor(model, pool)  # use_woe=False is the default, using raw LeafValue
+constructor = CatBoostScorecardConstructor(model, pool)  # use_woe=False is the default, using raw XAddEvidence
 
-# Alternatively, to use WOE values instead of raw leaf values:
+# Alternatively, to use WOE values instead of raw XAddEvidence:
 # constructor = CatBoostScorecardConstructor(model, pool, use_woe=True)
 
 # Construct the scorecard
 scorecard = constructor.construct_scorecard()
 print("\nScorecard:")
 print(scorecard.head(3))
 
-# Print raw leaf values
-print("\nRaw Leaf Values:")
-print(scorecard[["Tree", "LeafIndex", "LeafValue", "WOE"]].head(10))
+# Print raw XAddEvidence values
+print("\nRaw XAddEvidence Values:")
+print(scorecard[["Tree", "LeafIndex", "XAddEvidence", "WOE"]].head(10))
 
 # Make predictions using different methods - Do this BEFORE creating points
 # Original CatBoost predictions
@@ -410,7 +409,7 @@ visualizer.plot_tree(tree_idx=0, title="CatBoost Tree Visualization")
 
 The CatBoost implementation has some limitations compared to the XGBoost version:
 
-1. Only supports depth=1 trees for interpretability
+1. **Depth recommendation**: While the code supports any tree depth (as long as the trees are complete binary trees), `depth=1` is recommended for better interpretability. Deeper trees work but may be harder to interpret.
 2. Limited support for categorical features
 3. No SQL query generation
 4. Reduced visualization options
diff --git a/tests/test_catboost_scorecard.py b/tests/test_catboost_scorecard.py
@@ -60,10 +60,9 @@ def test_trees_to_scorecard(trained_model: CatBoostClassifier, test_pool: Pool):
         "NonEvents",
         "Events",
         "EventRate",
-        "LeafValue",
+        "XAddEvidence",
         "WOE",
         "IV",
-        "xAddEvidence",
         "CountPct",
         "DetailedSplit",
     }
@@ -77,10 +76,9 @@ def test_trees_to_scorecard(trained_model: CatBoostClassifier, test_pool: Pool):
     assert scorecard["NonEvents"].dtype == np.float64
     assert scorecard["Events"].dtype == np.float64
     assert scorecard["EventRate"].dtype == np.float64
-    assert scorecard["LeafValue"].dtype == np.float64
+    assert scorecard["XAddEvidence"].dtype == np.float64
     assert scorecard["WOE"].dtype == np.float64
     assert scorecard["IV"].dtype == np.float64
-    assert scorecard["xAddEvidence"].dtype == np.float64
     assert scorecard["CountPct"].dtype == np.float64
 
     # Check for valid values
diff --git a/tests/test_catboost_wrapper.py b/tests/test_catboost_wrapper.py
@@ -165,19 +165,19 @@ def test_get_binned_feature_table(woe_mapper):
     assert isinstance(table, pd.DataFrame)
     assert not table.empty
 
-    required_columns = {"Feature", "Condition", "LeafValue", "Weight", "TreeCount"}
+    required_columns = {"Feature", "Condition", "XAddEvidence", "Weight", "TreeCount"}
     assert set(table.columns) >= required_columns
 
     assert table["Feature"].dtype == object
     assert table["Condition"].dtype == object
-    assert table["LeafValue"].dtype == np.float64
+    assert table["XAddEvidence"].dtype == np.float64
     assert table["Weight"].dtype == np.float64
     assert table["TreeCount"].dtype == np.int64
 
 
 def test_get_value_column(woe_mapper):
     """Test the get_value_column method."""
-    assert woe_mapper.get_value_column() == "LeafValue"
+    assert woe_mapper.get_value_column() == "XAddEvidence"
 
     woe_mapper.points_column = "Points"
     assert woe_mapper.get_value_column() == "Points"
diff --git a/tests/test_cb_constructor.py b/tests/test_cb_constructor.py
@@ -270,7 +270,7 @@ def test_construct_scorecard(scorecard_constructor):
         "IV",
         "CountPct",
         "DetailedSplit",
-        "LeafValue",
+        "XAddEvidence",
     }
     assert set(scorecard.columns).issuperset(required_columns)
 
@@ -288,7 +288,7 @@ def test_construct_scorecard(scorecard_constructor):
     assert scorecard["IV"].dtype == np.float64
     assert scorecard["CountPct"].dtype == np.float64
     assert scorecard["DetailedSplit"].dtype == object
-    assert scorecard["LeafValue"].dtype == np.float64
+    assert scorecard["XAddEvidence"].dtype == np.float64
 
     # Check for valid values
     assert scorecard["Count"].min() >= 0
@@ -453,7 +453,7 @@ def test_get_scorecard(trained_model, catboost_pool):
     assert not scorecard.empty
     assert "Tree" in scorecard.columns
     assert "LeafIndex" in scorecard.columns
-    assert "LeafValue" in scorecard.columns
+    assert "XAddEvidence" in scorecard.columns
     assert "DetailedSplit" in scorecard.columns
 
 
diff --git a/xbooster/__init__.py b/xbooster/__init__.py
@@ -5,7 +5,7 @@
 from gradient boosted tree models (XGBoost and CatBoost).
 """
 
-__version__ = "0.2.7rc2"
+__version__ = "0.2.7"
 __author__ = "xRiskLab"
 __email__ = "contact@xrisklab.ai"
 
diff --git a/xbooster/_utils.py b/xbooster/_utils.py
@@ -442,7 +442,7 @@ def build_node(path: str, level: int) -> dict:
                         f"count: {int(row['Count'])}\n"
                         f"rate: {row['EventRate']:.3f}\n"
                         f"woe: {row['WOE']:.3f}\n"
-                        f"val: {row['LeafValue']:.3f}"
+                        f"val: {row['XAddEvidence']:.3f}"
                     ),
                     "depth": level,
                     "is_leaf": True,
diff --git a/xbooster/catboost_scorecard.py b/xbooster/catboost_scorecard.py
@@ -225,7 +225,7 @@ def trees_to_scorecard(
                     {
                         "Tree": tree_idx,
                         "LeafIndex": leaf_idx,
-                        "LeafValue": clean_val,
+                        "XAddEvidence": clean_val,
                         "Conditions": conditions,
                         "Feature": feature,
                         "Sign": sign,
@@ -304,9 +304,6 @@ def trees_to_scorecard(
             .round(4)
         )
 
-        # Calculate xAddEvidence
-        scorecard_df["xAddEvidence"] = scorecard_df["LeafValue"]
-
         # Calculate CountPct
         total_count = scorecard_df["Count"].sum()
         scorecard_df["CountPct"] = (scorecard_df["Count"] / total_count * 100).fillna(0.0)
@@ -329,10 +326,9 @@ def trees_to_scorecard(
                 "NonEvents",
                 "Events",
                 "EventRate",
-                "LeafValue",
+                "XAddEvidence",
                 "WOE",
                 "IV",
-                "xAddEvidence",
                 "DetailedSplit",
             ]
         ]
diff --git a/xbooster/catboost_wrapper.py b/xbooster/catboost_wrapper.py
diff --git a/xbooster/cb_constructor.py b/xbooster/cb_constructor.py