Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ jobs:
key: ${{ matrix.os }}-venv-${{ matrix.python-version }}-${{ hashFiles('**/uv.lock') }}

- name: Install the project dependencies
run: uv sync --group test --python "$(python -c 'import sys; print(sys.executable)')"
shell: bash
run: uv sync --group test

- name: Check pre-commit
run: uv run pre-commit run --all-files
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@ wheels/

# Dev cache
.ruff_cache/
.pytest_cache/

# Virtual environments
.venv

# IDE files
.idea/
.vscode/

14 changes: 11 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
.PHONY: setup help
.PHONY: setup test test-coverage help
.DEFAULT_GOAL := help

setup: ## Install development dependencies
@# check if uv is installed
@uv --version >/dev/null 2>&1 || (echo "uv is not installed, please install it" && exit 1)

@# install dependencies
uv sync --group dev
uv sync --group dev --group test
uv run pre-commit install

test:
test: ## Run tests without regression
uv run ruff check
uv run pytest tests -m "not slow"

full-test: ## Run all tests, including slow ones
uv run ruff check
uv run pytest tests

test-coverage: ## Run tests and calculate test coverage
uv run pytest --cov=bbttest tests

help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
21 changes: 13 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,28 +87,33 @@ Once you obtained a fitted PyBBT model, you can generate statistic dataframe con

```python

stats_df = model.get_stats_dataframe(
stats_df = model.posterior_table(
rope_value=(0.45, 0.55), # Defines ROPE of hypothesis for interpretations
control_model="alg1", # If provided, only hypotheses comparing to control_model will be included
selected_models=["alg2", "alg3"], # If provided, only hypotheses comparing selected_models will be included
selected_models=["alg2"], # If provided, only hypotheses comparing selected_models will be included
)

print(stats_df)

pair mean delta above_50 in_rope weak_interpretation
0 alg1 > alg2 0.63 0.53 0.75 0.19 Unknown
```

Additionally, you can generate hypothesis interpretations with respect to a control model across multiple ROPE values:

```python
from bbttest import multiple_ropes_control_table

stats_df = multiple_ropes_control_table(
model,
ropes=[(0.4, 0.6), (0.45, 0.55), (0.48, 0.52)],
stats_df = model.rope_comparison_control_table(
rope_values=[(0.4, 0.6), (0.45, 0.55), (0.48, 0.52)],
control_model="alg1",
interpretation_type="weak",
interpretation="weak",
)

print(stats_df)

rope_value better_models equivalent_models worse_models unknown_models
0 (0.4, 0.6) alg3, alg1
1 (0.45, 0.55) alg3, alg1
2 (0.48, 0.52) alg3, alg1
```

## License
Expand Down
5 changes: 1 addition & 4 deletions bbttest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
"""bbt-test: Bayesian Bradley-Terry model for algorithm comparison."""

from .const import HyperPrior, ReportedProperty, TieSolver
from .py_bbt import PyBBT
from .utils import multiple_ropes_control_table
from .bbt import HyperPrior, PyBBT, ReportedProperty, TieSolver

__all__ = [
"HyperPrior",
"PyBBT",
"ReportedProperty",
"TieSolver",
"multiple_ropes_control_table",
]
11 changes: 11 additions & 0 deletions bbttest/bbt/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""bbt module: Bayesian Bradley-Terry model implementation."""

from .const import HyperPrior, ReportedProperty, TieSolver
from .py_bbt import PyBBT

__all__ = [
"HyperPrior",
"PyBBT",
"ReportedProperty",
"TieSolver",
]
9 changes: 6 additions & 3 deletions bbttest/alg.py → bbttest/bbt/alg.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging as log
from collections.abc import Generator
from collections.abc import Generator, Iterable

import arviz as az
import numpy as np
Expand Down Expand Up @@ -158,9 +158,9 @@ def _construct_win_table(

def _get_pwin(
bbt_result: az.InferenceData,
alg_names: list[str] | None = None,
alg_names: Iterable[str] | None = None,
control: str | None = None,
selected: list[str] | None = None,
selected: Iterable[str] | None = None,
):
def _pairwise_prob(strength_i, strength_j):
return strength_i / (strength_i + strength_j)
Expand All @@ -183,6 +183,9 @@ def _pairwise_prob(strength_i, strength_j):
# Filter by selected algorithms if specified
if selected is not None:
selected_set = set(selected)
if control not in selected_set and control is not None:
selected_set.add(control)

indices = [i for i, name in enumerate(ordered_names) if name in selected_set]
ordered_names = ordered_names[indices]
strengths = strengths[:, indices]
Expand Down
25 changes: 15 additions & 10 deletions bbttest/const.py → bbttest/bbt/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,26 @@ class HyperPrior(str, Enum):
NORMAL = "normal"

def _get_pymc_dist(self, scale, name="sigma"):
if self == HyperPrior.LOG_NORMAL:
return LogNormal(name, mu=0, sigma=1)
elif self == HyperPrior.LOG_NORMAL_SCALED:
return LogNormal(name, mu=0, sigma=scale)
elif self == HyperPrior.CAUCHY:
return Cauchy(name, alpha=0, beta=scale)
elif self == HyperPrior.NORMAL:
return Normal(name, mu=0, sigma=scale)
else:
raise ValueError(f"Unsupported hyperprior: {self}")
match self:
case HyperPrior.LOG_NORMAL:
return LogNormal(name, mu=0, sigma=1)
case HyperPrior.LOG_NORMAL_SCALED:
return LogNormal(name, mu=0, sigma=scale)
case HyperPrior.CAUCHY:
return Cauchy(name, alpha=0, beta=scale)
case HyperPrior.NORMAL:
return Normal(name, mu=0, sigma=scale)
case _:
raise ValueError(f"Unsupported hyperprior: {self}")


class ReportedProperty(str, Enum):
"""
Enum containing properties that can be reported from BBT results.
"""

LEFT_MODEL = "left_model"
RIGHT_MODEL = "right_model"
MEDIAN = "median"
MEAN = "mean"
HDI_LOW = "hdi_low"
Expand Down Expand Up @@ -65,3 +68,5 @@ class TieSolver(str, Enum):
ReportedProperty.IN_ROPE,
ReportedProperty.WEAK_INTERPRETATION,
)

ALL_PROPERTIES = tuple(ReportedProperty)
2 changes: 1 addition & 1 deletion bbttest/model.py → bbttest/bbt/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def _mcmcbbt_pymc(
sample_kwargs = {
k: v
for k, v in kwargs.items()
if k in ["draws", "tune", "chains", "cores", "target_accept"]
if k in ["draws", "tune", "chains", "cores", "target_accept", "random_seed"]
}
fit = pm.sample(**sample_kwargs)

Expand Down
115 changes: 105 additions & 10 deletions bbttest/py_bbt.py → bbttest/bbt/py_bbt.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections.abc import Sequence
from collections.abc import Iterable, Sequence
from typing import Literal

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -75,14 +76,18 @@ class PyBBT:
def __init__(
self,
local_rope_value: float | None = None,
tie_solver: TieSolver = TieSolver.SPREAD,
hyper_prior: HyperPrior = HyperPrior.LOG_NORMAL,
tie_solver: TieSolver | str = TieSolver.SPREAD,
hyper_prior: HyperPrior | str = HyperPrior.LOG_NORMAL,
scale: float = 1.0,
):
self._local_rope_value = local_rope_value
self._tie_solver = tie_solver
self._use_davidson = tie_solver == TieSolver.DAVIDSON
self._hyper_prior = hyper_prior
self._tie_solver = (
TieSolver(tie_solver) if isinstance(tie_solver, str) else tie_solver
)
self._use_davidson = self._tie_solver == TieSolver.DAVIDSON
self._hyper_prior = (
HyperPrior(hyper_prior) if isinstance(hyper_prior, str) else hyper_prior
)
self._scale = scale
self._fitted = False

Expand Down Expand Up @@ -140,8 +145,8 @@ def posterior_table(
self,
rope_value: tuple[float, float] = (0.45, 0.55),
control_model: str | None = None,
selected_models: list[str] | None = None,
columns: Sequence[ReportedProperty | str] = DEFAULT_PROPERTIES,
selected_models: Iterable[str] | None = None,
columns: Iterable[ReportedProperty | str] = DEFAULT_PROPERTIES,
hdi_proba: float = 0.89,
round_ndigits: int | None = 2,
) -> pd.DataFrame:
Expand All @@ -165,7 +170,7 @@ def posterior_table(
bbt_result=self._fit_posterior,
alg_names=self._algorithms,
control=control_model,
selected=selected_models,
selected=list(selected_models) if selected_models is not None else None,
)
out_table = pd.DataFrame({"pair": names})
out_table["left_model"] = out_table["pair"].str.split(">").str[0].str.strip()
Expand Down Expand Up @@ -220,10 +225,100 @@ def posterior_table(
out_table["delta"] = out_table["hdi_high"] - out_table["hdi_low"]

if round_ndigits is not None:
return out_table.round(round_ndigits)
return out_table.round(round_ndigits)[["pair", *columns]]
for col in columns:
if col not in out_table.columns:
raise ValueError(
f"Column {col} is not available in the posterior table."
)
return out_table[["pair", *columns]]

def rope_comparison_control_table(
    self,
    rope_values: Sequence[tuple[float, float]],
    control_model: str,
    selected_models: Sequence[str] | None = None,
    interpretation: Literal["weak", "strong"] = "weak",
    return_as_array: bool = False,
    join_char: str = ", ",
) -> pd.DataFrame:
    """
    Construct a table comparing models against a predefined control model across multiple ROPEs.

    The output table contains one row per ROPE and 5 columns
    (rope value, better models, equivalent models, worse models, unknown models).

    Args:
        rope_values: ROPE tuples ``(lower, upper)`` to evaluate; one output row each.
        control_model: Name of the control model all other models are compared against.
        selected_models: If provided, only these models are compared to the control model.
        interpretation: Type of interpretation to use ("weak" or "strong"), see [1]_.
        return_as_array: Whether the individual cells should contain model names as a list
            or joined into a single string.
        join_char: Character(s) used to join multiple model names in a single cell
            (ignored when ``return_as_array`` is True).

    Returns
    -------
    pd.DataFrame: Table comparing models against the control model across multiple ROPEs.

    References
    ----------
    .. [1] `Jacques Wainer
        "A Bayesian Bradley-Terry model to compare multiple ML algorithms on multiple data sets"
        Journal of Machine Learning Research 24 (2023): 1-34
        <http://jmlr.org/papers/v24/22-0907.html>`_
    """
    self._check_if_fitted()
    # The interpretation column is the same for every row and every ROPE,
    # so resolve it once instead of once per posterior-table row.
    interpretation_col = (
        "weak_interpretation" if interpretation == "weak" else "strong_interpretation"
    )
    records = []
    for rope in rope_values:
        posterior_df = self.posterior_table(
            rope_value=rope,
            control_model=control_model,
            selected_models=selected_models,
            columns=[
                ReportedProperty.LEFT_MODEL,
                ReportedProperty.WEAK_INTERPRETATION,
                ReportedProperty.STRONG_INTERPRETATION,
            ],
        )
        # Bucket each compared model by how its hypothesis against the
        # control model is interpreted under the current ROPE.
        buckets: dict[str, list[str]] = {
            "better_models": [],
            "equivalent_models": [],
            "worse_models": [],
            "unknown_models": [],
        }
        for _, row in posterior_df.iterrows():
            model_name = row["left_model"]
            verdict = row[interpretation_col]
            if verdict == f"{model_name} better":
                buckets["better_models"].append(model_name)
            elif verdict == "Equivalent":
                buckets["equivalent_models"].append(model_name)
            elif verdict == "Unknown":
                buckets["unknown_models"].append(model_name)
            else:
                # Any remaining interpretation means the control model won.
                buckets["worse_models"].append(model_name)
        if return_as_array:
            cells: dict[str, list[str] | str] = dict(buckets)
        else:
            cells = {key: join_char.join(names) for key, names in buckets.items()}
        records.append({"rope_value": rope, **cells})
    return pd.DataFrame.from_records(records)
Loading