diff --git a/README.md b/README.md
index 98ff6db..a8f4d74 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,38 @@ model = PyBBT(
 )
 ```
 
+#### Evaluating BBT when reporting error metrics
+
+By default, BBT assumes that the goal of the evaluation is to maximize the metric (e.g. when reporting F1 score or AUROC). When the metric reported in the dataframe should instead be minimized (e.g. RMSE), set the `maximize` parameter of `PyBBT` to `False`:
+
+```python
+model = PyBBT(
+    local_rope_value=0.01,
+    maximize=False,  # Set to False if the metric should be minimized
+).fit(
+    df,
+    dataset_col="dataset",
+)
+```
+
 ### Paired posterior fitting
 
-PyBBT model support two variants of input data for paired case, either a single dataframe with multiple rows per algorithm per dataset, or a pair of dataframes, one defining mean performance per algorithm, and the second with standard deviations.
+The PyBBT model supports two variants of input data for the paired case: either a single dataframe with multiple rows per algorithm per dataset, or a pair of dataframes, one defining the mean performance per algorithm and the other the standard deviations.
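+
+For example (a minimal sketch with illustrative dataframe names; the `data_sd` keyword for the standard-deviation frame is an assumption here and may differ in the actual API):
+
+```python
+# Variant 1: a single dataframe with multiple rows per algorithm per dataset
+model = PyBBT().fit(
+    df,
+    dataset_col="dataset",
+)
+
+# Variant 2: paired dataframes of means and standard deviations;
+# "data_sd" is an assumed keyword name, see the API reference
+model = PyBBT().fit(
+    df_means,
+    data_sd=df_stds,
+    dataset_col="dataset",
+)
+```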
diff --git a/bbttest/bbt/__init__.py b/bbttest/bbt/__init__.py
index 16f29c2..64cdcbd 100644
--- a/bbttest/bbt/__init__.py
+++ b/bbttest/bbt/__init__.py
@@ -1,6 +1,6 @@
 """bbt module: Bayesian Bradley-Terry model implementation."""
 
-from .const import HyperPrior, ReportedProperty, TieSolver
+from .params import HyperPrior, ReportedProperty, TieSolver
 from .py_bbt import PyBBT
 
 __all__ = [
diff --git a/bbttest/bbt/alg.py b/bbttest/bbt/alg.py
index a55653b..2db69bc 100644
--- a/bbttest/bbt/alg.py
+++ b/bbttest/bbt/alg.py
@@ -1,4 +1,5 @@
 import logging as log
+import warnings
 from collections.abc import Generator, Iterable
 
 import arviz as az
@@ -6,7 +7,8 @@
 import pandas as pd
 from tqdm.auto import tqdm
 
-from .const import TieSolver
+from .const import UNNAMED_COLUMNS_WARNING_TEMPLATE
+from .params import TieSolver
 
 ALG1_COL = 2
 ALG2_COL = 3
@@ -125,14 +127,27 @@ def _construct_win_table(
     dataset_col: str | int | None,
     local_rope_value: float | None,
     tie_solver: TieSolver,
+    maximize: bool,
 ) -> tuple[np.ndarray, list[str]]:
     # Extract algorithm names
-    algorithms_names = set(data.columns.tolist())
+    algorithms_names = data.columns.tolist()
     if isinstance(dataset_col, int):
         dataset_col = data.columns[dataset_col]
     if dataset_col is not None:
-        algorithms_names.discard(dataset_col)
-    algorithms_names = list(algorithms_names)
+        algorithms_names.remove(dataset_col)
+
+    data = data.copy()
+    if not maximize:
+        data.loc[:, algorithms_names] = -1 * data[algorithms_names]
+
+    if any("Unnamed" in col for col in algorithms_names):
+        warnings.warn(
+            UNNAMED_COLUMNS_WARNING_TEMPLATE.format(
+                algorithms_names=algorithms_names,
+                dataset_col=dataset_col,
+            ),
+            UserWarning,
+        )
 
     if dataset_col is None or data.shape[0] == data[dataset_col].nunique():
         table = _construct_no_paired(
diff --git a/bbttest/bbt/const.py b/bbttest/bbt/const.py
index f5d8044..92ad316 100644
--- a/bbttest/bbt/const.py
+++ b/bbttest/bbt/const.py
@@ -1,72 +1,4 @@
-from enum import Enum
-
-from pymc.distributions import Cauchy, LogNormal, Normal
-
-
-class HyperPrior(str, Enum):
-    """
-    Hyper Prior distributions for BBT MCMC sampling.
-    """
-
-    LOG_NORMAL = "logNormal"
-    LOG_NORMAL_SCALED = "logNormalScaled"
-    CAUCHY = "cauchy"
-    NORMAL = "normal"
-
-    def _get_pymc_dist(self, scale, name="sigma"):
-        match self:
-            case HyperPrior.LOG_NORMAL:
-                return LogNormal(name, mu=0, sigma=1)
-            case HyperPrior.LOG_NORMAL_SCALED:
-                return LogNormal(name, mu=0, sigma=scale)
-            case HyperPrior.CAUCHY:
-                return Cauchy(name, alpha=0, beta=scale)
-            case HyperPrior.NORMAL:
-                return Normal(name, mu=0, sigma=scale)
-            case _:
-                raise ValueError(f"Unsupported hyperprior: {self}")
-
-
-class ReportedProperty(str, Enum):
-    """
-    Enum containing properties that can be reported from BBT results.
-    """
-
-    LEFT_MODEL = "left_model"
-    RIGHT_MODEL = "right_model"
-    MEDIAN = "median"
-    MEAN = "mean"
-    HDI_LOW = "hdi_low"
-    HDI_HIGH = "hdi_high"
-    DELTA = "delta"
-    ABOVE_50 = "above_50"
-    IN_ROPE = "in_rope"
-    WEAK_INTERPRETATION = "weak_interpretation"
-    STRONG_INTERPRETATION = "strong_interpretation"
-
-
-class TieSolver(str, Enum):
-    """
-    Enum containing tie solving strategies.
-
-    ADD - Add 1 win to both players.
-    SPREAD - Add 1/2 win to both players.
-    FOGET - Ignore the tie.
-    DAVIDSON - Use Davidson's method to handle ties.
-    """
-
-    ADD = "add"
-    SPREAD = "spread"
-    FORGET = "forget"
-    DAVIDSON = "davidson"
-
-
-DEFAULT_PROPERTIES = (
-    ReportedProperty.MEAN,
-    ReportedProperty.DELTA,
-    ReportedProperty.ABOVE_50,
-    ReportedProperty.IN_ROPE,
-    ReportedProperty.WEAK_INTERPRETATION,
-)
-
-ALL_PROPERTIES = tuple(ReportedProperty)
+UNNAMED_COLUMNS_WARNING_TEMPLATE = """Some algorithm names are unnamed. This may lead to issues in the win table construction.
+Algorithm names extracted: {algorithms_names}
+Dataset column: {dataset_col}
+"""
diff --git a/bbttest/bbt/model.py b/bbttest/bbt/model.py
index 105217f..bde5e2c 100644
--- a/bbttest/bbt/model.py
+++ b/bbttest/bbt/model.py
@@ -3,7 +3,7 @@
 import pymc as pm
 import pytensor.tensor as pt
 
-from .const import HyperPrior
+from .params import HyperPrior
 
 
 def _build_bbt_model(
diff --git a/bbttest/bbt/params.py b/bbttest/bbt/params.py
new file mode 100644
index 0000000..f5d8044
--- /dev/null
+++ b/bbttest/bbt/params.py
@@ -0,0 +1,72 @@
+from enum import Enum
+
+from pymc.distributions import Cauchy, LogNormal, Normal
+
+
+class HyperPrior(str, Enum):
+    """
+    Hyper Prior distributions for BBT MCMC sampling.
+    """
+
+    LOG_NORMAL = "logNormal"
+    LOG_NORMAL_SCALED = "logNormalScaled"
+    CAUCHY = "cauchy"
+    NORMAL = "normal"
+
+    def _get_pymc_dist(self, scale, name="sigma"):
+        match self:
+            case HyperPrior.LOG_NORMAL:
+                return LogNormal(name, mu=0, sigma=1)
+            case HyperPrior.LOG_NORMAL_SCALED:
+                return LogNormal(name, mu=0, sigma=scale)
+            case HyperPrior.CAUCHY:
+                return Cauchy(name, alpha=0, beta=scale)
+            case HyperPrior.NORMAL:
+                return Normal(name, mu=0, sigma=scale)
+            case _:
+                raise ValueError(f"Unsupported hyperprior: {self}")
+
+
+class ReportedProperty(str, Enum):
+    """
+    Enum containing properties that can be reported from BBT results.
+    """
+
+    LEFT_MODEL = "left_model"
+    RIGHT_MODEL = "right_model"
+    MEDIAN = "median"
+    MEAN = "mean"
+    HDI_LOW = "hdi_low"
+    HDI_HIGH = "hdi_high"
+    DELTA = "delta"
+    ABOVE_50 = "above_50"
+    IN_ROPE = "in_rope"
+    WEAK_INTERPRETATION = "weak_interpretation"
+    STRONG_INTERPRETATION = "strong_interpretation"
+
+
+class TieSolver(str, Enum):
+    """
+    Enum containing tie solving strategies.
+
+    ADD - Add 1 win to both players.
+    SPREAD - Add 1/2 win to both players.
+    FORGET - Ignore the tie.
+    DAVIDSON - Use Davidson's method to handle ties.
+    """
+
+    ADD = "add"
+    SPREAD = "spread"
+    FORGET = "forget"
+    DAVIDSON = "davidson"
+
+
+DEFAULT_PROPERTIES = (
+    ReportedProperty.MEAN,
+    ReportedProperty.DELTA,
+    ReportedProperty.ABOVE_50,
+    ReportedProperty.IN_ROPE,
+    ReportedProperty.WEAK_INTERPRETATION,
+)
+
+ALL_PROPERTIES = tuple(ReportedProperty)
diff --git a/bbttest/bbt/py_bbt.py b/bbttest/bbt/py_bbt.py
index 44f677f..747bb85 100644
--- a/bbttest/bbt/py_bbt.py
+++ b/bbttest/bbt/py_bbt.py
@@ -5,8 +5,8 @@
 import pandas as pd
 
 from .alg import _construct_win_table, _get_pwin, _hdi
-from .const import DEFAULT_PROPERTIES, HyperPrior, ReportedProperty, TieSolver
 from .model import _mcmcbbt_pymc
+from .params import DEFAULT_PROPERTIES, HyperPrior, ReportedProperty, TieSolver
 
 
 class PyBBT:
@@ -42,13 +42,17 @@ class PyBBT:
     scale: float, default 1.0
         The scale parameter for the hyper prior distribution. Ignored if the
         HyperPrior is LOG_NORMAL.
+    maximize: bool, default True
+        Whether higher scores indicate better performance (e.g. accuracy/F1). If using a metric where the goal is
+        to minimize the score (e.g. RMSE), set this to False.
+
     Attributes
     ----------
     fitted: bool
         Whether the model has been fitted.
 
-    Examlples
-    ---------
+    Examples
+    --------
     >>> import pandas as pd
     >>> from bbttest import PyBBT, TieSolver
     >>> data = pd.DataFrame({
@@ -78,6 +82,7 @@ def __init__(
         local_rope_value: float | None = None,
         tie_solver: TieSolver | str = TieSolver.SPREAD,
         hyper_prior: HyperPrior | str = HyperPrior.LOG_NORMAL,
+        maximize: bool = True,
         scale: float = 1.0,
     ):
         self._local_rope_value = local_rope_value
@@ -88,6 +93,7 @@
         self._hyper_prior = (
             HyperPrior(hyper_prior) if isinstance(hyper_prior, str) else hyper_prior
         )
+        self._maximize = maximize
         self._scale = scale
         self._fitted = False
 
@@ -127,6 +133,7 @@ def fit(
             dataset_col=dataset_col,
             local_rope_value=self._local_rope_value,
             tie_solver=self._tie_solver,
+            maximize=self._maximize,
         )
 
         self._fit_posterior = _mcmcbbt_pymc(
diff --git a/pyproject.toml b/pyproject.toml
index b8580ec..d25f523 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,11 @@ test = [
     "mypy",
 ]
 
+[tool.pytest.ini_options]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
+
 [tool.mypy]
 python_version = "3.11"
 check_untyped_defs = true  # check all functions, this fixes some tests
diff --git a/tests/bbt/test_alg.py b/tests/bbt/test_alg.py
new file mode 100644
index 0000000..2532a11
--- /dev/null
+++ b/tests/bbt/test_alg.py
@@ -0,0 +1,114 @@
+from io import StringIO
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from bbttest.bbt.alg import (
+    _construct_win_table,
+)
+from bbttest.bbt.params import TieSolver
+
+SCORES_1 = pd.DataFrame(
+    {
+        "alg1": [0.705, 0.7, 0.9],
+        "alg2": [0.696, 0.7, 0.8],
+        "alg3": [0.7, 0.75, 0.9],
+    }
+)
+
+
+class TestConstructTable:
+    """Test whether the win/tie/loss table is constructed correctly."""
+
+    @pytest.mark.parametrize(
+        "data, local_rope_value, maximize, expected_table",
+        [
+            (
+                SCORES_1,
+                None,
+                True,
+                np.array(
+                    [
+                        [0, 1, 2, 0, 1],  # alg1 vs alg2
+                        [0, 2, 1, 1, 1],  # alg1 vs alg3
+                        [1, 2, 0, 3, 0],  # alg2 vs alg3
+                    ]
+                ),
+            ),
+            (
+                SCORES_1,
+                0.01,
+                True,
+                np.array(
+                    [
+                        [0, 1, 1, 0, 2],  # alg1 vs alg2
+                        [0, 2, 0, 1, 2],  # alg1 vs alg3
+                        [1, 2, 0, 2, 1],  # alg2 vs alg3
+                    ]
+                ),
+            ),
+            (
+                SCORES_1,
+                0.01,
+                False,
+                np.array(
+                    [
+                        [0, 1, 0, 1, 2],  # alg1 vs alg2
+                        [0, 2, 1, 0, 2],  # alg1 vs alg3
+                        [1, 2, 2, 0, 1],  # alg2 vs alg3
+                    ]
+                ),
+            ),
+        ],
+    )
+    def test_construct_win_table(
+        self,
+        data: pd.DataFrame,
+        local_rope_value: float | None,
+        maximize: bool,
+        expected_table: np.ndarray,
+    ):
+        """Test the construction of the win/tie/loss table."""
+        # When
+        result_table, alg_names = _construct_win_table(
+            data=data,
+            data_sd=None,
+            dataset_col=None,
+            local_rope_value=local_rope_value,
+            tie_solver=TieSolver.DAVIDSON,  # Keeps the ties in the table
+            maximize=maximize,
+        )
+
+        # Then
+        np.testing.assert_array_almost_equal(result_table, expected_table)
+
+
+class TestUserWarnings:
+    """Test whether the correct warnings are raised."""
+
+    def test_unnamed_columns(self):
+        """Test whether a warning is raised when the dataset column is unnamed."""
+        # Given - this simulates incorrectly reading a CSV file that was saved with an index
+
+        CSV_CONTENT = """,alg1,alg2,alg3
+0,0.705,0.696,0.7
+1,0.7,0.7,0.75
+2,0.9,0.8,0.9
+"""
+
+        data = pd.read_csv(StringIO(CSV_CONTENT))
+        # When / Then
+
+        with pytest.warns(
+            UserWarning,
+            match="Some algorithm names are unnamed. This may lead to issues in the win table construction.",
+        ):
+            _construct_win_table(
+                data=data,
+                data_sd=None,
+                dataset_col=None,  # No dataset column; the stray index column is unnamed
+                local_rope_value=None,
+                tie_solver=TieSolver.DAVIDSON,
+                maximize=True,
+            )
diff --git a/tests/bbt/test_py_bbt.py b/tests/bbt/test_py_bbt.py
index b83fc69..b00b75f 100644
--- a/tests/bbt/test_py_bbt.py
+++ b/tests/bbt/test_py_bbt.py
@@ -11,7 +11,7 @@
 import pytest
 
 from bbttest import HyperPrior, PyBBT, ReportedProperty, TieSolver
-from bbttest.bbt.const import ALL_PROPERTIES
+from bbttest.bbt.params import ALL_PROPERTIES
 
 
 @pytest.fixture(scope="module")
@@ -136,6 +136,7 @@ def test_init_defaults(self):
         assert model._tie_solver == TieSolver.SPREAD
         assert model._hyper_prior == HyperPrior.LOG_NORMAL
         assert model._scale == 1.0
+        assert model._maximize
         assert not model.fitted
 
 
diff --git a/tests/regression/test_benchmarking_mol.py b/tests/regression/test_benchmarking_mol.py
index 20ebef0..9995026 100644
--- a/tests/regression/test_benchmarking_mol.py
+++ b/tests/regression/test_benchmarking_mol.py
@@ -35,7 +35,7 @@
 import pytest
 
 from bbttest import PyBBT, TieSolver
-from bbttest.bbt.const import DEFAULT_PROPERTIES, ReportedProperty
+from bbttest.bbt.params import DEFAULT_PROPERTIES, ReportedProperty
 
 
 @pytest.fixture(scope="module")