From 3e2564bca25c8d86b70c67655e40673ce887a832 Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Mon, 23 Feb 2026 16:52:31 +0100 Subject: [PATCH 1/5] Added support for error based metrics --- README.md | 14 +++++ bbttest/bbt/alg.py | 21 ++++++-- bbttest/bbt/py_bbt.py | 7 +++ tests/bbt/test_alg.py | 116 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 tests/bbt/test_alg.py diff --git a/README.md b/README.md index 98ff6db..2d0ead5 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,20 @@ model = PyBBT( ) ``` +#### Evaluating BBT when reporting errors + +By default BBT assumes that the goal of the evaluation is to maximize the metric (e.g. when reporting F1 score or AUROC). In cases, when metrics reported in the dataframe should be minimzed (e.g. RMSE), you can set the parameter `maximize` in `PyBBT` to False: + +```python +model = PyBBT( + local_rope_value=0.01, + maximize=False, # Set to False if the metric should be minimized +).fit( + df, + dataset_col="dataset", +) +``` + ### Paired posterior fitting PyBBT model support two variants of input data for paired case, either a single dataframe with multiple rows per algorithm per dataset, or a pair of dataframes, one defining mean performance per algorithm, and the second with standard deviations. 
diff --git a/bbttest/bbt/alg.py b/bbttest/bbt/alg.py index a55653b..f8b5df4 100644 --- a/bbttest/bbt/alg.py +++ b/bbttest/bbt/alg.py @@ -1,4 +1,5 @@ import logging as log +import warnings from collections.abc import Generator, Iterable import arviz as az @@ -125,14 +126,28 @@ def _construct_win_table( dataset_col: str | int | None, local_rope_value: float | None, tie_solver: TieSolver, + maximize: bool, ) -> tuple[np.ndarray, list[str]]: # Extract algorithm names - algorithms_names = set(data.columns.tolist()) + algorithms_names = data.columns.tolist() if isinstance(dataset_col, int): dataset_col = data.columns[dataset_col] if dataset_col is not None: - algorithms_names.discard(dataset_col) - algorithms_names = list(algorithms_names) + algorithms_names.remove(dataset_col) + + data = data.copy() + if not maximize: + data.loc[:, algorithms_names] = -1 * data[algorithms_names] + + if any("Unnamed" in col for col in algorithms_names): + warnings.warn( + f""" + Some algorithm names are unnamed. This may lead to issues in the win table construction. + Algorithm names extracted: {algorithms_names} + Dataset column: {dataset_col} + """, + UserWarning, + ) if dataset_col is None or data.shape[0] == data[dataset_col].nunique(): table = _construct_no_paired( diff --git a/bbttest/bbt/py_bbt.py b/bbttest/bbt/py_bbt.py index 44f677f..550d34b 100644 --- a/bbttest/bbt/py_bbt.py +++ b/bbttest/bbt/py_bbt.py @@ -42,6 +42,10 @@ class PyBBT: scale: float, default 1.0 The scale parameter for the hyper prior distribution. Ignored if the HyperPrior is LOG_NORMAL. + maximize: bool, default True + Whether higher scores indicate better performance (e.g. accuracy/f1). If using a metric where the goal is to + minimize the score (e.g. RMSE) set this to False. 
+ Attributes ---------- fitted: bool @@ -78,6 +82,7 @@ def __init__( local_rope_value: float | None = None, tie_solver: TieSolver | str = TieSolver.SPREAD, hyper_prior: HyperPrior | str = HyperPrior.LOG_NORMAL, + maximize: bool = True, scale: float = 1.0, ): self._local_rope_value = local_rope_value @@ -88,6 +93,7 @@ def __init__( self._hyper_prior = ( HyperPrior(hyper_prior) if isinstance(hyper_prior, str) else hyper_prior ) + self._maximize = maximize self._scale = scale self._fitted = False @@ -127,6 +133,7 @@ def fit( dataset_col=dataset_col, local_rope_value=self._local_rope_value, tie_solver=self._tie_solver, + maximize=self._maximize, ) self._fit_posterior = _mcmcbbt_pymc( diff --git a/tests/bbt/test_alg.py b/tests/bbt/test_alg.py new file mode 100644 index 0000000..1a0c209 --- /dev/null +++ b/tests/bbt/test_alg.py @@ -0,0 +1,116 @@ +from io import StringIO + +import numpy as np +import pandas as pd +import pytest + +from bbttest.bbt.alg import ( + _construct_win_table, +) +from bbttest.bbt.const import TieSolver + +SCORES_1 = pd.DataFrame( + { + "alg1": [0.705, 0.7, 0.9], + "alg2": [0.696, 0.7, 0.8], + "alg3": [0.7, 0.75, 0.9], + } +) + + +class TestConstructTable: + """Test whether the win/tie/loss table is constructed correctly.""" + + @pytest.mark.parametrize( + "data, local_rope_value, maximize, expected_table", + [ + ( + SCORES_1, + None, + True, + np.array( + [ + [0, 1, 2, 0, 1], # alg1 vs alg2 + [0, 2, 1, 1, 1], # alg1 vs alg3 + [1, 2, 0, 3, 0], # alg2 vs alg3 + ] + ), + ), + ( + SCORES_1, + 0.01, + True, + np.array( + [ + [0, 1, 1, 0, 2], # alg1 vs alg2 + [0, 2, 0, 1, 2], # alg1 vs alg3 + [1, 2, 0, 2, 1], # alg2 vs alg3 + ] + ), + ), + ( + SCORES_1, + 0.01, + False, + np.array( + [ + [0, 1, 0, 1, 2], # alg1 vs alg2 + [0, 2, 1, 0, 2], # alg1 vs alg3 + [1, 2, 2, 0, 1], # alg2 vs alg3 + ] + ), + ), + ], + ) + def test_construct_win_table( + self, + data: pd.DataFrame, + local_rope_value: float, + maximize: bool, + expected_table: np.ndarray, + ): + 
"""Test the construction of the win/tie/loss table.""" + # When + result_table, alg_names = _construct_win_table( + data=data, + data_sd=None, + dataset_col=None, + local_rope_value=local_rope_value, + tie_solver=TieSolver.DAVIDSON, # Keeps the ties in the table + maximize=maximize, + ) + + print(alg_names) + + # Then + np.testing.assert_array_almost_equal(result_table, expected_table) + + +class TestUserWarnings: + """Test whether the correct warnings are raised.""" + + def test_unnamed_columns(self): + """Test whether a warning is raised when the dataset column is unnamed.""" + # Given - This simulated incorrect reading of a CSV file with an index + + CSV_CONTENT = """,alg1,alg2,alg3 + 0,0.705,0.696,0.7 + 1,0.7,0.7,0.75 + 2,0.9,0.8,0.9 + """ + + data = pd.read_csv(StringIO(CSV_CONTENT)) + # When / Then + + with pytest.warns( + UserWarning, + match="Some algorithm names are unnamed. This may lead to issues in the win table construction.", + ): + _construct_win_table( + data=data, + data_sd=None, + dataset_col=None, # This column is unnamed + local_rope_value=None, + tie_solver=TieSolver.DAVIDSON, + maximize=True, + ) From 504611fc7eb3b7286d650e2fca256ba34f194790 Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Mon, 23 Feb 2026 17:09:04 +0100 Subject: [PATCH 2/5] Reorganize package-only const values and configuration enums --- bbttest/bbt/__init__.py | 2 +- bbttest/bbt/alg.py | 12 +++---- bbttest/bbt/const.py | 76 +++------------------------------------- bbttest/bbt/model.py | 2 +- bbttest/bbt/params.py | 72 +++++++++++++++++++++++++++++++++++++ bbttest/bbt/py_bbt.py | 2 +- tests/bbt/test_alg.py | 2 -- tests/bbt/test_py_bbt.py | 1 + 8 files changed, 86 insertions(+), 83 deletions(-) create mode 100644 bbttest/bbt/params.py diff --git a/bbttest/bbt/__init__.py b/bbttest/bbt/__init__.py index 16f29c2..64cdcbd 100644 --- a/bbttest/bbt/__init__.py +++ b/bbttest/bbt/__init__.py @@ -1,6 +1,6 @@ """bbt module: Bayesian Bradley-Terry model implementation.""" -from 
.const import HyperPrior, ReportedProperty, TieSolver +from .params import HyperPrior, ReportedProperty, TieSolver from .py_bbt import PyBBT __all__ = [ diff --git a/bbttest/bbt/alg.py b/bbttest/bbt/alg.py index f8b5df4..2db69bc 100644 --- a/bbttest/bbt/alg.py +++ b/bbttest/bbt/alg.py @@ -7,7 +7,8 @@ import pandas as pd from tqdm.auto import tqdm -from .const import TieSolver +from .const import UNNAMED_COLUMNS_WARNING_TEMPLATE +from .params import TieSolver ALG1_COL = 2 ALG2_COL = 3 @@ -141,11 +142,10 @@ def _construct_win_table( if any("Unnamed" in col for col in algorithms_names): warnings.warn( - f""" - Some algorithm names are unnamed. This may lead to issues in the win table construction. - Algorithm names extracted: {algorithms_names} - Dataset column: {dataset_col} - """, + UNNAMED_COLUMNS_WARNING_TEMPLATE.format( + algorithms_names=algorithms_names, + dataset_col=dataset_col, + ), UserWarning, ) diff --git a/bbttest/bbt/const.py b/bbttest/bbt/const.py index f5d8044..92ad316 100644 --- a/bbttest/bbt/const.py +++ b/bbttest/bbt/const.py @@ -1,72 +1,4 @@ -from enum import Enum - -from pymc.distributions import Cauchy, LogNormal, Normal - - -class HyperPrior(str, Enum): - """ - Hyper Prior distributions for BBT MCMC sampling. - """ - - LOG_NORMAL = "logNormal" - LOG_NORMAL_SCALED = "logNormalScaled" - CAUCHY = "cauchy" - NORMAL = "normal" - - def _get_pymc_dist(self, scale, name="sigma"): - match self: - case HyperPrior.LOG_NORMAL: - return LogNormal(name, mu=0, sigma=1) - case HyperPrior.LOG_NORMAL_SCALED: - return LogNormal(name, mu=0, sigma=scale) - case HyperPrior.CAUCHY: - return Cauchy(name, alpha=0, beta=scale) - case HyperPrior.NORMAL: - return Normal(name, mu=0, sigma=scale) - case _: - raise ValueError(f"Unsupported hyperprior: {self}") - - -class ReportedProperty(str, Enum): - """ - Enum containing properties that can be reported from BBT results. 
- """ - - LEFT_MODEL = "left_model" - RIGHT_MODEL = "right_model" - MEDIAN = "median" - MEAN = "mean" - HDI_LOW = "hdi_low" - HDI_HIGH = "hdi_high" - DELTA = "delta" - ABOVE_50 = "above_50" - IN_ROPE = "in_rope" - WEAK_INTERPRETATION = "weak_interpretation" - STRONG_INTERPRETATION = "strong_interpretation" - - -class TieSolver(str, Enum): - """ - Enum containing tie solving strategies. - - ADD - Add 1 win to both players. - SPREAD - Add 1/2 win to both players. - FOGET - Ignore the tie. - DAVIDSON - Use Davidson's method to handle ties. - """ - - ADD = "add" - SPREAD = "spread" - FORGET = "forget" - DAVIDSON = "davidson" - - -DEFAULT_PROPERTIES = ( - ReportedProperty.MEAN, - ReportedProperty.DELTA, - ReportedProperty.ABOVE_50, - ReportedProperty.IN_ROPE, - ReportedProperty.WEAK_INTERPRETATION, -) - -ALL_PROPERTIES = tuple(ReportedProperty) +UNNAMED_COLUMNS_WARNING_TEMPLATE = """Some algorithm names are unnamed. This may lead to issues in the win table construction. +Algorithm names extracted: {algorithms_names} +Dataset column: {dataset_col} +""" diff --git a/bbttest/bbt/model.py b/bbttest/bbt/model.py index 105217f..bde5e2c 100644 --- a/bbttest/bbt/model.py +++ b/bbttest/bbt/model.py @@ -3,7 +3,7 @@ import pymc as pm import pytensor.tensor as pt -from .const import HyperPrior +from .params import HyperPrior def _build_bbt_model( diff --git a/bbttest/bbt/params.py b/bbttest/bbt/params.py new file mode 100644 index 0000000..f5d8044 --- /dev/null +++ b/bbttest/bbt/params.py @@ -0,0 +1,72 @@ +from enum import Enum + +from pymc.distributions import Cauchy, LogNormal, Normal + + +class HyperPrior(str, Enum): + """ + Hyper Prior distributions for BBT MCMC sampling. 
+ """ + + LOG_NORMAL = "logNormal" + LOG_NORMAL_SCALED = "logNormalScaled" + CAUCHY = "cauchy" + NORMAL = "normal" + + def _get_pymc_dist(self, scale, name="sigma"): + match self: + case HyperPrior.LOG_NORMAL: + return LogNormal(name, mu=0, sigma=1) + case HyperPrior.LOG_NORMAL_SCALED: + return LogNormal(name, mu=0, sigma=scale) + case HyperPrior.CAUCHY: + return Cauchy(name, alpha=0, beta=scale) + case HyperPrior.NORMAL: + return Normal(name, mu=0, sigma=scale) + case _: + raise ValueError(f"Unsupported hyperprior: {self}") + + +class ReportedProperty(str, Enum): + """ + Enum containing properties that can be reported from BBT results. + """ + + LEFT_MODEL = "left_model" + RIGHT_MODEL = "right_model" + MEDIAN = "median" + MEAN = "mean" + HDI_LOW = "hdi_low" + HDI_HIGH = "hdi_high" + DELTA = "delta" + ABOVE_50 = "above_50" + IN_ROPE = "in_rope" + WEAK_INTERPRETATION = "weak_interpretation" + STRONG_INTERPRETATION = "strong_interpretation" + + +class TieSolver(str, Enum): + """ + Enum containing tie solving strategies. + + ADD - Add 1 win to both players. + SPREAD - Add 1/2 win to both players. + FOGET - Ignore the tie. + DAVIDSON - Use Davidson's method to handle ties. 
+ """ + + ADD = "add" + SPREAD = "spread" + FORGET = "forget" + DAVIDSON = "davidson" + + +DEFAULT_PROPERTIES = ( + ReportedProperty.MEAN, + ReportedProperty.DELTA, + ReportedProperty.ABOVE_50, + ReportedProperty.IN_ROPE, + ReportedProperty.WEAK_INTERPRETATION, +) + +ALL_PROPERTIES = tuple(ReportedProperty) diff --git a/bbttest/bbt/py_bbt.py b/bbttest/bbt/py_bbt.py index 550d34b..baf88ca 100644 --- a/bbttest/bbt/py_bbt.py +++ b/bbttest/bbt/py_bbt.py @@ -5,8 +5,8 @@ import pandas as pd from .alg import _construct_win_table, _get_pwin, _hdi -from .const import DEFAULT_PROPERTIES, HyperPrior, ReportedProperty, TieSolver from .model import _mcmcbbt_pymc +from .params import DEFAULT_PROPERTIES, HyperPrior, ReportedProperty, TieSolver class PyBBT: diff --git a/tests/bbt/test_alg.py b/tests/bbt/test_alg.py index 1a0c209..5ad025e 100644 --- a/tests/bbt/test_alg.py +++ b/tests/bbt/test_alg.py @@ -80,8 +80,6 @@ def test_construct_win_table( maximize=maximize, ) - print(alg_names) - # Then np.testing.assert_array_almost_equal(result_table, expected_table) diff --git a/tests/bbt/test_py_bbt.py b/tests/bbt/test_py_bbt.py index b83fc69..69fbee3 100644 --- a/tests/bbt/test_py_bbt.py +++ b/tests/bbt/test_py_bbt.py @@ -136,6 +136,7 @@ def test_init_defaults(self): assert model._tie_solver == TieSolver.SPREAD assert model._hyper_prior == HyperPrior.LOG_NORMAL assert model._scale == 1.0 + assert model._maximize assert not model.fitted From 349115a7dd4a155c3bf609f28d4d29e77d8b6d47 Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Mon, 23 Feb 2026 17:11:03 +0100 Subject: [PATCH 3/5] Fix typos --- README.md | 2 +- bbttest/bbt/py_bbt.py | 4 ++-- tests/bbt/test_alg.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2d0ead5..a8f4d74 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ model = PyBBT( #### Evaluating BBT when reporting errors -By default BBT assumes that the goal of the evaluation is to maximize the metric (e.g. 
when reporting F1 score or AUROC). In cases, when metrics reported in the dataframe should be minimzed (e.g. RMSE), you can set the parameter `maximize` in `PyBBT` to False: +By default BBT assumes that the goal of the evaluation is to maximize the metric (e.g. when reporting F1 score or AUROC). In cases, when metrics reported in the dataframe should be minimized (e.g. RMSE), you can set the parameter `maximize` in `PyBBT` to False: ```python model = PyBBT( diff --git a/bbttest/bbt/py_bbt.py b/bbttest/bbt/py_bbt.py index baf88ca..747bb85 100644 --- a/bbttest/bbt/py_bbt.py +++ b/bbttest/bbt/py_bbt.py @@ -51,8 +51,8 @@ class PyBBT: fitted: bool Whether the model has been fitted. - Examlples - --------- + Examples + -------- >>> import pandas as pd >>> from bbttest import PyBBT, TieSolver >>> data = pd.DataFrame({ diff --git a/tests/bbt/test_alg.py b/tests/bbt/test_alg.py index 5ad025e..be19059 100644 --- a/tests/bbt/test_alg.py +++ b/tests/bbt/test_alg.py @@ -65,7 +65,7 @@ class TestConstructTable: def test_construct_win_table( self, data: pd.DataFrame, - local_rope_value: float, + local_rope_value: float | None, maximize: bool, expected_table: np.ndarray, ): From f8df442a0ffc0b33ff2a69ab8e3ac1c51c200e1d Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Mon, 23 Feb 2026 17:14:38 +0100 Subject: [PATCH 4/5] Fix imports in tests: --- tests/bbt/test_alg.py | 2 +- tests/bbt/test_py_bbt.py | 2 +- tests/regression/test_benchmarking_mol.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/bbt/test_alg.py b/tests/bbt/test_alg.py index be19059..2532a11 100644 --- a/tests/bbt/test_alg.py +++ b/tests/bbt/test_alg.py @@ -7,7 +7,7 @@ from bbttest.bbt.alg import ( _construct_win_table, ) -from bbttest.bbt.const import TieSolver +from bbttest.bbt.params import TieSolver SCORES_1 = pd.DataFrame( { diff --git a/tests/bbt/test_py_bbt.py b/tests/bbt/test_py_bbt.py index 69fbee3..b00b75f 100644 --- a/tests/bbt/test_py_bbt.py +++ b/tests/bbt/test_py_bbt.py @@ 
-11,7 +11,7 @@ import pytest from bbttest import HyperPrior, PyBBT, ReportedProperty, TieSolver -from bbttest.bbt.const import ALL_PROPERTIES +from bbttest.bbt.params import ALL_PROPERTIES @pytest.fixture(scope="module") diff --git a/tests/regression/test_benchmarking_mol.py b/tests/regression/test_benchmarking_mol.py index 20ebef0..9995026 100644 --- a/tests/regression/test_benchmarking_mol.py +++ b/tests/regression/test_benchmarking_mol.py @@ -35,7 +35,7 @@ import pytest from bbttest import PyBBT, TieSolver -from bbttest.bbt.const import DEFAULT_PROPERTIES, ReportedProperty +from bbttest.bbt.params import DEFAULT_PROPERTIES, ReportedProperty @pytest.fixture(scope="module") From 61be67d39dc565e00cdac6797f2b8232c585c9c4 Mon Sep 17 00:00:00 2001 From: Mateusz Praski Date: Mon, 23 Feb 2026 17:16:22 +0100 Subject: [PATCH 5/5] Register `slow` mark in Pytest --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index b8580ec..d25f523 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,11 @@ test = [ "mypy", ] +[tool.pytest.ini_options] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", +] + [tool.mypy] python_version = "3.11" check_untyped_defs = true # check all functions, this fixes some tests