MLCIL · Thematiq · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
@@ -57,6 +57,20 @@ model = PyBBT(
 )
 ```
 
+#### Evaluating BBT when reporting errors
+
+By default, BBT assumes that the goal of the evaluation is to maximize the metric (e.g. when reporting F1 score or AUROC). When the metric reported in the dataframe should instead be minimized (e.g. RMSE), set the `maximize` parameter of `PyBBT` to `False`:
+
+```python
+model = PyBBT(
+    local_rope_value=0.01,
+    maximize=False, # Set to False if the metric should be minimized
+).fit(
+    df,
+    dataset_col="dataset",
+)
+```
+
 ### Paired posterior fitting
 
 PyBBT model support two variants of input data for paired case, either a single dataframe with multiple rows per algorithm per dataset, or a pair of dataframes, one defining mean performance per algorithm, and the second with standard deviations.

@@ -1,6 +1,6 @@
 """bbt module: Bayesian Bradley-Terry model implementation."""
 
-from .const import HyperPrior, ReportedProperty, TieSolver
+from .params import HyperPrior, ReportedProperty, TieSolver
 from .py_bbt import PyBBT
 
 __all__ = [

@@ -1,12 +1,14 @@
 import logging as log
+import warnings
 from collections.abc import Generator, Iterable
 
 import arviz as az
 import numpy as np
 import pandas as pd
 from tqdm.auto import tqdm
 
-from .const import TieSolver
+from .const import UNNAMED_COLUMNS_WARNING_TEMPLATE
+from .params import TieSolver
 
 ALG1_COL = 2
 ALG2_COL = 3
@@ -125,14 +127,27 @@ def _construct_win_table(
     dataset_col: str | int | None,
     local_rope_value: float | None,
     tie_solver: TieSolver,
+    maximize: bool,
 ) -> tuple[np.ndarray, list[str]]:
     # Extract algorithm names
-    algorithms_names = set(data.columns.tolist())
+    algorithms_names = data.columns.tolist()
     if isinstance(dataset_col, int):
         dataset_col = data.columns[dataset_col]
     if dataset_col is not None:
-        algorithms_names.discard(dataset_col)
-    algorithms_names = list(algorithms_names)
+        algorithms_names.remove(dataset_col)
+
+    data = data.copy()
+    if not maximize:
+        data.loc[:, algorithms_names] = -1 * data[algorithms_names]
+
+    if any("Unnamed" in col for col in algorithms_names):
+        warnings.warn(
+            UNNAMED_COLUMNS_WARNING_TEMPLATE.format(
+                algorithms_names=algorithms_names,
+                dataset_col=dataset_col,
+            ),
+            UserWarning,
+        )
 
     if dataset_col is None or data.shape[0] == data[dataset_col].nunique():
         table = _construct_no_paired(

@@ -1,72 +1,4 @@
-from enum import Enum
-
-from pymc.distributions import Cauchy, LogNormal, Normal
-
-
-class HyperPrior(str, Enum):
-    """
-    Hyper Prior distributions for BBT MCMC sampling.
-    """
-
-    LOG_NORMAL = "logNormal"
-    LOG_NORMAL_SCALED = "logNormalScaled"
-    CAUCHY = "cauchy"
-    NORMAL = "normal"
-
-    def _get_pymc_dist(self, scale, name="sigma"):
-        match self:
-            case HyperPrior.LOG_NORMAL:
-                return LogNormal(name, mu=0, sigma=1)
-            case HyperPrior.LOG_NORMAL_SCALED:
-                return LogNormal(name, mu=0, sigma=scale)
-            case HyperPrior.CAUCHY:
-                return Cauchy(name, alpha=0, beta=scale)
-            case HyperPrior.NORMAL:
-                return Normal(name, mu=0, sigma=scale)
-            case _:
-                raise ValueError(f"Unsupported hyperprior: {self}")
-
-
-class ReportedProperty(str, Enum):
-    """
-    Enum containing properties that can be reported from BBT results.
-    """
-
-    LEFT_MODEL = "left_model"
-    RIGHT_MODEL = "right_model"
-    MEDIAN = "median"
-    MEAN = "mean"
-    HDI_LOW = "hdi_low"
-    HDI_HIGH = "hdi_high"
-    DELTA = "delta"
-    ABOVE_50 = "above_50"
-    IN_ROPE = "in_rope"
-    WEAK_INTERPRETATION = "weak_interpretation"
-    STRONG_INTERPRETATION = "strong_interpretation"
-
-
-class TieSolver(str, Enum):
-    """
-    Enum containing tie solving strategies.
-
-    ADD - Add 1 win to both players.
-    SPREAD - Add 1/2 win to both players.
-    FOGET - Ignore the tie.
-    DAVIDSON - Use Davidson's method to handle ties.
-    """
-
-    ADD = "add"
-    SPREAD = "spread"
-    FORGET = "forget"
-    DAVIDSON = "davidson"
-
-
-DEFAULT_PROPERTIES = (
-    ReportedProperty.MEAN,
-    ReportedProperty.DELTA,
-    ReportedProperty.ABOVE_50,
-    ReportedProperty.IN_ROPE,
-    ReportedProperty.WEAK_INTERPRETATION,
-)
-
-ALL_PROPERTIES = tuple(ReportedProperty)
+UNNAMED_COLUMNS_WARNING_TEMPLATE = """Some algorithm names are unnamed. This may lead to issues in the win table construction.
+Algorithm names extracted: {algorithms_names}
+Dataset column: {dataset_col}
+"""
@@ -3,7 +3,7 @@
 import pymc as pm
 import pytensor.tensor as pt
 
-from .const import HyperPrior
+from .params import HyperPrior
 
 
 def _build_bbt_model(

@@ -0,0 +1,72 @@
+from enum import Enum
+
+from pymc.distributions import Cauchy, LogNormal, Normal
+
+
+class HyperPrior(str, Enum):
+    """
+    Hyper Prior distributions for BBT MCMC sampling.
+    """
+
+    LOG_NORMAL = "logNormal"
+    LOG_NORMAL_SCALED = "logNormalScaled"
+    CAUCHY = "cauchy"
+    NORMAL = "normal"
+
+    def _get_pymc_dist(self, scale, name="sigma"):
+        match self:
+            case HyperPrior.LOG_NORMAL:
+                return LogNormal(name, mu=0, sigma=1)
+            case HyperPrior.LOG_NORMAL_SCALED:
+                return LogNormal(name, mu=0, sigma=scale)
+            case HyperPrior.CAUCHY:
+                return Cauchy(name, alpha=0, beta=scale)
+            case HyperPrior.NORMAL:
+                return Normal(name, mu=0, sigma=scale)
+            case _:
+                raise ValueError(f"Unsupported hyperprior: {self}")
+
+
+class ReportedProperty(str, Enum):
+    """
+    Enum containing properties that can be reported from BBT results.
+    """
+
+    LEFT_MODEL = "left_model"
+    RIGHT_MODEL = "right_model"
+    MEDIAN = "median"
+    MEAN = "mean"
+    HDI_LOW = "hdi_low"
+    HDI_HIGH = "hdi_high"
+    DELTA = "delta"
+    ABOVE_50 = "above_50"
+    IN_ROPE = "in_rope"
+    WEAK_INTERPRETATION = "weak_interpretation"
+    STRONG_INTERPRETATION = "strong_interpretation"
+
+
+class TieSolver(str, Enum):
+    """
+    Enum containing tie solving strategies.
+
+    ADD - Add 1 win to both players.
+    SPREAD - Add 1/2 win to both players.
+    FORGET - Ignore the tie.
+    DAVIDSON - Use Davidson's method to handle ties.
+    """
+
+    ADD = "add"
+    SPREAD = "spread"
+    FORGET = "forget"
+    DAVIDSON = "davidson"
+
+
+DEFAULT_PROPERTIES = (
+    ReportedProperty.MEAN,
+    ReportedProperty.DELTA,
+    ReportedProperty.ABOVE_50,
+    ReportedProperty.IN_ROPE,
+    ReportedProperty.WEAK_INTERPRETATION,
+)
+
+ALL_PROPERTIES = tuple(ReportedProperty)
@@ -5,8 +5,8 @@
 import pandas as pd
 
 from .alg import _construct_win_table, _get_pwin, _hdi
-from .const import DEFAULT_PROPERTIES, HyperPrior, ReportedProperty, TieSolver
 from .model import _mcmcbbt_pymc
+from .params import DEFAULT_PROPERTIES, HyperPrior, ReportedProperty, TieSolver
 
 
 class PyBBT:
@@ -42,13 +42,17 @@ class PyBBT:
     scale: float, default 1.0
         The scale parameter for the hyper prior distribution. Ignored if the HyperPrior is LOG_NORMAL.
 
+    maximize: bool, default True
+        Whether higher scores indicate better performance (e.g. accuracy or F1). When using a metric that should be
+        minimized (e.g. RMSE), set this to False.
+
     Attributes
     ----------
     fitted: bool
         Whether the model has been fitted.
 
-    Examlples
-    ---------
+    Examples
+    --------
     >>> import pandas as pd
     >>> from bbttest import PyBBT, TieSolver
     >>> data = pd.DataFrame({
@@ -78,6 +82,7 @@ def __init__(
         local_rope_value: float | None = None,
         tie_solver: TieSolver | str = TieSolver.SPREAD,
         hyper_prior: HyperPrior | str = HyperPrior.LOG_NORMAL,
+        maximize: bool = True,
         scale: float = 1.0,
     ):
         self._local_rope_value = local_rope_value
@@ -88,6 +93,7 @@ def __init__(
         self._hyper_prior = (
             HyperPrior(hyper_prior) if isinstance(hyper_prior, str) else hyper_prior
         )
+        self._maximize = maximize
         self._scale = scale
         self._fitted = False
 
@@ -127,6 +133,7 @@ def fit(
             dataset_col=dataset_col,
             local_rope_value=self._local_rope_value,
             tie_solver=self._tie_solver,
+            maximize=self._maximize,
         )
 
         self._fit_posterior = _mcmcbbt_pymc(

@@ -43,6 +43,11 @@ test = [
     "mypy",
 ]
 
+[tool.pytest]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
+
 [tool.mypy]
 python_version = "3.11"
 check_untyped_defs = true  # check all functions, this fixes some tests