From 2e40a95a7743e489a0ea58b3b529cfac5746d408 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Tue, 25 Nov 2025 14:28:28 +0100 Subject: [PATCH 1/7] First do bounding per LR-system, then calculate percentiles. --- lrmodule/mcmc.py | 14 +++++++++++++- tests/mcmc_matlab_comparison/test_mcmc.py | 7 ++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/lrmodule/mcmc.py b/lrmodule/mcmc.py index 48db446..6e501f8 100644 --- a/lrmodule/mcmc.py +++ b/lrmodule/mcmc.py @@ -3,6 +3,7 @@ import arviz as az import numpy as np import pymc as pm +from lir.bounding import LLRBounder from lir.data.models import FeatureData, LLRData from lir.transform import Transformer from scipy.stats import betabinom, binom, norm @@ -17,12 +18,13 @@ class McmcLLRModel(Transformer): determined. """ - def __init__( + def __init__( # noqa: PLR0913 self, distribution_h1: str, parameters_h1: dict[str, dict[str, int]] | None, distribution_h2: str, parameters_h2: dict[str, dict[str, int]] | None, + bounder: LLRBounder | None, interval: tuple[float, float] = (0.05, 0.95), **mcmc_kwargs, ): @@ -33,17 +35,25 @@ def __init__( :param parameters_h1: definition of the parameters of distribution_h1, and their prior distributions :param distribution_h2: statistical distribution used to model H2, for example 'normal' or 'binomial' :param parameters_h2: definition of the parameters of distribution_h2, and their prior distributions + :param bounder: bounding method to apply to the unbound llrs, to prevent overextrapolation :param interval: lower and upper bounds of the credible interval in range 0..1; default: (0.05, 0.95) :param mcmc_kwargs: mcmc simulation settings, see `McmcModel.__init__` for more details. """ self.model_h1 = McmcModel(distribution_h1, parameters_h1, **mcmc_kwargs) self.model_h2 = McmcModel(distribution_h2, parameters_h2, **mcmc_kwargs) + self.bounder = bounder self.interval = interval def fit(self, instances: FeatureData) -> Self: """Fit the defined model to the supplied instances.""" self.model_h1.fit(instances.features[instances.labels == 1]) self.model_h2.fit(instances.features[instances.labels == 0]) + if self.bounder is not None: + # determine the bounds based on the LLRs of the training data + logp_h1 = self.model_h1.transform(instances.features) + logp_h2 = self.model_h2.transform(instances.features) + llrs = logp_h1 - logp_h2 + self.bounder = self.bounder.fit(llrs, instances.labels) return self def transform(self, instances: FeatureData) -> LLRData: @@ -51,6 +61,8 @@ def transform(self, instances: FeatureData) -> LLRData: logp_h1 = self.model_h1.transform(instances.features) logp_h2 = self.model_h2.transform(instances.features) llrs = logp_h1 - logp_h2 + if self.bounder is not None: + llrs = self.bounder.transform(llrs) quantiles = np.quantile(llrs, [0.5] + list(self.interval), axis=1, method="midpoint") return instances.replace_as(LLRData, features=quantiles.transpose(1, 0)) diff --git a/tests/mcmc_matlab_comparison/test_mcmc.py b/tests/mcmc_matlab_comparison/test_mcmc.py index f160da4..d063a25 100644 --- a/tests/mcmc_matlab_comparison/test_mcmc.py +++ b/tests/mcmc_matlab_comparison/test_mcmc.py @@ -88,7 +88,12 @@ def test_llr_dataset(dataset_name: str): labels = np.concatenate([np.ones(scores_km.shape[0]), np.zeros(scores_knm.shape[0])]) model = McmcLLRModel( - cfg.distribution_h1, cfg.parameters_h1, cfg.distribution_h2, cfg.parameters_h2, random_seed=cfg.random_seed + cfg.distribution_h1, + cfg.parameters_h1, + cfg.distribution_h2, + cfg.parameters_h2, + bounder=None, + random_seed=cfg.random_seed, ) model.fit(FeatureData(features=features, labels=labels)) llrs = model.transform(FeatureData(features=scores_eval)) From 303cc34a130dcb50f74242ce0e4794c7b7c34de6 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Wed, 26 Nov 2025 15:18:40 +0100 Subject: [PATCH 2/7] Store bounds, instead of a bounders with many bounds. --- lrmodule/mcmc.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lrmodule/mcmc.py b/lrmodule/mcmc.py index 6e501f8..498d463 100644 --- a/lrmodule/mcmc.py +++ b/lrmodule/mcmc.py @@ -42,6 +42,8 @@ def __init__( # noqa: PLR0913 self.model_h1 = McmcModel(distribution_h1, parameters_h1, **mcmc_kwargs) self.model_h2 = McmcModel(distribution_h2, parameters_h2, **mcmc_kwargs) self.bounder = bounder + self.lower_llr_bounds = None + self.upper_llr_bounds = None self.interval = interval def fit(self, instances: FeatureData) -> Self: @@ -49,11 +51,17 @@ def fit(self, instances: FeatureData) -> Self: self.model_h1.fit(instances.features[instances.labels == 1]) self.model_h2.fit(instances.features[instances.labels == 0]) if self.bounder is not None: - # determine the bounds based on the LLRs of the training data + # determine the bounds based on the LLRs of the training data, each sample results into an LR-system logp_h1 = self.model_h1.transform(instances.features) logp_h2 = self.model_h2.transform(instances.features) llrs = logp_h1 - logp_h2 - self.bounder = self.bounder.fit(llrs, instances.labels) + # determine the bounds for each LR-system individually + self.lower_llr_bounds = np.empty(llrs.shape[1]) + self.upper_llr_bounds = np.empty(llrs.shape[1]) + for i_system in range(llrs.shape[1]): + bounder = self.bounder.fit(llrs[:, i_system], instances.labels) + self.lower_llr_bounds[i_system] = bounder.lower_llr_bound + self.upper_llr_bounds[i_system] = bounder.upper_llr_bound return self def transform(self, instances: FeatureData) -> LLRData: @@ -62,7 +70,9 @@ def transform(self, instances: FeatureData) -> LLRData: logp_h2 = self.model_h2.transform(instances.features) llrs = logp_h1 - logp_h2 if self.bounder is not None: - llrs = self.bounder.transform(llrs) + # apply the bounds to all the LR-systems in one go + llrs = np.where(self.lower_llr_bounds < llrs, llrs, self.lower_llr_bounds) + llrs = np.where(self.upper_llr_bounds > llrs, llrs, self.upper_llr_bounds) quantiles = np.quantile(llrs, [0.5] + list(self.interval), axis=1, method="midpoint") return instances.replace_as(LLRData, features=quantiles.transpose(1, 0)) From af12190b089c078299755b4b982d0fa4f39f3ca7 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Wed, 26 Nov 2025 15:28:10 +0100 Subject: [PATCH 3/7] Fix invalid-argument-type error --- lrmodule/mcmc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lrmodule/mcmc.py b/lrmodule/mcmc.py index 498d463..29ca1d7 100644 --- a/lrmodule/mcmc.py +++ b/lrmodule/mcmc.py @@ -48,6 +48,8 @@ def __init__( # noqa: PLR0913 def fit(self, instances: FeatureData) -> Self: """Fit the defined model to the supplied instances.""" + if instances.labels is None: + raise ValueError("Labels are required to fit this model.") self.model_h1.fit(instances.features[instances.labels == 1]) self.model_h2.fit(instances.features[instances.labels == 0]) if self.bounder is not None: From f47fda4371c8024ace882566ee71a8cc85386ae6 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Thu, 27 Nov 2025 14:34:44 +0100 Subject: [PATCH 4/7] Added details on noqa: code --- lrmodule/mcmc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lrmodule/mcmc.py b/lrmodule/mcmc.py index 29ca1d7..424c185 100644 --- a/lrmodule/mcmc.py +++ b/lrmodule/mcmc.py @@ -18,7 +18,7 @@ class McmcLLRModel(Transformer): determined. """ - def __init__( # noqa: PLR0913 + def __init__( # noqa: PLR0913 (more than five arguments are allowed) self, distribution_h1: str, parameters_h1: dict[str, dict[str, int]] | None, @@ -80,7 +80,7 @@ def transform(self, instances: FeatureData) -> LLRData: class McmcModel: - def __init__( # noqa: PLR0913 + def __init__( # noqa: PLR0913 (more than five arguments are allowed) self, distribution: str, parameters: dict[str, dict[str, int]] | None, From b75d7b075c73da738bd15cb14bd0cf576ed3ac3e Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Thu, 27 Nov 2025 16:11:03 +0100 Subject: [PATCH 5/7] Store the bounders, instead of the bounds; transform the systems one by one, instead of all at once. --- lrmodule/mcmc.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/lrmodule/mcmc.py b/lrmodule/mcmc.py index 424c185..6d9fd45 100644 --- a/lrmodule/mcmc.py +++ b/lrmodule/mcmc.py @@ -24,7 +24,7 @@ def __init__( # noqa: PLR0913 (more than five arguments are allowed) parameters_h1: dict[str, dict[str, int]] | None, distribution_h2: str, parameters_h2: dict[str, dict[str, int]] | None, - bounder: LLRBounder | None, + bounding: LLRBounder | None, interval: tuple[float, float] = (0.05, 0.95), **mcmc_kwargs, ): @@ -41,9 +41,8 @@ def __init__( # noqa: PLR0913 (more than five arguments are allowed) """ self.model_h1 = McmcModel(distribution_h1, parameters_h1, **mcmc_kwargs) self.model_h2 = McmcModel(distribution_h2, parameters_h2, **mcmc_kwargs) - self.bounder = bounder - self.lower_llr_bounds = None - self.upper_llr_bounds = None + self.bounding = bounding + self.bounders = None self.interval = interval def fit(self, instances: FeatureData) -> Self: @@ -52,18 +51,15 @@ def fit(self, instances: FeatureData) -> Self: raise ValueError("Labels are required to fit this model.") self.model_h1.fit(instances.features[instances.labels == 1]) self.model_h2.fit(instances.features[instances.labels == 0]) - if self.bounder is not None: + if self.bounding is not None: # determine the bounds based on the LLRs of the training data, each sample results into an LR-system logp_h1 = self.model_h1.transform(instances.features) logp_h2 = self.model_h2.transform(instances.features) llrs = logp_h1 - logp_h2 # determine the bounds for each LR-system individually - self.lower_llr_bounds = np.empty(llrs.shape[1]) - self.upper_llr_bounds = np.empty(llrs.shape[1]) + self.bounders = [self.bounding.__class__() for _ in range(llrs.shape[1])] for i_system in range(llrs.shape[1]): - bounder = self.bounder.fit(llrs[:, i_system], instances.labels) - self.lower_llr_bounds[i_system] = bounder.lower_llr_bound - self.upper_llr_bounds[i_system] = bounder.upper_llr_bound + self.bounders[i_system] = self.bounders[i_system].fit(llrs[:, i_system], instances.labels) return self def transform(self, instances: FeatureData) -> LLRData: @@ -71,10 +67,10 @@ def transform(self, instances: FeatureData) -> LLRData: logp_h1 = self.model_h1.transform(instances.features) logp_h2 = self.model_h2.transform(instances.features) llrs = logp_h1 - logp_h2 - if self.bounder is not None: - # apply the bounds to all the LR-systems in one go - llrs = np.where(self.lower_llr_bounds < llrs, llrs, self.lower_llr_bounds) - llrs = np.where(self.upper_llr_bounds > llrs, llrs, self.upper_llr_bounds) + if self.bounding is not None: + # apply the bounders one by one + for i_system in range(llrs.shape[1]): + llrs[:, i_system] = self.bounders[i_system].transform(llrs[:, i_system]) quantiles = np.quantile(llrs, [0.5] + list(self.interval), axis=1, method="midpoint") return instances.replace_as(LLRData, features=quantiles.transpose(1, 0)) From f42f66d88ee8a033f91abf21924409aae84681fd Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Mon, 1 Dec 2025 11:30:40 +0100 Subject: [PATCH 6/7] Small fixes; non-subscriptable error, test input argument. --- lrmodule/mcmc.py | 2 +- tests/mcmc_matlab_comparison/test_mcmc.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lrmodule/mcmc.py b/lrmodule/mcmc.py index 6d9fd45..0aa08ce 100644 --- a/lrmodule/mcmc.py +++ b/lrmodule/mcmc.py @@ -67,7 +67,7 @@ def transform(self, instances: FeatureData) -> LLRData: logp_h1 = self.model_h1.transform(instances.features) logp_h2 = self.model_h2.transform(instances.features) llrs = logp_h1 - logp_h2 - if self.bounding is not None: + if (self.bounding is not None) and (self.bounders is not None): # apply the bounders one by one for i_system in range(llrs.shape[1]): llrs[:, i_system] = self.bounders[i_system].transform(llrs[:, i_system]) diff --git a/tests/mcmc_matlab_comparison/test_mcmc.py b/tests/mcmc_matlab_comparison/test_mcmc.py index d063a25..83fd51e 100644 --- a/tests/mcmc_matlab_comparison/test_mcmc.py +++ b/tests/mcmc_matlab_comparison/test_mcmc.py @@ -92,7 +92,7 @@ def test_llr_dataset(dataset_name: str): cfg.parameters_h1, cfg.distribution_h2, cfg.parameters_h2, - bounder=None, + bounding=None, random_seed=cfg.random_seed, ) model.fit(FeatureData(features=features, labels=labels)) From 537be38c02ee7f88c8d42653c72d614692b82465 Mon Sep 17 00:00:00 2001 From: Leen van der Ham Date: Mon, 1 Dec 2025 13:10:46 +0100 Subject: [PATCH 7/7] Use ELUB as default bounding method. --- lrmodule/mcmc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lrmodule/mcmc.py b/lrmodule/mcmc.py index 0aa08ce..7a6b4c5 100644 --- a/lrmodule/mcmc.py +++ b/lrmodule/mcmc.py @@ -3,6 +3,7 @@ import arviz as az import numpy as np import pymc as pm +from lir.algorithms.bayeserror import ELUBBounder from lir.bounding import LLRBounder from lir.data.models import FeatureData, LLRData from lir.transform import Transformer @@ -24,7 +25,7 @@ def __init__( # noqa: PLR0913 (more than five arguments are allowed) parameters_h1: dict[str, dict[str, int]] | None, distribution_h2: str, parameters_h2: dict[str, dict[str, int]] | None, - bounding: LLRBounder | None, + bounding: LLRBounder | None = ELUBBounder(), interval: tuple[float, float] = (0.05, 0.95), **mcmc_kwargs, ):