From d5077bfe9c33ed45559471f62926b230e64b7d9a Mon Sep 17 00:00:00 2001 From: hongping Date: Wed, 19 Nov 2025 12:58:08 +0800 Subject: [PATCH 1/2] feat(metrics): Add Circular Bias Detection (CBD) Integrity Score --- metrics/circular_bias_integrity/README.md | 136 ++++++++ metrics/circular_bias_integrity/app.py | 6 + .../circular_bias_integrity.py | 319 ++++++++++++++++++ .../circular_bias_integrity/requirements.txt | 2 + tests/test_circular_bias_integrity.py | 284 ++++++++++++++++ 5 files changed, 747 insertions(+) create mode 100644 metrics/circular_bias_integrity/README.md create mode 100644 metrics/circular_bias_integrity/app.py create mode 100644 metrics/circular_bias_integrity/circular_bias_integrity.py create mode 100644 metrics/circular_bias_integrity/requirements.txt create mode 100644 tests/test_circular_bias_integrity.py diff --git a/metrics/circular_bias_integrity/README.md b/metrics/circular_bias_integrity/README.md new file mode 100644 index 00000000..5846fc1a --- /dev/null +++ b/metrics/circular_bias_integrity/README.md @@ -0,0 +1,136 @@ +# Metric Card for Circular Bias Detection (CBD) Integrity Score + +## Metric Description + +The **Circular Bias Detection (CBD) Integrity Score** is a meta-evaluation metric that measures the statistical integrity of AI evaluation processes. Unlike traditional metrics that measure model performance (e.g., accuracy, F1, BLEU), CBD measures whether the evaluation process itself is trustworthy and free from circular reasoning bias. + +**Circular reasoning bias** occurs when evaluation results become artificially inflated through iterative protocol adjustments that optimize for benchmark performance rather than true model generalization. This is a critical but often overlooked issue in AI evaluation. + +## How to Use + +### Basic Usage + +```python +import evaluate +import numpy as np + +# Load the metric +cbd_metric = evaluate.load("circular_bias_integrity") + +# Example: 5 evaluation rounds with increasing performance and protocol changes +performance_scores = [0.85, 0.87, 0.91, 0.89, 0.93] +protocol_variations = [0.1, 0.15, 0.25, 0.20, 0.30] + +# Compute CBD score +results = cbd_metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations +) + +print(f"CBD Score: {results['cbd_score']:.1f}") +print(f"ρ_PC: {results['rho_pc']:.3f}") +print(f"Risk Level: {results['risk_level']}") +print(f"Recommendation: {results['recommendation']}") +``` + +### Advanced Usage with Full Matrix Data + +```python +import evaluate +import numpy as np + +cbd_metric = evaluate.load("circular_bias_integrity") + +# Performance across 5 time periods for 3 algorithms +performance_matrix = np.array([ + [0.85, 0.78, 0.82], + [0.87, 0.80, 0.84], + [0.91, 0.84, 0.88], + [0.89, 0.82, 0.86], + [0.93, 0.86, 0.90] +]) + +# Constraint specifications (e.g., batch_size, learning_rate) +constraint_matrix = np.array([ + [512, 0.001], + [550, 0.0015], + [600, 0.002], + [580, 0.0018], + [620, 0.0022] +]) + +# Compute all indicators +results = cbd_metric.compute( + performance_scores=performance_matrix.mean(axis=1).tolist(), + protocol_variations=[0.1, 0.15, 0.25, 0.20, 0.30], + performance_matrix=performance_matrix, + constraint_matrix=constraint_matrix, + return_all_indicators=True +) + +print(f"ρ_PC (Protocol-Performance Correlation): {results['rho_pc']:.3f}") +print(f"PSI (Performance-Structure Independence): {results['psi_score']:.3f}") +print(f"CCS (Constraint-Consistency Score): {results['ccs_score']:.3f}") +``` + +### Inputs + +- 
**`performance_scores`** (`list` of `float`): Performance scores across multiple evaluation rounds. Minimum 3 rounds required. +- **`protocol_variations`** (`list` of `float`): Quantified protocol variation magnitudes for each evaluation round. +- **`performance_matrix`** (`array-like`, optional): Shape (T, K) for detailed multi-algorithm tracking. +- **`constraint_matrix`** (`array-like`, optional): Shape (T, p) for constraint specification tracking. +- **`return_all_indicators`** (`bool`, optional): Return all three indicators (ρ_PC, PSI, CCS). Default: `False`. + +### Output Values + +- **`cbd_score`** (`float`): Overall integrity score (0-100). Higher = more bias detected. + - 0-30: Low risk + - 30-60: Moderate risk + - 60-100: High risk +- **`rho_pc`** (`float`): Protocol-Performance correlation (-1 to 1). +- **`risk_level`** (`str`): "LOW", "MODERATE", or "HIGH". +- **`recommendation`** (`str`): Actionable guidance. +- **`psi_score`** (`float`, optional): Parameter stability indicator. +- **`ccs_score`** (`float`, optional): Constraint consistency indicator. + +## Limitations and Bias + +### Limitations + +1. **Minimum Data Requirements**: Requires at least 3 evaluation rounds for reliable correlation analysis. More rounds (5-10+) provide more robust results. + +2. **Protocol Quantification**: Users must quantify protocol variations, which can be subjective. Consider using normalized measures (e.g., percentage change in hyperparameters). + +3. **Correlation ≠ Causation**: High ρ_PC indicates correlation between protocol changes and performance, but doesn't prove causation. Manual inspection is recommended. + +4. **Simplified MVP**: This initial version focuses on ρ_PC as the primary indicator. Full CBD framework includes bootstrap confidence intervals and adaptive thresholds (available in the standalone library). + +### Bias Considerations + +- **False Positives**: Natural performance improvements during model development may be flagged as circular bias if correlated with protocol changes. +- **False Negatives**: Sophisticated circular bias (e.g., through dataset selection) may not be detected if protocol variations aren't properly quantified. + +## Citation + +```bibtex +@article{zhang2025circular, + title={Circular Bias Detection: A Comprehensive Statistical Framework for Detecting Circular Reasoning Bias in AI Algorithm Evaluation}, + author={Zhang, Hongping}, + journal={arXiv preprint arXiv:2501.xxxxx}, + year={2025}, + note={Software available at: https://github.com/hongping-zh/circular-bias-detection} +} +``` + +## Further References + +- **GitHub Repository**: [hongping-zh/circular-bias-detection](https://github.com/hongping-zh/circular-bias-detection) +- **Software DOI**: [10.5281/zenodo.17201032](https://doi.org/10.5281/zenodo.17201032) +- **Dataset DOI**: [10.5281/zenodo.17196639](https://doi.org/10.5281/zenodo.17196639) +- **Live Demo**: [Try Sleuth (CBD Web App)](https://is.gd/check_sleuth) + +## Acknowledgements + +This metric implements the Circular Bias Detection (CBD) framework developed by Hongping Zhang. The framework addresses a critical gap in AI evaluation methodology by providing quantitative tools for assessing evaluation integrity. + +**Slogan**: *Ensuring your evaluation is trustworthy. 
Stop circular reasoning in AI benchmarks.* diff --git a/metrics/circular_bias_integrity/app.py b/metrics/circular_bias_integrity/app.py new file mode 100644 index 00000000..f83120e4 --- /dev/null +++ b/metrics/circular_bias_integrity/app.py @@ -0,0 +1,6 @@ +import evaluate +from evaluate.utils import launch_gradio_widget + + +module = evaluate.load("circular_bias_integrity") +launch_gradio_widget(module) diff --git a/metrics/circular_bias_integrity/circular_bias_integrity.py b/metrics/circular_bias_integrity/circular_bias_integrity.py new file mode 100644 index 00000000..15090c7f --- /dev/null +++ b/metrics/circular_bias_integrity/circular_bias_integrity.py @@ -0,0 +1,319 @@ +# Copyright 2025 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Circular Bias Detection (CBD) Integrity Score metric.""" + +import datasets +import numpy as np +from scipy.stats import pearsonr + +import evaluate + + +_DESCRIPTION = """ +Circular Bias Detection (CBD) Integrity Score measures the statistical integrity of AI evaluation processes +by quantifying circular reasoning bias. This metric addresses a critical gap in evaluation methodology: +while traditional metrics measure model performance, CBD measures whether the evaluation process itself +is statistically reliable. + +**Key Concept**: Circular bias occurs when evaluation results become artificially inflated through +iterative protocol adjustments (e.g., hyperparameter tuning, prompt engineering, dataset selection) +that optimize for benchmark performance rather than true model generalization. + +**Core Indicators**: +- **ρ_PC (Protocol-Performance Correlation)**: Measures the correlation between evaluation protocol + changes and resulting performance scores. High correlation indicates potential circular dependency. +- **PSI (Performance-Structure Independence)**: Quantifies parameter stability across evaluation periods. +- **CCS (Constraint-Consistency Score)**: Measures consistency of constraint specifications over time. + +**Slogan**: *Ensuring your evaluation is trustworthy. Stop circular reasoning in AI benchmarks.* + +This metric is particularly valuable for: +- Detecting overfitting to benchmarks during model development +- Validating evaluation integrity in research papers +- Auditing AI system evaluations for regulatory compliance +- Meta-evaluation of evaluation methodologies + +For detailed methodology, see: Zhang, H. (2025). "Circular Bias Detection: A Comprehensive Statistical +Framework for Detecting Circular Reasoning Bias in AI Algorithm Evaluation." +""" + + +_KWARGS_DESCRIPTION = """ +Args: + performance_scores (`list` of `float`): Performance scores across multiple evaluation rounds. + Each score represents model performance in a specific evaluation period (e.g., accuracy, F1, BLEU). + Minimum 3 evaluation rounds required for reliable correlation analysis. 
+ protocol_variations (`list` of `float`): Quantified protocol variation magnitudes corresponding + to each evaluation round. This represents the degree of change in evaluation protocol + (e.g., hyperparameter changes, prompt modifications, dataset adjustments). + Must have the same length as performance_scores. + performance_matrix (`array-like`, optional): Shape (T, K) where T is time periods and K is algorithms. + Provides detailed performance tracking across multiple algorithms and time periods. + If provided, enables computation of PSI (Performance-Structure Independence). + constraint_matrix (`array-like`, optional): Shape (T, p) where T is time periods and p is constraint types. + Tracks constraint specifications across evaluation periods. + If provided, enables computation of CCS (Constraint-Consistency Score). + return_all_indicators (`boolean`, optional): If `True`, returns all three indicators (ρ_PC, PSI, CCS) + along with the overall CBD score. If `False`, returns only the CBD score and ρ_PC. + Defaults to `False`. + +Returns: + cbd_score (`float`): Overall Circular Bias Detection integrity score (0-100 scale). + Higher scores indicate stronger evidence of circular bias. + - 0-30: Low risk (evaluation appears statistically sound) + - 30-60: Moderate risk (some circular dependency detected) + - 60-100: High risk (significant circular bias detected) + rho_pc (`float`): Protocol-Performance correlation coefficient (-1 to 1). + Measures the linear relationship between protocol changes and performance. + Values close to ±1 indicate strong circular dependency. + psi_score (`float`, optional): Performance-Structure Independence score (returned if performance_matrix provided). + Higher values indicate more parameter instability/bias. + ccs_score (`float`, optional): Constraint-Consistency Score (returned if constraint_matrix provided). + Higher values indicate more consistency (less bias). + risk_level (`str`): Categorical risk assessment: "LOW", "MODERATE", or "HIGH". + recommendation (`str`): Actionable guidance based on detected bias level. + +Examples: + + Example 1 - Basic usage with simple performance and protocol data: + >>> cbd_metric = evaluate.load("circular_bias_integrity") + >>> performance = [0.85, 0.87, 0.91, 0.89, 0.93] + >>> protocol_changes = [0.1, 0.15, 0.25, 0.20, 0.30] + >>> results = cbd_metric.compute( + ... performance_scores=performance, + ... protocol_variations=protocol_changes + ... ) + >>> print(f"CBD Score: {results['cbd_score']:.1f}") + CBD Score: 78.5 + >>> print(f"Risk Level: {results['risk_level']}") + Risk Level: HIGH + + Example 2 - Advanced usage with full matrix data: + >>> cbd_metric = evaluate.load("circular_bias_integrity") + >>> # Performance across 5 time periods for 3 algorithms + >>> perf_matrix = np.array([ + ... [0.85, 0.78, 0.82], + ... [0.87, 0.80, 0.84], + ... [0.91, 0.84, 0.88], + ... [0.89, 0.82, 0.86], + ... [0.93, 0.86, 0.90] + ... ]) + >>> # Constraint specifications across 5 time periods + >>> constraint_matrix = np.array([ + ... [512, 0.7], + ... [550, 0.75], + ... [600, 0.8], + ... [580, 0.78], + ... [620, 0.82] + ... ]) + >>> results = cbd_metric.compute( + ... performance_scores=perf_matrix.mean(axis=1).tolist(), + ... protocol_variations=[0.1, 0.15, 0.25, 0.20, 0.30], + ... performance_matrix=perf_matrix, + ... constraint_matrix=constraint_matrix, + ... return_all_indicators=True + ... 
) + >>> print(f"ρ_PC: {results['rho_pc']:.3f}") + ρ_PC: 0.785 + >>> print(f"PSI: {results['psi_score']:.3f}") + PSI: 0.042 + >>> print(f"CCS: {results['ccs_score']:.3f}") + CCS: 0.891 +""" + + +_CITATION = """ +@article{zhang2025circular, + title={Circular Bias Detection: A Comprehensive Statistical Framework for Detecting Circular Reasoning Bias in AI Algorithm Evaluation}, + author={Zhang, Hongping}, + journal={arXiv preprint arXiv:2501.xxxxx}, + year={2025}, + note={Software available at: https://github.com/hongping-zh/circular-bias-detection} +} +""" + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class CircularBiasIntegrity(evaluate.Metric): + """Circular Bias Detection (CBD) Integrity Score metric for evaluation trustworthiness.""" + + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("float"), + "references": datasets.Value("float"), + } + ), + reference_urls=[ + "https://github.com/hongping-zh/circular-bias-detection", + "https://doi.org/10.5281/zenodo.17201032", + ], + ) + + def _compute( + self, + performance_scores=None, + protocol_variations=None, + performance_matrix=None, + constraint_matrix=None, + return_all_indicators=False, + predictions=None, + references=None, + ): + """ + Compute CBD integrity score and related indicators. + + Note: This metric requires either (performance_scores, protocol_variations) + or (predictions, references) to be provided. + """ + + # Handle legacy interface (predictions/references) + if performance_scores is None and predictions is not None: + performance_scores = predictions + if protocol_variations is None and references is not None: + protocol_variations = references + + # Validate inputs + if performance_scores is None or protocol_variations is None: + raise ValueError( + "CBD metric requires 'performance_scores' and 'protocol_variations' to be provided. " + "These represent the performance trajectory and corresponding protocol changes across evaluation rounds." + ) + + performance_scores = np.array(performance_scores) + protocol_variations = np.array(protocol_variations) + + if len(performance_scores) != len(protocol_variations): + raise ValueError( + f"Length mismatch: performance_scores ({len(performance_scores)}) and " + f"protocol_variations ({len(protocol_variations)}) must have the same length." + ) + + if len(performance_scores) < 3: + raise ValueError( + "CBD metric requires at least 3 evaluation rounds for reliable correlation analysis. " + f"Received {len(performance_scores)} rounds." + ) + + # 1. Compute ρ_PC (Protocol-Performance Correlation) + rho_pc_corr, rho_pc_pvalue = pearsonr(performance_scores, protocol_variations) + + # Handle NaN correlations + if np.isnan(rho_pc_corr): + rho_pc_corr = 0.0 + + # 2. Compute CBD overall score (0-100 scale) + # Base score from ρ_PC (primary indicator) + cbd_score = abs(rho_pc_corr) * 100 + + results = { + "cbd_score": float(cbd_score), + "rho_pc": float(rho_pc_corr), + "rho_pc_pvalue": float(rho_pc_pvalue), + } + + # 3. Compute PSI if performance_matrix provided + if performance_matrix is not None and return_all_indicators: + psi_score = self._compute_psi(np.array(performance_matrix)) + results["psi_score"] = float(psi_score) + + # 4. 
Compute CCS if constraint_matrix provided + if constraint_matrix is not None and return_all_indicators: + ccs_score = self._compute_ccs(np.array(constraint_matrix)) + results["ccs_score"] = float(ccs_score) + + # 5. Risk assessment + if cbd_score < 30: + risk_level = "LOW" + recommendation = ( + "Evaluation appears statistically sound. Continue current methodology." + ) + elif cbd_score < 60: + risk_level = "MODERATE" + recommendation = ( + "Some circular dependency detected. Consider: (1) Using held-out test sets, " + "(2) Pre-registering evaluation protocols, (3) Limiting protocol iterations." + ) + else: + risk_level = "HIGH" + recommendation = ( + "Significant circular bias detected. Strongly recommend: (1) Independent validation set, " + "(2) Protocol pre-registration, (3) Reporting all evaluation attempts, " + "(4) Consider using cross-validation or bootstrap methods." + ) + + results["risk_level"] = risk_level + results["recommendation"] = recommendation + + return results + + def _compute_psi(self, performance_matrix): + """ + Compute Performance-Structure Independence (PSI) score. + + PSI measures parameter stability across evaluation periods. + Higher values indicate more instability/bias. + """ + T, K = performance_matrix.shape + + if T < 2: + return 0.0 + + psi_scores = [] + for k in range(K): + param_series = performance_matrix[:, k] + differences = np.diff(param_series) + psi_k = np.mean(np.abs(differences)) + psi_scores.append(psi_k) + + return np.mean(psi_scores) + + def _compute_ccs(self, constraint_matrix): + """ + Compute Constraint-Consistency Score (CCS). + + CCS measures consistency of constraint specifications. + Higher values indicate more consistency (less bias). + """ + T, p = constraint_matrix.shape + + if T < 2: + return 1.0 + + consistency_scores = [] + for j in range(p): + constraint_series = constraint_matrix[:, j] + + # Handle constant constraints + if np.std(constraint_series) == 0: + consistency_scores.append(1.0) + continue + + mean_val = np.mean(constraint_series) + if mean_val == 0: + consistency_scores.append(0.0) + continue + + # Coefficient of variation + cv = np.std(constraint_series) / np.abs(mean_val) + + # Transform to consistency score (lower CV = higher consistency) + consistency_j = 1 / (1 + cv) + consistency_scores.append(consistency_j) + + return np.mean(consistency_scores) diff --git a/metrics/circular_bias_integrity/requirements.txt b/metrics/circular_bias_integrity/requirements.txt new file mode 100644 index 00000000..4d48e857 --- /dev/null +++ b/metrics/circular_bias_integrity/requirements.txt @@ -0,0 +1,2 @@ +scipy>=1.7.0 +numpy>=1.20.0 diff --git a/tests/test_circular_bias_integrity.py b/tests/test_circular_bias_integrity.py new file mode 100644 index 00000000..06edb413 --- /dev/null +++ b/tests/test_circular_bias_integrity.py @@ -0,0 +1,284 @@ +# Copyright 2025 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for Circular Bias Detection (CBD) Integrity Score metric.""" + +import unittest +import numpy as np +import pytest + +import evaluate + + +class TestCircularBiasIntegrity(unittest.TestCase): + """Test suite for CBD Integrity Score metric.""" + + def setUp(self): + """Load the metric before each test.""" + self.metric = evaluate.load("circular_bias_integrity") + + def test_low_bias_scenario(self): + """Test scenario with no circular bias (low correlation).""" + # Performance stays relatively stable despite protocol changes + performance_scores = [0.85, 0.84, 0.86, 0.85, 0.87] + protocol_variations = [0.1, 0.5, 0.2, 0.8, 0.3] + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + ) + + # Should detect low bias + self.assertLess( + results["cbd_score"], 40, "Low bias scenario should have CBD score < 40" + ) + self.assertEqual( + results["risk_level"], + "LOW", + "Risk level should be LOW for uncorrelated data", + ) + self.assertIn("cbd_score", results) + self.assertIn("rho_pc", results) + self.assertIn("recommendation", results) + + def test_high_bias_scenario(self): + """Test scenario with strong circular bias (high correlation).""" + # Performance increases linearly with protocol changes + performance_scores = [0.75, 0.80, 0.85, 0.90, 0.95] + protocol_variations = [0.1, 0.2, 0.3, 0.4, 0.5] + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + ) + + # Should detect high bias + self.assertGreater( + results["cbd_score"], 60, "High bias scenario should have CBD score > 60" + ) + self.assertEqual( + results["risk_level"], + "HIGH", + "Risk level should be HIGH for strongly correlated data", + ) + self.assertGreater( + abs(results["rho_pc"]), 0.7, "ρ_PC should be > 0.7 for strong correlation" + ) + + def test_moderate_bias_scenario(self): + """Test scenario with moderate circular bias.""" + performance_scores = [0.82, 0.85, 0.83, 0.88, 0.87] + protocol_variations = [0.15, 0.25, 0.18, 0.35, 0.30] + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + ) + + # Should detect moderate bias + self.assertGreaterEqual( + results["cbd_score"], 30, "Moderate bias should have CBD score >= 30" + ) + self.assertLessEqual( + results["cbd_score"], 70, "Moderate bias should have CBD score <= 70" + ) + + def test_negative_correlation(self): + """Test scenario with negative correlation (performance decreases with protocol changes).""" + performance_scores = [0.95, 0.90, 0.85, 0.80, 0.75] + protocol_variations = [0.1, 0.2, 0.3, 0.4, 0.5] + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + ) + + # Negative correlation should also be detected as bias + self.assertGreater( + results["cbd_score"], + 60, + "Negative correlation should also indicate high bias", + ) + self.assertLess( + results["rho_pc"], + -0.7, + "ρ_PC should be < -0.7 for strong negative correlation", + ) + + def test_minimum_data_requirement(self): + """Test that metric requires at least 3 data points.""" + performance_scores = [0.85, 0.87] + protocol_variations = [0.1, 0.2] + + with pytest.raises(ValueError, match="at least 3 evaluation rounds"): + self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + ) + + def test_length_mismatch_error(self): + """Test that metric raises error when input lengths don't match.""" + 
performance_scores = [0.85, 0.87, 0.91] + protocol_variations = [0.1, 0.2] + + with pytest.raises(ValueError, match="Length mismatch"): + self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + ) + + def test_missing_inputs_error(self): + """Test that metric raises error when required inputs are missing.""" + with pytest.raises( + ValueError, match="requires 'performance_scores' and 'protocol_variations'" + ): + self.metric.compute() + + def test_with_performance_matrix(self): + """Test computation with performance matrix (PSI calculation).""" + performance_matrix = np.array( + [ + [0.85, 0.78, 0.82], + [0.87, 0.80, 0.84], + [0.91, 0.84, 0.88], + [0.89, 0.82, 0.86], + [0.93, 0.86, 0.90], + ] + ) + + performance_scores = performance_matrix.mean(axis=1).tolist() + protocol_variations = [0.1, 0.15, 0.25, 0.20, 0.30] + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + performance_matrix=performance_matrix, + return_all_indicators=True, + ) + + # Should include PSI score + self.assertIn("psi_score", results) + self.assertIsInstance(results["psi_score"], float) + self.assertGreaterEqual( + results["psi_score"], 0, "PSI score should be non-negative" + ) + + def test_with_constraint_matrix(self): + """Test computation with constraint matrix (CCS calculation).""" + performance_scores = [0.85, 0.87, 0.91, 0.89, 0.93] + protocol_variations = [0.1, 0.15, 0.25, 0.20, 0.30] + + constraint_matrix = np.array( + [[512, 0.001], [550, 0.0015], [600, 0.002], [580, 0.0018], [620, 0.0022]] + ) + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + constraint_matrix=constraint_matrix, + return_all_indicators=True, + ) + + # Should include CCS score + self.assertIn("ccs_score", results) + self.assertIsInstance(results["ccs_score"], float) + self.assertGreaterEqual( + results["ccs_score"], 0, "CCS score should be between 0 and 1" + ) + self.assertLessEqual( + results["ccs_score"], 1, "CCS score should be between 0 and 1" + ) + + def test_full_computation_with_all_indicators(self): + """Test full computation with all three indicators.""" + performance_matrix = np.array( + [ + [0.85, 0.78, 0.82], + [0.87, 0.80, 0.84], + [0.91, 0.84, 0.88], + [0.89, 0.82, 0.86], + [0.93, 0.86, 0.90], + ] + ) + + constraint_matrix = np.array( + [[512, 0.7], [550, 0.75], [600, 0.8], [580, 0.78], [620, 0.82]] + ) + + performance_scores = performance_matrix.mean(axis=1).tolist() + protocol_variations = [0.1, 0.15, 0.25, 0.20, 0.30] + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + performance_matrix=performance_matrix, + constraint_matrix=constraint_matrix, + return_all_indicators=True, + ) + + # Should include all indicators + self.assertIn("cbd_score", results) + self.assertIn("rho_pc", results) + self.assertIn("psi_score", results) + self.assertIn("ccs_score", results) + self.assertIn("risk_level", results) + self.assertIn("recommendation", results) + + def test_constant_performance(self): + """Test with constant performance (should handle gracefully).""" + performance_scores = [0.85, 0.85, 0.85, 0.85, 0.85] + protocol_variations = [0.1, 0.2, 0.3, 0.4, 0.5] + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + ) + + # Should handle constant performance + self.assertIn("cbd_score", results) + # Correlation should be 0 
or NaN (handled as 0) + self.assertLessEqual( + abs(results["rho_pc"]), + 0.1, + "Constant performance should have near-zero correlation", + ) + + def test_output_types(self): + """Test that all output values have correct types.""" + performance_scores = [0.85, 0.87, 0.91, 0.89, 0.93] + protocol_variations = [0.1, 0.15, 0.25, 0.20, 0.30] + + results = self.metric.compute( + performance_scores=performance_scores, + protocol_variations=protocol_variations, + ) + + self.assertIsInstance(results["cbd_score"], float) + self.assertIsInstance(results["rho_pc"], float) + self.assertIsInstance(results["risk_level"], str) + self.assertIsInstance(results["recommendation"], str) + + def test_legacy_interface(self): + """Test backward compatibility with predictions/references interface.""" + # Some users might use the standard predictions/references interface + predictions = [0.85, 0.87, 0.91, 0.89, 0.93] + references = [0.1, 0.15, 0.25, 0.20, 0.30] + + results = self.metric.compute(predictions=predictions, references=references) + + self.assertIn("cbd_score", results) + self.assertIn("rho_pc", results) + + +if __name__ == "__main__": + unittest.main() From d72a1054241a41f5eb22a0e4628b016396217be9 Mon Sep 17 00:00:00 2001 From: hongping Date: Wed, 19 Nov 2025 13:14:10 +0800 Subject: [PATCH 2/2] docs: add mathematical foundations and enhance limitations disclosure - Add comprehensive MATHEMATICAL_FOUNDATIONS.md with rigorous definitions - Enhance README.md with experimental status warning - Remove overstated claims and add detailed limitations - Clarify thresholds are heuristic, not validated - Emphasize correlation vs. causation distinction --- .../MATHEMATICAL_FOUNDATIONS.md | 566 ++++++++++++++++++ metrics/circular_bias_integrity/README.md | 141 ++++- 2 files changed, 695 insertions(+), 12 deletions(-) create mode 100644 metrics/circular_bias_integrity/MATHEMATICAL_FOUNDATIONS.md diff --git a/metrics/circular_bias_integrity/MATHEMATICAL_FOUNDATIONS.md b/metrics/circular_bias_integrity/MATHEMATICAL_FOUNDATIONS.md new file mode 100644 index 00000000..063106ad --- /dev/null +++ b/metrics/circular_bias_integrity/MATHEMATICAL_FOUNDATIONS.md @@ -0,0 +1,566 @@ +# Mathematical Foundations of CBD Framework + +**Status**: Research Prototype +**Last Updated**: 2025-11-19 + +--- + +## Overview + +This document provides rigorous mathematical definitions, statistical properties, and limitations of the three core indicators in the Circular Bias Detection (CBD) framework. These definitions are essential for understanding the theoretical basis and practical constraints of the metric. + +--- + +## 1. PSI (Performance-Structure Independence) + +### 1.1 Mathematical Definition + +**Purpose**: Measure parameter stability across evaluation periods to detect iterative protocol adjustments. + +**Formal Definition**: + +$$\text{PSI} = \frac{1}{K}\sum_{k=1}^{K} \text{PSI}_k$$ + +where for each algorithm $k$: + +$$\text{PSI}_k = \frac{1}{T-1}\sum_{t=1}^{T-1}|\theta_{k,t+1} - \theta_{k,t}|$$ + +**Notation**: +- $K$: Number of algorithms being evaluated +- $T$: Number of evaluation time periods (must be $T \geq 2$) +- $\theta_{k,t}$: Parameter value (or performance proxy) for algorithm $k$ at time $t$ +- $|\cdot|$: Absolute value (L1 norm for scalar parameters) + +### 1.2 Statistical Properties + +**Expected Value**: +Under the null hypothesis of no systematic parameter drift: +$$E[\text{PSI}] \approx \sigma_{\epsilon} \sqrt{\frac{2}{\pi}}$$ +where $\sigma_{\epsilon}$ is the standard deviation of measurement noise. 
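+
+For concreteness, a minimal sketch of the Section 1.1 computation (equivalent to the metric's internal `_compute_psi`), applied to an illustrative (T, K) performance matrix; the numbers are examples only, not real evaluation data:
+
+```python
+import numpy as np
+
+def psi(performance_matrix):
+    """PSI per Section 1.1: mean absolute first difference, averaged over algorithms."""
+    theta = np.asarray(performance_matrix, dtype=float)          # shape (T, K)
+    per_algorithm = np.abs(np.diff(theta, axis=0)).mean(axis=0)  # PSI_k for each algorithm k
+    return float(per_algorithm.mean())
+
+# Illustrative matrix: T=5 evaluation periods, K=3 algorithms
+perf = np.array([
+    [0.85, 0.78, 0.82],
+    [0.87, 0.80, 0.84],
+    [0.91, 0.84, 0.88],
+    [0.89, 0.82, 0.86],
+    [0.93, 0.86, 0.90],
+])
+print(f"PSI = {psi(perf):.3f}")  # PSI = 0.030
+```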
+ +**Variance**: +$$\text{Var}(\text{PSI}) \approx \frac{\sigma_{\epsilon}^2}{T-1}\left(1 - \frac{2}{\pi}\right)$$ + +**Distribution**: +- For large $T$, PSI approximately follows a folded normal distribution +- For small $T$ (< 10), distribution is highly skewed +- Assumes independence of consecutive measurements (often violated in practice) + +### 1.3 Sensitivity Analysis + +**Sample Size Sensitivity**: +- $T < 3$: Unreliable (insufficient data for trend detection) +- $3 \leq T < 10$: Moderate reliability (high variance) +- $T \geq 10$: Good reliability (variance decreases as $O(1/T)$) + +**Outlier Sensitivity**: +- PSI uses L1 norm (absolute difference), which is more robust than L2 norm +- However, single outliers can still significantly affect results when $T$ is small +- Recommendation: Use robust estimators (e.g., median absolute deviation) for $T < 10$ + +**Measurement Noise**: +$$\text{Signal-to-Noise Ratio} = \frac{|\Delta\theta_{\text{true}}|}{\sigma_{\epsilon}}$$ +- SNR < 1: PSI dominated by noise +- SNR ≥ 3: PSI reliably detects parameter changes + +### 1.4 Limitations and Assumptions + +**Assumptions**: +1. Parameters are continuous and comparable across time periods +2. Measurement errors are independent and identically distributed (i.i.d.) +3. No external factors systematically affect parameter values +4. Parameter changes are independent across algorithms + +**Known Limitations**: +1. **Cannot distinguish legitimate vs. circular parameter changes**: High PSI may indicate either: + - Circular bias (iterative protocol tuning) + - Legitimate model improvement (algorithm evolution) + - External factors (hardware changes, dataset updates) + +2. **Assumes stationarity**: Does not account for expected parameter drift in rapidly evolving research areas + +3. **Scale dependency**: PSI magnitude depends on parameter scale. Recommendation: Normalize parameters before computing PSI + +4. **Temporal correlation**: Consecutive evaluations are often correlated, violating independence assumption + +**False Positive Scenarios**: +- Active research areas with rapid algorithm development +- Exploratory phases with intentional parameter sweeps +- Hardware or software environment changes + +**False Negative Scenarios**: +- Small, incremental parameter adjustments (below measurement noise) +- Parameter changes in orthogonal dimensions (not captured by scalar proxy) +- Circular bias through dataset selection rather than parameter tuning + +--- + +## 2. CCS (Constraint-Consistency Score) + +### 2.1 Mathematical Definition + +**Purpose**: Measure consistency of constraint specifications across evaluation periods. + +**Formal Definition**: + +$$\text{CCS} = \frac{1}{p}\sum_{j=1}^{p} \text{CCS}_j$$ + +where for each constraint type $j$: + +$$\text{CCS}_j = \begin{cases} +\frac{1}{1 + CV_j} & \text{if } \mu_j \neq 0 \text{ and } \sigma_j > 0 \\ +1 & \text{if } \sigma_j = 0 \text{ (constant constraint)} \\ +0 & \text{if } \mu_j = 0 \text{ (undefined CV)} +\end{cases}$$ + +where the coefficient of variation is: + +$$CV_j = \frac{\sigma_j}{|\mu_j|}$$ + +**Notation**: +- $p$: Number of constraint types (e.g., batch size, learning rate, data split ratio) +- $T$: Number of evaluation time periods +- $c_{j,t}$: Value of constraint $j$ at time $t$ +- $\mu_j = \frac{1}{T}\sum_{t=1}^{T} c_{j,t}$: Mean of constraint $j$ +- $\sigma_j = \sqrt{\frac{1}{T-1}\sum_{t=1}^{T}(c_{j,t} - \mu_j)^2}$: Standard deviation of constraint $j$ + +### 2.2 Why Coefficient of Variation? + +**Rationale**: +1. 
**Scale invariance**: CV is dimensionless, allowing comparison across constraints with different units + - Example: Can compare consistency of batch size (range: 16-512) with learning rate (range: 0.0001-0.01) + +2. **Interpretability**: + - $CV = 0$: Perfect consistency (no variation) + - $CV < 0.1$: High consistency (< 10% relative variation) + - $CV > 1$: Low consistency (variation exceeds mean) + +3. **Standard statistical measure**: Well-established in quality control and reliability engineering + +4. **Monotonic transformation**: $\text{CCS}_j = \frac{1}{1+CV_j}$ maps CV to [0,1] range with intuitive interpretation + +**Transformation Properties**: +$$\lim_{CV_j \to 0} \text{CCS}_j = 1 \quad \text{(perfect consistency)}$$ +$$\lim_{CV_j \to \infty} \text{CCS}_j = 0 \quad \text{(no consistency)}$$ + +### 2.3 Applicability to Different Constraint Types + +**Continuous Constraints** (e.g., learning rate, dropout rate): +- ✅ **Well-suited**: CV directly measures relative variability +- ⚠️ **Caution**: Assumes constraints are on ratio scale (meaningful zero point) + +**Discrete Constraints** (e.g., batch size, number of layers): +- ✅ **Acceptable**: CV still meaningful if values span reasonable range +- ⚠️ **Caution**: For small discrete sets (e.g., {16, 32, 64}), CV may overestimate inconsistency + +**Categorical Constraints** (e.g., optimizer type, activation function): +- ❌ **Not applicable**: CV undefined for nominal categories +- 🔄 **Alternative**: Use entropy or mode frequency for categorical constraints + +**Mixed Constraints**: +- Compute CCS separately for continuous and discrete subsets +- Weight by importance or use separate reporting + +### 2.4 Statistical Properties + +**Expected Value** (under random constraint selection): +$$E[\text{CCS}] \approx \frac{1}{1 + E[CV]}$$ + +For uniform random constraints on $[a, b]$: +$$E[CV] = \frac{1}{\sqrt{3}} \approx 0.577$$ +$$E[\text{CCS}] \approx 0.634$$ + +**Variance**: +$$\text{Var}(\text{CCS}) \approx \frac{\text{Var}(CV)}{(1 + E[CV])^4}$$ + +**Distribution**: +- CCS is bounded in [0, 1] +- Distribution depends on underlying constraint distribution +- For normally distributed constraints, CCS follows a transformed inverse gamma distribution + +### 2.5 Limitations and Assumptions + +**Assumptions**: +1. Constraints are continuous or ordinal (meaningful ordering) +2. Constraint values are on ratio or interval scale +3. Mean constraint value is non-zero and meaningful +4. Constraint changes are independent across types + +**Known Limitations**: + +1. **Zero-mean constraints**: + - Problem: CV undefined when $\mu_j = 0$ + - Current handling: Set $\text{CCS}_j = 0$ (conservative) + - Better approach: Use alternative measures (e.g., median absolute deviation) + +2. **Near-zero mean constraints**: + - Problem: CV becomes unstable and arbitrarily large + - Example: Learning rate oscillating around 0.0001 ± 0.00005 gives $CV = 0.5$, but around 0.00001 ± 0.000005 gives $CV = 0.5$ (same relative variation) + - Recommendation: Use absolute variation for constraints with $|\mu_j| < \epsilon$ + +3. **Non-normal distributions**: + - CV assumes approximate normality + - For skewed distributions, CV may not accurately reflect consistency + - Alternative: Use robust measures (e.g., quartile coefficient of dispersion) + +4. **Temporal trends**: + - CCS treats all time periods equally + - Does not detect systematic trends (e.g., gradual increase in batch size) + - May miss circular bias that manifests as trends rather than random variation + +5. 
**Constraint interdependencies**: + - CCS assumes constraints are independent + - In practice, constraints often covary (e.g., batch size and learning rate) + - Does not capture multivariate consistency patterns + +**False Positive Scenarios**: +- Legitimate exploration of constraint space +- Adaptive constraints (e.g., learning rate schedules) +- Hardware-driven constraint changes (e.g., GPU memory limits) + +**False Negative Scenarios**: +- Consistent but biased constraint choices +- Constraints changed in correlated manner (maintaining ratios) +- Circular bias through constraint combinations rather than individual values + +--- + +## 3. ρ_PC (Protocol-Performance Correlation) + +### 3.1 Mathematical Definition + +**Purpose**: Measure correlation between protocol changes and performance improvements. + +**Formal Definition**: + +$$\rho_{PC} = \frac{\text{Cov}(P, C)}{\sigma_P \sigma_C}$$ + +where: +$$\text{Cov}(P, C) = \frac{1}{T-1}\sum_{t=1}^{T}(P_t - \bar{P})(C_t - \bar{C})$$ + +**Notation**: +- $P_t$: Performance score at time $t$ (aggregated across algorithms if multiple) +- $C_t$: Protocol variation magnitude at time $t$ +- $\bar{P} = \frac{1}{T}\sum_{t=1}^{T} P_t$: Mean performance +- $\bar{C} = \frac{1}{T}\sum_{t=1}^{T} C_t$: Mean protocol variation +- $\sigma_P, \sigma_C$: Standard deviations of performance and protocol variation + +**Implementation**: Uses Pearson correlation coefficient from `scipy.stats.pearsonr` + +### 3.2 Statistical Properties + +**Range**: $\rho_{PC} \in [-1, 1]$ +- $\rho_{PC} = 1$: Perfect positive correlation (performance increases with protocol changes) +- $\rho_{PC} = 0$: No linear correlation +- $\rho_{PC} = -1$: Perfect negative correlation (performance decreases with protocol changes) + +**Statistical Significance**: +Under null hypothesis $H_0: \rho = 0$, the test statistic: +$$t = \rho_{PC}\sqrt{\frac{T-2}{1-\rho_{PC}^2}} \sim t_{T-2}$$ + +**P-value interpretation**: +- $p < 0.01$: Strong evidence of correlation +- $0.01 \leq p < 0.05$: Moderate evidence +- $p \geq 0.05$: Insufficient evidence (but does not prove independence) + +**Power Analysis**: +Minimum sample size for detecting correlation $\rho$ with power $1-\beta$ at significance $\alpha$: +$$T \approx \left(\frac{z_{\alpha/2} + z_{\beta}}{0.5\ln\frac{1+\rho}{1-\rho}}\right)^2 + 3$$ + +Example: To detect $\rho = 0.5$ with 80% power at $\alpha = 0.05$: +$$T \approx 29$$ + +### 3.3 Interpretation and Causation + +**Critical Distinction**: Correlation ≠ Causation + +**High |ρ_PC| may indicate**: +1. ✅ **Circular bias**: Protocol tuned to optimize performance on test set +2. ✅ **Legitimate improvement**: Protocol changes reflect genuine algorithmic advances +3. ✅ **Confounding factors**: External factors affect both protocol and performance +4. ✅ **Reverse causation**: Poor performance motivates protocol changes + +**Causal Inference Requirements** (not provided by ρ_PC alone): +- Temporal precedence: Protocol changes must precede performance changes +- Mechanism: Plausible causal pathway from protocol to performance +- No confounders: Other explanations ruled out +- Dose-response: Larger protocol changes → larger performance changes +- Consistency: Pattern holds across different contexts + +**Recommendation**: Use ρ_PC as a **screening tool** to identify cases requiring deeper investigation, not as definitive evidence of circular bias. 
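+
+To make the screening use concrete, a minimal sketch (with illustrative numbers, not real evaluation data) that computes ρ_PC via `scipy.stats.pearsonr`, as the metric implementation does, and reproduces the Section 3.2 t-statistic; the Spearman line is an optional robustness check discussed in Section 3.4, not part of the current implementation:
+
+```python
+import numpy as np
+from scipy import stats
+
+# Illustrative series over T = 6 evaluation rounds
+performance = np.array([0.82, 0.85, 0.83, 0.88, 0.87, 0.90])
+protocol_variation = np.array([0.15, 0.25, 0.18, 0.35, 0.30, 0.40])
+
+rho_pc, p_value = stats.pearsonr(performance, protocol_variation)
+
+# Significance test from Section 3.2: t = rho * sqrt((T-2) / (1 - rho^2)), df = T - 2
+T = len(performance)
+t_stat = rho_pc * np.sqrt((T - 2) / (1 - rho_pc**2))
+p_manual = 2 * stats.t.sf(abs(t_stat), df=T - 2)
+
+# Rank-based robustness check (less sensitive to outliers than Pearson)
+rho_spearman, _ = stats.spearmanr(performance, protocol_variation)
+
+print(f"rho_PC = {rho_pc:.3f}, p = {p_value:.3f} (manual p = {p_manual:.3f}), Spearman = {rho_spearman:.3f}")
+```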
+ +### 3.4 Sensitivity Analysis + +**Sample Size Sensitivity**: +- $T < 3$: Correlation undefined or unreliable +- $3 \leq T < 10$: High variance, wide confidence intervals +- $10 \leq T < 30$: Moderate reliability +- $T \geq 30$: Good reliability for detecting medium-to-large correlations + +**Outlier Sensitivity**: +- Pearson correlation is sensitive to outliers +- Single extreme point can dominate correlation +- Recommendation: Use Spearman rank correlation for robustness (not currently implemented) + +**Linearity Assumption**: +- Pearson correlation only detects linear relationships +- May miss non-linear protocol-performance relationships +- Example: U-shaped relationship (optimal protocol in middle range) + +**Aggregation Effects**: +- When averaging performance across algorithms, individual patterns may be masked +- Recommendation: Compute ρ_PC separately for each algorithm when possible + +### 3.5 Limitations and Assumptions + +**Assumptions**: +1. Linear relationship between protocol variation and performance +2. Bivariate normal distribution (for significance testing) +3. Homoscedasticity (constant variance) +4. Independence of observations (often violated in time series) +5. No measurement error in protocol variation quantification + +**Known Limitations**: + +1. **Quantifying protocol variation**: + - Problem: No standard method to quantify "protocol change magnitude" + - Current approach: User-provided values (subjective) + - Impact: ρ_PC validity depends on quality of protocol quantification + +2. **Temporal autocorrelation**: + - Consecutive evaluations are often correlated + - Violates independence assumption + - Inflates Type I error rate (false positives) + - Recommendation: Use time series methods (e.g., Durbin-Watson test) + +3. **Multiple testing**: + - Computing ρ_PC for multiple algorithm pairs increases false positive rate + - No correction for multiple comparisons currently implemented + - Recommendation: Apply Bonferroni or FDR correction + +4. **Direction ambiguity**: + - High |ρ_PC| could indicate: + - Protocol → Performance (circular bias) + - Performance → Protocol (reactive adjustment) + - Confounding → Both + - Cannot distinguish without additional information + +5. **Non-linear relationships**: + - Pearson correlation only captures linear associations + - May miss important non-linear patterns + - Alternative: Use mutual information or distance correlation + +**False Positive Scenarios**: +- Legitimate co-evolution of methods and protocols +- External factors (e.g., hardware improvements) affecting both +- Natural progression in research (better methods → better protocols) + +**False Negative Scenarios**: +- Non-linear protocol-performance relationships +- Circular bias through discrete protocol choices (not captured by continuous correlation) +- Time-lagged effects (protocol change at $t$ affects performance at $t+k$) + +--- + +## 4. 
Threshold Calibration + +### 4.1 Current Thresholds (Heuristic) + +**CBD Score** (derived from |ρ_PC| × 100): +- **0-30**: Low risk +- **30-60**: Moderate risk +- **60-100**: High risk + +**PSI** (not used in current simplified implementation): +- Threshold: 0.15 (heuristic, domain-dependent) + +**CCS** (not used in current simplified implementation): +- Threshold: 0.85 (heuristic, domain-dependent) + +### 4.2 Lack of Empirical Validation + +**Critical Limitation**: These thresholds are **not validated** through: +- Large-scale empirical studies +- Cross-domain validation +- ROC curve analysis +- Cost-benefit optimization + +**Current Status**: Thresholds are **educated guesses** based on: +- Intuition about correlation strength +- Analogies to other fields (e.g., effect size guidelines in psychology) +- Limited synthetic data experiments + +### 4.3 Domain-Specific Calibration Needed + +**Recommendation**: Users should calibrate thresholds for their specific domain by: + +1. **Collect ground truth data**: + - Known biased evaluations (positive examples) + - Known unbiased evaluations (negative examples) + +2. **Compute ROC curve**: + - Vary threshold from 0 to 100 + - Plot True Positive Rate vs. False Positive Rate + - Select threshold based on desired trade-off + +3. **Cross-validation**: + - Use k-fold cross-validation to estimate generalization performance + - Report confidence intervals for threshold performance + +4. **Cost-benefit analysis**: + - Assign costs to false positives (flagging legitimate research) + - Assign costs to false negatives (missing circular bias) + - Optimize threshold to minimize expected cost + +### 4.4 Adaptive Thresholds (Future Work) + +**Concept**: Adjust thresholds based on data characteristics + +**Potential Approaches**: +1. **Quantile-based**: Use empirical distribution of scores +2. **Bayesian**: Update thresholds as more data becomes available +3. **Context-aware**: Adjust based on research area, evaluation type, etc. + +**Status**: Not currently implemented. Requires substantial research and validation. + +--- + +## 5. Integrated Framework Limitations + +### 5.1 Indicator Independence + +**Assumption**: PSI, CCS, and ρ_PC measure independent aspects of circular bias + +**Reality**: Indicators are likely correlated: +- High PSI may lead to high |ρ_PC| (parameter instability correlates with performance) +- Low CCS may lead to high |ρ_PC| (inconsistent constraints correlate with performance) + +**Impact**: +- Combining indicators may not provide independent evidence +- "Majority vote" approach may be misleading +- Need multivariate analysis to understand joint distribution + +### 5.2 Weighting and Aggregation + +**Current Approach**: Simple average or majority vote + +**Limitations**: +- Assumes equal importance of indicators (may not be true) +- Does not account for measurement uncertainty +- No principled way to combine conflicting signals + +**Better Approaches** (not implemented): +- Weighted combination based on reliability +- Probabilistic framework (e.g., Bayesian network) +- Machine learning classifier trained on labeled data + +### 5.3 Temporal Dynamics + +**Current Approach**: Static analysis of time series + +**Missing**: +- Trend detection (is bias increasing or decreasing?) +- Change point detection (when did circular bias start?) +- Forecasting (will bias continue to increase?) 
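+
+As an illustration of what such an extension might look like (a hedged sketch under assumed inputs, not part of the current implementation), one simple option is to track |ρ_PC| over sliding windows and apply a Kendall-tau trend test to ask whether the protocol-performance coupling is strengthening over time; the series, noise level, and window length below are all illustrative assumptions:
+
+```python
+import numpy as np
+from scipy import stats
+
+def rolling_abs_rho_pc(performance, protocol, window=6):
+    """|rho_PC| computed in each sliding window of the evaluation history."""
+    performance = np.asarray(performance, dtype=float)
+    protocol = np.asarray(protocol, dtype=float)
+    values = []
+    for start in range(len(performance) - window + 1):
+        r, _ = stats.pearsonr(performance[start:start + window],
+                              protocol[start:start + window])
+        values.append(abs(r))
+    return np.array(values)
+
+# Synthetic history where the protocol-performance coupling strengthens over time
+rng = np.random.default_rng(0)
+protocol = np.linspace(0.1, 0.5, 20)
+performance = 0.80 + 0.10 * np.linspace(0.0, 1.0, 20) ** 2 + rng.normal(0.0, 0.01, 20)
+
+windowed = rolling_abs_rho_pc(performance, protocol)
+tau, p = stats.kendalltau(np.arange(len(windowed)), windowed)
+print(f"Kendall tau of |rho_PC| across windows: {tau:.2f} (p = {p:.3f})")
+# A significantly positive tau suggests the coupling is strengthening, i.e., bias may be increasing.
+```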
+ +**Recommendation**: Incorporate time series analysis methods + +### 5.4 Causal Inference + +**Fundamental Limitation**: CBD framework is **correlational**, not **causal** + +**Cannot Answer**: +- Did protocol changes cause performance improvements? +- Would performance have improved without protocol changes? +- What is the counterfactual (performance under different protocols)? + +**Causal Methods Needed** (not implemented): +- Randomized controlled trials (RCTs) +- Instrumental variables +- Difference-in-differences +- Regression discontinuity + +--- + +## 6. Recommendations for Users + +### 6.1 Interpretation Guidelines + +1. **Use as screening tool**: CBD flags potential issues, not definitive proof +2. **Context matters**: Interpret scores in light of research area and practices +3. **Multiple lines of evidence**: Combine CBD with other evaluation integrity checks +4. **Manual inspection**: High scores warrant detailed protocol review +5. **Report uncertainty**: Always report confidence intervals and p-values + +### 6.2 Best Practices + +1. **Sufficient sample size**: Use $T \geq 10$ evaluation periods when possible +2. **Standardize inputs**: Normalize performance and protocol measures +3. **Document protocol changes**: Maintain detailed logs for post-hoc analysis +4. **Pre-register protocols**: Commit to evaluation protocol before seeing results +5. **Independent validation**: Use held-out test sets not used for protocol tuning + +### 6.3 When NOT to Use CBD + +1. **Exploratory research**: Early-stage research with intentional protocol exploration +2. **Small sample sizes**: $T < 5$ provides unreliable estimates +3. **Qualitative protocols**: When protocol changes cannot be meaningfully quantified +4. **Single evaluation**: CBD requires multiple evaluation periods + +--- + +## 7. Future Research Directions + +### 7.1 Theoretical Foundations + +1. **Formal statistical framework**: Develop rigorous hypothesis testing procedures +2. **Power analysis**: Determine minimum sample sizes for reliable detection +3. **Multivariate analysis**: Model joint distribution of indicators +4. **Causal inference**: Integrate causal discovery methods + +### 7.2 Empirical Validation + +1. **Large-scale studies**: Validate on diverse real-world datasets +2. **Cross-domain validation**: Test generalization across research areas +3. **Ground truth collection**: Build labeled dataset of biased/unbiased evaluations +4. **Threshold calibration**: Empirically determine optimal thresholds + +### 7.3 Methodological Extensions + +1. **Robust estimators**: Use methods resistant to outliers and violations of assumptions +2. **Non-linear detection**: Incorporate methods for non-linear relationships +3. **Time series methods**: Add trend detection and forecasting +4. **Bayesian framework**: Incorporate prior knowledge and uncertainty quantification + +--- + +## 8. Conclusion + +The CBD framework provides a **preliminary, heuristic approach** to detecting circular reasoning bias in AI evaluation. While based on established statistical methods, the framework has significant limitations: + +1. **Mathematical foundations**: Indicators have known statistical properties, but specific combinations and thresholds lack rigorous justification +2. **Empirical validation**: Limited validation on real-world data; effectiveness varies by domain +3. **Causal inference**: Framework is correlational; cannot establish causation +4. 
**Threshold calibration**: Current thresholds are heuristic and not validated + +**Recommendation**: Use CBD as a **screening tool** to identify evaluations warranting further investigation, not as definitive evidence of circular bias. Combine with other evaluation integrity practices (pre-registration, held-out test sets, independent validation) for robust conclusions. + +--- + +## References + +1. **Coefficient of Variation**: Abdi, H. (2010). Coefficient of variation. Encyclopedia of Research Design, 1, 169-171. + +2. **Pearson Correlation**: Pearson, K. (1895). Notes on regression and inheritance in the case of two parents. Proceedings of the Royal Society of London, 58, 240-242. + +3. **Effect Size Guidelines**: Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale, NJ: Erlbaum. + +4. **Correlation vs. Causation**: Pearl, J. (2009). Causality: Models, Reasoning and Inference (2nd ed.). Cambridge University Press. + +5. **Time Series Analysis**: Box, G. E., Jenkins, G. M., Reinsel, G. C., & Ljung, G. M. (2015). Time series analysis: forecasting and control (5th ed.). John Wiley & Sons. + +--- + +**Document Status**: Living document, subject to revision as research progresses. + +**Contributions Welcome**: We encourage community feedback and contributions to improve the mathematical rigor and empirical validation of the CBD framework. diff --git a/metrics/circular_bias_integrity/README.md b/metrics/circular_bias_integrity/README.md index 5846fc1a..c35b0bf0 100644 --- a/metrics/circular_bias_integrity/README.md +++ b/metrics/circular_bias_integrity/README.md @@ -1,5 +1,18 @@ # Metric Card for Circular Bias Detection (CBD) Integrity Score +## ⚠️ Experimental Status + +**This metric is a research prototype and should be considered experimental.** Key limitations: + +- **Mathematical foundations**: While based on established statistical methods, the specific combination and thresholds lack rigorous theoretical justification +- **Empirical validation**: Limited validation on real-world datasets; primarily tested on synthetic data +- **Threshold calibration**: Risk level thresholds (30, 60) are heuristic and not validated across diverse domains +- **Causal inference**: Measures correlation, not causation; cannot definitively prove circular bias + +**Recommendation**: Use as a **screening tool** to identify evaluations warranting further investigation, not as definitive evidence of circular bias. + +--- + ## Metric Description The **Circular Bias Detection (CBD) Integrity Score** is a meta-evaluation metric that measures the statistical integrity of AI evaluation processes. Unlike traditional metrics that measure model performance (e.g., accuracy, F1, BLEU), CBD measures whether the evaluation process itself is trustworthy and free from circular reasoning bias. @@ -84,31 +97,135 @@ print(f"CCS (Constraint-Consistency Score): {results['ccs_score']:.3f}") ### Output Values - **`cbd_score`** (`float`): Overall integrity score (0-100). Higher = more bias detected. - - 0-30: Low risk - - 30-60: Moderate risk - - 60-100: High risk + - 0-30: Low risk (evaluation appears statistically sound) + - 30-60: Moderate risk (some circular dependency detected) + - 60-100: High risk (significant circular bias detected) + - ⚠️ **Note**: Thresholds are heuristic guidelines, not validated standards. Interpret in context of your specific domain. - **`rho_pc`** (`float`): Protocol-Performance correlation (-1 to 1). 
+ - Measures linear correlation between protocol changes and performance + - High |ρ_PC| indicates correlation, not necessarily causation - **`risk_level`** (`str`): "LOW", "MODERATE", or "HIGH". -- **`recommendation`** (`str`): Actionable guidance. + - Based on heuristic thresholds; use as screening guidance +- **`recommendation`** (`str`): Actionable guidance based on detected risk level. - **`psi_score`** (`float`, optional): Parameter stability indicator. - **`ccs_score`** (`float`, optional): Constraint consistency indicator. +## Mathematical Foundations + +For rigorous mathematical definitions, statistical properties, and detailed limitations of each indicator, see: +- **[MATHEMATICAL_FOUNDATIONS.md](MATHEMATICAL_FOUNDATIONS.md)** - Comprehensive mathematical documentation + +### Core Indicators + +1. **ρ_PC (Protocol-Performance Correlation)** + - Pearson correlation between protocol variations and performance scores + - Range: [-1, 1]; values near ±1 indicate strong correlation + - **Limitation**: Measures correlation, not causation; sensitive to outliers + +2. **PSI (Performance-Structure Independence)** [Optional] + - Measures parameter stability across evaluation periods + - Formula: Average absolute difference in parameters over time + - **Limitation**: Cannot distinguish legitimate improvement from circular bias + +3. **CCS (Constraint-Consistency Score)** [Optional] + - Measures consistency of constraint specifications using coefficient of variation + - Range: [0, 1]; higher values indicate more consistency + - **Limitation**: Undefined for zero-mean constraints; assumes continuous constraints + +--- + ## Limitations and Bias -### Limitations +### Critical Limitations + +1. **Experimental Status**: This metric is a research prototype, not a validated production tool. + +2. **Threshold Validity**: + - Risk thresholds (30, 60) are **heuristic**, not empirically validated + - No cross-domain calibration performed + - Optimal thresholds likely vary by research area + - **Recommendation**: Calibrate thresholds for your specific domain + +3. **Correlation vs. Causation**: + - High ρ_PC indicates correlation, **not proof of circular bias** + - Possible explanations: legitimate improvement, confounding factors, reverse causation + - **Recommendation**: Use as screening tool, not definitive evidence + +4. **Protocol Quantification**: + - Requires user to quantify protocol variations (subjective) + - Quality of results depends on quality of quantification + - No standard method for quantifying "protocol change magnitude" + +5. **Minimum Data Requirements**: + - Requires ≥3 evaluation rounds (minimum) + - Reliable results need ≥10 rounds + - Small sample sizes produce high variance estimates + +6. **Statistical Assumptions**: + - Assumes linear relationships (may miss non-linear patterns) + - Assumes independence of observations (often violated in time series) + - Assumes bivariate normality for significance testing + - Sensitive to outliers and measurement noise + +7. 
**Limited Validation**: + - Primarily tested on synthetic data with known bias patterns + - Limited real-world validation across diverse domains + - Detection rates on real-world circular bias are **unknown** + +### False Positive Scenarios + +- **Legitimate research progress**: Natural co-evolution of methods and protocols +- **Exploratory research**: Intentional protocol exploration in early-stage research +- **External factors**: Hardware improvements, dataset updates affecting both protocol and performance +- **Reactive adjustments**: Poor performance motivating protocol changes (reverse causation) + +### False Negative Scenarios + +- **Non-linear bias**: Circular bias through non-linear protocol-performance relationships +- **Discrete protocols**: Bias through categorical protocol choices not captured by correlation +- **Dataset selection**: Circular bias through dataset curation rather than protocol tuning +- **Time-lagged effects**: Protocol changes at time t affecting performance at t+k + +### When NOT to Use + +1. **Exploratory research**: Early-stage research with intentional protocol exploration +2. **Small sample sizes**: T < 5 provides unreliable estimates +3. **Qualitative protocols**: When protocol changes cannot be meaningfully quantified +4. **Single evaluation**: CBD requires multiple evaluation periods +5. **Categorical protocols**: When protocols are primarily categorical rather than continuous + +--- + +## Validation Status + +### Current Validation + +- ✅ **Synthetic data**: Tested on synthetic datasets with injected bias patterns +- ⚠️ **Real-world data**: Limited validation on actual research evaluations +- ❌ **Cross-domain**: No systematic cross-domain validation +- ❌ **Threshold calibration**: Thresholds not empirically optimized -1. **Minimum Data Requirements**: Requires at least 3 evaluation rounds for reliable correlation analysis. More rounds (5-10+) provide more robust results. +### What We Know -2. **Protocol Quantification**: Users must quantify protocol variations, which can be subjective. Consider using normalized measures (e.g., percentage change in hyperparameters). +- CBD can detect **known, injected bias** in synthetic scenarios (as expected) +- Correlation-based approach is theoretically sound for **screening** +- Mathematical foundations are based on established statistical methods -3. **Correlation ≠ Causation**: High ρ_PC indicates correlation between protocol changes and performance, but doesn't prove causation. Manual inspection is recommended. +### What We Don't Know -4. **Simplified MVP**: This initial version focuses on ρ_PC as the primary indicator. Full CBD framework includes bootstrap confidence intervals and adaptive thresholds (available in the standalone library). +- **False positive rate** in real-world scenarios +- **False negative rate** for sophisticated circular bias +- **Optimal thresholds** for different research domains +- **Generalization** across diverse evaluation contexts +- **Comparative effectiveness** vs. other bias detection methods -### Bias Considerations +### Call for Contribution -- **False Positives**: Natural performance improvements during model development may be flagged as circular bias if correlated with protocol changes. -- **False Negatives**: Sophisticated circular bias (e.g., through dataset selection) may not be detected if protocol variations aren't properly quantified. 
+We welcome community contributions: +- Real-world test cases with ground truth labels +- Domain-specific threshold calibration studies +- Comparative evaluations vs. alternative methods +- Extensions to handle categorical protocols and non-linear relationships ## Citation