From 979fd8823fde27d2fa975c80c75356d41be3bc03 Mon Sep 17 00:00:00 2001 From: Eva Micankova Date: Wed, 28 Jan 2026 16:29:49 +0100 Subject: [PATCH 1/3] Enhance test quality --- Makefile | 6 +- pyproject.toml | 1 + pyrightconfig.json | 12 + script/__init__.py | 1 + script/compare_evaluations.py | 2 +- script/run_multi_provider_eval.py | 8 +- tests/conftest.py | 9 + tests/script/conftest.py | 212 ++++++++ tests/script/test_compare_evaluations.py | 232 ++++----- tests/script/test_run_multi_provider_eval.py | 398 ++++++--------- tests/unit/core/api/conftest.py | 42 ++ tests/unit/core/api/test_client.py | 159 +++--- tests/unit/core/api/test_streaming_parser.py | 101 ++-- tests/unit/core/config/test_models.py | 64 ++- tests/unit/core/llm/conftest.py | 29 ++ tests/unit/core/llm/test_custom.py | 19 +- tests/unit/core/llm/test_deepeval_manager.py | 30 +- tests/unit/core/llm/test_llm_manager.py | 65 +-- tests/unit/core/llm/test_manager.py | 26 +- tests/unit/core/metrics/conftest.py | 142 ++++++ tests/unit/core/metrics/custom/test_custom.py | 15 +- .../core/metrics/custom/test_tool_eval.py | 90 ++-- tests/unit/core/metrics/test_geval.py | 127 +++-- tests/unit/core/metrics/test_keywords_eval.py | 22 +- tests/unit/core/metrics/test_manager.py | 124 +++-- tests/unit/core/metrics/test_nlp.py | 170 ++----- tests/unit/core/models/test_api_additional.py | 49 +- tests/unit/core/models/test_data.py | 144 ++++-- .../core/models/test_system_additional.py | 56 ++- tests/unit/core/output/conftest.py | 95 ++++ tests/unit/core/output/test_final_coverage.py | 30 +- tests/unit/core/output/test_generator.py | 228 +++++---- tests/unit/core/output/test_statistics.py | 149 +++--- tests/unit/core/script/test_manager.py | 23 +- .../core/script/test_manager_additional.py | 37 +- tests/unit/core/system/test_env_validator.py | 59 ++- tests/unit/core/system/test_lazy_import.py | 12 +- tests/unit/core/system/test_loader.py | 34 +- tests/unit/core/system/test_setup.py | 45 +- tests/unit/core/system/test_ssl_certifi.py | 42 +- tests/unit/core/system/test_validator.py | 162 ++++-- tests/unit/pipeline/evaluation/conftest.py | 223 ++++++++ .../unit/pipeline/evaluation/test_amender.py | 30 +- tests/unit/pipeline/evaluation/test_errors.py | 20 +- .../pipeline/evaluation/test_evaluator.py | 268 +++++----- .../unit/pipeline/evaluation/test_pipeline.py | 88 ++-- .../pipeline/evaluation/test_processor.py | 476 ++++++++---------- tests/unit/runner/test_evaluation.py | 56 ++- 48 files changed, 2671 insertions(+), 1761 deletions(-) create mode 100644 pyrightconfig.json create mode 100644 script/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/script/conftest.py create mode 100644 tests/unit/core/api/conftest.py create mode 100644 tests/unit/core/llm/conftest.py create mode 100644 tests/unit/core/metrics/conftest.py create mode 100644 tests/unit/core/output/conftest.py create mode 100644 tests/unit/pipeline/evaluation/conftest.py diff --git a/Makefile b/Makefile index 135f0b7f..f99bea30 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed uv sync --group dev check-types: ## Checks type hints in sources - uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ + uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests black-check: uv run black . 
--check @@ -73,10 +73,10 @@ help: ## Show this help screen pylint: uv run pylint src - uv run pylint lsc_agent_eval/src + uv run pylint --disable=R0801 lsc_agent_eval/src tests pyright: - uv run pyright src lsc_agent_eval/src + uv run pyright src lsc_agent_eval/src tests docstyle: uv run pydocstyle -v . diff --git a/pyproject.toml b/pyproject.toml index 71c12676..d9efb241 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,7 @@ warn_required_dynamic_aliases = true [tool.pylint.MASTER] load-plugins = ["pylint_pydantic"] +init-hook = "import sys; sys.path.append('.')" [tool.ruff] [tool.ruff.lint.flake8-tidy-imports] diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 00000000..22193a37 --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,12 @@ +{ + "reportAttributeAccessIssue": "warning", + "executionEnvironments": [ + { + "root": "tests", + "reportAttributeAccessIssue": "none", + "extraPaths": [ + "." + ] + } + ] +} diff --git a/script/__init__.py b/script/__init__.py new file mode 100644 index 00000000..d6eec20f --- /dev/null +++ b/script/__init__.py @@ -0,0 +1 @@ +"""Script utilities for lightspeed-evaluation.""" diff --git a/script/compare_evaluations.py b/script/compare_evaluations.py index ac993598..be8e1699 100755 --- a/script/compare_evaluations.py +++ b/script/compare_evaluations.py @@ -421,7 +421,7 @@ def _check_confidence_interval_overlap( Returns: Dictionary containing overlap test results """ - result = { + result: dict[str, Any] = { "test_performed": False, "intervals_overlap": None, "significant": None, diff --git a/script/run_multi_provider_eval.py b/script/run_multi_provider_eval.py index 4cca522b..34a5d471 100755 --- a/script/run_multi_provider_eval.py +++ b/script/run_multi_provider_eval.py @@ -318,7 +318,7 @@ def _create_provider_model_configs(self) -> list[dict[str, Any]]: Returns: List of dictionaries with provider, model, and settings """ - configs = [] + configs: list[dict[str, Any]] = [] # Get providers from the config providers = self.providers_config.get("providers", {}) @@ -781,7 +781,7 @@ def _analyze_single_model( # Calculate score statistics if all_scores: - score_stats = { + score_stats: dict[str, Any] = { "mean": float(np.mean(all_scores)), "median": float(np.median(all_scores)), "std": float(np.std(all_scores)), @@ -818,10 +818,10 @@ def _analyze_single_model( logger.warning( "scipy not available, skipping confidence interval calculation" ) - score_stats["confidence_interval"] = None + score_stats["confidence_interval"] = None # type: ignore[assignment] else: # Single score - no confidence interval - score_stats["confidence_interval"] = None + score_stats["confidence_interval"] = None # type: ignore[assignment] else: score_stats = { "mean": 0.0, diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..f38a1ee1 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +"""Pytest configuration and fixtures for lightspeed-evaluation tests.""" + +import sys +from pathlib import Path + +# Add project root to Python path so we can import from script directory +project_root = Path(__file__).parent.parent +if str(project_root) not in sys.path: + sys.path.insert(0, str(project_root)) diff --git a/tests/script/conftest.py b/tests/script/conftest.py new file mode 100644 index 00000000..8ab273da --- /dev/null +++ b/tests/script/conftest.py @@ -0,0 +1,212 @@ +"""Pytest configuration and fixtures for script tests.""" + +from pathlib import Path +from typing import Any + +import pytest +import yaml + +from 
script.run_multi_provider_eval import MultiProviderEvaluationRunner + + +@pytest.fixture +def script_path() -> Path: + """Return the path to the compare_evaluations.py script.""" + # Test is in tests/script/, script is in project_root/script/ + return Path(__file__).parent.parent.parent / "script" / "compare_evaluations.py" + + +@pytest.fixture +def sample_evaluation_data() -> tuple[list[dict], list[dict]]: + """Return sample evaluation data for testing.""" + sample_results1 = [ + { + "conversation_group_id": "conv1", + "turn_id": "1", + "metric_identifier": "ragas:faithfulness", + "result": "PASS", + "score": 0.8, + "threshold": 0.7, + "execution_time": 1.0, + }, + { + "conversation_group_id": "conv1", + "turn_id": "2", + "metric_identifier": "ragas:faithfulness", + "result": "PASS", + "score": 0.9, + "threshold": 0.7, + "execution_time": 1.2, + }, + ] + + sample_results2 = [ + { + "conversation_group_id": "conv1", + "turn_id": "1", + "metric_identifier": "ragas:faithfulness", + "result": "PASS", + "score": 0.85, + "threshold": 0.7, + "execution_time": 1.1, + }, + { + "conversation_group_id": "conv1", + "turn_id": "2", + "metric_identifier": "ragas:faithfulness", + "result": "FAIL", + "score": 0.6, + "threshold": 0.7, + "execution_time": 1.0, + }, + ] + + return sample_results1, sample_results2 + + +@pytest.fixture +def temp_config_files(tmp_path: Path) -> dict: + """Create temporary configuration files for testing.""" + # Create multi_eval_config.yaml + providers_config = { + "providers": { + "openai": { + "models": ["gpt-4o-mini", "gpt-4-turbo"], + }, + "watsonx": { + "models": ["ibm/granite-13b-chat-v2"], + }, + }, + "settings": {"output_base": str(tmp_path / "eval_output")}, + } + providers_path = tmp_path / "multi_eval_config.yaml" + with open(providers_path, "w", encoding="utf-8") as f: + yaml.dump(providers_config, f) + + # Create system.yaml + system_config = { + "llm": { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.0, + }, + "api": {"enabled": False}, + "output": {"output_dir": "./eval_output"}, + } + system_path = tmp_path / "system.yaml" + with open(system_path, "w", encoding="utf-8") as f: + yaml.dump(system_config, f) + + # Create evaluation_data.yaml + eval_data = [ + { + "conversation_group_id": "test_conv", + "turns": [ + { + "turn_id": "turn_1", + "query": "Test query", + "response": "Test response", + "contexts": ["Context 1"], + "expected_response": "Expected", + "turn_metrics": ["ragas:response_relevancy"], + } + ], + } + ] + eval_path = tmp_path / "evaluation_data.yaml" + with open(eval_path, "w", encoding="utf-8") as f: + yaml.dump(eval_data, f) + + return { + "providers_config": providers_path, + "system_config": system_path, + "eval_data": eval_path, + "output_dir": tmp_path / "eval_output", + } + + +@pytest.fixture +def runner( # pylint: disable=redefined-outer-name + temp_config_files: dict, +) -> MultiProviderEvaluationRunner: + """Create a MultiProviderEvaluationRunner instance for testing.""" + return MultiProviderEvaluationRunner( + providers_config_path=str(temp_config_files["providers_config"]), + system_config_path=str(temp_config_files["system_config"]), + eval_data_path=str(temp_config_files["eval_data"]), + ) + + +@pytest.fixture +def sample_evaluation_summary() -> dict[str, Any]: + """Create a sample evaluation summary JSON for testing analysis.""" + return { + "timestamp": "2025-01-01T12:00:00", + "total_evaluations": 10, + "summary_stats": { + "overall": { + "TOTAL": 10, + "PASS": 8, + "FAIL": 2, + "ERROR": 0, + "pass_rate": 
80.0, # Percentage format + "fail_rate": 20.0, + "error_rate": 0.0, + }, + "by_metric": { + "ragas:faithfulness": { + "pass": 4, + "fail": 0, + "error": 0, + "pass_rate": 100.0, + "fail_rate": 0.0, + "error_rate": 0.0, + "score_statistics": { + "mean": 0.95, + "median": 0.95, + "std": 0.02, + "min": 0.92, + "max": 0.98, + "count": 4, + }, + }, + "ragas:response_relevancy": { + "pass": 4, + "fail": 2, + "error": 0, + "pass_rate": 66.67, + "fail_rate": 33.33, + "error_rate": 0.0, + "score_statistics": { + "mean": 0.75, + "median": 0.78, + "std": 0.12, + "min": 0.55, + "max": 0.88, + "count": 6, + }, + }, + }, + }, + "results": [ + { + "conversation_group_id": "conv1", + "turn_id": "turn1", + "metric_identifier": "ragas:faithfulness", + "result": "PASS", + "score": 0.95, + "threshold": 0.8, + "execution_time": 1.0, + }, + { + "conversation_group_id": "conv1", + "turn_id": "turn2", + "metric_identifier": "ragas:response_relevancy", + "result": "PASS", + "score": 0.85, + "threshold": 0.7, + "execution_time": 1.2, + }, + ] + * 5, # Repeat to get 10 results + } diff --git a/tests/script/test_compare_evaluations.py b/tests/script/test_compare_evaluations.py index 020704e9..e03bebdb 100755 --- a/tests/script/test_compare_evaluations.py +++ b/tests/script/test_compare_evaluations.py @@ -7,65 +7,15 @@ import sys from pathlib import Path +from typing import Any import pytest +from script.compare_evaluations import EvaluationComparison -@pytest.fixture -def script_path(): - """Return the path to the compare_evaluations.py script.""" - # Test is in tests/script/, script is in project_root/script/ - return Path(__file__).parent.parent.parent / "script" / "compare_evaluations.py" - - -@pytest.fixture -def sample_evaluation_data(): - """Return sample evaluation data for testing.""" - sample_results1 = [ - { - "conversation_group_id": "conv1", - "turn_id": "1", - "metric_identifier": "ragas:faithfulness", - "result": "PASS", - "score": 0.8, - "threshold": 0.7, - "execution_time": 1.0, - }, - { - "conversation_group_id": "conv1", - "turn_id": "2", - "metric_identifier": "ragas:faithfulness", - "result": "PASS", - "score": 0.9, - "threshold": 0.7, - "execution_time": 1.2, - }, - ] - - sample_results2 = [ - { - "conversation_group_id": "conv1", - "turn_id": "1", - "metric_identifier": "ragas:faithfulness", - "result": "PASS", - "score": 0.85, - "threshold": 0.7, - "execution_time": 1.1, - }, - { - "conversation_group_id": "conv1", - "turn_id": "2", - "metric_identifier": "ragas:faithfulness", - "result": "FAIL", - "score": 0.6, - "threshold": 0.7, - "execution_time": 1.0, - }, - ] - return sample_results1, sample_results2 - - -def create_sample_summary(results, timestamp="2025-01-01T00:00:00"): +def create_sample_summary( + results: list[dict[str, Any]], timestamp: str = "2025-01-01T00:00:00" +) -> dict[str, Any]: """Create a sample evaluation summary.""" return { "timestamp": timestamp, @@ -97,7 +47,10 @@ def create_sample_summary(results, timestamp="2025-01-01T00:00:00"): } -def test_basic_comparison(script_path, sample_evaluation_data): +def test_basic_comparison( + script_path: Path, + sample_evaluation_data: tuple[list[dict[str, Any]], list[dict[str, Any]]], +) -> None: """Test basic comparison functionality.""" sample_results1, sample_results2 = sample_evaluation_data @@ -109,9 +62,9 @@ def test_basic_comparison(script_path, sample_evaluation_data): file1 = Path(temp_dir) / "summary1.json" file2 = Path(temp_dir) / "summary2.json" - with open(file1, "w") as f: + with open(file1, "w", encoding="utf-8") as 
f: json.dump(summary1, f) - with open(file2, "w") as f: + with open(file2, "w", encoding="utf-8") as f: json.dump(summary2, f) # Test the script @@ -119,6 +72,7 @@ def test_basic_comparison(script_path, sample_evaluation_data): [sys.executable, str(script_path), str(file1), str(file2)], capture_output=True, text=True, + check=False, ) assert result.returncode == 0, f"Script failed with error: {result.stderr}" @@ -128,12 +82,15 @@ def test_basic_comparison(script_path, sample_evaluation_data): ), "Output should contain comparison report" -def test_invalid_arguments(script_path): +def test_invalid_arguments(script_path: Path) -> None: """Test error handling for invalid arguments.""" # Test with only one file result = subprocess.run( - [sys.executable, str(script_path), "file1.json"], capture_output=True, text=True + [sys.executable, str(script_path), "file1.json"], + capture_output=True, + text=True, + check=False, ) assert result.returncode != 0, "Script should fail with only one file" @@ -146,6 +103,7 @@ def test_invalid_arguments(script_path): [sys.executable, str(script_path), "file1.json", "file2.json", "file3.json"], capture_output=True, text=True, + check=False, ) assert result.returncode != 0, "Script should fail with three files" @@ -154,13 +112,14 @@ def test_invalid_arguments(script_path): ), f"Expected error message not found in stderr: {result.stderr}" -def test_nonexistent_files(script_path): +def test_nonexistent_files(script_path: Path) -> None: """Test error handling for nonexistent files.""" result = subprocess.run( [sys.executable, str(script_path), "nonexistent1.json", "nonexistent2.json"], capture_output=True, text=True, + check=False, ) assert result.returncode != 0, "Script should fail with nonexistent files" @@ -173,25 +132,21 @@ class TestEvaluationComparisonMethods: """Unit tests for EvaluationComparison internal methods.""" @pytest.fixture - def comparison_instance(self): + def comparison_instance(self) -> EvaluationComparison: """Create an EvaluationComparison instance for testing.""" - # Import here to avoid module loading issues - import sys - - # Add project root to path (tests/script/ -> tests/ -> project_root/) - sys.path.append(str(Path(__file__).parent.parent.parent)) - from script.compare_evaluations import EvaluationComparison - return EvaluationComparison(alpha=0.05) - def test_compare_score_distributions_basic(self, comparison_instance): + def test_compare_score_distributions_basic( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions with basic score data.""" # Test data based on normal distributions scores1 = [0.8, 0.9, 0.7, 0.85, 0.75, 0.88, 0.82, 0.79, 0.86, 0.81] scores2 = [0.6, 0.65, 0.55, 0.62, 0.58, 0.63, 0.59, 0.61, 0.64, 0.57] - result = comparison_instance._compare_score_distributions(scores1, scores2) - + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) # Check structure assert "run1_stats" in result assert "run2_stats" in result @@ -216,14 +171,18 @@ def test_compare_score_distributions_basic(self, comparison_instance): assert "p_value" in result["tests"]["mann_whitney_u"] assert "significant" in result["tests"]["mann_whitney_u"] - def test_compare_score_distributions_scipy_example(self, comparison_instance): + def test_compare_score_distributions_scipy_example( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions using scipy documentation examples.""" # Example inspired by 
scipy.stats.ttest_ind documentation # Two samples with different means scores1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] scores2 = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0] - result = comparison_instance._compare_score_distributions(scores1, scores2) + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) # The means should be 5.5 and 6.5 respectively assert abs(result["run1_stats"]["mean"] - 5.5) < 0.01 @@ -234,21 +193,27 @@ def test_compare_score_distributions_scipy_example(self, comparison_instance): # (though the exact p-values depend on the implementation) assert "tests" in result - def test_compare_score_distributions_identical_data(self, comparison_instance): + def test_compare_score_distributions_identical_data( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions with identical data.""" scores1 = [0.8, 0.8, 0.8, 0.8, 0.8] scores2 = [0.8, 0.8, 0.8, 0.8, 0.8] - result = comparison_instance._compare_score_distributions(scores1, scores2) + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) assert result["run1_stats"]["mean"] == result["run2_stats"]["mean"] assert result["mean_difference"] == 0.0 assert result["relative_change"] == 0.0 - def test_perform_pass_rate_tests_basic(self, comparison_instance): + def test_perform_pass_rate_tests_basic( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests with basic contingency table data.""" # Based on scipy.stats.chi2_contingency example - comparison = {"tests": {}} + comparison: dict = {"tests": {}} # Example: Run1 has 16 pass, 4 fail; Run2 has 18 pass, 2 fail test_data = { "pass_count1": 16, @@ -259,7 +224,9 @@ def test_perform_pass_rate_tests_basic(self, comparison_instance): "total2": 20, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Check that tests were performed assert "tests" in comparison @@ -269,11 +236,13 @@ def test_perform_pass_rate_tests_basic(self, comparison_instance): ) assert has_tests or "error" in comparison["tests"] - def test_perform_pass_rate_tests_scipy_chisquare_example(self, comparison_instance): + def test_perform_pass_rate_tests_scipy_chisquare_example( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests using scipy chisquare documentation example.""" # Based on the scipy documentation example: chisquare([16, 18, 16, 14, 12, 12]) # Convert to pass/fail format for our function - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 16, "fail_count1": 4, # Making total 20 @@ -283,7 +252,9 @@ def test_perform_pass_rate_tests_scipy_chisquare_example(self, comparison_instan "total2": 20, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Verify structure assert "tests" in comparison @@ -303,10 +274,12 @@ def test_perform_pass_rate_tests_scipy_chisquare_example(self, comparison_instan assert "p_value" in fisher assert "significant" in fisher - def test_perform_pass_rate_tests_edge_cases(self, comparison_instance): + def test_perform_pass_rate_tests_edge_cases( + self, comparison_instance: EvaluationComparison + ) -> None: """Test 
_perform_pass_rate_tests with edge cases.""" # Test with zero totals - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 0, "fail_count1": 0, @@ -316,61 +289,80 @@ def test_perform_pass_rate_tests_edge_cases(self, comparison_instance): "total2": 15, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Should handle gracefully (no tests performed or error recorded) assert "tests" in comparison - def test_check_confidence_interval_overlap_no_overlap(self, comparison_instance): + def test_check_confidence_interval_overlap_no_overlap( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with non-overlapping intervals.""" ci1 = {"low": 0.1, "high": 0.3, "mean": 0.2, "confidence_level": 0.95} ci2 = {"low": 0.7, "high": 0.9, "mean": 0.8, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + ci1, ci2 + ) assert "intervals_overlap" in result assert "significant" in result assert result["intervals_overlap"] is False assert result["significant"] is True - def test_check_confidence_interval_overlap_with_overlap(self, comparison_instance): + def test_check_confidence_interval_overlap_with_overlap( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with overlapping intervals.""" ci1 = {"low": 0.2, "high": 0.6, "mean": 0.4, "confidence_level": 0.95} ci2 = {"low": 0.4, "high": 0.8, "mean": 0.6, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + ci1, ci2 + ) assert "intervals_overlap" in result assert "significant" in result assert result["intervals_overlap"] is True assert result["significant"] is False - def test_check_confidence_interval_overlap_none_inputs(self, comparison_instance): + def test_check_confidence_interval_overlap_none_inputs( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with None inputs.""" - result = comparison_instance._check_confidence_interval_overlap(None, None) + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + None, None + ) assert "test_performed" in result # Should handle None inputs gracefully - might not perform test - def test_check_confidence_interval_overlap_partial_none(self, comparison_instance): + def test_check_confidence_interval_overlap_partial_none( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with one None input.""" ci1 = {"low": 0.2, "high": 0.6, "mean": 0.4, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap(ci1, None) - + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + ci1, None + ) assert "test_performed" in result # Should handle partial None inputs gracefully def test_compare_score_distributions_known_statistical_results( - self, comparison_instance - ): + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions with known statistical results.""" # Use data that should produce 
predictable statistical results # Two clearly different distributions scores1 = [1.0, 1.1, 1.2, 1.3, 1.4] # Mean ≈ 1.2, low variance scores2 = [2.0, 2.1, 2.2, 2.3, 2.4] # Mean ≈ 2.2, low variance - result = comparison_instance._compare_score_distributions(scores1, scores2) + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) # These should be significantly different assert abs(result["mean_difference"] - 1.0) < 0.01 @@ -386,11 +378,13 @@ def test_compare_score_distributions_known_statistical_results( assert result["tests"]["mann_whitney_u"]["p_value"] < 0.05 assert result["tests"]["mann_whitney_u"]["significant"] is True - def test_perform_pass_rate_tests_known_chi_square_result(self, comparison_instance): + def test_perform_pass_rate_tests_known_chi_square_result( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests with data that should produce known chi-square results.""" # Based on scipy documentation example for chi2_contingency # Create a 2x2 contingency table: [[16, 4], [18, 2]] - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 16, "fail_count1": 4, @@ -400,7 +394,9 @@ def test_perform_pass_rate_tests_known_chi_square_result(self, comparison_instan "total2": 20, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Verify the chi-square test was performed and has reasonable results if "chi_square" in comparison["tests"]: @@ -415,11 +411,11 @@ def test_perform_pass_rate_tests_known_chi_square_result(self, comparison_instan assert 0 <= chi_square["p_value"] <= 1 # p-value is a probability def test_perform_pass_rate_tests_fisher_exact_small_sample( - self, comparison_instance - ): + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests with small sample sizes suitable for Fisher exact test.""" # Small sample sizes where Fisher exact test is more appropriate - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 3, "fail_count1": 2, @@ -429,7 +425,9 @@ def test_perform_pass_rate_tests_fisher_exact_small_sample( "total2": 5, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Verify Fisher exact test results if "fisher_exact" in comparison["tests"]: @@ -440,8 +438,8 @@ def test_perform_pass_rate_tests_fisher_exact_small_sample( assert 0 <= fisher["p_value"] <= 1 # p-value is a probability def test_check_confidence_interval_overlap_exact_boundaries( - self, comparison_instance - ): + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with exact boundary conditions.""" # Test case where intervals just touch at boundaries ci1 = {"low": 0.1, "high": 0.5, "mean": 0.3, "confidence_level": 0.95} @@ -452,7 +450,9 @@ def test_check_confidence_interval_overlap_exact_boundaries( "confidence_level": 0.95, } - result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + ci1, ci2 + ) # Touching at boundary might be considered overlap or not, depending on implementation assert "intervals_overlap" in result @@ -460,12 +460,16 @@ def 
test_check_confidence_interval_overlap_exact_boundaries( assert isinstance(result["intervals_overlap"], bool) assert isinstance(result["significant"], bool) - def test_compare_score_distributions_single_values(self, comparison_instance): + def test_compare_score_distributions_single_values( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions with single values (edge case).""" scores1 = [0.8] scores2 = [0.6] - result = comparison_instance._compare_score_distributions(scores1, scores2) + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) # Should handle single values gracefully assert result["run1_stats"]["count"] == 1 @@ -479,10 +483,12 @@ def test_compare_score_distributions_single_values(self, comparison_instance): # Statistical tests might not be performed with single values assert "tests" in result - def test_perform_pass_rate_tests_extreme_ratios(self, comparison_instance): + def test_perform_pass_rate_tests_extreme_ratios( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests with extreme pass rate differences.""" # One run with 100% pass rate, another with 0% pass rate - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 10, "fail_count1": 0, @@ -492,7 +498,9 @@ def test_perform_pass_rate_tests_extreme_ratios(self, comparison_instance): "total2": 10, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Should handle extreme cases assert "tests" in comparison diff --git a/tests/script/test_run_multi_provider_eval.py b/tests/script/test_run_multi_provider_eval.py index 103950a0..ef0057dc 100644 --- a/tests/script/test_run_multi_provider_eval.py +++ b/tests/script/test_run_multi_provider_eval.py @@ -2,94 +2,24 @@ """Pytest tests for run_multi_provider_eval.py script.""" import json -import sys from pathlib import Path +from typing import Any from unittest.mock import patch +import tempfile as temp_module +import logging +import multiprocessing +import shutil import pytest import yaml -# Add the script directory to the path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "script")) - -from run_multi_provider_eval import MultiProviderEvaluationRunner - - -@pytest.fixture -def temp_config_files(tmp_path): - """Create temporary configuration files for testing.""" - # Create multi_eval_config.yaml - providers_config = { - "providers": { - "openai": { - "models": ["gpt-4o-mini", "gpt-4-turbo"], - }, - "watsonx": { - "models": ["ibm/granite-13b-chat-v2"], - }, - }, - "settings": {"output_base": str(tmp_path / "eval_output")}, - } - providers_path = tmp_path / "multi_eval_config.yaml" - with open(providers_path, "w", encoding="utf-8") as f: - yaml.dump(providers_config, f) - - # Create system.yaml - system_config = { - "llm": { - "provider": "openai", - "model": "gpt-4o-mini", - "temperature": 0.0, - }, - "api": {"enabled": False}, - "output": {"output_dir": "./eval_output"}, - } - system_path = tmp_path / "system.yaml" - with open(system_path, "w", encoding="utf-8") as f: - yaml.dump(system_config, f) - - # Create evaluation_data.yaml - eval_data = [ - { - "conversation_group_id": "test_conv", - "turns": [ - { - "turn_id": "turn_1", - "query": "Test query", - "response": "Test response", - "contexts": ["Context 1"], - "expected_response": "Expected", - 
"turn_metrics": ["ragas:response_relevancy"], - } - ], - } - ] - eval_path = tmp_path / "evaluation_data.yaml" - with open(eval_path, "w", encoding="utf-8") as f: - yaml.dump(eval_data, f) - - return { - "providers_config": providers_path, - "system_config": system_path, - "eval_data": eval_path, - "output_dir": tmp_path / "eval_output", - } - - -@pytest.fixture -def runner(temp_config_files): - """Create a MultiProviderEvaluationRunner instance for testing.""" - return MultiProviderEvaluationRunner( - providers_config_path=str(temp_config_files["providers_config"]), - system_config_path=str(temp_config_files["system_config"]), - eval_data_path=str(temp_config_files["eval_data"]), - ) +from script.run_multi_provider_eval import MultiProviderEvaluationRunner class TestMultiProviderEvaluationRunnerInit: """Tests for MultiProviderEvaluationRunner initialization.""" - def test_init_success(self, temp_config_files): + def test_init_success(self, temp_config_files: dict[str, Path]) -> None: """Test successful initialization of the runner.""" runner = MultiProviderEvaluationRunner( providers_config_path=str(temp_config_files["providers_config"]), @@ -103,9 +33,9 @@ def test_init_success(self, temp_config_files): assert runner.system_config_path == Path(temp_config_files["system_config"]) assert runner.eval_data_path == Path(temp_config_files["eval_data"]) assert runner.output_base.exists() - assert runner.results == [] + assert not runner.results - def test_init_config_not_found(self, temp_config_files): + def test_init_config_not_found(self, temp_config_files: dict[str, Path]) -> None: """Test initialization fails when any config file is missing.""" with pytest.raises(FileNotFoundError, match="Providers config not found"): MultiProviderEvaluationRunner( @@ -114,7 +44,9 @@ def test_init_config_not_found(self, temp_config_files): eval_data_path=str(temp_config_files["eval_data"]), ) - def test_max_workers_from_constructor(self, temp_config_files): + def test_max_workers_from_constructor( + self, temp_config_files: dict[str, Path] + ) -> None: """Test max_workers configured via constructor argument.""" runner = MultiProviderEvaluationRunner( providers_config_path=str(temp_config_files["providers_config"]), @@ -124,7 +56,9 @@ def test_max_workers_from_constructor(self, temp_config_files): ) assert runner.max_workers == 4 - def test_max_workers_from_config_file(self, temp_config_files, tmp_path): + def test_max_workers_from_config_file( + self, temp_config_files: dict[str, Path], tmp_path: Path + ) -> None: """Test max_workers configured via config file.""" # Create config with max_workers setting config_with_workers = { @@ -147,7 +81,9 @@ def test_max_workers_from_config_file(self, temp_config_files, tmp_path): ) assert runner.max_workers == 6 - def test_max_workers_string_coercion(self, temp_config_files, tmp_path): + def test_max_workers_string_coercion( + self, temp_config_files: dict[str, Path], tmp_path: Path + ) -> None: """Test max_workers string value from YAML is coerced to int.""" # Create config with string max_workers config_with_string = { @@ -171,7 +107,9 @@ def test_max_workers_string_coercion(self, temp_config_files, tmp_path): assert runner.max_workers == 4 assert isinstance(runner.max_workers, int) - def test_max_workers_invalid_value(self, temp_config_files, tmp_path): + def test_max_workers_invalid_value( + self, temp_config_files: dict[str, Path], tmp_path: Path + ) -> None: """Test max_workers with invalid value raises clear error.""" # Create config with invalid max_workers 
config_invalid = { @@ -194,7 +132,9 @@ def test_max_workers_invalid_value(self, temp_config_files, tmp_path): eval_data_path=str(temp_config_files["eval_data"]), ) - def test_max_workers_minimum_value(self, temp_config_files): + def test_max_workers_minimum_value( + self, temp_config_files: dict[str, Path] + ) -> None: """Test max_workers is enforced to be at least 1.""" runner = MultiProviderEvaluationRunner( providers_config_path=str(temp_config_files["providers_config"]), @@ -213,10 +153,12 @@ def test_max_workers_minimum_value(self, temp_config_files): assert runner2.max_workers == 1 # Should be clamped to 1 def test_resource_warning_high_thread_count( - self, temp_config_files, tmp_path, caplog - ): + self, + temp_config_files: dict[str, Path], + tmp_path: Path, + caplog: pytest.LogCaptureFixture, + ) -> None: """Test warning is logged when total threads is very high.""" - import logging # Create system config with high max_threads system_config = { @@ -248,11 +190,12 @@ def test_resource_warning_high_thread_count( assert runner.max_workers == 4 def test_no_resource_warning_reasonable_config( - self, temp_config_files, tmp_path, caplog - ): + self, + temp_config_files: dict[str, Path], + tmp_path: Path, + caplog: pytest.LogCaptureFixture, + ) -> None: """Test no warning with reasonable thread count.""" - import logging - import multiprocessing # Calculate safe thread count based on actual CPU count cpu_count = multiprocessing.cpu_count() @@ -280,50 +223,66 @@ def test_no_resource_warning_reasonable_config( ) # Check no warning was logged + total_threads = max_workers * max_threads assert not any( "High resource usage detected" in record.message for record in caplog.records - ), f"Expected no warning with {max_workers} workers × {max_threads} threads = {max_workers * max_threads} on {cpu_count} CPUs" + ), ( + f"Expected no warning: {max_workers} workers × {max_threads} " + f"threads = {total_threads} on {cpu_count} CPUs" + ) assert runner.max_workers == max_workers class TestLoadYAML: """Tests for _load_yaml method.""" - def test_load_valid_yaml(self, runner, temp_config_files): + def test_load_valid_yaml( + self, runner: MultiProviderEvaluationRunner, temp_config_files: dict[str, Path] + ) -> None: """Test loading a valid YAML file.""" - config = runner._load_yaml(temp_config_files["providers_config"]) + config = runner._load_yaml( # pylint: disable=protected-access + temp_config_files["providers_config"] + ) assert isinstance(config, dict) assert "providers" in config assert "openai" in config["providers"] assert "models" in config["providers"]["openai"] assert "settings" in config - def test_load_invalid_yaml(self, runner, tmp_path): + def test_load_invalid_yaml( + self, runner: MultiProviderEvaluationRunner, tmp_path: Path + ) -> None: """Test loading an invalid YAML file.""" invalid_yaml = tmp_path / "invalid.yaml" with open(invalid_yaml, "w", encoding="utf-8") as f: f.write("invalid: yaml: content: [") with pytest.raises(ValueError, match="Error parsing YAML file"): - runner._load_yaml(invalid_yaml) + runner._load_yaml(invalid_yaml) # pylint: disable=protected-access - def test_load_yaml_non_dict_type(self, runner, tmp_path): + def test_load_yaml_non_dict_type( + self, runner: MultiProviderEvaluationRunner, tmp_path: Path + ) -> None: """Test that YAML files not containing dictionaries are rejected.""" list_yaml = tmp_path / "list.yaml" with open(list_yaml, "w", encoding="utf-8") as f: yaml.dump(["item1", "item2", "item3"], f) with pytest.raises(ValueError, match="must be a 
mapping, got list"): - runner._load_yaml(list_yaml) + runner._load_yaml(list_yaml) # pylint: disable=protected-access -class TestCreateProviderModelConfigs: +class TestCreateProviderModelConfigs: # pylint: disable=too-few-public-methods """Tests for _create_provider_model_configs method.""" - def test_create_configs_multiple_providers(self, runner): + def test_create_configs_multiple_providers( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test creating configs with multiple providers.""" - configs = runner._create_provider_model_configs() + configs = ( + runner._create_provider_model_configs() # pylint: disable=protected-access + ) assert len(configs) == 3 # 2 openai models + 1 watsonx model @@ -345,21 +304,25 @@ def test_create_configs_multiple_providers(self, runner): class TestCreateModifiedSystemConfig: """Tests for _create_modified_system_config method.""" - def test_llm_config_stays_constant(self, runner): + def test_llm_config_stays_constant( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that LLM judge config is NOT modified (stays constant for fair comparison).""" original_llm_provider = runner.system_config["llm"]["provider"] original_llm_model = runner.system_config["llm"]["model"] - modified = runner._create_modified_system_config( - provider_id="watsonx", - model="ibm/granite-13b-chat-v2", + modified = ( + runner._create_modified_system_config( # pylint: disable=protected-access + provider_id="watsonx", + model="ibm/granite-13b-chat-v2", + ) ) # LLM judge should remain unchanged assert modified["llm"]["provider"] == original_llm_provider assert modified["llm"]["model"] == original_llm_model - def test_api_config_is_modified(self, temp_config_files): + def test_api_config_is_modified(self, temp_config_files: dict[str, Path]) -> None: """Test that API config is modified when API is enabled.""" # Create system config with API enabled system_config = { @@ -385,9 +348,11 @@ def test_api_config_is_modified(self, temp_config_files): eval_data_path=str(temp_config_files["eval_data"]), ) - modified = runner._create_modified_system_config( - provider_id="watsonx", - model="ibm/granite-13b-chat-v2", + modified = ( + runner._create_modified_system_config( # pylint: disable=protected-access + provider_id="watsonx", + model="ibm/granite-13b-chat-v2", + ) ) # API config should be modified with provider and model only @@ -403,11 +368,15 @@ def test_api_config_is_modified(self, temp_config_files): class TestCreateTempSystemConfig: """Tests for _create_temp_system_config method.""" - def test_create_temp_config_file(self, runner): + def test_create_temp_config_file( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that a temporary config file is created.""" - temp_path = runner._create_temp_system_config( - provider_id="openai", - model="gpt-4o-mini", + temp_path = ( + runner._create_temp_system_config( # pylint: disable=protected-access + provider_id="openai", + model="gpt-4o-mini", + ) ) try: @@ -426,32 +395,36 @@ def test_create_temp_config_file(self, runner): if temp_path.exists(): temp_path.unlink() - def test_temp_config_cleanup_on_yaml_dump_failure(self, runner, tmp_path): + def test_temp_config_cleanup_on_yaml_dump_failure( + self, + runner: MultiProviderEvaluationRunner, + ) -> None: """Test that temp file is cleaned up when yaml.dump() fails.""" - import tempfile as temp_module # Track the temp file path that gets created created_temp_path = None original_named_temp_file = temp_module.NamedTemporaryFile - def 
track_temp_file(*args, **kwargs): + def track_temp_file(*args: Any, **kwargs: Any) -> Any: nonlocal created_temp_path - temp_file = original_named_temp_file(*args, **kwargs) + temp_file = original_named_temp_file( # pylint: disable=consider-using-with + *args, **kwargs + ) created_temp_path = Path(temp_file.name) return temp_file # Mock NamedTemporaryFile to track the created file with patch( - "run_multi_provider_eval.tempfile.NamedTemporaryFile", + "script.run_multi_provider_eval.tempfile.NamedTemporaryFile", side_effect=track_temp_file, ): # Mock yaml.dump to raise an exception with patch( - "run_multi_provider_eval.yaml.dump", + "script.run_multi_provider_eval.yaml.dump", side_effect=Exception("YAML dump failed"), ): with pytest.raises(Exception, match="YAML dump failed"): - runner._create_temp_system_config( + runner._create_temp_system_config( # pylint: disable=protected-access provider_id="openai", model="gpt-4o-mini", ) @@ -464,15 +437,20 @@ def track_temp_file(*args, **kwargs): not created_temp_path.exists() ), "Temp file should have been cleaned up" - def test_temp_config_sanitizes_special_characters(self, runner): + def test_temp_config_sanitizes_special_characters( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that special characters in provider_id and model are sanitized.""" - temp_path = runner._create_temp_system_config( - provider_id="open..ai//test", - model="gpt:4o-mini/special", + temp_path = ( + runner._create_temp_system_config( # pylint: disable=protected-access + provider_id="open..ai//test", + model="gpt:4o-mini/special", + ) ) try: - # Verify filename doesn't contain path separators or colons (except drive letter on Windows) + # Verify filename doesn't contain path separators or colons + # (except drive letter on Windows) assert "/" not in temp_path.name # On some systems, : might appear in drive letters on Windows, so we're lenient # The key is that path traversal characters are neutralized @@ -486,7 +464,9 @@ class TestPathTraversalSecurity: """Tests for path traversal security.""" @pytest.fixture - def runner(self, temp_config_files): + def runner( + self, temp_config_files: dict[str, Path] + ) -> MultiProviderEvaluationRunner: """Create a runner instance for testing.""" return MultiProviderEvaluationRunner( providers_config_path=str(temp_config_files["providers_config"]), @@ -494,14 +474,16 @@ def runner(self, temp_config_files): eval_data_path=str(temp_config_files["eval_data"]), ) - def test_path_traversal_blocked_in_provider_id(self, runner): + def test_path_traversal_blocked_in_provider_id( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that path traversal in provider_id is sanitized.""" with patch( - "run_multi_provider_eval.run_evaluation", + "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, ): # Attempt path traversal in provider_id - result = runner._run_single_evaluation( + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="malicious", provider_id="../../etc", model="test", @@ -517,18 +499,18 @@ def test_path_traversal_blocked_in_provider_id(self, runner): # Cleanup if output_path.exists(): - import shutil - shutil.rmtree(output_path.parent, ignore_errors=True) - def test_path_traversal_blocked_in_model(self, runner): + def test_path_traversal_blocked_in_model( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that path traversal in model name is sanitized.""" with patch( - 
"run_multi_provider_eval.run_evaluation", + "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, ): # Attempt path traversal in model - result = runner._run_single_evaluation( + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="openai", provider_id="openai", model="../../../etc/passwd", @@ -543,22 +525,22 @@ def test_path_traversal_blocked_in_model(self, runner): # Cleanup if output_path.exists(): - import shutil - shutil.rmtree(output_path.parent.parent, ignore_errors=True) class TestRunSingleEvaluation: """Tests for _run_single_evaluation method.""" - def test_run_single_evaluation_success(self, runner): + def test_run_single_evaluation_success( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test successful single evaluation.""" # Mock run_evaluation to return a successful summary with patch( - "run_multi_provider_eval.run_evaluation", + "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2, "ERROR": 0}, ) as mock_run_eval: - result = runner._run_single_evaluation( + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -572,11 +554,13 @@ def test_run_single_evaluation_success(self, runner): assert "duration_seconds" in result mock_run_eval.assert_called_once() - def test_run_single_evaluation_failure(self, runner): + def test_run_single_evaluation_failure( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test evaluation failure handling.""" # Mock run_evaluation to return None (failure) - with patch("run_multi_provider_eval.run_evaluation", return_value=None): - result = runner._run_single_evaluation( + with patch("script.run_multi_provider_eval.run_evaluation", return_value=None): + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -585,14 +569,16 @@ def test_run_single_evaluation_failure(self, runner): assert result["success"] is False assert result["error"] == "Evaluation returned None (failed)" - def test_run_single_evaluation_invalid_summary(self, runner): + def test_run_single_evaluation_invalid_summary( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test evaluation with invalid summary structure.""" # Mock run_evaluation to return a summary missing required keys with patch( - "run_multi_provider_eval.run_evaluation", + "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2}, # Missing ERROR key ): - result = runner._run_single_evaluation( + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -603,10 +589,12 @@ def test_run_single_evaluation_invalid_summary(self, runner): assert "summary" not in result -class TestRunEvaluations: +class TestRunEvaluations: # pylint: disable=too-few-public-methods """Tests for run_evaluations method.""" - def test_run_evaluations_sequential(self, runner): + def test_run_evaluations_sequential( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test sequential evaluation execution.""" # Force sequential mode runner.max_workers = 1 @@ -626,10 +614,12 @@ def test_run_evaluations_sequential(self, runner): assert mock_single_eval.call_count == 3 -class TestGenerateSummary: +class TestGenerateSummary: # pylint: disable=too-few-public-methods """Tests for generate_summary method.""" - def 
test_generate_summary_mixed_results(self, runner): + def test_generate_summary_mixed_results( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test summary generation with mixed results.""" runner.results = [ {"success": True, "provider_id": "openai", "model": "gpt-4o-mini"}, @@ -644,87 +634,15 @@ def test_generate_summary_mixed_results(self, runner): assert summary["success_rate"] == "50.0%" -@pytest.fixture -def sample_evaluation_summary(): - """Create a sample evaluation summary JSON for testing analysis.""" - return { - "timestamp": "2025-01-01T12:00:00", - "total_evaluations": 10, - "summary_stats": { - "overall": { - "TOTAL": 10, - "PASS": 8, - "FAIL": 2, - "ERROR": 0, - "pass_rate": 80.0, # Percentage format - "fail_rate": 20.0, - "error_rate": 0.0, - }, - "by_metric": { - "ragas:faithfulness": { - "pass": 4, - "fail": 0, - "error": 0, - "pass_rate": 100.0, - "fail_rate": 0.0, - "error_rate": 0.0, - "score_statistics": { - "mean": 0.95, - "median": 0.95, - "std": 0.02, - "min": 0.92, - "max": 0.98, - "count": 4, - }, - }, - "ragas:response_relevancy": { - "pass": 4, - "fail": 2, - "error": 0, - "pass_rate": 66.67, - "fail_rate": 33.33, - "error_rate": 0.0, - "score_statistics": { - "mean": 0.75, - "median": 0.78, - "std": 0.12, - "min": 0.55, - "max": 0.88, - "count": 6, - }, - }, - }, - }, - "results": [ - { - "conversation_group_id": "conv1", - "turn_id": "turn1", - "metric_identifier": "ragas:faithfulness", - "result": "PASS", - "score": 0.95, - "threshold": 0.8, - "execution_time": 1.0, - }, - { - "conversation_group_id": "conv1", - "turn_id": "turn2", - "metric_identifier": "ragas:response_relevancy", - "result": "PASS", - "score": 0.85, - "threshold": 0.7, - "execution_time": 1.2, - }, - ] - * 5, # Repeat to get 10 results - } - - class TestBestModelAnalysis: """Tests for best model analysis functionality.""" def test_analyze_model_performance( - self, runner, tmp_path, sample_evaluation_summary - ): + self, + runner: MultiProviderEvaluationRunner, + tmp_path: Path, + sample_evaluation_summary: dict[str, Any], + ) -> None: """Test successful model performance analysis.""" # Setup: Create evaluation summary files model_dir = tmp_path / "eval_output" / "openai" / "gpt-4o-mini" @@ -753,25 +671,33 @@ def test_analyze_model_performance( assert stats["overall"]["passed"] == 8 assert 0.0 <= stats["composite_score"] <= 1.0 - def test_percentage_to_decimal_conversion(self, runner, sample_evaluation_summary): + def test_percentage_to_decimal_conversion( + self, runner: MultiProviderEvaluationRunner, sample_evaluation_summary: dict + ) -> None: """Test that percentage rates (80.0) convert to decimals (0.8).""" - stats = runner._analyze_single_model("test/model", sample_evaluation_summary) + stats = runner._analyze_single_model( # pylint: disable=protected-access + "test/model", sample_evaluation_summary + ) # Verify percentage conversion assert abs(stats["overall"]["pass_rate"] - 0.8) < 0.01 assert 0.0 <= stats["overall"]["pass_rate"] <= 1.0 - def test_composite_score(self, runner): + def test_composite_score(self, runner: MultiProviderEvaluationRunner) -> None: """Test composite score calculation.""" # Perfect model should get score of 1.0 - perfect = runner._calculate_composite_score(1.0, 0.0, 1.0, 1.0) + perfect = runner._calculate_composite_score( # pylint: disable=protected-access + 1.0, 0.0, 1.0, 1.0 + ) assert abs(perfect - 1.0) < 0.0001 # Poor model should get score of 0.0 - poor = runner._calculate_composite_score(0.0, 1.0, 0.0, 0.0) + poor = 
runner._calculate_composite_score(  # pylint: disable=protected-access
+                0.0, 1.0, 0.0, 0.0
+            )
         assert poor == 0.0
 
-    def test_model_ranking(self, runner):
+    def test_model_ranking(self, runner: MultiProviderEvaluationRunner) -> None:
         """Test models are ranked by composite score."""
         runner.model_stats = {
             "model1": {"composite_score": 0.85},
@@ -786,7 +712,9 @@ def test_model_ranking(self, runner):
         assert ranked[1][0] == "model1"  # Second: 0.85
         assert ranked[2][0] == "model3"  # Lowest: 0.70
 
-    def test_save_analysis_to_yaml(self, runner, tmp_path):
+    def test_save_analysis_to_yaml(
+        self, runner: MultiProviderEvaluationRunner, tmp_path: Path
+    ) -> None:
         """Test saving analysis results to YAML file."""
         runner.output_base = tmp_path
         runner.model_stats = {
@@ -806,7 +734,9 @@ def test_save_analysis_to_yaml(self, runner, tmp_path):
         assert data["best_model"]["model"] == "model1"
         assert data["best_model"]["composite_score"] == 0.85
 
-    def test_print_report(self, runner, capsys):
+    def test_print_report(
+        self, runner: MultiProviderEvaluationRunner, capsys: pytest.CaptureFixture[str]
+    ) -> None:
         """Test statistical comparison report output."""
         runner.model_stats = {
             "model1": {
diff --git a/tests/unit/core/api/conftest.py b/tests/unit/core/api/conftest.py
new file mode 100644
index 00000000..244e678c
--- /dev/null
+++ b/tests/unit/core/api/conftest.py
@@ -0,0 +1,42 @@
+"""Pytest configuration and fixtures for api tests."""
+
+from typing import Any
+
+import pytest
+
+from pytest_mock import MockerFixture
+from lightspeed_evaluation.core.models import APIConfig
+
+
+@pytest.fixture
+def api_config() -> APIConfig:
+    """Create test API config."""
+    return APIConfig(
+        enabled=True,
+        api_base="http://localhost:8080",
+        version="v1",
+        endpoint_type="query",
+        timeout=30,
+        cache_enabled=False,
+    )
+
+
+@pytest.fixture
+def basic_api_config() -> APIConfig:
+    """Create basic API configuration for streaming."""
+    return APIConfig(
+        enabled=True,
+        api_base="http://localhost:8080",
+        endpoint_type="streaming",
+        timeout=30,
+        provider="openai",
+        model="gpt-4",
+        cache_enabled=False,
+    )
+
+
+@pytest.fixture
+def mock_response(mocker: MockerFixture) -> Any:
+    """Create a mock streaming response."""
+    response = mocker.Mock()
+    return response
diff --git a/tests/unit/core/api/test_client.py b/tests/unit/core/api/test_client.py
index 7224c0dc..caa7d2b3 100644
--- a/tests/unit/core/api/test_client.py
+++ b/tests/unit/core/api/test_client.py
@@ -1,45 +1,21 @@
 """Unit tests for core API client module."""
 
+from pathlib import Path
 import pytest
+import httpx
+from pytest_mock import MockerFixture
+from pydantic import ValidationError
 
 from lightspeed_evaluation.core.models import APIConfig, APIResponse
 from lightspeed_evaluation.core.system.exceptions import APIError
 from lightspeed_evaluation.core.api.client import APIClient
 
 
-@pytest.fixture
-def api_config():
-    """Create test API config."""
-    return APIConfig(
-        enabled=True,
-        api_base="http://localhost:8080",
-        version="v1",
-        endpoint_type="query",
-        timeout=30,
-        cache_enabled=False,
-    )
-
-
-@pytest.fixture
-def basic_api_config():
-    """Create basic API configuration for streaming."""
-    return APIConfig(
-        enabled=True,
-        api_base="http://localhost:8080",
-        endpoint_type="streaming",
-        timeout=30,
-        provider="openai",
-        model="gpt-4",
-        cache_enabled=False,
-    )
-
-
 class TestAPIClient:
     """Unit tests for APIClient."""
 
-    def test_initialization_unsupported_endpoint_type(self):
+    def test_initialization_unsupported_endpoint_type(self) -> None:
         """Test initialization fails with unsupported endpoint type."""
-        from pydantic import ValidationError
 
         # Pydantic will validate the endpoint_type, so this should raise ValidationError
         with pytest.raises(ValidationError, match="Endpoint type must be one of"):
@@ -51,7 +27,9 @@ def test_initialization_unsupported_endpoint_type(self):
                 timeout=30,
             )
 
-    def test_query_standard_endpoint_success(self, api_config, mocker):
+    def test_query_standard_endpoint_success(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test successful query to standard endpoint."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -79,7 +57,9 @@ def test_query_standard_endpoint_success(self, api_config, mocker):
         assert result.conversation_id == "conv_123"
         assert result.contexts == ["Context 1"]
 
-    def test_query_with_conversation_id(self, api_config, mocker):
+    def test_query_with_conversation_id(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query with existing conversation_id."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -107,7 +87,9 @@ def test_query_with_conversation_id(self, api_config, mocker):
         request_data = call_kwargs[1]["json"]
         assert request_data["conversation_id"] == "conv_123"
 
-    def test_query_with_attachments(self, api_config, mocker):
+    def test_query_with_attachments(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query with attachments."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -137,9 +119,10 @@ def test_query_with_attachments(self, api_config, mocker):
         assert request_data["attachments"][0]["content"] == "file1.txt"
         assert request_data["attachments"][1]["content"] == "file2.pdf"
 
-    def test_query_http_error(self, api_config, mocker):
+    def test_query_http_error(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query handling HTTP errors."""
-        import httpx
 
         mock_response = mocker.Mock()
         mock_response.status_code = 500
@@ -162,9 +145,10 @@ def test_query_http_error(self, api_config, mocker):
         with pytest.raises(APIError, match="API error: 500"):
             client.query("Test query")
 
-    def test_query_timeout_error(self, api_config, mocker):
+    def test_query_timeout_error(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query handling timeout."""
-        import httpx
 
         mock_client = mocker.Mock()
         mock_client.post.side_effect = httpx.TimeoutException("Timeout")
@@ -180,7 +164,9 @@ def test_query_timeout_error(self, api_config, mocker):
         with pytest.raises(APIError, match="timeout"):
             client.query("Test query")
 
-    def test_query_missing_response_field(self, api_config, mocker):
+    def test_query_missing_response_field(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query handling missing response field."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -203,7 +189,7 @@ def test_query_missing_response_field(self, api_config, mocker):
         with pytest.raises(APIError, match="missing 'response' field"):
             client.query("Test query")
 
-    def test_query_streaming_endpoint(self, mocker):
+    def test_query_streaming_endpoint(self, mocker: MockerFixture) -> None:
         """Test query to streaming endpoint."""
         config = APIConfig(
             enabled=True,
@@ -247,9 +233,10 @@
         assert result.response == "Streamed response"
         assert result.conversation_id == "conv_123"
 
-    def test_handle_response_errors_non_200(self, api_config, mocker):
+    def test_handle_response_errors_non_200(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test _handle_response_errors with non-200 status."""
-        import httpx
 
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -261,9 +248,13 @@
         mock_response.read.return_value = b'{"detail": "Not found"}'
 
         with pytest.raises(httpx.HTTPStatusError):
-            client._handle_response_errors(mock_response)
+            client._handle_response_errors(  # pylint: disable=protected-access
+                mock_response
+            )
 
-    def test_extract_error_message_with_detail(self, api_config, mocker):
+    def test_extract_error_message_with_detail(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test _extract_error_message with detail field."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -272,10 +263,14 @@
         mock_response = mocker.Mock()
         mock_response.read.return_value = b'{"detail": "Error message"}'
 
-        error_msg = client._extract_error_message(mock_response)
+        error_msg = client._extract_error_message(  # pylint: disable=protected-access
+            mock_response
+        )
         assert "Error message" in error_msg
 
-    def test_extract_error_message_with_nested_detail(self, api_config, mocker):
+    def test_extract_error_message_with_nested_detail(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test _extract_error_message with nested detail."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -286,11 +281,15 @@
             b'{"detail": {"response": "Error", "cause": "Reason"}}'
         )
 
-        error_msg = client._extract_error_message(mock_response)
+        error_msg = client._extract_error_message(  # pylint: disable=protected-access
+            mock_response
+        )
         assert "Error" in error_msg
         assert "Reason" in error_msg
 
-    def test_standard_query_formats_tool_calls(self, api_config, mocker):
+    def test_standard_query_formats_tool_calls(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test that standard query formats tool calls correctly."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -325,7 +324,9 @@
 class TestAPIClientConfiguration:
     """Additional tests for APIClient configuration and initialization."""
 
-    def test_initialization_streaming_endpoint(self, basic_api_config, mocker):
+    def test_initialization_streaming_endpoint(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test client initialization with streaming endpoint."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -336,7 +337,9 @@
         assert client.timeout == 30
         assert client.cache is None
 
-    def test_initialization_with_cache(self, tmp_path, mocker):
+    def test_initialization_with_cache(
+        self, tmp_path: Path, mocker: MockerFixture
+    ) -> None:
         """Test client initialization with cache enabled."""
         config = APIConfig(
             enabled=True,
@@ -357,7 +360,9 @@
         assert client.cache is not None
         mock_cache.assert_called_once_with(str(tmp_path / "test_cache"))
 
-    def test_validate_endpoint_type_valid(self, basic_api_config, mocker):
+    def test_validate_endpoint_type_valid(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test validation with valid endpoint type."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -365,7 +370,9 @@
         client = APIClient(basic_api_config)
         assert client.endpoint_type == "streaming"
 
-    def test_setup_client_with_api_key(self, basic_api_config, mocker):
+    def test_setup_client_with_api_key(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test client setup includes API key from environment."""
         mocker.patch.dict("os.environ", {"API_KEY": "test_secret_key"})
         mock_client = mocker.Mock()
@@ -379,7 +386,9 @@
         # Verify headers were updated (should include Authorization header)
         assert mock_client.headers.update.call_count >= 1
 
-    def test_query_requires_initialized_client(self, basic_api_config, mocker):
+    def test_query_requires_initialized_client(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query fails if client not initialized."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -389,33 +398,43 @@
         with pytest.raises(APIError, match="not initialized"):
             client.query("test query")
 
-    def test_prepare_request_basic(self, basic_api_config, mocker):
+    def test_prepare_request_basic(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test request preparation with basic parameters."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
 
         client = APIClient(basic_api_config)
-        request = client._prepare_request("What is Python?")
+        request = client._prepare_request(  # pylint: disable=protected-access
+            "What is Python?"
+        )
 
         assert request.query == "What is Python?"
         assert request.provider == "openai"
         assert request.model == "gpt-4"
 
-    def test_prepare_request_with_conversation_id(self, basic_api_config, mocker):
+    def test_prepare_request_with_conversation_id(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test request preparation with conversation ID."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
 
         client = APIClient(basic_api_config)
-        request = client._prepare_request("Follow-up", conversation_id="conv_123")
+        request = client._prepare_request(  # pylint: disable=protected-access
+            "Follow-up", conversation_id="conv_123"
+        )
 
         assert request.query == "Follow-up"
         assert request.conversation_id == "conv_123"
 
-    def test_prepare_request_with_attachments(self, basic_api_config, mocker):
+    def test_prepare_request_with_attachments(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test request preparation with attachments."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
 
         client = APIClient(basic_api_config)
-        request = client._prepare_request(
+        request = client._prepare_request(  # pylint: disable=protected-access
             "Analyze this", attachments=["file1.txt", "file2.pdf"]
         )
@@ -423,7 +442,9 @@
         # Attachments may be processed, just verify they're present in some form
         assert hasattr(request, "attachments")
 
-    def test_close_client(self, basic_api_config, mocker):
+    def test_close_client(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test closing the HTTP client."""
         mock_http_client = mocker.Mock()
         mocker.patch(
@@ -436,7 +457,9 @@
 
         mock_http_client.close.assert_called_once()
 
-    def test_get_cache_key_generates_consistent_hash(self, tmp_path, mocker):
+    def test_get_cache_key_generates_consistent_hash(
+        self, tmp_path: Path, mocker: MockerFixture
+    ) -> None:
         """Test cache key generation is consistent for same request."""
         config = APIConfig(
             enabled=True,
@@ -455,11 +478,15 @@
         client = APIClient(config)
 
         # Create identical requests
-        request1 = client._prepare_request("test query")
-        request2 = client._prepare_request("test query")
+        request1 = client._prepare_request(  # pylint: disable=protected-access
+            "test query"
+        )
+        request2 = client._prepare_request(  # pylint: disable=protected-access
+            "test query"
+        )
 
-        key1 = client._get_cache_key(request1)
-        key2 = client._get_cache_key(request2)
+        key1 = client._get_cache_key(request1)  # pylint: disable=protected-access
+        key2 = client._get_cache_key(request2)  # pylint: disable=protected-access
 
         # Same request should generate same cache key
         assert key1 == key2
@@ -467,8 +494,8 @@
         assert len(key1) > 0
 
     def test_client_initialization_sets_content_type_header(
-        self, basic_api_config, mocker
-    ):
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test client initialization sets Content-Type header."""
         mock_client = mocker.Mock()
         mocker.patch(
@@ -485,7 +512,7 @@
             for call in calls
         )
 
-    def test_standard_endpoint_initialization(self, mocker):
+    def test_standard_endpoint_initialization(self, mocker: MockerFixture) -> None:
         """Test initialization with standard (non-streaming) endpoint."""
         config = APIConfig(
             enabled=True,
diff --git a/tests/unit/core/api/test_streaming_parser.py b/tests/unit/core/api/test_streaming_parser.py
index f78dfae9..3c20d5a6 100644
--- a/tests/unit/core/api/test_streaming_parser.py
+++ b/tests/unit/core/api/test_streaming_parser.py
@@ -1,5 +1,6 @@
 """Unit tests for streaming parser."""
 
+from typing import Any
 import pytest
 
 from lightspeed_evaluation.core.api.streaming_parser import (
@@ -10,17 +11,10 @@
 )
 
 
-@pytest.fixture
-def mock_response(mocker):
-    """Create a mock streaming response."""
-    response = mocker.Mock()
-    return response
-
-
 class TestParseStreamingResponse:
     """Unit tests for parse_streaming_response."""
 
-    def test_parse_complete_response(self, mock_response):
+    def test_parse_complete_response(self, mock_response: Any) -> None:
         """Test parsing a complete streaming response."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -38,11 +32,16 @@ def test_parse_complete_response(self, mock_response):
         assert "streaming_duration" in result
         assert "tokens_per_second" in result
 
-    def test_parse_response_with_tool_calls(self, mock_response):
+    def test_parse_response_with_tool_calls(self, mock_response: Any) -> None:
         """Test parsing response with tool calls."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_456"}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "search", "arguments": {"query": "test"}}}}',
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "search", "arguments": {"query": "test"}}}'
+                "}"
+            ),
             'data: {"event": "turn_complete", "data": {"token": "Final response"}}',
         ]
         mock_response.iter_lines.return_value = lines
@@ -54,7 +53,7 @@ def test_parse_response_with_tool_calls(self, mock_response):
         assert len(result["tool_calls"]) == 1
         assert result["tool_calls"][0][0]["tool_name"] == "search"
 
-    def test_parse_response_missing_final_response(self, mock_response):
+    def test_parse_response_missing_final_response(self, mock_response: Any) -> None:
         """Test parsing fails when final response is missing."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_789"}}',
@@ -64,7 +63,7 @@
         with pytest.raises(ValueError, match="No final response found"):
             parse_streaming_response(mock_response)
 
-    def test_parse_response_missing_conversation_id(self, mock_response):
+    def test_parse_response_missing_conversation_id(self, mock_response: Any) -> None:
         """Test parsing fails when conversation ID is missing."""
         lines = [
             'data: {"event": "turn_complete", "data": {"token": "Response"}}',
@@ -74,7 +73,7 @@
         with pytest.raises(ValueError, match="No Conversation ID found"):
             parse_streaming_response(mock_response)
 
-    def test_parse_response_with_error_event(self, mock_response):
+    def test_parse_response_with_error_event(self, mock_response: Any) -> None:
         """Test parsing handles error events."""
         lines = [
             'data: {"event": "error", "data": {"token": "API Error occurred"}}',
@@ -84,7 +83,7 @@
         with pytest.raises(ValueError, match="Streaming API error: API Error occurred"):
             parse_streaming_response(mock_response)
 
-    def test_parse_response_skips_empty_lines(self, mock_response):
+    def test_parse_response_skips_empty_lines(self, mock_response: Any) -> None:
         """Test parser skips empty lines."""
         lines = [
             "",
@@ -100,7 +99,7 @@
         assert result["response"] == "Response"
         assert result["conversation_id"] == "conv_123"
 
-    def test_parse_response_skips_non_data_lines(self, mock_response):
+    def test_parse_response_skips_non_data_lines(self, mock_response: Any) -> None:
         """Test parser skips lines without 'data:' prefix."""
         lines = [
             "event: start",
@@ -115,12 +114,22 @@
         assert result["response"] == "Response"
         assert result["conversation_id"] == "conv_123"
 
-    def test_parse_response_with_multiple_tool_calls(self, mock_response):
+    def test_parse_response_with_multiple_tool_calls(self, mock_response: Any) -> None:
         """Test parsing multiple tool calls."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "search", "arguments": {"q": "test"}}}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "calculate", "arguments": {"expr": "2+2"}}}}',
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "search", "arguments": {"q": "test"}}}'
+                "}"
+            ),
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "calculate", "arguments": {"expr": "2+2"}}}'
+                "}"
+            ),
             'data: {"event": "turn_complete", "data": {"token": "Done"}}',
         ]
         mock_response.iter_lines.return_value = lines
@@ -135,7 +144,7 @@
 class TestParseSSELine:
     """Unit tests for _parse_sse_line."""
 
-    def test_parse_valid_json(self):
+    def test_parse_valid_json(self) -> None:
         """Test parsing valid JSON SSE line."""
         json_data = '{"event": "start", "data": {"conversation_id": "123"}}'
@@ -146,7 +155,7 @@
         assert event == "start"
         assert data["conversation_id"] == "123"
 
-    def test_parse_invalid_json(self):
+    def test_parse_invalid_json(self) -> None:
         """Test parsing invalid JSON returns None."""
         json_data = "not valid json"
@@ -154,17 +163,17 @@
         assert result is None
 
-    def test_parse_missing_event_field(self):
+    def test_parse_missing_event_field(self) -> None:
         """Test parsing with missing event field."""
         json_data = '{"data": {"some": "data"}}'
 
         result = _parse_sse_line(json_data)
 
         assert result is not None
-        event, data = result
+        event, _ = result
         assert event == ""  # Default empty string
 
-    def test_parse_missing_data_field(self):
+    def test_parse_missing_data_field(self) -> None:
         """Test parsing with missing data field."""
         json_data = '{"event": "test"}'
@@ -179,7 +188,7 @@
 class TestParseToolCall:
     """Unit tests for _parse_tool_call."""
 
-    def test_parse_valid_tool_call(self):
+    def test_parse_valid_tool_call(self) -> None:
         """Test parsing valid tool call."""
         token = {"tool_name": "search", "arguments": {"query": "test"}}
@@ -189,7 +198,7 @@
         assert result["tool_name"] == "search"
         assert result["arguments"]["query"] == "test"
 
-    def test_parse_tool_call_missing_tool_name(self):
+    def test_parse_tool_call_missing_tool_name(self) -> None:
         """Test parsing tool call without tool_name."""
         token = {"arguments": {"query": "test"}}
@@ -197,7 +206,7 @@
         assert result is None
 
-    def test_parse_tool_call_missing_arguments(self):
+    def test_parse_tool_call_missing_arguments(self) -> None:
         """Test parsing tool call without arguments."""
         token = {"tool_name": "search"}
@@ -205,7 +214,7 @@ def test_parse_tool_call_missing_arguments(self):
         assert result is None
 
-    def test_parse_tool_call_with_empty_arguments(self):
+    def test_parse_tool_call_with_empty_arguments(self) -> None:
         """Test parsing tool call with empty arguments dict."""
         token = {"tool_name": "search", "arguments": {}}
@@ -215,11 +224,11 @@
         assert result["tool_name"] == "search"
         assert result["arguments"] == {}
 
-    def test_parse_tool_call_invalid_structure(self):
+    def test_parse_tool_call_invalid_structure(self) -> None:
         """Test parsing malformed tool call."""
         token = "not a dict"
 
-        result = _parse_tool_call(token)
+        result = _parse_tool_call(token)  # pyright: ignore[reportArgumentType]
 
         assert result is None
 
@@ -227,13 +236,13 @@
 class TestFormatToolSequences:
     """Unit tests for _format_tool_sequences."""
 
-    def test_format_empty_tool_calls(self):
+    def test_format_empty_tool_calls(self) -> None:
         """Test formatting empty tool calls list."""
         result = _format_tool_sequences([])
 
         assert result == []
 
-    def test_format_single_tool_call(self):
+    def test_format_single_tool_call(self) -> None:
         """Test formatting single tool call."""
         tool_calls = [{"tool_name": "search", "arguments": {"query": "test"}}]
@@ -243,7 +252,7 @@
         assert len(result[0]) == 1
         assert result[0][0]["tool_name"] == "search"
 
-    def test_format_multiple_tool_calls(self):
+    def test_format_multiple_tool_calls(self) -> None:
         """Test formatting multiple tool calls into sequences."""
         tool_calls = [
             {"tool_name": "search", "arguments": {"query": "test"}},
@@ -260,7 +269,7 @@
 class TestStreamingPerformanceMetrics:
     """Unit tests for streaming performance metrics (TTFT, tokens per second)."""
 
-    def test_time_to_first_token_captured(self, mock_response):
+    def test_time_to_first_token_captured(self, mock_response: Any) -> None:
         """Test that time to first token is captured on first content event."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -274,7 +283,7 @@
         assert result["time_to_first_token"] is not None
         assert result["time_to_first_token"] >= 0
 
-    def test_streaming_duration_captured(self, mock_response):
+    def test_streaming_duration_captured(self, mock_response: Any) -> None:
         """Test that streaming duration is captured."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -290,7 +299,7 @@
         # Duration should be >= TTFT
         assert result["streaming_duration"] >= result["time_to_first_token"]
 
-    def test_tokens_per_second_with_token_counts(self, mock_response):
+    def test_tokens_per_second_with_token_counts(self, mock_response: Any) -> None:
         """Test tokens per second calculation when token counts are provided."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -308,7 +317,7 @@
         assert result["tokens_per_second"] is not None
         assert result["tokens_per_second"] > 0
 
-    def test_tokens_per_second_without_token_counts(self, mock_response):
+    def test_tokens_per_second_without_token_counts(self, mock_response: Any) -> None:
         """Test tokens per second is None when no output tokens."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -322,7 +331,7 @@ def test_tokens_per_second_without_token_counts(self, mock_response):
         assert result["output_tokens"] == 0
         assert result["tokens_per_second"] is None
 
-    def test_ttft_captured_on_token_event(self, mock_response):
+    def test_ttft_captured_on_token_event(self, mock_response: Any) -> None:
         """Test TTFT is captured on first token event (not just turn_complete)."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -337,11 +346,16 @@
         assert result["time_to_first_token"] is not None
         assert result["time_to_first_token"] >= 0
 
-    def test_ttft_captured_on_tool_call_event(self, mock_response):
+    def test_ttft_captured_on_tool_call_event(self, mock_response: Any) -> None:
         """Test TTFT is captured on tool_call event."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "search", "arguments": {}}}}',
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "search", "arguments": {}}}'
+                "}"
+            ),
            'data: {"event": "turn_complete", "data": {"token": "Final response"}}',
         ]
         mock_response.iter_lines.return_value = lines
@@ -352,12 +366,17 @@
         assert result["time_to_first_token"] is not None
         assert result["time_to_first_token"] >= 0
 
-    def test_performance_metrics_with_complete_flow(self, mock_response):
+    def test_performance_metrics_with_complete_flow(self, mock_response: Any) -> None:
         """Test complete streaming flow with all performance metrics."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_perf_test"}}',
             'data: {"event": "token", "data": {"token": "Streaming..."}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "search", "arguments": {"q": "test"}}}}',
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "search", "arguments": {"q": "test"}}}'
+                "}"
+            ),
             'data: {"event": "turn_complete", "data": {"token": "Complete response"}}',
             'data: {"event": "end", "data": {"input_tokens": 100, "output_tokens": 250}}',
         ]
diff --git a/tests/unit/core/config/test_models.py b/tests/unit/core/config/test_models.py
index 07d21f69..9519b7ad 100644
--- a/tests/unit/core/config/test_models.py
+++ b/tests/unit/core/config/test_models.py
@@ -1,6 +1,7 @@
 """Unit tests for core.config.models module."""
 
 import pytest
+from pydantic import ValidationError
 from lightspeed_evaluation.core.models import (
     CoreConfig,
     EvaluationData,
@@ -10,13 +11,12 @@
     SystemConfig,
     TurnData,
 )
-from pydantic import ValidationError
 
 
 class TestTurnData:
     """Unit tests for TurnData model."""
 
-    def test_valid_turn_data_creation(self):
+    def test_valid_turn_data_creation(self) -> None:
         """Test creating valid TurnData instance."""
         turn = TurnData(
             turn_id="1",
@@ -31,27 +31,32 @@
         assert turn.response == "Python is a programming language."
         assert turn.contexts is not None
         assert len(turn.contexts) == 1
-        assert turn.contexts[0] == "Python context"
+        assert (
+            turn.contexts[0]  # pylint: disable=unsubscriptable-object
+            == "Python context"
+        )
         assert turn.expected_response == "Python is a high-level language."
 
-    def test_turn_data_invalid_empty_query(self):
+    def test_turn_data_invalid_empty_query(self) -> None:
         """Test validation error for empty query."""
         with pytest.raises(
             ValidationError, match="String should have at least 1 character"
         ):
             TurnData(turn_id="1", query="", response="Valid response")
 
-    def test_turn_data_invalid_context_missing_content(self):
+    def test_turn_data_invalid_context_missing_content(self) -> None:
         """Test validation error for non-string context."""
         with pytest.raises(ValidationError, match="Input should be a valid string"):
             TurnData(
                 turn_id="1",
                 query="Valid query",
                 response="Valid response",
-                contexts=[{"title": "No content field"}],
+                contexts=[
+                    {"title": "No content field"}
+                ],  # pyright: ignore[reportArgumentType]
             )
 
-    def test_turn_data_multiple_contexts(self):
+    def test_turn_data_multiple_contexts(self) -> None:
         """Test TurnData with multiple valid contexts."""
         contexts = [
             "First context",
@@ -61,17 +66,26 @@
         turn = TurnData(
             turn_id="1", query="Test query", response="Test response", contexts=contexts
         )
-
+        assert turn.contexts is not None
         assert len(turn.contexts) == 3
-        assert turn.contexts[0] == "First context"
-        assert turn.contexts[1] == "Second context"
-        assert turn.contexts[2] == "Third context"
+        assert (
+            turn.contexts[0]  # pylint: disable=unsubscriptable-object
+            == "First context"
+        )
+        assert (
+            turn.contexts[1]  # pylint: disable=unsubscriptable-object
+            == "Second context"
+        )
+        assert (
+            turn.contexts[2]  # pylint: disable=unsubscriptable-object
+            == "Third context"
+        )
 
 
 class TestEvaluationData:
     """Unit tests for EvaluationData model."""
 
-    def test_valid_evaluation_data_creation(self):
+    def test_valid_evaluation_data_creation(self) -> None:
         """Test creating valid EvaluationData instance."""
         turn = TurnData(
             turn_id="1",
@@ -92,7 +106,7 @@
         assert len(eval_data.turns) == 1
         assert eval_data.turns[0].turn_metrics == ["ragas:faithfulness"]
 
-    def test_evaluation_data_with_minimal_fields(self):
+    def test_evaluation_data_with_minimal_fields(self) -> None:
         """Test EvaluationData with only required fields."""
         turn = TurnData(turn_id="1", query="Test query", response="Test response")
         eval_data = EvaluationData(conversation_group_id="test_conv", turns=[turn])
@@ -103,7 +117,7 @@
         assert len(eval_data.turns) == 1
         assert eval_data.turns[0].turn_metrics is None
 
-    def test_evaluation_data_invalid_empty_conversation_id(self):
+    def test_evaluation_data_invalid_empty_conversation_id(self) -> None:
         """Test validation error for empty conversation_group_id."""
         turn = TurnData(turn_id="1", query="Test query", response="Test response")
         with pytest.raises(
@@ -111,7 +125,7 @@
         ):
             EvaluationData(conversation_group_id="", turns=[turn])
 
-    def test_evaluation_data_invalid_metric_format_missing_colon(self):
+    def test_evaluation_data_invalid_metric_format_missing_colon(self) -> None:
         """Test validation error for metric without colon."""
         with pytest.raises(
             ValidationError, match='must be in format "framework:metric_name"'
@@ -123,7 +137,7 @@
                 turn_metrics=["invalid_metric"],
             )
 
-    def test_evaluation_data_with_metadata(self):
+    def test_evaluation_data_with_metadata(self) -> None:
         """Test EvaluationData with metadata fields."""
         turn = TurnData(
             turn_id="1",
@@ -150,7 +164,7 @@ def test_evaluation_data_with_metadata(self):
 class TestLLMConfig:
     """Unit tests for LLMConfig model."""
 
-    def test_valid_llm_config_creation(self):
+    def test_valid_llm_config_creation(self) -> None:
         """Test creating valid LLMConfig instance."""
         config = LLMConfig(
             provider="openai",
@@ -168,7 +182,7 @@
         assert config.timeout == 60
         assert config.num_retries == 3
 
-    def test_llm_config_with_defaults(self):
+    def test_llm_config_with_defaults(self) -> None:
         """Test LLMConfig with default values."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -183,7 +197,7 @@
 class TestSystemConfig:
     """Unit tests for SystemConfig model."""
 
-    def test_valid_system_config_creation(self):
+    def test_valid_system_config_creation(self) -> None:
         """Test creating valid SystemConfig instance."""
         config = SystemConfig(
             core=CoreConfig(max_threads=42),
@@ -200,7 +214,7 @@
         assert config.output.enabled_outputs == ["json"]
         assert config.core.max_threads == 42
 
-    def test_system_config_with_defaults(self):
+    def test_system_config_with_defaults(self) -> None:
         """Test SystemConfig with default values."""
         config = SystemConfig()
@@ -211,7 +225,7 @@
         assert "csv" in config.output.enabled_outputs
         assert config.core.max_threads is None
 
-    def test_system_config_logging_defaults(self):
+    def test_system_config_logging_defaults(self) -> None:
         """Test SystemConfig logging configuration defaults."""
         config = SystemConfig()
@@ -224,7 +238,7 @@
 class TestEvaluationResult:
     """Unit tests for EvaluationResult model."""
 
-    def test_valid_evaluation_result_creation(self):
+    def test_valid_evaluation_result_creation(self) -> None:
         """Test creating valid EvaluationResult instance."""
         result = EvaluationResult(
             conversation_group_id="test_conv",
@@ -242,7 +256,7 @@
         assert result.score == 0.85
         assert result.reason == "High faithfulness score"
 
-    def test_evaluation_result_conversation_level(self):
+    def test_evaluation_result_conversation_level(self) -> None:
         """Test EvaluationResult for conversation-level metric."""
         result = EvaluationResult(
             conversation_group_id="test_conv",
@@ -257,7 +271,7 @@
         assert result.metric_identifier == "deepeval:conversation_completeness"
         assert result.score == 0.92
 
-    def test_evaluation_result_validation_invalid_result(self):
+    def test_evaluation_result_validation_invalid_result(self) -> None:
         """Test EvaluationResult validation with invalid result."""
         with pytest.raises(ValidationError, match="Result must be one of"):
             EvaluationResult(
@@ -268,7 +282,7 @@
                 score=0.5,
             )
 
-    def test_evaluation_result_validation_invalid_score(self):
+    def test_evaluation_result_validation_invalid_score(self) -> None:
         """Test EvaluationResult validation with invalid score."""
         with pytest.raises(ValidationError, match="less than or equal to 1"):
             EvaluationResult(
diff --git a/tests/unit/core/llm/conftest.py b/tests/unit/core/llm/conftest.py
new file mode 100644
index 00000000..461496de
--- /dev/null
+++ b/tests/unit/core/llm/conftest.py
@@ -0,0 +1,29 @@
+"""Pytest configuration and fixtures for llm tests."""
+
+import pytest
+
+from lightspeed_evaluation.core.models import LLMConfig
+
+
+@pytest.fixture
+def llm_params() -> dict:
+    """Create sample LLM parameters."""
+    return {
+        "temperature": 0.5,
+        "max_completion_tokens": 1024,
+        "timeout": 120,
+        "num_retries": 5,
+    }
+
+
+@pytest.fixture
+def basic_llm_config() -> LLMConfig:
+    """Create basic LLM configuration."""
+    return LLMConfig(
+        provider="openai",
+        model="gpt-4",
+        temperature=0.0,
+        max_tokens=512,
+        timeout=60,
+        num_retries=3,
+    )
diff --git a/tests/unit/core/llm/test_custom.py b/tests/unit/core/llm/test_custom.py
index 19801e36..bbd9d3ca 100644
--- a/tests/unit/core/llm/test_custom.py
+++ b/tests/unit/core/llm/test_custom.py
@@ -1,15 +1,16 @@
 """Unit tests for custom LLM classes."""
 
 import pytest
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.llm.custom import BaseCustomLLM, TokenTracker
 from lightspeed_evaluation.core.system.exceptions import LLMError
 
 
-class TestTokenTracker:
+class TestTokenTracker:  # pylint: disable=too-few-public-methods
     """Tests for TokenTracker."""
 
-    def test_token_callback_accumulates_tokens(self, mocker):
+    def test_token_callback_accumulates_tokens(self, mocker: MockerFixture) -> None:
         """Test that token callback accumulates token counts."""
         tracker = TokenTracker()
@@ -19,7 +20,9 @@
         mock_response.usage.prompt_tokens = 10
         mock_response.usage.completion_tokens = 20
 
-        tracker._token_callback({}, mock_response, 0.0, 0.0)
+        tracker._token_callback(  # pylint: disable=protected-access
+            {}, mock_response, 0.0, 0.0
+        )
 
         input_tokens, output_tokens = tracker.get_counts()
         assert input_tokens == 10
@@ -29,7 +32,7 @@
 class TestBaseCustomLLM:
     """Tests for BaseCustomLLM."""
 
-    def test_setup_ssl_verify_enabled(self, mocker):
+    def test_setup_ssl_verify_enabled(self, mocker: MockerFixture) -> None:
         """Test SSL verification enabled by default."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {"SSL_CERTIFI_BUNDLE": "/path/to/bundle.pem"})
@@ -38,7 +41,7 @@
         assert mock_litellm.ssl_verify == "/path/to/bundle.pem"
 
-    def test_setup_ssl_verify_disabled(self, mocker):
+    def test_setup_ssl_verify_disabled(self, mocker: MockerFixture) -> None:
         """Test SSL verification can be disabled."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {})
@@ -47,7 +50,7 @@
         assert mock_litellm.ssl_verify is False
 
-    def test_call_returns_single_response(self, mocker):
+    def test_call_returns_single_response(self, mocker: MockerFixture) -> None:
         """Test call returns single string when n=1."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {})
@@ -64,7 +67,7 @@
         assert result == "Test response"
 
-    def test_call_with_temperature_override(self, mocker):
+    def test_call_with_temperature_override(self, mocker: MockerFixture) -> None:
         """Test call with temperature override."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {})
@@ -81,7 +84,7 @@
         call_args = mock_litellm.completion.call_args[1]
         assert call_args["temperature"] == 0.9
 
-    def test_call_raises_llm_error_on_failure(self, mocker):
+    def test_call_raises_llm_error_on_failure(self, mocker: MockerFixture) -> None:
         """Test call raises LLMError on failure."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {})
diff --git a/tests/unit/core/llm/test_deepeval_manager.py b/tests/unit/core/llm/test_deepeval_manager.py
index 367d7380..9ff27e41 100644
--- a/tests/unit/core/llm/test_deepeval_manager.py
+++ b/tests/unit/core/llm/test_deepeval_manager.py
@@ -1,25 +1,15 @@
 """Unit tests for DeepEval LLM Manager."""
 
 import pytest
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
 
 
-@pytest.fixture
-def llm_params():
-    """Create sample LLM parameters."""
-    return {
-        "temperature": 0.5,
-        "max_completion_tokens": 1024,
-        "timeout": 120,
-        "num_retries": 5,
-    }
-
-
 class TestDeepEvalLLMManager:
     """Tests for DeepEvalLLMManager."""
 
-    def test_initialization(self, llm_params, mocker):
+    def test_initialization(self, llm_params: dict, mocker: MockerFixture) -> None:
         """Test manager initialization."""
         mock_model = mocker.patch(
             "lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
         )
@@ -31,7 +21,9 @@
         assert manager.llm_params == llm_params
         mock_model.assert_called_once()
 
-    def test_initialization_with_default_temperature(self, mocker):
+    def test_initialization_with_default_temperature(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test initialization with default temperature."""
         mock_model = mocker.patch(
             "lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
         )
@@ -44,7 +36,9 @@
         call_kwargs = mock_model.call_args.kwargs
         assert call_kwargs["temperature"] == 0.0
 
-    def test_initialization_with_default_num_retries(self, mocker):
+    def test_initialization_with_default_num_retries(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test initialization with default num_retries."""
         mock_model = mocker.patch(
             "lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
         )
@@ -57,7 +51,7 @@
         call_kwargs = mock_model.call_args.kwargs
         assert call_kwargs["num_retries"] == 3
 
-    def test_get_llm(self, llm_params, mocker):
+    def test_get_llm(self, llm_params: dict, mocker: MockerFixture) -> None:
         """Test get_llm method."""
         mock_model_instance = mocker.Mock()
         mocker.patch(
@@ -70,7 +64,7 @@
         assert llm == mock_model_instance
 
-    def test_get_model_info(self, llm_params, mocker):
+    def test_get_model_info(self, llm_params: dict, mocker: MockerFixture) -> None:
         """Test get_model_info method."""
         mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")
@@ -83,7 +77,9 @@
         assert info["timeout"] == 120
         assert info["num_retries"] == 5
 
-    def test_initialization_prints_message(self, llm_params, mocker, capsys):
+    def test_initialization_prints_message(
+        self, llm_params: dict, mocker: MockerFixture, capsys: pytest.CaptureFixture
+    ) -> None:
         """Test that initialization prints configuration message."""
         mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")
diff --git a/tests/unit/core/llm/test_llm_manager.py b/tests/unit/core/llm/test_llm_manager.py
index f4fc77d0..22810c5d 100644
--- a/tests/unit/core/llm/test_llm_manager.py
+++ b/tests/unit/core/llm/test_llm_manager.py
@@ -1,28 +1,18 @@
 """Unit tests for LLM Manager."""
 
 import pytest
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.models import LLMConfig, SystemConfig
 from lightspeed_evaluation.core.llm.manager import LLMManager
 
 
-@pytest.fixture
-def basic_llm_config():
-    """Create basic LLM configuration."""
-    return LLMConfig(
-        provider="openai",
-        model="gpt-4",
-        temperature=0.0,
-        max_tokens=512,
-        timeout=60,
-        num_retries=3,
-    )
-
-
 class TestLLMManager:
     """Tests for LLMManager."""
 
-    def test_initialization_openai(self, basic_llm_config, mocker):
+    def test_initialization_openai(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test initialization with OpenAI provider."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -31,7 +21,7 @@
         assert manager.model_name == "gpt-4"
         assert manager.config.provider == "openai"
 
-    def test_initialization_azure(self, mocker):
+    def test_initialization_azure(self, mocker: MockerFixture) -> None:
         """Test initialization with Azure provider."""
         config = LLMConfig(
             provider="azure",
@@ -45,7 +35,7 @@
         assert "azure" in manager.model_name
 
-    def test_initialization_azure_with_deployment(self, mocker):
+    def test_initialization_azure_with_deployment(self, mocker: MockerFixture) -> None:
         """Test initialization with Azure deployment name."""
         config = LLMConfig(
             provider="azure",
@@ -59,7 +49,7 @@
         assert manager.model_name == "azure/my-deployment"
 
-    def test_initialization_watsonx(self, mocker):
+    def test_initialization_watsonx(self, mocker: MockerFixture) -> None:
         """Test initialization with WatsonX provider."""
         config = LLMConfig(
             provider="watsonx",
@@ -72,7 +62,7 @@
         assert manager.model_name == "watsonx/ibm/granite-13b"
 
-    def test_initialization_anthropic(self, mocker):
+    def test_initialization_anthropic(self, mocker: MockerFixture) -> None:
         """Test initialization with Anthropic provider."""
         config = LLMConfig(
             provider="anthropic",
@@ -85,7 +75,7 @@
         assert manager.model_name == "anthropic/claude-3-opus"
 
-    def test_initialization_gemini(self, mocker):
+    def test_initialization_gemini(self, mocker: MockerFixture) -> None:
         """Test initialization with Gemini provider."""
         config = LLMConfig(
             provider="gemini",
@@ -98,7 +88,7 @@
         assert manager.model_name == "gemini/gemini-pro"
 
-    def test_initialization_vertex(self, mocker):
+    def test_initialization_vertex(self, mocker: MockerFixture) -> None:
         """Test initialization with Vertex AI provider."""
         config = LLMConfig(
             provider="vertex",
@@ -111,7 +101,7 @@
         assert manager.model_name == "gemini-pro"
 
-    def test_initialization_ollama(self, mocker):
+    def test_initialization_ollama(self, mocker: MockerFixture) -> None:
         """Test initialization with Ollama provider."""
         config = LLMConfig(
             provider="ollama",
@@ -124,7 +114,7 @@
         assert manager.model_name == "ollama/llama2"
 
-    def test_initialization_hosted_vllm(self, mocker):
+    def test_initialization_hosted_vllm(self, mocker: MockerFixture) -> None:
         """Test initialization with hosted vLLM provider."""
         config = LLMConfig(
             provider="hosted_vllm",
@@ -137,7 +127,9 @@
         assert manager.model_name == "hosted_vllm/mistral-7b"
 
-    def test_initialization_generic_provider(self, basic_llm_config, mocker, capsys):
+    def test_initialization_generic_provider(
+        self, mocker: MockerFixture, capsys: pytest.CaptureFixture
+    ) -> None:
         """Test initialization with unknown/generic provider."""
         config = LLMConfig(
             provider="custom_provider",
@@ -155,7 +147,9 @@
         captured = capsys.readouterr()
         assert "generic" in captured.out.lower() or "warning" in captured.out.lower()
 
-    def test_get_model_name(self, basic_llm_config, mocker):
+    def test_get_model_name(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test get_model_name method."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -163,7 +157,9 @@
         manager = LLMManager(basic_llm_config)
 
         assert manager.get_model_name() == "gpt-4"
 
-    def test_get_llm_params(self, basic_llm_config, mocker):
+    def test_get_llm_params(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test get_llm_params method."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -176,7 +172,9 @@
         assert params["timeout"] == 60
         assert params["num_retries"] == 3
 
-    def test_get_config(self, basic_llm_config, mocker):
+    def test_get_config(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test get_config method."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -187,7 +185,7 @@
         assert config.provider == "openai"
         assert config.model == "gpt-4"
 
-    def test_from_system_config(self, mocker):
+    def test_from_system_config(self, mocker: MockerFixture) -> None:
         """Test creating manager from SystemConfig."""
         system_config = SystemConfig()
         system_config.llm = LLMConfig(
@@ -203,7 +201,9 @@
         assert manager.config.model == "gpt-3.5-turbo"
         assert manager.config.temperature == 0.5
 
-    def test_from_llm_config(self, basic_llm_config, mocker):
+    def test_from_llm_config(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test creating manager from LLMConfig."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -211,7 +211,7 @@
         assert manager.config == basic_llm_config
 
-    def test_llm_params_with_custom_values(self, mocker):
+    def test_llm_params_with_custom_values(self, mocker: MockerFixture) -> None:
         """Test LLM params with custom configuration values."""
         config = LLMConfig(
             provider="openai",
@@ -231,7 +231,12 @@
         assert params["timeout"] == 120
         assert params["num_retries"] == 5
 
-    def test_initialization_prints_message(self, basic_llm_config, mocker, capsys):
+    def test_initialization_prints_message(
+        self,
+        basic_llm_config: LLMConfig,
+        mocker: MockerFixture,
+        capsys: pytest.CaptureFixture,
+    ) -> None:
         """Test that initialization prints configuration message."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
diff --git a/tests/unit/core/llm/test_manager.py b/tests/unit/core/llm/test_manager.py
index b3164148..9d83a962 100644
--- a/tests/unit/core/llm/test_manager.py
+++ b/tests/unit/core/llm/test_manager.py
@@ -12,12 +12,12 @@
 class TestLLMError:
     """Unit tests for LLMError exception."""
 
-    def test_llm_error_creation(self):
+    def test_llm_error_creation(self) -> None:
         """Test creating LLMError exception."""
         error = LLMError("Test error message")
         assert str(error) == "Test error message"
 
-    def test_llm_error_inheritance(self):
+    def test_llm_error_inheritance(self) -> None:
         """Test that LLMError inherits from Exception."""
         error = LLMError("Test error")
         assert isinstance(error, Exception)
@@ -26,7 +26,7 @@
 class TestLLMManager:
     """Unit tests for LLMManager class."""
 
-    def test_llm_manager_initialization_openai(self, mocker: MockerFixture):
+    def test_llm_manager_initialization_openai(self, mocker: MockerFixture) -> None:
         """Test LLMManager initialization with OpenAI provider."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -38,7 +38,9 @@
         assert manager.model_name == "gpt-4"
         mock_print.assert_called_with("✅ LLM Manager: openai/gpt-4 -> gpt-4")
 
-    def test_llm_manager_initialization_generic_provider(self, mocker: MockerFixture):
+    def test_llm_manager_initialization_generic_provider(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test LLMManager initialization with unknown/generic provider."""
         config = LLMConfig(provider="custom", model="custom-model")
@@ -48,7 +50,7 @@
         assert manager.model_name == "custom/custom-model"
         mock_print.assert_any_call("⚠️ Using generic provider format for custom")
 
-    def test_llm_manager_openai_missing_api_key(self, mocker: MockerFixture):
+    def test_llm_manager_openai_missing_api_key(self, mocker: MockerFixture) -> None:
         """Test LLMManager with OpenAI provider but missing API key."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -58,7 +60,7 @@
         ):
             LLMManager(config)
 
-    def test_get_model_name(self, mocker: MockerFixture):
+    def test_get_model_name(self, mocker: MockerFixture) -> None:
         """Test get_model_name method."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -66,7 +68,7 @@
         manager = LLMManager(config)
         assert manager.get_model_name() == "gpt-4"
 
-    def test_get_llm_params(self, mocker: MockerFixture):
+    def test_get_llm_params(self, mocker: MockerFixture) -> None:
         """Test get_llm_params method."""
         config = LLMConfig(
             provider="openai",
@@ -91,7 +93,7 @@
         }
         assert params == expected
 
-    def test_get_llm_params_with_ssl_verify_false(self, mocker: MockerFixture):
+    def test_get_llm_params_with_ssl_verify_false(self, mocker: MockerFixture) -> None:
         """Test get_llm_params method with ssl_verify set to False."""
         config = LLMConfig(
             provider="openai",
@@ -117,7 +119,7 @@
         }
         assert params == expected
 
-    def test_get_config(self, mocker: MockerFixture):
+    def test_get_config(self, mocker: MockerFixture) -> None:
         """Test get_config method."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -125,7 +127,7 @@
         manager = LLMManager(config)
         assert manager.get_config() == config
 
-    def test_from_system_config(self, mocker: MockerFixture):
+    def test_from_system_config(self, mocker: MockerFixture) -> None:
         """Test from_system_config class method."""
         system_config = SystemConfig.model_validate(
             {
@@ -146,7 +148,7 @@
         assert manager.config.temperature == 0.5
         assert manager.config.max_tokens == 2000
 
-    def test_provider_case_insensitive(self, mocker: MockerFixture):
+    def test_provider_case_insensitive(self, mocker: MockerFixture) -> None:
         """Test that provider names are handled case-insensitively."""
         config = LLMConfig(provider="OpenAI", model="gpt-4")
@@ -154,7 +156,7 @@
         manager = LLMManager(config)
         assert manager.model_name == "gpt-4"
 
-    def test_multiple_providers_in_sequence(self, mocker: MockerFixture):
+    def test_multiple_providers_in_sequence(self, mocker: MockerFixture) -> None:
         """Test creating managers for different providers in sequence."""
         providers_data = [
             ("openai", "gpt-4", {"OPENAI_API_KEY": "test-key"}, "gpt-4"),
diff --git a/tests/unit/core/metrics/conftest.py b/tests/unit/core/metrics/conftest.py
new file mode 100644
index 00000000..6938d5ff
--- /dev/null
+++ b/tests/unit/core/metrics/conftest.py
@@ -0,0 +1,142 @@
+"""Pytest configuration and fixtures for metrics tests."""
+
+import sys
+
+import pytest
+from pytest_mock import MockerFixture
+
+from lightspeed_evaluation.core.metrics.nlp import NLPMetrics
+from lightspeed_evaluation.core.models import EvaluationScope, TurnData, SystemConfig
+
+
+@pytest.fixture
+def system_config() -> SystemConfig:
+    """Create a test system config with metrics metadata."""
+    config = SystemConfig()
+
+    # Set up test metrics metadata
+    config.default_turn_metrics_metadata = {
+        "ragas:faithfulness": {
+            "threshold": 0.7,
+            "default": True,
+            "description": "Test",
+        },
+        "ragas:response_relevancy": {
+            "threshold": 0.8,
+            "default": False,
+            "description": "Test",
+        },
+        "custom:answer_correctness": {
+            "threshold": 0.75,
+            "default": True,
+            "description": "Test",
+        },
+    }
+
+    config.default_conversation_metrics_metadata = {
+        "deepeval:conversation_completeness": {
+            "threshold": 0.6,
+            "default": True,
+            "description": "Test",
+        },
+        "deepeval:conversation_relevancy": {
+            "threshold": 0.7,
+            "default": False,
+            "description": "Test",
+        },
+    }
+
+    return config
+
+
+@pytest.fixture
+def nlp_metrics() -> NLPMetrics:
+    """Create NLPMetrics instance."""
+    return NLPMetrics()
+
+
+@pytest.fixture
+def sample_turn_data() -> TurnData:
+    """Create sample TurnData for testing."""
+    return TurnData(
+        turn_id="test_turn",
+        query="What is the capital of France?",
+        response="The capital of France is Paris.",
+        expected_response="The capital of France is Paris.",
+    )
+
+
+@pytest.fixture
+def sample_scope(  # pylint: disable=redefined-outer-name
+    sample_turn_data: TurnData,
+) -> EvaluationScope:
+    """Create sample EvaluationScope for turn-level evaluation."""
+    return EvaluationScope(
+        turn_idx=0,
+        turn_data=sample_turn_data,
+        is_conversation=False,
+    )
+
+
+@pytest.fixture
+def conversation_scope(  # pylint: disable=redefined-outer-name
+    sample_turn_data: TurnData,
+) -> EvaluationScope:
+    """Create sample EvaluationScope for conversation-level evaluation."""
+    return EvaluationScope(
+        turn_idx=0,
+        turn_data=sample_turn_data,
+        is_conversation=True,
+    )
+
+
+@pytest.fixture
+def mock_bleu_scorer(mocker: MockerFixture) -> MockerFixture:
+    """Mock sacrebleu BLEU with configurable return value.
+
+    Uses sys.modules injection to mock sacrebleu without requiring it to be installed.
+    """
+    mock_result = mocker.MagicMock()
+    mock_result.score = 85.0  # sacrebleu returns 0-100 scale
+
+    mock_scorer_instance = mocker.MagicMock()
+    mock_scorer_instance.corpus_score = mocker.MagicMock(return_value=mock_result)
+
+    mock_bleu_class = mocker.MagicMock(return_value=mock_scorer_instance)
+
+    # Create a fake sacrebleu module and inject it into sys.modules
+    mock_sacrebleu = mocker.MagicMock()
+    mock_sacrebleu.BLEU = mock_bleu_class
+    mocker.patch.dict(sys.modules, {"sacrebleu": mock_sacrebleu})
+
+    return mock_scorer_instance
+
+
+@pytest.fixture
+def mock_rouge_scorer(mocker: MockerFixture) -> MockerFixture:
+    """Mock RougeScore with configurable return value.
+
+    Returns different scores for precision, recall, fmeasure.
+    """
+    mock_scorer_instance = mocker.MagicMock()
+    # Return scores for precision, recall, fmeasure (called in that order)
+    mock_scorer_instance.single_turn_score = mocker.MagicMock(
+        side_effect=[0.95, 0.89, 0.92]
+    )
+    mocker.patch(
+        "lightspeed_evaluation.core.metrics.nlp.RougeScore",
+        return_value=mock_scorer_instance,
+    )
+    return mock_scorer_instance
+
+
+@pytest.fixture
+def mock_similarity_scorer(mocker: MockerFixture) -> MockerFixture:
+    """Mock NonLLMStringSimilarity with configurable return value."""
+    mock_scorer_instance = mocker.MagicMock()
+    mock_scorer_instance.single_turn_score = mocker.MagicMock(return_value=0.78)
+    mocker.patch(
+        "lightspeed_evaluation.core.metrics.nlp.NonLLMStringSimilarity",
+        return_value=mock_scorer_instance,
+    )
+    return mock_scorer_instance
diff --git a/tests/unit/core/metrics/custom/test_custom.py b/tests/unit/core/metrics/custom/test_custom.py
index a8a0cda3..039ad6b5 100644
--- a/tests/unit/core/metrics/custom/test_custom.py
+++ b/tests/unit/core/metrics/custom/test_custom.py
@@ -1,5 +1,6 @@
 """Tests for custom metrics module."""
 
+from pytest_mock import MockerFixture
 from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
 from lightspeed_evaluation.core.metrics.manager import MetricLevel
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
@@ -8,7 +9,9 @@
 class TestCustomMetricsToolEval:
     """Test CustomMetrics tool_eval functionality."""
 
-    def test_evaluate_tool_calls_with_none_tool_calls(self, mocker):
+    def test_evaluate_tool_calls_with_none_tool_calls(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test that None tool_calls is handled correctly."""
         # Mock LLM manager
         mock_llm_manager = mocker.Mock()
@@ -33,7 +36,7 @@
         assert score == 1.0
         assert "Alternative 2 matched" in reason
 
-    def test_default_config_uses_full_ordered(self, mocker):
+    def test_default_config_uses_full_ordered(self, mocker: MockerFixture) -> None:
         """Test that default config uses full_match=True and ordered=True."""
         mock_llm_manager = mocker.Mock()
         mock_llm_manager.get_model_name.return_value = "test-model"
@@ -63,7 +66,7 @@
         assert "full" in reason
         assert "ordered" in reason
 
-    def test_config_ordered_false_from_metadata(self, mocker):
+    def test_config_ordered_false_from_metadata(self, mocker: MockerFixture) -> None:
         """Test that ordered=False is read from turn_metrics_metadata."""
         mock_llm_manager = mocker.Mock()
         mock_llm_manager.get_model_name.return_value = "test-model"
@@ -93,7 +96,7 @@
         assert score == 1.0
         assert "unordered" in reason
 
-    def test_config_match_partial_from_metadata(self, mocker):
+    def test_config_match_partial_from_metadata(self, mocker: MockerFixture) -> None:
         """Test that full_match=False is read from turn_metrics_metadata."""
         mock_llm_manager = mocker.Mock()
         mock_llm_manager.get_model_name.return_value = "test-model"
@@ -121,7 +124,9 @@
         assert "partial" in reason
         assert "1/1 matched" in reason
 
-    def test_config_from_system_defaults_via_metric_manager(self, mocker):
+    def test_config_from_system_defaults_via_metric_manager(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test that config is read from system.yaml via MetricManager."""
         mock_llm_manager = mocker.Mock()
         mock_llm_manager.get_model_name.return_value = "test-model"
diff --git a/tests/unit/core/metrics/custom/test_tool_eval.py b/tests/unit/core/metrics/custom/test_tool_eval.py
index ad199c6d..7dae5e4c 100644
--- a/tests/unit/core/metrics/custom/test_tool_eval.py
+++ b/tests/unit/core/metrics/custom/test_tool_eval.py
@@ -12,7 +12,7 @@
 class TestEvaluateToolCalls:
     """Test cases for evaluate_tool_calls function."""
 
-    def test_primary_pattern_match(self):
+    def test_primary_pattern_match(self) -> None:
         """Test successful match with primary pattern."""
         expected = [
             [  # Primary pattern
@@ -27,7 +27,7 @@
         assert "Primary pattern matched" in details
         assert "Tool calls match expected structure and arguments" in details
 
-    def test_alternative_pattern_match(self):
+    def test_alternative_pattern_match(self) -> None:
         """Test successful match with alternative pattern."""
         expected = [
             [  # Primary pattern
@@ -45,24 +45,26 @@
         assert "Alternative 2 matched" in details
         assert "Tool calls match expected structure and arguments" in details
 
-    def test_empty_pattern_match_primary(self):
+    def test_empty_pattern_match_primary(self) -> None:
         """Test empty pattern match as primary."""
-        expected = [[]]  # Primary: no tools expected
-        actual = []
+        expected: list[list[dict]] = [[]]  # Primary: no tools expected
+        actual: list = []
 
-        success, details = evaluate_tool_calls(expected, actual)
+        success, details = evaluate_tool_calls(
+            expected, actual  # pyright: ignore[reportArgumentType]
+        )
 
         assert success is True
         assert "Primary pattern matched" in details
         assert "No tool calls made (valid alternate skip scenario)" in details
 
-    def test_empty_pattern_match_alternative(self):
+    def test_empty_pattern_match_alternative(self) -> None:
         """Test empty pattern match as alternative."""
         expected = [
             [[{"tool_name": "test_tool", "arguments": {}}]],  # Primary: some tool
             [],  # Alternative: no tools (skip scenario)
         ]
-        actual = []
+        actual: list = []
 
         success, details = evaluate_tool_calls(expected, actual)
 
@@ -70,7 +72,7 @@
         assert "Alternative 2 matched" in details
         assert "valid alternate skip scenario" in details
 
-    def test_no_pattern_match(self):
+    def test_no_pattern_match(self) -> None:
         """Test when no patterns match."""
         expected = [
             [  # Primary pattern
@@ -87,13 +89,15 @@
         assert success is False
         assert "didn't match any of the 2 expected pattern(s)" in details
 
-    def test_error_handling(self):
+    def test_error_handling(self) -> None:
         """Test error handling in evaluate_tool_calls."""
         # Invalid expected format should be handled gracefully
         expected = "invalid"  # Not a list
-        actual = []
+        actual: list = []
 
-        success, details = evaluate_tool_calls(expected, actual)
+        success, details = evaluate_tool_calls(
+            expected, actual  # pyright: ignore[reportArgumentType]
+        )
 
         assert success is False
         # The function iterates over the string characters, so we get a different error
@@ -106,7 +110,7 @@
 class TestCompareToolCalls:
     """Test cases for compare_tool_calls function."""
 
-    def test_exact_match(self):
+    def test_exact_match(self) -> None:
         """Test exact tool call match."""
         expected = [[{"tool_name": "test_tool", "arguments": {"key": "value"}}]]
         actual = [[{"tool_name": "test_tool", "arguments": {"key": "value"}}]]
@@ -115,7 +119,7 @@
         assert result["success"] is True
 
-    def test_length_mismatch(self):
+    def test_length_mismatch(self) -> None:
         """Test tool call sequence length mismatch."""
         expected = [
             [{"tool_name": "tool1", "arguments": {}}],
@@ -127,10 +131,10 @@
         assert result["success"] is False
 
-    def test_empty_sequences(self):
+    def test_empty_sequences(self) -> None:
         """Test empty tool call sequences."""
-        expected = []
-        actual = []
+        expected: list = []
+        actual: list = []
 
         result = compare_tool_calls(expected, actual)
 
@@ -140,7 +144,7 @@
 class TestCompareToolCallSequence:
     """Test cases for _compare_tool_call_sequence function."""
 
-    def test_sequence_match(self):
+    def test_sequence_match(self) -> None:
         """Test matching tool call sequence."""
         expected = [
             {"tool_name": "tool1", "arguments": {"key1": "value1"}},
@@ -155,7 +159,7 @@
         assert result is True
 
-    def test_sequence_length_mismatch(self):
+    def test_sequence_length_mismatch(self) -> None:
         """Test tool call sequence with different lengths."""
         expected = [{"tool_name": "tool1", "arguments": {}}]
         actual = [
@@ -171,7 +175,7 @@
 class TestCompareSingleToolCall:
     """Test cases for _compare_single_tool_call function."""
 
-    def test_tool_name_match(self):
+    def test_tool_name_match(self) -> None:
         """Test matching tool names and arguments."""
         expected = {"tool_name": "test_tool", "arguments": {"key": "value"}}
         actual = {"tool_name": "test_tool", "arguments": {"key": "value"}}
@@ -180,7 +184,7 @@
         assert result is True
 
-    def test_tool_name_mismatch(self):
+    def test_tool_name_mismatch(self) -> None:
         """Test mismatched tool names."""
         expected = {"tool_name": "tool1", "arguments": {}}
         actual = {"tool_name": "tool2", "arguments": {}}
@@ -189,7 +193,7 @@
         assert result is False
 
-    def test_missing_arguments(self):
+    def test_missing_arguments(self) -> None:
         """Test tool calls with missing arguments."""
         expected = {"tool_name": "test_tool", "arguments": {"key": "value"}}
         actual = {"tool_name": "test_tool"}  # Missing arguments
@@ -202,7 +206,7 @@
 class TestCompareToolArguments:
     """Test cases for _compare_tool_arguments function."""
 
-    def test_exact_arguments_match(self):
+    def test_exact_arguments_match(self) -> None:
         """Test exact argument matching."""
         expected = {"key1": "value1", "key2": "value2"}
         actual = {"key1": "value1", "key2": "value2"}
@@ -211,7 +215,7 @@
         assert result is True
 
-    def test_regex_pattern_match(self):
+    def test_regex_pattern_match(self) -> None:
         """Test regex pattern matching in arguments."""
         expected = {"name": "web-server-[0-9]+"}
         actual = {"name": "web-server-123"}
@@ -220,7 +224,7 @@
         assert result is True
 
-    def test_missing_argument_key(self):
+    def test_missing_argument_key(self) -> None:
         """Test missing argument key."""
         expected = {"key1": "value1", "key2": "value2"}
         actual = {"key1": "value1"}  # Missing key2
@@ -229,7 +233,7 @@
         assert result is False
 
-    def test_extra_argument_keys(self):
+    def test_extra_argument_keys(self) -> None:
         """Test extra argument keys."""
         expected = {"key1": "value1"}
         actual = {"key1": "value1", "key2": "value2"}  # Extra key2
@@ -238,7 +242,7 @@
         assert result is False
 
-    def test_invalid_regex_pattern(self):
+    def test_invalid_regex_pattern(self) -> None:
         """Test invalid regex pattern handling."""
         expected = {"name": "[invalid_regex"}  # Invalid regex
         actual = {"name": "test"}
@@ -247,12 +251,14 @@
         assert result is False
 
-    def test_non_dict_arguments(self):
+    def test_non_dict_arguments(self) -> None:
         """Test non-dictionary arguments."""
         expected = "not_a_dict"
         actual = {"key": "value"}
 
-        result = _compare_tool_arguments(expected, actual)
+        result = _compare_tool_arguments(
+            expected, actual  # pyright: ignore[reportArgumentType]
+        )
 
         assert result is False
 
@@ -260,7 +266,7 @@
 class TestOrderedParameter:
     """Test cases for the ordered parameter in tool evaluation."""
 
-    def test_ordered_true_default_matches_in_order(self):
+    def test_ordered_true_default_matches_in_order(self) -> None:
         """Test ordered=True (default) matches when order is correct, fails otherwise."""
         expected = [
             [
@@ -286,7 +292,7 @@
         success, _ = evaluate_tool_calls(expected, actual_wrong, ordered=True)
         assert success is False
 
-    def test_ordered_false_matches_any_order(self):
+    def test_ordered_false_matches_any_order(self) -> None:
         """Test ordered=False succeeds regardless of order."""
         expected = [
             [
@@ -303,7 +309,7 @@
         assert success is True
         assert "unordered" in details
 
-    def test_ordered_false_fails_when_content_differs(self):
+    def test_ordered_false_fails_when_content_differs(self) -> None:
         """Test ordered=False still fails when tool calls don't match."""
         expected = [
             [
@@ -319,7 +325,7 @@
         success, _ = evaluate_tool_calls(expected, actual, ordered=False)
         assert success is False
 
-    def test_unordered_handles_duplicates_correctly(self):
+    def test_unordered_handles_duplicates_correctly(self) -> None:
         """Test unordered matching handles duplicate sequences properly."""
         # Each expected item must match exactly one actual item
         expected = [
@@ -343,7 +349,7 @@
         assert evaluate_tool_calls(expected, actual_valid, ordered=False)[0] is True
         assert evaluate_tool_calls(expected, actual_invalid, ordered=False)[0] is False
 
-    def test_tools_within_sequence_always_ordered(self):
+    def test_tools_within_sequence_always_ordered(self) -> None:
         """Test that tools within a single sequence must always match in order.
 
         The `ordered` parameter only affects sequence order, not tool order within.
@@ -369,7 +375,7 @@ def test_tools_within_sequence_always_ordered(self): class TestMatchParameter: """Test cases for full_match parameter (full vs partial matching).""" - def test_full_match_default_requires_exact_count(self): + def test_full_match_default_requires_exact_count(self) -> None: """Test full_match=True (default) requires all expected to match all actual.""" expected = [ [ @@ -396,7 +402,7 @@ def test_full_match_default_requires_exact_count(self): success, _ = evaluate_tool_calls(expected, actual_extra, full_match=True) assert success is False - def test_partial_match_allows_extra_actual_tools(self): + def test_partial_match_allows_extra_actual_tools(self) -> None: """Test full_match=False allows extra actual tools.""" expected = [ [ @@ -413,7 +419,7 @@ def test_partial_match_allows_extra_actual_tools(self): assert success is True assert "partial" in details - def test_partial_match_succeeds_with_some_matches(self): + def test_partial_match_succeeds_with_some_matches(self) -> None: """Test full_match=False succeeds if any expected tool is found.""" expected = [ [ @@ -432,7 +438,7 @@ def test_partial_match_succeeds_with_some_matches(self): assert "1/2 matched" in details assert "1 unmatched" in details - def test_partial_match_fails_when_no_matches(self): + def test_partial_match_fails_when_no_matches(self) -> None: """Test full_match=False fails when no expected tools are found.""" expected = [ [ @@ -448,7 +454,7 @@ def test_partial_match_fails_when_no_matches(self): success, _ = evaluate_tool_calls(expected, actual, full_match=False) assert success is False - def test_partial_match_ordered_reports_statistics(self): + def test_partial_match_ordered_reports_statistics(self) -> None: """Test full_match=False with ordered=True reports match statistics.""" expected = [ [ @@ -470,7 +476,7 @@ def test_partial_match_ordered_reports_statistics(self): assert "2/2 matched" in details assert "0 unmatched" in details - def test_partial_match_ordered_finds_all_items(self): + def test_partial_match_ordered_finds_all_items(self) -> None: """Test full_match=False ordered finds all items using greedy matching.""" expected = [ [ @@ -493,7 +499,7 @@ def test_partial_match_ordered_finds_all_items(self): assert success is True assert "2/2 matched" in details - def test_partial_match_unordered_ignores_order(self): + def test_partial_match_unordered_ignores_order(self) -> None: """Test full_match=False with ordered=False ignores order.""" expected = [ [ @@ -516,7 +522,7 @@ def test_partial_match_unordered_ignores_order(self): assert "unordered" in details assert "2/2 matched" in details - def test_partial_match_all_matched_reports_correctly(self): + def test_partial_match_all_matched_reports_correctly(self) -> None: """Test full_match=False reports all matched correctly.""" expected = [ [ diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py index 0018a180..79ac617f 100644 --- a/tests/unit/core/metrics/test_geval.py +++ b/tests/unit/core/metrics/test_geval.py @@ -9,11 +9,11 @@ from lightspeed_evaluation.core.metrics.manager import MetricLevel -class TestGEvalHandler: +class TestGEvalHandler: # pylint: disable=too-many-public-methods """Test cases for GEvalHandler class.""" @pytest.fixture - def mock_llm_manager(self): + def mock_llm_manager(self) -> MagicMock: """Create a mock DeepEvalLLMManager.""" mock_manager = MagicMock() mock_llm = MagicMock() @@ -21,19 +21,23 @@ def mock_llm_manager(self): return mock_manager @pytest.fixture - def mock_metric_manager(self): 
+ def mock_metric_manager(self) -> MagicMock: """Create a mock MetricManager.""" return MagicMock() @pytest.fixture - def handler(self, mock_llm_manager, mock_metric_manager): + def handler( + self, mock_llm_manager: MagicMock, mock_metric_manager: MagicMock + ) -> GEvalHandler: """Create a GEvalHandler instance with mocked dependencies.""" return GEvalHandler( deepeval_llm_manager=mock_llm_manager, metric_manager=mock_metric_manager, ) - def test_initialization(self, mock_llm_manager, mock_metric_manager): + def test_initialization( + self, mock_llm_manager: MagicMock, mock_metric_manager: MagicMock + ) -> None: """Test GEvalHandler initialization with required dependencies.""" handler = GEvalHandler( deepeval_llm_manager=mock_llm_manager, @@ -43,10 +47,12 @@ def test_initialization(self, mock_llm_manager, mock_metric_manager): assert handler.deepeval_llm_manager == mock_llm_manager assert handler.metric_manager == mock_metric_manager - def test_convert_evaluation_params_field_names(self, handler): + def test_convert_evaluation_params_field_names(self, handler: GEvalHandler) -> None: """Test conversion of evaluation data field names to LLMTestCaseParams enum.""" params = ["query", "response", "expected_response"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) assert result is not None assert len(result) == 3 @@ -54,10 +60,14 @@ def test_convert_evaluation_params_field_names(self, handler): assert LLMTestCaseParams.ACTUAL_OUTPUT in result assert LLMTestCaseParams.EXPECTED_OUTPUT in result - def test_convert_evaluation_params_with_contexts(self, handler): + def test_convert_evaluation_params_with_contexts( + self, handler: GEvalHandler + ) -> None: """Test conversion including contexts and retrieval_context fields.""" params = ["query", "response", "contexts", "retrieval_context"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) assert result is not None assert len(result) == 4 @@ -66,10 +76,14 @@ def test_convert_evaluation_params_with_contexts(self, handler): assert LLMTestCaseParams.CONTEXT in result assert LLMTestCaseParams.RETRIEVAL_CONTEXT in result - def test_convert_evaluation_params_enum_values_backward_compat(self, handler): + def test_convert_evaluation_params_enum_values_backward_compat( + self, handler: GEvalHandler + ) -> None: """Test conversion with direct enum value strings (backward compatibility).""" params = ["INPUT", "ACTUAL_OUTPUT", "EXPECTED_OUTPUT"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) assert result is not None assert len(result) == 3 @@ -77,28 +91,41 @@ def test_convert_evaluation_params_enum_values_backward_compat(self, handler): assert LLMTestCaseParams.ACTUAL_OUTPUT in result assert LLMTestCaseParams.EXPECTED_OUTPUT in result - def test_convert_evaluation_params_invalid_returns_none(self, handler): + def test_convert_evaluation_params_invalid_returns_none( + self, handler: GEvalHandler + ) -> None: """Test that invalid params return None to allow GEval auto-detection.""" params = ["invalid_param", "another_invalid"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) assert result is None - def test_convert_evaluation_params_empty_returns_none(self, handler): 
+ def test_convert_evaluation_params_empty_returns_none( + self, handler: GEvalHandler + ) -> None: """Test that empty params list returns None.""" - result = handler._convert_evaluation_params([]) - + result = handler._convert_evaluation_params( # pylint: disable=protected-access + [] + ) assert result is None - def test_convert_evaluation_params_mixed_invalid_returns_none(self, handler): + def test_convert_evaluation_params_mixed_invalid_returns_none( + self, handler: GEvalHandler + ) -> None: """Test that any invalid param causes None return.""" params = ["query", "invalid_param", "response"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) # Should return None because of the invalid param assert result is None - def test_get_geval_config_uses_metric_manager(self, handler, mock_metric_manager): + def test_get_geval_config_uses_metric_manager( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that _get_geval_config delegates to MetricManager.""" expected_config = { "criteria": "Test criteria", @@ -108,7 +135,7 @@ def test_get_geval_config_uses_metric_manager(self, handler, mock_metric_manager mock_metric_manager.get_metric_metadata.return_value = expected_config conv_data = MagicMock() - config = handler._get_geval_config( + config = handler._get_geval_config( # pylint: disable=protected-access metric_name="test_metric", conv_data=conv_data, turn_data=None, @@ -123,7 +150,9 @@ def test_get_geval_config_uses_metric_manager(self, handler, mock_metric_manager turn_data=None, ) - def test_get_geval_config_turn_level(self, handler, mock_metric_manager): + def test_get_geval_config_turn_level( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test retrieving turn-level config uses correct MetricLevel.""" expected_config = {"criteria": "Turn criteria", "threshold": 0.9} mock_metric_manager.get_metric_metadata.return_value = expected_config @@ -131,7 +160,7 @@ def test_get_geval_config_turn_level(self, handler, mock_metric_manager): conv_data = MagicMock() turn_data = MagicMock() - config = handler._get_geval_config( + config = handler._get_geval_config( # pylint: disable=protected-access metric_name="turn_metric", conv_data=conv_data, turn_data=turn_data, @@ -147,13 +176,13 @@ def test_get_geval_config_turn_level(self, handler, mock_metric_manager): ) def test_get_geval_config_returns_none_when_not_found( - self, handler, mock_metric_manager - ): + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that None is returned when MetricManager finds no config.""" mock_metric_manager.get_metric_metadata.return_value = None conv_data = MagicMock() - config = handler._get_geval_config( + config = handler._get_geval_config( # pylint: disable=protected-access metric_name="nonexistent_metric", conv_data=conv_data, turn_data=None, @@ -162,7 +191,9 @@ def test_get_geval_config_returns_none_when_not_found( assert config is None - def test_evaluate_missing_config(self, handler, mock_metric_manager): + def test_evaluate_missing_config( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that evaluate returns error when config is not found.""" mock_metric_manager.get_metric_metadata.return_value = None @@ -178,7 +209,9 @@ def test_evaluate_missing_config(self, handler, mock_metric_manager): assert score is None assert "configuration not found" in reason.lower() - def 
test_evaluate_missing_criteria(self, handler, mock_metric_manager): + def test_evaluate_missing_criteria( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that evaluate requires 'criteria' in config.""" mock_metric_manager.get_metric_metadata.return_value = { "threshold": 0.8, @@ -198,7 +231,9 @@ def test_evaluate_missing_criteria(self, handler, mock_metric_manager): assert score is None assert "criteria" in reason.lower() - def test_evaluate_turn_missing_turn_data(self, handler, mock_metric_manager): + def test_evaluate_turn_missing_turn_data( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that turn-level evaluation requires turn_data.""" mock_metric_manager.get_metric_metadata.return_value = { "criteria": "Test criteria" @@ -216,7 +251,9 @@ def test_evaluate_turn_missing_turn_data(self, handler, mock_metric_manager): assert score is None assert "turn data required" in reason.lower() - def test_evaluate_turn_success(self, handler, mock_metric_manager): + def test_evaluate_turn_success( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test successful turn-level evaluation.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -256,7 +293,9 @@ def test_evaluate_turn_success(self, handler, mock_metric_manager): assert reason == "Test passed" mock_metric.measure.assert_called_once() - def test_evaluate_turn_with_optional_fields(self, handler, mock_metric_manager): + def test_evaluate_turn_with_optional_fields( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test turn-level evaluation includes optional fields when present.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -303,7 +342,9 @@ def test_evaluate_turn_with_optional_fields(self, handler, mock_metric_manager): assert call_kwargs["expected_output"] == "Expected response" assert call_kwargs["context"] == ["Context 1", "Context 2"] - def test_evaluate_turn_none_score_returns_zero(self, handler, mock_metric_manager): + def test_evaluate_turn_none_score_returns_zero( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that None score from metric is converted to 0.0.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -338,7 +379,9 @@ def test_evaluate_turn_none_score_returns_zero(self, handler, mock_metric_manage assert score == 0.0 assert reason == "Could not evaluate" - def test_evaluate_turn_handles_exceptions(self, handler, mock_metric_manager): + def test_evaluate_turn_handles_exceptions( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that turn evaluation handles exceptions gracefully.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -373,8 +416,8 @@ def test_evaluate_turn_handles_exceptions(self, handler, mock_metric_manager): assert "Test error" in reason def test_evaluate_turn_uses_default_params_when_none_provided( - self, handler, mock_metric_manager - ): + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that default evaluation_params are used when none provided.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -413,7 +456,9 @@ def test_evaluate_turn_uses_default_params_when_none_provided( LLMTestCaseParams.ACTUAL_OUTPUT, ] - def test_evaluate_conversation_success(self, handler, mock_metric_manager): + def test_evaluate_conversation_success( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test 
successful conversation-level evaluation.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -453,7 +498,9 @@ def test_evaluate_conversation_success(self, handler, mock_metric_manager): assert reason == "Conversation coherent" mock_metric.measure.assert_called_once() - def test_evaluate_conversation_aggregates_turns(self, handler, mock_metric_manager): + def test_evaluate_conversation_aggregates_turns( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that conversation evaluation properly aggregates turn data.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -512,8 +559,8 @@ def test_evaluate_conversation_aggregates_turns(self, handler, mock_metric_manag assert "Turn 3 - Assistant:" in call_kwargs["actual_output"] def test_evaluate_conversation_with_evaluation_steps( - self, handler, mock_metric_manager - ): + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that evaluation_steps are passed to GEval when provided.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -558,8 +605,8 @@ def test_evaluate_conversation_with_evaluation_steps( ] def test_evaluate_conversation_handles_exceptions( - self, handler, mock_metric_manager - ): + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that conversation evaluation handles exceptions gracefully.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" diff --git a/tests/unit/core/metrics/test_keywords_eval.py b/tests/unit/core/metrics/test_keywords_eval.py index df140918..19cc0eea 100644 --- a/tests/unit/core/metrics/test_keywords_eval.py +++ b/tests/unit/core/metrics/test_keywords_eval.py @@ -7,7 +7,7 @@ class TestKeywordsEval: """Test cases for keywords eval metric.""" - def test_keywords_eval_first_list_all_matched(self): + def test_keywords_eval_first_list_all_matched(self) -> None: """Test successful keywords evaluation when first list has all keywords matched.""" turn_data = TurnData( turn_id="test_turn", @@ -25,7 +25,7 @@ def test_keywords_eval_first_list_all_matched(self): assert "Keywords eval successful: Option 1" in reason assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason - def test_keywords_eval_first_list_fails_second_succeeds(self): + def test_keywords_eval_first_list_fails_second_succeeds(self) -> None: """Test keywords evaluation when first list fails but second list succeeds.""" turn_data = TurnData( turn_id="test_turn", @@ -46,7 +46,7 @@ def test_keywords_eval_first_list_fails_second_succeeds(self): assert "Keywords eval successful: Option 2" in reason assert "all keywords matched: 'monitoring', 'confirmed'" in reason - def test_keywords_eval_all_lists_fail(self): + def test_keywords_eval_all_lists_fail(self) -> None: """Test keywords evaluation when all lists fail.""" turn_data = TurnData( turn_id="test_turn", @@ -70,7 +70,7 @@ def test_keywords_eval_all_lists_fail(self): "Option 2: unmatched ['confirmed', 'monitoring'], matched [none]" in reason ) - def test_keywords_eval_partial_match_in_failed_list(self): + def test_keywords_eval_partial_match_in_failed_list(self) -> None: """Test keywords evaluation with partial matches in failed lists.""" turn_data = TurnData( turn_id="test_turn", @@ -92,7 +92,7 @@ def test_keywords_eval_partial_match_in_failed_list(self): assert "Option 1: unmatched ['yes', 'confirmed'], matched [none]" in reason assert "Option 2: unmatched ['openshift'], matched ['monitoring']" in reason - def test_keywords_eval_case_insensitive(self): 
+ def test_keywords_eval_case_insensitive(self) -> None: """Test that keywords evaluation is case insensitive.""" turn_data = TurnData( turn_id="test_turn", @@ -109,7 +109,7 @@ def test_keywords_eval_case_insensitive(self): assert "Keywords eval successful: Option 1" in reason assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason - def test_keywords_eval_substring_matching(self): + def test_keywords_eval_substring_matching(self) -> None: """Test that keywords evaluation works with substring matching.""" turn_data = TurnData( turn_id="test_turn", @@ -129,7 +129,7 @@ def test_keywords_eval_substring_matching(self): assert "Keywords eval successful: Option 1" in reason assert "all keywords matched: 'monitoring', 'success'" in reason - def test_keywords_eval_no_expected_keywords(self): + def test_keywords_eval_no_expected_keywords(self) -> None: """Test keywords evaluation when no expected keywords provided.""" turn_data = TurnData( turn_id="test_turn", @@ -143,7 +143,7 @@ def test_keywords_eval_no_expected_keywords(self): assert score is None assert "No expected keywords provided" in reason - def test_keywords_eval_no_response(self): + def test_keywords_eval_no_response(self) -> None: """Test keywords evaluation when no response provided.""" turn_data = TurnData( turn_id="test_turn", @@ -157,7 +157,7 @@ def test_keywords_eval_no_response(self): assert score == 0.0 assert "No response provided" in reason - def test_keywords_eval_empty_response(self): + def test_keywords_eval_empty_response(self) -> None: """Test keywords evaluation with empty response.""" # Create turn data with valid response first, then modify it turn_data = TurnData( @@ -174,14 +174,14 @@ def test_keywords_eval_empty_response(self): assert score == 0.0 assert "No response provided" in reason - def test_keywords_eval_conversation_level_error(self): + def test_keywords_eval_conversation_level_error(self) -> None: """Test that keywords_eval returns error for conversation-level evaluation.""" score, reason = evaluate_keywords(None, None, None, True) assert score is None assert "Keywords eval is a turn-level metric" in reason - def test_keywords_eval_no_turn_data(self): + def test_keywords_eval_no_turn_data(self) -> None: """Test keywords evaluation when no turn data provided.""" score, reason = evaluate_keywords(None, 0, None, False) diff --git a/tests/unit/core/metrics/test_manager.py b/tests/unit/core/metrics/test_manager.py index b6fc312d..756f2c8e 100644 --- a/tests/unit/core/metrics/test_manager.py +++ b/tests/unit/core/metrics/test_manager.py @@ -1,7 +1,5 @@ """Unit tests for core metrics manager module.""" -import pytest - from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager from lightspeed_evaluation.core.models import ( EvaluationData, @@ -10,50 +8,12 @@ ) -@pytest.fixture -def system_config(): - """Create a test system config with metrics metadata.""" - config = SystemConfig() - - # Set up test metrics metadata - config.default_turn_metrics_metadata = { - "ragas:faithfulness": { - "threshold": 0.7, - "default": True, - "description": "Test", - }, - "ragas:response_relevancy": { - "threshold": 0.8, - "default": False, - "description": "Test", - }, - "custom:answer_correctness": { - "threshold": 0.75, - "default": True, - "description": "Test", - }, - } - - config.default_conversation_metrics_metadata = { - "deepeval:conversation_completeness": { - "threshold": 0.6, - "default": True, - "description": "Test", - }, - "deepeval:conversation_relevancy": { - "threshold": 0.7, - 
"default": False, - "description": "Test", - }, - } - - return config - - -class TestMetricManager: +class TestMetricManager: # pylint: disable=too-many-public-methods """Unit tests for MetricManager.""" - def test_resolve_metrics_with_none_uses_defaults(self, system_config): + def test_resolve_metrics_with_none_uses_defaults( + self, system_config: SystemConfig + ) -> None: """Test that None resolves to system defaults.""" manager = MetricManager(system_config) @@ -64,16 +24,20 @@ def test_resolve_metrics_with_none_uses_defaults(self, system_config): assert "custom:answer_correctness" in metrics assert "ragas:response_relevancy" not in metrics # default=False - def test_resolve_metrics_with_empty_list_skips_evaluation(self, system_config): + def test_resolve_metrics_with_empty_list_skips_evaluation( + self, system_config: SystemConfig + ) -> None: """Test that empty list skips evaluation.""" manager = MetricManager(system_config) metrics = manager.resolve_metrics([], MetricLevel.TURN) # Should return empty list - assert metrics == [] + assert not metrics - def test_resolve_metrics_with_explicit_list(self, system_config): + def test_resolve_metrics_with_explicit_list( + self, system_config: SystemConfig + ) -> None: """Test that explicit list is returned as-is.""" manager = MetricManager(system_config) @@ -83,7 +47,9 @@ def test_resolve_metrics_with_explicit_list(self, system_config): # Should return the exact list provided assert metrics == explicit_metrics - def test_resolve_metrics_conversation_level_defaults(self, system_config): + def test_resolve_metrics_conversation_level_defaults( + self, system_config: SystemConfig + ) -> None: """Test conversation-level default metrics.""" manager = MetricManager(system_config) @@ -93,7 +59,9 @@ def test_resolve_metrics_conversation_level_defaults(self, system_config): assert "deepeval:conversation_completeness" in metrics assert "deepeval:conversation_relevancy" not in metrics - def test_get_metric_metadata_from_system_defaults(self, system_config): + def test_get_metric_metadata_from_system_defaults( + self, system_config: SystemConfig + ) -> None: """Test getting full metadata from system defaults.""" manager = MetricManager(system_config) @@ -106,7 +74,9 @@ def test_get_metric_metadata_from_system_defaults(self, system_config): assert metadata["default"] is True assert metadata["description"] == "Test" - def test_get_metric_metadata_turn_level_override(self, system_config): + def test_get_metric_metadata_turn_level_override( + self, system_config: SystemConfig + ) -> None: """Test turn-level metadata completely overrides system defaults.""" manager = MetricManager(system_config) @@ -133,7 +103,9 @@ def test_get_metric_metadata_turn_level_override(self, system_config): assert "default" not in metadata assert "description" not in metadata - def test_get_metric_metadata_conversation_level_override(self, system_config): + def test_get_metric_metadata_conversation_level_override( + self, system_config: SystemConfig + ) -> None: """Test conversation-level metadata overrides system defaults.""" manager = MetricManager(system_config) @@ -159,7 +131,7 @@ def test_get_metric_metadata_conversation_level_override(self, system_config): assert metadata["threshold"] == 0.85 assert metadata["criteria"] == "Custom criteria" - def test_get_metric_metadata_not_found(self, system_config): + def test_get_metric_metadata_not_found(self, system_config: SystemConfig) -> None: """Test getting metadata for unknown metric returns None.""" manager = 
MetricManager(system_config) @@ -167,7 +139,7 @@ def test_get_metric_metadata_not_found(self, system_config): assert metadata is None - def test_get_metric_metadata_preserves_all_fields(self, system_config): + def test_get_metric_metadata_preserves_all_fields(self) -> None: """Test that all metadata fields are preserved.""" config = SystemConfig() config.default_turn_metrics_metadata = { @@ -198,7 +170,9 @@ def test_get_metric_metadata_preserves_all_fields(self, system_config): assert metadata["default"] is True assert metadata["description"] == "GEval metric for technical accuracy" - def test_get_effective_threshold_from_system_defaults(self, system_config): + def test_get_effective_threshold_from_system_defaults( + self, system_config: SystemConfig + ) -> None: """Test getting threshold from system defaults.""" manager = MetricManager(system_config) @@ -208,7 +182,9 @@ def test_get_effective_threshold_from_system_defaults(self, system_config): assert threshold == 0.7 - def test_get_effective_threshold_turn_level_override(self, system_config): + def test_get_effective_threshold_turn_level_override( + self, system_config: SystemConfig + ) -> None: """Test turn-level metadata overrides system defaults.""" manager = MetricManager(system_config) @@ -226,7 +202,9 @@ def test_get_effective_threshold_turn_level_override(self, system_config): # Should use turn-specific threshold assert threshold == 0.9 - def test_get_effective_threshold_conversation_level_override(self, system_config): + def test_get_effective_threshold_conversation_level_override( + self, system_config: SystemConfig + ) -> None: """Test conversation-level metadata overrides system defaults.""" manager = MetricManager(system_config) @@ -248,7 +226,9 @@ def test_get_effective_threshold_conversation_level_override(self, system_config # Should use conversation-specific threshold assert threshold == 0.85 - def test_get_effective_threshold_not_found(self, system_config): + def test_get_effective_threshold_not_found( + self, system_config: SystemConfig + ) -> None: """Test getting threshold for unknown metric returns None.""" manager = MetricManager(system_config) @@ -256,7 +236,9 @@ def test_get_effective_threshold_not_found(self, system_config): assert threshold is None - def test_get_effective_threshold_no_metadata_at_level(self, system_config): + def test_get_effective_threshold_no_metadata_at_level( + self, system_config: SystemConfig + ) -> None: """Test threshold lookup when no metadata at level.""" manager = MetricManager(system_config) @@ -274,7 +256,9 @@ def test_get_effective_threshold_no_metadata_at_level(self, system_config): # Should fall back to system defaults assert threshold == 0.7 - def test_get_effective_threshold_metric_not_in_level_metadata(self, system_config): + def test_get_effective_threshold_metric_not_in_level_metadata( + self, system_config: SystemConfig + ) -> None: """Test threshold for metric not in level metadata.""" manager = MetricManager(system_config) @@ -293,7 +277,9 @@ def test_get_effective_threshold_metric_not_in_level_metadata(self, system_confi # Should fall back to system defaults assert threshold == 0.7 - def test_count_metrics_for_conversation_all_defaults(self, system_config): + def test_count_metrics_for_conversation_all_defaults( + self, system_config: SystemConfig + ) -> None: """Test counting metrics when using all defaults.""" manager = MetricManager(system_config) @@ -313,7 +299,9 @@ def test_count_metrics_for_conversation_all_defaults(self, system_config): assert 
counts["conversation_metrics"] == 1 assert counts["total_turns"] == 2 - def test_count_metrics_for_conversation_explicit_metrics(self, system_config): + def test_count_metrics_for_conversation_explicit_metrics( + self, system_config: SystemConfig + ) -> None: """Test counting with explicit metrics.""" manager = MetricManager(system_config) @@ -340,7 +328,9 @@ def test_count_metrics_for_conversation_explicit_metrics(self, system_config): assert counts["conversation_metrics"] == 1 assert counts["total_turns"] == 2 - def test_count_metrics_for_conversation_skip_evaluation(self, system_config): + def test_count_metrics_for_conversation_skip_evaluation( + self, system_config: SystemConfig + ) -> None: """Test counting when evaluation is skipped.""" manager = MetricManager(system_config) @@ -359,7 +349,9 @@ def test_count_metrics_for_conversation_skip_evaluation(self, system_config): assert counts["conversation_metrics"] == 0 assert counts["total_turns"] == 1 - def test_count_metrics_for_conversation_mixed(self, system_config): + def test_count_metrics_for_conversation_mixed( + self, system_config: SystemConfig + ) -> None: """Test counting with mixed default and explicit metrics.""" manager = MetricManager(system_config) @@ -383,7 +375,7 @@ def test_count_metrics_for_conversation_mixed(self, system_config): assert counts["conversation_metrics"] == 1 assert counts["total_turns"] == 3 - def test_extract_default_metrics_empty_metadata(self): + def test_extract_default_metrics_empty_metadata(self) -> None: """Test extracting defaults when no metrics have default=true.""" config = SystemConfig() config.default_turn_metrics_metadata = { @@ -397,7 +389,9 @@ def test_extract_default_metrics_empty_metadata(self): # Should return empty list when no defaults assert metrics == [] - def test_get_effective_threshold_with_both_metadata_sources(self, system_config): + def test_get_effective_threshold_with_both_metadata_sources( + self, system_config: SystemConfig + ) -> None: """Test that level metadata takes priority over system defaults.""" manager = MetricManager(system_config) diff --git a/tests/unit/core/metrics/test_nlp.py b/tests/unit/core/metrics/test_nlp.py index ad3225dd..453cb27c 100644 --- a/tests/unit/core/metrics/test_nlp.py +++ b/tests/unit/core/metrics/test_nlp.py @@ -15,6 +15,7 @@ import sys import pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.core.constants import ( ROUGE_TYPE_ROUGE1, @@ -25,109 +26,10 @@ from lightspeed_evaluation.core.system.exceptions import MetricError -# ============================================================================ -# Fixtures -# ============================================================================ - - -@pytest.fixture -def nlp_metrics(): - """Create NLPMetrics instance.""" - return NLPMetrics() - - -@pytest.fixture -def sample_turn_data(): - """Create sample TurnData for testing.""" - return TurnData( - turn_id="test_turn", - query="What is the capital of France?", - response="The capital of France is Paris.", - expected_response="The capital of France is Paris.", - ) - - -@pytest.fixture -def sample_scope(sample_turn_data): - """Create sample EvaluationScope for turn-level evaluation.""" - return EvaluationScope( - turn_idx=0, - turn_data=sample_turn_data, - is_conversation=False, - ) - - -@pytest.fixture -def conversation_scope(sample_turn_data): - """Create sample EvaluationScope for conversation-level evaluation.""" - return EvaluationScope( - turn_idx=0, - turn_data=sample_turn_data, - is_conversation=True, - ) - - 
-@pytest.fixture -def mock_bleu_scorer(mocker): - """Mock sacrebleu BLEU with configurable return value. - - Uses sys.modules injection to mock sacrebleu without requiring it to be installed. - """ - mock_result = mocker.MagicMock() - mock_result.score = 85.0 # sacrebleu returns 0-100 scale - - mock_scorer_instance = mocker.MagicMock() - mock_scorer_instance.corpus_score = mocker.MagicMock(return_value=mock_result) - - mock_bleu_class = mocker.MagicMock(return_value=mock_scorer_instance) - - # Create a fake sacrebleu module and inject it into sys.modules - mock_sacrebleu = mocker.MagicMock() - mock_sacrebleu.BLEU = mock_bleu_class - mocker.patch.dict(sys.modules, {"sacrebleu": mock_sacrebleu}) - - return mock_scorer_instance - - -@pytest.fixture -def mock_rouge_scorer(mocker): - """Mock RougeScore with configurable return value. - - Returns different scores for precision, recall, fmeasure. - """ - mock_scorer_instance = mocker.MagicMock() - # Return scores for precision, recall, fmeasure (called in that order) - mock_scorer_instance.single_turn_score = mocker.MagicMock( - side_effect=[0.95, 0.89, 0.92] - ) - mocker.patch( - "lightspeed_evaluation.core.metrics.nlp.RougeScore", - return_value=mock_scorer_instance, - ) - return mock_scorer_instance - - -@pytest.fixture -def mock_similarity_scorer(mocker): - """Mock NonLLMStringSimilarity with configurable return value.""" - mock_scorer_instance = mocker.MagicMock() - mock_scorer_instance.single_turn_score = mocker.MagicMock(return_value=0.78) - mocker.patch( - "lightspeed_evaluation.core.metrics.nlp.NonLLMStringSimilarity", - return_value=mock_scorer_instance, - ) - return mock_scorer_instance - - -# ============================================================================ -# Tests -# ============================================================================ - - -class TestNLPMetricsInit: +class TestNLPMetricsInit: # pylint: disable=too-few-public-methods """Test NLPMetrics initialization.""" - def test_initialization(self, nlp_metrics): + def test_initialization(self, nlp_metrics: NLPMetrics) -> None: """Test that NLPMetrics initializes correctly.""" assert nlp_metrics is not None assert "bleu" in nlp_metrics.supported_metrics @@ -138,14 +40,18 @@ def test_initialization(self, nlp_metrics): class TestNLPMetricsValidation: """Tests for metric-level validation.""" - def test_conversation_level_rejected(self, nlp_metrics, conversation_scope): + def test_conversation_level_rejected( + self, nlp_metrics: NLPMetrics, conversation_scope: EvaluationScope + ) -> None: """Test that NLP metrics reject conversation-level evaluation.""" score, reason = nlp_metrics.evaluate("bleu", None, conversation_scope) assert score is None assert "turn-level metric" in reason - def test_unsupported_metric(self, nlp_metrics, sample_scope): + def test_unsupported_metric( + self, nlp_metrics: NLPMetrics, sample_scope: EvaluationScope + ) -> None: """Test evaluate with unsupported metric name.""" score, reason = nlp_metrics.evaluate("unsupported_metric", None, sample_scope) @@ -157,16 +63,22 @@ class TestBLEUScore: """Tests for BLEU score metric.""" def test_bleu_successful_evaluation( - self, nlp_metrics, sample_scope, mock_bleu_scorer - ): + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mock_bleu_scorer: MockerFixture, + ) -> None: """Test BLEU score with valid inputs.""" + assert mock_bleu_scorer is not None # Fixture sets up the mock score, reason = nlp_metrics.evaluate("bleu", None, sample_scope) assert score is not None assert score == 
pytest.approx(0.85, abs=0.01) assert "NLP BLEU" in reason - def test_bleu_with_custom_ngram(self, nlp_metrics, mocker): + def test_bleu_with_custom_ngram( + self, nlp_metrics: NLPMetrics, mocker: MockerFixture + ) -> None: """Test BLEU score with custom max_ngram configuration.""" mock_result = mocker.MagicMock() mock_result.score = 90.0 @@ -200,7 +112,9 @@ def test_bleu_with_custom_ngram(self, nlp_metrics, mocker): # Verify BLEU was initialized with max_ngram_order=2 mock_bleu_class.assert_called_once_with(max_ngram_order=2) - def test_bleu_with_invalid_ngram_uses_default(self, nlp_metrics, mocker): + def test_bleu_with_invalid_ngram_uses_default( + self, nlp_metrics: NLPMetrics, mocker: MockerFixture + ) -> None: """Test BLEU score falls back to default when invalid max_ngram provided.""" mock_result = mocker.MagicMock() mock_result.score = 85.0 @@ -237,9 +151,13 @@ class TestROUGEScore: """Tests for ROUGE score metric.""" def test_rouge_successful_evaluation( - self, nlp_metrics, sample_scope, mock_rouge_scorer - ): + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mock_rouge_scorer: MockerFixture, + ) -> None: """Test ROUGE score with valid inputs.""" + assert mock_rouge_scorer is not None # Fixture sets up the mock score, reason = nlp_metrics.evaluate("rouge", None, sample_scope) assert score is not None @@ -250,7 +168,9 @@ def test_rouge_successful_evaluation( assert "recall" in reason assert "fmeasure" in reason - def test_rouge_with_custom_rouge_type(self, nlp_metrics, mocker): + def test_rouge_with_custom_rouge_type( + self, nlp_metrics: NLPMetrics, mocker: MockerFixture + ) -> None: """Test ROUGE score with custom rouge_type via turn_metrics_metadata.""" mock_scorer_instance = mocker.MagicMock() # Return different scores for each mode (precision, recall, fmeasure) @@ -290,9 +210,13 @@ class TestSemanticSimilarityDistance: """Tests for string distance similarity (NonLLMStringSimilarity) metric.""" def test_semantic_similarity_distance_successful_evaluation( - self, nlp_metrics, sample_scope, mock_similarity_scorer - ): + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mock_similarity_scorer: MockerFixture, + ) -> None: """Test string distance similarity with valid inputs.""" + assert mock_similarity_scorer is not None # Fixture sets up the mock score, reason = nlp_metrics.evaluate( "semantic_similarity_distance", None, sample_scope ) @@ -302,8 +226,8 @@ def test_semantic_similarity_distance_successful_evaluation( assert "NLP String Distance" in reason def test_semantic_similarity_distance_with_custom_measure( - self, nlp_metrics, mocker - ): + self, nlp_metrics: NLPMetrics, mocker: MockerFixture + ) -> None: """Test string distance similarity with custom distance measure config.""" mock_scorer_instance = mocker.MagicMock() mock_scorer_instance.single_turn_score = mocker.MagicMock(return_value=0.95) @@ -336,7 +260,12 @@ def test_semantic_similarity_distance_with_custom_measure( class TestMetricErrorHandling: """Tests for error handling across all NLP metrics.""" - def test_bleu_failure_raises_metric_error(self, nlp_metrics, sample_scope, mocker): + def test_bleu_failure_raises_metric_error( + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mocker: MockerFixture, + ) -> None: """Test that BLEU raises MetricError when scoring fails.""" mock_scorer_instance = mocker.MagicMock() mock_scorer_instance.corpus_score = mocker.MagicMock( @@ -363,9 +292,14 @@ def test_bleu_failure_raises_metric_error(self, nlp_metrics, sample_scope, 
mocke ), ], ) - def test_ragas_metric_failure_raises_metric_error( - self, nlp_metrics, sample_scope, mocker, metric_name, scorer_path - ): + def test_ragas_metric_failure_raises_metric_error( # pylint: disable=too-many-arguments,too-many-positional-arguments + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mocker: MockerFixture, + metric_name: str, + scorer_path: str, + ) -> None: """Test that Ragas-based metrics raise MetricError when scoring fails.""" mock_scorer_instance = mocker.MagicMock() mock_scorer_instance.single_turn_score = mocker.MagicMock( diff --git a/tests/unit/core/models/test_api_additional.py b/tests/unit/core/models/test_api_additional.py index 8456cd46..77fb68c0 100644 --- a/tests/unit/core/models/test_api_additional.py +++ b/tests/unit/core/models/test_api_additional.py @@ -14,7 +14,7 @@ class TestRAGChunk: """Tests for RAGChunk model.""" - def test_rag_chunk_creation(self): + def test_rag_chunk_creation(self) -> None: """Test creating RAG chunk.""" chunk = RAGChunk(content="test content", source="test source", score=0.95) @@ -22,22 +22,26 @@ def test_rag_chunk_creation(self): assert chunk.source == "test source" assert chunk.score == 0.95 - def test_rag_chunk_without_score(self): + def test_rag_chunk_without_score(self) -> None: """Test RAG chunk without score.""" chunk = RAGChunk(content="content", source="source") assert chunk.score is None - def test_rag_chunk_extra_field_forbidden(self): + def test_rag_chunk_extra_field_forbidden(self) -> None: """Test that extra fields are forbidden.""" with pytest.raises(ValidationError): - RAGChunk(content="content", source="source", extra_field="not allowed") + RAGChunk( + content="content", + source="source", + extra_field="not allowed", # pyright: ignore[reportCallIssue] + ) class TestAttachmentData: """Tests for AttachmentData model.""" - def test_attachment_creation(self): + def test_attachment_creation(self) -> None: """Test creating attachment.""" attachment = AttachmentData(content="file content") @@ -45,7 +49,7 @@ def test_attachment_creation(self): assert attachment.attachment_type == "configuration" assert attachment.content_type == "text/plain" - def test_attachment_custom_type(self): + def test_attachment_custom_type(self) -> None: """Test attachment with custom types.""" attachment = AttachmentData( content="yaml data", @@ -60,7 +64,7 @@ def test_attachment_custom_type(self): class TestAPIRequest: """Tests for APIRequest model.""" - def test_create_simple_request(self): + def test_create_simple_request(self) -> None: """Test creating simple API request.""" request = APIRequest.create(query="What is Python?") @@ -68,7 +72,7 @@ def test_create_simple_request(self): assert request.provider is None assert request.model is None - def test_create_request_with_all_params(self): + def test_create_request_with_all_params(self) -> None: """Test creating request with all parameters.""" request = APIRequest.create( query="Test query", @@ -86,7 +90,7 @@ def test_create_request_with_all_params(self): assert request.conversation_id == "conv123" assert request.system_prompt == "Custom prompt" - def test_create_request_with_attachments(self): + def test_create_request_with_attachments(self) -> None: """Test creating request with attachments.""" # APIRequest.create expects string attachments, not AttachmentData objects attachments = ["file1", "file2"] @@ -98,9 +102,12 @@ def test_create_request_with_attachments(self): assert request.attachments is not None assert len(request.attachments) == 2 - assert 
request.attachments[0].content == "file1" + assert ( + request.attachments[0].content # pylint: disable=unsubscriptable-object + == "file1" + ) - def test_request_empty_query_validation(self): + def test_request_empty_query_validation(self) -> None: """Test that empty query fails validation.""" with pytest.raises(ValidationError): APIRequest(query="") @@ -109,7 +116,7 @@ def test_request_empty_query_validation(self): class TestAPIResponse: """Tests for APIResponse model.""" - def test_response_creation(self): + def test_response_creation(self) -> None: """Test creating API response.""" response = APIResponse( response="Test response", @@ -121,16 +128,16 @@ def test_response_creation(self): assert response.conversation_id == "conv123" assert len(response.contexts) == 2 - def test_response_empty_contexts(self): + def test_response_empty_contexts(self) -> None: """Test response with empty contexts.""" response = APIResponse( response="Test", conversation_id="conv123", ) - assert response.contexts == [] + assert not response.contexts - def test_response_with_tool_calls(self): + def test_response_with_tool_calls(self) -> None: """Test response with tool calls.""" response = APIResponse( response="Test", @@ -140,7 +147,7 @@ def test_response_with_tool_calls(self): assert len(response.tool_calls) == 1 - def test_from_raw_response(self): + def test_from_raw_response(self) -> None: """Test creating response from raw API data.""" raw_data = { "response": "Test response", @@ -158,14 +165,14 @@ def test_from_raw_response(self): assert len(response.contexts) == 2 assert "chunk1" in response.contexts - def test_from_raw_response_without_conversation_id(self): + def test_from_raw_response_without_conversation_id(self) -> None: """Test that from_raw_response fails without conversation_id.""" raw_data = {"response": "Test"} with pytest.raises(ValueError, match="conversation_id is required"): APIResponse.from_raw_response(raw_data) - def test_response_with_streaming_performance_metrics(self): + def test_response_with_streaming_performance_metrics(self) -> None: """Test response with streaming performance metrics.""" response = APIResponse( response="Test", @@ -179,7 +186,7 @@ def test_response_with_streaming_performance_metrics(self): assert response.streaming_duration == 2.5 assert response.tokens_per_second == 85.3 - def test_response_without_streaming_metrics(self): + def test_response_without_streaming_metrics(self) -> None: """Test response defaults for streaming metrics (None for non-streaming).""" response = APIResponse( response="Test", @@ -190,7 +197,7 @@ def test_response_without_streaming_metrics(self): assert response.streaming_duration is None assert response.tokens_per_second is None - def test_from_raw_response_with_streaming_metrics(self): + def test_from_raw_response_with_streaming_metrics(self) -> None: """Test creating response from raw data with streaming metrics.""" raw_data = { "response": "Test response", @@ -210,7 +217,7 @@ def test_from_raw_response_with_streaming_metrics(self): assert response.streaming_duration == 3.456 assert response.tokens_per_second == 46.5 - def test_from_raw_response_without_streaming_metrics(self): + def test_from_raw_response_without_streaming_metrics(self) -> None: """Test creating response from raw data without streaming metrics (query endpoint).""" raw_data = { "response": "Test response", diff --git a/tests/unit/core/models/test_data.py b/tests/unit/core/models/test_data.py index 5ae0a876..3afc4a73 100644 --- a/tests/unit/core/models/test_data.py +++ 
b/tests/unit/core/models/test_data.py @@ -13,7 +13,7 @@ class TestTurnData: """General tests for TurnData model.""" - def test_minimal_fields(self): + def test_minimal_fields(self) -> None: """Test TurnData with only required fields.""" turn = TurnData(turn_id="turn1", query="Test query") @@ -22,12 +22,12 @@ def test_minimal_fields(self): assert turn.response is None assert turn.contexts is None - def test_empty_turn_id_fails(self): + def test_empty_turn_id_fails(self) -> None: """Test that empty turn_id fails validation.""" with pytest.raises(ValidationError): TurnData(turn_id="", query="Test") - def test_empty_query_fails(self): + def test_empty_query_fails(self) -> None: """Test that empty query fails validation.""" with pytest.raises(ValidationError): TurnData(turn_id="turn1", query="") @@ -36,13 +36,13 @@ def test_empty_query_fails(self): class TestTurnDataToolCallsValidation: """Test cases for TurnData expected_tool_calls field validation and conversion.""" - def test_single_set_format_conversion(self): + def test_single_set_format_conversion(self) -> None: """Test that single set format is converted to multiple sets format.""" # Single set format (backward compatibility) turn_data = TurnData( turn_id="test_single", query="Test query", - expected_tool_calls=[ + expected_tool_calls=[ # pyright: ignore[reportArgumentType] [{"tool_name": "test_tool", "arguments": {"key": "value"}}] ], ) @@ -50,12 +50,21 @@ def test_single_set_format_conversion(self): # Should be converted to multiple sets format expected = turn_data.expected_tool_calls assert expected is not None - assert len(expected) == 1 # One alternative set - assert len(expected[0]) == 1 # One sequence in the set - assert len(expected[0][0]) == 1 # One tool call in the sequence - assert expected[0][0][0]["tool_name"] == "test_tool" + assert ( + len(expected) == 1 # pylint: disable=unsubscriptable-object + ) # One alternative set + assert ( + len(expected[0]) == 1 # pylint: disable=unsubscriptable-object + ) # One sequence in the set + assert ( + len(expected[0][0]) == 1 # pylint: disable=unsubscriptable-object + ) # One tool call in the sequence + assert ( + expected[0][0][0]["tool_name"] # pylint: disable=unsubscriptable-object + == "test_tool" + ) - def test_multiple_sets_format_preserved(self): + def test_multiple_sets_format_preserved(self) -> None: """Test that multiple sets format is preserved as-is.""" # Multiple sets format turn_data = TurnData( @@ -70,10 +79,16 @@ def test_multiple_sets_format_preserved(self): expected = turn_data.expected_tool_calls assert expected is not None assert len(expected) == 2 # Two alternative sets - assert expected[0][0][0]["tool_name"] == "tool1" - assert expected[1][0][0]["tool_name"] == "tool2" + assert ( + expected[0][0][0]["tool_name"] # pylint: disable=unsubscriptable-object + == "tool1" + ) + assert ( + expected[1][0][0]["tool_name"] # pylint: disable=unsubscriptable-object + == "tool2" + ) - def test_empty_alternatives_allowed(self): + def test_empty_alternatives_allowed(self) -> None: """Test that empty alternatives are allowed as fallback.""" turn_data = TurnData( turn_id="test_flexible", @@ -87,10 +102,14 @@ def test_empty_alternatives_allowed(self): expected = turn_data.expected_tool_calls assert expected is not None assert len(expected) == 2 - assert len(expected[0]) == 1 # First set has one sequence - assert len(expected[1]) == 0 # Second set is empty - - def test_complex_sequences(self): + assert ( + len(expected[0]) == 1 # pylint: disable=unsubscriptable-object + ) # First set 
has one sequence + assert ( + len(expected[1]) == 0 # pylint: disable=unsubscriptable-object + ) # Second set is empty + + def test_complex_sequences(self) -> None: """Test complex tool call sequences.""" turn_data = TurnData( turn_id="test_complex", @@ -107,17 +126,21 @@ def test_complex_sequences(self): expected = turn_data.expected_tool_calls assert expected is not None assert len(expected) == 2 - assert len(expected[0]) == 2 # Two sequences in first set - assert len(expected[1]) == 1 # One sequence in second set - - def test_none_expected_tool_calls(self): + assert ( + len(expected[0]) == 2 # pylint: disable=unsubscriptable-object + ) # Two sequences in first set + assert ( + len(expected[1]) == 1 # pylint: disable=unsubscriptable-object + ) # One sequence in second set + + def test_none_expected_tool_calls(self) -> None: """Test that None is handled correctly.""" turn_data = TurnData( turn_id="test_none", query="Test query", expected_tool_calls=None ) assert turn_data.expected_tool_calls is None - def test_regex_arguments_preserved(self): + def test_regex_arguments_preserved(self) -> None: """Test that regex patterns in arguments are preserved.""" turn_data = TurnData( turn_id="test_regex", @@ -129,18 +152,23 @@ def test_regex_arguments_preserved(self): expected = turn_data.expected_tool_calls assert expected is not None - assert expected[0][0][0]["arguments"]["name"] == "web-server-[0-9]+" + assert ( + expected[0][0][0]["arguments"][ # pylint: disable=unsubscriptable-object + "name" + ] + == "web-server-[0-9]+" + ) - def test_invalid_format_rejected(self): + def test_invalid_format_rejected(self) -> None: """Test that non-list format is rejected.""" with pytest.raises(ValidationError): TurnData( turn_id="test_invalid", query="Test query", - expected_tool_calls="not_a_list", + expected_tool_calls="not_a_list", # pyright: ignore[reportArgumentType] ) - def test_invalid_tool_call_structure_rejected(self): + def test_invalid_tool_call_structure_rejected(self) -> None: """Test that invalid tool call structure is rejected.""" with pytest.raises(ValidationError): TurnData( @@ -149,7 +177,7 @@ def test_invalid_tool_call_structure_rejected(self): expected_tool_calls=[[[{"invalid": "structure"}]]], ) - def test_empty_sequence_rejected(self): + def test_empty_sequence_rejected(self) -> None: """Test that empty sequences are rejected.""" with pytest.raises( ValidationError, @@ -161,7 +189,7 @@ def test_empty_sequence_rejected(self): expected_tool_calls=[[]], ) - def test_empty_set_as_first_element_rejected(self): + def test_empty_set_as_first_element_rejected(self) -> None: """Test that empty set as the first element is rejected.""" with pytest.raises(ValidationError, match="Empty set cannot be the first"): TurnData( @@ -170,7 +198,7 @@ def test_empty_set_as_first_element_rejected(self): expected_tool_calls=[[], []], ) - def test_multiple_empty_alternatives_rejected(self): + def test_multiple_empty_alternatives_rejected(self) -> None: """Test that multiple empty alternatives are rejected as redundant.""" with pytest.raises( ValidationError, match="Found 2 empty alternatives.*redundant" @@ -189,19 +217,19 @@ def test_multiple_empty_alternatives_rejected(self): class TestTurnDataFormatDetection: """Test cases for format detection logic.""" - def test_empty_list_rejected(self): + def test_empty_list_rejected(self) -> None: """Test that empty list is rejected.""" with pytest.raises( ValidationError, match="Empty set cannot be the only alternative" ): TurnData(turn_id="test", query="Test", 
expected_tool_calls=[]) - def test_is_single_set_format_detection(self): + def test_is_single_set_format_detection(self) -> None: """Test detection of single set format.""" turn_data = TurnData( turn_id="test", query="Test", - expected_tool_calls=[ + expected_tool_calls=[ # pyright: ignore[reportArgumentType] [{"tool_name": "tool1", "arguments": {}}], [{"tool_name": "tool2", "arguments": {}}], ], @@ -210,7 +238,9 @@ def test_is_single_set_format_detection(self): expected = turn_data.expected_tool_calls assert expected is not None assert len(expected) == 1 # One alternative set - assert len(expected[0]) == 2 # Two sequences in that set + assert ( + len(expected[0]) == 2 # pylint: disable=unsubscriptable-object + ) # Two sequences in that set class TestTurnDataExpectedResponseValidation: @@ -220,7 +250,7 @@ class TestTurnDataExpectedResponseValidation: "valid_response", ["Single word", ["Response option 1", "Response option 2"]], ) - def test_valid_expected_response(self, valid_response): + def test_valid_expected_response(self, valid_response: str | list[str]) -> None: """Test valid expected_response values.""" turn_data = TurnData( turn_id="test_turn", @@ -229,7 +259,7 @@ def test_valid_expected_response(self, valid_response): ) assert turn_data.expected_response == valid_response - def test_none_expected_response_valid(self): + def test_none_expected_response_valid(self) -> None: """Test that None is valid for expected_response.""" turn_data = TurnData( turn_id="test_turn", @@ -248,7 +278,9 @@ def test_none_expected_response_valid(self): (["valid", " "], "cannot be empty or whitespace"), ], ) - def test_invalid_expected_response(self, invalid_response, match_pattern): + def test_invalid_expected_response( + self, invalid_response: str | list[str], match_pattern: str + ) -> None: """Test that invalid expected_response values are rejected.""" with pytest.raises(ValidationError, match=match_pattern): TurnData( @@ -261,7 +293,7 @@ def test_invalid_expected_response(self, invalid_response, match_pattern): class TestTurnDataKeywordsValidation: """Test cases for expected_keywords validation in TurnData.""" - def test_valid_single_group(self): + def test_valid_single_group(self) -> None: """Test valid expected_keywords with single group.""" turn_data = TurnData( turn_id="test_turn", @@ -270,7 +302,7 @@ def test_valid_single_group(self): ) assert turn_data.expected_keywords == [["keyword1", "keyword2"]] - def test_valid_multiple_groups(self): + def test_valid_multiple_groups(self) -> None: """Test valid expected_keywords with multiple groups.""" turn_data = TurnData( turn_id="test_turn", @@ -280,23 +312,26 @@ def test_valid_multiple_groups(self): ["monitoring", "namespace"], ], ) + assert turn_data.expected_keywords is not None assert len(turn_data.expected_keywords) == 2 - def test_none_is_valid(self): + def test_none_is_valid(self) -> None: """Test that None is valid for expected_keywords.""" turn_data = TurnData( turn_id="test_turn", query="Test query", expected_keywords=None ) assert turn_data.expected_keywords is None - def test_non_list_rejected(self): + def test_non_list_rejected(self) -> None: """Test that non-list expected_keywords is rejected.""" with pytest.raises(ValidationError, match="Input should be a valid list"): TurnData( - turn_id="test_turn", query="Test query", expected_keywords="not_a_list" + turn_id="test_turn", + query="Test query", + expected_keywords="not_a_list", # pyright: ignore[reportArgumentType] ) - def test_empty_inner_list_rejected(self): + def 
test_empty_inner_list_rejected(self) -> None: """Test that empty inner lists are rejected.""" with pytest.raises(ValidationError, match="cannot be empty"): TurnData( @@ -305,7 +340,7 @@ def test_empty_inner_list_rejected(self): expected_keywords=[[], ["valid_list"]], ) - def test_empty_string_element_rejected(self): + def test_empty_string_element_rejected(self) -> None: """Test that empty string elements are rejected.""" with pytest.raises(ValidationError, match="cannot be empty or whitespace"): TurnData( @@ -318,7 +353,7 @@ def test_empty_string_element_rejected(self): class TestEvaluationData: """Tests for EvaluationData model.""" - def test_valid_creation(self): + def test_valid_creation(self) -> None: """Test EvaluationData creation with valid data.""" turns = [ TurnData(turn_id="turn1", query="First query"), @@ -337,30 +372,31 @@ def test_valid_creation(self): assert eval_data.tag == "test_tag" assert len(eval_data.turns) == 2 assert eval_data.description == "Test conversation" + assert eval_data.conversation_metrics is not None assert len(eval_data.conversation_metrics) == 1 - def test_default_tag_value(self): + def test_default_tag_value(self) -> None: """Test EvaluationData has correct default tag value.""" turn = TurnData(turn_id="turn1", query="Query") eval_data = EvaluationData(conversation_group_id="conv1", turns=[turn]) assert eval_data.tag == "eval" - def test_empty_tag_rejected(self): + def test_empty_tag_rejected(self) -> None: """Test that empty tag is rejected.""" turn = TurnData(turn_id="turn1", query="Query") with pytest.raises(ValidationError): EvaluationData(conversation_group_id="conv1", turns=[turn], tag="") - def test_empty_conversation_id_rejected(self): + def test_empty_conversation_id_rejected(self) -> None: """Test that empty conversation_group_id is rejected.""" turn = TurnData(turn_id="turn1", query="Query") with pytest.raises(ValidationError): EvaluationData(conversation_group_id="", turns=[turn]) - def test_empty_turns_rejected(self): + def test_empty_turns_rejected(self) -> None: """Test that empty turns list is rejected.""" with pytest.raises(ValidationError): EvaluationData(conversation_group_id="conv1", turns=[]) @@ -369,7 +405,7 @@ def test_empty_turns_rejected(self): class TestEvaluationResult: """Tests for EvaluationResult model.""" - def test_default_values(self): + def test_default_values(self) -> None: """Test EvaluationResult has correct default values.""" result = EvaluationResult( conversation_group_id="conv1", @@ -385,7 +421,7 @@ def test_default_values(self): assert result.reason == "" assert result.execution_time == 0 - def test_explicit_tag_value(self): + def test_explicit_tag_value(self) -> None: """Test EvaluationResult with explicit tag value.""" result = EvaluationResult( conversation_group_id="conv1", @@ -398,7 +434,7 @@ def test_explicit_tag_value(self): assert result.tag == "custom_tag" - def test_empty_tag_rejected(self): + def test_empty_tag_rejected(self) -> None: """Test that empty tag is rejected.""" with pytest.raises(ValidationError): EvaluationResult( @@ -410,7 +446,7 @@ def test_empty_tag_rejected(self): threshold=0.7, ) - def test_invalid_result_status_rejected(self): + def test_invalid_result_status_rejected(self) -> None: """Test that invalid result status is rejected.""" with pytest.raises(ValidationError, match="Result must be one of"): EvaluationResult( @@ -421,7 +457,7 @@ def test_invalid_result_status_rejected(self): threshold=0.7, ) - def test_negative_execution_time_rejected(self): + def 
test_negative_execution_time_rejected(self) -> None: """Test that negative execution_time is rejected.""" with pytest.raises(ValidationError): EvaluationResult( @@ -433,7 +469,7 @@ def test_negative_execution_time_rejected(self): execution_time=-1, ) - def test_conversation_level_metric_allows_none_turn_id(self): + def test_conversation_level_metric_allows_none_turn_id(self) -> None: """Test that turn_id can be None for conversation-level metrics.""" result = EvaluationResult( conversation_group_id="conv1", diff --git a/tests/unit/core/models/test_system_additional.py b/tests/unit/core/models/test_system_additional.py index 282d19d0..1a860a28 100644 --- a/tests/unit/core/models/test_system_additional.py +++ b/tests/unit/core/models/test_system_additional.py @@ -4,6 +4,7 @@ import tempfile import pytest from pydantic import ValidationError +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import ( LLMConfig, @@ -18,47 +19,47 @@ class TestLLMConfig: """Additional tests for LLMConfig.""" - def test_temperature_validation_min(self): + def test_temperature_validation_min(self) -> None: """Test temperature minimum validation.""" with pytest.raises(ValidationError): LLMConfig(temperature=-0.1) - def test_temperature_validation_max(self): + def test_temperature_validation_max(self) -> None: """Test temperature maximum validation.""" with pytest.raises(ValidationError): LLMConfig(temperature=2.1) - def test_max_tokens_validation(self): + def test_max_tokens_validation(self) -> None: """Test max_tokens minimum validation.""" with pytest.raises(ValidationError): LLMConfig(max_tokens=0) - def test_timeout_validation(self): + def test_timeout_validation(self) -> None: """Test timeout minimum validation.""" with pytest.raises(ValidationError): LLMConfig(timeout=0) - def test_num_retries_validation(self): + def test_num_retries_validation(self) -> None: """Test num_retries minimum validation.""" with pytest.raises(ValidationError): LLMConfig(num_retries=-1) - def test_ssl_verify_default(self): + def test_ssl_verify_default(self) -> None: """Test ssl_verify has correct default value.""" config = LLMConfig() assert config.ssl_verify is True - def test_ssl_verify_false(self): + def test_ssl_verify_false(self) -> None: """Test ssl_verify can be set to False.""" config = LLMConfig(ssl_verify=False) assert config.ssl_verify is False - def test_ssl_cert_file_default(self): + def test_ssl_cert_file_default(self) -> None: """Test ssl_cert_file defaults to None.""" config = LLMConfig() assert config.ssl_cert_file is None - def test_ssl_cert_file_valid_path(self): + def test_ssl_cert_file_valid_path(self) -> None: """Test ssl_cert_file with valid certificate file.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".crt", delete=False) as f: cert_path = f.name @@ -66,12 +67,13 @@ def test_ssl_cert_file_valid_path(self): try: config = LLMConfig(ssl_cert_file=cert_path) + assert config.ssl_cert_file is not None assert config.ssl_cert_file == os.path.abspath(cert_path) assert os.path.isabs(config.ssl_cert_file) finally: os.unlink(cert_path) - def test_ssl_cert_file_expands_env_variables(self, mocker): + def test_ssl_cert_file_expands_env_variables(self, mocker: MockerFixture) -> None: """Test ssl_cert_file expands environment variables.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".crt", delete=False) as f: cert_path = f.name @@ -87,14 +89,14 @@ def test_ssl_cert_file_expands_env_variables(self, mocker): finally: os.unlink(cert_path) - def 
test_ssl_cert_file_nonexistent_raises_error(self): + def test_ssl_cert_file_nonexistent_raises_error(self) -> None: """Test ssl_cert_file validation fails for non-existent file.""" with pytest.raises(ValidationError) as exc_info: LLMConfig(ssl_cert_file="/tmp/nonexistent_cert_12345.crt") assert "not found" in str(exc_info.value).lower() - def test_ssl_cert_file_directory_raises_error(self): + def test_ssl_cert_file_directory_raises_error(self) -> None: """Test ssl_cert_file validation fails for directory paths.""" temp_dir = tempfile.gettempdir() with pytest.raises(ValidationError): @@ -104,7 +106,7 @@ def test_ssl_cert_file_directory_raises_error(self): class TestEmbeddingConfig: """Tests for EmbeddingConfig.""" - def test_default_values(self): + def test_default_values(self) -> None: """Test default embedding configuration.""" config = EmbeddingConfig() @@ -112,7 +114,7 @@ def test_default_values(self): assert config.model is not None assert config.cache_enabled is True - def test_custom_embedding_model(self): + def test_custom_embedding_model(self) -> None: """Test custom embedding model configuration.""" config = EmbeddingConfig( provider="openai", @@ -126,7 +128,7 @@ def test_custom_embedding_model(self): class TestAPIConfig: """Tests for APIConfig.""" - def test_default_api_config(self): + def test_default_api_config(self) -> None: """Test default API configuration.""" config = APIConfig() @@ -134,7 +136,7 @@ def test_default_api_config(self): assert isinstance(config.cache_enabled, bool) assert config.timeout > 0 - def test_custom_api_config(self): + def test_custom_api_config(self) -> None: """Test custom API configuration.""" config = APIConfig( enabled=True, @@ -146,7 +148,7 @@ def test_custom_api_config(self): assert config.api_base == "https://custom.api.com" assert config.timeout == 300 - def test_timeout_validation(self): + def test_timeout_validation(self) -> None: """Test API timeout validation.""" with pytest.raises(ValidationError): APIConfig(timeout=0) @@ -155,14 +157,14 @@ def test_timeout_validation(self): class TestOutputConfig: """Tests for OutputConfig.""" - def test_default_output_config(self): + def test_default_output_config(self) -> None: """Test default output configuration.""" config = OutputConfig() assert "csv" in config.enabled_outputs assert len(config.csv_columns) > 0 - def test_custom_output_config(self): + def test_custom_output_config(self) -> None: """Test custom output configuration.""" config = OutputConfig( enabled_outputs=["json"], @@ -172,7 +174,7 @@ def test_custom_output_config(self): assert config.enabled_outputs == ["json"] assert len(config.csv_columns) == 2 - def test_minimal_csv_columns(self): + def test_minimal_csv_columns(self) -> None: """Test with minimal CSV columns.""" config = OutputConfig(csv_columns=["result"]) assert len(config.csv_columns) >= 1 @@ -181,7 +183,7 @@ def test_minimal_csv_columns(self): class TestVisualizationConfig: """Tests for VisualizationConfig.""" - def test_default_visualization_config(self): + def test_default_visualization_config(self) -> None: """Test default visualization configuration.""" config = VisualizationConfig() @@ -189,12 +191,12 @@ def test_default_visualization_config(self): assert config.dpi > 0 assert len(config.figsize) == 2 - def test_custom_visualization_config(self): + def test_custom_visualization_config(self) -> None: """Test custom visualization configuration.""" config = VisualizationConfig( enabled_graphs=["pass_rates", "score_distribution"], dpi=150, - figsize=(12, 8), + 
figsize=(12, 8), # pyright: ignore[reportArgumentType] ) assert "pass_rates" in config.enabled_graphs @@ -202,7 +204,7 @@ def test_custom_visualization_config(self): assert config.dpi == 150 assert config.figsize == [12, 8] # Pydantic converts tuple to list - def test_dpi_validation(self): + def test_dpi_validation(self) -> None: """Test DPI validation.""" with pytest.raises(ValidationError): VisualizationConfig(dpi=0) @@ -211,7 +213,7 @@ def test_dpi_validation(self): class TestLoggingConfig: """Tests for LoggingConfig.""" - def test_default_logging_config(self): + def test_default_logging_config(self) -> None: """Test default logging configuration.""" config = LoggingConfig() @@ -219,7 +221,7 @@ def test_default_logging_config(self): assert config.package_level is not None assert isinstance(config.package_overrides, dict) - def test_custom_logging_config(self): + def test_custom_logging_config(self) -> None: """Test custom logging configuration.""" config = LoggingConfig( source_level="DEBUG", @@ -231,7 +233,7 @@ def test_custom_logging_config(self): assert config.package_level == "ERROR" assert config.package_overrides["httpx"] == "CRITICAL" - def test_show_timestamps_toggle(self): + def test_show_timestamps_toggle(self) -> None: """Test show_timestamps configuration.""" config1 = LoggingConfig(show_timestamps=True) config2 = LoggingConfig(show_timestamps=False) diff --git a/tests/unit/core/output/conftest.py b/tests/unit/core/output/conftest.py new file mode 100644 index 00000000..7203df1d --- /dev/null +++ b/tests/unit/core/output/conftest.py @@ -0,0 +1,95 @@ +"""Pytest configuration and fixtures for output tests.""" + +import pytest +from pytest_mock import MockerFixture +from lightspeed_evaluation.core.models import EvaluationResult + + +@pytest.fixture +def sample_results() -> list[EvaluationResult]: + """Create sample evaluation results.""" + return [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="ragas:faithfulness", + score=0.85, + result="PASS", + threshold=0.7, + reason="Good", + query="What is Python?", + response="Python is a programming language", + ), + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn2", + metric_identifier="ragas:answer_relevancy", + score=0.60, + result="FAIL", + threshold=0.7, + reason="Low score", + query="How?", + response="It works", + ), + ] + + +@pytest.fixture +def sample_results_statistics() -> list[EvaluationResult]: + """Create sample evaluation results.""" + return [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="metric1", + score=0.9, + result="PASS", + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn2", + metric_identifier="metric1", + score=0.5, + result="FAIL", + threshold=0.7, + reason="Low score", + ), + EvaluationResult( + conversation_group_id="conv2", + turn_id="turn1", + metric_identifier="metric2", + score=0.8, + result="PASS", + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv2", + turn_id="turn2", + metric_identifier="metric2", + score=None, + result="ERROR", + threshold=0.7, + reason="Failed", + ), + ] + + +@pytest.fixture +def mock_system_config(mocker: MockerFixture) -> MockerFixture: + """Create mock system config.""" + config = mocker.Mock() + config.output.enabled_outputs = ["csv", "json", "txt"] + config.output.csv_columns = [ + "conversation_group_id", + "turn_id", + "metric_identifier", + "result", + "score", + ] + 
config.visualization.enabled_graphs = [] + # Mock model_fields to support iteration in _write_config_params and _build_config_dict + config.model_fields.keys.return_value = [] + return config diff --git a/tests/unit/core/output/test_final_coverage.py b/tests/unit/core/output/test_final_coverage.py index 4139413c..bbf9c8e8 100644 --- a/tests/unit/core/output/test_final_coverage.py +++ b/tests/unit/core/output/test_final_coverage.py @@ -1,16 +1,21 @@ """Additional tests to boost coverage towards 75%.""" +from pathlib import Path + +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import EvaluationResult +from lightspeed_evaluation.core.output.generator import OutputHandler from lightspeed_evaluation.core.output.statistics import ( calculate_basic_stats, calculate_detailed_stats, ) +from lightspeed_evaluation.core.system.loader import validate_metrics class TestStatisticsEdgeCases: """Edge case tests for statistics module.""" - def test_stats_with_mixed_results(self): + def test_stats_with_mixed_results(self) -> None: """Test statistics with all result types.""" results = [ EvaluationResult( @@ -32,7 +37,7 @@ def test_stats_with_mixed_results(self): assert len(detailed["by_metric"]) > 0 assert len(detailed["by_conversation"]) == 2 - def test_detailed_stats_single_conversation_multiple_metrics(self): + def test_detailed_stats_single_conversation_multiple_metrics(self) -> None: """Test detailed stats with one conversation, multiple metrics.""" results = [ EvaluationResult( @@ -52,7 +57,7 @@ def test_detailed_stats_single_conversation_multiple_metrics(self): assert len(detailed["by_metric"]) == 10 assert detailed["by_conversation"]["conv1"]["pass"] == 10 - def test_detailed_stats_multiple_conversations_single_metric(self): + def test_detailed_stats_multiple_conversations_single_metric(self) -> None: """Test detailed stats with multiple conversations, one metric.""" results = [ EvaluationResult( @@ -77,9 +82,8 @@ def test_detailed_stats_multiple_conversations_single_metric(self): class TestOutputHandlerEdgeCases: """Edge case tests for output handler.""" - def test_calculate_stats_with_single_result(self, tmp_path): + def test_calculate_stats_with_single_result(self, tmp_path: Path) -> None: """Test stats calculation with exactly one result.""" - from lightspeed_evaluation.core.output.generator import OutputHandler handler = OutputHandler(output_dir=str(tmp_path)) results = [ @@ -93,15 +97,16 @@ def test_calculate_stats_with_single_result(self, tmp_path): ) ] - stats = handler._calculate_stats(results) + stats = handler._calculate_stats(results) # pylint: disable=protected-access assert stats["basic"]["TOTAL"] == 1 assert stats["basic"]["PASS"] == 1 assert stats["basic"]["pass_rate"] == 100.0 - def test_generate_csv_with_minimal_columns(self, tmp_path, mocker): + def test_generate_csv_with_minimal_columns( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test CSV generation with minimal column set.""" - from lightspeed_evaluation.core.output.generator import OutputHandler config = mocker.Mock() config.output.csv_columns = ["conversation_group_id", "result"] @@ -118,7 +123,9 @@ def test_generate_csv_with_minimal_columns(self, tmp_path, mocker): ) ] - csv_file = handler._generate_csv_report(results, "test") + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + results, "test" + ) assert csv_file.exists() content = csv_file.read_text() @@ -127,12 +134,11 @@ def test_generate_csv_with_minimal_columns(self, tmp_path, mocker): assert 
"PASS" in content -class TestSystemLoaderEdgeCases: +class TestSystemLoaderEdgeCases: # pylint: disable=too-few-public-methods """Edge case tests for system loader.""" - def test_validate_metrics_with_mixed_valid_invalid(self): + def test_validate_metrics_with_mixed_valid_invalid(self) -> None: """Test validating mix of valid and invalid metrics.""" - from lightspeed_evaluation.core.system.loader import validate_metrics turn_metrics = [ "ragas:faithfulness", diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py index 0f8aa4a2..5b4b2ef8 100644 --- a/tests/unit/core/output/test_generator.py +++ b/tests/unit/core/output/test_generator.py @@ -1,64 +1,19 @@ """Unit tests for output generator.""" import json +from pathlib import Path -import pytest +import csv as csv_module +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import EvaluationResult from lightspeed_evaluation.core.output.generator import OutputHandler -@pytest.fixture -def sample_results(): - """Create sample evaluation results.""" - return [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="ragas:faithfulness", - score=0.85, - result="PASS", - threshold=0.7, - reason="Good", - query="What is Python?", - response="Python is a programming language", - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn2", - metric_identifier="ragas:answer_relevancy", - score=0.60, - result="FAIL", - threshold=0.7, - reason="Low score", - query="How?", - response="It works", - ), - ] - - -@pytest.fixture -def mock_system_config(mocker): - """Create mock system config.""" - config = mocker.Mock() - config.output.enabled_outputs = ["csv", "json", "txt"] - config.output.csv_columns = [ - "conversation_group_id", - "turn_id", - "metric_identifier", - "result", - "score", - ] - config.visualization.enabled_graphs = [] - # Mock model_fields to support iteration in _write_config_params and _build_config_dict - config.model_fields.keys.return_value = [] - return config - - class TestOutputHandler: """Tests for OutputHandler.""" - def test_initialization(self, tmp_path): + def test_initialization(self, tmp_path: Path) -> None: """Test handler initialization.""" handler = OutputHandler(output_dir=str(tmp_path), base_filename="test") @@ -66,32 +21,43 @@ def test_initialization(self, tmp_path): assert handler.base_filename == "test" assert tmp_path.exists() - def test_calculate_stats_with_results(self, tmp_path, sample_results): + def test_calculate_stats_with_results( + self, tmp_path: Path, sample_results: list[EvaluationResult] + ) -> None: """Test statistics calculation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) assert stats["basic"]["TOTAL"] == 2 assert stats["basic"]["PASS"] == 1 assert stats["basic"]["FAIL"] == 1 assert "detailed" in stats - def test_calculate_stats_empty(self, tmp_path): + def test_calculate_stats_empty(self, tmp_path: Path) -> None: """Test statistics with empty results.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats([]) + stats = handler._calculate_stats([]) # pylint: disable=protected-access assert stats["basic"]["TOTAL"] == 0 - assert stats["detailed"]["by_metric"] == {} - - def test_generate_csv_report(self, tmp_path, sample_results, mock_system_config): + assert not stats["detailed"]["by_metric"] + + def 
test_generate_csv_report( + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mock_system_config: MockerFixture, + ) -> None: """Test CSV generation.""" handler = OutputHandler( output_dir=str(tmp_path), system_config=mock_system_config, ) - csv_file = handler._generate_csv_report(sample_results, "test") + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + sample_results, "test" + ) assert csv_file.exists() assert csv_file.suffix == ".csv" @@ -101,18 +67,22 @@ def test_generate_csv_report(self, tmp_path, sample_results, mock_system_config) assert "conversation_group_id" in content assert "conv1" in content - def test_generate_json_summary(self, tmp_path, sample_results): + def test_generate_json_summary( + self, tmp_path: Path, sample_results: list[EvaluationResult] + ) -> None: """Test JSON summary generation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) api_tokens = { "total_api_input_tokens": 100, "total_api_output_tokens": 200, "total_api_tokens": 300, } - streaming_stats = {} + streaming_stats: dict = {} - json_file = handler._generate_json_summary( + json_file = handler._generate_json_summary( # pylint: disable=protected-access sample_results, "test", stats["basic"], @@ -124,7 +94,7 @@ def test_generate_json_summary(self, tmp_path, sample_results): assert json_file.exists() # Verify structure - with open(json_file) as f: + with open(json_file, encoding="utf-8") as f: data = json.load(f) assert "summary_stats" in data or "results" in data @@ -133,18 +103,22 @@ def test_generate_json_summary(self, tmp_path, sample_results): assert "summary_stats" in data assert data["summary_stats"]["overall"]["total_api_tokens"] == 300 - def test_generate_text_summary(self, tmp_path, sample_results): + def test_generate_text_summary( + self, tmp_path: Path, sample_results: list[EvaluationResult] + ) -> None: """Test text summary generation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) api_tokens = { "total_api_input_tokens": 100, "total_api_output_tokens": 200, "total_api_tokens": 300, } - streaming_stats = {} + streaming_stats: dict = {} - txt_file = handler._generate_text_summary( + txt_file = handler._generate_text_summary( # pylint: disable=protected-access sample_results, "test", stats["basic"], @@ -160,15 +134,19 @@ def test_generate_text_summary(self, tmp_path, sample_results): # Verify API token usage is included assert "Token Usage (API Calls)" in content - def test_get_output_directory(self, tmp_path): + def test_get_output_directory(self, tmp_path: Path) -> None: """Test get output directory.""" handler = OutputHandler(output_dir=str(tmp_path)) assert handler.get_output_directory() == tmp_path def test_generate_reports_creates_files( - self, tmp_path, sample_results, mock_system_config, mocker - ): + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mock_system_config: MockerFixture, + mocker: MockerFixture, + ) -> None: """Test that generate_reports creates output files.""" mock_now = mocker.Mock() mock_now.strftime.return_value = "20250101_120000" @@ -192,8 +170,8 @@ def test_generate_reports_creates_files( assert (tmp_path / "eval_20250101_120000_summary.txt").exists() def test_generate_reports_with_empty_results( - self, 
tmp_path, mock_system_config, mocker - ): + self, tmp_path: Path, mock_system_config: MockerFixture, mocker: MockerFixture + ) -> None: """Test generating reports with no results.""" mock_now = mocker.Mock() mock_now.strftime.return_value = "20250101_120000" @@ -212,8 +190,11 @@ def test_generate_reports_with_empty_results( handler.generate_reports([]) def test_generate_individual_reports_csv_only( - self, tmp_path, sample_results, mocker - ): + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test generating only CSV.""" config = mocker.Mock() config.output.enabled_outputs = ["csv"] @@ -221,15 +202,22 @@ def test_generate_individual_reports_csv_only( config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) - handler._generate_individual_reports(sample_results, "test", ["csv"], stats) + handler._generate_individual_reports( # pylint: disable=protected-access + sample_results, "test", ["csv"], stats + ) assert (tmp_path / "test_detailed.csv").exists() def test_generate_individual_reports_json_only( - self, tmp_path, sample_results, mocker - ): + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test generating only JSON.""" config = mocker.Mock() config.output.enabled_outputs = ["json"] @@ -237,15 +225,22 @@ def test_generate_individual_reports_json_only( config.model_fields.keys.return_value = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) - handler._generate_individual_reports(sample_results, "test", ["json"], stats) + handler._generate_individual_reports( # pylint: disable=protected-access + sample_results, "test", ["json"], stats + ) assert (tmp_path / "test_summary.json").exists() def test_generate_individual_reports_txt_only( - self, tmp_path, sample_results, mocker - ): + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test generating only TXT.""" config = mocker.Mock() config.output.enabled_outputs = ["txt"] @@ -253,13 +248,21 @@ def test_generate_individual_reports_txt_only( config.model_fields.keys.return_value = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats(sample_results) - - handler._generate_individual_reports(sample_results, "test", ["txt"], stats) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) + handler._generate_individual_reports( # pylint: disable=protected-access + sample_results, "test", ["txt"], stats + ) assert (tmp_path / "test_summary.txt").exists() - def test_csv_with_all_columns(self, tmp_path, sample_results, mocker): + def test_csv_with_all_columns( + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test CSV with all available columns.""" config = mocker.Mock() config.output.csv_columns = [ @@ -276,14 +279,21 @@ def test_csv_with_all_columns(self, tmp_path, sample_results, mocker): config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - csv_file = handler._generate_csv_report(sample_results, "test") + 
csv_file = handler._generate_csv_report( # pylint: disable=protected-access + sample_results, "test" + ) content = csv_file.read_text() assert "query" in content assert "response" in content assert "Python" in content - def test_generate_reports_without_config(self, tmp_path, sample_results, mocker): + def test_generate_reports_without_config( + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test generating reports without system config.""" mock_now = mocker.Mock() mock_now.strftime.return_value = "20250101_120000" @@ -303,7 +313,9 @@ def test_generate_reports_without_config(self, tmp_path, sample_results, mocker) class TestOutputHandlerInitialization: """Additional tests for OutputHandler initialization and configuration.""" - def test_output_handler_initialization_default(self, tmp_path, mocker): + def test_output_handler_initialization_default( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test OutputHandler initialization with default parameters.""" mock_print = mocker.patch("builtins.print") @@ -316,7 +328,9 @@ def test_output_handler_initialization_default(self, tmp_path, mocker): mock_print.assert_called_with(f"✅ Output handler initialized: {tmp_path}") - def test_output_handler_initialization_custom(self, tmp_path, mocker): + def test_output_handler_initialization_custom( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test OutputHandler initialization with custom parameters.""" system_config = mocker.Mock() system_config.llm.provider = "openai" @@ -333,7 +347,9 @@ def test_output_handler_initialization_custom(self, tmp_path, mocker): assert handler.base_filename == "custom_eval" assert handler.system_config == system_config - def test_output_handler_creates_directory(self, tmp_path, mocker): + def test_output_handler_creates_directory( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test that OutputHandler creates output directory if it doesn't exist.""" output_path = tmp_path / "new_output_dir" @@ -344,7 +360,9 @@ def test_output_handler_creates_directory(self, tmp_path, mocker): assert handler.output_dir.exists() assert handler.output_dir.is_dir() - def test_generate_csv_with_specific_results(self, tmp_path, mocker): + def test_generate_csv_with_specific_results( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test CSV report generation with specific results.""" results = [ EvaluationResult( @@ -391,14 +409,13 @@ def test_generate_csv_with_specific_results(self, tmp_path, mocker): mocker.patch("builtins.print") handler = OutputHandler(output_dir=str(tmp_path)) - csv_file = handler._generate_csv_report(results, "test_eval") + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + results, "test_eval" + ) assert csv_file.exists() assert csv_file.suffix == ".csv" - # Read and verify CSV content - import csv as csv_module - with open(csv_file, encoding="utf-8") as f: reader = csv_module.DictReader(f) rows = list(reader) @@ -420,7 +437,9 @@ def test_generate_csv_with_specific_results(self, tmp_path, mocker): assert rows[2]["query"] == "Create namespace" assert rows[2]["contexts"] == "" - def test_csv_columns_configuration(self, tmp_path, mocker): + def test_csv_columns_configuration( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test that CSV uses configured columns.""" results = [ EvaluationResult( @@ -442,10 +461,9 @@ def test_csv_columns_configuration(self, tmp_path, mocker): system_config.visualization.enabled_graphs = [] 
handler = OutputHandler(output_dir=str(tmp_path), system_config=system_config) - csv_file = handler._generate_csv_report(results, "test_eval") - - # Read CSV headers - import csv as csv_module + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + results, "test_eval" + ) with open(csv_file, encoding="utf-8") as f: reader = csv_module.reader(f) @@ -453,9 +471,11 @@ def test_csv_columns_configuration(self, tmp_path, mocker): assert headers == ["conversation_group_id", "result", "score"] - def test_filename_timestamp_format(self, tmp_path, mocker): + def test_filename_timestamp_format( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test that generated filenames include proper timestamps.""" - results = [] + results: list = [] mocker.patch("builtins.print") @@ -467,7 +487,9 @@ def test_filename_timestamp_format(self, tmp_path, mocker): ) mock_datetime.now.return_value.strftime.return_value = "20240101_120000" - csv_file = handler._generate_csv_report(results, "test_20240101_120000") + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + results, "test_20240101_120000" + ) assert "test_20240101_120000" in csv_file.name assert csv_file.suffix == ".csv" diff --git a/tests/unit/core/output/test_statistics.py b/tests/unit/core/output/test_statistics.py index 83fe8b0a..e94b8578 100644 --- a/tests/unit/core/output/test_statistics.py +++ b/tests/unit/core/output/test_statistics.py @@ -13,53 +13,10 @@ ) -@pytest.fixture -def sample_results(): - """Create sample evaluation results.""" - return [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="metric1", - score=0.9, - result="PASS", - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn2", - metric_identifier="metric1", - score=0.5, - result="FAIL", - threshold=0.7, - reason="Low score", - ), - EvaluationResult( - conversation_group_id="conv2", - turn_id="turn1", - metric_identifier="metric2", - score=0.8, - result="PASS", - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv2", - turn_id="turn2", - metric_identifier="metric2", - score=None, - result="ERROR", - threshold=0.7, - reason="Failed", - ), - ] - - class TestBootstrapIntervals: """Tests for bootstrap_intervals function.""" - def test_bootstrap_intervals_basic(self): + def test_bootstrap_intervals_basic(self) -> None: """Test basic bootstrap interval calculation.""" series = pd.Series([0.8, 0.85, 0.9, 0.75, 0.88]) @@ -71,21 +28,21 @@ def test_bootstrap_intervals_basic(self): assert 0 <= low <= 1 assert 0 <= high <= 1 - def test_bootstrap_intervals_invalid_confidence(self): + def test_bootstrap_intervals_invalid_confidence(self) -> None: """Test bootstrap with invalid confidence value.""" series = pd.Series([0.8, 0.85, 0.9]) with pytest.raises(ValueError, match="Invalid confidence"): bootstrap_intervals(series, confidence=150) - def test_bootstrap_intervals_negative_confidence(self): + def test_bootstrap_intervals_negative_confidence(self) -> None: """Test bootstrap with negative confidence value.""" series = pd.Series([0.8, 0.85, 0.9]) with pytest.raises(ValueError, match="Invalid confidence"): bootstrap_intervals(series, confidence=-10) - def test_bootstrap_intervals_custom_confidence(self): + def test_bootstrap_intervals_custom_confidence(self) -> None: """Test bootstrap with custom confidence level.""" series = pd.Series([0.8, 0.85, 0.9, 0.75, 0.88]) @@ -95,7 +52,7 @@ def 
test_bootstrap_intervals_custom_confidence(self): assert low <= mean <= high - def test_bootstrap_intervals_custom_steps(self): + def test_bootstrap_intervals_custom_steps(self) -> None: """Test bootstrap with custom bootstrap steps.""" series = pd.Series([0.8, 0.85, 0.9]) @@ -103,7 +60,7 @@ def test_bootstrap_intervals_custom_steps(self): assert low <= mean <= high - def test_bootstrap_intervals_valid_confidence(self): + def test_bootstrap_intervals_valid_confidence(self) -> None: """Test bootstrap_intervals with valid confidence levels.""" data = pd.Series([0.8, 0.9, 0.7, 0.85, 0.75]) @@ -121,7 +78,7 @@ def test_bootstrap_intervals_valid_confidence(self): ci_90_width = high_90 - low_90 assert ci_90_width < ci_95_width - def test_bootstrap_intervals_edge_cases(self): + def test_bootstrap_intervals_edge_cases(self) -> None: """Test bootstrap_intervals with edge cases.""" # Test with single value single_value = pd.Series([0.5]) @@ -135,7 +92,7 @@ def test_bootstrap_intervals_edge_cases(self): assert abs(mean - 0.8) < 0.001 assert abs(high - 0.8) < 0.001 - def test_bootstrap_intervals_confidence_levels(self): + def test_bootstrap_intervals_confidence_levels(self) -> None: """Test bootstrap_intervals with different confidence levels.""" data = pd.Series([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) @@ -151,7 +108,7 @@ def test_bootstrap_intervals_confidence_levels(self): class TestCalculateScoreStatistics: """Tests for _calculate_score_statistics function.""" - def test_score_statistics_multiple_scores(self): + def test_score_statistics_multiple_scores(self) -> None: """Test score statistics with multiple scores includes confidence interval.""" scores = [0.8, 0.85, 0.9, 0.75, 0.88] result = _calculate_score_statistics(scores) @@ -172,7 +129,7 @@ def test_score_statistics_multiple_scores(self): assert ci["confidence_level"] == 95 assert ci["low"] < ci["mean"] < ci["high"] - def test_score_statistics_two_scores(self): + def test_score_statistics_two_scores(self) -> None: """Test score statistics with exactly 2 scores includes CI.""" scores = [0.7, 0.9] result = _calculate_score_statistics(scores) @@ -181,7 +138,7 @@ def test_score_statistics_two_scores(self): assert result["mean"] == 0.8 assert result["confidence_interval"] is not None - def test_score_statistics_single_score_no_ci(self): + def test_score_statistics_single_score_no_ci(self) -> None: """Test score statistics with single score has no confidence interval.""" scores = [0.8] result = _calculate_score_statistics(scores) @@ -191,7 +148,7 @@ def test_score_statistics_single_score_no_ci(self): assert result["std"] == 0.0 # No std for single value assert result["confidence_interval"] is None - def test_score_statistics_empty_scores(self): + def test_score_statistics_empty_scores(self) -> None: """Test score statistics with empty list returns zeros and no CI.""" result = _calculate_score_statistics([]) @@ -207,9 +164,11 @@ def test_score_statistics_empty_scores(self): class TestCalculateBasicStats: """Tests for calculate_basic_stats function.""" - def test_basic_stats_with_results(self, sample_results): + def test_basic_stats_with_results( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test basic stats calculation with results.""" - stats = calculate_basic_stats(sample_results) + stats = calculate_basic_stats(sample_results_statistics) assert stats["TOTAL"] == 4 assert stats["PASS"] == 2 @@ -219,7 +178,7 @@ def test_basic_stats_with_results(self, sample_results): assert stats["fail_rate"] == 25.0 assert 
stats["error_rate"] == 25.0 - def test_basic_stats_empty_results(self): + def test_basic_stats_empty_results(self) -> None: """Test basic stats with empty results.""" stats = calculate_basic_stats([]) @@ -231,7 +190,7 @@ def test_basic_stats_empty_results(self): assert stats["fail_rate"] == 0.0 assert stats["error_rate"] == 0.0 - def test_basic_stats_all_pass(self): + def test_basic_stats_all_pass(self) -> None: """Test basic stats with all passing results.""" results = [ EvaluationResult( @@ -252,7 +211,7 @@ def test_basic_stats_all_pass(self): assert stats["pass_rate"] == 100.0 assert stats["fail_rate"] == 0.0 - def test_basic_stats_all_fail(self): + def test_basic_stats_all_fail(self) -> None: """Test basic stats with all failing results.""" results = [ EvaluationResult( @@ -273,7 +232,7 @@ def test_basic_stats_all_fail(self): assert stats["fail_rate"] == 100.0 assert stats["pass_rate"] == 0.0 - def test_basic_stats_all_error(self): + def test_basic_stats_all_error(self) -> None: """Test basic stats with all error results.""" results = [ EvaluationResult( @@ -293,7 +252,7 @@ def test_basic_stats_all_error(self): assert stats["ERROR"] == 2 assert stats["error_rate"] == 100.0 - def test_calculate_basic_stats_mixed_results(self): + def test_calculate_basic_stats_mixed_results(self) -> None: """Test calculate_basic_stats with mixed results.""" results = [ EvaluationResult( @@ -352,7 +311,7 @@ def test_calculate_basic_stats_mixed_results(self): } assert stats == expected - def test_calculate_basic_stats_single_result(self): + def test_calculate_basic_stats_single_result(self) -> None: """Test calculate_basic_stats with single result.""" results = [ EvaluationResult( @@ -388,9 +347,11 @@ def test_calculate_basic_stats_single_result(self): class TestCalculateDetailedStats: """Tests for calculate_detailed_stats function.""" - def test_detailed_stats_with_results(self, sample_results): + def test_detailed_stats_with_results( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test detailed stats calculation.""" - stats = calculate_detailed_stats(sample_results) + stats = calculate_detailed_stats(sample_results_statistics) assert "by_metric" in stats assert "by_conversation" in stats @@ -399,16 +360,18 @@ def test_detailed_stats_with_results(self, sample_results): assert "conv1" in stats["by_conversation"] assert "conv2" in stats["by_conversation"] - def test_detailed_stats_empty_results(self): + def test_detailed_stats_empty_results(self) -> None: """Test detailed stats with empty results.""" stats = calculate_detailed_stats([]) - assert stats["by_metric"] == {} - assert stats["by_conversation"] == {} + assert not stats["by_metric"] + assert not stats["by_conversation"] - def test_detailed_stats_metric_breakdown(self, sample_results): + def test_detailed_stats_metric_breakdown( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test metric breakdown in detailed stats.""" - stats = calculate_detailed_stats(sample_results) + stats = calculate_detailed_stats(sample_results_statistics) metric1_stats = stats["by_metric"]["metric1"] assert metric1_stats["pass"] == 1 @@ -418,9 +381,11 @@ def test_detailed_stats_metric_breakdown(self, sample_results): assert metric2_stats["pass"] == 1 assert metric2_stats["error"] == 1 - def test_detailed_stats_conversation_breakdown(self, sample_results): + def test_detailed_stats_conversation_breakdown( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test conversation breakdown in detailed 
stats.""" - stats = calculate_detailed_stats(sample_results) + stats = calculate_detailed_stats(sample_results_statistics) conv1_stats = stats["by_conversation"]["conv1"] assert conv1_stats["pass"] == 1 @@ -430,9 +395,11 @@ def test_detailed_stats_conversation_breakdown(self, sample_results): assert conv2_stats["pass"] == 1 assert conv2_stats["error"] == 1 - def test_detailed_stats_includes_rates(self, sample_results): + def test_detailed_stats_includes_rates( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test that detailed stats include percentage rates.""" - stats = calculate_detailed_stats(sample_results) + stats = calculate_detailed_stats(sample_results_statistics) metric1_stats = stats["by_metric"]["metric1"] assert "pass_rate" in metric1_stats @@ -440,7 +407,7 @@ def test_detailed_stats_includes_rates(self, sample_results): assert metric1_stats["pass_rate"] == 50.0 assert metric1_stats["fail_rate"] == 50.0 - def test_detailed_stats_single_metric(self): + def test_detailed_stats_single_metric(self) -> None: """Test detailed stats with single metric.""" results = [ EvaluationResult( @@ -458,7 +425,7 @@ def test_detailed_stats_single_metric(self): assert len(stats["by_metric"]) == 1 assert "single_metric" in stats["by_metric"] - def test_calculate_detailed_stats_single_metric_single_conversation(self): + def test_calculate_detailed_stats_single_metric_single_conversation(self) -> None: """Test calculate_detailed_stats with single metric and conversation.""" results = [ EvaluationResult( @@ -499,7 +466,7 @@ def test_calculate_detailed_stats_single_metric_single_conversation(self): assert conv_stats["error"] == 0 assert conv_stats["pass_rate"] == 50.0 - def test_calculate_detailed_stats_multiple_metrics_conversations(self): + def test_calculate_detailed_stats_multiple_metrics_conversations(self) -> None: """Test calculate_detailed_stats with multiple metrics and conversations.""" results = [ EvaluationResult( @@ -579,7 +546,7 @@ def test_calculate_detailed_stats_multiple_metrics_conversations(self): assert conv2_stats["pass_rate"] == 50.0 assert conv2_stats["error_rate"] == 50.0 - def test_calculate_detailed_stats_score_statistics(self): + def test_calculate_detailed_stats_score_statistics(self) -> None: """Test calculate_detailed_stats includes score statistics.""" results = [ EvaluationResult( @@ -632,7 +599,7 @@ def test_calculate_detailed_stats_score_statistics(self): assert "high" in ci assert ci["confidence_level"] == 95 - def test_calculate_detailed_stats_no_scores(self): + def test_calculate_detailed_stats_no_scores(self) -> None: """Test calculate_detailed_stats with results that have no scores.""" results = [ EvaluationResult( @@ -658,7 +625,7 @@ def test_calculate_detailed_stats_no_scores(self): # Confidence interval should be None when no scores assert score_stats["confidence_interval"] is None - def test_calculate_detailed_stats_single_score_no_confidence_interval(self): + def test_calculate_detailed_stats_single_score_no_confidence_interval(self) -> None: """Test calculate_detailed_stats with single score has no CI (needs 2+).""" results = [ EvaluationResult( @@ -680,7 +647,7 @@ def test_calculate_detailed_stats_single_score_no_confidence_interval(self): # Confidence interval should be None for single score assert score_stats["confidence_interval"] is None - def test_calculate_detailed_stats_by_tag(self): + def test_calculate_detailed_stats_by_tag(self) -> None: """Test calculate_detailed_stats includes by_tag breakdown.""" results = [ 
EvaluationResult( @@ -738,7 +705,7 @@ def test_calculate_detailed_stats_by_tag(self): assert staging_stats["fail_rate"] == 100.0 assert "score_statistics" in staging_stats - def test_calculate_detailed_stats_default_tag(self): + def test_calculate_detailed_stats_default_tag(self) -> None: """Test calculate_detailed_stats with default 'eval' tag.""" results = [ EvaluationResult( @@ -761,14 +728,14 @@ def test_calculate_detailed_stats_default_tag(self): class TestCalculateApiTokenUsage: """Tests for calculate_api_token_usage function.""" - def test_calculate_api_token_usage_empty_data(self): + def test_calculate_api_token_usage_empty_data(self) -> None: """Test calculate_api_token_usage with empty data.""" result = calculate_api_token_usage([]) assert result["total_api_input_tokens"] == 0 assert result["total_api_output_tokens"] == 0 assert result["total_api_tokens"] == 0 - def test_calculate_api_token_usage_single_turn(self): + def test_calculate_api_token_usage_single_turn(self) -> None: """Test calculate_api_token_usage with single turn.""" turn = TurnData( turn_id="turn1", @@ -786,7 +753,7 @@ def test_calculate_api_token_usage_single_turn(self): assert result["total_api_output_tokens"] == 50 assert result["total_api_tokens"] == 150 - def test_calculate_api_token_usage_multiple_turns(self): + def test_calculate_api_token_usage_multiple_turns(self) -> None: """Test calculate_api_token_usage with multiple turns.""" turns = [ TurnData( @@ -813,7 +780,7 @@ def test_calculate_api_token_usage_multiple_turns(self): assert result["total_api_output_tokens"] == 125 assert result["total_api_tokens"] == 375 - def test_calculate_api_token_usage_multiple_conversations(self): + def test_calculate_api_token_usage_multiple_conversations(self) -> None: """Test calculate_api_token_usage with multiple conversations.""" eval_data1 = EvaluationData( conversation_group_id="conv1", @@ -844,7 +811,7 @@ def test_calculate_api_token_usage_multiple_conversations(self): assert result["total_api_output_tokens"] == 150 assert result["total_api_tokens"] == 450 - def test_calculate_api_token_usage_zero_tokens(self): + def test_calculate_api_token_usage_zero_tokens(self) -> None: """Test calculate_api_token_usage with zero token values.""" turn = TurnData( turn_id="turn1", @@ -866,7 +833,7 @@ def test_calculate_api_token_usage_zero_tokens(self): class TestCalculateBasicStatsWithTokens: """Tests for calculate_basic_stats token tracking fields.""" - def test_basic_stats_includes_token_fields(self): + def test_basic_stats_includes_token_fields(self) -> None: """Test that basic stats includes token fields.""" results = [ EvaluationResult( @@ -885,7 +852,7 @@ def test_basic_stats_includes_token_fields(self): assert "total_judge_llm_output_tokens" in stats assert "total_judge_llm_tokens" in stats - def test_basic_stats_sums_token_values(self): + def test_basic_stats_sums_token_values(self) -> None: """Test that basic stats correctly sums token values.""" results = [ EvaluationResult( @@ -914,7 +881,7 @@ def test_basic_stats_sums_token_values(self): assert stats["total_judge_llm_output_tokens"] == 150 assert stats["total_judge_llm_tokens"] == 450 - def test_basic_stats_zero_tokens_by_default(self): + def test_basic_stats_zero_tokens_by_default(self) -> None: """Test that results without tokens default to zero.""" results = [ EvaluationResult( @@ -931,7 +898,7 @@ def test_basic_stats_zero_tokens_by_default(self): assert stats["total_judge_llm_output_tokens"] == 0 assert stats["total_judge_llm_tokens"] == 0 - def 
test_basic_stats_empty_results_zero_tokens(self): + def test_basic_stats_empty_results_zero_tokens(self) -> None: """Test that empty results have zero tokens.""" stats = calculate_basic_stats([]) assert stats["total_judge_llm_input_tokens"] == 0 diff --git a/tests/unit/core/script/test_manager.py b/tests/unit/core/script/test_manager.py index f44c83e9..33b72350 100644 --- a/tests/unit/core/script/test_manager.py +++ b/tests/unit/core/script/test_manager.py @@ -2,6 +2,7 @@ import tempfile from pathlib import Path +import os import pytest @@ -12,7 +13,7 @@ class TestScriptExecutionManager: """Unit tests for ScriptExecutionManager.""" - def test_run_script_success(self): + def test_run_script_success(self) -> None: """Test running a successful script.""" # Create a simple script that exits successfully script_content = "#!/bin/bash\nexit 0\n" @@ -32,7 +33,7 @@ def test_run_script_success(self): finally: script_path.unlink() - def test_run_script_failure(self): + def test_run_script_failure(self) -> None: """Test running a script that fails.""" # Create a script that exits with error code script_content = "#!/bin/bash\nexit 1\n" @@ -51,14 +52,14 @@ def test_run_script_failure(self): finally: script_path.unlink() - def test_run_script_not_found(self): + def test_run_script_not_found(self) -> None: """Test running non-existent script raises error.""" manager = ScriptExecutionManager() with pytest.raises(ScriptExecutionError, match="not found"): manager.run_script("/nonexistent/script.sh") - def test_run_script_not_executable(self): + def test_run_script_not_executable(self) -> None: """Test running non-executable file raises error.""" script_content = "#!/bin/bash\nexit 0\n" @@ -77,7 +78,7 @@ def test_run_script_not_executable(self): finally: script_path.unlink() - def test_run_script_not_a_file(self): + def test_run_script_not_a_file(self) -> None: """Test running a directory raises error.""" with tempfile.TemporaryDirectory() as tmpdir: manager = ScriptExecutionManager() @@ -85,7 +86,7 @@ def test_run_script_not_a_file(self): with pytest.raises(ScriptExecutionError, match="not a file"): manager.run_script(tmpdir) - def test_run_script_with_output(self): + def test_run_script_with_output(self) -> None: """Test script with stdout output.""" script_content = '#!/bin/bash\necho "Test output"\nexit 0\n' @@ -103,7 +104,7 @@ def test_run_script_with_output(self): finally: script_path.unlink() - def test_run_script_with_stderr(self): + def test_run_script_with_stderr(self) -> None: """Test script with stderr output.""" script_content = '#!/bin/bash\necho "Error message" >&2\nexit 1\n' @@ -121,7 +122,7 @@ def test_run_script_with_stderr(self): finally: script_path.unlink() - def test_run_script_accepts_string_path(self): + def test_run_script_accepts_string_path(self) -> None: """Test that run_script accepts string path.""" script_content = "#!/bin/bash\nexit 0\n" @@ -139,7 +140,7 @@ def test_run_script_accepts_string_path(self): finally: Path(script_path).unlink() - def test_run_script_resolves_relative_path(self): + def test_run_script_resolves_relative_path(self) -> None: """Test that relative paths are resolved.""" script_content = "#!/bin/bash\nexit 0\n" @@ -149,8 +150,6 @@ def test_run_script_resolves_relative_path(self): script_path.chmod(0o755) # Use relative path - import os - original_cwd = os.getcwd() try: os.chdir(tmpdir) @@ -160,7 +159,7 @@ def test_run_script_resolves_relative_path(self): finally: os.chdir(original_cwd) - def test_run_script_timeout(self): + def 
test_run_script_timeout(self) -> None: """Test script timeout raises error.""" # Create a script that sleeps script_content = "#!/bin/bash\nsleep 10\nexit 0\n" diff --git a/tests/unit/core/script/test_manager_additional.py b/tests/unit/core/script/test_manager_additional.py index ce42bbe8..642e2e42 100644 --- a/tests/unit/core/script/test_manager_additional.py +++ b/tests/unit/core/script/test_manager_additional.py @@ -1,8 +1,10 @@ """Additional tests for script manager to increase coverage.""" +from pathlib import Path import subprocess - +import logging import pytest +from pytest_mock import MockFixture from lightspeed_evaluation.core.script.manager import ScriptExecutionManager from lightspeed_evaluation.core.system.exceptions import ScriptExecutionError @@ -11,7 +13,9 @@ class TestScriptExecutionManagerAdditional: """Additional tests for ScriptExecutionManager.""" - def test_run_script_timeout_error(self, tmp_path, mocker): + def test_run_script_timeout_error( + self, tmp_path: Path, mocker: MockFixture + ) -> None: """Test script execution with timeout.""" # Create a script file script = tmp_path / "test_script.sh" @@ -27,7 +31,9 @@ def test_run_script_timeout_error(self, tmp_path, mocker): with pytest.raises(ScriptExecutionError, match="timeout"): manager.run_script(script) - def test_run_script_subprocess_error(self, tmp_path, mocker): + def test_run_script_subprocess_error( + self, tmp_path: Path, mocker: MockFixture + ) -> None: """Test script execution with subprocess error.""" script = tmp_path / "test_script.sh" script.write_text("#!/bin/bash\necho 'test'\n") @@ -42,7 +48,9 @@ def test_run_script_subprocess_error(self, tmp_path, mocker): with pytest.raises(ScriptExecutionError, match="Error running script"): manager.run_script(script) - def test_run_script_unexpected_error(self, tmp_path, mocker): + def test_run_script_unexpected_error( + self, tmp_path: Path, mocker: MockFixture + ) -> None: """Test script execution with unexpected error.""" script = tmp_path / "test_script.sh" script.write_text("#!/bin/bash\necho 'test'\n") @@ -57,7 +65,9 @@ def test_run_script_unexpected_error(self, tmp_path, mocker): with pytest.raises(ScriptExecutionError, match="Unexpected error"): manager.run_script(script) - def test_run_script_with_path_object(self, tmp_path, mocker): + def test_run_script_with_path_object( + self, tmp_path: Path, mocker: MockFixture + ) -> None: """Test run_script accepts Path objects.""" script = tmp_path / "test_script.sh" script.write_text("#!/bin/bash\necho 'test'\n") @@ -73,7 +83,7 @@ def test_run_script_with_path_object(self, tmp_path, mocker): assert result is True - def test_script_not_file_error(self, tmp_path): + def test_script_not_file_error(self, tmp_path: Path) -> None: """Test error when script path is not a file.""" # Create a directory instead of file script_dir = tmp_path / "script_dir" @@ -84,9 +94,10 @@ def test_script_not_file_error(self, tmp_path): with pytest.raises(ScriptExecutionError, match="not a file"): manager.run_script(script_dir) - def test_script_output_logging(self, tmp_path, mocker, caplog): + def test_script_output_logging( + self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture + ) -> None: """Test that script output is logged.""" - import logging caplog.set_level(logging.DEBUG) @@ -108,9 +119,10 @@ def test_script_output_logging(self, tmp_path, mocker, caplog): # Check that output was logged assert "test output" in caplog.text or "completed successfully" in caplog.text - def 
test_script_stderr_logging_on_failure(self, tmp_path, mocker, caplog): + def test_script_stderr_logging_on_failure( + self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture + ) -> None: """Test that stderr is logged as error on failure.""" - import logging caplog.set_level(logging.ERROR) @@ -131,9 +143,10 @@ def test_script_stderr_logging_on_failure(self, tmp_path, mocker, caplog): assert result is False - def test_script_stderr_logging_on_success(self, tmp_path, mocker, caplog): + def test_script_stderr_logging_on_success( + self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture + ) -> None: """Test that stderr is logged as debug on success.""" - import logging caplog.set_level(logging.DEBUG) diff --git a/tests/unit/core/system/test_env_validator.py b/tests/unit/core/system/test_env_validator.py index e28fa578..ff67a363 100644 --- a/tests/unit/core/system/test_env_validator.py +++ b/tests/unit/core/system/test_env_validator.py @@ -1,6 +1,7 @@ """Unit tests for environment validator.""" import pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.core.system.env_validator import ( validate_anthropic_env, @@ -19,21 +20,21 @@ class TestProviderValidators: """Tests for individual provider validators.""" - def test_validate_openai_env_success(self, mocker): + def test_validate_openai_env_success(self, mocker: MockerFixture) -> None: """Test OpenAI validation succeeds with API key.""" mocker.patch.dict("os.environ", {"OPENAI_API_KEY": "test_key"}) # Should not raise validate_openai_env() - def test_validate_openai_env_failure(self, mocker): + def test_validate_openai_env_failure(self, mocker: MockerFixture) -> None: """Test OpenAI validation fails without API key.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="OPENAI_API_KEY"): validate_openai_env() - def test_validate_azure_env_success(self, mocker): + def test_validate_azure_env_success(self, mocker: MockerFixture) -> None: """Test Azure validation succeeds with required vars.""" mocker.patch.dict( "os.environ", @@ -45,14 +46,14 @@ def test_validate_azure_env_success(self, mocker): validate_azure_env() - def test_validate_azure_env_failure(self, mocker): + def test_validate_azure_env_failure(self, mocker: MockerFixture) -> None: """Test Azure validation fails without required vars.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="Azure"): validate_azure_env() - def test_validate_watsonx_env_success(self, mocker): + def test_validate_watsonx_env_success(self, mocker: MockerFixture) -> None: """Test Watsonx validation succeeds with required vars.""" mocker.patch.dict( "os.environ", @@ -65,46 +66,50 @@ def test_validate_watsonx_env_success(self, mocker): validate_watsonx_env() - def test_validate_watsonx_env_failure(self, mocker): + def test_validate_watsonx_env_failure(self, mocker: MockerFixture) -> None: """Test Watsonx validation fails without required vars.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="Watsonx"): validate_watsonx_env() - def test_validate_anthropic_env_success(self, mocker): + def test_validate_anthropic_env_success(self, mocker: MockerFixture) -> None: """Test Anthropic validation succeeds with API key.""" mocker.patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test_key"}) validate_anthropic_env() - def test_validate_anthropic_env_failure(self, mocker): + def test_validate_anthropic_env_failure(self, mocker: MockerFixture) -> None: """Test 
Anthropic validation fails without API key.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="ANTHROPIC_API_KEY"): validate_anthropic_env() - def test_validate_gemini_env_with_google_api_key(self, mocker): + def test_validate_gemini_env_with_google_api_key( + self, mocker: MockerFixture + ) -> None: """Test Gemini validation succeeds with GOOGLE_API_KEY.""" mocker.patch.dict("os.environ", {"GOOGLE_API_KEY": "test_key"}) validate_gemini_env() - def test_validate_gemini_env_with_gemini_api_key(self, mocker): + def test_validate_gemini_env_with_gemini_api_key( + self, mocker: MockerFixture + ) -> None: """Test Gemini validation succeeds with GEMINI_API_KEY.""" mocker.patch.dict("os.environ", {"GEMINI_API_KEY": "test_key"}) validate_gemini_env() - def test_validate_gemini_env_failure(self, mocker): + def test_validate_gemini_env_failure(self, mocker: MockerFixture) -> None: """Test Gemini validation fails without API keys.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="GOOGLE_API_KEY or GEMINI_API_KEY"): validate_gemini_env() - def test_validate_vertex_env_success(self, mocker): + def test_validate_vertex_env_success(self, mocker: MockerFixture) -> None: """Test Vertex AI validation succeeds with credentials.""" mocker.patch.dict( "os.environ", {"GOOGLE_APPLICATION_CREDENTIALS": "/path/to/creds.json"} @@ -112,21 +117,23 @@ def test_validate_vertex_env_success(self, mocker): validate_vertex_env() - def test_validate_vertex_env_failure(self, mocker): + def test_validate_vertex_env_failure(self, mocker: MockerFixture) -> None: """Test Vertex AI validation fails without credentials.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="GOOGLE_APPLICATION_CREDENTIALS"): validate_vertex_env() - def test_validate_ollama_env_with_host(self, mocker): + def test_validate_ollama_env_with_host(self, mocker: MockerFixture) -> None: """Test Ollama validation with OLLAMA_HOST set.""" mocker.patch.dict("os.environ", {"OLLAMA_HOST": "http://localhost:11434"}) # Should not raise or print warning validate_ollama_env() - def test_validate_ollama_env_without_host(self, mocker, capsys): + def test_validate_ollama_env_without_host( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test Ollama validation without OLLAMA_HOST prints info.""" mocker.patch.dict("os.environ", {}, clear=True) @@ -135,7 +142,7 @@ def test_validate_ollama_env_without_host(self, mocker, capsys): captured = capsys.readouterr() assert "OLLAMA_HOST" in captured.out or "localhost" in captured.out - def test_validate_hosted_vllm_env_success(self, mocker): + def test_validate_hosted_vllm_env_success(self, mocker: MockerFixture) -> None: """Test hosted vLLM validation succeeds with required vars.""" mocker.patch.dict( "os.environ", @@ -147,7 +154,7 @@ def test_validate_hosted_vllm_env_success(self, mocker): validate_hosted_vllm_env() - def test_validate_hosted_vllm_env_failure(self, mocker): + def test_validate_hosted_vllm_env_failure(self, mocker: MockerFixture) -> None: """Test hosted vLLM validation fails without required vars.""" mocker.patch.dict("os.environ", {}, clear=True) @@ -158,13 +165,13 @@ def test_validate_hosted_vllm_env_failure(self, mocker): class TestValidateProviderEnv: """Tests for validate_provider_env dispatcher.""" - def test_validate_provider_openai(self, mocker): + def test_validate_provider_openai(self, mocker: MockerFixture) -> None: """Test provider validation dispatches to OpenAI 
validator.""" mocker.patch.dict("os.environ", {"OPENAI_API_KEY": "test"}) validate_provider_env("openai") - def test_validate_provider_azure(self, mocker): + def test_validate_provider_azure(self, mocker: MockerFixture) -> None: """Test provider validation dispatches to Azure validator.""" mocker.patch.dict( "os.environ", @@ -176,7 +183,7 @@ def test_validate_provider_azure(self, mocker): validate_provider_env("azure") - def test_validate_provider_watsonx(self, mocker): + def test_validate_provider_watsonx(self, mocker: MockerFixture) -> None: """Test provider validation dispatches to Watsonx validator.""" mocker.patch.dict( "os.environ", @@ -189,24 +196,24 @@ def test_validate_provider_watsonx(self, mocker): validate_provider_env("watsonx") - def test_validate_provider_unknown(self, mocker): + def test_validate_provider_unknown(self) -> None: """Test unknown provider doesn't raise error.""" # Unknown providers should be handled gracefully validate_provider_env("unknown_provider") - def test_validate_provider_anthropic(self, mocker): + def test_validate_provider_anthropic(self, mocker: MockerFixture) -> None: """Test provider validation for Anthropic.""" mocker.patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test"}) validate_provider_env("anthropic") - def test_validate_provider_gemini(self, mocker): + def test_validate_provider_gemini(self, mocker: MockerFixture) -> None: """Test provider validation for Gemini.""" mocker.patch.dict("os.environ", {"GOOGLE_API_KEY": "test"}) validate_provider_env("gemini") - def test_validate_provider_vertex(self, mocker): + def test_validate_provider_vertex(self, mocker: MockerFixture) -> None: """Test provider validation for Vertex AI.""" mocker.patch.dict( "os.environ", {"GOOGLE_APPLICATION_CREDENTIALS": "/path/to/creds"} @@ -214,13 +221,13 @@ def test_validate_provider_vertex(self, mocker): validate_provider_env("vertex") - def test_validate_provider_ollama(self, mocker): + def test_validate_provider_ollama(self, mocker: MockerFixture) -> None: """Test provider validation for Ollama.""" mocker.patch.dict("os.environ", {}) validate_provider_env("ollama") - def test_validate_provider_hosted_vllm(self, mocker): + def test_validate_provider_hosted_vllm(self, mocker: MockerFixture) -> None: """Test provider validation for hosted vLLM.""" mocker.patch.dict( "os.environ", diff --git a/tests/unit/core/system/test_lazy_import.py b/tests/unit/core/system/test_lazy_import.py index 522f9f29..7a26f702 100644 --- a/tests/unit/core/system/test_lazy_import.py +++ b/tests/unit/core/system/test_lazy_import.py @@ -8,7 +8,7 @@ class TestCreateLazyGetattr: """Tests for create_lazy_getattr function.""" - def test_lazy_import_success(self): + def test_lazy_import_success(self) -> None: """Test successful lazy import.""" lazy_imports = { "EvaluationResult": ( @@ -24,7 +24,7 @@ def test_lazy_import_success(self): assert result_class is not None assert result_class.__name__ == "EvaluationResult" - def test_lazy_import_unknown_attribute(self): + def test_lazy_import_unknown_attribute(self) -> None: """Test lazy import with unknown attribute.""" lazy_imports = { "KnownClass": ("lightspeed_evaluation.core.models", "EvaluationResult"), @@ -35,7 +35,7 @@ def test_lazy_import_unknown_attribute(self): with pytest.raises(AttributeError, match="has no attribute 'UnknownClass'"): __getattr__("UnknownClass") - def test_lazy_import_failed_import(self): + def test_lazy_import_failed_import(self) -> None: """Test lazy import with invalid module path.""" lazy_imports = { "FakeClass": 
("nonexistent.module", "FakeClass"), @@ -46,7 +46,7 @@ def test_lazy_import_failed_import(self): with pytest.raises(ImportError, match="Failed to import"): __getattr__("FakeClass") - def test_lazy_import_multiple_classes(self): + def test_lazy_import_multiple_classes(self) -> None: """Test lazy importing multiple classes.""" lazy_imports = { "EvaluationResult": ( @@ -64,9 +64,9 @@ def test_lazy_import_multiple_classes(self): assert result_class.__name__ == "EvaluationResult" assert config_class.__name__ == "SystemConfig" - def test_lazy_import_preserves_module_name_in_error(self): + def test_lazy_import_preserves_module_name_in_error(self) -> None: """Test that module name appears in error messages.""" - lazy_imports = {} + lazy_imports: dict[str, tuple[str, str]] = {} __getattr__ = create_lazy_getattr(lazy_imports, "my_custom_module") diff --git a/tests/unit/core/system/test_loader.py b/tests/unit/core/system/test_loader.py index 5d87ee19..5b452dd6 100644 --- a/tests/unit/core/system/test_loader.py +++ b/tests/unit/core/system/test_loader.py @@ -17,7 +17,7 @@ class TestPopulateMetricMappings: """Unit tests for populate_metric_mappings function.""" - def test_populate_metric_mappings_turn_level(self): + def test_populate_metric_mappings_turn_level(self) -> None: """Test populating turn-level metrics.""" config = SystemConfig() config.default_turn_metrics_metadata = { @@ -31,7 +31,7 @@ def test_populate_metric_mappings_turn_level(self): assert "ragas:faithfulness" in TURN_LEVEL_METRICS assert "custom:answer_correctness" in TURN_LEVEL_METRICS - def test_populate_metric_mappings_conversation_level(self): + def test_populate_metric_mappings_conversation_level(self) -> None: """Test populating conversation-level metrics.""" config = SystemConfig() config.default_turn_metrics_metadata = {} @@ -45,7 +45,7 @@ def test_populate_metric_mappings_conversation_level(self): assert "deepeval:conversation_completeness" in CONVERSATION_LEVEL_METRICS assert "deepeval:conversation_relevancy" in CONVERSATION_LEVEL_METRICS - def test_populate_metric_mappings_clears_previous(self): + def test_populate_metric_mappings_clears_previous(self) -> None: """Test that populate clears previous mappings.""" config1 = SystemConfig() config1.default_turn_metrics_metadata = {"metric1": {}} @@ -70,14 +70,14 @@ def test_populate_metric_mappings_clears_previous(self): class TestConfigLoader: """Unit tests for ConfigLoader.""" - def test_load_system_config_file_not_found(self): + def test_load_system_config_file_not_found(self) -> None: """Test loading non-existent config file raises error.""" loader = ConfigLoader() with pytest.raises(ValueError, match="file not found"): loader.load_system_config("/nonexistent/config.yaml") - def test_load_system_config_invalid_yaml(self): + def test_load_system_config_invalid_yaml(self) -> None: """Test loading invalid YAML raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("invalid: yaml: [[[") @@ -90,7 +90,7 @@ def test_load_system_config_invalid_yaml(self): finally: Path(temp_path).unlink() - def test_load_system_config_empty_file(self): + def test_load_system_config_empty_file(self) -> None: """Test loading empty YAML raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("") @@ -103,7 +103,7 @@ def test_load_system_config_empty_file(self): finally: Path(temp_path).unlink() - def test_load_system_config_not_dict(self): + def test_load_system_config_not_dict(self) -> None: """Test loading 
YAML with non-dict root raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("- item1\n- item2\n") @@ -116,7 +116,7 @@ def test_load_system_config_not_dict(self): finally: Path(temp_path).unlink() - def test_load_system_config_minimal_valid(self): + def test_load_system_config_minimal_valid(self) -> None: """Test loading minimal valid config.""" yaml_content = """ llm: @@ -148,7 +148,7 @@ def test_load_system_config_minimal_valid(self): finally: Path(temp_path).unlink() - def test_load_system_config_with_all_sections(self): + def test_load_system_config_with_all_sections(self) -> None: """Test loading config with all sections.""" yaml_content = """ core: @@ -215,7 +215,7 @@ def test_load_system_config_with_all_sections(self): finally: Path(temp_path).unlink() - def test_load_system_config_populates_metrics(self): + def test_load_system_config_populates_metrics(self) -> None: """Test that loading config populates global metric mappings.""" yaml_content = """ llm: @@ -260,7 +260,7 @@ def test_load_system_config_populates_metrics(self): finally: Path(temp_path).unlink() - def test_load_system_config_with_defaults(self): + def test_load_system_config_with_defaults(self) -> None: """Test that missing sections use defaults.""" yaml_content = """ llm: @@ -288,7 +288,7 @@ def test_load_system_config_with_defaults(self): finally: Path(temp_path).unlink() - def test_create_system_config_missing_metrics_metadata(self): + def test_create_system_config_missing_metrics_metadata(self) -> None: """Test creating config when metrics_metadata is missing.""" yaml_content = """ llm: @@ -305,12 +305,12 @@ def test_create_system_config_missing_metrics_metadata(self): config = loader.load_system_config(temp_path) # Should handle missing metrics_metadata gracefully - assert config.default_turn_metrics_metadata == {} - assert config.default_conversation_metrics_metadata == {} + assert not config.default_turn_metrics_metadata + assert not config.default_conversation_metrics_metadata finally: Path(temp_path).unlink() - def test_create_system_config_partial_metrics_metadata(self): + def test_create_system_config_partial_metrics_metadata(self) -> None: """Test creating config with partial metrics_metadata.""" yaml_content = """ llm: @@ -335,11 +335,11 @@ def test_create_system_config_partial_metrics_metadata(self): # Should handle missing conversation_level assert len(config.default_turn_metrics_metadata) > 0 - assert config.default_conversation_metrics_metadata == {} + assert not config.default_conversation_metrics_metadata finally: Path(temp_path).unlink() - def test_load_system_config_empty_sections(self): + def test_load_system_config_empty_sections(self) -> None: """Test loading config with empty sections.""" yaml_content = """ llm: diff --git a/tests/unit/core/system/test_setup.py b/tests/unit/core/system/test_setup.py index 5fa25789..c3e8dc0e 100644 --- a/tests/unit/core/system/test_setup.py +++ b/tests/unit/core/system/test_setup.py @@ -3,6 +3,9 @@ import logging import os +from pytest_mock import MockerFixture +from _pytest.capture import CaptureFixture + from lightspeed_evaluation.core.models import LoggingConfig from lightspeed_evaluation.core.system.setup import ( setup_environment_variables, @@ -13,9 +16,9 @@ class TestSetupEnvironmentVariables: """Tests for environment variable setup.""" - def test_setup_default_environment_variables(self, mocker): + def test_setup_default_environment_variables(self, mocker: MockerFixture) -> None: """Test setting up 
default environment variables.""" - config_data = {} + config_data: dict = {} # Use mocker to patch os.environ mocker.patch.dict(os.environ, {}, clear=True) @@ -34,7 +37,7 @@ def test_setup_default_environment_variables(self, mocker): assert os.environ["SSL_CERTIFI_BUNDLE"] == "/path/to/certifi/cacert.pem" mock_where.assert_called_once() - def test_setup_custom_environment_variables(self, mocker): + def test_setup_custom_environment_variables(self, mocker: MockerFixture) -> None: """Test setting up custom environment variables.""" config_data = { "environment": { @@ -58,7 +61,9 @@ def test_setup_custom_environment_variables(self, mocker): assert os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] == "YES" assert os.environ["SSL_CERTIFI_BUNDLE"] == "/path/to/certifi/cacert.pem" - def test_setup_environment_variables_override_defaults(self, mocker): + def test_setup_environment_variables_override_defaults( + self, mocker: MockerFixture + ) -> None: """Test overriding default environment variables.""" config_data = {"environment": {"LITELLM_LOG": "DEBUG"}} @@ -74,7 +79,9 @@ def test_setup_environment_variables_override_defaults(self, mocker): assert os.environ["LITELLM_LOG"] == "DEBUG" assert os.environ["SSL_CERTIFI_BUNDLE"] == "/path/to/certifi/cacert.pem" - def test_setup_environment_variables_handles_key_error(self, mocker, capsys): + def test_setup_environment_variables_handles_key_error( + self, mocker: MockerFixture, capsys: CaptureFixture + ) -> None: """Test handling of KeyError during environment setup.""" config_data = {"environment": None} # This will cause issues @@ -95,7 +102,9 @@ def test_setup_environment_variables_handles_key_error(self, mocker, capsys): captured = capsys.readouterr() assert "Warning" in captured.out or "fallback" in captured.out - def test_setup_environment_variables_handles_type_error(self, mocker, capsys): + def test_setup_environment_variables_handles_type_error( + self, mocker: MockerFixture + ) -> None: """Test handling of TypeError during environment setup.""" config_data = {"environment": "invalid_type"} @@ -111,7 +120,9 @@ def test_setup_environment_variables_handles_type_error(self, mocker, capsys): assert os.environ["RAGAS_DO_NOT_TRACK"] == "true" assert os.environ["SSL_CERTIFI_BUNDLE"] == "/path/to/certifi/cacert.pem" - def test_setup_ssl_certifi_bundle_set_when_ssl_cert_file_is_none(self, mocker): + def test_setup_ssl_certifi_bundle_set_when_ssl_cert_file_is_none( + self, mocker: MockerFixture + ) -> None: """Test SSL_CERTIFI_BUNDLE is still set even when ssl_cert_file is None.""" config_data = {"llm": {"ssl_verify": True, "ssl_cert_file": None}} mocker.patch.dict(os.environ, {}, clear=True) @@ -129,7 +140,7 @@ def test_setup_ssl_certifi_bundle_set_when_ssl_cert_file_is_none(self, mocker): class TestSetupLogging: """Tests for logging setup.""" - def test_setup_logging_basic(self): + def test_setup_logging_basic(self) -> None: """Test basic logging setup.""" logging_config = LoggingConfig( source_level="INFO", @@ -143,7 +154,7 @@ def test_setup_logging_basic(self): assert logger.name == "lightspeed_evaluation" assert logger.level == logging.INFO - def test_setup_logging_debug_level(self): + def test_setup_logging_debug_level(self) -> None: """Test logging setup with DEBUG level.""" logging_config = LoggingConfig( source_level="DEBUG", @@ -155,7 +166,7 @@ def test_setup_logging_debug_level(self): assert logger.level == logging.DEBUG - def test_setup_logging_with_package_overrides(self): + def test_setup_logging_with_package_overrides(self) -> None: """Test 
logging setup with package overrides.""" logging_config = LoggingConfig( source_level="INFO", @@ -179,7 +190,7 @@ def test_setup_logging_with_package_overrides(self): urllib3_logger = logging.getLogger("urllib3") assert urllib3_logger.level == logging.CRITICAL - def test_setup_logging_sets_default_noisy_packages(self): + def test_setup_logging_sets_default_noisy_packages(self) -> None: """Test that noisy packages get default levels set.""" logging_config = LoggingConfig( source_level="INFO", @@ -193,7 +204,9 @@ def test_setup_logging_sets_default_noisy_packages(self): matplotlib_logger = logging.getLogger("matplotlib") assert matplotlib_logger.level == logging.ERROR - def test_setup_logging_handles_invalid_override_level(self, capsys): + def test_setup_logging_handles_invalid_override_level( + self, capsys: CaptureFixture + ) -> None: """Test handling of invalid log level in overrides.""" logging_config = LoggingConfig( source_level="INFO", @@ -211,7 +224,7 @@ def test_setup_logging_handles_invalid_override_level(self, capsys): captured = capsys.readouterr() assert "Warning" in captured.out or "Invalid" in captured.out - def test_setup_logging_error_level(self): + def test_setup_logging_error_level(self) -> None: """Test logging setup with ERROR level.""" logging_config = LoggingConfig( source_level="ERROR", @@ -223,7 +236,7 @@ def test_setup_logging_error_level(self): assert logger.level == logging.ERROR - def test_setup_logging_custom_format(self): + def test_setup_logging_custom_format(self) -> None: """Test logging with custom format.""" custom_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" logging_config = LoggingConfig( @@ -238,7 +251,7 @@ def test_setup_logging_custom_format(self): # Format is applied to root logger, not easy to verify directly # but at least verify it doesn't crash - def test_setup_logging_warning_level(self): + def test_setup_logging_warning_level(self) -> None: """Test logging setup with WARNING level.""" logging_config = LoggingConfig( source_level="WARNING", @@ -250,7 +263,7 @@ def test_setup_logging_warning_level(self): assert logger.level == logging.WARNING - def test_setup_logging_applies_to_all_default_packages(self): + def test_setup_logging_applies_to_all_default_packages(self) -> None: """Test that all default noisy packages get configured.""" logging_config = LoggingConfig( source_level="INFO", diff --git a/tests/unit/core/system/test_ssl_certifi.py b/tests/unit/core/system/test_ssl_certifi.py index 93cbd6ea..d92bf73c 100644 --- a/tests/unit/core/system/test_ssl_certifi.py +++ b/tests/unit/core/system/test_ssl_certifi.py @@ -2,6 +2,8 @@ from pathlib import Path +from pytest_mock import MockerFixture + from lightspeed_evaluation.core.system.ssl_certifi import ( create_ssl_certifi_bundle, get_ssl_cert_files_paths_from_system_yaml, @@ -13,7 +15,7 @@ class TestGetSslCertFilesPathsFromSystemYaml: """Tests for extracting SSL cert paths from config data.""" - def test_extracts_cert_when_ssl_verify_true(self): + def test_extracts_cert_when_ssl_verify_true(self) -> None: """Test extracting SSL cert when ssl_verify is True.""" config = { "ssl_verify": True, @@ -24,7 +26,7 @@ def test_extracts_cert_when_ssl_verify_true(self): assert result == ["/path/to/cert.pem"] - def test_ignores_cert_when_ssl_verify_false(self): + def test_ignores_cert_when_ssl_verify_false(self) -> None: """Test that ssl_cert_file is ignored when ssl_verify is False.""" config = { "ssl_verify": False, @@ -33,9 +35,9 @@ def test_ignores_cert_when_ssl_verify_false(self): result 
= get_ssl_cert_files_paths_from_system_yaml(config) - assert result == [] + assert not result - def test_nested_configs(self): + def test_nested_configs(self) -> None: """Test extracting SSL certs from nested configuration.""" config = { "service_a": { @@ -67,7 +69,7 @@ def test_nested_configs(self): class TestGetSystemSslCertFile: """Tests for getting system SSL cert file from environment.""" - def test_returns_cert_file_when_env_set(self, mocker): + def test_returns_cert_file_when_env_set(self, mocker: MockerFixture) -> None: """Test when SSL_CERT_FILE environment variable is set.""" mocker.patch.dict("os.environ", {"SSL_CERT_FILE": "/system/cert.pem"}) @@ -75,19 +77,19 @@ def test_returns_cert_file_when_env_set(self, mocker): assert result == ["/system/cert.pem"] - def test_returns_empty_when_env_not_set(self, mocker): + def test_returns_empty_when_env_not_set(self, mocker: MockerFixture) -> None: """Test when SSL_CERT_FILE environment variable is not set.""" mocker.patch.dict("os.environ", {}, clear=True) result = get_system_ssl_cert_file() - assert result == [] + assert not result class TestGetUniqueSslCertPaths: """Tests for getting unique SSL certificate paths.""" - def test_returns_unique_paths(self): + def test_returns_unique_paths(self) -> None: """Test that duplicate paths are removed.""" cert_paths = [ "/path/to/cert_a.pem", @@ -99,17 +101,19 @@ def test_returns_unique_paths(self): assert set(result) == {"/path/to/cert_a.pem", "/path/to/cert_b.pem"} - def test_returns_empty_list_when_no_paths(self): + def test_returns_empty_list_when_no_paths(self) -> None: """Test that an empty list is returned when no paths are provided.""" result = _get_unique_ssl_cert_paths([]) - assert result == [] + assert not result class TestCreateSslCertifiBundle: """Tests for creating combined SSL certificate bundle.""" - def test_returns_certifi_bundle_when_no_custom_certs(self, mocker): + def test_returns_certifi_bundle_when_no_custom_certs( + self, mocker: MockerFixture + ) -> None: """Test that certifi bundle is returned when no custom certs exist.""" mocker.patch.dict("os.environ", {}, clear=True) @@ -122,7 +126,9 @@ def test_returns_certifi_bundle_when_no_custom_certs(self, mocker): assert result == "/path/to/certifi/cacert.pem" - def test_combines_certifi_with_custom_cert(self, mocker, tmp_path): + def test_combines_certifi_with_custom_cert( + self, mocker: MockerFixture, tmp_path: Path + ) -> None: """Test combining certifi bundle with custom cert from config.""" mocker.patch.dict("os.environ", {}, clear=True) @@ -143,12 +149,14 @@ def test_combines_certifi_with_custom_cert(self, mocker, tmp_path): mock_where.return_value = str(certifi_bundle) result = create_ssl_certifi_bundle(config) - content = Path(result).read_text() + content = Path(result).read_text(encoding="utf-8") assert "CERTIFI BUNDLE" in content assert "CUSTOM CERT" in content - def test_combines_config_and_env_certs(self, mocker, tmp_path): + def test_combines_config_and_env_certs( + self, mocker: MockerFixture, tmp_path: Path + ) -> None: """Test combining certs from both config and environment.""" certifi_bundle = tmp_path / "certifi.pem" certifi_bundle.write_text("CERTIFI BUNDLE\n") @@ -172,13 +180,15 @@ def test_combines_config_and_env_certs(self, mocker, tmp_path): mock_where.return_value = str(certifi_bundle) result = create_ssl_certifi_bundle(config) - content = Path(result).read_text() + content = Path(result).read_text(encoding="utf-8") assert "CERTIFI BUNDLE" in content assert "CONFIG CERT" in content assert "ENV CERT" 
in content - def test_registers_atexit_cleanup(self, mocker, tmp_path): + def test_registers_atexit_cleanup( + self, mocker: MockerFixture, tmp_path: Path + ) -> None: """Test that atexit cleanup is registered for temp bundle.""" mocker.patch.dict("os.environ", {}) diff --git a/tests/unit/core/system/test_validator.py b/tests/unit/core/system/test_validator.py index b39c4ba9..95799a6d 100644 --- a/tests/unit/core/system/test_validator.py +++ b/tests/unit/core/system/test_validator.py @@ -4,6 +4,9 @@ from pathlib import Path import pytest +from pytest_mock import MockerFixture + +from pydantic import ValidationError from lightspeed_evaluation.core.models import EvaluationData, TurnData from lightspeed_evaluation.core.system.exceptions import DataValidationError @@ -11,13 +14,12 @@ DataValidator, format_pydantic_error, ) -from pydantic import ValidationError class TestFormatPydanticError: """Unit tests for format_pydantic_error helper function.""" - def test_format_single_error(self): + def test_format_single_error(self) -> None: """Test formatting a single Pydantic validation error.""" try: TurnData(turn_id="1", query="", response="Valid") @@ -26,7 +28,7 @@ def test_format_single_error(self): assert "query" in formatted assert "at least 1 character" in formatted - def test_format_multiple_errors(self): + def test_format_multiple_errors(self) -> None: """Test formatting multiple validation errors.""" try: TurnData(turn_id="", query="", response="") @@ -40,7 +42,7 @@ def test_format_multiple_errors(self): class TestDataValidator: """Unit tests for DataValidator.""" - def test_validate_evaluation_data_valid(self): + def test_validate_evaluation_data_valid(self) -> None: """Test validation passes with valid data.""" validator = DataValidator(api_enabled=False) @@ -53,12 +55,18 @@ def test_validate_evaluation_data_valid(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is True assert len(validator.validation_errors) == 0 - def test_validate_metrics_availability_unknown_turn_metric(self, mocker): + def test_validate_metrics_availability_unknown_turn_metric( + self, mocker: MockerFixture + ) -> None: """Test validation fails for unknown turn metric.""" # Mock the global metrics sets mocker.patch( @@ -76,7 +84,11 @@ def test_validate_metrics_availability_unknown_turn_metric(self, mocker): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert len(validator.validation_errors) > 0 @@ -84,7 +96,9 @@ def test_validate_metrics_availability_unknown_turn_metric(self, mocker): "Unknown turn metric" in error for error in validator.validation_errors ) - def test_validate_metrics_availability_unknown_conversation_metric(self, mocker): + def test_validate_metrics_availability_unknown_conversation_metric( + self, mocker: MockerFixture + ) -> None: """Test validation fails for unknown conversation metric.""" mocker.patch( "lightspeed_evaluation.core.system.validator.CONVERSATION_LEVEL_METRICS", @@ -100,7 +114,11 @@ def test_validate_metrics_availability_unknown_conversation_metric(self, mocker) conversation_metrics=["unknown:conversation_metric"], ) - result = 
validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any( @@ -108,7 +126,7 @@ def test_validate_metrics_availability_unknown_conversation_metric(self, mocker) for error in validator.validation_errors ) - def test_validate_metric_requirements_missing_response(self): + def test_validate_metric_requirements_missing_response(self) -> None: """Test validation fails when required response field is missing.""" validator = DataValidator(api_enabled=False) @@ -120,12 +138,16 @@ def test_validate_metric_requirements_missing_response(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any("response" in error.lower() for error in validator.validation_errors) - def test_validate_metric_requirements_missing_contexts(self): + def test_validate_metric_requirements_missing_contexts(self) -> None: """Test validation fails when required contexts are missing.""" validator = DataValidator(api_enabled=False) @@ -138,14 +160,18 @@ def test_validate_metric_requirements_missing_contexts(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any("contexts" in error.lower() for error in validator.validation_errors) def test_validate_metric_requirements_api_enabled_allows_missing_response( - self, mocker - ): + self, mocker: MockerFixture + ) -> None: """Test that missing response is allowed when API is enabled.""" # Mock the global metrics sets mocker.patch( @@ -163,12 +189,16 @@ def test_validate_metric_requirements_api_enabled_allows_missing_response( ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) # Should pass because API will populate response assert result is True - def test_validate_metric_requirements_expected_response_missing(self): + def test_validate_metric_requirements_expected_response_missing(self) -> None: """Test validation fails when expected_response is required but missing.""" validator = DataValidator(api_enabled=False) @@ -182,7 +212,11 @@ def test_validate_metric_requirements_expected_response_missing(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any( @@ -190,7 +224,7 @@ def test_validate_metric_requirements_expected_response_missing(self): for error in validator.validation_errors ) - def test_validate_metric_requirements_tool_eval_missing_fields(self): + def test_validate_metric_requirements_tool_eval_missing_fields(self) -> None: """Test validation fails when tool_eval required fields are missing.""" validator = DataValidator(api_enabled=False) @@ -204,14 +238,20 @@ def test_validate_metric_requirements_tool_eval_missing_fields(self): ) conv_data = 
EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any( "tool_calls" in error.lower() for error in validator.validation_errors ) - def test_validate_metric_requirements_skip_script_when_api_disabled(self, mocker): + def test_validate_metric_requirements_skip_script_when_api_disabled( + self, mocker: MockerFixture + ) -> None: """Test script metrics validation is skipped when API is disabled.""" # Mock the global metrics sets mocker.patch( @@ -231,19 +271,23 @@ def test_validate_metric_requirements_skip_script_when_api_disabled(self, mocker conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) # Should not validate script requirements when API disabled - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) # Should pass because script validation is skipped assert result is True - def test_load_evaluation_data_file_not_found(self): + def test_load_evaluation_data_file_not_found(self) -> None: """Test loading non-existent file raises error.""" validator = DataValidator() with pytest.raises(DataValidationError, match="file not found"): validator.load_evaluation_data("/nonexistent/file.yaml") - def test_load_evaluation_data_invalid_yaml(self): + def test_load_evaluation_data_invalid_yaml(self) -> None: """Test loading invalid YAML raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("invalid: yaml: content: [") @@ -256,7 +300,7 @@ def test_load_evaluation_data_invalid_yaml(self): finally: Path(temp_path).unlink() - def test_load_evaluation_data_empty_file(self): + def test_load_evaluation_data_empty_file(self) -> None: """Test loading empty YAML file raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("") @@ -269,7 +313,7 @@ def test_load_evaluation_data_empty_file(self): finally: Path(temp_path).unlink() - def test_load_evaluation_data_not_list(self): + def test_load_evaluation_data_not_list(self) -> None: """Test loading YAML with non-list root raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("conversation_group_id: test\n") @@ -282,7 +326,7 @@ def test_load_evaluation_data_not_list(self): finally: Path(temp_path).unlink() - def test_load_evaluation_data_valid(self, mocker): + def test_load_evaluation_data_valid(self, mocker: MockerFixture) -> None: """Test loading valid evaluation data file.""" yaml_content = """ - conversation_group_id: test_conv @@ -290,7 +334,7 @@ def test_load_evaluation_data_valid(self, mocker): - turn_id: "1" query: "What is Python?" response: "Python is a programming language." 
-""" + """ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write(yaml_content) @@ -316,7 +360,7 @@ def test_load_evaluation_data_valid(self, mocker): finally: Path(temp_path).unlink() - def test_check_metric_requirements_missing_contexts(self): + def test_check_metric_requirements_missing_contexts(self) -> None: """Test validation fails for missing contexts when required.""" validator = DataValidator(api_enabled=False) @@ -329,12 +373,16 @@ def test_check_metric_requirements_missing_contexts(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any("contexts" in error.lower() for error in validator.validation_errors) - def test_check_metric_requirements_whitespace_only_string(self): + def test_check_metric_requirements_whitespace_only_string(self) -> None: """Test validation fails for whitespace-only required string.""" validator = DataValidator(api_enabled=False) @@ -346,11 +394,15 @@ def test_check_metric_requirements_whitespace_only_string(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False - def test_validate_multiple_conversations(self): + def test_validate_multiple_conversations(self) -> None: """Test validating multiple conversations.""" validator = DataValidator(api_enabled=False) @@ -360,11 +412,17 @@ def test_validate_multiple_conversations(self): conv1 = EvaluationData(conversation_group_id="conv1", turns=[turn1]) conv2 = EvaluationData(conversation_group_id="conv2", turns=[turn2]) - result = validator._validate_evaluation_data([conv1, conv2]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv1, conv2] + ) + ) assert result is True - def test_validate_evaluation_data_accumulates_errors(self, mocker): + def test_validate_evaluation_data_accumulates_errors( + self, mocker: MockerFixture + ) -> None: """Test that validation accumulates multiple errors.""" mocker.patch( "lightspeed_evaluation.core.system.validator.TURN_LEVEL_METRICS", @@ -389,7 +447,11 @@ def test_validate_evaluation_data_accumulates_errors(self, mocker): conv = EvaluationData(conversation_group_id="test", turns=[turn1, turn2]) - result = validator._validate_evaluation_data([conv]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv] + ) + ) assert result is False # Should have errors for both issues @@ -399,7 +461,7 @@ def test_validate_evaluation_data_accumulates_errors(self, mocker): class TestFilterByScope: """Unit test for filter by scope.""" - def test_filter_by_scope_no_filter(self): + def test_filter_by_scope_no_filter(self) -> None: """Test no filtering when both tags and conv_ids are None.""" validator = DataValidator() data = [ @@ -412,10 +474,10 @@ def test_filter_by_scope_no_filter(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data) + result = validator._filter_by_scope(data) # pylint: disable=protected-access assert len(result) == 2 - def test_filter_by_scope_tags_only(self): + def test_filter_by_scope_tags_only(self) -> None: """Test filtering by tags only.""" validator = 
DataValidator() data = [ @@ -435,11 +497,13 @@ def test_filter_by_scope_tags_only(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data, tags=["basic"]) + result = validator._filter_by_scope( # pylint: disable=protected-access + data, tags=["basic"] + ) assert len(result) == 2 assert all(c.tag == "basic" for c in result) - def test_filter_by_scope_conv_ids_only(self): + def test_filter_by_scope_conv_ids_only(self) -> None: """Test filtering by conversation IDs only.""" validator = DataValidator() data = [ @@ -456,11 +520,13 @@ def test_filter_by_scope_conv_ids_only(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data, conv_ids=["conv_1", "conv_3"]) + result = validator._filter_by_scope( # pylint: disable=protected-access + data, conv_ids=["conv_1", "conv_3"] + ) assert len(result) == 2 assert {c.conversation_group_id for c in result} == {"conv_1", "conv_3"} - def test_filter_by_scope_tags_and_conv_ids(self): + def test_filter_by_scope_tags_and_conv_ids(self) -> None: """Test filtering by both tags and conv_ids uses OR logic.""" validator = DataValidator() data = [ @@ -480,10 +546,12 @@ def test_filter_by_scope_tags_and_conv_ids(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data, tags=["basic"], conv_ids=["conv_3"]) + result = validator._filter_by_scope( # pylint: disable=protected-access + data, tags=["basic"], conv_ids=["conv_3"] + ) assert len(result) == 2 # conv_1 (basic tag) + conv_3 (by ID) - def test_filter_by_scope_no_match_returns_empty(self): + def test_filter_by_scope_no_match_returns_empty(self) -> None: """Test filtering with no matching criteria returns empty list.""" validator = DataValidator() data = [ @@ -493,5 +561,7 @@ def test_filter_by_scope_no_match_returns_empty(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data, tags=["nonexistent"]) + result = validator._filter_by_scope( # pylint: disable=protected-access + data, tags=["nonexistent"] + ) assert len(result) == 0 diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py new file mode 100644 index 00000000..a1131cd5 --- /dev/null +++ b/tests/unit/pipeline/evaluation/conftest.py @@ -0,0 +1,223 @@ +"""Pytest configuration and fixtures for evaluation tests.""" + +import pytest +from pytest_mock import MockerFixture + +from lightspeed_evaluation.core.models import ( + EvaluationData, + SystemConfig, + TurnData, +) +from lightspeed_evaluation.core.system.loader import ConfigLoader +from lightspeed_evaluation.core.metrics.manager import MetricManager +from lightspeed_evaluation.core.script import ScriptExecutionManager +from lightspeed_evaluation.core.models import EvaluationResult, EvaluationRequest +from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender +from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler +from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator +from lightspeed_evaluation.pipeline.evaluation.processor import ( + ProcessorComponents, + ConversationProcessor, +) + + +@pytest.fixture +def config_loader(mocker: MockerFixture) -> ConfigLoader: + """Create a mock config loader with system config.""" + loader = mocker.Mock(spec=ConfigLoader) + + config = SystemConfig() + config.default_turn_metrics_metadata = { + "ragas:faithfulness": {"threshold": 0.7, 
"default": True}, + "custom:answer_correctness": {"threshold": 0.8, "default": False}, + } + config.default_conversation_metrics_metadata = { + "deepeval:conversation_completeness": {"threshold": 0.6, "default": True}, + } + config.api.enabled = True + + loader.system_config = config + return loader + + +@pytest.fixture +def mock_metric_manager(mocker: MockerFixture) -> MetricManager: + """Create a mock metric manager.""" + manager = mocker.Mock(spec=MetricManager) + + def get_threshold( + metric_id: str, + _level: str, + _conv_data: EvaluationData | None = None, + _turn_data: TurnData | None = None, + ) -> float: + thresholds = { + "ragas:faithfulness": 0.7, + "custom:answer_correctness": 0.8, + "deepeval:conversation_completeness": 0.6, + } + return thresholds.get(metric_id, 0.5) + + manager.get_effective_threshold.side_effect = get_threshold + # Mock get_metric_metadata to return None (no metadata) to support iteration + # in _extract_metadata_for_csv + manager.get_metric_metadata.return_value = None + return manager + + +@pytest.fixture +def mock_script_manager(mocker: MockerFixture) -> ScriptExecutionManager: + """Create a mock script execution manager.""" + manager = mocker.Mock(spec=ScriptExecutionManager) + return manager + + +@pytest.fixture +def mock_config_loader(mocker: MockerFixture) -> ConfigLoader: + """Create a mock config loader with system config.""" + loader = mocker.Mock(spec=ConfigLoader) + + config = SystemConfig() + config.api.enabled = False + config.output.output_dir = "/tmp/test_output" + config.output.base_filename = "test" + config.core.max_threads = 2 + + loader.system_config = config + return loader + + +@pytest.fixture +def sample_evaluation_data() -> list[EvaluationData]: + """Create sample evaluation data.""" + turn1 = TurnData( + turn_id="turn1", + query="What is Python?", + response="Python is a programming language.", + contexts=["Python context"], + turn_metrics=["ragas:faithfulness"], + ) + conv_data = EvaluationData( + conversation_group_id="conv1", + turns=[turn1], + ) + return [conv_data] + + +@pytest.fixture +def processor_components(mocker: MockerFixture) -> ProcessorComponents: + """Create processor components.""" + metrics_evaluator = mocker.Mock(spec=MetricsEvaluator) + api_amender = mocker.Mock(spec=APIDataAmender) + error_handler = mocker.Mock(spec=EvaluationErrorHandler) + metric_manager = mocker.Mock(spec=MetricManager) + script_manager = mocker.Mock(spec=ScriptExecutionManager) + + # Default behavior for metric resolution + metric_manager.resolve_metrics.return_value = ["ragas:faithfulness"] + + return ProcessorComponents( + metrics_evaluator=metrics_evaluator, + api_amender=api_amender, + error_handler=error_handler, + metric_manager=metric_manager, + script_manager=script_manager, + ) + + +@pytest.fixture +def sample_conv_data() -> EvaluationData: + """Create sample conversation data.""" + turn1 = TurnData( + turn_id="turn1", + query="What is Python?", + response="Python is a programming language.", + contexts=["Context"], + turn_metrics=["ragas:faithfulness"], + ) + return EvaluationData( + conversation_group_id="conv1", + turns=[turn1], + ) + + +@pytest.fixture +def mock_metrics_evaluator(mocker: MockerFixture) -> MetricsEvaluator: + """Create a mock metrics evaluator.""" + evaluator = mocker.Mock(spec=MetricsEvaluator) + + def evaluate_metric(request: EvaluationRequest) -> EvaluationResult: + """Mock evaluate_metric that returns a result based on metric.""" + return EvaluationResult( + 
+            conversation_group_id=request.conv_data.conversation_group_id,
+            turn_id=request.turn_id,
+            metric_identifier=request.metric_identifier,
+            result="PASS",
+            score=0.85,
+            reason="Test evaluation",
+            threshold=0.7,
+        )
+
+    evaluator.evaluate_metric.side_effect = evaluate_metric
+    return evaluator
+
+
+@pytest.fixture
+def mock_api_amender(mocker: MockerFixture) -> APIDataAmender:
+    """Create a mock API data amender."""
+    amender = mocker.Mock(spec=APIDataAmender)
+    return amender
+
+
+@pytest.fixture
+def mock_error_handler(mocker: MockerFixture) -> EvaluationErrorHandler:
+    """Create a mock error handler."""
+    handler = mocker.Mock(spec=EvaluationErrorHandler)
+
+    # Configure create_error_result to return a proper EvaluationResult
+    def create_error_result_side_effect(
+        conv_id: str,
+        metric_id: str,
+        reason: str,
+        *,
+        turn_id: str | None = None,
+        query: str = "",
+    ) -> EvaluationResult:
+        return EvaluationResult(
+            conversation_group_id=conv_id,
+            turn_id=turn_id,
+            metric_identifier=metric_id,
+            result="ERROR",
+            reason=reason,
+            query=query,
+        )
+
+    handler.create_error_result.side_effect = create_error_result_side_effect
+    return handler
+
+
+@pytest.fixture
+def processor_components_pr(
+    mock_metrics_evaluator: MetricsEvaluator,  # pylint: disable=redefined-outer-name
+    mock_api_amender: APIDataAmender,  # pylint: disable=redefined-outer-name
+    mock_error_handler: EvaluationErrorHandler,  # pylint: disable=redefined-outer-name
+    mock_metric_manager: MetricManager,  # pylint: disable=redefined-outer-name
+    mock_script_manager: ScriptExecutionManager,  # pylint: disable=redefined-outer-name
+) -> ProcessorComponents:
+    """Create processor components fixture for PR tests."""
+    return ProcessorComponents(
+        metrics_evaluator=mock_metrics_evaluator,
+        api_amender=mock_api_amender,
+        error_handler=mock_error_handler,
+        metric_manager=mock_metric_manager,
+        script_manager=mock_script_manager,
+    )
+
+
+@pytest.fixture
+def processor(
+    config_loader: ConfigLoader,  # pylint: disable=redefined-outer-name
+    processor_components_pr: ProcessorComponents,  # pylint: disable=redefined-outer-name
+) -> ConversationProcessor:
+    """Create ConversationProcessor instance for PR tests."""
+    return ConversationProcessor(config_loader, processor_components_pr)
diff --git a/tests/unit/pipeline/evaluation/test_amender.py b/tests/unit/pipeline/evaluation/test_amender.py
index 39bbd77d..bc2168df 100644
--- a/tests/unit/pipeline/evaluation/test_amender.py
+++ b/tests/unit/pipeline/evaluation/test_amender.py
@@ -1,5 +1,7 @@
 """Unit tests for pipeline evaluation amender module."""
 
+from pytest_mock import MockerFixture
+
 from lightspeed_evaluation.core.models import APIResponse, TurnData
 from lightspeed_evaluation.core.system.exceptions import APIError
 from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender
@@ -8,7 +10,7 @@
 class TestAPIDataAmender:
     """Unit tests for APIDataAmender."""
 
-    def test_amend_single_turn_no_client(self):
+    def test_amend_single_turn_no_client(self) -> None:
         """Test amendment returns None when no API client is available."""
         amender = APIDataAmender(None)
 
@@ -20,7 +22,7 @@ def test_amend_single_turn_no_client(self):
         assert conversation_id is None
         assert turn.response is None  # Not modified
 
-    def test_amend_single_turn_success(self, mocker):
+    def test_amend_single_turn_success(self, mocker: MockerFixture) -> None:
         """Test amending single turn data successfully."""
         mock_client = mocker.Mock()
         api_response = APIResponse(
@@ -51,7 +53,9 @@ def
test_amend_single_turn_success(self, mocker): assert turn.conversation_id == "conv_123" assert turn.contexts == ["Context 1", "Context 2"] - def test_amend_single_turn_with_conversation_id(self, mocker): + def test_amend_single_turn_with_conversation_id( + self, mocker: MockerFixture + ) -> None: """Test amending turn with existing conversation ID.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -82,7 +86,7 @@ def test_amend_single_turn_with_conversation_id(self, mocker): assert turn.conversation_id == "conv_123" assert turn.contexts == ["Context 3"] - def test_amend_single_turn_with_tool_calls(self, mocker): + def test_amend_single_turn_with_tool_calls(self, mocker: MockerFixture) -> None: """Test amending turn data with tool calls.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -107,7 +111,7 @@ def test_amend_single_turn_with_tool_calls(self, mocker): assert turn.response == "Tool response" assert turn.tool_calls == [[{"tool": "test_tool", "args": {"param": "value"}}]] - def test_amend_single_turn_with_attachments(self, mocker): + def test_amend_single_turn_with_attachments(self, mocker: MockerFixture) -> None: """Test amending turn data with attachments.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -144,7 +148,7 @@ def test_amend_single_turn_with_attachments(self, mocker): assert turn.response == "Attachment response" assert turn.contexts == ["Attachment context"] - def test_amend_single_turn_api_error(self, mocker): + def test_amend_single_turn_api_error(self, mocker: MockerFixture) -> None: """Test handling API error during turn amendment.""" mock_client = mocker.Mock() mock_client.query.side_effect = APIError("Connection failed") @@ -163,7 +167,9 @@ def test_amend_single_turn_api_error(self, mocker): assert turn.response is None assert turn.conversation_id is None - def test_amend_single_turn_no_contexts_in_response(self, mocker): + def test_amend_single_turn_no_contexts_in_response( + self, mocker: MockerFixture + ) -> None: """Test amending turn when API response has no contexts.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -184,11 +190,14 @@ def test_amend_single_turn_no_contexts_in_response(self, mocker): assert error_msg is None assert conversation_id == "conv_no_ctx" - # Turn data should be amended (contexts should remain None since API response has empty contexts) + # Turn data should be amended (contexts should remain None since API response + # has empty contexts) assert turn.response == "No context response" assert turn.contexts is None - def test_amend_single_turn_no_tool_calls_in_response(self, mocker): + def test_amend_single_turn_no_tool_calls_in_response( + self, mocker: MockerFixture + ) -> None: """Test amending turn when API response has no tool calls.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -209,6 +218,7 @@ def test_amend_single_turn_no_tool_calls_in_response(self, mocker): assert error_msg is None assert conversation_id == "conv_no_tools" - # Turn data should be amended (tool_calls should remain None since API response has empty tool_calls) + # Turn data should be amended (tool_calls should remain None since API response + # has empty tool_calls) assert turn.response == "No tools response" assert turn.tool_calls is None diff --git a/tests/unit/pipeline/evaluation/test_errors.py b/tests/unit/pipeline/evaluation/test_errors.py index 011c21b3..b8477c08 100644 --- a/tests/unit/pipeline/evaluation/test_errors.py +++ b/tests/unit/pipeline/evaluation/test_errors.py @@ -7,7 +7,7 @@ class 
TestEvaluationErrorHandler: """Unit tests for EvaluationErrorHandler.""" - def test_mark_all_metrics_as_error_with_turn_metrics(self): + def test_mark_all_metrics_as_error_with_turn_metrics(self) -> None: """Test marking all metrics as error with turn metrics.""" handler = EvaluationErrorHandler() @@ -21,7 +21,7 @@ def test_mark_all_metrics_as_error_with_turn_metrics(self): ["ragas:faithfulness", "custom:answer_correctness"], ["ragas:response_relevancy"], ] - resolved_conversation_metrics = [] + resolved_conversation_metrics: list = [] results = handler.mark_all_metrics_as_error( conv_data, @@ -51,14 +51,14 @@ def test_mark_all_metrics_as_error_with_turn_metrics(self): assert results[2].metric_identifier == "ragas:response_relevancy" assert results[2].query == "Query 2" - def test_mark_all_metrics_as_error_with_conversation_metrics(self): + def test_mark_all_metrics_as_error_with_conversation_metrics(self) -> None: """Test marking conversation-level metrics as error.""" handler = EvaluationErrorHandler() turn = TurnData(turn_id="1", query="Query", response="Response") conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - resolved_turn_metrics = [[]] + resolved_turn_metrics: list[list[str]] = [[]] resolved_conversation_metrics = [ "deepeval:conversation_completeness", "deepeval:conversation_relevancy", @@ -83,7 +83,7 @@ def test_mark_all_metrics_as_error_with_conversation_metrics(self): assert results[1].turn_id is None assert results[1].metric_identifier == "deepeval:conversation_relevancy" - def test_mark_all_metrics_as_error_mixed(self): + def test_mark_all_metrics_as_error_mixed(self) -> None: """Test marking both turn and conversation metrics as error.""" handler = EvaluationErrorHandler() @@ -111,15 +111,15 @@ def test_mark_all_metrics_as_error_mixed(self): assert results[1].turn_id is None assert results[1].metric_identifier == "deepeval:conversation_completeness" - def test_mark_all_metrics_as_error_empty_metrics(self): + def test_mark_all_metrics_as_error_empty_metrics(self) -> None: """Test marking with no metrics to mark.""" handler = EvaluationErrorHandler() turn = TurnData(turn_id="1", query="Query", response="Response") conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - resolved_turn_metrics = [[]] - resolved_conversation_metrics = [] + resolved_turn_metrics: list[list[str]] = [[]] + resolved_conversation_metrics: list[str] = [] results = handler.mark_all_metrics_as_error( conv_data, "Error", resolved_turn_metrics, resolved_conversation_metrics @@ -128,7 +128,7 @@ def test_mark_all_metrics_as_error_empty_metrics(self): # Should have no results assert len(results) == 0 - def test_mark_turn_metrics_as_error(self): + def test_mark_turn_metrics_as_error(self) -> None: """Test marking metrics for a single turn as error.""" handler = EvaluationErrorHandler() @@ -166,7 +166,7 @@ def test_mark_turn_metrics_as_error(self): assert results[1].result == "ERROR" assert results[1].reason == error_reason - def test_mark_cascade_error(self): + def test_mark_cascade_error(self) -> None: """Test marking remaining turns and conversation metrics as error after API failure.""" handler = EvaluationErrorHandler() diff --git a/tests/unit/pipeline/evaluation/test_evaluator.py b/tests/unit/pipeline/evaluation/test_evaluator.py index 6e24ce88..01d44109 100644 --- a/tests/unit/pipeline/evaluation/test_evaluator.py +++ b/tests/unit/pipeline/evaluation/test_evaluator.py @@ -1,74 +1,31 @@ """Unit tests for pipeline evaluation evaluator module.""" import 
pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.core.llm.custom import TokenTracker from lightspeed_evaluation.core.models import ( EvaluationData, EvaluationRequest, EvaluationScope, - SystemConfig, TurnData, ) from lightspeed_evaluation.core.system.loader import ConfigLoader +from lightspeed_evaluation.core.metrics.manager import MetricManager +from lightspeed_evaluation.core.script import ScriptExecutionManager from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator -@pytest.fixture -def config_loader(mocker): - """Create a mock config loader with system config.""" - loader = mocker.Mock(spec=ConfigLoader) - - config = SystemConfig() - config.default_turn_metrics_metadata = { - "ragas:faithfulness": {"threshold": 0.7, "default": True}, - "custom:answer_correctness": {"threshold": 0.8, "default": False}, - } - config.default_conversation_metrics_metadata = { - "deepeval:conversation_completeness": {"threshold": 0.6, "default": True}, - } - config.api.enabled = True - - loader.system_config = config - return loader - - -@pytest.fixture -def mock_metric_manager(mocker): - """Create a mock metric manager.""" - from lightspeed_evaluation.core.metrics.manager import MetricManager - - manager = mocker.Mock(spec=MetricManager) - - def get_threshold(metric_id, level, conv_data=None, turn_data=None): - thresholds = { - "ragas:faithfulness": 0.7, - "custom:answer_correctness": 0.8, - "deepeval:conversation_completeness": 0.6, - } - return thresholds.get(metric_id, 0.5) - - manager.get_effective_threshold.side_effect = get_threshold - # Mock get_metric_metadata to return None (no metadata) to support iteration in _extract_metadata_for_csv - manager.get_metric_metadata.return_value = None - return manager - - -@pytest.fixture -def mock_script_manager(mocker): - """Create a mock script execution manager.""" - from lightspeed_evaluation.core.script import ScriptExecutionManager - - manager = mocker.Mock(spec=ScriptExecutionManager) - return manager - - class TestMetricsEvaluator: """Unit tests for MetricsEvaluator.""" def test_initialization( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluator initialization.""" # Mock the metric handlers mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") @@ -98,8 +55,10 @@ def test_initialization( ) # ragas, deepeval, geval, custom, script, nlp def test_initialization_raises_error_without_config( - self, mock_metric_manager, mock_script_manager - ): + self, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + ) -> None: """Test initialization fails without system config.""" loader = ConfigLoader() loader.system_config = None @@ -108,8 +67,12 @@ def test_initialization_raises_error_without_config( MetricsEvaluator(loader, mock_metric_manager, mock_script_manager) def test_evaluate_metric_turn_level_pass( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluating turn-level metric that passes.""" # Mock the handlers mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") @@ -165,8 +128,12 @@ def test_evaluate_metric_turn_level_pass( assert 
result.contexts == '["Context"]' def test_evaluate_metric_turn_level_fail( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluating turn-level metric that fails.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -210,8 +177,12 @@ def test_evaluate_metric_turn_level_fail( assert result.threshold == 0.7 def test_evaluate_metric_conversation_level( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluating conversation-level metric.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -250,8 +221,12 @@ def test_evaluate_metric_conversation_level( assert result.turn_id is None # Conversation-level def test_evaluate_metric_unsupported_framework( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluating metric with unsupported framework.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -285,8 +260,12 @@ def test_evaluate_metric_unsupported_framework( assert "Unsupported framework" in result.reason def test_evaluate_metric_returns_none_score( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test handling when metric evaluation returns None score.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -328,8 +307,12 @@ def test_evaluate_metric_returns_none_score( assert result.reason == "Evaluation failed" def test_evaluate_metric_exception_handling( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test exception handling during metric evaluation.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -376,9 +359,14 @@ def test_evaluate_metric_exception_handling( assert result.expected_response is None def test_evaluate_metric_skip_script_when_api_disabled( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test script metrics are skipped when API is disabled.""" + assert config_loader.system_config is not None config_loader.system_config.api.enabled = False mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") @@ -413,8 +401,12 @@ def test_evaluate_metric_skip_script_when_api_disabled( assert result is None def test_determine_status_with_threshold( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: 
MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test _determine_status method.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -436,15 +428,28 @@ def test_determine_status_with_threshold( ) # Test PASS - assert evaluator._determine_status(0.8, 0.7) == "PASS" - assert evaluator._determine_status(0.7, 0.7) == "PASS" # Equal passes + assert ( + evaluator._determine_status(0.8, 0.7) # pylint: disable=protected-access + == "PASS" + ) + assert ( + evaluator._determine_status(0.7, 0.7) # pylint: disable=protected-access + == "PASS" + ) # Equal passes # Test FAIL - assert evaluator._determine_status(0.6, 0.7) == "FAIL" + assert ( + evaluator._determine_status(0.6, 0.7) # pylint: disable=protected-access + == "FAIL" + ) def test_determine_status_without_threshold( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test _determine_status uses default 0.5 when threshold is None.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -466,17 +471,23 @@ def test_determine_status_without_threshold( ) # Should use 0.5 as default - assert evaluator._determine_status(0.6, None) == "PASS" - assert evaluator._determine_status(0.4, None) == "FAIL" + assert ( + evaluator._determine_status(0.6, None) # pylint: disable=protected-access + == "PASS" + ) + assert ( + evaluator._determine_status(0.4, None) # pylint: disable=protected-access + == "FAIL" + ) - def _setup_evaluate_test( + def _setup_evaluate_test( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - mock_return, - ): + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + mock_return: tuple[float, str] | list[tuple[float, str]], + ) -> tuple[MetricsEvaluator, dict]: """Helper to setup common mocks for _evaluate_wrapper() tests. 
Returns: @@ -489,7 +500,10 @@ def _setup_evaluate_test( ) # Create a helper to setup mock with return values - def create_mock_handler(mocker, mock_return): + def create_mock_handler( # type: ignore[no-untyped-def] + mocker: MockerFixture, + mock_return: tuple[float, str] | list[tuple[float, str]], + ): mock = mocker.Mock() if isinstance(mock_return, list): mock.evaluate.side_effect = mock_return @@ -544,14 +558,14 @@ def create_mock_handler(mocker, mock_return): "metric_identifier", ["ragas:context_recall", "custom:answer_correctness", "nlp:rouge"], ) - def test_evaluate_with_expected_response_list( + def test_evaluate_with_expected_response_list( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - metric_identifier, - ): + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + metric_identifier: str, + ) -> None: """Test _evaluate_wrapper() with list expected_response for metric that requires it.""" evaluator, mock_handlers = self._setup_evaluate_test( config_loader, @@ -572,7 +586,9 @@ def test_evaluate_with_expected_response_list( request = EvaluationRequest.for_turn(conv_data, metric_identifier, 0, turn_data) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) + metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access + request, scope, 0.7 + ) assert metric_result.score == 0.85 assert metric_result.reason == "High score" @@ -583,8 +599,12 @@ def test_evaluate_with_expected_response_list( assert mock_handlers[framework].evaluate.call_count == 2 def test_evaluate_with_expected_response_list_fail( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test _evaluate_wrapper() with list expected_response for metric that requires it.""" scores_reasons = [(0.3, "Score 1"), (0.65, "Score 2"), (0.45, "Score 3")] evaluator, mock_handlers = self._setup_evaluate_test( @@ -608,7 +628,9 @@ def test_evaluate_with_expected_response_list_fail( ) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) + metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access + request, scope, 0.7 + ) reason_combined = "\n".join( [f"{score}; {reason}" for score, reason in scores_reasons] ) @@ -619,8 +641,12 @@ def test_evaluate_with_expected_response_list_fail( assert mock_handlers["ragas"].evaluate.call_count == 3 def test_evaluate_with_expected_response_string( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test _evaluate_wrapper() with string expected_response.""" evaluator, mock_handlers = self._setup_evaluate_test( config_loader, @@ -639,7 +665,9 @@ def test_evaluate_with_expected_response_string( ) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) + metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access + 
request, scope, 0.7 + ) assert metric_result.score == 0.85 assert metric_result.reason == "Good score" @@ -654,15 +682,15 @@ def test_evaluate_with_expected_response_string( [None, "string", ["string1", "string2"]], ids=["none", "string", "string_list"], ) - def test_evaluate_with_expected_response_not_needed( + def test_evaluate_with_expected_response_not_needed( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - metric_identifier, - expected_response, - ): + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + metric_identifier: str, + expected_response: str | list[str] | None, + ) -> None: """Test _evaluate_wrapper() with metric that does not require expected_response.""" evaluator, mock_handlers = self._setup_evaluate_test( config_loader, @@ -683,7 +711,9 @@ def test_evaluate_with_expected_response_not_needed( request = EvaluationRequest.for_turn(conv_data, metric_identifier, 0, turn_data) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) + metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access + request, scope, 0.7 + ) assert metric_result.score == 0.3 assert metric_result.reason == "Low score" @@ -697,21 +727,21 @@ def test_evaluate_with_expected_response_not_needed( class TestTokenTracker: """Unit tests for TokenTracker class.""" - def test_token_tracker_initialization(self): + def test_token_tracker_initialization(self) -> None: """Test TokenTracker initializes with zero counts.""" tracker = TokenTracker() input_tokens, output_tokens = tracker.get_counts() assert input_tokens == 0 assert output_tokens == 0 - def test_token_tracker_get_counts_returns_tuple(self): + def test_token_tracker_get_counts_returns_tuple(self) -> None: """Test get_counts returns a tuple.""" tracker = TokenTracker() result = tracker.get_counts() assert isinstance(result, tuple) assert len(result) == 2 - def test_token_tracker_reset(self): + def test_token_tracker_reset(self) -> None: """Test reset clears token counts.""" tracker = TokenTracker() tracker.input_tokens = 100 @@ -719,31 +749,31 @@ def test_token_tracker_reset(self): tracker.reset() assert tracker.get_counts() == (0, 0) - def test_token_tracker_start_stop(self): + def test_token_tracker_start_stop(self) -> None: """Test start and stop methods.""" tracker = TokenTracker() tracker.start() - assert tracker._callback_registered is True + assert tracker._callback_registered is True # pylint: disable=protected-access tracker.stop() - assert tracker._callback_registered is False + assert tracker._callback_registered is False # pylint: disable=protected-access - def test_token_tracker_double_start(self): + def test_token_tracker_double_start(self) -> None: """Test calling start twice doesn't register callback twice.""" tracker = TokenTracker() tracker.start() tracker.start() # Should not fail - assert tracker._callback_registered is True + assert tracker._callback_registered is True # pylint: disable=protected-access tracker.stop() - def test_token_tracker_double_stop(self): + def test_token_tracker_double_stop(self) -> None: """Test calling stop twice doesn't fail.""" tracker = TokenTracker() tracker.start() tracker.stop() tracker.stop() # Should not fail - assert tracker._callback_registered is False + assert tracker._callback_registered is False # 
pylint: disable=protected-access - def test_token_tracker_independent_instances(self): + def test_token_tracker_independent_instances(self) -> None: """Test multiple TokenTracker instances are independent.""" tracker1 = TokenTracker() tracker2 = TokenTracker() diff --git a/tests/unit/pipeline/evaluation/test_pipeline.py b/tests/unit/pipeline/evaluation/test_pipeline.py index a922a87e..aeaba1fc 100644 --- a/tests/unit/pipeline/evaluation/test_pipeline.py +++ b/tests/unit/pipeline/evaluation/test_pipeline.py @@ -1,53 +1,22 @@ """Unit tests for EvaluationPipeline.""" import pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import ( EvaluationData, EvaluationResult, - SystemConfig, - TurnData, ) from lightspeed_evaluation.core.system.loader import ConfigLoader from lightspeed_evaluation.pipeline.evaluation.pipeline import EvaluationPipeline -@pytest.fixture -def mock_config_loader(mocker): - """Create a mock config loader with system config.""" - loader = mocker.Mock(spec=ConfigLoader) - - config = SystemConfig() - config.api.enabled = False - config.output.output_dir = "/tmp/test_output" - config.output.base_filename = "test" - config.core.max_threads = 2 - - loader.system_config = config - return loader - - -@pytest.fixture -def sample_evaluation_data(): - """Create sample evaluation data.""" - turn1 = TurnData( - turn_id="turn1", - query="What is Python?", - response="Python is a programming language.", - contexts=["Python context"], - turn_metrics=["ragas:faithfulness"], - ) - conv_data = EvaluationData( - conversation_group_id="conv1", - turns=[turn1], - ) - return [conv_data] - - class TestEvaluationPipeline: """Unit tests for EvaluationPipeline.""" - def test_initialization_success(self, mock_config_loader, mocker): + def test_initialization_success( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test successful pipeline initialization.""" # Mock components mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -74,7 +43,7 @@ def test_initialization_success(self, mock_config_loader, mocker): assert pipeline.system_config is not None assert pipeline.output_dir == "/tmp/test_output" - def test_initialization_without_config(self, mocker): + def test_initialization_without_config(self, mocker: MockerFixture) -> None: """Test initialization fails without system config.""" loader = mocker.Mock(spec=ConfigLoader) loader.system_config = None @@ -82,8 +51,11 @@ def test_initialization_without_config(self, mocker): with pytest.raises(ValueError, match="SystemConfig must be loaded"): EvaluationPipeline(loader) - def test_create_api_client_when_enabled(self, mock_config_loader, mocker): + def test_create_api_client_when_enabled( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test API client creation when enabled.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True mock_config_loader.system_config.api.api_base = "http://test.com" mock_config_loader.system_config.api.endpoint_type = "test" @@ -113,8 +85,11 @@ def test_create_api_client_when_enabled(self, mock_config_loader, mocker): assert pipeline.api_client is not None mock_api_client.assert_called_once() - def test_create_api_client_when_disabled(self, mock_config_loader, mocker): + def test_create_api_client_when_disabled( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test no API client when disabled.""" + assert 
mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = False mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -139,8 +114,11 @@ def test_create_api_client_when_disabled(self, mock_config_loader, mocker): assert pipeline.api_client is None def test_run_evaluation_success( - self, mock_config_loader, sample_evaluation_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + sample_evaluation_data: list[EvaluationData], + mocker: MockerFixture, + ) -> None: """Test successful evaluation run.""" # Mock all components mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -182,9 +160,13 @@ def test_run_evaluation_success( assert results[0].result == "PASS" def test_run_evaluation_saves_amended_data_when_api_enabled( - self, mock_config_loader, sample_evaluation_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + sample_evaluation_data: list[EvaluationData], + mocker: MockerFixture, + ) -> None: """Test amended data is saved when API is enabled.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -220,9 +202,13 @@ def test_run_evaluation_saves_amended_data_when_api_enabled( mock_save.assert_called_once() def test_save_amended_data_handles_exception( - self, mock_config_loader, sample_evaluation_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + sample_evaluation_data: list[EvaluationData], + mocker: MockerFixture, + ) -> None: """Test save amended data handles exceptions gracefully.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -259,8 +245,11 @@ def test_save_amended_data_handles_exception( assert results is not None - def test_close_with_api_client(self, mock_config_loader, mocker): + def test_close_with_api_client( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test close method with API client.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True mock_config_loader.system_config.api.api_base = "http://test.com" mock_config_loader.system_config.api.endpoint_type = "test" @@ -302,8 +291,11 @@ def test_close_with_api_client(self, mock_config_loader, mocker): mock_api_client.close.assert_called_once() - def test_close_without_api_client(self, mock_config_loader, mocker): + def test_close_without_api_client( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test close method without API client.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = False mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -332,7 +324,9 @@ def test_close_without_api_client(self, mock_config_loader, mocker): # Should not raise any errors pipeline.close() - def test_output_dir_override(self, mock_config_loader, mocker): + def test_output_dir_override( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test output directory can be overridden.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") mocker.patch( diff --git a/tests/unit/pipeline/evaluation/test_processor.py b/tests/unit/pipeline/evaluation/test_processor.py index c3f0d033..16b3d4f5 100644 
--- a/tests/unit/pipeline/evaluation/test_processor.py +++ b/tests/unit/pipeline/evaluation/test_processor.py @@ -1,8 +1,13 @@ """Unit tests for ConversationProcessor.""" +from typing import Callable +import logging + import pytest +from _pytest.logging import LogCaptureFixture +from pytest_mock import MockerFixture -from lightspeed_evaluation.core.metrics.manager import MetricManager +from lightspeed_evaluation.core.metrics.manager import MetricLevel from lightspeed_evaluation.core.models import ( EvaluationData, EvaluationRequest, @@ -10,13 +15,8 @@ SystemConfig, TurnData, ) -from lightspeed_evaluation.core.script import ( - ScriptExecutionError, - ScriptExecutionManager, -) +from lightspeed_evaluation.core.script import ScriptExecutionError from lightspeed_evaluation.core.system.loader import ConfigLoader -from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender -from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator from lightspeed_evaluation.pipeline.evaluation.processor import ( ConversationProcessor, @@ -24,57 +24,14 @@ ) -@pytest.fixture -def mock_config_loader(mocker): - """Create a mock config loader.""" - loader = mocker.Mock(spec=ConfigLoader) - config = SystemConfig() - config.api.enabled = False - loader.system_config = config - return loader - - -@pytest.fixture -def processor_components(mocker): - """Create processor components.""" - metrics_evaluator = mocker.Mock(spec=MetricsEvaluator) - api_amender = mocker.Mock(spec=APIDataAmender) - error_handler = mocker.Mock(spec=EvaluationErrorHandler) - metric_manager = mocker.Mock(spec=MetricManager) - script_manager = mocker.Mock(spec=ScriptExecutionManager) - - # Default behavior for metric resolution - metric_manager.resolve_metrics.return_value = ["ragas:faithfulness"] - - return ProcessorComponents( - metrics_evaluator=metrics_evaluator, - api_amender=api_amender, - error_handler=error_handler, - metric_manager=metric_manager, - script_manager=script_manager, - ) - - -@pytest.fixture -def sample_conv_data(): - """Create sample conversation data.""" - turn1 = TurnData( - turn_id="turn1", - query="What is Python?", - response="Python is a programming language.", - contexts=["Context"], - turn_metrics=["ragas:faithfulness"], - ) - return EvaluationData( - conversation_group_id="conv1", - turns=[turn1], - ) - - class TestConversationProcessor: """Unit tests for ConversationProcessor.""" - def test_initialization(self, mock_config_loader, processor_components): + def test_initialization( + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + ) -> None: """Test processor initialization.""" processor = ConversationProcessor(mock_config_loader, processor_components) @@ -83,8 +40,12 @@ def test_initialization(self, mock_config_loader, processor_components): assert processor.components == processor_components def test_process_conversation_skips_when_no_metrics( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing skips when no metrics specified.""" # Mock metric manager to return empty lists processor_components.metric_manager.resolve_metrics.return_value = [] @@ -95,15 +56,16 @@ def test_process_conversation_skips_when_no_metrics( 
assert len(results) == 0 def test_process_conversation_turn_metrics( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing with turn-level metrics.""" - from lightspeed_evaluation.core.models import EvaluationResult # Configure metric manager to return turn metrics and empty conversation metrics - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return ["ragas:faithfulness"] return [] @@ -134,10 +96,12 @@ def resolve_side_effect(metrics, level): assert all(r.result == "PASS" for r in results) def test_process_conversation_conversation_metrics( - self, mock_config_loader, processor_components, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing with conversation-level metrics.""" - from lightspeed_evaluation.core.models import EvaluationResult turn1 = TurnData(turn_id="turn1", query="Q", response="R") conv_data = EvaluationData( @@ -147,9 +111,7 @@ def test_process_conversation_conversation_metrics( ) # Mock metric resolution - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return [] return ["deepeval:conversation_completeness"] @@ -178,18 +140,20 @@ def resolve_side_effect(metrics, level): assert results[0].turn_id is None # Conversation-level def test_process_conversation_with_setup_script_success( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing with successful setup script.""" - from lightspeed_evaluation.core.models import EvaluationResult sample_conv_data.setup_script = "setup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True # Configure metric manager to return turn metrics and empty conversation metrics - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return ["ragas:faithfulness"] return [] @@ -225,10 +189,15 @@ def resolve_side_effect(metrics, level): assert len(results) > 0 def test_process_conversation_with_setup_script_failure( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing handles setup script failure.""" sample_conv_data.setup_script = "setup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True processor_components.script_manager.run_script.side_effect = ( @@ -242,18 +211,20 @@ def 
test_process_conversation_with_setup_script_failure( processor_components.error_handler.mark_all_metrics_as_error.assert_called_once() def test_process_conversation_with_cleanup_script( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test cleanup script is always called.""" - from lightspeed_evaluation.core.models import EvaluationResult sample_conv_data.cleanup_script = "cleanup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True # Configure metric manager to return turn metrics and empty conversation metrics - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return ["ragas:faithfulness"] return [] @@ -290,17 +261,19 @@ def resolve_side_effect(metrics, level): assert any("cleanup.sh" in str(call) for call in calls) def test_process_conversation_with_api_amendment( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test API amendment during turn processing.""" - from lightspeed_evaluation.core.models import EvaluationResult + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True # Configure metric manager to return turn metrics and empty conversation metrics - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return ["ragas:faithfulness"] return [] @@ -335,9 +308,13 @@ def resolve_side_effect(metrics, level): assert len(results) > 0 def test_process_conversation_with_api_error_cascade( - self, mock_config_loader, processor_components, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test API error causes cascade failure.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True # Create multi-turn conversation @@ -373,10 +350,13 @@ def test_process_conversation_with_api_error_cascade( processor_components.error_handler.mark_cascade_error.assert_called_once() def test_evaluate_turn( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test _evaluate_turn method.""" - from lightspeed_evaluation.core.models import EvaluationResult mock_result = EvaluationResult( conversation_group_id="conv1", @@ -392,7 +372,7 @@ def test_evaluate_turn( ) processor = ConversationProcessor(mock_config_loader, processor_components) - results = processor._evaluate_turn( + results = processor._evaluate_turn( # pylint: disable=protected-access sample_conv_data, 0, sample_conv_data.turns[0], ["ragas:faithfulness"] ) @@ 
-400,10 +380,13 @@ def test_evaluate_turn( assert results[0].result == "PASS" def test_evaluate_conversation( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test _evaluate_conversation method.""" - from lightspeed_evaluation.core.models import EvaluationResult mock_result = EvaluationResult( conversation_group_id="conv1", @@ -419,7 +402,7 @@ def test_evaluate_conversation( ) processor = ConversationProcessor(mock_config_loader, processor_components) - results = processor._evaluate_conversation( + results = processor._evaluate_conversation( # pylint: disable=protected-access sample_conv_data, ["deepeval:conversation_completeness"] ) @@ -427,46 +410,67 @@ def test_evaluate_conversation( assert results[0].turn_id is None def test_run_setup_script_skips_when_api_disabled( - self, mock_config_loader, processor_components, sample_conv_data - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + ) -> None: """Test setup script is skipped when API disabled.""" sample_conv_data.setup_script = "setup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = False processor = ConversationProcessor(mock_config_loader, processor_components) - error = processor._run_setup_script(sample_conv_data) + error = processor._run_setup_script( # pylint: disable=protected-access + sample_conv_data + ) assert error is None processor_components.script_manager.run_script.assert_not_called() def test_run_cleanup_script_skips_when_api_disabled( - self, mock_config_loader, processor_components, sample_conv_data - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + ) -> None: """Test cleanup script is skipped when API disabled.""" sample_conv_data.cleanup_script = "cleanup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = False processor = ConversationProcessor(mock_config_loader, processor_components) - processor._run_cleanup_script(sample_conv_data) + processor._run_cleanup_script( # pylint: disable=protected-access + sample_conv_data + ) processor_components.script_manager.run_script.assert_not_called() def test_run_cleanup_script_logs_warning_on_failure( - self, mock_config_loader, processor_components, sample_conv_data - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + ) -> None: """Test cleanup script failure is logged as warning.""" sample_conv_data.cleanup_script = "cleanup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True processor_components.script_manager.run_script.return_value = False processor = ConversationProcessor(mock_config_loader, processor_components) # Should not raise, just log warning - processor._run_cleanup_script(sample_conv_data) + processor._run_cleanup_script( # pylint: disable=protected-access + sample_conv_data + ) def test_get_metrics_summary( - self, mock_config_loader, processor_components, sample_conv_data - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + ) -> None: """Test 
get_metrics_summary method.""" processor_components.metric_manager.count_metrics_for_conversation.return_value = { "turn_metrics": 2, @@ -480,118 +484,12 @@ def test_get_metrics_summary( assert summary["conversation_metrics"] == 1 -# Fixtures for TestConversationProcessorEvaluateTurn -@pytest.fixture -def config_loader(mocker): - """Create a mock config loader with system config.""" - loader = mocker.Mock(spec=ConfigLoader) - - config = SystemConfig() - config.default_turn_metrics_metadata = { - "ragas:faithfulness": {"threshold": 0.7, "default": True}, - "custom:answer_correctness": {"threshold": 0.8, "default": False}, - } - config.default_conversation_metrics_metadata = { - "deepeval:conversation_completeness": {"threshold": 0.6, "default": True}, - } - config.api.enabled = False - - loader.system_config = config - return loader - - -@pytest.fixture -def mock_metrics_evaluator(mocker): - """Create a mock metrics evaluator.""" - evaluator = mocker.Mock(spec=MetricsEvaluator) - - def evaluate_metric(request): - """Mock evaluate_metric that returns a result based on metric.""" - return EvaluationResult( - conversation_group_id=request.conv_data.conversation_group_id, - turn_id=request.turn_id, - metric_identifier=request.metric_identifier, - result="PASS", - score=0.85, - reason="Test evaluation", - threshold=0.7, - ) - - evaluator.evaluate_metric.side_effect = evaluate_metric - return evaluator - - -@pytest.fixture -def mock_api_amender(mocker): - """Create a mock API data amender.""" - amender = mocker.Mock(spec=APIDataAmender) - return amender - - -@pytest.fixture -def mock_error_handler(mocker): - """Create a mock error handler.""" - handler = mocker.Mock(spec=EvaluationErrorHandler) - - # Configure create_error_result to return a proper EvaluationResult - def create_error_result_side_effect( - conv_id, metric_id, reason, *, turn_id=None, query="" - ): - return EvaluationResult( - conversation_group_id=conv_id, - turn_id=turn_id, - metric_identifier=metric_id, - result="ERROR", - reason=reason, - query=query, - ) - - handler.create_error_result.side_effect = create_error_result_side_effect - return handler - - -@pytest.fixture -def mock_metric_manager(mocker): - """Create a mock metric manager.""" - manager = mocker.Mock(spec=MetricManager) - return manager - - -@pytest.fixture -def mock_script_manager(mocker): - """Create a mock script execution manager.""" - manager = mocker.Mock(spec=ScriptExecutionManager) - return manager - - -@pytest.fixture -def processor_components_pr( - mock_metrics_evaluator, - mock_api_amender, - mock_error_handler, - mock_metric_manager, - mock_script_manager, -): - """Create processor components fixture for PR tests.""" - return ProcessorComponents( - metrics_evaluator=mock_metrics_evaluator, - api_amender=mock_api_amender, - error_handler=mock_error_handler, - metric_manager=mock_metric_manager, - script_manager=mock_script_manager, - ) - - -@pytest.fixture -def processor(config_loader, processor_components_pr): - """Create ConversationProcessor instance for PR tests.""" - return ConversationProcessor(config_loader, processor_components_pr) - - class TestConversationProcessorEvaluateTurn: """Unit tests for ConversationProcessor._evaluate_turn method.""" - def test_evaluate_turn_with_valid_metrics(self, processor, mock_metrics_evaluator): + def test_evaluate_turn_with_valid_metrics( + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn with all valid metrics.""" turn_data = TurnData( 
turn_id="1", @@ -603,7 +501,9 @@ def test_evaluate_turn_with_valid_metrics(self, processor, mock_metrics_evaluato turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"] - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should evaluate both metrics assert len(results) == 2 @@ -618,10 +518,12 @@ def test_evaluate_turn_with_valid_metrics(self, processor, mock_metrics_evaluato assert calls[1][0][0].metric_identifier == "custom:answer_correctness" def test_evaluate_turn_with_invalid_metric( - self, processor, mock_metrics_evaluator, caplog - ): + self, + processor: ConversationProcessor, + mock_metrics_evaluator: MetricsEvaluator, + caplog: LogCaptureFixture, + ) -> None: """Test _evaluate_turn with an invalid metric - creates ERROR result and logs error.""" - import logging turn_data = TurnData( turn_id="1", @@ -637,7 +539,9 @@ def test_evaluate_turn_with_invalid_metric( turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"] with caplog.at_level(logging.ERROR): - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should get 2 results: 1 ERROR for invalid metric, 1 PASS for valid metric assert len(results) == 2 @@ -658,10 +562,12 @@ def test_evaluate_turn_with_invalid_metric( assert "check Validation Errors" in caplog.text def test_evaluate_turn_with_all_invalid_metrics( - self, processor, mock_metrics_evaluator, caplog - ): + self, + processor: ConversationProcessor, + mock_metrics_evaluator: MetricsEvaluator, + caplog: LogCaptureFixture, + ) -> None: """Test _evaluate_turn with all metrics invalid - returns ERROR results.""" - import logging turn_data = TurnData( turn_id="1", @@ -678,7 +584,9 @@ def test_evaluate_turn_with_all_invalid_metrics( turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"] with caplog.at_level(logging.ERROR): - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should return ERROR results for both invalid metrics assert len(results) == 2 @@ -694,10 +602,12 @@ def test_evaluate_turn_with_all_invalid_metrics( assert "Invalid turn metric 'custom:answer_correctness'" in caplog.text def test_evaluate_turn_with_mixed_valid_invalid_metrics( - self, processor, mock_metrics_evaluator, caplog - ): + self, + processor: ConversationProcessor, + mock_metrics_evaluator: MetricsEvaluator, + caplog: LogCaptureFixture, + ) -> None: """Test _evaluate_turn with mix of valid and invalid metrics.""" - import logging turn_data = TurnData( turn_id="1", @@ -717,7 +627,9 @@ def test_evaluate_turn_with_mixed_valid_invalid_metrics( ] with caplog.at_level(logging.ERROR): - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should get 3 results: 2 valid metrics (PASS) and 1 invalid metric (ERROR) assert len(results) == 3 @@ -734,7 +646,9 @@ def test_evaluate_turn_with_mixed_valid_invalid_metrics( # Verify error was logged for invalid metric assert "Invalid turn metric 'custom:answer_correctness'" in caplog.text - def test_evaluate_turn_with_empty_metrics(self, processor, mock_metrics_evaluator): + def 
test_evaluate_turn_with_empty_metrics( + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn with empty metrics list.""" turn_data = TurnData( turn_id="1", @@ -743,9 +657,11 @@ def test_evaluate_turn_with_empty_metrics(self, processor, mock_metrics_evaluato ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) - turn_metrics = [] + turn_metrics: list[str] = [] - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should return empty results assert len(results) == 0 @@ -754,8 +670,8 @@ def test_evaluate_turn_with_empty_metrics(self, processor, mock_metrics_evaluato assert mock_metrics_evaluator.evaluate_metric.call_count == 0 def test_evaluate_turn_creates_correct_request( - self, processor, mock_metrics_evaluator - ): + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn creates correct EvaluationRequest.""" turn_data = TurnData( turn_id="turn_123", @@ -767,7 +683,9 @@ def test_evaluate_turn_creates_correct_request( turn_metrics = ["ragas:faithfulness"] - processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Verify the request structure assert mock_metrics_evaluator.evaluate_metric.call_count == 1 @@ -780,8 +698,8 @@ def test_evaluate_turn_creates_correct_request( assert call_args.turn_idx == 0 def test_evaluate_turn_handles_evaluator_returning_none( - self, processor, mock_metrics_evaluator - ): + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn handles when evaluator returns None.""" turn_data = TurnData( turn_id="1", @@ -796,7 +714,9 @@ def test_evaluate_turn_handles_evaluator_returning_none( turn_metrics = ["ragas:faithfulness"] - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should return empty results when evaluator returns None assert len(results) == 0 @@ -805,8 +725,8 @@ def test_evaluate_turn_handles_evaluator_returning_none( assert mock_metrics_evaluator.evaluate_metric.call_count == 1 def test_evaluate_turn_multiple_turns_correct_index( - self, processor, mock_metrics_evaluator - ): + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn uses correct turn index.""" turn_data_1 = TurnData(turn_id="1", query="Q1", response="R1") turn_data_2 = TurnData(turn_id="2", query="Q2", response="R2") @@ -820,7 +740,9 @@ def test_evaluate_turn_multiple_turns_correct_index( turn_metrics = ["ragas:faithfulness"] # Evaluate second turn (index 1) - processor._evaluate_turn(conv_data, 1, turn_data_2, turn_metrics) + processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 1, turn_data_2, turn_metrics + ) # Verify correct turn index call_args = mock_metrics_evaluator.evaluate_metric.call_args[0][0] @@ -828,8 +750,8 @@ def test_evaluate_turn_multiple_turns_correct_index( assert call_args.turn_id == "2" def test_evaluate_turn_preserves_metric_order( - self, processor, mock_metrics_evaluator - ): + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test 
_evaluate_turn evaluates metrics in the order provided.""" turn_data = TurnData( turn_id="1", @@ -844,7 +766,9 @@ def test_evaluate_turn_preserves_metric_order( "ragas:context_recall", ] - processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Verify metrics were evaluated in order assert mock_metrics_evaluator.evaluate_metric.call_count == 3 @@ -854,7 +778,7 @@ def test_evaluate_turn_preserves_metric_order( assert calls[1][0][0].metric_identifier == "ragas:faithfulness" assert calls[2][0][0].metric_identifier == "ragas:context_recall" - def test_is_metric_invalid_functionality(self): + def test_is_metric_invalid_functionality(self) -> None: """Test TurnData.is_metric_invalid and add_invalid_metric methods.""" turn_data = TurnData(turn_id="1", query="Q", response="R") @@ -886,7 +810,7 @@ class TestSkipOnFailure: """Unit tests for skip_on_failure feature.""" @pytest.fixture - def multi_turn_conv_data(self): + def multi_turn_conv_data(self) -> EvaluationData: """Create conversation data with multiple turns.""" turns = [ TurnData( @@ -904,10 +828,12 @@ def multi_turn_conv_data(self): ) @pytest.fixture - def config_loader_factory(self, mocker): + def config_loader_factory( + self, mocker: MockerFixture + ) -> Callable[[bool], ConfigLoader]: """Factory to create config loader with configurable skip_on_failure.""" - def _create(skip_on_failure: bool): + def _create(skip_on_failure: bool) -> ConfigLoader: loader = mocker.Mock(spec=ConfigLoader) config = SystemConfig() config.api.enabled = False @@ -926,14 +852,14 @@ def _create(skip_on_failure: bool): (True, False, False), # System enabled, conv disables ], ) - def test_is_skip_on_failure_enabled( + def test_is_skip_on_failure_enabled( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader_factory, - processor_components, - system_skip, - conv_skip, - expected, - ): + config_loader_factory: Callable[[bool], ConfigLoader], + processor_components: ProcessorComponents, + system_skip: bool, + conv_skip: bool, + expected: bool, + ) -> None: """Test skip_on_failure resolution from system config and conversation override.""" conv_data = EvaluationData( conversation_group_id="test", @@ -943,7 +869,12 @@ def test_is_skip_on_failure_enabled( processor = ConversationProcessor( config_loader_factory(system_skip), processor_components ) - assert processor._is_skip_on_failure_enabled(conv_data) is expected + assert ( + processor._is_skip_on_failure_enabled( # pylint: disable=protected-access + conv_data + ) + is expected + ) @pytest.mark.parametrize( "results_status,expected", @@ -954,8 +885,12 @@ def test_is_skip_on_failure_enabled( ], ) def test_has_failure( - self, mock_config_loader, processor_components, results_status, expected - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + results_status: list[str], + expected: bool, + ) -> None: """Test _has_failure detection for FAIL and ERROR results.""" processor = ConversationProcessor(mock_config_loader, processor_components) results = [ @@ -964,17 +899,20 @@ def test_has_failure( ) for i, status in enumerate(results_status) ] - assert processor._has_failure(results) is expected + assert ( + processor._has_failure(results) # pylint: disable=protected-access + is expected + ) @pytest.mark.parametrize("skip_enabled,expect_skip", [(True, True), (False, False)]) - def test_skip_on_failure_behavior( + def 
test_skip_on_failure_behavior( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader_factory, - processor_components, - multi_turn_conv_data, - skip_enabled, - expect_skip, - ): + config_loader_factory: Callable[[bool], ConfigLoader], + processor_components: ProcessorComponents, + multi_turn_conv_data: EvaluationData, + skip_enabled: bool, + expect_skip: bool, + ) -> None: """Test skip_on_failure skips remaining turns when enabled, continues when disabled.""" # Configure metric manager processor_components.metric_manager.resolve_metrics.side_effect = [ diff --git a/tests/unit/runner/test_evaluation.py b/tests/unit/runner/test_evaluation.py index 056d5118..a9a18fcf 100644 --- a/tests/unit/runner/test_evaluation.py +++ b/tests/unit/runner/test_evaluation.py @@ -1,13 +1,15 @@ """Unit tests for runner/evaluation.py.""" import argparse +from typing import Any import pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.runner.evaluation import main, run_evaluation -def _make_eval_args(**kwargs) -> argparse.Namespace: +def _make_eval_args(**kwargs: Any) -> argparse.Namespace: """Helper to create eval_args namespace with defaults.""" defaults = { "system_config": "config/system.yaml", @@ -23,7 +25,11 @@ def _make_eval_args(**kwargs) -> argparse.Namespace: class TestRunEvaluation: """Unit tests for run_evaluation function.""" - def test_run_evaluation_success(self, mocker, capsys): + def test_run_evaluation_success( + self, + mocker: MockerFixture, + capsys: pytest.CaptureFixture, # pylint: disable=unused-argument + ) -> None: """Test successful evaluation run.""" # Mock ConfigLoader mock_loader = mocker.Mock() @@ -89,7 +95,11 @@ def test_run_evaluation_success(self, mocker, capsys): assert result["PASS"] == 1 mock_pipeline.close.assert_called_once() - def test_run_evaluation_with_output_dir_override(self, mocker, capsys): + def test_run_evaluation_with_output_dir_override( + self, + mocker: MockerFixture, + capsys: pytest.CaptureFixture, # pylint: disable=unused-argument + ) -> None: """Test evaluation with custom output directory.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -145,7 +155,9 @@ def test_run_evaluation_with_output_dir_override(self, mocker, capsys): call_args = mock_pipeline_class.call_args assert call_args[0][1] == "/custom/output" - def test_run_evaluation_file_not_found(self, mocker, capsys): + def test_run_evaluation_file_not_found( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test evaluation handles FileNotFoundError.""" mock_config_loader = mocker.patch( "lightspeed_evaluation.runner.evaluation.ConfigLoader" @@ -160,7 +172,9 @@ def test_run_evaluation_file_not_found(self, mocker, capsys): captured = capsys.readouterr() assert "Evaluation failed" in captured.out - def test_run_evaluation_value_error(self, mocker, capsys): + def test_run_evaluation_value_error( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test evaluation handles ValueError.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -186,7 +200,9 @@ def test_run_evaluation_value_error(self, mocker, capsys): captured = capsys.readouterr() assert "Evaluation failed" in captured.out - def test_run_evaluation_with_errors_in_results(self, mocker, capsys): + def test_run_evaluation_with_errors_in_results( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test evaluation reports errors in results.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() 
@@ -237,11 +253,16 @@ def test_run_evaluation_with_errors_in_results(self, mocker, capsys): result = run_evaluation(_make_eval_args()) + assert result is not None assert result["ERROR"] == 3 captured = capsys.readouterr() assert "3 evaluations had errors" in captured.out - def test_run_evaluation_closes_pipeline_on_exception(self, mocker, capsys): + def test_run_evaluation_closes_pipeline_on_exception( + self, + mocker: MockerFixture, + capsys: pytest.CaptureFixture, # pylint: disable=unused-argument + ) -> None: """Test pipeline is closed even if evaluation fails.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -275,7 +296,9 @@ def test_run_evaluation_closes_pipeline_on_exception(self, mocker, capsys): mock_pipeline.close.assert_called_once() assert result is None - def test_run_evaluation_with_empty_filter_result(self, mocker, capsys): + def test_run_evaluation_with_empty_filter_result( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test evaluation returns empty result when filter matches nothing.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -295,6 +318,7 @@ def test_run_evaluation_with_empty_filter_result(self, mocker, capsys): result = run_evaluation(_make_eval_args(tags=["nonexistent"])) + assert result is not None assert result["TOTAL"] == 0 mock_validator.return_value.load_evaluation_data.assert_called_once_with( "config/evaluation_data.yaml", tags=["nonexistent"], conv_ids=None @@ -304,7 +328,7 @@ def test_run_evaluation_with_empty_filter_result(self, mocker, capsys): captured = capsys.readouterr() assert "No conversation groups matched the filter criteria" in captured.out - def test_run_evaluation_with_filter_parameters(self, mocker): + def test_run_evaluation_with_filter_parameters(self, mocker: MockerFixture) -> None: """Test that filter parameters are correctly passed to DataValidator.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -365,7 +389,7 @@ def test_run_evaluation_with_filter_parameters(self, mocker): class TestMain: """Unit tests for main CLI function.""" - def test_main_default_args(self, mocker): + def test_main_default_args(self, mocker: MockerFixture) -> None: """Test main with default arguments.""" mocker.patch( "sys.argv", @@ -392,7 +416,7 @@ def test_main_default_args(self, mocker): assert args.eval_data == "config/evaluation_data.yaml" assert args.output_dir is None - def test_main_custom_args(self, mocker): + def test_main_custom_args(self, mocker: MockerFixture) -> None: """Test main with custom arguments.""" mocker.patch( "sys.argv", @@ -427,7 +451,7 @@ def test_main_custom_args(self, mocker): assert args.eval_data == "custom/eval.yaml" assert args.output_dir == "/custom/output" - def test_main_returns_error_on_failure(self, mocker): + def test_main_returns_error_on_failure(self, mocker: MockerFixture) -> None: """Test main returns error code on failure.""" mocker.patch( "sys.argv", @@ -455,7 +479,13 @@ def test_main_returns_error_on_failure(self, mocker): ), ], ) - def test_main_with_filters(self, mocker, args, expected_tags, expected_conv_ids): + def test_main_with_filters( + self, + mocker: MockerFixture, + args: list[str], + expected_tags: list[str] | None, + expected_conv_ids: list[str] | None, + ) -> None: """Test main with filter arguments.""" mocker.patch("sys.argv", ["lightspeed-eval"] + args) From f08c041a280f8fc95ac601eed3a5fab414ffdb52 Mon Sep 17 00:00:00 2001 From: Eva Micankova Date: Thu, 29 Jan 2026 10:49:32 +0100 Subject: [PATCH 2/3] Moving pylint disable to file 
level --- tests/script/conftest.py | 4 +- tests/script/test_compare_evaluations.py | 66 ++++--------- tests/script/test_run_multi_provider_eval.py | 76 ++++++-------- tests/unit/core/api/test_client.py | 36 +++---- tests/unit/core/llm/test_custom.py | 8 +- tests/unit/core/metrics/conftest.py | 6 +- tests/unit/core/metrics/test_geval.py | 34 +++---- tests/unit/core/metrics/test_manager.py | 4 +- tests/unit/core/metrics/test_nlp.py | 6 +- tests/unit/core/output/test_final_coverage.py | 10 +- tests/unit/core/output/test_generator.py | 64 ++++-------- tests/unit/core/system/test_validator.py | 98 ++++--------------- tests/unit/pipeline/evaluation/conftest.py | 16 +-- .../pipeline/evaluation/test_evaluator.py | 57 ++++------- .../pipeline/evaluation/test_processor.py | 90 ++++++----------- tests/unit/runner/test_evaluation.py | 8 +- 16 files changed, 196 insertions(+), 387 deletions(-) diff --git a/tests/script/conftest.py b/tests/script/conftest.py index 8ab273da..752800a2 100644 --- a/tests/script/conftest.py +++ b/tests/script/conftest.py @@ -1,3 +1,5 @@ +# pylint: disable=redefined-outer-name + """Pytest configuration and fixtures for script tests.""" from pathlib import Path @@ -126,7 +128,7 @@ def temp_config_files(tmp_path: Path) -> dict: @pytest.fixture -def runner( # pylint: disable=redefined-outer-name +def runner( temp_config_files: dict, ) -> MultiProviderEvaluationRunner: """Create a MultiProviderEvaluationRunner instance for testing.""" diff --git a/tests/script/test_compare_evaluations.py b/tests/script/test_compare_evaluations.py index e03bebdb..56bca05c 100755 --- a/tests/script/test_compare_evaluations.py +++ b/tests/script/test_compare_evaluations.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +# pylint: disable=protected-access + """Pytest tests to verify the compare_evaluations.py script works correctly.""" import json @@ -144,9 +146,7 @@ def test_compare_score_distributions_basic( scores1 = [0.8, 0.9, 0.7, 0.85, 0.75, 0.88, 0.82, 0.79, 0.86, 0.81] scores2 = [0.6, 0.65, 0.55, 0.62, 0.58, 0.63, 0.59, 0.61, 0.64, 0.57] - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) # Check structure assert "run1_stats" in result assert "run2_stats" in result @@ -180,9 +180,7 @@ def test_compare_score_distributions_scipy_example( scores1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] scores2 = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0] - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) # The means should be 5.5 and 6.5 respectively assert abs(result["run1_stats"]["mean"] - 5.5) < 0.01 @@ -200,9 +198,7 @@ def test_compare_score_distributions_identical_data( scores1 = [0.8, 0.8, 0.8, 0.8, 0.8] scores2 = [0.8, 0.8, 0.8, 0.8, 0.8] - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) assert result["run1_stats"]["mean"] == result["run2_stats"]["mean"] assert result["mean_difference"] == 0.0 @@ -224,9 +220,7 @@ def test_perform_pass_rate_tests_basic( "total2": 20, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Check that 
tests were performed assert "tests" in comparison @@ -252,9 +246,7 @@ def test_perform_pass_rate_tests_scipy_chisquare_example( "total2": 20, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Verify structure assert "tests" in comparison @@ -289,9 +281,7 @@ def test_perform_pass_rate_tests_edge_cases( "total2": 15, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Should handle gracefully (no tests performed or error recorded) assert "tests" in comparison @@ -303,9 +293,7 @@ def test_check_confidence_interval_overlap_no_overlap( ci1 = {"low": 0.1, "high": 0.3, "mean": 0.2, "confidence_level": 0.95} ci2 = {"low": 0.7, "high": 0.9, "mean": 0.8, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - ci1, ci2 - ) + result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) assert "intervals_overlap" in result assert "significant" in result @@ -319,9 +307,7 @@ def test_check_confidence_interval_overlap_with_overlap( ci1 = {"low": 0.2, "high": 0.6, "mean": 0.4, "confidence_level": 0.95} ci2 = {"low": 0.4, "high": 0.8, "mean": 0.6, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - ci1, ci2 - ) + result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) assert "intervals_overlap" in result assert "significant" in result @@ -332,9 +318,7 @@ def test_check_confidence_interval_overlap_none_inputs( self, comparison_instance: EvaluationComparison ) -> None: """Test _check_confidence_interval_overlap with None inputs.""" - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - None, None - ) + result = comparison_instance._check_confidence_interval_overlap(None, None) assert "test_performed" in result # Should handle None inputs gracefully - might not perform test @@ -345,9 +329,7 @@ def test_check_confidence_interval_overlap_partial_none( """Test _check_confidence_interval_overlap with one None input.""" ci1 = {"low": 0.2, "high": 0.6, "mean": 0.4, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - ci1, None - ) + result = comparison_instance._check_confidence_interval_overlap(ci1, None) assert "test_performed" in result # Should handle partial None inputs gracefully @@ -360,9 +342,7 @@ def test_compare_score_distributions_known_statistical_results( scores1 = [1.0, 1.1, 1.2, 1.3, 1.4] # Mean ≈ 1.2, low variance scores2 = [2.0, 2.1, 2.2, 2.3, 2.4] # Mean ≈ 2.2, low variance - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) # These should be significantly different assert abs(result["mean_difference"] - 1.0) < 0.01 @@ -394,9 +374,7 @@ def test_perform_pass_rate_tests_known_chi_square_result( "total2": 20, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Verify the chi-square test was performed and has reasonable results if "chi_square" in 
comparison["tests"]: @@ -425,9 +403,7 @@ def test_perform_pass_rate_tests_fisher_exact_small_sample( "total2": 5, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Verify Fisher exact test results if "fisher_exact" in comparison["tests"]: @@ -450,9 +426,7 @@ def test_check_confidence_interval_overlap_exact_boundaries( "confidence_level": 0.95, } - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - ci1, ci2 - ) + result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) # Touching at boundary might be considered overlap or not, depending on implementation assert "intervals_overlap" in result @@ -467,9 +441,7 @@ def test_compare_score_distributions_single_values( scores1 = [0.8] scores2 = [0.6] - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) # Should handle single values gracefully assert result["run1_stats"]["count"] == 1 @@ -498,9 +470,7 @@ def test_perform_pass_rate_tests_extreme_ratios( "total2": 10, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Should handle extreme cases assert "tests" in comparison diff --git a/tests/script/test_run_multi_provider_eval.py b/tests/script/test_run_multi_provider_eval.py index ef0057dc..a65b235a 100644 --- a/tests/script/test_run_multi_provider_eval.py +++ b/tests/script/test_run_multi_provider_eval.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +# pylint: disable=protected-access,too-few-public-methods + """Pytest tests for run_multi_provider_eval.py script.""" import json @@ -241,9 +243,7 @@ def test_load_valid_yaml( self, runner: MultiProviderEvaluationRunner, temp_config_files: dict[str, Path] ) -> None: """Test loading a valid YAML file.""" - config = runner._load_yaml( # pylint: disable=protected-access - temp_config_files["providers_config"] - ) + config = runner._load_yaml(temp_config_files["providers_config"]) assert isinstance(config, dict) assert "providers" in config assert "openai" in config["providers"] @@ -259,7 +259,7 @@ def test_load_invalid_yaml( f.write("invalid: yaml: content: [") with pytest.raises(ValueError, match="Error parsing YAML file"): - runner._load_yaml(invalid_yaml) # pylint: disable=protected-access + runner._load_yaml(invalid_yaml) def test_load_yaml_non_dict_type( self, runner: MultiProviderEvaluationRunner, tmp_path: Path @@ -270,19 +270,17 @@ def test_load_yaml_non_dict_type( yaml.dump(["item1", "item2", "item3"], f) with pytest.raises(ValueError, match="must be a mapping, got list"): - runner._load_yaml(list_yaml) # pylint: disable=protected-access + runner._load_yaml(list_yaml) -class TestCreateProviderModelConfigs: # pylint: disable=too-few-public-methods +class TestCreateProviderModelConfigs: """Tests for _create_provider_model_configs method.""" def test_create_configs_multiple_providers( self, runner: MultiProviderEvaluationRunner ) -> None: """Test creating configs with multiple providers.""" - configs = ( - runner._create_provider_model_configs() # pylint: disable=protected-access - ) + configs = runner._create_provider_model_configs() assert len(configs) == 3 # 2 openai models + 1 watsonx model @@ -311,11 +309,9 @@ def 
test_llm_config_stays_constant( original_llm_provider = runner.system_config["llm"]["provider"] original_llm_model = runner.system_config["llm"]["model"] - modified = ( - runner._create_modified_system_config( # pylint: disable=protected-access - provider_id="watsonx", - model="ibm/granite-13b-chat-v2", - ) + modified = runner._create_modified_system_config( + provider_id="watsonx", + model="ibm/granite-13b-chat-v2", ) # LLM judge should remain unchanged @@ -348,11 +344,9 @@ def test_api_config_is_modified(self, temp_config_files: dict[str, Path]) -> Non eval_data_path=str(temp_config_files["eval_data"]), ) - modified = ( - runner._create_modified_system_config( # pylint: disable=protected-access - provider_id="watsonx", - model="ibm/granite-13b-chat-v2", - ) + modified = runner._create_modified_system_config( + provider_id="watsonx", + model="ibm/granite-13b-chat-v2", ) # API config should be modified with provider and model only @@ -372,11 +366,9 @@ def test_create_temp_config_file( self, runner: MultiProviderEvaluationRunner ) -> None: """Test that a temporary config file is created.""" - temp_path = ( - runner._create_temp_system_config( # pylint: disable=protected-access - provider_id="openai", - model="gpt-4o-mini", - ) + temp_path = runner._create_temp_system_config( + provider_id="openai", + model="gpt-4o-mini", ) try: @@ -424,7 +416,7 @@ def track_temp_file(*args: Any, **kwargs: Any) -> Any: side_effect=Exception("YAML dump failed"), ): with pytest.raises(Exception, match="YAML dump failed"): - runner._create_temp_system_config( # pylint: disable=protected-access + runner._create_temp_system_config( provider_id="openai", model="gpt-4o-mini", ) @@ -441,11 +433,9 @@ def test_temp_config_sanitizes_special_characters( self, runner: MultiProviderEvaluationRunner ) -> None: """Test that special characters in provider_id and model are sanitized.""" - temp_path = ( - runner._create_temp_system_config( # pylint: disable=protected-access - provider_id="open..ai//test", - model="gpt:4o-mini/special", - ) + temp_path = runner._create_temp_system_config( + provider_id="open..ai//test", + model="gpt:4o-mini/special", ) try: @@ -483,7 +473,7 @@ def test_path_traversal_blocked_in_provider_id( return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, ): # Attempt path traversal in provider_id - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( provider_name="malicious", provider_id="../../etc", model="test", @@ -510,7 +500,7 @@ def test_path_traversal_blocked_in_model( return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, ): # Attempt path traversal in model - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( provider_name="openai", provider_id="openai", model="../../../etc/passwd", @@ -540,7 +530,7 @@ def test_run_single_evaluation_success( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2, "ERROR": 0}, ) as mock_run_eval: - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -560,7 +550,7 @@ def test_run_single_evaluation_failure( """Test evaluation failure handling.""" # Mock run_evaluation to return None (failure) with patch("script.run_multi_provider_eval.run_evaluation", return_value=None): - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( 
provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -578,7 +568,7 @@ def test_run_single_evaluation_invalid_summary( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2}, # Missing ERROR key ): - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -589,7 +579,7 @@ def test_run_single_evaluation_invalid_summary( assert "summary" not in result -class TestRunEvaluations: # pylint: disable=too-few-public-methods +class TestRunEvaluations: """Tests for run_evaluations method.""" def test_run_evaluations_sequential( @@ -614,7 +604,7 @@ def test_run_evaluations_sequential( assert mock_single_eval.call_count == 3 -class TestGenerateSummary: # pylint: disable=too-few-public-methods +class TestGenerateSummary: """Tests for generate_summary method.""" def test_generate_summary_mixed_results( @@ -675,9 +665,7 @@ def test_percentage_to_decimal_conversion( self, runner: MultiProviderEvaluationRunner, sample_evaluation_summary: dict ) -> None: """Test that percentage rates (80.0) convert to decimals (0.8).""" - stats = runner._analyze_single_model( # pylint: disable=protected-access - "test/model", sample_evaluation_summary - ) + stats = runner._analyze_single_model("test/model", sample_evaluation_summary) # Verify percentage conversion assert abs(stats["overall"]["pass_rate"] - 0.8) < 0.01 @@ -686,15 +674,11 @@ def test_percentage_to_decimal_conversion( def test_composite_score(self, runner: MultiProviderEvaluationRunner) -> None: """Test composite score calculation.""" # Perfect model should get score of 1.0 - perfect = runner._calculate_composite_score( # pylint: disable=protected-access - 1.0, 0.0, 1.0, 1.0 - ) + perfect = runner._calculate_composite_score(1.0, 0.0, 1.0, 1.0) assert abs(perfect - 1.0) < 0.0001 # Poor model should get score of 0.0 - poor = runner._calculate_composite_score( # pylint: disable=protected-access - 0.0, 1.0, 0.0, 0.0 - ) + poor = runner._calculate_composite_score(0.0, 1.0, 0.0, 0.0) assert poor == 0.0 def test_model_ranking(self, runner: MultiProviderEvaluationRunner) -> None: diff --git a/tests/unit/core/api/test_client.py b/tests/unit/core/api/test_client.py index caa7d2b3..67117604 100644 --- a/tests/unit/core/api/test_client.py +++ b/tests/unit/core/api/test_client.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access + """Unit tests for core API client module.""" from pathlib import Path @@ -248,9 +250,7 @@ def test_handle_response_errors_non_200( mock_response.read.return_value = b'{"detail": "Not found"}' with pytest.raises(httpx.HTTPStatusError): - client._handle_response_errors( # pylint: disable=protected-access - mock_response - ) + client._handle_response_errors(mock_response) def test_extract_error_message_with_detail( self, api_config: APIConfig, mocker: MockerFixture @@ -263,9 +263,7 @@ def test_extract_error_message_with_detail( mock_response = mocker.Mock() mock_response.read.return_value = b'{"detail": "Error message"}' - error_msg = client._extract_error_message( # pylint: disable=protected-access - mock_response - ) + error_msg = client._extract_error_message(mock_response) assert "Error message" in error_msg def test_extract_error_message_with_nested_detail( @@ -281,9 +279,7 @@ def test_extract_error_message_with_nested_detail( b'{"detail": {"response": "Error", "cause": "Reason"}}' ) - error_msg = client._extract_error_message( # pylint: disable=protected-access - 
mock_response - ) + error_msg = client._extract_error_message(mock_response) assert "Error" in error_msg assert "Reason" in error_msg @@ -405,9 +401,7 @@ def test_prepare_request_basic( mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client") client = APIClient(basic_api_config) - request = client._prepare_request( # pylint: disable=protected-access - "What is Python?" - ) + request = client._prepare_request("What is Python?") assert request.query == "What is Python?" assert request.provider == "openai" @@ -420,9 +414,7 @@ def test_prepare_request_with_conversation_id( mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client") client = APIClient(basic_api_config) - request = client._prepare_request( # pylint: disable=protected-access - "Follow-up", conversation_id="conv_123" - ) + request = client._prepare_request("Follow-up", conversation_id="conv_123") assert request.query == "Follow-up" assert request.conversation_id == "conv_123" @@ -434,7 +426,7 @@ def test_prepare_request_with_attachments( mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client") client = APIClient(basic_api_config) - request = client._prepare_request( # pylint: disable=protected-access + request = client._prepare_request( "Analyze this", attachments=["file1.txt", "file2.pdf"] ) @@ -478,15 +470,11 @@ def test_get_cache_key_generates_consistent_hash( client = APIClient(config) # Create identical requests - request1 = client._prepare_request( # pylint: disable=protected-access - "test query" - ) - request2 = client._prepare_request( # pylint: disable=protected-access - "test query" - ) + request1 = client._prepare_request("test query") + request2 = client._prepare_request("test query") - key1 = client._get_cache_key(request1) # pylint: disable=protected-access - key2 = client._get_cache_key(request2) # pylint: disable=protected-access + key1 = client._get_cache_key(request1) + key2 = client._get_cache_key(request2) # Same request should generate same cache key assert key1 == key2 diff --git a/tests/unit/core/llm/test_custom.py b/tests/unit/core/llm/test_custom.py index bbd9d3ca..774ce120 100644 --- a/tests/unit/core/llm/test_custom.py +++ b/tests/unit/core/llm/test_custom.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access,disable=too-few-public-methods + """Unit tests for custom LLM classes.""" import pytest @@ -7,7 +9,7 @@ from lightspeed_evaluation.core.system.exceptions import LLMError -class TestTokenTracker: # pylint: disable=too-few-public-methods +class TestTokenTracker: """Tests for TokenTracker.""" def test_token_callback_accumulates_tokens(self, mocker: MockerFixture) -> None: @@ -20,9 +22,7 @@ def test_token_callback_accumulates_tokens(self, mocker: MockerFixture) -> None: mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 20 - tracker._token_callback( # pylint: disable=protected-access - {}, mock_response, 0.0, 0.0 - ) + tracker._token_callback({}, mock_response, 0.0, 0.0) input_tokens, output_tokens = tracker.get_counts() assert input_tokens == 10 diff --git a/tests/unit/core/metrics/conftest.py b/tests/unit/core/metrics/conftest.py index 6938d5ff..8018a2d2 100644 --- a/tests/unit/core/metrics/conftest.py +++ b/tests/unit/core/metrics/conftest.py @@ -1,3 +1,5 @@ +# pylint: disable=redefined-outer-name + """Pytest configuration and fixtures for metrics tests.""" import sys @@ -67,7 +69,7 @@ def sample_turn_data() -> TurnData: @pytest.fixture -def sample_scope( # pylint: disable=redefined-outer-name +def sample_scope( sample_turn_data: TurnData, ) -> 
EvaluationScope: """Create sample EvaluationScope for turn-level evaluation.""" @@ -79,7 +81,7 @@ def sample_scope( # pylint: disable=redefined-outer-name @pytest.fixture -def conversation_scope( # pylint: disable=redefined-outer-name +def conversation_scope( sample_turn_data: TurnData, ) -> EvaluationScope: """Create sample EvaluationScope for conversation-level evaluation.""" diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py index 79ac617f..d228b9f4 100644 --- a/tests/unit/core/metrics/test_geval.py +++ b/tests/unit/core/metrics/test_geval.py @@ -1,3 +1,5 @@ +# pylint: disable=too-many-public-methods,protected-access + """Tests for GEval metrics handler.""" from unittest.mock import MagicMock, patch @@ -9,7 +11,7 @@ from lightspeed_evaluation.core.metrics.manager import MetricLevel -class TestGEvalHandler: # pylint: disable=too-many-public-methods +class TestGEvalHandler: """Test cases for GEvalHandler class.""" @pytest.fixture @@ -50,9 +52,7 @@ def test_initialization( def test_convert_evaluation_params_field_names(self, handler: GEvalHandler) -> None: """Test conversion of evaluation data field names to LLMTestCaseParams enum.""" params = ["query", "response", "expected_response"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) assert result is not None assert len(result) == 3 @@ -65,9 +65,7 @@ def test_convert_evaluation_params_with_contexts( ) -> None: """Test conversion including contexts and retrieval_context fields.""" params = ["query", "response", "contexts", "retrieval_context"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) assert result is not None assert len(result) == 4 @@ -81,9 +79,7 @@ def test_convert_evaluation_params_enum_values_backward_compat( ) -> None: """Test conversion with direct enum value strings (backward compatibility).""" params = ["INPUT", "ACTUAL_OUTPUT", "EXPECTED_OUTPUT"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) assert result is not None assert len(result) == 3 @@ -96,9 +92,7 @@ def test_convert_evaluation_params_invalid_returns_none( ) -> None: """Test that invalid params return None to allow GEval auto-detection.""" params = ["invalid_param", "another_invalid"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) assert result is None @@ -106,9 +100,7 @@ def test_convert_evaluation_params_empty_returns_none( self, handler: GEvalHandler ) -> None: """Test that empty params list returns None.""" - result = handler._convert_evaluation_params( # pylint: disable=protected-access - [] - ) + result = handler._convert_evaluation_params([]) assert result is None def test_convert_evaluation_params_mixed_invalid_returns_none( @@ -116,9 +108,7 @@ def test_convert_evaluation_params_mixed_invalid_returns_none( ) -> None: """Test that any invalid param causes None return.""" params = ["query", "invalid_param", "response"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) # Should return None because of the invalid param assert result is None @@ -135,7 +125,7 @@ def test_get_geval_config_uses_metric_manager( 
mock_metric_manager.get_metric_metadata.return_value = expected_config conv_data = MagicMock() - config = handler._get_geval_config( # pylint: disable=protected-access + config = handler._get_geval_config( metric_name="test_metric", conv_data=conv_data, turn_data=None, @@ -160,7 +150,7 @@ def test_get_geval_config_turn_level( conv_data = MagicMock() turn_data = MagicMock() - config = handler._get_geval_config( # pylint: disable=protected-access + config = handler._get_geval_config( metric_name="turn_metric", conv_data=conv_data, turn_data=turn_data, @@ -182,7 +172,7 @@ def test_get_geval_config_returns_none_when_not_found( mock_metric_manager.get_metric_metadata.return_value = None conv_data = MagicMock() - config = handler._get_geval_config( # pylint: disable=protected-access + config = handler._get_geval_config( metric_name="nonexistent_metric", conv_data=conv_data, turn_data=None, diff --git a/tests/unit/core/metrics/test_manager.py b/tests/unit/core/metrics/test_manager.py index 756f2c8e..d525c80c 100644 --- a/tests/unit/core/metrics/test_manager.py +++ b/tests/unit/core/metrics/test_manager.py @@ -1,3 +1,5 @@ +# pylint: disable=too-many-public-methods + """Unit tests for core metrics manager module.""" from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager @@ -8,7 +10,7 @@ ) -class TestMetricManager: # pylint: disable=too-many-public-methods +class TestMetricManager: """Unit tests for MetricManager.""" def test_resolve_metrics_with_none_uses_defaults( diff --git a/tests/unit/core/metrics/test_nlp.py b/tests/unit/core/metrics/test_nlp.py index 453cb27c..5e2ac427 100644 --- a/tests/unit/core/metrics/test_nlp.py +++ b/tests/unit/core/metrics/test_nlp.py @@ -1,3 +1,5 @@ +# pylint: disable=too-many-arguments,too-many-positional-arguments,disable=too-few-public-methods + """Tests for NLP metrics module. 
This module tests the NLP-based evaluation metrics: @@ -26,7 +28,7 @@ from lightspeed_evaluation.core.system.exceptions import MetricError -class TestNLPMetricsInit: # pylint: disable=too-few-public-methods +class TestNLPMetricsInit: """Test NLPMetrics initialization.""" def test_initialization(self, nlp_metrics: NLPMetrics) -> None: @@ -292,7 +294,7 @@ def test_bleu_failure_raises_metric_error( ), ], ) - def test_ragas_metric_failure_raises_metric_error( # pylint: disable=too-many-arguments,too-many-positional-arguments + def test_ragas_metric_failure_raises_metric_error( self, nlp_metrics: NLPMetrics, sample_scope: EvaluationScope, diff --git a/tests/unit/core/output/test_final_coverage.py b/tests/unit/core/output/test_final_coverage.py index bbf9c8e8..b482fbd4 100644 --- a/tests/unit/core/output/test_final_coverage.py +++ b/tests/unit/core/output/test_final_coverage.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access,too-few-public-methods + """Additional tests to boost coverage towards 75%.""" from pathlib import Path @@ -97,7 +99,7 @@ def test_calculate_stats_with_single_result(self, tmp_path: Path) -> None: ) ] - stats = handler._calculate_stats(results) # pylint: disable=protected-access + stats = handler._calculate_stats(results) assert stats["basic"]["TOTAL"] == 1 assert stats["basic"]["PASS"] == 1 @@ -123,9 +125,7 @@ def test_generate_csv_with_minimal_columns( ) ] - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - results, "test" - ) + csv_file = handler._generate_csv_report(results, "test") assert csv_file.exists() content = csv_file.read_text() @@ -134,7 +134,7 @@ def test_generate_csv_with_minimal_columns( assert "PASS" in content -class TestSystemLoaderEdgeCases: # pylint: disable=too-few-public-methods +class TestSystemLoaderEdgeCases: """Edge case tests for system loader.""" def test_validate_metrics_with_mixed_valid_invalid(self) -> None: diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py index 5b4b2ef8..a6252e8b 100644 --- a/tests/unit/core/output/test_generator.py +++ b/tests/unit/core/output/test_generator.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access + """Unit tests for output generator.""" import json @@ -26,9 +28,7 @@ def test_calculate_stats_with_results( ) -> None: """Test statistics calculation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) + stats = handler._calculate_stats(sample_results) assert stats["basic"]["TOTAL"] == 2 assert stats["basic"]["PASS"] == 1 @@ -38,7 +38,7 @@ def test_calculate_stats_with_results( def test_calculate_stats_empty(self, tmp_path: Path) -> None: """Test statistics with empty results.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats([]) # pylint: disable=protected-access + stats = handler._calculate_stats([]) assert stats["basic"]["TOTAL"] == 0 assert not stats["detailed"]["by_metric"] @@ -55,9 +55,7 @@ def test_generate_csv_report( system_config=mock_system_config, ) - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - sample_results, "test" - ) + csv_file = handler._generate_csv_report(sample_results, "test") assert csv_file.exists() assert csv_file.suffix == ".csv" @@ -72,9 +70,7 @@ def test_generate_json_summary( ) -> None: """Test JSON summary generation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats( # pylint: disable=protected-access - 
sample_results - ) + stats = handler._calculate_stats(sample_results) api_tokens = { "total_api_input_tokens": 100, "total_api_output_tokens": 200, @@ -82,7 +78,7 @@ def test_generate_json_summary( } streaming_stats: dict = {} - json_file = handler._generate_json_summary( # pylint: disable=protected-access + json_file = handler._generate_json_summary( sample_results, "test", stats["basic"], @@ -108,9 +104,7 @@ def test_generate_text_summary( ) -> None: """Test text summary generation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) + stats = handler._calculate_stats(sample_results) api_tokens = { "total_api_input_tokens": 100, "total_api_output_tokens": 200, @@ -118,7 +112,7 @@ def test_generate_text_summary( } streaming_stats: dict = {} - txt_file = handler._generate_text_summary( # pylint: disable=protected-access + txt_file = handler._generate_text_summary( sample_results, "test", stats["basic"], @@ -202,13 +196,9 @@ def test_generate_individual_reports_csv_only( config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) + stats = handler._calculate_stats(sample_results) - handler._generate_individual_reports( # pylint: disable=protected-access - sample_results, "test", ["csv"], stats - ) + handler._generate_individual_reports(sample_results, "test", ["csv"], stats) assert (tmp_path / "test_detailed.csv").exists() @@ -225,13 +215,9 @@ def test_generate_individual_reports_json_only( config.model_fields.keys.return_value = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) + stats = handler._calculate_stats(sample_results) - handler._generate_individual_reports( # pylint: disable=protected-access - sample_results, "test", ["json"], stats - ) + handler._generate_individual_reports(sample_results, "test", ["json"], stats) assert (tmp_path / "test_summary.json").exists() @@ -248,12 +234,8 @@ def test_generate_individual_reports_txt_only( config.model_fields.keys.return_value = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) - handler._generate_individual_reports( # pylint: disable=protected-access - sample_results, "test", ["txt"], stats - ) + stats = handler._calculate_stats(sample_results) + handler._generate_individual_reports(sample_results, "test", ["txt"], stats) assert (tmp_path / "test_summary.txt").exists() @@ -279,9 +261,7 @@ def test_csv_with_all_columns( config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - sample_results, "test" - ) + csv_file = handler._generate_csv_report(sample_results, "test") content = csv_file.read_text() assert "query" in content @@ -409,9 +389,7 @@ def test_generate_csv_with_specific_results( mocker.patch("builtins.print") handler = OutputHandler(output_dir=str(tmp_path)) - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - results, "test_eval" - ) + csv_file = handler._generate_csv_report(results, "test_eval") assert csv_file.exists() assert csv_file.suffix == ".csv" @@ -461,9 +439,7 @@ def test_csv_columns_configuration( 
system_config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=system_config) - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - results, "test_eval" - ) + csv_file = handler._generate_csv_report(results, "test_eval") with open(csv_file, encoding="utf-8") as f: reader = csv_module.reader(f) @@ -487,9 +463,7 @@ def test_filename_timestamp_format( ) mock_datetime.now.return_value.strftime.return_value = "20240101_120000" - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - results, "test_20240101_120000" - ) + csv_file = handler._generate_csv_report(results, "test_20240101_120000") assert "test_20240101_120000" in csv_file.name assert csv_file.suffix == ".csv" diff --git a/tests/unit/core/system/test_validator.py b/tests/unit/core/system/test_validator.py index 95799a6d..1a9a0c33 100644 --- a/tests/unit/core/system/test_validator.py +++ b/tests/unit/core/system/test_validator.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access + """Unit tests for core system validator module.""" import tempfile @@ -55,11 +57,7 @@ def test_validate_evaluation_data_valid(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is True assert len(validator.validation_errors) == 0 @@ -84,11 +82,7 @@ def test_validate_metrics_availability_unknown_turn_metric( ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert len(validator.validation_errors) > 0 @@ -114,11 +108,7 @@ def test_validate_metrics_availability_unknown_conversation_metric( conversation_metrics=["unknown:conversation_metric"], ) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any( @@ -138,11 +128,7 @@ def test_validate_metric_requirements_missing_response(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any("response" in error.lower() for error in validator.validation_errors) @@ -160,11 +146,7 @@ def test_validate_metric_requirements_missing_contexts(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any("contexts" in error.lower() for error in validator.validation_errors) @@ -189,11 +171,7 @@ def test_validate_metric_requirements_api_enabled_allows_missing_response( ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) # Should pass because API will populate response assert result is True @@ -212,11 
+190,7 @@ def test_validate_metric_requirements_expected_response_missing(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any( @@ -238,11 +212,7 @@ def test_validate_metric_requirements_tool_eval_missing_fields(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any( @@ -271,11 +241,7 @@ def test_validate_metric_requirements_skip_script_when_api_disabled( conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) # Should not validate script requirements when API disabled - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) # Should pass because script validation is skipped assert result is True @@ -373,11 +339,7 @@ def test_check_metric_requirements_missing_contexts(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any("contexts" in error.lower() for error in validator.validation_errors) @@ -394,11 +356,7 @@ def test_check_metric_requirements_whitespace_only_string(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False @@ -412,11 +370,7 @@ def test_validate_multiple_conversations(self) -> None: conv1 = EvaluationData(conversation_group_id="conv1", turns=[turn1]) conv2 = EvaluationData(conversation_group_id="conv2", turns=[turn2]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv1, conv2] - ) - ) + result = validator._validate_evaluation_data([conv1, conv2]) assert result is True @@ -447,11 +401,7 @@ def test_validate_evaluation_data_accumulates_errors( conv = EvaluationData(conversation_group_id="test", turns=[turn1, turn2]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv] - ) - ) + result = validator._validate_evaluation_data([conv]) assert result is False # Should have errors for both issues @@ -474,7 +424,7 @@ def test_filter_by_scope_no_filter(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data) # pylint: disable=protected-access + result = validator._filter_by_scope(data) assert len(result) == 2 def test_filter_by_scope_tags_only(self) -> None: @@ -497,9 +447,7 @@ def test_filter_by_scope_tags_only(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope( # pylint: disable=protected-access - data, tags=["basic"] - ) + result = validator._filter_by_scope(data, tags=["basic"]) assert len(result) == 2 assert all(c.tag == "basic" for c in result) @@ -520,9 +468,7 @@ def 
test_filter_by_scope_conv_ids_only(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope( # pylint: disable=protected-access - data, conv_ids=["conv_1", "conv_3"] - ) + result = validator._filter_by_scope(data, conv_ids=["conv_1", "conv_3"]) assert len(result) == 2 assert {c.conversation_group_id for c in result} == {"conv_1", "conv_3"} @@ -546,9 +492,7 @@ def test_filter_by_scope_tags_and_conv_ids(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope( # pylint: disable=protected-access - data, tags=["basic"], conv_ids=["conv_3"] - ) + result = validator._filter_by_scope(data, tags=["basic"], conv_ids=["conv_3"]) assert len(result) == 2 # conv_1 (basic tag) + conv_3 (by ID) def test_filter_by_scope_no_match_returns_empty(self) -> None: @@ -561,7 +505,5 @@ def test_filter_by_scope_no_match_returns_empty(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope( # pylint: disable=protected-access - data, tags=["nonexistent"] - ) + result = validator._filter_by_scope(data, tags=["nonexistent"]) assert len(result) == 0 diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py index a1131cd5..09a2e18c 100644 --- a/tests/unit/pipeline/evaluation/conftest.py +++ b/tests/unit/pipeline/evaluation/conftest.py @@ -1,3 +1,5 @@ +# pylint: disable=redefined-outer-name + """Pytest configuration and fixtures for evaluation tests.""" import pytest @@ -198,11 +200,11 @@ def create_error_result_side_effect( @pytest.fixture def processor_components_pr( - mock_metrics_evaluator: MetricsEvaluator, # pylint: disable=redefined-outer-name - mock_api_amender: APIDataAmender, # pylint: disable=redefined-outer-name - mock_error_handler: EvaluationErrorHandler, # pylint: disable=redefined-outer-name - mock_metric_manager: MetricManager, # pylint: disable=redefined-outer-name - mock_script_manager: ScriptExecutionManager, # pylint: disable=redefined-outer-name + mock_metrics_evaluator: MetricsEvaluator, + mock_api_amender: APIDataAmender, + mock_error_handler: EvaluationErrorHandler, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, ) -> ProcessorComponents: """Create processor components fixture for PR tests.""" return ProcessorComponents( @@ -216,8 +218,8 @@ def processor_components_pr( @pytest.fixture def processor( - config_loader: ConfigLoader, # pylint: disable=redefined-outer-name - processor_components_pr: ProcessorComponents, # pylint: disable=redefined-outer-name + config_loader: ConfigLoader, + processor_components_pr: ProcessorComponents, ) -> ConversationProcessor: """Create ConversationProcessor instance for PR tests.""" return ConversationProcessor(config_loader, processor_components_pr) diff --git a/tests/unit/pipeline/evaluation/test_evaluator.py b/tests/unit/pipeline/evaluation/test_evaluator.py index 01d44109..92061aba 100644 --- a/tests/unit/pipeline/evaluation/test_evaluator.py +++ b/tests/unit/pipeline/evaluation/test_evaluator.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access,redefined-outer-name,too-many-arguments,too-many-positional-arguments + """Unit tests for pipeline evaluation evaluator module.""" import pytest @@ -428,20 +430,11 @@ def test_determine_status_with_threshold( ) # Test PASS - assert ( - evaluator._determine_status(0.8, 0.7) # pylint: disable=protected-access - == "PASS" - ) - assert ( - evaluator._determine_status(0.7, 
0.7) # pylint: disable=protected-access - == "PASS" - ) # Equal passes + assert evaluator._determine_status(0.8, 0.7) == "PASS" + assert evaluator._determine_status(0.7, 0.7) == "PASS" # Equal passes # Test FAIL - assert ( - evaluator._determine_status(0.6, 0.7) # pylint: disable=protected-access - == "FAIL" - ) + assert evaluator._determine_status(0.6, 0.7) == "FAIL" def test_determine_status_without_threshold( self, @@ -471,16 +464,10 @@ def test_determine_status_without_threshold( ) # Should use 0.5 as default - assert ( - evaluator._determine_status(0.6, None) # pylint: disable=protected-access - == "PASS" - ) - assert ( - evaluator._determine_status(0.4, None) # pylint: disable=protected-access - == "FAIL" - ) + assert evaluator._determine_status(0.6, None) == "PASS" + assert evaluator._determine_status(0.4, None) == "FAIL" - def _setup_evaluate_test( # pylint: disable=too-many-arguments, too-many-positional-arguments + def _setup_evaluate_test( self, config_loader: ConfigLoader, mock_metric_manager: MetricManager, @@ -558,7 +545,7 @@ def create_mock_handler( # type: ignore[no-untyped-def] "metric_identifier", ["ragas:context_recall", "custom:answer_correctness", "nlp:rouge"], ) - def test_evaluate_with_expected_response_list( # pylint: disable=too-many-arguments, too-many-positional-arguments + def test_evaluate_with_expected_response_list( self, config_loader: ConfigLoader, mock_metric_manager: MetricManager, @@ -586,9 +573,7 @@ def test_evaluate_with_expected_response_list( # pylint: disable=too-many-argum request = EvaluationRequest.for_turn(conv_data, metric_identifier, 0, turn_data) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access - request, scope, 0.7 - ) + metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) assert metric_result.score == 0.85 assert metric_result.reason == "High score" @@ -628,9 +613,7 @@ def test_evaluate_with_expected_response_list_fail( ) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access - request, scope, 0.7 - ) + metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) reason_combined = "\n".join( [f"{score}; {reason}" for score, reason in scores_reasons] ) @@ -665,9 +648,7 @@ def test_evaluate_with_expected_response_string( ) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access - request, scope, 0.7 - ) + metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) assert metric_result.score == 0.85 assert metric_result.reason == "Good score" @@ -682,7 +663,7 @@ def test_evaluate_with_expected_response_string( [None, "string", ["string1", "string2"]], ids=["none", "string", "string_list"], ) - def test_evaluate_with_expected_response_not_needed( # pylint: disable=too-many-arguments, too-many-positional-arguments + def test_evaluate_with_expected_response_not_needed( self, config_loader: ConfigLoader, mock_metric_manager: MetricManager, @@ -711,9 +692,7 @@ def test_evaluate_with_expected_response_not_needed( # pylint: disable=too-many request = EvaluationRequest.for_turn(conv_data, metric_identifier, 0, turn_data) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access - request, scope, 0.7 - ) + 
metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) assert metric_result.score == 0.3 assert metric_result.reason == "Low score" @@ -753,16 +732,16 @@ def test_token_tracker_start_stop(self) -> None: """Test start and stop methods.""" tracker = TokenTracker() tracker.start() - assert tracker._callback_registered is True # pylint: disable=protected-access + assert tracker._callback_registered is True tracker.stop() - assert tracker._callback_registered is False # pylint: disable=protected-access + assert tracker._callback_registered is False def test_token_tracker_double_start(self) -> None: """Test calling start twice doesn't register callback twice.""" tracker = TokenTracker() tracker.start() tracker.start() # Should not fail - assert tracker._callback_registered is True # pylint: disable=protected-access + assert tracker._callback_registered is True tracker.stop() def test_token_tracker_double_stop(self) -> None: @@ -771,7 +750,7 @@ def test_token_tracker_double_stop(self) -> None: tracker.start() tracker.stop() tracker.stop() # Should not fail - assert tracker._callback_registered is False # pylint: disable=protected-access + assert tracker._callback_registered is False def test_token_tracker_independent_instances(self) -> None: """Test multiple TokenTracker instances are independent.""" diff --git a/tests/unit/pipeline/evaluation/test_processor.py b/tests/unit/pipeline/evaluation/test_processor.py index 16b3d4f5..6b18ef84 100644 --- a/tests/unit/pipeline/evaluation/test_processor.py +++ b/tests/unit/pipeline/evaluation/test_processor.py @@ -1,3 +1,5 @@ +# pylint: disable=unused-argument,protected-access,too-many-arguments, too-many-positional-arguments + """Unit tests for ConversationProcessor.""" from typing import Callable @@ -44,7 +46,7 @@ def test_process_conversation_skips_when_no_metrics( mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, sample_conv_data: EvaluationData, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing skips when no metrics specified.""" # Mock metric manager to return empty lists @@ -60,7 +62,7 @@ def test_process_conversation_turn_metrics( mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, sample_conv_data: EvaluationData, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing with turn-level metrics.""" @@ -99,7 +101,7 @@ def test_process_conversation_conversation_metrics( self, mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing with conversation-level metrics.""" @@ -144,7 +146,7 @@ def test_process_conversation_with_setup_script_success( mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, sample_conv_data: EvaluationData, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing with successful setup script.""" @@ -193,7 +195,7 @@ def test_process_conversation_with_setup_script_failure( mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, sample_conv_data: EvaluationData, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing handles setup script failure.""" sample_conv_data.setup_script = "setup.sh" @@ -215,7 +217,7 @@ def test_process_conversation_with_cleanup_script( 
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
         sample_conv_data: EvaluationData,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test cleanup script is always called."""
 
@@ -265,7 +267,7 @@ def test_process_conversation_with_api_amendment(
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
         sample_conv_data: EvaluationData,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test API amendment during turn processing."""
 
@@ -311,7 +313,7 @@ def test_process_conversation_with_api_error_cascade(
         self,
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test API error causes cascade failure."""
         assert mock_config_loader.system_config is not None
@@ -354,7 +356,7 @@ def test_evaluate_turn(
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
         sample_conv_data: EvaluationData,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test _evaluate_turn method."""
 
@@ -372,7 +374,7 @@ def test_evaluate_turn(
         )
 
         processor = ConversationProcessor(mock_config_loader, processor_components)
-        results = processor._evaluate_turn(  # pylint: disable=protected-access
+        results = processor._evaluate_turn(
             sample_conv_data, 0, sample_conv_data.turns[0], ["ragas:faithfulness"]
         )
 
@@ -384,7 +386,7 @@ def test_evaluate_conversation(
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
         sample_conv_data: EvaluationData,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test _evaluate_conversation method."""
 
@@ -402,7 +404,7 @@ def test_evaluate_conversation(
         )
 
         processor = ConversationProcessor(mock_config_loader, processor_components)
-        results = processor._evaluate_conversation(  # pylint: disable=protected-access
+        results = processor._evaluate_conversation(
             sample_conv_data, ["deepeval:conversation_completeness"]
         )
 
@@ -421,9 +423,7 @@ def test_run_setup_script_skips_when_api_disabled(
         mock_config_loader.system_config.api.enabled = False
 
         processor = ConversationProcessor(mock_config_loader, processor_components)
-        error = processor._run_setup_script(  # pylint: disable=protected-access
-            sample_conv_data
-        )
+        error = processor._run_setup_script(sample_conv_data)
 
         assert error is None
         processor_components.script_manager.run_script.assert_not_called()
@@ -440,9 +440,7 @@ def test_run_cleanup_script_skips_when_api_disabled(
         mock_config_loader.system_config.api.enabled = False
 
         processor = ConversationProcessor(mock_config_loader, processor_components)
-        processor._run_cleanup_script(  # pylint: disable=protected-access
-            sample_conv_data
-        )
+        processor._run_cleanup_script(sample_conv_data)
 
         processor_components.script_manager.run_script.assert_not_called()
 
@@ -461,9 +459,7 @@ def test_run_cleanup_script_logs_warning_on_failure(
         processor = ConversationProcessor(mock_config_loader, processor_components)
 
         # Should not raise, just log warning
-        processor._run_cleanup_script(  # pylint: disable=protected-access
-            sample_conv_data
-        )
+        processor._run_cleanup_script(sample_conv_data)
 
     def test_get_metrics_summary(
         self,
@@ -501,9 +497,7 @@ def test_evaluate_turn_with_valid_metrics(
 
         turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"]
 
-        results = processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should evaluate both metrics
         assert len(results) == 2
@@ -539,9 +533,7 @@ def test_evaluate_turn_with_invalid_metric(
         turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"]
 
         with caplog.at_level(logging.ERROR):
-            results = processor._evaluate_turn(  # pylint: disable=protected-access
-                conv_data, 0, turn_data, turn_metrics
-            )
+            results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should get 2 results: 1 ERROR for invalid metric, 1 PASS for valid metric
         assert len(results) == 2
@@ -584,9 +576,7 @@ def test_evaluate_turn_with_all_invalid_metrics(
         turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"]
 
         with caplog.at_level(logging.ERROR):
-            results = processor._evaluate_turn(  # pylint: disable=protected-access
-                conv_data, 0, turn_data, turn_metrics
-            )
+            results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should return ERROR results for both invalid metrics
         assert len(results) == 2
@@ -627,9 +617,7 @@ def test_evaluate_turn_with_mixed_valid_invalid_metrics(
         ]
 
         with caplog.at_level(logging.ERROR):
-            results = processor._evaluate_turn(  # pylint: disable=protected-access
-                conv_data, 0, turn_data, turn_metrics
-            )
+            results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should get 3 results: 2 valid metrics (PASS) and 1 invalid metric (ERROR)
         assert len(results) == 3
@@ -659,9 +647,7 @@ def test_evaluate_turn_with_empty_metrics(
 
         turn_metrics: list[str] = []
 
-        results = processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should return empty results
         assert len(results) == 0
@@ -683,9 +669,7 @@ def test_evaluate_turn_creates_correct_request(
 
         turn_metrics = ["ragas:faithfulness"]
 
-        processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Verify the request structure
         assert mock_metrics_evaluator.evaluate_metric.call_count == 1
@@ -714,9 +698,7 @@ def test_evaluate_turn_handles_evaluator_returning_none(
 
         turn_metrics = ["ragas:faithfulness"]
 
-        results = processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should return empty results when evaluator returns None
         assert len(results) == 0
@@ -740,9 +722,7 @@ def test_evaluate_turn_multiple_turns_correct_index(
         turn_metrics = ["ragas:faithfulness"]
 
         # Evaluate second turn (index 1)
-        processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 1, turn_data_2, turn_metrics
-        )
+        processor._evaluate_turn(conv_data, 1, turn_data_2, turn_metrics)
 
         # Verify correct turn index
         call_args = mock_metrics_evaluator.evaluate_metric.call_args[0][0]
@@ -766,9 +746,7 @@ def test_evaluate_turn_preserves_metric_order(
             "ragas:context_recall",
         ]
 
-        processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Verify metrics were evaluated in order
         assert mock_metrics_evaluator.evaluate_metric.call_count == 3
@@ -852,7 +830,7 @@ def _create(skip_on_failure: bool) -> ConfigLoader:
             (True, False, False),  # System enabled, conv disables
         ],
     )
-    def test_is_skip_on_failure_enabled(  # pylint: disable=too-many-arguments, too-many-positional-arguments
+    def test_is_skip_on_failure_enabled(
         self,
         config_loader_factory: Callable[[bool], ConfigLoader],
         processor_components: ProcessorComponents,
@@ -869,12 +847,7 @@ def test_is_skip_on_failure_enabled(  # pylint: disable=too-many-arguments, too-
         processor = ConversationProcessor(
            config_loader_factory(system_skip), processor_components
         )
-        assert (
-            processor._is_skip_on_failure_enabled(  # pylint: disable=protected-access
-                conv_data
-            )
-            is expected
-        )
+        assert processor._is_skip_on_failure_enabled(conv_data) is expected
 
     @pytest.mark.parametrize(
         "results_status,expected",
@@ -899,13 +872,10 @@ def test_has_failure(
             )
             for i, status in enumerate(results_status)
         ]
-        assert (
-            processor._has_failure(results)  # pylint: disable=protected-access
-            is expected
-        )
+        assert processor._has_failure(results) is expected
 
     @pytest.mark.parametrize("skip_enabled,expect_skip", [(True, True), (False, False)])
-    def test_skip_on_failure_behavior(  # pylint: disable=too-many-arguments, too-many-positional-arguments
+    def test_skip_on_failure_behavior(
         self,
         config_loader_factory: Callable[[bool], ConfigLoader],
         processor_components: ProcessorComponents,
diff --git a/tests/unit/runner/test_evaluation.py b/tests/unit/runner/test_evaluation.py
index a9a18fcf..8931f137 100644
--- a/tests/unit/runner/test_evaluation.py
+++ b/tests/unit/runner/test_evaluation.py
@@ -1,3 +1,5 @@
+# pylint: disable=unused-argument
+
 """Unit tests for runner/evaluation.py."""
 
 import argparse
@@ -28,7 +30,7 @@ class TestRunEvaluation:
     def test_run_evaluation_success(
         self,
         mocker: MockerFixture,
-        capsys: pytest.CaptureFixture,  # pylint: disable=unused-argument
+        capsys: pytest.CaptureFixture,
     ) -> None:
         """Test successful evaluation run."""
         # Mock ConfigLoader
@@ -98,7 +100,7 @@ def test_run_evaluation_success(
     def test_run_evaluation_with_output_dir_override(
         self,
         mocker: MockerFixture,
-        capsys: pytest.CaptureFixture,  # pylint: disable=unused-argument
+        capsys: pytest.CaptureFixture,
     ) -> None:
         """Test evaluation with custom output directory."""
         mock_loader = mocker.Mock()
@@ -261,7 +263,7 @@ def test_run_evaluation_with_errors_in_results(
     def test_run_evaluation_closes_pipeline_on_exception(
         self,
         mocker: MockerFixture,
-        capsys: pytest.CaptureFixture,  # pylint: disable=unused-argument
+        capsys: pytest.CaptureFixture,
     ) -> None:
         """Test pipeline is closed even if evaluation fails."""
         mock_loader = mocker.Mock()

From c3ebe9c341c8a243e420f44e239835611287ab0a Mon Sep 17 00:00:00 2001
From: Eva Micankova
Date: Thu, 29 Jan 2026 15:50:58 +0100
Subject: [PATCH 3/3] Removing unittest.mock

---
 tests/script/test_run_multi_provider_eval.py | 200 ++---
 tests/unit/core/metrics/test_geval.py        | 767 ++++++++++---------
 2 files changed, 504 insertions(+), 463 deletions(-)

diff --git a/tests/script/test_run_multi_provider_eval.py b/tests/script/test_run_multi_provider_eval.py
index a65b235a..0b271c62 100644
--- a/tests/script/test_run_multi_provider_eval.py
+++ b/tests/script/test_run_multi_provider_eval.py
@@ -6,13 +6,13 @@
 import json
 from pathlib import Path
 from typing import Any
-from unittest.mock import patch
 import tempfile as temp_module
 import logging
 import multiprocessing
 import shutil
 
 import pytest
+from pytest_mock import MockerFixture
 import yaml
 
 from script.run_multi_provider_eval import MultiProviderEvaluationRunner
@@ -390,6 +390,7 @@ def test_create_temp_config_file(
     def test_temp_config_cleanup_on_yaml_dump_failure(
         self,
         runner: MultiProviderEvaluationRunner,
+ mocker: MockerFixture, ) -> None: """Test that temp file is cleaned up when yaml.dump() fails.""" @@ -406,28 +407,26 @@ def track_temp_file(*args: Any, **kwargs: Any) -> Any: return temp_file # Mock NamedTemporaryFile to track the created file - with patch( + mocker.patch( "script.run_multi_provider_eval.tempfile.NamedTemporaryFile", side_effect=track_temp_file, - ): - # Mock yaml.dump to raise an exception - with patch( - "script.run_multi_provider_eval.yaml.dump", - side_effect=Exception("YAML dump failed"), - ): - with pytest.raises(Exception, match="YAML dump failed"): - runner._create_temp_system_config( - provider_id="openai", - model="gpt-4o-mini", - ) - - # Verify the temp file was cleaned up after the exception - assert ( - created_temp_path is not None - ), "Temp file should have been created" - assert ( - not created_temp_path.exists() - ), "Temp file should have been cleaned up" + ) + + # Mock yaml.dump to raise an exception + mocker.patch( + "script.run_multi_provider_eval.yaml.dump", + side_effect=Exception("YAML dump failed"), + ) + + with pytest.raises(Exception, match="YAML dump failed"): + runner._create_temp_system_config( + provider_id="openai", + model="gpt-4o-mini", + ) + + # Verify the temp file was cleaned up after the exception + assert created_temp_path is not None, "Temp file should have been created" + assert not created_temp_path.exists(), "Temp file should have been cleaned up" def test_temp_config_sanitizes_special_characters( self, runner: MultiProviderEvaluationRunner @@ -465,131 +464,134 @@ def runner( ) def test_path_traversal_blocked_in_provider_id( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test that path traversal in provider_id is sanitized.""" - with patch( + mocker.patch( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, - ): - # Attempt path traversal in provider_id - result = runner._run_single_evaluation( - provider_name="malicious", - provider_id="../../etc", - model="test", - ) + ) + + # Attempt path traversal in provider_id + result = runner._run_single_evaluation( + provider_name="malicious", + provider_id="../../etc", + model="test", + ) - # Verify that the output path is sanitized and stays within base - output_path = Path(result["output_dir"]) - base_path = runner.output_base.resolve() - assert output_path.resolve().is_relative_to(base_path) - # Verify dangerous characters are removed - assert ".." not in str(output_path) - assert "/" not in str(output_path.relative_to(base_path).parts[0]) + # Verify that the output path is sanitized and stays within base + output_path = Path(result["output_dir"]) + base_path = runner.output_base.resolve() + assert output_path.resolve().is_relative_to(base_path) + # Verify dangerous characters are removed + assert ".." 
not in str(output_path) + assert "/" not in str(output_path.relative_to(base_path).parts[0]) - # Cleanup - if output_path.exists(): - shutil.rmtree(output_path.parent, ignore_errors=True) + # Cleanup + if output_path.exists(): + shutil.rmtree(output_path.parent, ignore_errors=True) def test_path_traversal_blocked_in_model( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test that path traversal in model name is sanitized.""" - with patch( + mocker.patch( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, - ): - # Attempt path traversal in model - result = runner._run_single_evaluation( - provider_name="openai", - provider_id="openai", - model="../../../etc/passwd", - ) + ) - # Verify that the output path is sanitized and stays within base - output_path = Path(result["output_dir"]) - base_path = runner.output_base.resolve() - assert output_path.resolve().is_relative_to(base_path) - # Verify dangerous characters are removed - assert ".." not in str(output_path) + # Attempt path traversal in model + result = runner._run_single_evaluation( + provider_name="openai", + provider_id="openai", + model="../../../etc/passwd", + ) - # Cleanup - if output_path.exists(): - shutil.rmtree(output_path.parent.parent, ignore_errors=True) + # Verify that the output path is sanitized and stays within base + output_path = Path(result["output_dir"]) + base_path = runner.output_base.resolve() + assert output_path.resolve().is_relative_to(base_path) + # Verify dangerous characters are removed + assert ".." not in str(output_path) + + # Cleanup + if output_path.exists(): + shutil.rmtree(output_path.parent.parent, ignore_errors=True) class TestRunSingleEvaluation: """Tests for _run_single_evaluation method.""" def test_run_single_evaluation_success( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test successful single evaluation.""" # Mock run_evaluation to return a successful summary - with patch( + mock_run_eval = mocker.patch( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2, "ERROR": 0}, - ) as mock_run_eval: - result = runner._run_single_evaluation( - provider_name="openai", - provider_id="openai", - model="gpt-4o-mini", - ) + ) + + result = runner._run_single_evaluation( + provider_name="openai", + provider_id="openai", + model="gpt-4o-mini", + ) - assert result["success"] is True - assert result["provider_id"] == "openai" - assert result["model"] == "gpt-4o-mini" - assert result["summary"]["PASS"] == 5 - assert result["error"] is None - assert "duration_seconds" in result - mock_run_eval.assert_called_once() + assert result["success"] is True + assert result["provider_id"] == "openai" + assert result["model"] == "gpt-4o-mini" + assert result["summary"]["PASS"] == 5 + assert result["error"] is None + assert "duration_seconds" in result + mock_run_eval.assert_called_once() def test_run_single_evaluation_failure( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test evaluation failure handling.""" # Mock run_evaluation to return None (failure) - with patch("script.run_multi_provider_eval.run_evaluation", return_value=None): - result = runner._run_single_evaluation( - provider_name="openai", - provider_id="openai", - model="gpt-4o-mini", - ) + 
mocker.patch("script.run_multi_provider_eval.run_evaluation", return_value=None) + result = runner._run_single_evaluation( + provider_name="openai", + provider_id="openai", + model="gpt-4o-mini", + ) - assert result["success"] is False - assert result["error"] == "Evaluation returned None (failed)" + assert result["success"] is False + assert result["error"] == "Evaluation returned None (failed)" def test_run_single_evaluation_invalid_summary( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test evaluation with invalid summary structure.""" # Mock run_evaluation to return a summary missing required keys - with patch( + mocker.patch( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2}, # Missing ERROR key - ): - result = runner._run_single_evaluation( - provider_name="openai", - provider_id="openai", - model="gpt-4o-mini", - ) + ) + result = runner._run_single_evaluation( + provider_name="openai", + provider_id="openai", + model="gpt-4o-mini", + ) - assert result["success"] is False - assert "Invalid summary structure" in result["error"] - assert "summary" not in result + assert result["success"] is False + assert "Invalid summary structure" in result["error"] + assert "summary" not in result class TestRunEvaluations: """Tests for run_evaluations method.""" def test_run_evaluations_sequential( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test sequential evaluation execution.""" # Force sequential mode runner.max_workers = 1 - with patch.object( + mock_single_eval = mocker.patch.object( runner, "_run_single_evaluation", return_value={ @@ -597,11 +599,11 @@ def test_run_evaluations_sequential( "provider_id": "test", "model": "test-model", }, - ) as mock_single_eval: - results = runner.run_evaluations() + ) + results = runner.run_evaluations() - assert len(results) == 3 # 2 openai + 1 watsonx - assert mock_single_eval.call_count == 3 + assert len(results) == 3 # 2 openai + 1 watsonx + assert mock_single_eval.call_count == 3 class TestGenerateSummary: diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py index d228b9f4..6247b2a5 100644 --- a/tests/unit/core/metrics/test_geval.py +++ b/tests/unit/core/metrics/test_geval.py @@ -1,10 +1,10 @@ # pylint: disable=too-many-public-methods,protected-access """Tests for GEval metrics handler.""" - -from unittest.mock import MagicMock, patch +from typing import Any import pytest +from pytest_mock import MockerFixture from deepeval.test_case import LLMTestCaseParams from lightspeed_evaluation.core.metrics.geval import GEvalHandler @@ -15,22 +15,20 @@ class TestGEvalHandler: """Test cases for GEvalHandler class.""" @pytest.fixture - def mock_llm_manager(self) -> MagicMock: + def mock_llm_manager(self, mocker: MockerFixture) -> Any: """Create a mock DeepEvalLLMManager.""" - mock_manager = MagicMock() - mock_llm = MagicMock() + mock_manager = mocker.MagicMock() + mock_llm = mocker.MagicMock() mock_manager.get_llm.return_value = mock_llm return mock_manager @pytest.fixture - def mock_metric_manager(self) -> MagicMock: + def mock_metric_manager(self, mocker: MockerFixture) -> Any: """Create a mock MetricManager.""" - return MagicMock() + return mocker.MagicMock() @pytest.fixture - def handler( - self, mock_llm_manager: MagicMock, mock_metric_manager: MagicMock - ) -> GEvalHandler: + def handler(self, mock_llm_manager: Any, 
mock_metric_manager: Any) -> GEvalHandler: """Create a GEvalHandler instance with mocked dependencies.""" return GEvalHandler( deepeval_llm_manager=mock_llm_manager, @@ -38,7 +36,7 @@ def handler( ) def test_initialization( - self, mock_llm_manager: MagicMock, mock_metric_manager: MagicMock + self, mock_llm_manager: Any, mock_metric_manager: Any ) -> None: """Test GEvalHandler initialization with required dependencies.""" handler = GEvalHandler( @@ -114,7 +112,10 @@ def test_convert_evaluation_params_mixed_invalid_returns_none( assert result is None def test_get_geval_config_uses_metric_manager( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that _get_geval_config delegates to MetricManager.""" expected_config = { @@ -124,7 +125,7 @@ def test_get_geval_config_uses_metric_manager( } mock_metric_manager.get_metric_metadata.return_value = expected_config - conv_data = MagicMock() + conv_data = mocker.MagicMock() config = handler._get_geval_config( metric_name="test_metric", conv_data=conv_data, @@ -141,14 +142,17 @@ def test_get_geval_config_uses_metric_manager( ) def test_get_geval_config_turn_level( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test retrieving turn-level config uses correct MetricLevel.""" expected_config = {"criteria": "Turn criteria", "threshold": 0.9} mock_metric_manager.get_metric_metadata.return_value = expected_config - conv_data = MagicMock() - turn_data = MagicMock() + conv_data = mocker.MagicMock() + turn_data = mocker.MagicMock() config = handler._get_geval_config( metric_name="turn_metric", @@ -166,12 +170,15 @@ def test_get_geval_config_turn_level( ) def test_get_geval_config_returns_none_when_not_found( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that None is returned when MetricManager finds no config.""" mock_metric_manager.get_metric_metadata.return_value = None - conv_data = MagicMock() + conv_data = mocker.MagicMock() config = handler._get_geval_config( metric_name="nonexistent_metric", conv_data=conv_data, @@ -182,12 +189,15 @@ def test_get_geval_config_returns_none_when_not_found( assert config is None def test_evaluate_missing_config( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that evaluate returns error when config is not found.""" mock_metric_manager.get_metric_metadata.return_value = None - conv_data = MagicMock() + conv_data = mocker.MagicMock() score, reason = handler.evaluate( metric_name="nonexistent", conv_data=conv_data, @@ -200,7 +210,10 @@ def test_evaluate_missing_config( assert "configuration not found" in reason.lower() def test_evaluate_missing_criteria( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that evaluate requires 'criteria' in config.""" mock_metric_manager.get_metric_metadata.return_value = { @@ -209,7 +222,7 @@ def test_evaluate_missing_criteria( # Missing 'criteria' } - conv_data = MagicMock() + conv_data = mocker.MagicMock() score, reason = handler.evaluate( metric_name="test_metric", conv_data=conv_data, @@ -222,14 +235,17 @@ def 
test_evaluate_missing_criteria( assert "criteria" in reason.lower() def test_evaluate_turn_missing_turn_data( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that turn-level evaluation requires turn_data.""" mock_metric_manager.get_metric_metadata.return_value = { "criteria": "Test criteria" } - conv_data = MagicMock() + conv_data = mocker.MagicMock() score, reason = handler.evaluate( metric_name="test_metric", conv_data=conv_data, @@ -242,389 +258,412 @@ def test_evaluate_turn_missing_turn_data( assert "turn data required" in reason.lower() def test_evaluate_turn_success( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test successful turn-level evaluation.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - # Mock GEval metric instance - mock_metric = MagicMock() - mock_metric.score = 0.85 - mock_metric.reason = "Test passed" - mock_geval_class.return_value = mock_metric - - # Setup metric manager to return config - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "evaluation_params": ["query", "response"], - "evaluation_steps": ["Step 1", "Step 2"], - "threshold": 0.7, - } - - # Mock turn data - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = None - turn_data.contexts = None - - conv_data = MagicMock() - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - assert score == 0.85 - assert reason == "Test passed" - mock_metric.measure.assert_called_once() + ) + # Mock GEval metric instance + mock_metric = mocker.MagicMock() + mock_metric.score = 0.85 + mock_metric.reason = "Test passed" + mock_geval_class.return_value = mock_metric + + # Setup metric manager to return config + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "evaluation_params": ["query", "response"], + "evaluation_steps": ["Step 1", "Step 2"], + "threshold": 0.7, + } + + # Mock turn data + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = None + turn_data.contexts = None + + conv_data = mocker.MagicMock() + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + assert score == 0.85 + assert reason == "Test passed" + mock_metric.measure.assert_called_once() def test_evaluate_turn_with_optional_fields( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test turn-level evaluation includes optional fields when present.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - with patch( - "lightspeed_evaluation.core.metrics.geval.LLMTestCase" - ) as mock_test_case_class: - mock_metric = MagicMock() - mock_metric.score = 0.75 - mock_metric.reason = "Good match" - mock_geval_class.return_value = mock_metric - - mock_test_case = MagicMock() - mock_test_case_class.return_value = mock_test_case - - # 
Setup metric manager - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Compare against expected", - "evaluation_params": ["query", "response", "expected_response"], - "threshold": 0.7, - } - - # Mock turn data with all optional fields - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = "Expected response" - turn_data.contexts = ["Context 1", "Context 2"] - - conv_data = MagicMock() - - handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - # Verify test case was created with optional fields - call_kwargs = mock_test_case_class.call_args[1] - assert call_kwargs["input"] == "Test query" - assert call_kwargs["actual_output"] == "Test response" - assert call_kwargs["expected_output"] == "Expected response" - assert call_kwargs["context"] == ["Context 1", "Context 2"] + ) + mock_test_case_class = mocker.patch( + "lightspeed_evaluation.core.metrics.geval.LLMTestCase" + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.75 + mock_metric.reason = "Good match" + mock_geval_class.return_value = mock_metric + + mock_test_case = mocker.MagicMock() + mock_test_case_class.return_value = mock_test_case + + # Setup metric manager + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Compare against expected", + "evaluation_params": ["query", "response", "expected_response"], + "threshold": 0.7, + } + + # Mock turn data with all optional fields + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = "Expected response" + turn_data.contexts = ["Context 1", "Context 2"] + + conv_data = mocker.MagicMock() + + handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + # Verify test case was created with optional fields + call_kwargs = mock_test_case_class.call_args[1] + assert call_kwargs["input"] == "Test query" + assert call_kwargs["actual_output"] == "Test response" + assert call_kwargs["expected_output"] == "Expected response" + assert call_kwargs["context"] == ["Context 1", "Context 2"] def test_evaluate_turn_none_score_returns_zero( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that None score from metric is converted to 0.0.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.score = None - mock_metric.reason = "Could not evaluate" - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "threshold": 0.7, - } - - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = None - turn_data.contexts = None - - conv_data = MagicMock() - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - # Should return 0.0 when score is None - assert score == 0.0 - assert reason == "Could not evaluate" + ) + mock_metric = mocker.MagicMock() + mock_metric.score = None + mock_metric.reason = "Could not evaluate" + mock_geval_class.return_value = mock_metric + + 
mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "threshold": 0.7, + } + + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = None + turn_data.contexts = None + + conv_data = mocker.MagicMock() + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + # Should return 0.0 when score is None + assert score == 0.0 + assert reason == "Could not evaluate" def test_evaluate_turn_handles_exceptions( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that turn evaluation handles exceptions gracefully.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.measure.side_effect = ValueError("Test error") - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "threshold": 0.7, - } - - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = None - turn_data.contexts = None - - conv_data = MagicMock() - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - assert score is None - assert "evaluation error" in reason.lower() - assert "Test error" in reason + ) + mock_metric = mocker.MagicMock() + mock_metric.measure.side_effect = ValueError("Test error") + mock_geval_class.return_value = mock_metric + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "threshold": 0.7, + } + + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = None + turn_data.contexts = None + + conv_data = mocker.MagicMock() + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + assert score is None + assert "evaluation error" in reason.lower() + assert "Test error" in reason def test_evaluate_turn_uses_default_params_when_none_provided( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that default evaluation_params are used when none provided.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.score = 0.8 - mock_metric.reason = "Good" - mock_geval_class.return_value = mock_metric - - # Config with no evaluation_params - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "threshold": 0.7, - } - - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = None - turn_data.contexts = None - - conv_data = MagicMock() - - handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - # Verify GEval was called with default params - call_kwargs = mock_geval_class.call_args[1] - assert 
call_kwargs["evaluation_params"] == [ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ] + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.8 + mock_metric.reason = "Good" + mock_geval_class.return_value = mock_metric + + # Config with no evaluation_params + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "threshold": 0.7, + } + + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = None + turn_data.contexts = None + + conv_data = mocker.MagicMock() + + handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + # Verify GEval was called with default params + call_kwargs = mock_geval_class.call_args[1] + assert call_kwargs["evaluation_params"] == [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] def test_evaluate_conversation_success( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test successful conversation-level evaluation.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.score = 0.90 - mock_metric.reason = "Conversation coherent" - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Conversation criteria", - "evaluation_params": ["query", "response"], - "threshold": 0.6, - } - - # Mock conversation data with multiple turns - turn1 = MagicMock() - turn1.query = "Query 1" - turn1.response = "Response 1" - - turn2 = MagicMock() - turn2.query = "Query 2" - turn2.response = "Response 2" - - conv_data = MagicMock() - conv_data.turns = [turn1, turn2] - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=None, - turn_data=None, - is_conversation=True, - ) - - assert score == 0.90 - assert reason == "Conversation coherent" - mock_metric.measure.assert_called_once() + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.90 + mock_metric.reason = "Conversation coherent" + mock_geval_class.return_value = mock_metric + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Conversation criteria", + "evaluation_params": ["query", "response"], + "threshold": 0.6, + } + + # Mock conversation data with multiple turns + turn1 = mocker.MagicMock() + turn1.query = "Query 1" + turn1.response = "Response 1" + + turn2 = mocker.MagicMock() + turn2.query = "Query 2" + turn2.response = "Response 2" + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1, turn2] + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) + + assert score == 0.90 + assert reason == "Conversation coherent" + mock_metric.measure.assert_called_once() def test_evaluate_conversation_aggregates_turns( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that conversation evaluation properly aggregates turn data.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - with patch( - "lightspeed_evaluation.core.metrics.geval.LLMTestCase" - ) 
as mock_test_case_class: - mock_metric = MagicMock() - mock_metric.score = 0.85 - mock_metric.reason = "Good conversation" - mock_geval_class.return_value = mock_metric - - mock_test_case = MagicMock() - mock_test_case_class.return_value = mock_test_case - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Conversation flow", - "threshold": 0.7, - } - - # Create multiple turns including one with None response - turn1 = MagicMock() - turn1.query = "First question" - turn1.response = "First answer" - - turn2 = MagicMock() - turn2.query = "Second question" - turn2.response = "Second answer" - - turn3 = MagicMock() - turn3.query = "Third question" - turn3.response = None # Test None response handling - - conv_data = MagicMock() - conv_data.turns = [turn1, turn2, turn3] - - handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=None, - turn_data=None, - is_conversation=True, - ) - - # Verify test case was created with aggregated input/output - call_kwargs = mock_test_case_class.call_args[1] - assert "Turn 1 - User: First question" in call_kwargs["input"] - assert "Turn 2 - User: Second question" in call_kwargs["input"] - assert "Turn 3 - User: Third question" in call_kwargs["input"] - assert ( - "Turn 1 - Assistant: First answer" in call_kwargs["actual_output"] - ) - assert ( - "Turn 2 - Assistant: Second answer" in call_kwargs["actual_output"] - ) - assert "Turn 3 - Assistant:" in call_kwargs["actual_output"] + ) + mock_test_case_class = mocker.patch( + "lightspeed_evaluation.core.metrics.geval.LLMTestCase" + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.85 + mock_metric.reason = "Good conversation" + mock_geval_class.return_value = mock_metric + + mock_test_case = mocker.MagicMock() + mock_test_case_class.return_value = mock_test_case + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Conversation flow", + "threshold": 0.7, + } + + # Create multiple turns including one with None response + turn1 = mocker.MagicMock() + turn1.query = "First question" + turn1.response = "First answer" + + turn2 = mocker.MagicMock() + turn2.query = "Second question" + turn2.response = "Second answer" + + turn3 = mocker.MagicMock() + turn3.query = "Third question" + turn3.response = None # Test None response handling + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1, turn2, turn3] + + handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) + + # Verify test case was created with aggregated input/output + call_kwargs = mock_test_case_class.call_args[1] + assert "Turn 1 - User: First question" in call_kwargs["input"] + assert "Turn 2 - User: Second question" in call_kwargs["input"] + assert "Turn 3 - User: Third question" in call_kwargs["input"] + assert "Turn 1 - Assistant: First answer" in call_kwargs["actual_output"] + assert "Turn 2 - Assistant: Second answer" in call_kwargs["actual_output"] + assert "Turn 3 - Assistant:" in call_kwargs["actual_output"] def test_evaluate_conversation_with_evaluation_steps( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that evaluation_steps are passed to GEval when provided.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.score = 0.88 - mock_metric.reason = "Follows 
steps" - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Multi-step evaluation", - "evaluation_params": ["query", "response"], - "evaluation_steps": [ - "Check coherence", - "Verify context", - "Assess relevance", - ], - "threshold": 0.7, - } - - turn1 = MagicMock() - turn1.query = "Query 1" - turn1.response = "Response 1" - - conv_data = MagicMock() - conv_data.turns = [turn1] - - handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=None, - turn_data=None, - is_conversation=True, - ) - - # Verify evaluation_steps were passed to GEval - call_kwargs = mock_geval_class.call_args[1] - assert call_kwargs["evaluation_steps"] == [ + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.88 + mock_metric.reason = "Follows steps" + mock_geval_class.return_value = mock_metric + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Multi-step evaluation", + "evaluation_params": ["query", "response"], + "evaluation_steps": [ "Check coherence", "Verify context", "Assess relevance", - ] + ], + "threshold": 0.7, + } + + turn1 = mocker.MagicMock() + turn1.query = "Query 1" + turn1.response = "Response 1" + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1] + + handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) + + # Verify evaluation_steps were passed to GEval + call_kwargs = mock_geval_class.call_args[1] + assert call_kwargs["evaluation_steps"] == [ + "Check coherence", + "Verify context", + "Assess relevance", + ] def test_evaluate_conversation_handles_exceptions( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that conversation evaluation handles exceptions gracefully.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.measure.side_effect = RuntimeError("API error") - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "threshold": 0.7, - } - - turn1 = MagicMock() - turn1.query = "Query 1" - turn1.response = "Response 1" - - conv_data = MagicMock() - conv_data.turns = [turn1] - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=None, - turn_data=None, - is_conversation=True, - ) - - assert score is None - assert "evaluation error" in reason.lower() - assert "API error" in reason + ) + mock_metric = mocker.MagicMock() + mock_metric.measure.side_effect = RuntimeError("API error") + mock_geval_class.return_value = mock_metric + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "threshold": 0.7, + } + + turn1 = mocker.MagicMock() + turn1.query = "Query 1" + turn1.response = "Response 1" + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1] + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) + + assert score is None + assert "evaluation error" in reason.lower() + assert "API error" in reason