6 changes: 3 additions & 3 deletions Makefile
@@ -39,7 +39,7 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed
uv sync --group dev

check-types: ## Checks type hints in sources
- uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/
+ uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests

black-check:
uv run black . --check
@@ -73,10 +73,10 @@ help: ## Show this help screen

pylint:
uv run pylint src
- uv run pylint lsc_agent_eval/src
+ uv run pylint --disable=R0801 lsc_agent_eval/src tests

pyright:
- uv run pyright src lsc_agent_eval/src
+ uv run pyright src lsc_agent_eval/src tests

docstyle:
uv run pydocstyle -v .
1 change: 1 addition & 0 deletions pyproject.toml
@@ -89,6 +89,7 @@ warn_required_dynamic_aliases = true

[tool.pylint.MASTER]
load-plugins = ["pylint_pydantic"]
+ init-hook = "import sys; sys.path.append('.')"

[tool.ruff]
[tool.ruff.lint.flake8-tidy-imports]
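The added init-hook is equivalent to running the following before pylint starts analyzing files; it is what lets pylint resolve first-party imports such as script.* when the Makefile invokes it from the repository root (a restatement of the hook above, not new behavior):

import sys

# Same effect as the init-hook: put the repository root on the import path so
# pylint can resolve the "script" and "tests" packages during analysis.
sys.path.append(".")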
12 changes: 12 additions & 0 deletions pyrightconfig.json
@@ -0,0 +1,12 @@
{
"reportAttributeAccessIssue": "warning",
"executionEnvironments": [
{
"root": "tests",
"reportAttributeAccessIssue": "none",
"extraPaths": [
"."
]
}
]
}
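For illustration, a hypothetical test pattern that the relaxed tests/ execution environment is presumably meant to allow; the class and test names below are made up, and whether pyright flags this exact code in the repository is an assumption. Assigning and reading an attribute that is not declared on a class normally trips reportAttributeAccessIssue, which is set to "none" under tests/:

class FakeResult:
    """Hypothetical minimal container used only in a test."""

    def __init__(self) -> None:
        self.score = 0.9


def test_dynamic_attribute_access() -> None:
    """Attach an attribute that is not declared on the class."""
    result = FakeResult()
    result.note = "set only inside this test"  # undeclared attribute
    assert result.note == "set only inside this test"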
1 change: 1 addition & 0 deletions script/__init__.py
@@ -0,0 +1 @@
"""Script utilities for lightspeed-evaluation."""
2 changes: 1 addition & 1 deletion script/compare_evaluations.py
@@ -421,7 +421,7 @@ def _check_confidence_interval_overlap(
Returns:
Dictionary containing overlap test results
"""
- result = {
+ result: dict[str, Any] = {
"test_performed": False,
"intervals_overlap": None,
"significant": None,
8 changes: 4 additions & 4 deletions script/run_multi_provider_eval.py
@@ -318,7 +318,7 @@ def _create_provider_model_configs(self) -> list[dict[str, Any]]:
Returns:
List of dictionaries with provider, model, and settings
"""
- configs = []
+ configs: list[dict[str, Any]] = []

# Get providers from the config
providers = self.providers_config.get("providers", {})
@@ -781,7 +781,7 @@ def _analyze_single_model(

# Calculate score statistics
if all_scores:
- score_stats = {
+ score_stats: dict[str, Any] = {
"mean": float(np.mean(all_scores)),
"median": float(np.median(all_scores)),
"std": float(np.std(all_scores)),
@@ -818,10 +818,10 @@ def _analyze_single_model(
logger.warning(
"scipy not available, skipping confidence interval calculation"
)
score_stats["confidence_interval"] = None
score_stats["confidence_interval"] = None # type: ignore[assignment]
else:
# Single score - no confidence interval
score_stats["confidence_interval"] = None
score_stats["confidence_interval"] = None # type: ignore[assignment]
else:
score_stats = {
"mean": 0.0,
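The two # type: ignore[assignment] comments above presumably work around the value type mypy infers or requires for the stats dictionary. A hypothetical alternative, not what this PR does, is to give the container an explicitly optional field, for example via a TypedDict, so assigning None needs no suppression; every name below is illustrative:

from typing import Optional, TypedDict


class ScoreStatsSketch(TypedDict, total=False):
    """Illustrative stats container with an explicitly optional confidence interval."""

    mean: float
    median: float
    std: float
    count: int
    confidence_interval: Optional[dict[str, float]]


stats: ScoreStatsSketch = {"mean": 0.0}
stats["confidence_interval"] = None  # accepted without a suppression comment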
9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,9 @@
"""Pytest configuration and fixtures for lightspeed-evaluation tests."""

import sys
from pathlib import Path

# Add project root to Python path so we can import from script directory
project_root = Path(__file__).parent.parent
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
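With the repository root on sys.path, test modules can import first-party code from the script/ directory directly; a minimal hypothetical example (the test module and assertion are illustrative; the import mirrors the one in tests/script/conftest.py below):

# Hypothetical file: tests/script/test_imports.py
from script.run_multi_provider_eval import MultiProviderEvaluationRunner


def test_script_package_importable() -> None:
    """First-party script imports resolve because conftest.py extended sys.path."""
    assert MultiProviderEvaluationRunner is not None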
214 changes: 214 additions & 0 deletions tests/script/conftest.py
@@ -0,0 +1,214 @@
# pylint: disable=redefined-outer-name
Review comment (Contributor):

⚠️ Potential issue | 🟠 Major

Remove the pylint suppression for redefined-outer-name.

Line 1 disables a lint rule; please fix the underlying naming issue instead of suppressing it.
As per coding guidelines: Do not disable lint warnings with # noqa, # type: ignore, or # pylint: disable comments - fix the underlying issue instead.

🤖 Prompt for AI Agents
In `@tests/script/conftest.py` at line 1, Remove the top-line pylint suppression
for redefined-outer-name and fix the underlying naming conflicts in conftest.py:
locate any fixture or helper function names in conftest.py that shadow
outer/module-level names (e.g., fixture functions or variables with the same
identifier as imports or globals), rename those functions/fixtures to unique
descriptive names, update all test imports/uses to the new names, and ensure no
identifiers in conftest.py duplicate outer scope names so the
redefined-outer-name lint is no longer triggered.
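
One way to satisfy this without the suppression is pytest's name= argument, so the fixture function's own name no longer matches the parameter name that tests and other fixtures request. A rough sketch under that assumption, reusing the runner fixture defined further down in this file (MultiProviderEvaluationRunner and temp_config_files come from this conftest; the function name fixture_runner is illustrative, and the same pattern would have to be applied to the other fixtures requested by name):

import pytest


# Sketch only: tests still request "runner", but the module-level function is
# named fixture_runner, so parameters named "runner" no longer shadow it.
@pytest.fixture(name="runner")
def fixture_runner(temp_config_files: dict) -> MultiProviderEvaluationRunner:
    """Create a MultiProviderEvaluationRunner instance for testing."""
    return MultiProviderEvaluationRunner(
        providers_config_path=str(temp_config_files["providers_config"]),
        system_config_path=str(temp_config_files["system_config"]),
        eval_data_path=str(temp_config_files["eval_data"]),
    )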


"""Pytest configuration and fixtures for script tests."""

from pathlib import Path
from typing import Any

import pytest
import yaml

from script.run_multi_provider_eval import MultiProviderEvaluationRunner


@pytest.fixture
def script_path() -> Path:
"""Return the path to the compare_evaluations.py script."""
# Test is in tests/script/, script is in project_root/script/
return Path(__file__).parent.parent.parent / "script" / "compare_evaluations.py"


@pytest.fixture
def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
"""Return sample evaluation data for testing."""
sample_results1 = [
{
"conversation_group_id": "conv1",
"turn_id": "1",
"metric_identifier": "ragas:faithfulness",
"result": "PASS",
"score": 0.8,
"threshold": 0.7,
"execution_time": 1.0,
},
{
"conversation_group_id": "conv1",
"turn_id": "2",
"metric_identifier": "ragas:faithfulness",
"result": "PASS",
"score": 0.9,
"threshold": 0.7,
"execution_time": 1.2,
},
]

sample_results2 = [
{
"conversation_group_id": "conv1",
"turn_id": "1",
"metric_identifier": "ragas:faithfulness",
"result": "PASS",
"score": 0.85,
"threshold": 0.7,
"execution_time": 1.1,
},
{
"conversation_group_id": "conv1",
"turn_id": "2",
"metric_identifier": "ragas:faithfulness",
"result": "FAIL",
"score": 0.6,
"threshold": 0.7,
"execution_time": 1.0,
},
]

return sample_results1, sample_results2


@pytest.fixture
def temp_config_files(tmp_path: Path) -> dict:
"""Create temporary configuration files for testing."""
# Create multi_eval_config.yaml
providers_config = {
"providers": {
"openai": {
"models": ["gpt-4o-mini", "gpt-4-turbo"],
},
"watsonx": {
"models": ["ibm/granite-13b-chat-v2"],
},
},
"settings": {"output_base": str(tmp_path / "eval_output")},
}
providers_path = tmp_path / "multi_eval_config.yaml"
with open(providers_path, "w", encoding="utf-8") as f:
yaml.dump(providers_config, f)

# Create system.yaml
system_config = {
"llm": {
"provider": "openai",
"model": "gpt-4o-mini",
"temperature": 0.0,
},
"api": {"enabled": False},
"output": {"output_dir": "./eval_output"},
}
system_path = tmp_path / "system.yaml"
with open(system_path, "w", encoding="utf-8") as f:
yaml.dump(system_config, f)

# Create evaluation_data.yaml
eval_data = [
{
"conversation_group_id": "test_conv",
"turns": [
{
"turn_id": "turn_1",
"query": "Test query",
"response": "Test response",
"contexts": ["Context 1"],
"expected_response": "Expected",
"turn_metrics": ["ragas:response_relevancy"],
}
],
}
]
eval_path = tmp_path / "evaluation_data.yaml"
with open(eval_path, "w", encoding="utf-8") as f:
yaml.dump(eval_data, f)

return {
"providers_config": providers_path,
"system_config": system_path,
"eval_data": eval_path,
"output_dir": tmp_path / "eval_output",
}


@pytest.fixture
def runner(
temp_config_files: dict,
) -> MultiProviderEvaluationRunner:
"""Create a MultiProviderEvaluationRunner instance for testing."""
return MultiProviderEvaluationRunner(
providers_config_path=str(temp_config_files["providers_config"]),
system_config_path=str(temp_config_files["system_config"]),
eval_data_path=str(temp_config_files["eval_data"]),
)


@pytest.fixture
def sample_evaluation_summary() -> dict[str, Any]:
"""Create a sample evaluation summary JSON for testing analysis."""
return {
"timestamp": "2025-01-01T12:00:00",
"total_evaluations": 10,
"summary_stats": {
"overall": {
"TOTAL": 10,
"PASS": 8,
"FAIL": 2,
"ERROR": 0,
"pass_rate": 80.0, # Percentage format
"fail_rate": 20.0,
"error_rate": 0.0,
},
"by_metric": {
"ragas:faithfulness": {
"pass": 4,
"fail": 0,
"error": 0,
"pass_rate": 100.0,
"fail_rate": 0.0,
"error_rate": 0.0,
"score_statistics": {
"mean": 0.95,
"median": 0.95,
"std": 0.02,
"min": 0.92,
"max": 0.98,
"count": 4,
},
},
"ragas:response_relevancy": {
"pass": 4,
"fail": 2,
"error": 0,
"pass_rate": 66.67,
"fail_rate": 33.33,
"error_rate": 0.0,
"score_statistics": {
"mean": 0.75,
"median": 0.78,
"std": 0.12,
"min": 0.55,
"max": 0.88,
"count": 6,
},
},
},
},
"results": [
{
"conversation_group_id": "conv1",
"turn_id": "turn1",
"metric_identifier": "ragas:faithfulness",
"result": "PASS",
"score": 0.95,
"threshold": 0.8,
"execution_time": 1.0,
},
{
"conversation_group_id": "conv1",
"turn_id": "turn2",
"metric_identifier": "ragas:response_relevancy",
"result": "PASS",
"score": 0.85,
"threshold": 0.7,
"execution_time": 1.2,
},
]
* 5, # Repeat to get 10 results
}