From 979fd8823fde27d2fa975c80c75356d41be3bc03 Mon Sep 17 00:00:00 2001 From: Eva Micankova Date: Wed, 28 Jan 2026 16:29:49 +0100 Subject: [PATCH 1/3] Enhance test quality --- Makefile | 6 +- pyproject.toml | 1 + pyrightconfig.json | 12 + script/__init__.py | 1 + script/compare_evaluations.py | 2 +- script/run_multi_provider_eval.py | 8 +- tests/conftest.py | 9 + tests/script/conftest.py | 212 ++++++++ tests/script/test_compare_evaluations.py | 232 ++++----- tests/script/test_run_multi_provider_eval.py | 398 ++++++--------- tests/unit/core/api/conftest.py | 42 ++ tests/unit/core/api/test_client.py | 159 +++--- tests/unit/core/api/test_streaming_parser.py | 101 ++-- tests/unit/core/config/test_models.py | 64 ++- tests/unit/core/llm/conftest.py | 29 ++ tests/unit/core/llm/test_custom.py | 19 +- tests/unit/core/llm/test_deepeval_manager.py | 30 +- tests/unit/core/llm/test_llm_manager.py | 65 +-- tests/unit/core/llm/test_manager.py | 26 +- tests/unit/core/metrics/conftest.py | 142 ++++++ tests/unit/core/metrics/custom/test_custom.py | 15 +- .../core/metrics/custom/test_tool_eval.py | 90 ++-- tests/unit/core/metrics/test_geval.py | 127 +++-- tests/unit/core/metrics/test_keywords_eval.py | 22 +- tests/unit/core/metrics/test_manager.py | 124 +++-- tests/unit/core/metrics/test_nlp.py | 170 ++----- tests/unit/core/models/test_api_additional.py | 49 +- tests/unit/core/models/test_data.py | 144 ++++-- .../core/models/test_system_additional.py | 56 ++- tests/unit/core/output/conftest.py | 95 ++++ tests/unit/core/output/test_final_coverage.py | 30 +- tests/unit/core/output/test_generator.py | 228 +++++---- tests/unit/core/output/test_statistics.py | 149 +++--- tests/unit/core/script/test_manager.py | 23 +- .../core/script/test_manager_additional.py | 37 +- tests/unit/core/system/test_env_validator.py | 59 ++- tests/unit/core/system/test_lazy_import.py | 12 +- tests/unit/core/system/test_loader.py | 34 +- tests/unit/core/system/test_setup.py | 45 +- tests/unit/core/system/test_ssl_certifi.py | 42 +- tests/unit/core/system/test_validator.py | 162 ++++-- tests/unit/pipeline/evaluation/conftest.py | 223 ++++++++ .../unit/pipeline/evaluation/test_amender.py | 30 +- tests/unit/pipeline/evaluation/test_errors.py | 20 +- .../pipeline/evaluation/test_evaluator.py | 268 +++++----- .../unit/pipeline/evaluation/test_pipeline.py | 88 ++-- .../pipeline/evaluation/test_processor.py | 476 ++++++++---------- tests/unit/runner/test_evaluation.py | 56 ++- 48 files changed, 2671 insertions(+), 1761 deletions(-) create mode 100644 pyrightconfig.json create mode 100644 script/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/script/conftest.py create mode 100644 tests/unit/core/api/conftest.py create mode 100644 tests/unit/core/llm/conftest.py create mode 100644 tests/unit/core/metrics/conftest.py create mode 100644 tests/unit/core/output/conftest.py create mode 100644 tests/unit/pipeline/evaluation/conftest.py diff --git a/Makefile b/Makefile index 135f0b7f..f99bea30 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,7 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed uv sync --group dev check-types: ## Checks type hints in sources - uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ + uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests black-check: uv run black . 
--check @@ -73,10 +73,10 @@ help: ## Show this help screen pylint: uv run pylint src - uv run pylint lsc_agent_eval/src + uv run pylint --disable=R0801 lsc_agent_eval/src tests pyright: - uv run pyright src lsc_agent_eval/src + uv run pyright src lsc_agent_eval/src tests docstyle: uv run pydocstyle -v . diff --git a/pyproject.toml b/pyproject.toml index 71c12676..d9efb241 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,7 @@ warn_required_dynamic_aliases = true [tool.pylint.MASTER] load-plugins = ["pylint_pydantic"] +init-hook = "import sys; sys.path.append('.')" [tool.ruff] [tool.ruff.lint.flake8-tidy-imports] diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 00000000..22193a37 --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,12 @@ +{ + "reportAttributeAccessIssue": "warning", + "executionEnvironments": [ + { + "root": "tests", + "reportAttributeAccessIssue": "none", + "extraPaths": [ + "." + ] + } + ] +} diff --git a/script/__init__.py b/script/__init__.py new file mode 100644 index 00000000..d6eec20f --- /dev/null +++ b/script/__init__.py @@ -0,0 +1 @@ +"""Script utilities for lightspeed-evaluation.""" diff --git a/script/compare_evaluations.py b/script/compare_evaluations.py index ac993598..be8e1699 100755 --- a/script/compare_evaluations.py +++ b/script/compare_evaluations.py @@ -421,7 +421,7 @@ def _check_confidence_interval_overlap( Returns: Dictionary containing overlap test results """ - result = { + result: dict[str, Any] = { "test_performed": False, "intervals_overlap": None, "significant": None, diff --git a/script/run_multi_provider_eval.py b/script/run_multi_provider_eval.py index 4cca522b..34a5d471 100755 --- a/script/run_multi_provider_eval.py +++ b/script/run_multi_provider_eval.py @@ -318,7 +318,7 @@ def _create_provider_model_configs(self) -> list[dict[str, Any]]: Returns: List of dictionaries with provider, model, and settings """ - configs = [] + configs: list[dict[str, Any]] = [] # Get providers from the config providers = self.providers_config.get("providers", {}) @@ -781,7 +781,7 @@ def _analyze_single_model( # Calculate score statistics if all_scores: - score_stats = { + score_stats: dict[str, Any] = { "mean": float(np.mean(all_scores)), "median": float(np.median(all_scores)), "std": float(np.std(all_scores)), @@ -818,10 +818,10 @@ def _analyze_single_model( logger.warning( "scipy not available, skipping confidence interval calculation" ) - score_stats["confidence_interval"] = None + score_stats["confidence_interval"] = None # type: ignore[assignment] else: # Single score - no confidence interval - score_stats["confidence_interval"] = None + score_stats["confidence_interval"] = None # type: ignore[assignment] else: score_stats = { "mean": 0.0, diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..f38a1ee1 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +"""Pytest configuration and fixtures for lightspeed-evaluation tests.""" + +import sys +from pathlib import Path + +# Add project root to Python path so we can import from script directory +project_root = Path(__file__).parent.parent +if str(project_root) not in sys.path: + sys.path.insert(0, str(project_root)) diff --git a/tests/script/conftest.py b/tests/script/conftest.py new file mode 100644 index 00000000..8ab273da --- /dev/null +++ b/tests/script/conftest.py @@ -0,0 +1,212 @@ +"""Pytest configuration and fixtures for script tests.""" + +from pathlib import Path +from typing import Any + +import pytest +import yaml + +from 
script.run_multi_provider_eval import MultiProviderEvaluationRunner + + +@pytest.fixture +def script_path() -> Path: + """Return the path to the compare_evaluations.py script.""" + # Test is in tests/script/, script is in project_root/script/ + return Path(__file__).parent.parent.parent / "script" / "compare_evaluations.py" + + +@pytest.fixture +def sample_evaluation_data() -> tuple[list[dict], list[dict]]: + """Return sample evaluation data for testing.""" + sample_results1 = [ + { + "conversation_group_id": "conv1", + "turn_id": "1", + "metric_identifier": "ragas:faithfulness", + "result": "PASS", + "score": 0.8, + "threshold": 0.7, + "execution_time": 1.0, + }, + { + "conversation_group_id": "conv1", + "turn_id": "2", + "metric_identifier": "ragas:faithfulness", + "result": "PASS", + "score": 0.9, + "threshold": 0.7, + "execution_time": 1.2, + }, + ] + + sample_results2 = [ + { + "conversation_group_id": "conv1", + "turn_id": "1", + "metric_identifier": "ragas:faithfulness", + "result": "PASS", + "score": 0.85, + "threshold": 0.7, + "execution_time": 1.1, + }, + { + "conversation_group_id": "conv1", + "turn_id": "2", + "metric_identifier": "ragas:faithfulness", + "result": "FAIL", + "score": 0.6, + "threshold": 0.7, + "execution_time": 1.0, + }, + ] + + return sample_results1, sample_results2 + + +@pytest.fixture +def temp_config_files(tmp_path: Path) -> dict: + """Create temporary configuration files for testing.""" + # Create multi_eval_config.yaml + providers_config = { + "providers": { + "openai": { + "models": ["gpt-4o-mini", "gpt-4-turbo"], + }, + "watsonx": { + "models": ["ibm/granite-13b-chat-v2"], + }, + }, + "settings": {"output_base": str(tmp_path / "eval_output")}, + } + providers_path = tmp_path / "multi_eval_config.yaml" + with open(providers_path, "w", encoding="utf-8") as f: + yaml.dump(providers_config, f) + + # Create system.yaml + system_config = { + "llm": { + "provider": "openai", + "model": "gpt-4o-mini", + "temperature": 0.0, + }, + "api": {"enabled": False}, + "output": {"output_dir": "./eval_output"}, + } + system_path = tmp_path / "system.yaml" + with open(system_path, "w", encoding="utf-8") as f: + yaml.dump(system_config, f) + + # Create evaluation_data.yaml + eval_data = [ + { + "conversation_group_id": "test_conv", + "turns": [ + { + "turn_id": "turn_1", + "query": "Test query", + "response": "Test response", + "contexts": ["Context 1"], + "expected_response": "Expected", + "turn_metrics": ["ragas:response_relevancy"], + } + ], + } + ] + eval_path = tmp_path / "evaluation_data.yaml" + with open(eval_path, "w", encoding="utf-8") as f: + yaml.dump(eval_data, f) + + return { + "providers_config": providers_path, + "system_config": system_path, + "eval_data": eval_path, + "output_dir": tmp_path / "eval_output", + } + + +@pytest.fixture +def runner( # pylint: disable=redefined-outer-name + temp_config_files: dict, +) -> MultiProviderEvaluationRunner: + """Create a MultiProviderEvaluationRunner instance for testing.""" + return MultiProviderEvaluationRunner( + providers_config_path=str(temp_config_files["providers_config"]), + system_config_path=str(temp_config_files["system_config"]), + eval_data_path=str(temp_config_files["eval_data"]), + ) + + +@pytest.fixture +def sample_evaluation_summary() -> dict[str, Any]: + """Create a sample evaluation summary JSON for testing analysis.""" + return { + "timestamp": "2025-01-01T12:00:00", + "total_evaluations": 10, + "summary_stats": { + "overall": { + "TOTAL": 10, + "PASS": 8, + "FAIL": 2, + "ERROR": 0, + "pass_rate": 
80.0, # Percentage format + "fail_rate": 20.0, + "error_rate": 0.0, + }, + "by_metric": { + "ragas:faithfulness": { + "pass": 4, + "fail": 0, + "error": 0, + "pass_rate": 100.0, + "fail_rate": 0.0, + "error_rate": 0.0, + "score_statistics": { + "mean": 0.95, + "median": 0.95, + "std": 0.02, + "min": 0.92, + "max": 0.98, + "count": 4, + }, + }, + "ragas:response_relevancy": { + "pass": 4, + "fail": 2, + "error": 0, + "pass_rate": 66.67, + "fail_rate": 33.33, + "error_rate": 0.0, + "score_statistics": { + "mean": 0.75, + "median": 0.78, + "std": 0.12, + "min": 0.55, + "max": 0.88, + "count": 6, + }, + }, + }, + }, + "results": [ + { + "conversation_group_id": "conv1", + "turn_id": "turn1", + "metric_identifier": "ragas:faithfulness", + "result": "PASS", + "score": 0.95, + "threshold": 0.8, + "execution_time": 1.0, + }, + { + "conversation_group_id": "conv1", + "turn_id": "turn2", + "metric_identifier": "ragas:response_relevancy", + "result": "PASS", + "score": 0.85, + "threshold": 0.7, + "execution_time": 1.2, + }, + ] + * 5, # Repeat to get 10 results + } diff --git a/tests/script/test_compare_evaluations.py b/tests/script/test_compare_evaluations.py index 020704e9..e03bebdb 100755 --- a/tests/script/test_compare_evaluations.py +++ b/tests/script/test_compare_evaluations.py @@ -7,65 +7,15 @@ import sys from pathlib import Path +from typing import Any import pytest +from script.compare_evaluations import EvaluationComparison -@pytest.fixture -def script_path(): - """Return the path to the compare_evaluations.py script.""" - # Test is in tests/script/, script is in project_root/script/ - return Path(__file__).parent.parent.parent / "script" / "compare_evaluations.py" - - -@pytest.fixture -def sample_evaluation_data(): - """Return sample evaluation data for testing.""" - sample_results1 = [ - { - "conversation_group_id": "conv1", - "turn_id": "1", - "metric_identifier": "ragas:faithfulness", - "result": "PASS", - "score": 0.8, - "threshold": 0.7, - "execution_time": 1.0, - }, - { - "conversation_group_id": "conv1", - "turn_id": "2", - "metric_identifier": "ragas:faithfulness", - "result": "PASS", - "score": 0.9, - "threshold": 0.7, - "execution_time": 1.2, - }, - ] - - sample_results2 = [ - { - "conversation_group_id": "conv1", - "turn_id": "1", - "metric_identifier": "ragas:faithfulness", - "result": "PASS", - "score": 0.85, - "threshold": 0.7, - "execution_time": 1.1, - }, - { - "conversation_group_id": "conv1", - "turn_id": "2", - "metric_identifier": "ragas:faithfulness", - "result": "FAIL", - "score": 0.6, - "threshold": 0.7, - "execution_time": 1.0, - }, - ] - return sample_results1, sample_results2 - - -def create_sample_summary(results, timestamp="2025-01-01T00:00:00"): +def create_sample_summary( + results: list[dict[str, Any]], timestamp: str = "2025-01-01T00:00:00" +) -> dict[str, Any]: """Create a sample evaluation summary.""" return { "timestamp": timestamp, @@ -97,7 +47,10 @@ def create_sample_summary(results, timestamp="2025-01-01T00:00:00"): } -def test_basic_comparison(script_path, sample_evaluation_data): +def test_basic_comparison( + script_path: Path, + sample_evaluation_data: tuple[list[dict[str, Any]], list[dict[str, Any]]], +) -> None: """Test basic comparison functionality.""" sample_results1, sample_results2 = sample_evaluation_data @@ -109,9 +62,9 @@ def test_basic_comparison(script_path, sample_evaluation_data): file1 = Path(temp_dir) / "summary1.json" file2 = Path(temp_dir) / "summary2.json" - with open(file1, "w") as f: + with open(file1, "w", encoding="utf-8") as 
f: json.dump(summary1, f) - with open(file2, "w") as f: + with open(file2, "w", encoding="utf-8") as f: json.dump(summary2, f) # Test the script @@ -119,6 +72,7 @@ def test_basic_comparison(script_path, sample_evaluation_data): [sys.executable, str(script_path), str(file1), str(file2)], capture_output=True, text=True, + check=False, ) assert result.returncode == 0, f"Script failed with error: {result.stderr}" @@ -128,12 +82,15 @@ def test_basic_comparison(script_path, sample_evaluation_data): ), "Output should contain comparison report" -def test_invalid_arguments(script_path): +def test_invalid_arguments(script_path: Path) -> None: """Test error handling for invalid arguments.""" # Test with only one file result = subprocess.run( - [sys.executable, str(script_path), "file1.json"], capture_output=True, text=True + [sys.executable, str(script_path), "file1.json"], + capture_output=True, + text=True, + check=False, ) assert result.returncode != 0, "Script should fail with only one file" @@ -146,6 +103,7 @@ def test_invalid_arguments(script_path): [sys.executable, str(script_path), "file1.json", "file2.json", "file3.json"], capture_output=True, text=True, + check=False, ) assert result.returncode != 0, "Script should fail with three files" @@ -154,13 +112,14 @@ def test_invalid_arguments(script_path): ), f"Expected error message not found in stderr: {result.stderr}" -def test_nonexistent_files(script_path): +def test_nonexistent_files(script_path: Path) -> None: """Test error handling for nonexistent files.""" result = subprocess.run( [sys.executable, str(script_path), "nonexistent1.json", "nonexistent2.json"], capture_output=True, text=True, + check=False, ) assert result.returncode != 0, "Script should fail with nonexistent files" @@ -173,25 +132,21 @@ class TestEvaluationComparisonMethods: """Unit tests for EvaluationComparison internal methods.""" @pytest.fixture - def comparison_instance(self): + def comparison_instance(self) -> EvaluationComparison: """Create an EvaluationComparison instance for testing.""" - # Import here to avoid module loading issues - import sys - - # Add project root to path (tests/script/ -> tests/ -> project_root/) - sys.path.append(str(Path(__file__).parent.parent.parent)) - from script.compare_evaluations import EvaluationComparison - return EvaluationComparison(alpha=0.05) - def test_compare_score_distributions_basic(self, comparison_instance): + def test_compare_score_distributions_basic( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions with basic score data.""" # Test data based on normal distributions scores1 = [0.8, 0.9, 0.7, 0.85, 0.75, 0.88, 0.82, 0.79, 0.86, 0.81] scores2 = [0.6, 0.65, 0.55, 0.62, 0.58, 0.63, 0.59, 0.61, 0.64, 0.57] - result = comparison_instance._compare_score_distributions(scores1, scores2) - + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) # Check structure assert "run1_stats" in result assert "run2_stats" in result @@ -216,14 +171,18 @@ def test_compare_score_distributions_basic(self, comparison_instance): assert "p_value" in result["tests"]["mann_whitney_u"] assert "significant" in result["tests"]["mann_whitney_u"] - def test_compare_score_distributions_scipy_example(self, comparison_instance): + def test_compare_score_distributions_scipy_example( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions using scipy documentation examples.""" # Example inspired by 
scipy.stats.ttest_ind documentation # Two samples with different means scores1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] scores2 = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0] - result = comparison_instance._compare_score_distributions(scores1, scores2) + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) # The means should be 5.5 and 6.5 respectively assert abs(result["run1_stats"]["mean"] - 5.5) < 0.01 @@ -234,21 +193,27 @@ def test_compare_score_distributions_scipy_example(self, comparison_instance): # (though the exact p-values depend on the implementation) assert "tests" in result - def test_compare_score_distributions_identical_data(self, comparison_instance): + def test_compare_score_distributions_identical_data( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions with identical data.""" scores1 = [0.8, 0.8, 0.8, 0.8, 0.8] scores2 = [0.8, 0.8, 0.8, 0.8, 0.8] - result = comparison_instance._compare_score_distributions(scores1, scores2) + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) assert result["run1_stats"]["mean"] == result["run2_stats"]["mean"] assert result["mean_difference"] == 0.0 assert result["relative_change"] == 0.0 - def test_perform_pass_rate_tests_basic(self, comparison_instance): + def test_perform_pass_rate_tests_basic( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests with basic contingency table data.""" # Based on scipy.stats.chi2_contingency example - comparison = {"tests": {}} + comparison: dict = {"tests": {}} # Example: Run1 has 16 pass, 4 fail; Run2 has 18 pass, 2 fail test_data = { "pass_count1": 16, @@ -259,7 +224,9 @@ def test_perform_pass_rate_tests_basic(self, comparison_instance): "total2": 20, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Check that tests were performed assert "tests" in comparison @@ -269,11 +236,13 @@ def test_perform_pass_rate_tests_basic(self, comparison_instance): ) assert has_tests or "error" in comparison["tests"] - def test_perform_pass_rate_tests_scipy_chisquare_example(self, comparison_instance): + def test_perform_pass_rate_tests_scipy_chisquare_example( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests using scipy chisquare documentation example.""" # Based on the scipy documentation example: chisquare([16, 18, 16, 14, 12, 12]) # Convert to pass/fail format for our function - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 16, "fail_count1": 4, # Making total 20 @@ -283,7 +252,9 @@ def test_perform_pass_rate_tests_scipy_chisquare_example(self, comparison_instan "total2": 20, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Verify structure assert "tests" in comparison @@ -303,10 +274,12 @@ def test_perform_pass_rate_tests_scipy_chisquare_example(self, comparison_instan assert "p_value" in fisher assert "significant" in fisher - def test_perform_pass_rate_tests_edge_cases(self, comparison_instance): + def test_perform_pass_rate_tests_edge_cases( + self, comparison_instance: EvaluationComparison + ) -> None: """Test 
_perform_pass_rate_tests with edge cases.""" # Test with zero totals - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 0, "fail_count1": 0, @@ -316,61 +289,80 @@ def test_perform_pass_rate_tests_edge_cases(self, comparison_instance): "total2": 15, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Should handle gracefully (no tests performed or error recorded) assert "tests" in comparison - def test_check_confidence_interval_overlap_no_overlap(self, comparison_instance): + def test_check_confidence_interval_overlap_no_overlap( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with non-overlapping intervals.""" ci1 = {"low": 0.1, "high": 0.3, "mean": 0.2, "confidence_level": 0.95} ci2 = {"low": 0.7, "high": 0.9, "mean": 0.8, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + ci1, ci2 + ) assert "intervals_overlap" in result assert "significant" in result assert result["intervals_overlap"] is False assert result["significant"] is True - def test_check_confidence_interval_overlap_with_overlap(self, comparison_instance): + def test_check_confidence_interval_overlap_with_overlap( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with overlapping intervals.""" ci1 = {"low": 0.2, "high": 0.6, "mean": 0.4, "confidence_level": 0.95} ci2 = {"low": 0.4, "high": 0.8, "mean": 0.6, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + ci1, ci2 + ) assert "intervals_overlap" in result assert "significant" in result assert result["intervals_overlap"] is True assert result["significant"] is False - def test_check_confidence_interval_overlap_none_inputs(self, comparison_instance): + def test_check_confidence_interval_overlap_none_inputs( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with None inputs.""" - result = comparison_instance._check_confidence_interval_overlap(None, None) + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + None, None + ) assert "test_performed" in result # Should handle None inputs gracefully - might not perform test - def test_check_confidence_interval_overlap_partial_none(self, comparison_instance): + def test_check_confidence_interval_overlap_partial_none( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with one None input.""" ci1 = {"low": 0.2, "high": 0.6, "mean": 0.4, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap(ci1, None) - + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + ci1, None + ) assert "test_performed" in result # Should handle partial None inputs gracefully def test_compare_score_distributions_known_statistical_results( - self, comparison_instance - ): + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions with known statistical results.""" # Use data that should produce 
predictable statistical results # Two clearly different distributions scores1 = [1.0, 1.1, 1.2, 1.3, 1.4] # Mean ≈ 1.2, low variance scores2 = [2.0, 2.1, 2.2, 2.3, 2.4] # Mean ≈ 2.2, low variance - result = comparison_instance._compare_score_distributions(scores1, scores2) + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) # These should be significantly different assert abs(result["mean_difference"] - 1.0) < 0.01 @@ -386,11 +378,13 @@ def test_compare_score_distributions_known_statistical_results( assert result["tests"]["mann_whitney_u"]["p_value"] < 0.05 assert result["tests"]["mann_whitney_u"]["significant"] is True - def test_perform_pass_rate_tests_known_chi_square_result(self, comparison_instance): + def test_perform_pass_rate_tests_known_chi_square_result( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests with data that should produce known chi-square results.""" # Based on scipy documentation example for chi2_contingency # Create a 2x2 contingency table: [[16, 4], [18, 2]] - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 16, "fail_count1": 4, @@ -400,7 +394,9 @@ def test_perform_pass_rate_tests_known_chi_square_result(self, comparison_instan "total2": 20, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Verify the chi-square test was performed and has reasonable results if "chi_square" in comparison["tests"]: @@ -415,11 +411,11 @@ def test_perform_pass_rate_tests_known_chi_square_result(self, comparison_instan assert 0 <= chi_square["p_value"] <= 1 # p-value is a probability def test_perform_pass_rate_tests_fisher_exact_small_sample( - self, comparison_instance - ): + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests with small sample sizes suitable for Fisher exact test.""" # Small sample sizes where Fisher exact test is more appropriate - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 3, "fail_count1": 2, @@ -429,7 +425,9 @@ def test_perform_pass_rate_tests_fisher_exact_small_sample( "total2": 5, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Verify Fisher exact test results if "fisher_exact" in comparison["tests"]: @@ -440,8 +438,8 @@ def test_perform_pass_rate_tests_fisher_exact_small_sample( assert 0 <= fisher["p_value"] <= 1 # p-value is a probability def test_check_confidence_interval_overlap_exact_boundaries( - self, comparison_instance - ): + self, comparison_instance: EvaluationComparison + ) -> None: """Test _check_confidence_interval_overlap with exact boundary conditions.""" # Test case where intervals just touch at boundaries ci1 = {"low": 0.1, "high": 0.5, "mean": 0.3, "confidence_level": 0.95} @@ -452,7 +450,9 @@ def test_check_confidence_interval_overlap_exact_boundaries( "confidence_level": 0.95, } - result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) + result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access + ci1, ci2 + ) # Touching at boundary might be considered overlap or not, depending on implementation assert "intervals_overlap" in result @@ -460,12 +460,16 @@ def 
test_check_confidence_interval_overlap_exact_boundaries( assert isinstance(result["intervals_overlap"], bool) assert isinstance(result["significant"], bool) - def test_compare_score_distributions_single_values(self, comparison_instance): + def test_compare_score_distributions_single_values( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _compare_score_distributions with single values (edge case).""" scores1 = [0.8] scores2 = [0.6] - result = comparison_instance._compare_score_distributions(scores1, scores2) + result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access + scores1, scores2 + ) # Should handle single values gracefully assert result["run1_stats"]["count"] == 1 @@ -479,10 +483,12 @@ def test_compare_score_distributions_single_values(self, comparison_instance): # Statistical tests might not be performed with single values assert "tests" in result - def test_perform_pass_rate_tests_extreme_ratios(self, comparison_instance): + def test_perform_pass_rate_tests_extreme_ratios( + self, comparison_instance: EvaluationComparison + ) -> None: """Test _perform_pass_rate_tests with extreme pass rate differences.""" # One run with 100% pass rate, another with 0% pass rate - comparison = {"tests": {}} + comparison: dict = {"tests": {}} test_data = { "pass_count1": 10, "fail_count1": 0, @@ -492,7 +498,9 @@ def test_perform_pass_rate_tests_extreme_ratios(self, comparison_instance): "total2": 10, } - comparison_instance._perform_pass_rate_tests(comparison, test_data) + comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access + comparison, test_data + ) # Should handle extreme cases assert "tests" in comparison diff --git a/tests/script/test_run_multi_provider_eval.py b/tests/script/test_run_multi_provider_eval.py index 103950a0..ef0057dc 100644 --- a/tests/script/test_run_multi_provider_eval.py +++ b/tests/script/test_run_multi_provider_eval.py @@ -2,94 +2,24 @@ """Pytest tests for run_multi_provider_eval.py script.""" import json -import sys from pathlib import Path +from typing import Any from unittest.mock import patch +import tempfile as temp_module +import logging +import multiprocessing +import shutil import pytest import yaml -# Add the script directory to the path -sys.path.insert(0, str(Path(__file__).parent.parent.parent / "script")) - -from run_multi_provider_eval import MultiProviderEvaluationRunner - - -@pytest.fixture -def temp_config_files(tmp_path): - """Create temporary configuration files for testing.""" - # Create multi_eval_config.yaml - providers_config = { - "providers": { - "openai": { - "models": ["gpt-4o-mini", "gpt-4-turbo"], - }, - "watsonx": { - "models": ["ibm/granite-13b-chat-v2"], - }, - }, - "settings": {"output_base": str(tmp_path / "eval_output")}, - } - providers_path = tmp_path / "multi_eval_config.yaml" - with open(providers_path, "w", encoding="utf-8") as f: - yaml.dump(providers_config, f) - - # Create system.yaml - system_config = { - "llm": { - "provider": "openai", - "model": "gpt-4o-mini", - "temperature": 0.0, - }, - "api": {"enabled": False}, - "output": {"output_dir": "./eval_output"}, - } - system_path = tmp_path / "system.yaml" - with open(system_path, "w", encoding="utf-8") as f: - yaml.dump(system_config, f) - - # Create evaluation_data.yaml - eval_data = [ - { - "conversation_group_id": "test_conv", - "turns": [ - { - "turn_id": "turn_1", - "query": "Test query", - "response": "Test response", - "contexts": ["Context 1"], - "expected_response": "Expected", - 
"turn_metrics": ["ragas:response_relevancy"], - } - ], - } - ] - eval_path = tmp_path / "evaluation_data.yaml" - with open(eval_path, "w", encoding="utf-8") as f: - yaml.dump(eval_data, f) - - return { - "providers_config": providers_path, - "system_config": system_path, - "eval_data": eval_path, - "output_dir": tmp_path / "eval_output", - } - - -@pytest.fixture -def runner(temp_config_files): - """Create a MultiProviderEvaluationRunner instance for testing.""" - return MultiProviderEvaluationRunner( - providers_config_path=str(temp_config_files["providers_config"]), - system_config_path=str(temp_config_files["system_config"]), - eval_data_path=str(temp_config_files["eval_data"]), - ) +from script.run_multi_provider_eval import MultiProviderEvaluationRunner class TestMultiProviderEvaluationRunnerInit: """Tests for MultiProviderEvaluationRunner initialization.""" - def test_init_success(self, temp_config_files): + def test_init_success(self, temp_config_files: dict[str, Path]) -> None: """Test successful initialization of the runner.""" runner = MultiProviderEvaluationRunner( providers_config_path=str(temp_config_files["providers_config"]), @@ -103,9 +33,9 @@ def test_init_success(self, temp_config_files): assert runner.system_config_path == Path(temp_config_files["system_config"]) assert runner.eval_data_path == Path(temp_config_files["eval_data"]) assert runner.output_base.exists() - assert runner.results == [] + assert not runner.results - def test_init_config_not_found(self, temp_config_files): + def test_init_config_not_found(self, temp_config_files: dict[str, Path]) -> None: """Test initialization fails when any config file is missing.""" with pytest.raises(FileNotFoundError, match="Providers config not found"): MultiProviderEvaluationRunner( @@ -114,7 +44,9 @@ def test_init_config_not_found(self, temp_config_files): eval_data_path=str(temp_config_files["eval_data"]), ) - def test_max_workers_from_constructor(self, temp_config_files): + def test_max_workers_from_constructor( + self, temp_config_files: dict[str, Path] + ) -> None: """Test max_workers configured via constructor argument.""" runner = MultiProviderEvaluationRunner( providers_config_path=str(temp_config_files["providers_config"]), @@ -124,7 +56,9 @@ def test_max_workers_from_constructor(self, temp_config_files): ) assert runner.max_workers == 4 - def test_max_workers_from_config_file(self, temp_config_files, tmp_path): + def test_max_workers_from_config_file( + self, temp_config_files: dict[str, Path], tmp_path: Path + ) -> None: """Test max_workers configured via config file.""" # Create config with max_workers setting config_with_workers = { @@ -147,7 +81,9 @@ def test_max_workers_from_config_file(self, temp_config_files, tmp_path): ) assert runner.max_workers == 6 - def test_max_workers_string_coercion(self, temp_config_files, tmp_path): + def test_max_workers_string_coercion( + self, temp_config_files: dict[str, Path], tmp_path: Path + ) -> None: """Test max_workers string value from YAML is coerced to int.""" # Create config with string max_workers config_with_string = { @@ -171,7 +107,9 @@ def test_max_workers_string_coercion(self, temp_config_files, tmp_path): assert runner.max_workers == 4 assert isinstance(runner.max_workers, int) - def test_max_workers_invalid_value(self, temp_config_files, tmp_path): + def test_max_workers_invalid_value( + self, temp_config_files: dict[str, Path], tmp_path: Path + ) -> None: """Test max_workers with invalid value raises clear error.""" # Create config with invalid max_workers 
config_invalid = { @@ -194,7 +132,9 @@ def test_max_workers_invalid_value(self, temp_config_files, tmp_path): eval_data_path=str(temp_config_files["eval_data"]), ) - def test_max_workers_minimum_value(self, temp_config_files): + def test_max_workers_minimum_value( + self, temp_config_files: dict[str, Path] + ) -> None: """Test max_workers is enforced to be at least 1.""" runner = MultiProviderEvaluationRunner( providers_config_path=str(temp_config_files["providers_config"]), @@ -213,10 +153,12 @@ def test_max_workers_minimum_value(self, temp_config_files): assert runner2.max_workers == 1 # Should be clamped to 1 def test_resource_warning_high_thread_count( - self, temp_config_files, tmp_path, caplog - ): + self, + temp_config_files: dict[str, Path], + tmp_path: Path, + caplog: pytest.LogCaptureFixture, + ) -> None: """Test warning is logged when total threads is very high.""" - import logging # Create system config with high max_threads system_config = { @@ -248,11 +190,12 @@ def test_resource_warning_high_thread_count( assert runner.max_workers == 4 def test_no_resource_warning_reasonable_config( - self, temp_config_files, tmp_path, caplog - ): + self, + temp_config_files: dict[str, Path], + tmp_path: Path, + caplog: pytest.LogCaptureFixture, + ) -> None: """Test no warning with reasonable thread count.""" - import logging - import multiprocessing # Calculate safe thread count based on actual CPU count cpu_count = multiprocessing.cpu_count() @@ -280,50 +223,66 @@ def test_no_resource_warning_reasonable_config( ) # Check no warning was logged + total_threads = max_workers * max_threads assert not any( "High resource usage detected" in record.message for record in caplog.records - ), f"Expected no warning with {max_workers} workers × {max_threads} threads = {max_workers * max_threads} on {cpu_count} CPUs" + ), ( + f"Expected no warning: {max_workers} workers × {max_threads} " + f"threads = {total_threads} on {cpu_count} CPUs" + ) assert runner.max_workers == max_workers class TestLoadYAML: """Tests for _load_yaml method.""" - def test_load_valid_yaml(self, runner, temp_config_files): + def test_load_valid_yaml( + self, runner: MultiProviderEvaluationRunner, temp_config_files: dict[str, Path] + ) -> None: """Test loading a valid YAML file.""" - config = runner._load_yaml(temp_config_files["providers_config"]) + config = runner._load_yaml( # pylint: disable=protected-access + temp_config_files["providers_config"] + ) assert isinstance(config, dict) assert "providers" in config assert "openai" in config["providers"] assert "models" in config["providers"]["openai"] assert "settings" in config - def test_load_invalid_yaml(self, runner, tmp_path): + def test_load_invalid_yaml( + self, runner: MultiProviderEvaluationRunner, tmp_path: Path + ) -> None: """Test loading an invalid YAML file.""" invalid_yaml = tmp_path / "invalid.yaml" with open(invalid_yaml, "w", encoding="utf-8") as f: f.write("invalid: yaml: content: [") with pytest.raises(ValueError, match="Error parsing YAML file"): - runner._load_yaml(invalid_yaml) + runner._load_yaml(invalid_yaml) # pylint: disable=protected-access - def test_load_yaml_non_dict_type(self, runner, tmp_path): + def test_load_yaml_non_dict_type( + self, runner: MultiProviderEvaluationRunner, tmp_path: Path + ) -> None: """Test that YAML files not containing dictionaries are rejected.""" list_yaml = tmp_path / "list.yaml" with open(list_yaml, "w", encoding="utf-8") as f: yaml.dump(["item1", "item2", "item3"], f) with pytest.raises(ValueError, match="must be a 
mapping, got list"): - runner._load_yaml(list_yaml) + runner._load_yaml(list_yaml) # pylint: disable=protected-access -class TestCreateProviderModelConfigs: +class TestCreateProviderModelConfigs: # pylint: disable=too-few-public-methods """Tests for _create_provider_model_configs method.""" - def test_create_configs_multiple_providers(self, runner): + def test_create_configs_multiple_providers( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test creating configs with multiple providers.""" - configs = runner._create_provider_model_configs() + configs = ( + runner._create_provider_model_configs() # pylint: disable=protected-access + ) assert len(configs) == 3 # 2 openai models + 1 watsonx model @@ -345,21 +304,25 @@ def test_create_configs_multiple_providers(self, runner): class TestCreateModifiedSystemConfig: """Tests for _create_modified_system_config method.""" - def test_llm_config_stays_constant(self, runner): + def test_llm_config_stays_constant( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that LLM judge config is NOT modified (stays constant for fair comparison).""" original_llm_provider = runner.system_config["llm"]["provider"] original_llm_model = runner.system_config["llm"]["model"] - modified = runner._create_modified_system_config( - provider_id="watsonx", - model="ibm/granite-13b-chat-v2", + modified = ( + runner._create_modified_system_config( # pylint: disable=protected-access + provider_id="watsonx", + model="ibm/granite-13b-chat-v2", + ) ) # LLM judge should remain unchanged assert modified["llm"]["provider"] == original_llm_provider assert modified["llm"]["model"] == original_llm_model - def test_api_config_is_modified(self, temp_config_files): + def test_api_config_is_modified(self, temp_config_files: dict[str, Path]) -> None: """Test that API config is modified when API is enabled.""" # Create system config with API enabled system_config = { @@ -385,9 +348,11 @@ def test_api_config_is_modified(self, temp_config_files): eval_data_path=str(temp_config_files["eval_data"]), ) - modified = runner._create_modified_system_config( - provider_id="watsonx", - model="ibm/granite-13b-chat-v2", + modified = ( + runner._create_modified_system_config( # pylint: disable=protected-access + provider_id="watsonx", + model="ibm/granite-13b-chat-v2", + ) ) # API config should be modified with provider and model only @@ -403,11 +368,15 @@ def test_api_config_is_modified(self, temp_config_files): class TestCreateTempSystemConfig: """Tests for _create_temp_system_config method.""" - def test_create_temp_config_file(self, runner): + def test_create_temp_config_file( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that a temporary config file is created.""" - temp_path = runner._create_temp_system_config( - provider_id="openai", - model="gpt-4o-mini", + temp_path = ( + runner._create_temp_system_config( # pylint: disable=protected-access + provider_id="openai", + model="gpt-4o-mini", + ) ) try: @@ -426,32 +395,36 @@ def test_create_temp_config_file(self, runner): if temp_path.exists(): temp_path.unlink() - def test_temp_config_cleanup_on_yaml_dump_failure(self, runner, tmp_path): + def test_temp_config_cleanup_on_yaml_dump_failure( + self, + runner: MultiProviderEvaluationRunner, + ) -> None: """Test that temp file is cleaned up when yaml.dump() fails.""" - import tempfile as temp_module # Track the temp file path that gets created created_temp_path = None original_named_temp_file = temp_module.NamedTemporaryFile - def 
track_temp_file(*args, **kwargs): + def track_temp_file(*args: Any, **kwargs: Any) -> Any: nonlocal created_temp_path - temp_file = original_named_temp_file(*args, **kwargs) + temp_file = original_named_temp_file( # pylint: disable=consider-using-with + *args, **kwargs + ) created_temp_path = Path(temp_file.name) return temp_file # Mock NamedTemporaryFile to track the created file with patch( - "run_multi_provider_eval.tempfile.NamedTemporaryFile", + "script.run_multi_provider_eval.tempfile.NamedTemporaryFile", side_effect=track_temp_file, ): # Mock yaml.dump to raise an exception with patch( - "run_multi_provider_eval.yaml.dump", + "script.run_multi_provider_eval.yaml.dump", side_effect=Exception("YAML dump failed"), ): with pytest.raises(Exception, match="YAML dump failed"): - runner._create_temp_system_config( + runner._create_temp_system_config( # pylint: disable=protected-access provider_id="openai", model="gpt-4o-mini", ) @@ -464,15 +437,20 @@ def track_temp_file(*args, **kwargs): not created_temp_path.exists() ), "Temp file should have been cleaned up" - def test_temp_config_sanitizes_special_characters(self, runner): + def test_temp_config_sanitizes_special_characters( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that special characters in provider_id and model are sanitized.""" - temp_path = runner._create_temp_system_config( - provider_id="open..ai//test", - model="gpt:4o-mini/special", + temp_path = ( + runner._create_temp_system_config( # pylint: disable=protected-access + provider_id="open..ai//test", + model="gpt:4o-mini/special", + ) ) try: - # Verify filename doesn't contain path separators or colons (except drive letter on Windows) + # Verify filename doesn't contain path separators or colons + # (except drive letter on Windows) assert "/" not in temp_path.name # On some systems, : might appear in drive letters on Windows, so we're lenient # The key is that path traversal characters are neutralized @@ -486,7 +464,9 @@ class TestPathTraversalSecurity: """Tests for path traversal security.""" @pytest.fixture - def runner(self, temp_config_files): + def runner( + self, temp_config_files: dict[str, Path] + ) -> MultiProviderEvaluationRunner: """Create a runner instance for testing.""" return MultiProviderEvaluationRunner( providers_config_path=str(temp_config_files["providers_config"]), @@ -494,14 +474,16 @@ def runner(self, temp_config_files): eval_data_path=str(temp_config_files["eval_data"]), ) - def test_path_traversal_blocked_in_provider_id(self, runner): + def test_path_traversal_blocked_in_provider_id( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that path traversal in provider_id is sanitized.""" with patch( - "run_multi_provider_eval.run_evaluation", + "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, ): # Attempt path traversal in provider_id - result = runner._run_single_evaluation( + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="malicious", provider_id="../../etc", model="test", @@ -517,18 +499,18 @@ def test_path_traversal_blocked_in_provider_id(self, runner): # Cleanup if output_path.exists(): - import shutil - shutil.rmtree(output_path.parent, ignore_errors=True) - def test_path_traversal_blocked_in_model(self, runner): + def test_path_traversal_blocked_in_model( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test that path traversal in model name is sanitized.""" with patch( - 
"run_multi_provider_eval.run_evaluation", + "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, ): # Attempt path traversal in model - result = runner._run_single_evaluation( + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="openai", provider_id="openai", model="../../../etc/passwd", @@ -543,22 +525,22 @@ def test_path_traversal_blocked_in_model(self, runner): # Cleanup if output_path.exists(): - import shutil - shutil.rmtree(output_path.parent.parent, ignore_errors=True) class TestRunSingleEvaluation: """Tests for _run_single_evaluation method.""" - def test_run_single_evaluation_success(self, runner): + def test_run_single_evaluation_success( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test successful single evaluation.""" # Mock run_evaluation to return a successful summary with patch( - "run_multi_provider_eval.run_evaluation", + "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2, "ERROR": 0}, ) as mock_run_eval: - result = runner._run_single_evaluation( + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -572,11 +554,13 @@ def test_run_single_evaluation_success(self, runner): assert "duration_seconds" in result mock_run_eval.assert_called_once() - def test_run_single_evaluation_failure(self, runner): + def test_run_single_evaluation_failure( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test evaluation failure handling.""" # Mock run_evaluation to return None (failure) - with patch("run_multi_provider_eval.run_evaluation", return_value=None): - result = runner._run_single_evaluation( + with patch("script.run_multi_provider_eval.run_evaluation", return_value=None): + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -585,14 +569,16 @@ def test_run_single_evaluation_failure(self, runner): assert result["success"] is False assert result["error"] == "Evaluation returned None (failed)" - def test_run_single_evaluation_invalid_summary(self, runner): + def test_run_single_evaluation_invalid_summary( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test evaluation with invalid summary structure.""" # Mock run_evaluation to return a summary missing required keys with patch( - "run_multi_provider_eval.run_evaluation", + "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2}, # Missing ERROR key ): - result = runner._run_single_evaluation( + result = runner._run_single_evaluation( # pylint: disable=protected-access provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -603,10 +589,12 @@ def test_run_single_evaluation_invalid_summary(self, runner): assert "summary" not in result -class TestRunEvaluations: +class TestRunEvaluations: # pylint: disable=too-few-public-methods """Tests for run_evaluations method.""" - def test_run_evaluations_sequential(self, runner): + def test_run_evaluations_sequential( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test sequential evaluation execution.""" # Force sequential mode runner.max_workers = 1 @@ -626,10 +614,12 @@ def test_run_evaluations_sequential(self, runner): assert mock_single_eval.call_count == 3 -class TestGenerateSummary: +class TestGenerateSummary: # pylint: disable=too-few-public-methods """Tests for generate_summary method.""" - def 
test_generate_summary_mixed_results(self, runner): + def test_generate_summary_mixed_results( + self, runner: MultiProviderEvaluationRunner + ) -> None: """Test summary generation with mixed results.""" runner.results = [ {"success": True, "provider_id": "openai", "model": "gpt-4o-mini"}, @@ -644,87 +634,15 @@ def test_generate_summary_mixed_results(self, runner): assert summary["success_rate"] == "50.0%" -@pytest.fixture -def sample_evaluation_summary(): - """Create a sample evaluation summary JSON for testing analysis.""" - return { - "timestamp": "2025-01-01T12:00:00", - "total_evaluations": 10, - "summary_stats": { - "overall": { - "TOTAL": 10, - "PASS": 8, - "FAIL": 2, - "ERROR": 0, - "pass_rate": 80.0, # Percentage format - "fail_rate": 20.0, - "error_rate": 0.0, - }, - "by_metric": { - "ragas:faithfulness": { - "pass": 4, - "fail": 0, - "error": 0, - "pass_rate": 100.0, - "fail_rate": 0.0, - "error_rate": 0.0, - "score_statistics": { - "mean": 0.95, - "median": 0.95, - "std": 0.02, - "min": 0.92, - "max": 0.98, - "count": 4, - }, - }, - "ragas:response_relevancy": { - "pass": 4, - "fail": 2, - "error": 0, - "pass_rate": 66.67, - "fail_rate": 33.33, - "error_rate": 0.0, - "score_statistics": { - "mean": 0.75, - "median": 0.78, - "std": 0.12, - "min": 0.55, - "max": 0.88, - "count": 6, - }, - }, - }, - }, - "results": [ - { - "conversation_group_id": "conv1", - "turn_id": "turn1", - "metric_identifier": "ragas:faithfulness", - "result": "PASS", - "score": 0.95, - "threshold": 0.8, - "execution_time": 1.0, - }, - { - "conversation_group_id": "conv1", - "turn_id": "turn2", - "metric_identifier": "ragas:response_relevancy", - "result": "PASS", - "score": 0.85, - "threshold": 0.7, - "execution_time": 1.2, - }, - ] - * 5, # Repeat to get 10 results - } - - class TestBestModelAnalysis: """Tests for best model analysis functionality.""" def test_analyze_model_performance( - self, runner, tmp_path, sample_evaluation_summary - ): + self, + runner: MultiProviderEvaluationRunner, + tmp_path: Path, + sample_evaluation_summary: dict[str, Any], + ) -> None: """Test successful model performance analysis.""" # Setup: Create evaluation summary files model_dir = tmp_path / "eval_output" / "openai" / "gpt-4o-mini" @@ -753,25 +671,33 @@ def test_analyze_model_performance( assert stats["overall"]["passed"] == 8 assert 0.0 <= stats["composite_score"] <= 1.0 - def test_percentage_to_decimal_conversion(self, runner, sample_evaluation_summary): + def test_percentage_to_decimal_conversion( + self, runner: MultiProviderEvaluationRunner, sample_evaluation_summary: dict + ) -> None: """Test that percentage rates (80.0) convert to decimals (0.8).""" - stats = runner._analyze_single_model("test/model", sample_evaluation_summary) + stats = runner._analyze_single_model( # pylint: disable=protected-access + "test/model", sample_evaluation_summary + ) # Verify percentage conversion assert abs(stats["overall"]["pass_rate"] - 0.8) < 0.01 assert 0.0 <= stats["overall"]["pass_rate"] <= 1.0 - def test_composite_score(self, runner): + def test_composite_score(self, runner: MultiProviderEvaluationRunner) -> None: """Test composite score calculation.""" # Perfect model should get score of 1.0 - perfect = runner._calculate_composite_score(1.0, 0.0, 1.0, 1.0) + perfect = runner._calculate_composite_score( # pylint: disable=protected-access + 1.0, 0.0, 1.0, 1.0 + ) assert abs(perfect - 1.0) < 0.0001 # Poor model should get score of 0.0 - poor = runner._calculate_composite_score(0.0, 1.0, 0.0, 0.0) + poor = 
runner._calculate_composite_score(  # pylint: disable=protected-access
+                0.0, 1.0, 0.0, 0.0
+            )
         assert poor == 0.0
 
-    def test_model_ranking(self, runner):
+    def test_model_ranking(self, runner: MultiProviderEvaluationRunner) -> None:
         """Test models are ranked by composite score."""
         runner.model_stats = {
             "model1": {"composite_score": 0.85},
@@ -786,7 +712,9 @@ def test_model_ranking(self, runner):
         assert ranked[1][0] == "model1"  # Second: 0.85
         assert ranked[2][0] == "model3"  # Lowest: 0.70
 
-    def test_save_analysis_to_yaml(self, runner, tmp_path):
+    def test_save_analysis_to_yaml(
+        self, runner: MultiProviderEvaluationRunner, tmp_path: Path
+    ) -> None:
         """Test saving analysis results to YAML file."""
         runner.output_base = tmp_path
         runner.model_stats = {
@@ -806,7 +734,9 @@ def test_save_analysis_to_yaml(self, runner, tmp_path):
         assert data["best_model"]["model"] == "model1"
         assert data["best_model"]["composite_score"] == 0.85
 
-    def test_print_report(self, runner, capsys):
+    def test_print_report(
+        self, runner: MultiProviderEvaluationRunner, capsys: pytest.CaptureFixture[str]
+    ) -> None:
         """Test statistical comparison report output."""
         runner.model_stats = {
             "model1": {
diff --git a/tests/unit/core/api/conftest.py b/tests/unit/core/api/conftest.py
new file mode 100644
index 00000000..244e678c
--- /dev/null
+++ b/tests/unit/core/api/conftest.py
@@ -0,0 +1,42 @@
+"""Pytest configuration and fixtures for api tests."""
+
+from typing import Any
+
+import pytest
+
+from pytest_mock import MockerFixture
+from lightspeed_evaluation.core.models import APIConfig
+
+
+@pytest.fixture
+def api_config() -> APIConfig:
+    """Create test API config."""
+    return APIConfig(
+        enabled=True,
+        api_base="http://localhost:8080",
+        version="v1",
+        endpoint_type="query",
+        timeout=30,
+        cache_enabled=False,
+    )
+
+
+@pytest.fixture
+def basic_api_config() -> APIConfig:
+    """Create basic API configuration for streaming."""
+    return APIConfig(
+        enabled=True,
+        api_base="http://localhost:8080",
+        endpoint_type="streaming",
+        timeout=30,
+        provider="openai",
+        model="gpt-4",
+        cache_enabled=False,
+    )
+
+
+@pytest.fixture
+def mock_response(mocker: MockerFixture) -> Any:
+    """Create a mock streaming response."""
+    response = mocker.Mock()
+    return response
diff --git a/tests/unit/core/api/test_client.py b/tests/unit/core/api/test_client.py
index 7224c0dc..caa7d2b3 100644
--- a/tests/unit/core/api/test_client.py
+++ b/tests/unit/core/api/test_client.py
@@ -1,45 +1,21 @@
 """Unit tests for core API client module."""
 
+from pathlib import Path
 import pytest
+import httpx
+from pytest_mock import MockerFixture
+from pydantic import ValidationError
 
 from lightspeed_evaluation.core.models import APIConfig, APIResponse
 from lightspeed_evaluation.core.system.exceptions import APIError
 from lightspeed_evaluation.core.api.client import APIClient
 
 
-@pytest.fixture
-def api_config():
-    """Create test API config."""
-    return APIConfig(
-        enabled=True,
-        api_base="http://localhost:8080",
-        version="v1",
-        endpoint_type="query",
-        timeout=30,
-        cache_enabled=False,
-    )
-
-
-@pytest.fixture
-def basic_api_config():
-    """Create basic API configuration for streaming."""
-    return APIConfig(
-        enabled=True,
-        api_base="http://localhost:8080",
-        endpoint_type="streaming",
-        timeout=30,
-        provider="openai",
-        model="gpt-4",
-        cache_enabled=False,
-    )
-
-
 class TestAPIClient:
     """Unit tests for APIClient."""
 
-    def test_initialization_unsupported_endpoint_type(self):
+    def test_initialization_unsupported_endpoint_type(self) -> None:
         """Test initialization fails with unsupported endpoint type."""
-        from pydantic import ValidationError
 
         # Pydantic will validate the endpoint_type, so this should raise ValidationError
         with pytest.raises(ValidationError, match="Endpoint type must be one of"):
@@ -51,7 +27,9 @@ def test_initialization_unsupported_endpoint_type(self):
                 timeout=30,
             )
 
-    def test_query_standard_endpoint_success(self, api_config, mocker):
+    def test_query_standard_endpoint_success(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test successful query to standard endpoint."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -79,7 +57,9 @@ def test_query_standard_endpoint_success(self, api_config, mocker):
         assert result.conversation_id == "conv_123"
         assert result.contexts == ["Context 1"]
 
-    def test_query_with_conversation_id(self, api_config, mocker):
+    def test_query_with_conversation_id(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query with existing conversation_id."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -107,7 +87,9 @@ def test_query_with_conversation_id(self, api_config, mocker):
         request_data = call_kwargs[1]["json"]
         assert request_data["conversation_id"] == "conv_123"
 
-    def test_query_with_attachments(self, api_config, mocker):
+    def test_query_with_attachments(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query with attachments."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -137,9 +119,10 @@ def test_query_with_attachments(self, api_config, mocker):
         assert request_data["attachments"][0]["content"] == "file1.txt"
         assert request_data["attachments"][1]["content"] == "file2.pdf"
 
-    def test_query_http_error(self, api_config, mocker):
+    def test_query_http_error(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query handling HTTP errors."""
-        import httpx
 
         mock_response = mocker.Mock()
         mock_response.status_code = 500
@@ -162,9 +145,10 @@ def test_query_http_error(self, api_config, mocker):
         with pytest.raises(APIError, match="API error: 500"):
             client.query("Test query")
 
-    def test_query_timeout_error(self, api_config, mocker):
+    def test_query_timeout_error(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query handling timeout."""
-        import httpx
 
         mock_client = mocker.Mock()
         mock_client.post.side_effect = httpx.TimeoutException("Timeout")
@@ -180,7 +164,9 @@ def test_query_timeout_error(self, api_config, mocker):
         with pytest.raises(APIError, match="timeout"):
             client.query("Test query")
 
-    def test_query_missing_response_field(self, api_config, mocker):
+    def test_query_missing_response_field(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query handling missing response field."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -203,7 +189,7 @@ def test_query_missing_response_field(self, api_config, mocker):
         with pytest.raises(APIError, match="missing 'response' field"):
             client.query("Test query")
 
-    def test_query_streaming_endpoint(self, mocker):
+    def test_query_streaming_endpoint(self, mocker: MockerFixture) -> None:
         """Test query to streaming endpoint."""
         config = APIConfig(
             enabled=True,
@@ -247,9 +233,10 @@
         assert result.response == "Streamed response"
         assert result.conversation_id == "conv_123"
 
-    def test_handle_response_errors_non_200(self, api_config, mocker):
+    def test_handle_response_errors_non_200(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test _handle_response_errors with non-200 status."""
-        import httpx
 
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -261,9 +248,13 @@
         mock_response.read.return_value = b'{"detail": "Not found"}'
 
         with pytest.raises(httpx.HTTPStatusError):
-            client._handle_response_errors(mock_response)
+            client._handle_response_errors(  # pylint: disable=protected-access
+                mock_response
+            )
 
-    def test_extract_error_message_with_detail(self, api_config, mocker):
+    def test_extract_error_message_with_detail(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test _extract_error_message with detail field."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -272,10 +263,14 @@
         mock_response = mocker.Mock()
         mock_response.read.return_value = b'{"detail": "Error message"}'
 
-        error_msg = client._extract_error_message(mock_response)
+        error_msg = client._extract_error_message(  # pylint: disable=protected-access
+            mock_response
+        )
         assert "Error message" in error_msg
 
-    def test_extract_error_message_with_nested_detail(self, api_config, mocker):
+    def test_extract_error_message_with_nested_detail(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test _extract_error_message with nested detail."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -286,11 +281,15 @@
             b'{"detail": {"response": "Error", "cause": "Reason"}}'
         )
 
-        error_msg = client._extract_error_message(mock_response)
+        error_msg = client._extract_error_message(  # pylint: disable=protected-access
+            mock_response
+        )
         assert "Error" in error_msg
         assert "Reason" in error_msg
 
-    def test_standard_query_formats_tool_calls(self, api_config, mocker):
+    def test_standard_query_formats_tool_calls(
+        self, api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test that standard query formats tool calls correctly."""
         mock_response = mocker.Mock()
         mock_response.status_code = 200
@@ -325,7 +324,9 @@
 class TestAPIClientConfiguration:
     """Additional tests for APIClient configuration and initialization."""
 
-    def test_initialization_streaming_endpoint(self, basic_api_config, mocker):
+    def test_initialization_streaming_endpoint(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test client initialization with streaming endpoint."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -336,7 +337,9 @@
         assert client.timeout == 30
         assert client.cache is None
 
-    def test_initialization_with_cache(self, tmp_path, mocker):
+    def test_initialization_with_cache(
+        self, tmp_path: Path, mocker: MockerFixture
+    ) -> None:
         """Test client initialization with cache enabled."""
         config = APIConfig(
             enabled=True,
@@ -357,7 +360,9 @@
         assert client.cache is not None
         mock_cache.assert_called_once_with(str(tmp_path / "test_cache"))
 
-    def test_validate_endpoint_type_valid(self, basic_api_config, mocker):
+    def test_validate_endpoint_type_valid(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test validation with valid endpoint type."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -365,7 +370,9 @@
         client = APIClient(basic_api_config)
         assert client.endpoint_type == "streaming"
 
-    def test_setup_client_with_api_key(self, basic_api_config, mocker):
+    def test_setup_client_with_api_key(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test client setup includes API key from environment."""
         mocker.patch.dict("os.environ", {"API_KEY": "test_secret_key"})
         mock_client = mocker.Mock()
@@ -379,7 +386,9 @@
         # Verify headers were updated (should include Authorization header)
         assert mock_client.headers.update.call_count >= 1
 
-    def test_query_requires_initialized_client(self, basic_api_config, mocker):
+    def test_query_requires_initialized_client(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test query fails if client not initialized."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
@@ -389,33 +398,43 @@
         with pytest.raises(APIError, match="not initialized"):
             client.query("test query")
 
-    def test_prepare_request_basic(self, basic_api_config, mocker):
+    def test_prepare_request_basic(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test request preparation with basic parameters."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
 
         client = APIClient(basic_api_config)
-        request = client._prepare_request("What is Python?")
+        request = client._prepare_request(  # pylint: disable=protected-access
+            "What is Python?"
+        )
 
         assert request.query == "What is Python?"
         assert request.provider == "openai"
         assert request.model == "gpt-4"
 
-    def test_prepare_request_with_conversation_id(self, basic_api_config, mocker):
+    def test_prepare_request_with_conversation_id(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test request preparation with conversation ID."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
 
         client = APIClient(basic_api_config)
-        request = client._prepare_request("Follow-up", conversation_id="conv_123")
+        request = client._prepare_request(  # pylint: disable=protected-access
+            "Follow-up", conversation_id="conv_123"
+        )
 
         assert request.query == "Follow-up"
         assert request.conversation_id == "conv_123"
 
-    def test_prepare_request_with_attachments(self, basic_api_config, mocker):
+    def test_prepare_request_with_attachments(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test request preparation with attachments."""
         mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client")
 
         client = APIClient(basic_api_config)
-        request = client._prepare_request(
+        request = client._prepare_request(  # pylint: disable=protected-access
             "Analyze this", attachments=["file1.txt", "file2.pdf"]
         )
@@ -423,7 +442,9 @@
         # Attachments may be processed, just verify they're present in some form
         assert hasattr(request, "attachments")
 
-    def test_close_client(self, basic_api_config, mocker):
+    def test_close_client(
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test closing the HTTP client."""
         mock_http_client = mocker.Mock()
         mocker.patch(
@@ -436,7 +457,9 @@
 
         mock_http_client.close.assert_called_once()
 
-    def test_get_cache_key_generates_consistent_hash(self, tmp_path, mocker):
+    def test_get_cache_key_generates_consistent_hash(
+        self, tmp_path: Path, mocker: MockerFixture
+    ) -> None:
         """Test cache key generation is consistent for same request."""
         config = APIConfig(
             enabled=True,
@@ -455,11 +478,15 @@
         client = APIClient(config)
 
         # Create identical requests
-        request1 = client._prepare_request("test query")
-        request2 = client._prepare_request("test query")
+        request1 = client._prepare_request(  # pylint: disable=protected-access
+            "test query"
+        )
+        request2 = client._prepare_request(  # pylint: disable=protected-access
+            "test query"
+        )
 
-        key1 = client._get_cache_key(request1)
-        key2 = client._get_cache_key(request2)
+        key1 = client._get_cache_key(request1)  # pylint: disable=protected-access
+        key2 = client._get_cache_key(request2)  # pylint: disable=protected-access
 
         # Same request should generate same cache key
         assert key1 == key2
@@ -467,8 +494,8 @@
         assert len(key1) > 0
 
     def test_client_initialization_sets_content_type_header(
-        self, basic_api_config, mocker
-    ):
+        self, basic_api_config: APIConfig, mocker: MockerFixture
+    ) -> None:
         """Test client initialization sets Content-Type header."""
         mock_client = mocker.Mock()
         mocker.patch(
@@ -485,7 +512,7 @@
             for call in calls
         )
 
-    def test_standard_endpoint_initialization(self, mocker):
+    def test_standard_endpoint_initialization(self, mocker: MockerFixture) -> None:
         """Test initialization with standard (non-streaming) endpoint."""
         config = APIConfig(
             enabled=True,
diff --git a/tests/unit/core/api/test_streaming_parser.py b/tests/unit/core/api/test_streaming_parser.py
index f78dfae9..3c20d5a6 100644
--- a/tests/unit/core/api/test_streaming_parser.py
+++ b/tests/unit/core/api/test_streaming_parser.py
@@ -1,5 +1,6 @@
 """Unit tests for streaming parser."""
 
+from typing import Any
 import pytest
 
 from lightspeed_evaluation.core.api.streaming_parser import (
@@ -10,17 +11,10 @@
 )
 
 
-@pytest.fixture
-def mock_response(mocker):
-    """Create a mock streaming response."""
-    response = mocker.Mock()
-    return response
-
-
 class TestParseStreamingResponse:
     """Unit tests for parse_streaming_response."""
 
-    def test_parse_complete_response(self, mock_response):
+    def test_parse_complete_response(self, mock_response: Any) -> None:
         """Test parsing a complete streaming response."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -38,11 +32,16 @@ def test_parse_complete_response(self, mock_response):
         assert "streaming_duration" in result
         assert "tokens_per_second" in result
 
-    def test_parse_response_with_tool_calls(self, mock_response):
+    def test_parse_response_with_tool_calls(self, mock_response: Any) -> None:
         """Test parsing response with tool calls."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_456"}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "search", "arguments": {"query": "test"}}}}',
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "search", "arguments": {"query": "test"}}}'
+                "}"
+            ),
             'data: {"event": "turn_complete", "data": {"token": "Final response"}}',
         ]
         mock_response.iter_lines.return_value = lines
@@ -54,7 +53,7 @@ def test_parse_response_with_tool_calls(self, mock_response):
         assert len(result["tool_calls"]) == 1
         assert result["tool_calls"][0][0]["tool_name"] == "search"
 
-    def test_parse_response_missing_final_response(self, mock_response):
+    def test_parse_response_missing_final_response(self, mock_response: Any) -> None:
         """Test parsing fails when final response is missing."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_789"}}',
@@ -64,7 +63,7 @@
         with pytest.raises(ValueError, match="No final response found"):
             parse_streaming_response(mock_response)
 
-    def test_parse_response_missing_conversation_id(self, mock_response):
+    def test_parse_response_missing_conversation_id(self, mock_response: Any) -> None:
         """Test parsing fails when conversation ID is missing."""
         lines = [
             'data: {"event": "turn_complete", "data": {"token": "Response"}}',
@@ -74,7 +73,7 @@
         with pytest.raises(ValueError, match="No Conversation ID found"):
             parse_streaming_response(mock_response)
 
-    def test_parse_response_with_error_event(self, mock_response):
+    def test_parse_response_with_error_event(self, mock_response: Any) -> None:
         """Test parsing handles error events."""
         lines = [
             'data: {"event": "error", "data": {"token": "API Error occurred"}}',
@@ -84,7 +83,7 @@
         with pytest.raises(ValueError, match="Streaming API error: API Error occurred"):
             parse_streaming_response(mock_response)
 
-    def test_parse_response_skips_empty_lines(self, mock_response):
+    def test_parse_response_skips_empty_lines(self, mock_response: Any) -> None:
         """Test parser skips empty lines."""
         lines = [
             "",
@@ -100,7 +99,7 @@
         assert result["response"] == "Response"
         assert result["conversation_id"] == "conv_123"
 
-    def test_parse_response_skips_non_data_lines(self, mock_response):
+    def test_parse_response_skips_non_data_lines(self, mock_response: Any) -> None:
         """Test parser skips lines without 'data:' prefix."""
         lines = [
             "event: start",
@@ -115,12 +114,22 @@
         assert result["response"] == "Response"
         assert result["conversation_id"] == "conv_123"
 
-    def test_parse_response_with_multiple_tool_calls(self, mock_response):
+    def test_parse_response_with_multiple_tool_calls(self, mock_response: Any) -> None:
         """Test parsing multiple tool calls."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "search", "arguments": {"q": "test"}}}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "calculate", "arguments": {"expr": "2+2"}}}}',
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "search", "arguments": {"q": "test"}}}'
+                "}"
+            ),
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "calculate", "arguments": {"expr": "2+2"}}}'
+                "}"
+            ),
             'data: {"event": "turn_complete", "data": {"token": "Done"}}',
         ]
         mock_response.iter_lines.return_value = lines
@@ -135,7 +144,7 @@
 class TestParseSSELine:
     """Unit tests for _parse_sse_line."""
 
-    def test_parse_valid_json(self):
+    def test_parse_valid_json(self) -> None:
         """Test parsing valid JSON SSE line."""
         json_data = '{"event": "start", "data": {"conversation_id": "123"}}'
@@ -146,7 +155,7 @@
         assert event == "start"
         assert data["conversation_id"] == "123"
 
-    def test_parse_invalid_json(self):
+    def test_parse_invalid_json(self) -> None:
         """Test parsing invalid JSON returns None."""
         json_data = "not valid json"
@@ -154,17 +163,17 @@
         assert result is None
 
-    def test_parse_missing_event_field(self):
+    def test_parse_missing_event_field(self) -> None:
         """Test parsing with missing event field."""
         json_data = '{"data": {"some": "data"}}'
 
         result = _parse_sse_line(json_data)
 
         assert result is not None
-        event, data = result
+        event, _ = result
         assert event == ""  # Default empty string
 
-    def test_parse_missing_data_field(self):
+    def test_parse_missing_data_field(self) -> None:
         """Test parsing with missing data field."""
         json_data = '{"event": "test"}'
@@ -179,7 +188,7 @@
 class TestParseToolCall:
     """Unit tests for _parse_tool_call."""
 
-    def test_parse_valid_tool_call(self):
+    def test_parse_valid_tool_call(self) -> None:
         """Test parsing valid tool call."""
         token = {"tool_name": "search", "arguments": {"query": "test"}}
@@ -189,7 +198,7 @@
         assert result["tool_name"] == "search"
         assert result["arguments"]["query"] == "test"
 
-    def test_parse_tool_call_missing_tool_name(self):
+    def test_parse_tool_call_missing_tool_name(self) -> None:
         """Test parsing tool call without tool_name."""
         token = {"arguments": {"query": "test"}}
@@ -197,7 +206,7 @@
         assert result is None
 
-    def test_parse_tool_call_missing_arguments(self):
+    def test_parse_tool_call_missing_arguments(self) -> None:
         """Test parsing tool call without arguments."""
         token = {"tool_name": "search"}
@@ -205,7 +214,7 @@ def test_parse_tool_call_missing_arguments(self):
         assert result is None
 
-    def test_parse_tool_call_with_empty_arguments(self):
+    def test_parse_tool_call_with_empty_arguments(self) -> None:
         """Test parsing tool call with empty arguments dict."""
         token = {"tool_name": "search", "arguments": {}}
@@ -215,11 +224,11 @@
         assert result["tool_name"] == "search"
         assert result["arguments"] == {}
 
-    def test_parse_tool_call_invalid_structure(self):
+    def test_parse_tool_call_invalid_structure(self) -> None:
         """Test parsing malformed tool call."""
         token = "not a dict"
 
-        result = _parse_tool_call(token)
+        result = _parse_tool_call(token)  # pyright: ignore[reportArgumentType]
 
         assert result is None
 
@@ -227,13 +236,13 @@
 class TestFormatToolSequences:
     """Unit tests for _format_tool_sequences."""
 
-    def test_format_empty_tool_calls(self):
+    def test_format_empty_tool_calls(self) -> None:
         """Test formatting empty tool calls list."""
         result = _format_tool_sequences([])
 
         assert result == []
 
-    def test_format_single_tool_call(self):
+    def test_format_single_tool_call(self) -> None:
         """Test formatting single tool call."""
         tool_calls = [{"tool_name": "search", "arguments": {"query": "test"}}]
@@ -243,7 +252,7 @@
         assert len(result[0]) == 1
         assert result[0][0]["tool_name"] == "search"
 
-    def test_format_multiple_tool_calls(self):
+    def test_format_multiple_tool_calls(self) -> None:
         """Test formatting multiple tool calls into sequences."""
         tool_calls = [
             {"tool_name": "search", "arguments": {"query": "test"}},
@@ -260,7 +269,7 @@
 class TestStreamingPerformanceMetrics:
     """Unit tests for streaming performance metrics (TTFT, tokens per second)."""
 
-    def test_time_to_first_token_captured(self, mock_response):
+    def test_time_to_first_token_captured(self, mock_response: Any) -> None:
         """Test that time to first token is captured on first content event."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -274,7 +283,7 @@
         assert result["time_to_first_token"] is not None
         assert result["time_to_first_token"] >= 0
 
-    def test_streaming_duration_captured(self, mock_response):
+    def test_streaming_duration_captured(self, mock_response: Any) -> None:
         """Test that streaming duration is captured."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -290,7 +299,7 @@
         # Duration should be >= TTFT
         assert result["streaming_duration"] >= result["time_to_first_token"]
 
-    def test_tokens_per_second_with_token_counts(self, mock_response):
+    def test_tokens_per_second_with_token_counts(self, mock_response: Any) -> None:
         """Test tokens per second calculation when token counts are provided."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -308,7 +317,7 @@
         assert result["tokens_per_second"] is not None
         assert result["tokens_per_second"] > 0
 
-    def test_tokens_per_second_without_token_counts(self, mock_response):
+    def test_tokens_per_second_without_token_counts(self, mock_response: Any) -> None:
         """Test tokens per second is None when no output tokens."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -322,7 +331,7 @@ def test_tokens_per_second_without_token_counts(self, mock_response):
         assert result["output_tokens"] == 0
         assert result["tokens_per_second"] is None
 
-    def test_ttft_captured_on_token_event(self, mock_response):
+    def test_ttft_captured_on_token_event(self, mock_response: Any) -> None:
         """Test TTFT is captured on first token event (not just turn_complete)."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
@@ -337,11 +346,16 @@
         assert result["time_to_first_token"] is not None
         assert result["time_to_first_token"] >= 0
 
-    def test_ttft_captured_on_tool_call_event(self, mock_response):
+    def test_ttft_captured_on_tool_call_event(self, mock_response: Any) -> None:
         """Test TTFT is captured on tool_call event."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_123"}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "search", "arguments": {}}}}',
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "search", "arguments": {}}}'
+                "}"
+            ),
            'data: {"event": "turn_complete", "data": {"token": "Final response"}}',
         ]
         mock_response.iter_lines.return_value = lines
@@ -352,12 +366,17 @@
         assert result["time_to_first_token"] is not None
         assert result["time_to_first_token"] >= 0
 
-    def test_performance_metrics_with_complete_flow(self, mock_response):
+    def test_performance_metrics_with_complete_flow(self, mock_response: Any) -> None:
         """Test complete streaming flow with all performance metrics."""
         lines = [
             'data: {"event": "start", "data": {"conversation_id": "conv_perf_test"}}',
             'data: {"event": "token", "data": {"token": "Streaming..."}}',
-            'data: {"event": "tool_call", "data": {"token": {"tool_name": "search", "arguments": {"q": "test"}}}}',
+            (
+                "data: {"
+                '"event": "tool_call", '
+                '"data": {"token": {"tool_name": "search", "arguments": {"q": "test"}}}'
+                "}"
+            ),
             'data: {"event": "turn_complete", "data": {"token": "Complete response"}}',
             'data: {"event": "end", "data": {"input_tokens": 100, "output_tokens": 250}}',
         ]
diff --git a/tests/unit/core/config/test_models.py b/tests/unit/core/config/test_models.py
index 07d21f69..9519b7ad 100644
--- a/tests/unit/core/config/test_models.py
+++ b/tests/unit/core/config/test_models.py
@@ -1,6 +1,7 @@
 """Unit tests for core.config.models module."""
 
 import pytest
+from pydantic import ValidationError
 from lightspeed_evaluation.core.models import (
     CoreConfig,
     EvaluationData,
@@ -10,13 +11,12 @@
     SystemConfig,
     TurnData,
 )
-from pydantic import ValidationError
 
 
 class TestTurnData:
     """Unit tests for TurnData model."""
 
-    def test_valid_turn_data_creation(self):
+    def test_valid_turn_data_creation(self) -> None:
         """Test creating valid TurnData instance."""
         turn = TurnData(
             turn_id="1",
@@ -31,27 +31,32 @@
         assert turn.response == "Python is a programming language."
         assert turn.contexts is not None
         assert len(turn.contexts) == 1
-        assert turn.contexts[0] == "Python context"
+        assert (
+            turn.contexts[0]  # pylint: disable=unsubscriptable-object
+            == "Python context"
+        )
         assert turn.expected_response == "Python is a high-level language."
 
-    def test_turn_data_invalid_empty_query(self):
+    def test_turn_data_invalid_empty_query(self) -> None:
         """Test validation error for empty query."""
         with pytest.raises(
             ValidationError, match="String should have at least 1 character"
         ):
             TurnData(turn_id="1", query="", response="Valid response")
 
-    def test_turn_data_invalid_context_missing_content(self):
+    def test_turn_data_invalid_context_missing_content(self) -> None:
         """Test validation error for non-string context."""
         with pytest.raises(ValidationError, match="Input should be a valid string"):
             TurnData(
                 turn_id="1",
                 query="Valid query",
                 response="Valid response",
-                contexts=[{"title": "No content field"}],
+                contexts=[
+                    {"title": "No content field"}
+                ],  # pyright: ignore[reportArgumentType]
             )
 
-    def test_turn_data_multiple_contexts(self):
+    def test_turn_data_multiple_contexts(self) -> None:
         """Test TurnData with multiple valid contexts."""
         contexts = [
             "First context",
@@ -61,17 +66,26 @@
         turn = TurnData(
             turn_id="1", query="Test query", response="Test response", contexts=contexts
         )
-
+        assert turn.contexts is not None
         assert len(turn.contexts) == 3
-        assert turn.contexts[0] == "First context"
-        assert turn.contexts[1] == "Second context"
-        assert turn.contexts[2] == "Third context"
+        assert (
+            turn.contexts[0]  # pylint: disable=unsubscriptable-object
+            == "First context"
+        )
+        assert (
+            turn.contexts[1]  # pylint: disable=unsubscriptable-object
+            == "Second context"
+        )
+        assert (
+            turn.contexts[2]  # pylint: disable=unsubscriptable-object
+            == "Third context"
+        )
 
 
 class TestEvaluationData:
     """Unit tests for EvaluationData model."""
 
-    def test_valid_evaluation_data_creation(self):
+    def test_valid_evaluation_data_creation(self) -> None:
         """Test creating valid EvaluationData instance."""
         turn = TurnData(
             turn_id="1",
@@ -92,7 +106,7 @@
         assert len(eval_data.turns) == 1
         assert eval_data.turns[0].turn_metrics == ["ragas:faithfulness"]
 
-    def test_evaluation_data_with_minimal_fields(self):
+    def test_evaluation_data_with_minimal_fields(self) -> None:
         """Test EvaluationData with only required fields."""
         turn = TurnData(turn_id="1", query="Test query", response="Test response")
         eval_data = EvaluationData(conversation_group_id="test_conv", turns=[turn])
@@ -103,7 +117,7 @@
         assert len(eval_data.turns) == 1
         assert eval_data.turns[0].turn_metrics is None
 
-    def test_evaluation_data_invalid_empty_conversation_id(self):
+    def test_evaluation_data_invalid_empty_conversation_id(self) -> None:
         """Test validation error for empty conversation_group_id."""
         turn = TurnData(turn_id="1", query="Test query", response="Test response")
         with pytest.raises(
@@ -111,7 +125,7 @@
         ):
             EvaluationData(conversation_group_id="", turns=[turn])
 
-    def test_evaluation_data_invalid_metric_format_missing_colon(self):
+    def test_evaluation_data_invalid_metric_format_missing_colon(self) -> None:
         """Test validation error for metric without colon."""
         with pytest.raises(
             ValidationError, match='must be in format "framework:metric_name"'
@@ -123,7 +137,7 @@
                 turn_metrics=["invalid_metric"],
             )
 
-    def test_evaluation_data_with_metadata(self):
+    def test_evaluation_data_with_metadata(self) -> None:
         """Test EvaluationData with metadata fields."""
         turn = TurnData(
             turn_id="1",
@@ -150,7 +164,7 @@ def test_evaluation_data_with_metadata(self):
 class TestLLMConfig:
     """Unit tests for LLMConfig model."""
 
-    def test_valid_llm_config_creation(self):
+    def test_valid_llm_config_creation(self) -> None:
         """Test creating valid LLMConfig instance."""
         config = LLMConfig(
             provider="openai",
@@ -168,7 +182,7 @@
         assert config.timeout == 60
         assert config.num_retries == 3
 
-    def test_llm_config_with_defaults(self):
+    def test_llm_config_with_defaults(self) -> None:
         """Test LLMConfig with default values."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -183,7 +197,7 @@
 class TestSystemConfig:
     """Unit tests for SystemConfig model."""
 
-    def test_valid_system_config_creation(self):
+    def test_valid_system_config_creation(self) -> None:
         """Test creating valid SystemConfig instance."""
         config = SystemConfig(
             core=CoreConfig(max_threads=42),
@@ -200,7 +214,7 @@
         assert config.output.enabled_outputs == ["json"]
         assert config.core.max_threads == 42
 
-    def test_system_config_with_defaults(self):
+    def test_system_config_with_defaults(self) -> None:
         """Test SystemConfig with default values."""
         config = SystemConfig()
@@ -211,7 +225,7 @@
         assert "csv" in config.output.enabled_outputs
         assert config.core.max_threads is None
 
-    def test_system_config_logging_defaults(self):
+    def test_system_config_logging_defaults(self) -> None:
         """Test SystemConfig logging configuration defaults."""
         config = SystemConfig()
@@ -224,7 +238,7 @@
 class TestEvaluationResult:
     """Unit tests for EvaluationResult model."""
 
-    def test_valid_evaluation_result_creation(self):
+    def test_valid_evaluation_result_creation(self) -> None:
         """Test creating valid EvaluationResult instance."""
         result = EvaluationResult(
             conversation_group_id="test_conv",
@@ -242,7 +256,7 @@
         assert result.score == 0.85
         assert result.reason == "High faithfulness score"
 
-    def test_evaluation_result_conversation_level(self):
+    def test_evaluation_result_conversation_level(self) -> None:
         """Test EvaluationResult for conversation-level metric."""
         result = EvaluationResult(
             conversation_group_id="test_conv",
@@ -257,7 +271,7 @@
         assert result.metric_identifier == "deepeval:conversation_completeness"
         assert result.score == 0.92
 
-    def test_evaluation_result_validation_invalid_result(self):
+    def test_evaluation_result_validation_invalid_result(self) -> None:
         """Test EvaluationResult validation with invalid result."""
         with pytest.raises(ValidationError, match="Result must be one of"):
             EvaluationResult(
@@ -268,7 +282,7 @@
                 score=0.5,
             )
 
-    def test_evaluation_result_validation_invalid_score(self):
+    def test_evaluation_result_validation_invalid_score(self) -> None:
         """Test EvaluationResult validation with invalid score."""
         with pytest.raises(ValidationError, match="less than or equal to 1"):
             EvaluationResult(
diff --git a/tests/unit/core/llm/conftest.py b/tests/unit/core/llm/conftest.py
new file mode 100644
index 00000000..461496de
--- /dev/null
+++ b/tests/unit/core/llm/conftest.py
@@ -0,0 +1,29 @@
+"""Pytest configuration and fixtures for llm tests."""
+
+import pytest
+
+from lightspeed_evaluation.core.models import LLMConfig
+
+
+@pytest.fixture
+def llm_params() -> dict:
+    """Create sample LLM parameters."""
+    return {
+        "temperature": 0.5,
+        "max_completion_tokens": 1024,
+        "timeout": 120,
+        "num_retries": 5,
+    }
+
+
+@pytest.fixture
+def basic_llm_config() -> LLMConfig:
+    """Create basic LLM configuration."""
+    return LLMConfig(
+        provider="openai",
+        model="gpt-4",
+        temperature=0.0,
+        max_tokens=512,
+        timeout=60,
+        num_retries=3,
+    )
diff --git a/tests/unit/core/llm/test_custom.py b/tests/unit/core/llm/test_custom.py
index 19801e36..bbd9d3ca 100644
--- a/tests/unit/core/llm/test_custom.py
+++ b/tests/unit/core/llm/test_custom.py
@@ -1,15 +1,16 @@
 """Unit tests for custom LLM classes."""
 
 import pytest
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.llm.custom import BaseCustomLLM, TokenTracker
 from lightspeed_evaluation.core.system.exceptions import LLMError
 
 
-class TestTokenTracker:
+class TestTokenTracker:  # pylint: disable=too-few-public-methods
     """Tests for TokenTracker."""
 
-    def test_token_callback_accumulates_tokens(self, mocker):
+    def test_token_callback_accumulates_tokens(self, mocker: MockerFixture) -> None:
         """Test that token callback accumulates token counts."""
         tracker = TokenTracker()
@@ -19,7 +20,9 @@
         mock_response.usage.prompt_tokens = 10
         mock_response.usage.completion_tokens = 20
 
-        tracker._token_callback({}, mock_response, 0.0, 0.0)
+        tracker._token_callback(  # pylint: disable=protected-access
+            {}, mock_response, 0.0, 0.0
+        )
 
         input_tokens, output_tokens = tracker.get_counts()
         assert input_tokens == 10
@@ -29,7 +32,7 @@
 class TestBaseCustomLLM:
     """Tests for BaseCustomLLM."""
 
-    def test_setup_ssl_verify_enabled(self, mocker):
+    def test_setup_ssl_verify_enabled(self, mocker: MockerFixture) -> None:
         """Test SSL verification enabled by default."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {"SSL_CERTIFI_BUNDLE": "/path/to/bundle.pem"})
@@ -38,7 +41,7 @@
         assert mock_litellm.ssl_verify == "/path/to/bundle.pem"
 
-    def test_setup_ssl_verify_disabled(self, mocker):
+    def test_setup_ssl_verify_disabled(self, mocker: MockerFixture) -> None:
         """Test SSL verification can be disabled."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {})
@@ -47,7 +50,7 @@
         assert mock_litellm.ssl_verify is False
 
-    def test_call_returns_single_response(self, mocker):
+    def test_call_returns_single_response(self, mocker: MockerFixture) -> None:
         """Test call returns single string when n=1."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {})
@@ -64,7 +67,7 @@
         assert result == "Test response"
 
-    def test_call_with_temperature_override(self, mocker):
+    def test_call_with_temperature_override(self, mocker: MockerFixture) -> None:
         """Test call with temperature override."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {})
@@ -81,7 +84,7 @@
         call_args = mock_litellm.completion.call_args[1]
         assert call_args["temperature"] == 0.9
 
-    def test_call_raises_llm_error_on_failure(self, mocker):
+    def test_call_raises_llm_error_on_failure(self, mocker: MockerFixture) -> None:
         """Test call raises LLMError on failure."""
         mock_litellm = mocker.patch("lightspeed_evaluation.core.llm.custom.litellm")
         mocker.patch.dict("os.environ", {})
diff --git a/tests/unit/core/llm/test_deepeval_manager.py b/tests/unit/core/llm/test_deepeval_manager.py
index 367d7380..9ff27e41 100644
--- a/tests/unit/core/llm/test_deepeval_manager.py
+++ b/tests/unit/core/llm/test_deepeval_manager.py
@@ -1,25 +1,15 @@
 """Unit tests for DeepEval LLM Manager."""
 
 import pytest
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.llm.deepeval import DeepEvalLLMManager
 
 
-@pytest.fixture
-def llm_params():
-    """Create sample LLM parameters."""
-    return {
-        "temperature": 0.5,
-        "max_completion_tokens": 1024,
-        "timeout": 120,
-        "num_retries": 5,
-    }
-
-
 class TestDeepEvalLLMManager:
     """Tests for DeepEvalLLMManager."""
 
-    def test_initialization(self, llm_params, mocker):
+    def test_initialization(self, llm_params: dict, mocker: MockerFixture) -> None:
         """Test manager initialization."""
         mock_model = mocker.patch(
             "lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
         )
@@ -31,7 +21,9 @@
         assert manager.llm_params == llm_params
         mock_model.assert_called_once()
 
-    def test_initialization_with_default_temperature(self, mocker):
+    def test_initialization_with_default_temperature(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test initialization with default temperature."""
         mock_model = mocker.patch(
             "lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
         )
@@ -44,7 +36,9 @@
         call_kwargs = mock_model.call_args.kwargs
         assert call_kwargs["temperature"] == 0.0
 
-    def test_initialization_with_default_num_retries(self, mocker):
+    def test_initialization_with_default_num_retries(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test initialization with default num_retries."""
         mock_model = mocker.patch(
             "lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
         )
@@ -57,7 +51,7 @@
         call_kwargs = mock_model.call_args.kwargs
         assert call_kwargs["num_retries"] == 3
 
-    def test_get_llm(self, llm_params, mocker):
+    def test_get_llm(self, llm_params: dict, mocker: MockerFixture) -> None:
         """Test get_llm method."""
         mock_model_instance = mocker.Mock()
         mocker.patch(
@@ -70,7 +64,7 @@
         assert llm == mock_model_instance
 
-    def test_get_model_info(self, llm_params, mocker):
+    def test_get_model_info(self, llm_params: dict, mocker: MockerFixture) -> None:
         """Test get_model_info method."""
         mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")
@@ -83,7 +77,9 @@
         assert info["timeout"] == 120
         assert info["num_retries"] == 5
 
-    def test_initialization_prints_message(self, llm_params, mocker, capsys):
+    def test_initialization_prints_message(
+        self, llm_params: dict, mocker: MockerFixture, capsys: pytest.CaptureFixture
+    ) -> None:
         """Test that initialization prints configuration message."""
         mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")
diff --git a/tests/unit/core/llm/test_llm_manager.py b/tests/unit/core/llm/test_llm_manager.py
index f4fc77d0..22810c5d 100644
--- a/tests/unit/core/llm/test_llm_manager.py
+++ b/tests/unit/core/llm/test_llm_manager.py
@@ -1,28 +1,18 @@
 """Unit tests for LLM Manager."""
 
 import pytest
+from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.models import LLMConfig, SystemConfig
 from lightspeed_evaluation.core.llm.manager import LLMManager
 
 
-@pytest.fixture
-def basic_llm_config():
-    """Create basic LLM configuration."""
-    return LLMConfig(
-        provider="openai",
-        model="gpt-4",
-        temperature=0.0,
-        max_tokens=512,
-        timeout=60,
-        num_retries=3,
-    )
-
-
 class TestLLMManager:
     """Tests for LLMManager."""
 
-    def test_initialization_openai(self, basic_llm_config, mocker):
+    def test_initialization_openai(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test initialization with OpenAI provider."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -31,7 +21,7 @@
         assert manager.model_name == "gpt-4"
         assert manager.config.provider == "openai"
 
-    def test_initialization_azure(self, mocker):
+    def test_initialization_azure(self, mocker: MockerFixture) -> None:
         """Test initialization with Azure provider."""
         config = LLMConfig(
             provider="azure",
@@ -45,7 +35,7 @@
         assert "azure" in manager.model_name
 
-    def test_initialization_azure_with_deployment(self, mocker):
+    def test_initialization_azure_with_deployment(self, mocker: MockerFixture) -> None:
         """Test initialization with Azure deployment name."""
         config = LLMConfig(
             provider="azure",
@@ -59,7 +49,7 @@
         assert manager.model_name == "azure/my-deployment"
 
-    def test_initialization_watsonx(self, mocker):
+    def test_initialization_watsonx(self, mocker: MockerFixture) -> None:
         """Test initialization with WatsonX provider."""
         config = LLMConfig(
             provider="watsonx",
@@ -72,7 +62,7 @@
         assert manager.model_name == "watsonx/ibm/granite-13b"
 
-    def test_initialization_anthropic(self, mocker):
+    def test_initialization_anthropic(self, mocker: MockerFixture) -> None:
         """Test initialization with Anthropic provider."""
         config = LLMConfig(
             provider="anthropic",
@@ -85,7 +75,7 @@
         assert manager.model_name == "anthropic/claude-3-opus"
 
-    def test_initialization_gemini(self, mocker):
+    def test_initialization_gemini(self, mocker: MockerFixture) -> None:
         """Test initialization with Gemini provider."""
         config = LLMConfig(
             provider="gemini",
@@ -98,7 +88,7 @@
         assert manager.model_name == "gemini/gemini-pro"
 
-    def test_initialization_vertex(self, mocker):
+    def test_initialization_vertex(self, mocker: MockerFixture) -> None:
         """Test initialization with Vertex AI provider."""
         config = LLMConfig(
             provider="vertex",
@@ -111,7 +101,7 @@
         assert manager.model_name == "gemini-pro"
 
-    def test_initialization_ollama(self, mocker):
+    def test_initialization_ollama(self, mocker: MockerFixture) -> None:
         """Test initialization with Ollama provider."""
         config = LLMConfig(
             provider="ollama",
@@ -124,7 +114,7 @@
         assert manager.model_name == "ollama/llama2"
 
-    def test_initialization_hosted_vllm(self, mocker):
+    def test_initialization_hosted_vllm(self, mocker: MockerFixture) -> None:
         """Test initialization with hosted vLLM provider."""
         config = LLMConfig(
             provider="hosted_vllm",
@@ -137,7 +127,9 @@
         assert manager.model_name == "hosted_vllm/mistral-7b"
 
-    def test_initialization_generic_provider(self, basic_llm_config, mocker, capsys):
+    def test_initialization_generic_provider(
+        self, mocker: MockerFixture, capsys: pytest.CaptureFixture
+    ) -> None:
         """Test initialization with unknown/generic provider."""
         config = LLMConfig(
             provider="custom_provider",
@@ -155,7 +147,9 @@
         captured = capsys.readouterr()
         assert "generic" in captured.out.lower() or "warning" in captured.out.lower()
 
-    def test_get_model_name(self, basic_llm_config, mocker):
+    def test_get_model_name(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test get_model_name method."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -163,7 +157,9 @@
         manager = LLMManager(basic_llm_config)
 
         assert manager.get_model_name() == "gpt-4"
 
-    def test_get_llm_params(self, basic_llm_config, mocker):
+    def test_get_llm_params(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test get_llm_params method."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -176,7 +172,9 @@
         assert params["timeout"] == 60
         assert params["num_retries"] == 3
 
-    def test_get_config(self, basic_llm_config, mocker):
+    def test_get_config(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test get_config method."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -187,7 +185,7 @@
         assert config.provider == "openai"
         assert config.model == "gpt-4"
 
-    def test_from_system_config(self, mocker):
+    def test_from_system_config(self, mocker: MockerFixture) -> None:
         """Test creating manager from SystemConfig."""
         system_config = SystemConfig()
         system_config.llm = LLMConfig(
@@ -203,7 +201,9 @@
         assert manager.config.model == "gpt-3.5-turbo"
         assert manager.config.temperature == 0.5
 
-    def test_from_llm_config(self, basic_llm_config, mocker):
+    def test_from_llm_config(
+        self, basic_llm_config: LLMConfig, mocker: MockerFixture
+    ) -> None:
         """Test creating manager from LLMConfig."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
@@ -211,7 +211,7 @@
         assert manager.config == basic_llm_config
 
-    def test_llm_params_with_custom_values(self, mocker):
+    def test_llm_params_with_custom_values(self, mocker: MockerFixture) -> None:
         """Test LLM params with custom configuration values."""
         config = LLMConfig(
             provider="openai",
@@ -231,7 +231,12 @@
         assert params["timeout"] == 120
         assert params["num_retries"] == 5
 
-    def test_initialization_prints_message(self, basic_llm_config, mocker, capsys):
+    def test_initialization_prints_message(
+        self,
+        basic_llm_config: LLMConfig,
+        mocker: MockerFixture,
+        capsys: pytest.CaptureFixture,
+    ) -> None:
         """Test that initialization prints configuration message."""
         mocker.patch("lightspeed_evaluation.core.llm.manager.validate_provider_env")
diff --git a/tests/unit/core/llm/test_manager.py b/tests/unit/core/llm/test_manager.py
index b3164148..9d83a962 100644
--- a/tests/unit/core/llm/test_manager.py
+++ b/tests/unit/core/llm/test_manager.py
@@ -12,12 +12,12 @@
 class TestLLMError:
     """Unit tests for LLMError exception."""
 
-    def test_llm_error_creation(self):
+    def test_llm_error_creation(self) -> None:
         """Test creating LLMError exception."""
         error = LLMError("Test error message")
         assert str(error) == "Test error message"
 
-    def test_llm_error_inheritance(self):
+    def test_llm_error_inheritance(self) -> None:
         """Test that LLMError inherits from Exception."""
         error = LLMError("Test error")
         assert isinstance(error, Exception)
@@ -26,7 +26,7 @@
 class TestLLMManager:
     """Unit tests for LLMManager class."""
 
-    def test_llm_manager_initialization_openai(self, mocker: MockerFixture):
+    def test_llm_manager_initialization_openai(self, mocker: MockerFixture) -> None:
         """Test LLMManager initialization with OpenAI provider."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -38,7 +38,9 @@
         assert manager.model_name == "gpt-4"
         mock_print.assert_called_with("✅ LLM Manager: openai/gpt-4 -> gpt-4")
 
-    def test_llm_manager_initialization_generic_provider(self, mocker: MockerFixture):
+    def test_llm_manager_initialization_generic_provider(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test LLMManager initialization with unknown/generic provider."""
         config = LLMConfig(provider="custom", model="custom-model")
@@ -48,7 +50,7 @@
         assert manager.model_name == "custom/custom-model"
         mock_print.assert_any_call("⚠️ Using generic provider format for custom")
 
-    def test_llm_manager_openai_missing_api_key(self, mocker: MockerFixture):
+    def test_llm_manager_openai_missing_api_key(self, mocker: MockerFixture) -> None:
         """Test LLMManager with OpenAI provider but missing API key."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -58,7 +60,7 @@
         ):
             LLMManager(config)
 
-    def test_get_model_name(self, mocker: MockerFixture):
+    def test_get_model_name(self, mocker: MockerFixture) -> None:
         """Test get_model_name method."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -66,7 +68,7 @@
         manager = LLMManager(config)
         assert manager.get_model_name() == "gpt-4"
 
-    def test_get_llm_params(self, mocker: MockerFixture):
+    def test_get_llm_params(self, mocker: MockerFixture) -> None:
         """Test get_llm_params method."""
         config = LLMConfig(
             provider="openai",
@@ -91,7 +93,7 @@
         }
         assert params == expected
 
-    def test_get_llm_params_with_ssl_verify_false(self, mocker: MockerFixture):
+    def test_get_llm_params_with_ssl_verify_false(self, mocker: MockerFixture) -> None:
         """Test get_llm_params method with ssl_verify set to False."""
         config = LLMConfig(
             provider="openai",
@@ -117,7 +119,7 @@
         }
         assert params == expected
 
-    def test_get_config(self, mocker: MockerFixture):
+    def test_get_config(self, mocker: MockerFixture) -> None:
         """Test get_config method."""
         config = LLMConfig(provider="openai", model="gpt-4")
@@ -125,7 +127,7 @@
         manager = LLMManager(config)
         assert manager.get_config() == config
 
-    def test_from_system_config(self, mocker: MockerFixture):
+    def test_from_system_config(self, mocker: MockerFixture) -> None:
         """Test from_system_config class method."""
         system_config = SystemConfig.model_validate(
             {
@@ -146,7 +148,7 @@
         assert manager.config.temperature == 0.5
         assert manager.config.max_tokens == 2000
 
-    def test_provider_case_insensitive(self, mocker: MockerFixture):
+    def test_provider_case_insensitive(self, mocker: MockerFixture) -> None:
         """Test that provider names are handled case-insensitively."""
         config = LLMConfig(provider="OpenAI", model="gpt-4")
@@ -154,7 +156,7 @@
         manager = LLMManager(config)
         assert manager.model_name == "gpt-4"
 
-    def test_multiple_providers_in_sequence(self, mocker: MockerFixture):
+    def test_multiple_providers_in_sequence(self, mocker: MockerFixture) -> None:
         """Test creating managers for different providers in sequence."""
         providers_data = [
             ("openai", "gpt-4", {"OPENAI_API_KEY": "test-key"}, "gpt-4"),
diff --git a/tests/unit/core/metrics/conftest.py b/tests/unit/core/metrics/conftest.py
new file mode 100644
index 00000000..6938d5ff
--- /dev/null
+++ b/tests/unit/core/metrics/conftest.py
@@ -0,0 +1,142 @@
+"""Pytest configuration and fixtures for metrics tests."""
+
+import sys
+
+import pytest
+from pytest_mock import MockerFixture
+
+from lightspeed_evaluation.core.metrics.nlp import NLPMetrics
+from lightspeed_evaluation.core.models import EvaluationScope, TurnData, SystemConfig
+
+
+@pytest.fixture
+def system_config() -> SystemConfig:
+    """Create a test system config with metrics metadata."""
+    config = SystemConfig()
+
+    # Set up test metrics metadata
+    config.default_turn_metrics_metadata = {
+        "ragas:faithfulness": {
+            "threshold": 0.7,
+            "default": True,
+            "description": "Test",
+        },
+        "ragas:response_relevancy": {
+            "threshold": 0.8,
+            "default": False,
+            "description": "Test",
+        },
+        "custom:answer_correctness": {
+            "threshold": 0.75,
+            "default": True,
+            "description": "Test",
+        },
+    }
+
+    config.default_conversation_metrics_metadata = {
+        "deepeval:conversation_completeness": {
+            "threshold": 0.6,
+            "default": True,
+            "description": "Test",
+        },
+        "deepeval:conversation_relevancy": {
+            "threshold": 0.7,
+            "default": False,
+            "description": "Test",
+        },
+    }
+
+    return config
+
+
+@pytest.fixture
+def nlp_metrics() -> NLPMetrics:
+    """Create NLPMetrics instance."""
+    return NLPMetrics()
+
+
+@pytest.fixture
+def sample_turn_data() -> TurnData:
+    """Create sample TurnData for testing."""
+    return TurnData(
+        turn_id="test_turn",
+        query="What is the capital of France?",
+        response="The capital of France is Paris.",
+        expected_response="The capital of France is Paris.",
+    )
+
+
+@pytest.fixture
+def sample_scope(  # pylint: disable=redefined-outer-name
+    sample_turn_data: TurnData,
+) -> EvaluationScope:
+    """Create sample EvaluationScope for turn-level evaluation."""
+    return EvaluationScope(
+        turn_idx=0,
+        turn_data=sample_turn_data,
+        is_conversation=False,
+    )
+
+
+@pytest.fixture
+def conversation_scope(  # pylint: disable=redefined-outer-name
+    sample_turn_data: TurnData,
+) -> EvaluationScope:
+    """Create sample EvaluationScope for conversation-level evaluation."""
+    return EvaluationScope(
+        turn_idx=0,
+        turn_data=sample_turn_data,
+        is_conversation=True,
+    )
+
+
+@pytest.fixture
+def mock_bleu_scorer(mocker: MockerFixture) -> MockerFixture:
+    """Mock sacrebleu BLEU with configurable return value.
+
+    Uses sys.modules injection to mock sacrebleu without requiring it to be installed.
+    """
+    mock_result = mocker.MagicMock()
+    mock_result.score = 85.0  # sacrebleu returns 0-100 scale
+
+    mock_scorer_instance = mocker.MagicMock()
+    mock_scorer_instance.corpus_score = mocker.MagicMock(return_value=mock_result)
+
+    mock_bleu_class = mocker.MagicMock(return_value=mock_scorer_instance)
+
+    # Create a fake sacrebleu module and inject it into sys.modules
+    mock_sacrebleu = mocker.MagicMock()
+    mock_sacrebleu.BLEU = mock_bleu_class
+    mocker.patch.dict(sys.modules, {"sacrebleu": mock_sacrebleu})
+
+    return mock_scorer_instance
+
+
+@pytest.fixture
+def mock_rouge_scorer(mocker: MockerFixture) -> MockerFixture:
+    """Mock RougeScore with configurable return value.
+
+    Returns different scores for precision, recall, fmeasure.
+    """
+    mock_scorer_instance = mocker.MagicMock()
+    # Return scores for precision, recall, fmeasure (called in that order)
+    mock_scorer_instance.single_turn_score = mocker.MagicMock(
+        side_effect=[0.95, 0.89, 0.92]
+    )
+    mocker.patch(
+        "lightspeed_evaluation.core.metrics.nlp.RougeScore",
+        return_value=mock_scorer_instance,
+    )
+    return mock_scorer_instance
+
+
+@pytest.fixture
+def mock_similarity_scorer(mocker: MockerFixture) -> MockerFixture:
+    """Mock NonLLMStringSimilarity with configurable return value."""
+    mock_scorer_instance = mocker.MagicMock()
+    mock_scorer_instance.single_turn_score = mocker.MagicMock(return_value=0.78)
+    mocker.patch(
+        "lightspeed_evaluation.core.metrics.nlp.NonLLMStringSimilarity",
+        return_value=mock_scorer_instance,
+    )
+    return mock_scorer_instance
diff --git a/tests/unit/core/metrics/custom/test_custom.py b/tests/unit/core/metrics/custom/test_custom.py
index a8a0cda3..039ad6b5 100644
--- a/tests/unit/core/metrics/custom/test_custom.py
+++ b/tests/unit/core/metrics/custom/test_custom.py
@@ -1,5 +1,6 @@
 """Tests for custom metrics module."""
 
+from pytest_mock import MockerFixture
 from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
 from lightspeed_evaluation.core.metrics.manager import MetricLevel
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
@@ -8,7 +9,9 @@
 class TestCustomMetricsToolEval:
     """Test CustomMetrics tool_eval functionality."""
 
-    def test_evaluate_tool_calls_with_none_tool_calls(self, mocker):
+    def test_evaluate_tool_calls_with_none_tool_calls(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test that None tool_calls is handled correctly."""
         # Mock LLM manager
         mock_llm_manager = mocker.Mock()
@@ -33,7 +36,7 @@
         assert score == 1.0
         assert "Alternative 2 matched" in reason
 
-    def test_default_config_uses_full_ordered(self, mocker):
+    def test_default_config_uses_full_ordered(self, mocker: MockerFixture) -> None:
         """Test that default config uses full_match=True and ordered=True."""
         mock_llm_manager = mocker.Mock()
         mock_llm_manager.get_model_name.return_value = "test-model"
@@ -63,7 +66,7 @@
         assert "full" in reason
         assert "ordered" in reason
 
-    def test_config_ordered_false_from_metadata(self, mocker):
+    def test_config_ordered_false_from_metadata(self, mocker: MockerFixture) -> None:
         """Test that ordered=False is read from turn_metrics_metadata."""
         mock_llm_manager = mocker.Mock()
         mock_llm_manager.get_model_name.return_value = "test-model"
@@ -93,7 +96,7 @@
         assert score == 1.0
         assert "unordered" in reason
 
-    def test_config_match_partial_from_metadata(self, mocker):
+    def test_config_match_partial_from_metadata(self, mocker: MockerFixture) -> None:
         """Test that full_match=False is read from turn_metrics_metadata."""
         mock_llm_manager = mocker.Mock()
         mock_llm_manager.get_model_name.return_value = "test-model"
@@ -121,7 +124,9 @@
         assert "partial" in reason
         assert "1/1 matched" in reason
 
-    def test_config_from_system_defaults_via_metric_manager(self, mocker):
+    def test_config_from_system_defaults_via_metric_manager(
+        self, mocker: MockerFixture
+    ) -> None:
         """Test that config is read from system.yaml via MetricManager."""
         mock_llm_manager = mocker.Mock()
         mock_llm_manager.get_model_name.return_value = "test-model"
diff --git a/tests/unit/core/metrics/custom/test_tool_eval.py b/tests/unit/core/metrics/custom/test_tool_eval.py
index ad199c6d..7dae5e4c 100644
--- a/tests/unit/core/metrics/custom/test_tool_eval.py
+++ b/tests/unit/core/metrics/custom/test_tool_eval.py
@@ -12,7 +12,7 @@
 class TestEvaluateToolCalls:
     """Test cases for evaluate_tool_calls function."""
 
-    def test_primary_pattern_match(self):
+    def test_primary_pattern_match(self) -> None:
         """Test successful match with primary pattern."""
         expected = [
             [  # Primary pattern
@@ -27,7 +27,7 @@
         assert "Primary pattern matched" in details
         assert "Tool calls match expected structure and arguments" in details
 
-    def test_alternative_pattern_match(self):
+    def test_alternative_pattern_match(self) -> None:
         """Test successful match with alternative pattern."""
         expected = [
             [  # Primary pattern
@@ -45,24 +45,26 @@
         assert "Alternative 2 matched" in details
         assert "Tool calls match expected structure and arguments" in details
 
-    def test_empty_pattern_match_primary(self):
+    def test_empty_pattern_match_primary(self) -> None:
         """Test empty pattern match as primary."""
-        expected = [[]]  # Primary: no tools expected
-        actual = []
+        expected: list[list[dict]] = [[]]  # Primary: no tools expected
+        actual: list = []
 
-        success, details = evaluate_tool_calls(expected, actual)
+        success, details = evaluate_tool_calls(
+            expected, actual  # pyright: ignore[reportArgumentType]
+        )
 
         assert success is True
         assert "Primary pattern matched" in details
         assert "No tool calls made (valid alternate skip scenario)" in details
 
-    def test_empty_pattern_match_alternative(self):
+    def test_empty_pattern_match_alternative(self) -> None:
         """Test empty pattern match as alternative."""
         expected = [
             [[{"tool_name": "test_tool", "arguments": {}}]],  # Primary: some tool
             [],  # Alternative: no tools (skip scenario)
         ]
-        actual = []
+        actual: list = []
 
         success, details = evaluate_tool_calls(expected, actual)
 
@@ -70,7 +72,7 @@
         assert "Alternative 2 matched" in details
         assert "valid alternate skip scenario" in details
 
-    def test_no_pattern_match(self):
+    def test_no_pattern_match(self) -> None:
         """Test when no patterns match."""
         expected = [
             [  # Primary pattern
@@ -87,13 +89,15 @@
         assert success is False
         assert "didn't match any of the 2 expected pattern(s)" in details
 
-    def test_error_handling(self):
+    def test_error_handling(self) -> None:
         """Test error handling in evaluate_tool_calls."""
         # Invalid expected format should be handled gracefully
         expected = "invalid"  # Not a list
-        actual = []
+        actual: list = []
 
-        success, details = evaluate_tool_calls(expected, actual)
+        success, details = evaluate_tool_calls(
+            expected, actual  # pyright: ignore[reportArgumentType]
+        )
 
         assert success is False
         # The function iterates over the string characters, so we get a different error
@@ -106,7 +110,7 @@
 class TestCompareToolCalls:
     """Test cases for compare_tool_calls function."""
 
-    def test_exact_match(self):
+    def test_exact_match(self) -> None:
         """Test exact tool call match."""
         expected = [[{"tool_name": "test_tool", "arguments": {"key": "value"}}]]
         actual = [[{"tool_name": "test_tool", "arguments": {"key": "value"}}]]
@@ -115,7 +119,7 @@
         assert result["success"] is True
 
-    def test_length_mismatch(self):
+    def test_length_mismatch(self) -> None:
         """Test tool call sequence length mismatch."""
         expected = [
             [{"tool_name": "tool1", "arguments": {}}],
@@ -127,10 +131,10 @@
         assert result["success"] is False
 
-    def test_empty_sequences(self):
+    def test_empty_sequences(self) -> None:
         """Test empty tool call sequences."""
-        expected = []
-        actual = []
+        expected: list = []
+        actual: list = []
 
         result = compare_tool_calls(expected, actual)
 
@@ -140,7 +144,7 @@
 class TestCompareToolCallSequence:
     """Test cases for _compare_tool_call_sequence function."""
 
-    def test_sequence_match(self):
+    def test_sequence_match(self) -> None:
         """Test matching tool call sequence."""
         expected = [
             {"tool_name": "tool1", "arguments": {"key1": "value1"}},
@@ -155,7 +159,7 @@
         assert result is True
 
-    def test_sequence_length_mismatch(self):
+    def test_sequence_length_mismatch(self) -> None:
         """Test tool call sequence with different lengths."""
         expected = [{"tool_name": "tool1", "arguments": {}}]
         actual = [
@@ -171,7 +175,7 @@
 class TestCompareSingleToolCall:
     """Test cases for _compare_single_tool_call function."""
 
-    def test_tool_name_match(self):
+    def test_tool_name_match(self) -> None:
         """Test matching tool names and arguments."""
         expected = {"tool_name": "test_tool", "arguments": {"key": "value"}}
         actual = {"tool_name": "test_tool", "arguments": {"key": "value"}}
@@ -180,7 +184,7 @@
         assert result is True
 
-    def test_tool_name_mismatch(self):
+    def test_tool_name_mismatch(self) -> None:
         """Test mismatched tool names."""
         expected = {"tool_name": "tool1", "arguments": {}}
         actual = {"tool_name": "tool2", "arguments": {}}
@@ -189,7 +193,7 @@
         assert result is False
 
-    def test_missing_arguments(self):
+    def test_missing_arguments(self) -> None:
         """Test tool calls with missing arguments."""
         expected = {"tool_name": "test_tool", "arguments": {"key": "value"}}
         actual = {"tool_name": "test_tool"}  # Missing arguments
@@ -202,7 +206,7 @@
 class TestCompareToolArguments:
     """Test cases for _compare_tool_arguments function."""
 
-    def test_exact_arguments_match(self):
+    def test_exact_arguments_match(self) -> None:
         """Test exact argument matching."""
         expected = {"key1": "value1", "key2": "value2"}
         actual = {"key1": "value1", "key2": "value2"}
@@ -211,7 +215,7 @@
         assert result is True
 
-    def test_regex_pattern_match(self):
+    def test_regex_pattern_match(self) -> None:
         """Test regex pattern matching in arguments."""
         expected = {"name": "web-server-[0-9]+"}
         actual = {"name": "web-server-123"}
@@ -220,7 +224,7 @@
         assert result is True
 
-    def test_missing_argument_key(self):
+    def test_missing_argument_key(self) -> None:
         """Test missing argument key."""
         expected = {"key1": "value1", "key2": "value2"}
         actual = {"key1": "value1"}  # Missing key2
@@ -229,7 +233,7 @@
         assert result is False
 
-    def test_extra_argument_keys(self):
+    def test_extra_argument_keys(self) -> None:
         """Test extra argument keys."""
         expected = {"key1": "value1"}
         actual = {"key1": "value1", "key2": "value2"}  # Extra key2
@@ -238,7 +242,7 @@
         assert result is False
 
-    def test_invalid_regex_pattern(self):
+    def test_invalid_regex_pattern(self) -> None:
         """Test invalid regex pattern handling."""
         expected = {"name": "[invalid_regex"}  # Invalid regex
         actual = {"name": "test"}
@@ -247,12 +251,14 @@
         assert result is False
 
-    def test_non_dict_arguments(self):
+    def test_non_dict_arguments(self) -> None:
         """Test non-dictionary arguments."""
         expected = "not_a_dict"
         actual = {"key": "value"}
 
-        result = _compare_tool_arguments(expected, actual)
+        result = _compare_tool_arguments(
+            expected, actual  # pyright: ignore[reportArgumentType]
+        )
 
         assert result is False
 
@@ -260,7 +266,7 @@
 class TestOrderedParameter:
     """Test cases for the ordered parameter in tool evaluation."""
 
-    def test_ordered_true_default_matches_in_order(self):
+    def test_ordered_true_default_matches_in_order(self) -> None:
         """Test ordered=True (default) matches when order is correct, fails otherwise."""
         expected = [
             [
@@ -286,7 +292,7 @@
         success, _ = evaluate_tool_calls(expected, actual_wrong, ordered=True)
         assert success is False
 
-    def test_ordered_false_matches_any_order(self):
+    def test_ordered_false_matches_any_order(self) -> None:
         """Test ordered=False succeeds regardless of order."""
         expected = [
             [
@@ -303,7 +309,7 @@
         assert success is True
         assert "unordered" in details
 
-    def test_ordered_false_fails_when_content_differs(self):
+    def test_ordered_false_fails_when_content_differs(self) -> None:
         """Test ordered=False still fails when tool calls don't match."""
         expected = [
             [
@@ -319,7 +325,7 @@
         success, _ = evaluate_tool_calls(expected, actual, ordered=False)
         assert success is False
 
-    def test_unordered_handles_duplicates_correctly(self):
+    def test_unordered_handles_duplicates_correctly(self) -> None:
         """Test unordered matching handles duplicate sequences properly."""
         # Each expected item must match exactly one actual item
         expected = [
@@ -343,7 +349,7 @@
         assert evaluate_tool_calls(expected, actual_valid, ordered=False)[0] is True
         assert evaluate_tool_calls(expected, actual_invalid, ordered=False)[0] is False
 
-    def test_tools_within_sequence_always_ordered(self):
+    def test_tools_within_sequence_always_ordered(self) -> None:
         """Test that tools within a single sequence must always match in order.
 
         The `ordered` parameter only affects sequence order, not tool order within.
@@ -369,7 +375,7 @@ def test_tools_within_sequence_always_ordered(self): class TestMatchParameter: """Test cases for full_match parameter (full vs partial matching).""" - def test_full_match_default_requires_exact_count(self): + def test_full_match_default_requires_exact_count(self) -> None: """Test full_match=True (default) requires all expected to match all actual.""" expected = [ [ @@ -396,7 +402,7 @@ def test_full_match_default_requires_exact_count(self): success, _ = evaluate_tool_calls(expected, actual_extra, full_match=True) assert success is False - def test_partial_match_allows_extra_actual_tools(self): + def test_partial_match_allows_extra_actual_tools(self) -> None: """Test full_match=False allows extra actual tools.""" expected = [ [ @@ -413,7 +419,7 @@ def test_partial_match_allows_extra_actual_tools(self): assert success is True assert "partial" in details - def test_partial_match_succeeds_with_some_matches(self): + def test_partial_match_succeeds_with_some_matches(self) -> None: """Test full_match=False succeeds if any expected tool is found.""" expected = [ [ @@ -432,7 +438,7 @@ def test_partial_match_succeeds_with_some_matches(self): assert "1/2 matched" in details assert "1 unmatched" in details - def test_partial_match_fails_when_no_matches(self): + def test_partial_match_fails_when_no_matches(self) -> None: """Test full_match=False fails when no expected tools are found.""" expected = [ [ @@ -448,7 +454,7 @@ def test_partial_match_fails_when_no_matches(self): success, _ = evaluate_tool_calls(expected, actual, full_match=False) assert success is False - def test_partial_match_ordered_reports_statistics(self): + def test_partial_match_ordered_reports_statistics(self) -> None: """Test full_match=False with ordered=True reports match statistics.""" expected = [ [ @@ -470,7 +476,7 @@ def test_partial_match_ordered_reports_statistics(self): assert "2/2 matched" in details assert "0 unmatched" in details - def test_partial_match_ordered_finds_all_items(self): + def test_partial_match_ordered_finds_all_items(self) -> None: """Test full_match=False ordered finds all items using greedy matching.""" expected = [ [ @@ -493,7 +499,7 @@ def test_partial_match_ordered_finds_all_items(self): assert success is True assert "2/2 matched" in details - def test_partial_match_unordered_ignores_order(self): + def test_partial_match_unordered_ignores_order(self) -> None: """Test full_match=False with ordered=False ignores order.""" expected = [ [ @@ -516,7 +522,7 @@ def test_partial_match_unordered_ignores_order(self): assert "unordered" in details assert "2/2 matched" in details - def test_partial_match_all_matched_reports_correctly(self): + def test_partial_match_all_matched_reports_correctly(self) -> None: """Test full_match=False reports all matched correctly.""" expected = [ [ diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py index 0018a180..79ac617f 100644 --- a/tests/unit/core/metrics/test_geval.py +++ b/tests/unit/core/metrics/test_geval.py @@ -9,11 +9,11 @@ from lightspeed_evaluation.core.metrics.manager import MetricLevel -class TestGEvalHandler: +class TestGEvalHandler: # pylint: disable=too-many-public-methods """Test cases for GEvalHandler class.""" @pytest.fixture - def mock_llm_manager(self): + def mock_llm_manager(self) -> MagicMock: """Create a mock DeepEvalLLMManager.""" mock_manager = MagicMock() mock_llm = MagicMock() @@ -21,19 +21,23 @@ def mock_llm_manager(self): return mock_manager @pytest.fixture - def mock_metric_manager(self): 
+ def mock_metric_manager(self) -> MagicMock: """Create a mock MetricManager.""" return MagicMock() @pytest.fixture - def handler(self, mock_llm_manager, mock_metric_manager): + def handler( + self, mock_llm_manager: MagicMock, mock_metric_manager: MagicMock + ) -> GEvalHandler: """Create a GEvalHandler instance with mocked dependencies.""" return GEvalHandler( deepeval_llm_manager=mock_llm_manager, metric_manager=mock_metric_manager, ) - def test_initialization(self, mock_llm_manager, mock_metric_manager): + def test_initialization( + self, mock_llm_manager: MagicMock, mock_metric_manager: MagicMock + ) -> None: """Test GEvalHandler initialization with required dependencies.""" handler = GEvalHandler( deepeval_llm_manager=mock_llm_manager, @@ -43,10 +47,12 @@ def test_initialization(self, mock_llm_manager, mock_metric_manager): assert handler.deepeval_llm_manager == mock_llm_manager assert handler.metric_manager == mock_metric_manager - def test_convert_evaluation_params_field_names(self, handler): + def test_convert_evaluation_params_field_names(self, handler: GEvalHandler) -> None: """Test conversion of evaluation data field names to LLMTestCaseParams enum.""" params = ["query", "response", "expected_response"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) assert result is not None assert len(result) == 3 @@ -54,10 +60,14 @@ def test_convert_evaluation_params_field_names(self, handler): assert LLMTestCaseParams.ACTUAL_OUTPUT in result assert LLMTestCaseParams.EXPECTED_OUTPUT in result - def test_convert_evaluation_params_with_contexts(self, handler): + def test_convert_evaluation_params_with_contexts( + self, handler: GEvalHandler + ) -> None: """Test conversion including contexts and retrieval_context fields.""" params = ["query", "response", "contexts", "retrieval_context"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) assert result is not None assert len(result) == 4 @@ -66,10 +76,14 @@ def test_convert_evaluation_params_with_contexts(self, handler): assert LLMTestCaseParams.CONTEXT in result assert LLMTestCaseParams.RETRIEVAL_CONTEXT in result - def test_convert_evaluation_params_enum_values_backward_compat(self, handler): + def test_convert_evaluation_params_enum_values_backward_compat( + self, handler: GEvalHandler + ) -> None: """Test conversion with direct enum value strings (backward compatibility).""" params = ["INPUT", "ACTUAL_OUTPUT", "EXPECTED_OUTPUT"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) assert result is not None assert len(result) == 3 @@ -77,28 +91,41 @@ def test_convert_evaluation_params_enum_values_backward_compat(self, handler): assert LLMTestCaseParams.ACTUAL_OUTPUT in result assert LLMTestCaseParams.EXPECTED_OUTPUT in result - def test_convert_evaluation_params_invalid_returns_none(self, handler): + def test_convert_evaluation_params_invalid_returns_none( + self, handler: GEvalHandler + ) -> None: """Test that invalid params return None to allow GEval auto-detection.""" params = ["invalid_param", "another_invalid"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) assert result is None - def test_convert_evaluation_params_empty_returns_none(self, handler): 
+ def test_convert_evaluation_params_empty_returns_none( + self, handler: GEvalHandler + ) -> None: """Test that empty params list returns None.""" - result = handler._convert_evaluation_params([]) - + result = handler._convert_evaluation_params( # pylint: disable=protected-access + [] + ) assert result is None - def test_convert_evaluation_params_mixed_invalid_returns_none(self, handler): + def test_convert_evaluation_params_mixed_invalid_returns_none( + self, handler: GEvalHandler + ) -> None: """Test that any invalid param causes None return.""" params = ["query", "invalid_param", "response"] - result = handler._convert_evaluation_params(params) + result = handler._convert_evaluation_params( # pylint: disable=protected-access + params + ) # Should return None because of the invalid param assert result is None - def test_get_geval_config_uses_metric_manager(self, handler, mock_metric_manager): + def test_get_geval_config_uses_metric_manager( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that _get_geval_config delegates to MetricManager.""" expected_config = { "criteria": "Test criteria", @@ -108,7 +135,7 @@ def test_get_geval_config_uses_metric_manager(self, handler, mock_metric_manager mock_metric_manager.get_metric_metadata.return_value = expected_config conv_data = MagicMock() - config = handler._get_geval_config( + config = handler._get_geval_config( # pylint: disable=protected-access metric_name="test_metric", conv_data=conv_data, turn_data=None, @@ -123,7 +150,9 @@ def test_get_geval_config_uses_metric_manager(self, handler, mock_metric_manager turn_data=None, ) - def test_get_geval_config_turn_level(self, handler, mock_metric_manager): + def test_get_geval_config_turn_level( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test retrieving turn-level config uses correct MetricLevel.""" expected_config = {"criteria": "Turn criteria", "threshold": 0.9} mock_metric_manager.get_metric_metadata.return_value = expected_config @@ -131,7 +160,7 @@ def test_get_geval_config_turn_level(self, handler, mock_metric_manager): conv_data = MagicMock() turn_data = MagicMock() - config = handler._get_geval_config( + config = handler._get_geval_config( # pylint: disable=protected-access metric_name="turn_metric", conv_data=conv_data, turn_data=turn_data, @@ -147,13 +176,13 @@ def test_get_geval_config_turn_level(self, handler, mock_metric_manager): ) def test_get_geval_config_returns_none_when_not_found( - self, handler, mock_metric_manager - ): + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that None is returned when MetricManager finds no config.""" mock_metric_manager.get_metric_metadata.return_value = None conv_data = MagicMock() - config = handler._get_geval_config( + config = handler._get_geval_config( # pylint: disable=protected-access metric_name="nonexistent_metric", conv_data=conv_data, turn_data=None, @@ -162,7 +191,9 @@ def test_get_geval_config_returns_none_when_not_found( assert config is None - def test_evaluate_missing_config(self, handler, mock_metric_manager): + def test_evaluate_missing_config( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that evaluate returns error when config is not found.""" mock_metric_manager.get_metric_metadata.return_value = None @@ -178,7 +209,9 @@ def test_evaluate_missing_config(self, handler, mock_metric_manager): assert score is None assert "configuration not found" in reason.lower() - def 
test_evaluate_missing_criteria(self, handler, mock_metric_manager): + def test_evaluate_missing_criteria( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that evaluate requires 'criteria' in config.""" mock_metric_manager.get_metric_metadata.return_value = { "threshold": 0.8, @@ -198,7 +231,9 @@ def test_evaluate_missing_criteria(self, handler, mock_metric_manager): assert score is None assert "criteria" in reason.lower() - def test_evaluate_turn_missing_turn_data(self, handler, mock_metric_manager): + def test_evaluate_turn_missing_turn_data( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that turn-level evaluation requires turn_data.""" mock_metric_manager.get_metric_metadata.return_value = { "criteria": "Test criteria" @@ -216,7 +251,9 @@ def test_evaluate_turn_missing_turn_data(self, handler, mock_metric_manager): assert score is None assert "turn data required" in reason.lower() - def test_evaluate_turn_success(self, handler, mock_metric_manager): + def test_evaluate_turn_success( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test successful turn-level evaluation.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -256,7 +293,9 @@ def test_evaluate_turn_success(self, handler, mock_metric_manager): assert reason == "Test passed" mock_metric.measure.assert_called_once() - def test_evaluate_turn_with_optional_fields(self, handler, mock_metric_manager): + def test_evaluate_turn_with_optional_fields( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test turn-level evaluation includes optional fields when present.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -303,7 +342,9 @@ def test_evaluate_turn_with_optional_fields(self, handler, mock_metric_manager): assert call_kwargs["expected_output"] == "Expected response" assert call_kwargs["context"] == ["Context 1", "Context 2"] - def test_evaluate_turn_none_score_returns_zero(self, handler, mock_metric_manager): + def test_evaluate_turn_none_score_returns_zero( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that None score from metric is converted to 0.0.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -338,7 +379,9 @@ def test_evaluate_turn_none_score_returns_zero(self, handler, mock_metric_manage assert score == 0.0 assert reason == "Could not evaluate" - def test_evaluate_turn_handles_exceptions(self, handler, mock_metric_manager): + def test_evaluate_turn_handles_exceptions( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that turn evaluation handles exceptions gracefully.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -373,8 +416,8 @@ def test_evaluate_turn_handles_exceptions(self, handler, mock_metric_manager): assert "Test error" in reason def test_evaluate_turn_uses_default_params_when_none_provided( - self, handler, mock_metric_manager - ): + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that default evaluation_params are used when none provided.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -413,7 +456,9 @@ def test_evaluate_turn_uses_default_params_when_none_provided( LLMTestCaseParams.ACTUAL_OUTPUT, ] - def test_evaluate_conversation_success(self, handler, mock_metric_manager): + def test_evaluate_conversation_success( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test 
successful conversation-level evaluation.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -453,7 +498,9 @@ def test_evaluate_conversation_success(self, handler, mock_metric_manager): assert reason == "Conversation coherent" mock_metric.measure.assert_called_once() - def test_evaluate_conversation_aggregates_turns(self, handler, mock_metric_manager): + def test_evaluate_conversation_aggregates_turns( + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that conversation evaluation properly aggregates turn data.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -512,8 +559,8 @@ def test_evaluate_conversation_aggregates_turns(self, handler, mock_metric_manag assert "Turn 3 - Assistant:" in call_kwargs["actual_output"] def test_evaluate_conversation_with_evaluation_steps( - self, handler, mock_metric_manager - ): + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that evaluation_steps are passed to GEval when provided.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" @@ -558,8 +605,8 @@ def test_evaluate_conversation_with_evaluation_steps( ] def test_evaluate_conversation_handles_exceptions( - self, handler, mock_metric_manager - ): + self, handler: GEvalHandler, mock_metric_manager: MagicMock + ) -> None: """Test that conversation evaluation handles exceptions gracefully.""" with patch( "lightspeed_evaluation.core.metrics.geval.GEval" diff --git a/tests/unit/core/metrics/test_keywords_eval.py b/tests/unit/core/metrics/test_keywords_eval.py index df140918..19cc0eea 100644 --- a/tests/unit/core/metrics/test_keywords_eval.py +++ b/tests/unit/core/metrics/test_keywords_eval.py @@ -7,7 +7,7 @@ class TestKeywordsEval: """Test cases for keywords eval metric.""" - def test_keywords_eval_first_list_all_matched(self): + def test_keywords_eval_first_list_all_matched(self) -> None: """Test successful keywords evaluation when first list has all keywords matched.""" turn_data = TurnData( turn_id="test_turn", @@ -25,7 +25,7 @@ def test_keywords_eval_first_list_all_matched(self): assert "Keywords eval successful: Option 1" in reason assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason - def test_keywords_eval_first_list_fails_second_succeeds(self): + def test_keywords_eval_first_list_fails_second_succeeds(self) -> None: """Test keywords evaluation when first list fails but second list succeeds.""" turn_data = TurnData( turn_id="test_turn", @@ -46,7 +46,7 @@ def test_keywords_eval_first_list_fails_second_succeeds(self): assert "Keywords eval successful: Option 2" in reason assert "all keywords matched: 'monitoring', 'confirmed'" in reason - def test_keywords_eval_all_lists_fail(self): + def test_keywords_eval_all_lists_fail(self) -> None: """Test keywords evaluation when all lists fail.""" turn_data = TurnData( turn_id="test_turn", @@ -70,7 +70,7 @@ def test_keywords_eval_all_lists_fail(self): "Option 2: unmatched ['confirmed', 'monitoring'], matched [none]" in reason ) - def test_keywords_eval_partial_match_in_failed_list(self): + def test_keywords_eval_partial_match_in_failed_list(self) -> None: """Test keywords evaluation with partial matches in failed lists.""" turn_data = TurnData( turn_id="test_turn", @@ -92,7 +92,7 @@ def test_keywords_eval_partial_match_in_failed_list(self): assert "Option 1: unmatched ['yes', 'confirmed'], matched [none]" in reason assert "Option 2: unmatched ['openshift'], matched ['monitoring']" in reason - def test_keywords_eval_case_insensitive(self): 
+ def test_keywords_eval_case_insensitive(self) -> None: """Test that keywords evaluation is case insensitive.""" turn_data = TurnData( turn_id="test_turn", @@ -109,7 +109,7 @@ def test_keywords_eval_case_insensitive(self): assert "Keywords eval successful: Option 1" in reason assert "all keywords matched: 'yes', 'openshift-monitoring'" in reason - def test_keywords_eval_substring_matching(self): + def test_keywords_eval_substring_matching(self) -> None: """Test that keywords evaluation works with substring matching.""" turn_data = TurnData( turn_id="test_turn", @@ -129,7 +129,7 @@ def test_keywords_eval_substring_matching(self): assert "Keywords eval successful: Option 1" in reason assert "all keywords matched: 'monitoring', 'success'" in reason - def test_keywords_eval_no_expected_keywords(self): + def test_keywords_eval_no_expected_keywords(self) -> None: """Test keywords evaluation when no expected keywords provided.""" turn_data = TurnData( turn_id="test_turn", @@ -143,7 +143,7 @@ def test_keywords_eval_no_expected_keywords(self): assert score is None assert "No expected keywords provided" in reason - def test_keywords_eval_no_response(self): + def test_keywords_eval_no_response(self) -> None: """Test keywords evaluation when no response provided.""" turn_data = TurnData( turn_id="test_turn", @@ -157,7 +157,7 @@ def test_keywords_eval_no_response(self): assert score == 0.0 assert "No response provided" in reason - def test_keywords_eval_empty_response(self): + def test_keywords_eval_empty_response(self) -> None: """Test keywords evaluation with empty response.""" # Create turn data with valid response first, then modify it turn_data = TurnData( @@ -174,14 +174,14 @@ def test_keywords_eval_empty_response(self): assert score == 0.0 assert "No response provided" in reason - def test_keywords_eval_conversation_level_error(self): + def test_keywords_eval_conversation_level_error(self) -> None: """Test that keywords_eval returns error for conversation-level evaluation.""" score, reason = evaluate_keywords(None, None, None, True) assert score is None assert "Keywords eval is a turn-level metric" in reason - def test_keywords_eval_no_turn_data(self): + def test_keywords_eval_no_turn_data(self) -> None: """Test keywords evaluation when no turn data provided.""" score, reason = evaluate_keywords(None, 0, None, False) diff --git a/tests/unit/core/metrics/test_manager.py b/tests/unit/core/metrics/test_manager.py index b6fc312d..756f2c8e 100644 --- a/tests/unit/core/metrics/test_manager.py +++ b/tests/unit/core/metrics/test_manager.py @@ -1,7 +1,5 @@ """Unit tests for core metrics manager module.""" -import pytest - from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager from lightspeed_evaluation.core.models import ( EvaluationData, @@ -10,50 +8,12 @@ ) -@pytest.fixture -def system_config(): - """Create a test system config with metrics metadata.""" - config = SystemConfig() - - # Set up test metrics metadata - config.default_turn_metrics_metadata = { - "ragas:faithfulness": { - "threshold": 0.7, - "default": True, - "description": "Test", - }, - "ragas:response_relevancy": { - "threshold": 0.8, - "default": False, - "description": "Test", - }, - "custom:answer_correctness": { - "threshold": 0.75, - "default": True, - "description": "Test", - }, - } - - config.default_conversation_metrics_metadata = { - "deepeval:conversation_completeness": { - "threshold": 0.6, - "default": True, - "description": "Test", - }, - "deepeval:conversation_relevancy": { - "threshold": 0.7, - 
"default": False, - "description": "Test", - }, - } - - return config - - -class TestMetricManager: +class TestMetricManager: # pylint: disable=too-many-public-methods """Unit tests for MetricManager.""" - def test_resolve_metrics_with_none_uses_defaults(self, system_config): + def test_resolve_metrics_with_none_uses_defaults( + self, system_config: SystemConfig + ) -> None: """Test that None resolves to system defaults.""" manager = MetricManager(system_config) @@ -64,16 +24,20 @@ def test_resolve_metrics_with_none_uses_defaults(self, system_config): assert "custom:answer_correctness" in metrics assert "ragas:response_relevancy" not in metrics # default=False - def test_resolve_metrics_with_empty_list_skips_evaluation(self, system_config): + def test_resolve_metrics_with_empty_list_skips_evaluation( + self, system_config: SystemConfig + ) -> None: """Test that empty list skips evaluation.""" manager = MetricManager(system_config) metrics = manager.resolve_metrics([], MetricLevel.TURN) # Should return empty list - assert metrics == [] + assert not metrics - def test_resolve_metrics_with_explicit_list(self, system_config): + def test_resolve_metrics_with_explicit_list( + self, system_config: SystemConfig + ) -> None: """Test that explicit list is returned as-is.""" manager = MetricManager(system_config) @@ -83,7 +47,9 @@ def test_resolve_metrics_with_explicit_list(self, system_config): # Should return the exact list provided assert metrics == explicit_metrics - def test_resolve_metrics_conversation_level_defaults(self, system_config): + def test_resolve_metrics_conversation_level_defaults( + self, system_config: SystemConfig + ) -> None: """Test conversation-level default metrics.""" manager = MetricManager(system_config) @@ -93,7 +59,9 @@ def test_resolve_metrics_conversation_level_defaults(self, system_config): assert "deepeval:conversation_completeness" in metrics assert "deepeval:conversation_relevancy" not in metrics - def test_get_metric_metadata_from_system_defaults(self, system_config): + def test_get_metric_metadata_from_system_defaults( + self, system_config: SystemConfig + ) -> None: """Test getting full metadata from system defaults.""" manager = MetricManager(system_config) @@ -106,7 +74,9 @@ def test_get_metric_metadata_from_system_defaults(self, system_config): assert metadata["default"] is True assert metadata["description"] == "Test" - def test_get_metric_metadata_turn_level_override(self, system_config): + def test_get_metric_metadata_turn_level_override( + self, system_config: SystemConfig + ) -> None: """Test turn-level metadata completely overrides system defaults.""" manager = MetricManager(system_config) @@ -133,7 +103,9 @@ def test_get_metric_metadata_turn_level_override(self, system_config): assert "default" not in metadata assert "description" not in metadata - def test_get_metric_metadata_conversation_level_override(self, system_config): + def test_get_metric_metadata_conversation_level_override( + self, system_config: SystemConfig + ) -> None: """Test conversation-level metadata overrides system defaults.""" manager = MetricManager(system_config) @@ -159,7 +131,7 @@ def test_get_metric_metadata_conversation_level_override(self, system_config): assert metadata["threshold"] == 0.85 assert metadata["criteria"] == "Custom criteria" - def test_get_metric_metadata_not_found(self, system_config): + def test_get_metric_metadata_not_found(self, system_config: SystemConfig) -> None: """Test getting metadata for unknown metric returns None.""" manager = 
MetricManager(system_config) @@ -167,7 +139,7 @@ def test_get_metric_metadata_not_found(self, system_config): assert metadata is None - def test_get_metric_metadata_preserves_all_fields(self, system_config): + def test_get_metric_metadata_preserves_all_fields(self) -> None: """Test that all metadata fields are preserved.""" config = SystemConfig() config.default_turn_metrics_metadata = { @@ -198,7 +170,9 @@ def test_get_metric_metadata_preserves_all_fields(self, system_config): assert metadata["default"] is True assert metadata["description"] == "GEval metric for technical accuracy" - def test_get_effective_threshold_from_system_defaults(self, system_config): + def test_get_effective_threshold_from_system_defaults( + self, system_config: SystemConfig + ) -> None: """Test getting threshold from system defaults.""" manager = MetricManager(system_config) @@ -208,7 +182,9 @@ def test_get_effective_threshold_from_system_defaults(self, system_config): assert threshold == 0.7 - def test_get_effective_threshold_turn_level_override(self, system_config): + def test_get_effective_threshold_turn_level_override( + self, system_config: SystemConfig + ) -> None: """Test turn-level metadata overrides system defaults.""" manager = MetricManager(system_config) @@ -226,7 +202,9 @@ def test_get_effective_threshold_turn_level_override(self, system_config): # Should use turn-specific threshold assert threshold == 0.9 - def test_get_effective_threshold_conversation_level_override(self, system_config): + def test_get_effective_threshold_conversation_level_override( + self, system_config: SystemConfig + ) -> None: """Test conversation-level metadata overrides system defaults.""" manager = MetricManager(system_config) @@ -248,7 +226,9 @@ def test_get_effective_threshold_conversation_level_override(self, system_config # Should use conversation-specific threshold assert threshold == 0.85 - def test_get_effective_threshold_not_found(self, system_config): + def test_get_effective_threshold_not_found( + self, system_config: SystemConfig + ) -> None: """Test getting threshold for unknown metric returns None.""" manager = MetricManager(system_config) @@ -256,7 +236,9 @@ def test_get_effective_threshold_not_found(self, system_config): assert threshold is None - def test_get_effective_threshold_no_metadata_at_level(self, system_config): + def test_get_effective_threshold_no_metadata_at_level( + self, system_config: SystemConfig + ) -> None: """Test threshold lookup when no metadata at level.""" manager = MetricManager(system_config) @@ -274,7 +256,9 @@ def test_get_effective_threshold_no_metadata_at_level(self, system_config): # Should fall back to system defaults assert threshold == 0.7 - def test_get_effective_threshold_metric_not_in_level_metadata(self, system_config): + def test_get_effective_threshold_metric_not_in_level_metadata( + self, system_config: SystemConfig + ) -> None: """Test threshold for metric not in level metadata.""" manager = MetricManager(system_config) @@ -293,7 +277,9 @@ def test_get_effective_threshold_metric_not_in_level_metadata(self, system_confi # Should fall back to system defaults assert threshold == 0.7 - def test_count_metrics_for_conversation_all_defaults(self, system_config): + def test_count_metrics_for_conversation_all_defaults( + self, system_config: SystemConfig + ) -> None: """Test counting metrics when using all defaults.""" manager = MetricManager(system_config) @@ -313,7 +299,9 @@ def test_count_metrics_for_conversation_all_defaults(self, system_config): assert 
counts["conversation_metrics"] == 1 assert counts["total_turns"] == 2 - def test_count_metrics_for_conversation_explicit_metrics(self, system_config): + def test_count_metrics_for_conversation_explicit_metrics( + self, system_config: SystemConfig + ) -> None: """Test counting with explicit metrics.""" manager = MetricManager(system_config) @@ -340,7 +328,9 @@ def test_count_metrics_for_conversation_explicit_metrics(self, system_config): assert counts["conversation_metrics"] == 1 assert counts["total_turns"] == 2 - def test_count_metrics_for_conversation_skip_evaluation(self, system_config): + def test_count_metrics_for_conversation_skip_evaluation( + self, system_config: SystemConfig + ) -> None: """Test counting when evaluation is skipped.""" manager = MetricManager(system_config) @@ -359,7 +349,9 @@ def test_count_metrics_for_conversation_skip_evaluation(self, system_config): assert counts["conversation_metrics"] == 0 assert counts["total_turns"] == 1 - def test_count_metrics_for_conversation_mixed(self, system_config): + def test_count_metrics_for_conversation_mixed( + self, system_config: SystemConfig + ) -> None: """Test counting with mixed default and explicit metrics.""" manager = MetricManager(system_config) @@ -383,7 +375,7 @@ def test_count_metrics_for_conversation_mixed(self, system_config): assert counts["conversation_metrics"] == 1 assert counts["total_turns"] == 3 - def test_extract_default_metrics_empty_metadata(self): + def test_extract_default_metrics_empty_metadata(self) -> None: """Test extracting defaults when no metrics have default=true.""" config = SystemConfig() config.default_turn_metrics_metadata = { @@ -397,7 +389,9 @@ def test_extract_default_metrics_empty_metadata(self): # Should return empty list when no defaults assert metrics == [] - def test_get_effective_threshold_with_both_metadata_sources(self, system_config): + def test_get_effective_threshold_with_both_metadata_sources( + self, system_config: SystemConfig + ) -> None: """Test that level metadata takes priority over system defaults.""" manager = MetricManager(system_config) diff --git a/tests/unit/core/metrics/test_nlp.py b/tests/unit/core/metrics/test_nlp.py index ad3225dd..453cb27c 100644 --- a/tests/unit/core/metrics/test_nlp.py +++ b/tests/unit/core/metrics/test_nlp.py @@ -15,6 +15,7 @@ import sys import pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.core.constants import ( ROUGE_TYPE_ROUGE1, @@ -25,109 +26,10 @@ from lightspeed_evaluation.core.system.exceptions import MetricError -# ============================================================================ -# Fixtures -# ============================================================================ - - -@pytest.fixture -def nlp_metrics(): - """Create NLPMetrics instance.""" - return NLPMetrics() - - -@pytest.fixture -def sample_turn_data(): - """Create sample TurnData for testing.""" - return TurnData( - turn_id="test_turn", - query="What is the capital of France?", - response="The capital of France is Paris.", - expected_response="The capital of France is Paris.", - ) - - -@pytest.fixture -def sample_scope(sample_turn_data): - """Create sample EvaluationScope for turn-level evaluation.""" - return EvaluationScope( - turn_idx=0, - turn_data=sample_turn_data, - is_conversation=False, - ) - - -@pytest.fixture -def conversation_scope(sample_turn_data): - """Create sample EvaluationScope for conversation-level evaluation.""" - return EvaluationScope( - turn_idx=0, - turn_data=sample_turn_data, - is_conversation=True, - ) - - 
-@pytest.fixture -def mock_bleu_scorer(mocker): - """Mock sacrebleu BLEU with configurable return value. - - Uses sys.modules injection to mock sacrebleu without requiring it to be installed. - """ - mock_result = mocker.MagicMock() - mock_result.score = 85.0 # sacrebleu returns 0-100 scale - - mock_scorer_instance = mocker.MagicMock() - mock_scorer_instance.corpus_score = mocker.MagicMock(return_value=mock_result) - - mock_bleu_class = mocker.MagicMock(return_value=mock_scorer_instance) - - # Create a fake sacrebleu module and inject it into sys.modules - mock_sacrebleu = mocker.MagicMock() - mock_sacrebleu.BLEU = mock_bleu_class - mocker.patch.dict(sys.modules, {"sacrebleu": mock_sacrebleu}) - - return mock_scorer_instance - - -@pytest.fixture -def mock_rouge_scorer(mocker): - """Mock RougeScore with configurable return value. - - Returns different scores for precision, recall, fmeasure. - """ - mock_scorer_instance = mocker.MagicMock() - # Return scores for precision, recall, fmeasure (called in that order) - mock_scorer_instance.single_turn_score = mocker.MagicMock( - side_effect=[0.95, 0.89, 0.92] - ) - mocker.patch( - "lightspeed_evaluation.core.metrics.nlp.RougeScore", - return_value=mock_scorer_instance, - ) - return mock_scorer_instance - - -@pytest.fixture -def mock_similarity_scorer(mocker): - """Mock NonLLMStringSimilarity with configurable return value.""" - mock_scorer_instance = mocker.MagicMock() - mock_scorer_instance.single_turn_score = mocker.MagicMock(return_value=0.78) - mocker.patch( - "lightspeed_evaluation.core.metrics.nlp.NonLLMStringSimilarity", - return_value=mock_scorer_instance, - ) - return mock_scorer_instance - - -# ============================================================================ -# Tests -# ============================================================================ - - -class TestNLPMetricsInit: +class TestNLPMetricsInit: # pylint: disable=too-few-public-methods """Test NLPMetrics initialization.""" - def test_initialization(self, nlp_metrics): + def test_initialization(self, nlp_metrics: NLPMetrics) -> None: """Test that NLPMetrics initializes correctly.""" assert nlp_metrics is not None assert "bleu" in nlp_metrics.supported_metrics @@ -138,14 +40,18 @@ def test_initialization(self, nlp_metrics): class TestNLPMetricsValidation: """Tests for metric-level validation.""" - def test_conversation_level_rejected(self, nlp_metrics, conversation_scope): + def test_conversation_level_rejected( + self, nlp_metrics: NLPMetrics, conversation_scope: EvaluationScope + ) -> None: """Test that NLP metrics reject conversation-level evaluation.""" score, reason = nlp_metrics.evaluate("bleu", None, conversation_scope) assert score is None assert "turn-level metric" in reason - def test_unsupported_metric(self, nlp_metrics, sample_scope): + def test_unsupported_metric( + self, nlp_metrics: NLPMetrics, sample_scope: EvaluationScope + ) -> None: """Test evaluate with unsupported metric name.""" score, reason = nlp_metrics.evaluate("unsupported_metric", None, sample_scope) @@ -157,16 +63,22 @@ class TestBLEUScore: """Tests for BLEU score metric.""" def test_bleu_successful_evaluation( - self, nlp_metrics, sample_scope, mock_bleu_scorer - ): + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mock_bleu_scorer: MockerFixture, + ) -> None: """Test BLEU score with valid inputs.""" + assert mock_bleu_scorer is not None # Fixture sets up the mock score, reason = nlp_metrics.evaluate("bleu", None, sample_scope) assert score is not None assert score == 
pytest.approx(0.85, abs=0.01) assert "NLP BLEU" in reason - def test_bleu_with_custom_ngram(self, nlp_metrics, mocker): + def test_bleu_with_custom_ngram( + self, nlp_metrics: NLPMetrics, mocker: MockerFixture + ) -> None: """Test BLEU score with custom max_ngram configuration.""" mock_result = mocker.MagicMock() mock_result.score = 90.0 @@ -200,7 +112,9 @@ def test_bleu_with_custom_ngram(self, nlp_metrics, mocker): # Verify BLEU was initialized with max_ngram_order=2 mock_bleu_class.assert_called_once_with(max_ngram_order=2) - def test_bleu_with_invalid_ngram_uses_default(self, nlp_metrics, mocker): + def test_bleu_with_invalid_ngram_uses_default( + self, nlp_metrics: NLPMetrics, mocker: MockerFixture + ) -> None: """Test BLEU score falls back to default when invalid max_ngram provided.""" mock_result = mocker.MagicMock() mock_result.score = 85.0 @@ -237,9 +151,13 @@ class TestROUGEScore: """Tests for ROUGE score metric.""" def test_rouge_successful_evaluation( - self, nlp_metrics, sample_scope, mock_rouge_scorer - ): + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mock_rouge_scorer: MockerFixture, + ) -> None: """Test ROUGE score with valid inputs.""" + assert mock_rouge_scorer is not None # Fixture sets up the mock score, reason = nlp_metrics.evaluate("rouge", None, sample_scope) assert score is not None @@ -250,7 +168,9 @@ def test_rouge_successful_evaluation( assert "recall" in reason assert "fmeasure" in reason - def test_rouge_with_custom_rouge_type(self, nlp_metrics, mocker): + def test_rouge_with_custom_rouge_type( + self, nlp_metrics: NLPMetrics, mocker: MockerFixture + ) -> None: """Test ROUGE score with custom rouge_type via turn_metrics_metadata.""" mock_scorer_instance = mocker.MagicMock() # Return different scores for each mode (precision, recall, fmeasure) @@ -290,9 +210,13 @@ class TestSemanticSimilarityDistance: """Tests for string distance similarity (NonLLMStringSimilarity) metric.""" def test_semantic_similarity_distance_successful_evaluation( - self, nlp_metrics, sample_scope, mock_similarity_scorer - ): + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mock_similarity_scorer: MockerFixture, + ) -> None: """Test string distance similarity with valid inputs.""" + assert mock_similarity_scorer is not None # Fixture sets up the mock score, reason = nlp_metrics.evaluate( "semantic_similarity_distance", None, sample_scope ) @@ -302,8 +226,8 @@ def test_semantic_similarity_distance_successful_evaluation( assert "NLP String Distance" in reason def test_semantic_similarity_distance_with_custom_measure( - self, nlp_metrics, mocker - ): + self, nlp_metrics: NLPMetrics, mocker: MockerFixture + ) -> None: """Test string distance similarity with custom distance measure config.""" mock_scorer_instance = mocker.MagicMock() mock_scorer_instance.single_turn_score = mocker.MagicMock(return_value=0.95) @@ -336,7 +260,12 @@ def test_semantic_similarity_distance_with_custom_measure( class TestMetricErrorHandling: """Tests for error handling across all NLP metrics.""" - def test_bleu_failure_raises_metric_error(self, nlp_metrics, sample_scope, mocker): + def test_bleu_failure_raises_metric_error( + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mocker: MockerFixture, + ) -> None: """Test that BLEU raises MetricError when scoring fails.""" mock_scorer_instance = mocker.MagicMock() mock_scorer_instance.corpus_score = mocker.MagicMock( @@ -363,9 +292,14 @@ def test_bleu_failure_raises_metric_error(self, nlp_metrics, sample_scope, 
mocke ), ], ) - def test_ragas_metric_failure_raises_metric_error( - self, nlp_metrics, sample_scope, mocker, metric_name, scorer_path - ): + def test_ragas_metric_failure_raises_metric_error( # pylint: disable=too-many-arguments,too-many-positional-arguments + self, + nlp_metrics: NLPMetrics, + sample_scope: EvaluationScope, + mocker: MockerFixture, + metric_name: str, + scorer_path: str, + ) -> None: """Test that Ragas-based metrics raise MetricError when scoring fails.""" mock_scorer_instance = mocker.MagicMock() mock_scorer_instance.single_turn_score = mocker.MagicMock( diff --git a/tests/unit/core/models/test_api_additional.py b/tests/unit/core/models/test_api_additional.py index 8456cd46..77fb68c0 100644 --- a/tests/unit/core/models/test_api_additional.py +++ b/tests/unit/core/models/test_api_additional.py @@ -14,7 +14,7 @@ class TestRAGChunk: """Tests for RAGChunk model.""" - def test_rag_chunk_creation(self): + def test_rag_chunk_creation(self) -> None: """Test creating RAG chunk.""" chunk = RAGChunk(content="test content", source="test source", score=0.95) @@ -22,22 +22,26 @@ def test_rag_chunk_creation(self): assert chunk.source == "test source" assert chunk.score == 0.95 - def test_rag_chunk_without_score(self): + def test_rag_chunk_without_score(self) -> None: """Test RAG chunk without score.""" chunk = RAGChunk(content="content", source="source") assert chunk.score is None - def test_rag_chunk_extra_field_forbidden(self): + def test_rag_chunk_extra_field_forbidden(self) -> None: """Test that extra fields are forbidden.""" with pytest.raises(ValidationError): - RAGChunk(content="content", source="source", extra_field="not allowed") + RAGChunk( + content="content", + source="source", + extra_field="not allowed", # pyright: ignore[reportCallIssue] + ) class TestAttachmentData: """Tests for AttachmentData model.""" - def test_attachment_creation(self): + def test_attachment_creation(self) -> None: """Test creating attachment.""" attachment = AttachmentData(content="file content") @@ -45,7 +49,7 @@ def test_attachment_creation(self): assert attachment.attachment_type == "configuration" assert attachment.content_type == "text/plain" - def test_attachment_custom_type(self): + def test_attachment_custom_type(self) -> None: """Test attachment with custom types.""" attachment = AttachmentData( content="yaml data", @@ -60,7 +64,7 @@ def test_attachment_custom_type(self): class TestAPIRequest: """Tests for APIRequest model.""" - def test_create_simple_request(self): + def test_create_simple_request(self) -> None: """Test creating simple API request.""" request = APIRequest.create(query="What is Python?") @@ -68,7 +72,7 @@ def test_create_simple_request(self): assert request.provider is None assert request.model is None - def test_create_request_with_all_params(self): + def test_create_request_with_all_params(self) -> None: """Test creating request with all parameters.""" request = APIRequest.create( query="Test query", @@ -86,7 +90,7 @@ def test_create_request_with_all_params(self): assert request.conversation_id == "conv123" assert request.system_prompt == "Custom prompt" - def test_create_request_with_attachments(self): + def test_create_request_with_attachments(self) -> None: """Test creating request with attachments.""" # APIRequest.create expects string attachments, not AttachmentData objects attachments = ["file1", "file2"] @@ -98,9 +102,12 @@ def test_create_request_with_attachments(self): assert request.attachments is not None assert len(request.attachments) == 2 - assert 
request.attachments[0].content == "file1" + assert ( + request.attachments[0].content # pylint: disable=unsubscriptable-object + == "file1" + ) - def test_request_empty_query_validation(self): + def test_request_empty_query_validation(self) -> None: """Test that empty query fails validation.""" with pytest.raises(ValidationError): APIRequest(query="") @@ -109,7 +116,7 @@ def test_request_empty_query_validation(self): class TestAPIResponse: """Tests for APIResponse model.""" - def test_response_creation(self): + def test_response_creation(self) -> None: """Test creating API response.""" response = APIResponse( response="Test response", @@ -121,16 +128,16 @@ def test_response_creation(self): assert response.conversation_id == "conv123" assert len(response.contexts) == 2 - def test_response_empty_contexts(self): + def test_response_empty_contexts(self) -> None: """Test response with empty contexts.""" response = APIResponse( response="Test", conversation_id="conv123", ) - assert response.contexts == [] + assert not response.contexts - def test_response_with_tool_calls(self): + def test_response_with_tool_calls(self) -> None: """Test response with tool calls.""" response = APIResponse( response="Test", @@ -140,7 +147,7 @@ def test_response_with_tool_calls(self): assert len(response.tool_calls) == 1 - def test_from_raw_response(self): + def test_from_raw_response(self) -> None: """Test creating response from raw API data.""" raw_data = { "response": "Test response", @@ -158,14 +165,14 @@ def test_from_raw_response(self): assert len(response.contexts) == 2 assert "chunk1" in response.contexts - def test_from_raw_response_without_conversation_id(self): + def test_from_raw_response_without_conversation_id(self) -> None: """Test that from_raw_response fails without conversation_id.""" raw_data = {"response": "Test"} with pytest.raises(ValueError, match="conversation_id is required"): APIResponse.from_raw_response(raw_data) - def test_response_with_streaming_performance_metrics(self): + def test_response_with_streaming_performance_metrics(self) -> None: """Test response with streaming performance metrics.""" response = APIResponse( response="Test", @@ -179,7 +186,7 @@ def test_response_with_streaming_performance_metrics(self): assert response.streaming_duration == 2.5 assert response.tokens_per_second == 85.3 - def test_response_without_streaming_metrics(self): + def test_response_without_streaming_metrics(self) -> None: """Test response defaults for streaming metrics (None for non-streaming).""" response = APIResponse( response="Test", @@ -190,7 +197,7 @@ def test_response_without_streaming_metrics(self): assert response.streaming_duration is None assert response.tokens_per_second is None - def test_from_raw_response_with_streaming_metrics(self): + def test_from_raw_response_with_streaming_metrics(self) -> None: """Test creating response from raw data with streaming metrics.""" raw_data = { "response": "Test response", @@ -210,7 +217,7 @@ def test_from_raw_response_with_streaming_metrics(self): assert response.streaming_duration == 3.456 assert response.tokens_per_second == 46.5 - def test_from_raw_response_without_streaming_metrics(self): + def test_from_raw_response_without_streaming_metrics(self) -> None: """Test creating response from raw data without streaming metrics (query endpoint).""" raw_data = { "response": "Test response", diff --git a/tests/unit/core/models/test_data.py b/tests/unit/core/models/test_data.py index 5ae0a876..3afc4a73 100644 --- a/tests/unit/core/models/test_data.py +++ 
b/tests/unit/core/models/test_data.py @@ -13,7 +13,7 @@ class TestTurnData: """General tests for TurnData model.""" - def test_minimal_fields(self): + def test_minimal_fields(self) -> None: """Test TurnData with only required fields.""" turn = TurnData(turn_id="turn1", query="Test query") @@ -22,12 +22,12 @@ def test_minimal_fields(self): assert turn.response is None assert turn.contexts is None - def test_empty_turn_id_fails(self): + def test_empty_turn_id_fails(self) -> None: """Test that empty turn_id fails validation.""" with pytest.raises(ValidationError): TurnData(turn_id="", query="Test") - def test_empty_query_fails(self): + def test_empty_query_fails(self) -> None: """Test that empty query fails validation.""" with pytest.raises(ValidationError): TurnData(turn_id="turn1", query="") @@ -36,13 +36,13 @@ def test_empty_query_fails(self): class TestTurnDataToolCallsValidation: """Test cases for TurnData expected_tool_calls field validation and conversion.""" - def test_single_set_format_conversion(self): + def test_single_set_format_conversion(self) -> None: """Test that single set format is converted to multiple sets format.""" # Single set format (backward compatibility) turn_data = TurnData( turn_id="test_single", query="Test query", - expected_tool_calls=[ + expected_tool_calls=[ # pyright: ignore[reportArgumentType] [{"tool_name": "test_tool", "arguments": {"key": "value"}}] ], ) @@ -50,12 +50,21 @@ def test_single_set_format_conversion(self): # Should be converted to multiple sets format expected = turn_data.expected_tool_calls assert expected is not None - assert len(expected) == 1 # One alternative set - assert len(expected[0]) == 1 # One sequence in the set - assert len(expected[0][0]) == 1 # One tool call in the sequence - assert expected[0][0][0]["tool_name"] == "test_tool" + assert ( + len(expected) == 1 # pylint: disable=unsubscriptable-object + ) # One alternative set + assert ( + len(expected[0]) == 1 # pylint: disable=unsubscriptable-object + ) # One sequence in the set + assert ( + len(expected[0][0]) == 1 # pylint: disable=unsubscriptable-object + ) # One tool call in the sequence + assert ( + expected[0][0][0]["tool_name"] # pylint: disable=unsubscriptable-object + == "test_tool" + ) - def test_multiple_sets_format_preserved(self): + def test_multiple_sets_format_preserved(self) -> None: """Test that multiple sets format is preserved as-is.""" # Multiple sets format turn_data = TurnData( @@ -70,10 +79,16 @@ def test_multiple_sets_format_preserved(self): expected = turn_data.expected_tool_calls assert expected is not None assert len(expected) == 2 # Two alternative sets - assert expected[0][0][0]["tool_name"] == "tool1" - assert expected[1][0][0]["tool_name"] == "tool2" + assert ( + expected[0][0][0]["tool_name"] # pylint: disable=unsubscriptable-object + == "tool1" + ) + assert ( + expected[1][0][0]["tool_name"] # pylint: disable=unsubscriptable-object + == "tool2" + ) - def test_empty_alternatives_allowed(self): + def test_empty_alternatives_allowed(self) -> None: """Test that empty alternatives are allowed as fallback.""" turn_data = TurnData( turn_id="test_flexible", @@ -87,10 +102,14 @@ def test_empty_alternatives_allowed(self): expected = turn_data.expected_tool_calls assert expected is not None assert len(expected) == 2 - assert len(expected[0]) == 1 # First set has one sequence - assert len(expected[1]) == 0 # Second set is empty - - def test_complex_sequences(self): + assert ( + len(expected[0]) == 1 # pylint: disable=unsubscriptable-object + ) # First set 
has one sequence + assert ( + len(expected[1]) == 0 # pylint: disable=unsubscriptable-object + ) # Second set is empty + + def test_complex_sequences(self) -> None: """Test complex tool call sequences.""" turn_data = TurnData( turn_id="test_complex", @@ -107,17 +126,21 @@ def test_complex_sequences(self): expected = turn_data.expected_tool_calls assert expected is not None assert len(expected) == 2 - assert len(expected[0]) == 2 # Two sequences in first set - assert len(expected[1]) == 1 # One sequence in second set - - def test_none_expected_tool_calls(self): + assert ( + len(expected[0]) == 2 # pylint: disable=unsubscriptable-object + ) # Two sequences in first set + assert ( + len(expected[1]) == 1 # pylint: disable=unsubscriptable-object + ) # One sequence in second set + + def test_none_expected_tool_calls(self) -> None: """Test that None is handled correctly.""" turn_data = TurnData( turn_id="test_none", query="Test query", expected_tool_calls=None ) assert turn_data.expected_tool_calls is None - def test_regex_arguments_preserved(self): + def test_regex_arguments_preserved(self) -> None: """Test that regex patterns in arguments are preserved.""" turn_data = TurnData( turn_id="test_regex", @@ -129,18 +152,23 @@ def test_regex_arguments_preserved(self): expected = turn_data.expected_tool_calls assert expected is not None - assert expected[0][0][0]["arguments"]["name"] == "web-server-[0-9]+" + assert ( + expected[0][0][0]["arguments"][ # pylint: disable=unsubscriptable-object + "name" + ] + == "web-server-[0-9]+" + ) - def test_invalid_format_rejected(self): + def test_invalid_format_rejected(self) -> None: """Test that non-list format is rejected.""" with pytest.raises(ValidationError): TurnData( turn_id="test_invalid", query="Test query", - expected_tool_calls="not_a_list", + expected_tool_calls="not_a_list", # pyright: ignore[reportArgumentType] ) - def test_invalid_tool_call_structure_rejected(self): + def test_invalid_tool_call_structure_rejected(self) -> None: """Test that invalid tool call structure is rejected.""" with pytest.raises(ValidationError): TurnData( @@ -149,7 +177,7 @@ def test_invalid_tool_call_structure_rejected(self): expected_tool_calls=[[[{"invalid": "structure"}]]], ) - def test_empty_sequence_rejected(self): + def test_empty_sequence_rejected(self) -> None: """Test that empty sequences are rejected.""" with pytest.raises( ValidationError, @@ -161,7 +189,7 @@ def test_empty_sequence_rejected(self): expected_tool_calls=[[]], ) - def test_empty_set_as_first_element_rejected(self): + def test_empty_set_as_first_element_rejected(self) -> None: """Test that empty set as the first element is rejected.""" with pytest.raises(ValidationError, match="Empty set cannot be the first"): TurnData( @@ -170,7 +198,7 @@ def test_empty_set_as_first_element_rejected(self): expected_tool_calls=[[], []], ) - def test_multiple_empty_alternatives_rejected(self): + def test_multiple_empty_alternatives_rejected(self) -> None: """Test that multiple empty alternatives are rejected as redundant.""" with pytest.raises( ValidationError, match="Found 2 empty alternatives.*redundant" @@ -189,19 +217,19 @@ def test_multiple_empty_alternatives_rejected(self): class TestTurnDataFormatDetection: """Test cases for format detection logic.""" - def test_empty_list_rejected(self): + def test_empty_list_rejected(self) -> None: """Test that empty list is rejected.""" with pytest.raises( ValidationError, match="Empty set cannot be the only alternative" ): TurnData(turn_id="test", query="Test", 
expected_tool_calls=[]) - def test_is_single_set_format_detection(self): + def test_is_single_set_format_detection(self) -> None: """Test detection of single set format.""" turn_data = TurnData( turn_id="test", query="Test", - expected_tool_calls=[ + expected_tool_calls=[ # pyright: ignore[reportArgumentType] [{"tool_name": "tool1", "arguments": {}}], [{"tool_name": "tool2", "arguments": {}}], ], @@ -210,7 +238,9 @@ def test_is_single_set_format_detection(self): expected = turn_data.expected_tool_calls assert expected is not None assert len(expected) == 1 # One alternative set - assert len(expected[0]) == 2 # Two sequences in that set + assert ( + len(expected[0]) == 2 # pylint: disable=unsubscriptable-object + ) # Two sequences in that set class TestTurnDataExpectedResponseValidation: @@ -220,7 +250,7 @@ class TestTurnDataExpectedResponseValidation: "valid_response", ["Single word", ["Response option 1", "Response option 2"]], ) - def test_valid_expected_response(self, valid_response): + def test_valid_expected_response(self, valid_response: str | list[str]) -> None: """Test valid expected_response values.""" turn_data = TurnData( turn_id="test_turn", @@ -229,7 +259,7 @@ def test_valid_expected_response(self, valid_response): ) assert turn_data.expected_response == valid_response - def test_none_expected_response_valid(self): + def test_none_expected_response_valid(self) -> None: """Test that None is valid for expected_response.""" turn_data = TurnData( turn_id="test_turn", @@ -248,7 +278,9 @@ def test_none_expected_response_valid(self): (["valid", " "], "cannot be empty or whitespace"), ], ) - def test_invalid_expected_response(self, invalid_response, match_pattern): + def test_invalid_expected_response( + self, invalid_response: str | list[str], match_pattern: str + ) -> None: """Test that invalid expected_response values are rejected.""" with pytest.raises(ValidationError, match=match_pattern): TurnData( @@ -261,7 +293,7 @@ def test_invalid_expected_response(self, invalid_response, match_pattern): class TestTurnDataKeywordsValidation: """Test cases for expected_keywords validation in TurnData.""" - def test_valid_single_group(self): + def test_valid_single_group(self) -> None: """Test valid expected_keywords with single group.""" turn_data = TurnData( turn_id="test_turn", @@ -270,7 +302,7 @@ def test_valid_single_group(self): ) assert turn_data.expected_keywords == [["keyword1", "keyword2"]] - def test_valid_multiple_groups(self): + def test_valid_multiple_groups(self) -> None: """Test valid expected_keywords with multiple groups.""" turn_data = TurnData( turn_id="test_turn", @@ -280,23 +312,26 @@ def test_valid_multiple_groups(self): ["monitoring", "namespace"], ], ) + assert turn_data.expected_keywords is not None assert len(turn_data.expected_keywords) == 2 - def test_none_is_valid(self): + def test_none_is_valid(self) -> None: """Test that None is valid for expected_keywords.""" turn_data = TurnData( turn_id="test_turn", query="Test query", expected_keywords=None ) assert turn_data.expected_keywords is None - def test_non_list_rejected(self): + def test_non_list_rejected(self) -> None: """Test that non-list expected_keywords is rejected.""" with pytest.raises(ValidationError, match="Input should be a valid list"): TurnData( - turn_id="test_turn", query="Test query", expected_keywords="not_a_list" + turn_id="test_turn", + query="Test query", + expected_keywords="not_a_list", # pyright: ignore[reportArgumentType] ) - def test_empty_inner_list_rejected(self): + def 
test_empty_inner_list_rejected(self) -> None: """Test that empty inner lists are rejected.""" with pytest.raises(ValidationError, match="cannot be empty"): TurnData( @@ -305,7 +340,7 @@ def test_empty_inner_list_rejected(self): expected_keywords=[[], ["valid_list"]], ) - def test_empty_string_element_rejected(self): + def test_empty_string_element_rejected(self) -> None: """Test that empty string elements are rejected.""" with pytest.raises(ValidationError, match="cannot be empty or whitespace"): TurnData( @@ -318,7 +353,7 @@ def test_empty_string_element_rejected(self): class TestEvaluationData: """Tests for EvaluationData model.""" - def test_valid_creation(self): + def test_valid_creation(self) -> None: """Test EvaluationData creation with valid data.""" turns = [ TurnData(turn_id="turn1", query="First query"), @@ -337,30 +372,31 @@ def test_valid_creation(self): assert eval_data.tag == "test_tag" assert len(eval_data.turns) == 2 assert eval_data.description == "Test conversation" + assert eval_data.conversation_metrics is not None assert len(eval_data.conversation_metrics) == 1 - def test_default_tag_value(self): + def test_default_tag_value(self) -> None: """Test EvaluationData has correct default tag value.""" turn = TurnData(turn_id="turn1", query="Query") eval_data = EvaluationData(conversation_group_id="conv1", turns=[turn]) assert eval_data.tag == "eval" - def test_empty_tag_rejected(self): + def test_empty_tag_rejected(self) -> None: """Test that empty tag is rejected.""" turn = TurnData(turn_id="turn1", query="Query") with pytest.raises(ValidationError): EvaluationData(conversation_group_id="conv1", turns=[turn], tag="") - def test_empty_conversation_id_rejected(self): + def test_empty_conversation_id_rejected(self) -> None: """Test that empty conversation_group_id is rejected.""" turn = TurnData(turn_id="turn1", query="Query") with pytest.raises(ValidationError): EvaluationData(conversation_group_id="", turns=[turn]) - def test_empty_turns_rejected(self): + def test_empty_turns_rejected(self) -> None: """Test that empty turns list is rejected.""" with pytest.raises(ValidationError): EvaluationData(conversation_group_id="conv1", turns=[]) @@ -369,7 +405,7 @@ def test_empty_turns_rejected(self): class TestEvaluationResult: """Tests for EvaluationResult model.""" - def test_default_values(self): + def test_default_values(self) -> None: """Test EvaluationResult has correct default values.""" result = EvaluationResult( conversation_group_id="conv1", @@ -385,7 +421,7 @@ def test_default_values(self): assert result.reason == "" assert result.execution_time == 0 - def test_explicit_tag_value(self): + def test_explicit_tag_value(self) -> None: """Test EvaluationResult with explicit tag value.""" result = EvaluationResult( conversation_group_id="conv1", @@ -398,7 +434,7 @@ def test_explicit_tag_value(self): assert result.tag == "custom_tag" - def test_empty_tag_rejected(self): + def test_empty_tag_rejected(self) -> None: """Test that empty tag is rejected.""" with pytest.raises(ValidationError): EvaluationResult( @@ -410,7 +446,7 @@ def test_empty_tag_rejected(self): threshold=0.7, ) - def test_invalid_result_status_rejected(self): + def test_invalid_result_status_rejected(self) -> None: """Test that invalid result status is rejected.""" with pytest.raises(ValidationError, match="Result must be one of"): EvaluationResult( @@ -421,7 +457,7 @@ def test_invalid_result_status_rejected(self): threshold=0.7, ) - def test_negative_execution_time_rejected(self): + def 
test_negative_execution_time_rejected(self) -> None: """Test that negative execution_time is rejected.""" with pytest.raises(ValidationError): EvaluationResult( @@ -433,7 +469,7 @@ def test_negative_execution_time_rejected(self): execution_time=-1, ) - def test_conversation_level_metric_allows_none_turn_id(self): + def test_conversation_level_metric_allows_none_turn_id(self) -> None: """Test that turn_id can be None for conversation-level metrics.""" result = EvaluationResult( conversation_group_id="conv1", diff --git a/tests/unit/core/models/test_system_additional.py b/tests/unit/core/models/test_system_additional.py index 282d19d0..1a860a28 100644 --- a/tests/unit/core/models/test_system_additional.py +++ b/tests/unit/core/models/test_system_additional.py @@ -4,6 +4,7 @@ import tempfile import pytest from pydantic import ValidationError +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import ( LLMConfig, @@ -18,47 +19,47 @@ class TestLLMConfig: """Additional tests for LLMConfig.""" - def test_temperature_validation_min(self): + def test_temperature_validation_min(self) -> None: """Test temperature minimum validation.""" with pytest.raises(ValidationError): LLMConfig(temperature=-0.1) - def test_temperature_validation_max(self): + def test_temperature_validation_max(self) -> None: """Test temperature maximum validation.""" with pytest.raises(ValidationError): LLMConfig(temperature=2.1) - def test_max_tokens_validation(self): + def test_max_tokens_validation(self) -> None: """Test max_tokens minimum validation.""" with pytest.raises(ValidationError): LLMConfig(max_tokens=0) - def test_timeout_validation(self): + def test_timeout_validation(self) -> None: """Test timeout minimum validation.""" with pytest.raises(ValidationError): LLMConfig(timeout=0) - def test_num_retries_validation(self): + def test_num_retries_validation(self) -> None: """Test num_retries minimum validation.""" with pytest.raises(ValidationError): LLMConfig(num_retries=-1) - def test_ssl_verify_default(self): + def test_ssl_verify_default(self) -> None: """Test ssl_verify has correct default value.""" config = LLMConfig() assert config.ssl_verify is True - def test_ssl_verify_false(self): + def test_ssl_verify_false(self) -> None: """Test ssl_verify can be set to False.""" config = LLMConfig(ssl_verify=False) assert config.ssl_verify is False - def test_ssl_cert_file_default(self): + def test_ssl_cert_file_default(self) -> None: """Test ssl_cert_file defaults to None.""" config = LLMConfig() assert config.ssl_cert_file is None - def test_ssl_cert_file_valid_path(self): + def test_ssl_cert_file_valid_path(self) -> None: """Test ssl_cert_file with valid certificate file.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".crt", delete=False) as f: cert_path = f.name @@ -66,12 +67,13 @@ def test_ssl_cert_file_valid_path(self): try: config = LLMConfig(ssl_cert_file=cert_path) + assert config.ssl_cert_file is not None assert config.ssl_cert_file == os.path.abspath(cert_path) assert os.path.isabs(config.ssl_cert_file) finally: os.unlink(cert_path) - def test_ssl_cert_file_expands_env_variables(self, mocker): + def test_ssl_cert_file_expands_env_variables(self, mocker: MockerFixture) -> None: """Test ssl_cert_file expands environment variables.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".crt", delete=False) as f: cert_path = f.name @@ -87,14 +89,14 @@ def test_ssl_cert_file_expands_env_variables(self, mocker): finally: os.unlink(cert_path) - def 
test_ssl_cert_file_nonexistent_raises_error(self): + def test_ssl_cert_file_nonexistent_raises_error(self) -> None: """Test ssl_cert_file validation fails for non-existent file.""" with pytest.raises(ValidationError) as exc_info: LLMConfig(ssl_cert_file="/tmp/nonexistent_cert_12345.crt") assert "not found" in str(exc_info.value).lower() - def test_ssl_cert_file_directory_raises_error(self): + def test_ssl_cert_file_directory_raises_error(self) -> None: """Test ssl_cert_file validation fails for directory paths.""" temp_dir = tempfile.gettempdir() with pytest.raises(ValidationError): @@ -104,7 +106,7 @@ def test_ssl_cert_file_directory_raises_error(self): class TestEmbeddingConfig: """Tests for EmbeddingConfig.""" - def test_default_values(self): + def test_default_values(self) -> None: """Test default embedding configuration.""" config = EmbeddingConfig() @@ -112,7 +114,7 @@ def test_default_values(self): assert config.model is not None assert config.cache_enabled is True - def test_custom_embedding_model(self): + def test_custom_embedding_model(self) -> None: """Test custom embedding model configuration.""" config = EmbeddingConfig( provider="openai", @@ -126,7 +128,7 @@ def test_custom_embedding_model(self): class TestAPIConfig: """Tests for APIConfig.""" - def test_default_api_config(self): + def test_default_api_config(self) -> None: """Test default API configuration.""" config = APIConfig() @@ -134,7 +136,7 @@ def test_default_api_config(self): assert isinstance(config.cache_enabled, bool) assert config.timeout > 0 - def test_custom_api_config(self): + def test_custom_api_config(self) -> None: """Test custom API configuration.""" config = APIConfig( enabled=True, @@ -146,7 +148,7 @@ def test_custom_api_config(self): assert config.api_base == "https://custom.api.com" assert config.timeout == 300 - def test_timeout_validation(self): + def test_timeout_validation(self) -> None: """Test API timeout validation.""" with pytest.raises(ValidationError): APIConfig(timeout=0) @@ -155,14 +157,14 @@ def test_timeout_validation(self): class TestOutputConfig: """Tests for OutputConfig.""" - def test_default_output_config(self): + def test_default_output_config(self) -> None: """Test default output configuration.""" config = OutputConfig() assert "csv" in config.enabled_outputs assert len(config.csv_columns) > 0 - def test_custom_output_config(self): + def test_custom_output_config(self) -> None: """Test custom output configuration.""" config = OutputConfig( enabled_outputs=["json"], @@ -172,7 +174,7 @@ def test_custom_output_config(self): assert config.enabled_outputs == ["json"] assert len(config.csv_columns) == 2 - def test_minimal_csv_columns(self): + def test_minimal_csv_columns(self) -> None: """Test with minimal CSV columns.""" config = OutputConfig(csv_columns=["result"]) assert len(config.csv_columns) >= 1 @@ -181,7 +183,7 @@ def test_minimal_csv_columns(self): class TestVisualizationConfig: """Tests for VisualizationConfig.""" - def test_default_visualization_config(self): + def test_default_visualization_config(self) -> None: """Test default visualization configuration.""" config = VisualizationConfig() @@ -189,12 +191,12 @@ def test_default_visualization_config(self): assert config.dpi > 0 assert len(config.figsize) == 2 - def test_custom_visualization_config(self): + def test_custom_visualization_config(self) -> None: """Test custom visualization configuration.""" config = VisualizationConfig( enabled_graphs=["pass_rates", "score_distribution"], dpi=150, - figsize=(12, 8), + 
figsize=(12, 8), # pyright: ignore[reportArgumentType] ) assert "pass_rates" in config.enabled_graphs @@ -202,7 +204,7 @@ def test_custom_visualization_config(self): assert config.dpi == 150 assert config.figsize == [12, 8] # Pydantic converts tuple to list - def test_dpi_validation(self): + def test_dpi_validation(self) -> None: """Test DPI validation.""" with pytest.raises(ValidationError): VisualizationConfig(dpi=0) @@ -211,7 +213,7 @@ def test_dpi_validation(self): class TestLoggingConfig: """Tests for LoggingConfig.""" - def test_default_logging_config(self): + def test_default_logging_config(self) -> None: """Test default logging configuration.""" config = LoggingConfig() @@ -219,7 +221,7 @@ def test_default_logging_config(self): assert config.package_level is not None assert isinstance(config.package_overrides, dict) - def test_custom_logging_config(self): + def test_custom_logging_config(self) -> None: """Test custom logging configuration.""" config = LoggingConfig( source_level="DEBUG", @@ -231,7 +233,7 @@ def test_custom_logging_config(self): assert config.package_level == "ERROR" assert config.package_overrides["httpx"] == "CRITICAL" - def test_show_timestamps_toggle(self): + def test_show_timestamps_toggle(self) -> None: """Test show_timestamps configuration.""" config1 = LoggingConfig(show_timestamps=True) config2 = LoggingConfig(show_timestamps=False) diff --git a/tests/unit/core/output/conftest.py b/tests/unit/core/output/conftest.py new file mode 100644 index 00000000..7203df1d --- /dev/null +++ b/tests/unit/core/output/conftest.py @@ -0,0 +1,95 @@ +"""Pytest configuration and fixtures for output tests.""" + +import pytest +from pytest_mock import MockerFixture +from lightspeed_evaluation.core.models import EvaluationResult + + +@pytest.fixture +def sample_results() -> list[EvaluationResult]: + """Create sample evaluation results.""" + return [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="ragas:faithfulness", + score=0.85, + result="PASS", + threshold=0.7, + reason="Good", + query="What is Python?", + response="Python is a programming language", + ), + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn2", + metric_identifier="ragas:answer_relevancy", + score=0.60, + result="FAIL", + threshold=0.7, + reason="Low score", + query="How?", + response="It works", + ), + ] + + +@pytest.fixture +def sample_results_statistics() -> list[EvaluationResult]: + """Create sample evaluation results.""" + return [ + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn1", + metric_identifier="metric1", + score=0.9, + result="PASS", + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv1", + turn_id="turn2", + metric_identifier="metric1", + score=0.5, + result="FAIL", + threshold=0.7, + reason="Low score", + ), + EvaluationResult( + conversation_group_id="conv2", + turn_id="turn1", + metric_identifier="metric2", + score=0.8, + result="PASS", + threshold=0.7, + reason="Good", + ), + EvaluationResult( + conversation_group_id="conv2", + turn_id="turn2", + metric_identifier="metric2", + score=None, + result="ERROR", + threshold=0.7, + reason="Failed", + ), + ] + + +@pytest.fixture +def mock_system_config(mocker: MockerFixture) -> MockerFixture: + """Create mock system config.""" + config = mocker.Mock() + config.output.enabled_outputs = ["csv", "json", "txt"] + config.output.csv_columns = [ + "conversation_group_id", + "turn_id", + "metric_identifier", + "result", + "score", + ] + 
config.visualization.enabled_graphs = [] + # Mock model_fields to support iteration in _write_config_params and _build_config_dict + config.model_fields.keys.return_value = [] + return config diff --git a/tests/unit/core/output/test_final_coverage.py b/tests/unit/core/output/test_final_coverage.py index 4139413c..bbf9c8e8 100644 --- a/tests/unit/core/output/test_final_coverage.py +++ b/tests/unit/core/output/test_final_coverage.py @@ -1,16 +1,21 @@ """Additional tests to boost coverage towards 75%.""" +from pathlib import Path + +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import EvaluationResult +from lightspeed_evaluation.core.output.generator import OutputHandler from lightspeed_evaluation.core.output.statistics import ( calculate_basic_stats, calculate_detailed_stats, ) +from lightspeed_evaluation.core.system.loader import validate_metrics class TestStatisticsEdgeCases: """Edge case tests for statistics module.""" - def test_stats_with_mixed_results(self): + def test_stats_with_mixed_results(self) -> None: """Test statistics with all result types.""" results = [ EvaluationResult( @@ -32,7 +37,7 @@ def test_stats_with_mixed_results(self): assert len(detailed["by_metric"]) > 0 assert len(detailed["by_conversation"]) == 2 - def test_detailed_stats_single_conversation_multiple_metrics(self): + def test_detailed_stats_single_conversation_multiple_metrics(self) -> None: """Test detailed stats with one conversation, multiple metrics.""" results = [ EvaluationResult( @@ -52,7 +57,7 @@ def test_detailed_stats_single_conversation_multiple_metrics(self): assert len(detailed["by_metric"]) == 10 assert detailed["by_conversation"]["conv1"]["pass"] == 10 - def test_detailed_stats_multiple_conversations_single_metric(self): + def test_detailed_stats_multiple_conversations_single_metric(self) -> None: """Test detailed stats with multiple conversations, one metric.""" results = [ EvaluationResult( @@ -77,9 +82,8 @@ def test_detailed_stats_multiple_conversations_single_metric(self): class TestOutputHandlerEdgeCases: """Edge case tests for output handler.""" - def test_calculate_stats_with_single_result(self, tmp_path): + def test_calculate_stats_with_single_result(self, tmp_path: Path) -> None: """Test stats calculation with exactly one result.""" - from lightspeed_evaluation.core.output.generator import OutputHandler handler = OutputHandler(output_dir=str(tmp_path)) results = [ @@ -93,15 +97,16 @@ def test_calculate_stats_with_single_result(self, tmp_path): ) ] - stats = handler._calculate_stats(results) + stats = handler._calculate_stats(results) # pylint: disable=protected-access assert stats["basic"]["TOTAL"] == 1 assert stats["basic"]["PASS"] == 1 assert stats["basic"]["pass_rate"] == 100.0 - def test_generate_csv_with_minimal_columns(self, tmp_path, mocker): + def test_generate_csv_with_minimal_columns( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test CSV generation with minimal column set.""" - from lightspeed_evaluation.core.output.generator import OutputHandler config = mocker.Mock() config.output.csv_columns = ["conversation_group_id", "result"] @@ -118,7 +123,9 @@ def test_generate_csv_with_minimal_columns(self, tmp_path, mocker): ) ] - csv_file = handler._generate_csv_report(results, "test") + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + results, "test" + ) assert csv_file.exists() content = csv_file.read_text() @@ -127,12 +134,11 @@ def test_generate_csv_with_minimal_columns(self, tmp_path, mocker): assert 
"PASS" in content -class TestSystemLoaderEdgeCases: +class TestSystemLoaderEdgeCases: # pylint: disable=too-few-public-methods """Edge case tests for system loader.""" - def test_validate_metrics_with_mixed_valid_invalid(self): + def test_validate_metrics_with_mixed_valid_invalid(self) -> None: """Test validating mix of valid and invalid metrics.""" - from lightspeed_evaluation.core.system.loader import validate_metrics turn_metrics = [ "ragas:faithfulness", diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py index 0f8aa4a2..5b4b2ef8 100644 --- a/tests/unit/core/output/test_generator.py +++ b/tests/unit/core/output/test_generator.py @@ -1,64 +1,19 @@ """Unit tests for output generator.""" import json +from pathlib import Path -import pytest +import csv as csv_module +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import EvaluationResult from lightspeed_evaluation.core.output.generator import OutputHandler -@pytest.fixture -def sample_results(): - """Create sample evaluation results.""" - return [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="ragas:faithfulness", - score=0.85, - result="PASS", - threshold=0.7, - reason="Good", - query="What is Python?", - response="Python is a programming language", - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn2", - metric_identifier="ragas:answer_relevancy", - score=0.60, - result="FAIL", - threshold=0.7, - reason="Low score", - query="How?", - response="It works", - ), - ] - - -@pytest.fixture -def mock_system_config(mocker): - """Create mock system config.""" - config = mocker.Mock() - config.output.enabled_outputs = ["csv", "json", "txt"] - config.output.csv_columns = [ - "conversation_group_id", - "turn_id", - "metric_identifier", - "result", - "score", - ] - config.visualization.enabled_graphs = [] - # Mock model_fields to support iteration in _write_config_params and _build_config_dict - config.model_fields.keys.return_value = [] - return config - - class TestOutputHandler: """Tests for OutputHandler.""" - def test_initialization(self, tmp_path): + def test_initialization(self, tmp_path: Path) -> None: """Test handler initialization.""" handler = OutputHandler(output_dir=str(tmp_path), base_filename="test") @@ -66,32 +21,43 @@ def test_initialization(self, tmp_path): assert handler.base_filename == "test" assert tmp_path.exists() - def test_calculate_stats_with_results(self, tmp_path, sample_results): + def test_calculate_stats_with_results( + self, tmp_path: Path, sample_results: list[EvaluationResult] + ) -> None: """Test statistics calculation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) assert stats["basic"]["TOTAL"] == 2 assert stats["basic"]["PASS"] == 1 assert stats["basic"]["FAIL"] == 1 assert "detailed" in stats - def test_calculate_stats_empty(self, tmp_path): + def test_calculate_stats_empty(self, tmp_path: Path) -> None: """Test statistics with empty results.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats([]) + stats = handler._calculate_stats([]) # pylint: disable=protected-access assert stats["basic"]["TOTAL"] == 0 - assert stats["detailed"]["by_metric"] == {} - - def test_generate_csv_report(self, tmp_path, sample_results, mock_system_config): + assert not stats["detailed"]["by_metric"] + + def 
test_generate_csv_report( + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mock_system_config: MockerFixture, + ) -> None: """Test CSV generation.""" handler = OutputHandler( output_dir=str(tmp_path), system_config=mock_system_config, ) - csv_file = handler._generate_csv_report(sample_results, "test") + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + sample_results, "test" + ) assert csv_file.exists() assert csv_file.suffix == ".csv" @@ -101,18 +67,22 @@ def test_generate_csv_report(self, tmp_path, sample_results, mock_system_config) assert "conversation_group_id" in content assert "conv1" in content - def test_generate_json_summary(self, tmp_path, sample_results): + def test_generate_json_summary( + self, tmp_path: Path, sample_results: list[EvaluationResult] + ) -> None: """Test JSON summary generation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) api_tokens = { "total_api_input_tokens": 100, "total_api_output_tokens": 200, "total_api_tokens": 300, } - streaming_stats = {} + streaming_stats: dict = {} - json_file = handler._generate_json_summary( + json_file = handler._generate_json_summary( # pylint: disable=protected-access sample_results, "test", stats["basic"], @@ -124,7 +94,7 @@ def test_generate_json_summary(self, tmp_path, sample_results): assert json_file.exists() # Verify structure - with open(json_file) as f: + with open(json_file, encoding="utf-8") as f: data = json.load(f) assert "summary_stats" in data or "results" in data @@ -133,18 +103,22 @@ def test_generate_json_summary(self, tmp_path, sample_results): assert "summary_stats" in data assert data["summary_stats"]["overall"]["total_api_tokens"] == 300 - def test_generate_text_summary(self, tmp_path, sample_results): + def test_generate_text_summary( + self, tmp_path: Path, sample_results: list[EvaluationResult] + ) -> None: """Test text summary generation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) api_tokens = { "total_api_input_tokens": 100, "total_api_output_tokens": 200, "total_api_tokens": 300, } - streaming_stats = {} + streaming_stats: dict = {} - txt_file = handler._generate_text_summary( + txt_file = handler._generate_text_summary( # pylint: disable=protected-access sample_results, "test", stats["basic"], @@ -160,15 +134,19 @@ def test_generate_text_summary(self, tmp_path, sample_results): # Verify API token usage is included assert "Token Usage (API Calls)" in content - def test_get_output_directory(self, tmp_path): + def test_get_output_directory(self, tmp_path: Path) -> None: """Test get output directory.""" handler = OutputHandler(output_dir=str(tmp_path)) assert handler.get_output_directory() == tmp_path def test_generate_reports_creates_files( - self, tmp_path, sample_results, mock_system_config, mocker - ): + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mock_system_config: MockerFixture, + mocker: MockerFixture, + ) -> None: """Test that generate_reports creates output files.""" mock_now = mocker.Mock() mock_now.strftime.return_value = "20250101_120000" @@ -192,8 +170,8 @@ def test_generate_reports_creates_files( assert (tmp_path / "eval_20250101_120000_summary.txt").exists() def test_generate_reports_with_empty_results( - self, 
tmp_path, mock_system_config, mocker - ): + self, tmp_path: Path, mock_system_config: MockerFixture, mocker: MockerFixture + ) -> None: """Test generating reports with no results.""" mock_now = mocker.Mock() mock_now.strftime.return_value = "20250101_120000" @@ -212,8 +190,11 @@ def test_generate_reports_with_empty_results( handler.generate_reports([]) def test_generate_individual_reports_csv_only( - self, tmp_path, sample_results, mocker - ): + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test generating only CSV.""" config = mocker.Mock() config.output.enabled_outputs = ["csv"] @@ -221,15 +202,22 @@ def test_generate_individual_reports_csv_only( config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) - handler._generate_individual_reports(sample_results, "test", ["csv"], stats) + handler._generate_individual_reports( # pylint: disable=protected-access + sample_results, "test", ["csv"], stats + ) assert (tmp_path / "test_detailed.csv").exists() def test_generate_individual_reports_json_only( - self, tmp_path, sample_results, mocker - ): + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test generating only JSON.""" config = mocker.Mock() config.output.enabled_outputs = ["json"] @@ -237,15 +225,22 @@ def test_generate_individual_reports_json_only( config.model_fields.keys.return_value = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats(sample_results) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) - handler._generate_individual_reports(sample_results, "test", ["json"], stats) + handler._generate_individual_reports( # pylint: disable=protected-access + sample_results, "test", ["json"], stats + ) assert (tmp_path / "test_summary.json").exists() def test_generate_individual_reports_txt_only( - self, tmp_path, sample_results, mocker - ): + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test generating only TXT.""" config = mocker.Mock() config.output.enabled_outputs = ["txt"] @@ -253,13 +248,21 @@ def test_generate_individual_reports_txt_only( config.model_fields.keys.return_value = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats(sample_results) - - handler._generate_individual_reports(sample_results, "test", ["txt"], stats) + stats = handler._calculate_stats( # pylint: disable=protected-access + sample_results + ) + handler._generate_individual_reports( # pylint: disable=protected-access + sample_results, "test", ["txt"], stats + ) assert (tmp_path / "test_summary.txt").exists() - def test_csv_with_all_columns(self, tmp_path, sample_results, mocker): + def test_csv_with_all_columns( + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test CSV with all available columns.""" config = mocker.Mock() config.output.csv_columns = [ @@ -276,14 +279,21 @@ def test_csv_with_all_columns(self, tmp_path, sample_results, mocker): config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - csv_file = handler._generate_csv_report(sample_results, "test") + 
csv_file = handler._generate_csv_report( # pylint: disable=protected-access + sample_results, "test" + ) content = csv_file.read_text() assert "query" in content assert "response" in content assert "Python" in content - def test_generate_reports_without_config(self, tmp_path, sample_results, mocker): + def test_generate_reports_without_config( + self, + tmp_path: Path, + sample_results: list[EvaluationResult], + mocker: MockerFixture, + ) -> None: """Test generating reports without system config.""" mock_now = mocker.Mock() mock_now.strftime.return_value = "20250101_120000" @@ -303,7 +313,9 @@ def test_generate_reports_without_config(self, tmp_path, sample_results, mocker) class TestOutputHandlerInitialization: """Additional tests for OutputHandler initialization and configuration.""" - def test_output_handler_initialization_default(self, tmp_path, mocker): + def test_output_handler_initialization_default( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test OutputHandler initialization with default parameters.""" mock_print = mocker.patch("builtins.print") @@ -316,7 +328,9 @@ def test_output_handler_initialization_default(self, tmp_path, mocker): mock_print.assert_called_with(f"✅ Output handler initialized: {tmp_path}") - def test_output_handler_initialization_custom(self, tmp_path, mocker): + def test_output_handler_initialization_custom( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test OutputHandler initialization with custom parameters.""" system_config = mocker.Mock() system_config.llm.provider = "openai" @@ -333,7 +347,9 @@ def test_output_handler_initialization_custom(self, tmp_path, mocker): assert handler.base_filename == "custom_eval" assert handler.system_config == system_config - def test_output_handler_creates_directory(self, tmp_path, mocker): + def test_output_handler_creates_directory( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test that OutputHandler creates output directory if it doesn't exist.""" output_path = tmp_path / "new_output_dir" @@ -344,7 +360,9 @@ def test_output_handler_creates_directory(self, tmp_path, mocker): assert handler.output_dir.exists() assert handler.output_dir.is_dir() - def test_generate_csv_with_specific_results(self, tmp_path, mocker): + def test_generate_csv_with_specific_results( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test CSV report generation with specific results.""" results = [ EvaluationResult( @@ -391,14 +409,13 @@ def test_generate_csv_with_specific_results(self, tmp_path, mocker): mocker.patch("builtins.print") handler = OutputHandler(output_dir=str(tmp_path)) - csv_file = handler._generate_csv_report(results, "test_eval") + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + results, "test_eval" + ) assert csv_file.exists() assert csv_file.suffix == ".csv" - # Read and verify CSV content - import csv as csv_module - with open(csv_file, encoding="utf-8") as f: reader = csv_module.DictReader(f) rows = list(reader) @@ -420,7 +437,9 @@ def test_generate_csv_with_specific_results(self, tmp_path, mocker): assert rows[2]["query"] == "Create namespace" assert rows[2]["contexts"] == "" - def test_csv_columns_configuration(self, tmp_path, mocker): + def test_csv_columns_configuration( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test that CSV uses configured columns.""" results = [ EvaluationResult( @@ -442,10 +461,9 @@ def test_csv_columns_configuration(self, tmp_path, mocker): system_config.visualization.enabled_graphs = [] 
handler = OutputHandler(output_dir=str(tmp_path), system_config=system_config) - csv_file = handler._generate_csv_report(results, "test_eval") - - # Read CSV headers - import csv as csv_module + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + results, "test_eval" + ) with open(csv_file, encoding="utf-8") as f: reader = csv_module.reader(f) @@ -453,9 +471,11 @@ def test_csv_columns_configuration(self, tmp_path, mocker): assert headers == ["conversation_group_id", "result", "score"] - def test_filename_timestamp_format(self, tmp_path, mocker): + def test_filename_timestamp_format( + self, tmp_path: Path, mocker: MockerFixture + ) -> None: """Test that generated filenames include proper timestamps.""" - results = [] + results: list = [] mocker.patch("builtins.print") @@ -467,7 +487,9 @@ def test_filename_timestamp_format(self, tmp_path, mocker): ) mock_datetime.now.return_value.strftime.return_value = "20240101_120000" - csv_file = handler._generate_csv_report(results, "test_20240101_120000") + csv_file = handler._generate_csv_report( # pylint: disable=protected-access + results, "test_20240101_120000" + ) assert "test_20240101_120000" in csv_file.name assert csv_file.suffix == ".csv" diff --git a/tests/unit/core/output/test_statistics.py b/tests/unit/core/output/test_statistics.py index 83fe8b0a..e94b8578 100644 --- a/tests/unit/core/output/test_statistics.py +++ b/tests/unit/core/output/test_statistics.py @@ -13,53 +13,10 @@ ) -@pytest.fixture -def sample_results(): - """Create sample evaluation results.""" - return [ - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn1", - metric_identifier="metric1", - score=0.9, - result="PASS", - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv1", - turn_id="turn2", - metric_identifier="metric1", - score=0.5, - result="FAIL", - threshold=0.7, - reason="Low score", - ), - EvaluationResult( - conversation_group_id="conv2", - turn_id="turn1", - metric_identifier="metric2", - score=0.8, - result="PASS", - threshold=0.7, - reason="Good", - ), - EvaluationResult( - conversation_group_id="conv2", - turn_id="turn2", - metric_identifier="metric2", - score=None, - result="ERROR", - threshold=0.7, - reason="Failed", - ), - ] - - class TestBootstrapIntervals: """Tests for bootstrap_intervals function.""" - def test_bootstrap_intervals_basic(self): + def test_bootstrap_intervals_basic(self) -> None: """Test basic bootstrap interval calculation.""" series = pd.Series([0.8, 0.85, 0.9, 0.75, 0.88]) @@ -71,21 +28,21 @@ def test_bootstrap_intervals_basic(self): assert 0 <= low <= 1 assert 0 <= high <= 1 - def test_bootstrap_intervals_invalid_confidence(self): + def test_bootstrap_intervals_invalid_confidence(self) -> None: """Test bootstrap with invalid confidence value.""" series = pd.Series([0.8, 0.85, 0.9]) with pytest.raises(ValueError, match="Invalid confidence"): bootstrap_intervals(series, confidence=150) - def test_bootstrap_intervals_negative_confidence(self): + def test_bootstrap_intervals_negative_confidence(self) -> None: """Test bootstrap with negative confidence value.""" series = pd.Series([0.8, 0.85, 0.9]) with pytest.raises(ValueError, match="Invalid confidence"): bootstrap_intervals(series, confidence=-10) - def test_bootstrap_intervals_custom_confidence(self): + def test_bootstrap_intervals_custom_confidence(self) -> None: """Test bootstrap with custom confidence level.""" series = pd.Series([0.8, 0.85, 0.9, 0.75, 0.88]) @@ -95,7 +52,7 @@ def 
test_bootstrap_intervals_custom_confidence(self): assert low <= mean <= high - def test_bootstrap_intervals_custom_steps(self): + def test_bootstrap_intervals_custom_steps(self) -> None: """Test bootstrap with custom bootstrap steps.""" series = pd.Series([0.8, 0.85, 0.9]) @@ -103,7 +60,7 @@ def test_bootstrap_intervals_custom_steps(self): assert low <= mean <= high - def test_bootstrap_intervals_valid_confidence(self): + def test_bootstrap_intervals_valid_confidence(self) -> None: """Test bootstrap_intervals with valid confidence levels.""" data = pd.Series([0.8, 0.9, 0.7, 0.85, 0.75]) @@ -121,7 +78,7 @@ def test_bootstrap_intervals_valid_confidence(self): ci_90_width = high_90 - low_90 assert ci_90_width < ci_95_width - def test_bootstrap_intervals_edge_cases(self): + def test_bootstrap_intervals_edge_cases(self) -> None: """Test bootstrap_intervals with edge cases.""" # Test with single value single_value = pd.Series([0.5]) @@ -135,7 +92,7 @@ def test_bootstrap_intervals_edge_cases(self): assert abs(mean - 0.8) < 0.001 assert abs(high - 0.8) < 0.001 - def test_bootstrap_intervals_confidence_levels(self): + def test_bootstrap_intervals_confidence_levels(self) -> None: """Test bootstrap_intervals with different confidence levels.""" data = pd.Series([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) @@ -151,7 +108,7 @@ def test_bootstrap_intervals_confidence_levels(self): class TestCalculateScoreStatistics: """Tests for _calculate_score_statistics function.""" - def test_score_statistics_multiple_scores(self): + def test_score_statistics_multiple_scores(self) -> None: """Test score statistics with multiple scores includes confidence interval.""" scores = [0.8, 0.85, 0.9, 0.75, 0.88] result = _calculate_score_statistics(scores) @@ -172,7 +129,7 @@ def test_score_statistics_multiple_scores(self): assert ci["confidence_level"] == 95 assert ci["low"] < ci["mean"] < ci["high"] - def test_score_statistics_two_scores(self): + def test_score_statistics_two_scores(self) -> None: """Test score statistics with exactly 2 scores includes CI.""" scores = [0.7, 0.9] result = _calculate_score_statistics(scores) @@ -181,7 +138,7 @@ def test_score_statistics_two_scores(self): assert result["mean"] == 0.8 assert result["confidence_interval"] is not None - def test_score_statistics_single_score_no_ci(self): + def test_score_statistics_single_score_no_ci(self) -> None: """Test score statistics with single score has no confidence interval.""" scores = [0.8] result = _calculate_score_statistics(scores) @@ -191,7 +148,7 @@ def test_score_statistics_single_score_no_ci(self): assert result["std"] == 0.0 # No std for single value assert result["confidence_interval"] is None - def test_score_statistics_empty_scores(self): + def test_score_statistics_empty_scores(self) -> None: """Test score statistics with empty list returns zeros and no CI.""" result = _calculate_score_statistics([]) @@ -207,9 +164,11 @@ def test_score_statistics_empty_scores(self): class TestCalculateBasicStats: """Tests for calculate_basic_stats function.""" - def test_basic_stats_with_results(self, sample_results): + def test_basic_stats_with_results( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test basic stats calculation with results.""" - stats = calculate_basic_stats(sample_results) + stats = calculate_basic_stats(sample_results_statistics) assert stats["TOTAL"] == 4 assert stats["PASS"] == 2 @@ -219,7 +178,7 @@ def test_basic_stats_with_results(self, sample_results): assert stats["fail_rate"] == 25.0 assert 
stats["error_rate"] == 25.0 - def test_basic_stats_empty_results(self): + def test_basic_stats_empty_results(self) -> None: """Test basic stats with empty results.""" stats = calculate_basic_stats([]) @@ -231,7 +190,7 @@ def test_basic_stats_empty_results(self): assert stats["fail_rate"] == 0.0 assert stats["error_rate"] == 0.0 - def test_basic_stats_all_pass(self): + def test_basic_stats_all_pass(self) -> None: """Test basic stats with all passing results.""" results = [ EvaluationResult( @@ -252,7 +211,7 @@ def test_basic_stats_all_pass(self): assert stats["pass_rate"] == 100.0 assert stats["fail_rate"] == 0.0 - def test_basic_stats_all_fail(self): + def test_basic_stats_all_fail(self) -> None: """Test basic stats with all failing results.""" results = [ EvaluationResult( @@ -273,7 +232,7 @@ def test_basic_stats_all_fail(self): assert stats["fail_rate"] == 100.0 assert stats["pass_rate"] == 0.0 - def test_basic_stats_all_error(self): + def test_basic_stats_all_error(self) -> None: """Test basic stats with all error results.""" results = [ EvaluationResult( @@ -293,7 +252,7 @@ def test_basic_stats_all_error(self): assert stats["ERROR"] == 2 assert stats["error_rate"] == 100.0 - def test_calculate_basic_stats_mixed_results(self): + def test_calculate_basic_stats_mixed_results(self) -> None: """Test calculate_basic_stats with mixed results.""" results = [ EvaluationResult( @@ -352,7 +311,7 @@ def test_calculate_basic_stats_mixed_results(self): } assert stats == expected - def test_calculate_basic_stats_single_result(self): + def test_calculate_basic_stats_single_result(self) -> None: """Test calculate_basic_stats with single result.""" results = [ EvaluationResult( @@ -388,9 +347,11 @@ def test_calculate_basic_stats_single_result(self): class TestCalculateDetailedStats: """Tests for calculate_detailed_stats function.""" - def test_detailed_stats_with_results(self, sample_results): + def test_detailed_stats_with_results( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test detailed stats calculation.""" - stats = calculate_detailed_stats(sample_results) + stats = calculate_detailed_stats(sample_results_statistics) assert "by_metric" in stats assert "by_conversation" in stats @@ -399,16 +360,18 @@ def test_detailed_stats_with_results(self, sample_results): assert "conv1" in stats["by_conversation"] assert "conv2" in stats["by_conversation"] - def test_detailed_stats_empty_results(self): + def test_detailed_stats_empty_results(self) -> None: """Test detailed stats with empty results.""" stats = calculate_detailed_stats([]) - assert stats["by_metric"] == {} - assert stats["by_conversation"] == {} + assert not stats["by_metric"] + assert not stats["by_conversation"] - def test_detailed_stats_metric_breakdown(self, sample_results): + def test_detailed_stats_metric_breakdown( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test metric breakdown in detailed stats.""" - stats = calculate_detailed_stats(sample_results) + stats = calculate_detailed_stats(sample_results_statistics) metric1_stats = stats["by_metric"]["metric1"] assert metric1_stats["pass"] == 1 @@ -418,9 +381,11 @@ def test_detailed_stats_metric_breakdown(self, sample_results): assert metric2_stats["pass"] == 1 assert metric2_stats["error"] == 1 - def test_detailed_stats_conversation_breakdown(self, sample_results): + def test_detailed_stats_conversation_breakdown( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test conversation breakdown in detailed 
stats.""" - stats = calculate_detailed_stats(sample_results) + stats = calculate_detailed_stats(sample_results_statistics) conv1_stats = stats["by_conversation"]["conv1"] assert conv1_stats["pass"] == 1 @@ -430,9 +395,11 @@ def test_detailed_stats_conversation_breakdown(self, sample_results): assert conv2_stats["pass"] == 1 assert conv2_stats["error"] == 1 - def test_detailed_stats_includes_rates(self, sample_results): + def test_detailed_stats_includes_rates( + self, sample_results_statistics: list[EvaluationResult] + ) -> None: """Test that detailed stats include percentage rates.""" - stats = calculate_detailed_stats(sample_results) + stats = calculate_detailed_stats(sample_results_statistics) metric1_stats = stats["by_metric"]["metric1"] assert "pass_rate" in metric1_stats @@ -440,7 +407,7 @@ def test_detailed_stats_includes_rates(self, sample_results): assert metric1_stats["pass_rate"] == 50.0 assert metric1_stats["fail_rate"] == 50.0 - def test_detailed_stats_single_metric(self): + def test_detailed_stats_single_metric(self) -> None: """Test detailed stats with single metric.""" results = [ EvaluationResult( @@ -458,7 +425,7 @@ def test_detailed_stats_single_metric(self): assert len(stats["by_metric"]) == 1 assert "single_metric" in stats["by_metric"] - def test_calculate_detailed_stats_single_metric_single_conversation(self): + def test_calculate_detailed_stats_single_metric_single_conversation(self) -> None: """Test calculate_detailed_stats with single metric and conversation.""" results = [ EvaluationResult( @@ -499,7 +466,7 @@ def test_calculate_detailed_stats_single_metric_single_conversation(self): assert conv_stats["error"] == 0 assert conv_stats["pass_rate"] == 50.0 - def test_calculate_detailed_stats_multiple_metrics_conversations(self): + def test_calculate_detailed_stats_multiple_metrics_conversations(self) -> None: """Test calculate_detailed_stats with multiple metrics and conversations.""" results = [ EvaluationResult( @@ -579,7 +546,7 @@ def test_calculate_detailed_stats_multiple_metrics_conversations(self): assert conv2_stats["pass_rate"] == 50.0 assert conv2_stats["error_rate"] == 50.0 - def test_calculate_detailed_stats_score_statistics(self): + def test_calculate_detailed_stats_score_statistics(self) -> None: """Test calculate_detailed_stats includes score statistics.""" results = [ EvaluationResult( @@ -632,7 +599,7 @@ def test_calculate_detailed_stats_score_statistics(self): assert "high" in ci assert ci["confidence_level"] == 95 - def test_calculate_detailed_stats_no_scores(self): + def test_calculate_detailed_stats_no_scores(self) -> None: """Test calculate_detailed_stats with results that have no scores.""" results = [ EvaluationResult( @@ -658,7 +625,7 @@ def test_calculate_detailed_stats_no_scores(self): # Confidence interval should be None when no scores assert score_stats["confidence_interval"] is None - def test_calculate_detailed_stats_single_score_no_confidence_interval(self): + def test_calculate_detailed_stats_single_score_no_confidence_interval(self) -> None: """Test calculate_detailed_stats with single score has no CI (needs 2+).""" results = [ EvaluationResult( @@ -680,7 +647,7 @@ def test_calculate_detailed_stats_single_score_no_confidence_interval(self): # Confidence interval should be None for single score assert score_stats["confidence_interval"] is None - def test_calculate_detailed_stats_by_tag(self): + def test_calculate_detailed_stats_by_tag(self) -> None: """Test calculate_detailed_stats includes by_tag breakdown.""" results = [ 
EvaluationResult( @@ -738,7 +705,7 @@ def test_calculate_detailed_stats_by_tag(self): assert staging_stats["fail_rate"] == 100.0 assert "score_statistics" in staging_stats - def test_calculate_detailed_stats_default_tag(self): + def test_calculate_detailed_stats_default_tag(self) -> None: """Test calculate_detailed_stats with default 'eval' tag.""" results = [ EvaluationResult( @@ -761,14 +728,14 @@ def test_calculate_detailed_stats_default_tag(self): class TestCalculateApiTokenUsage: """Tests for calculate_api_token_usage function.""" - def test_calculate_api_token_usage_empty_data(self): + def test_calculate_api_token_usage_empty_data(self) -> None: """Test calculate_api_token_usage with empty data.""" result = calculate_api_token_usage([]) assert result["total_api_input_tokens"] == 0 assert result["total_api_output_tokens"] == 0 assert result["total_api_tokens"] == 0 - def test_calculate_api_token_usage_single_turn(self): + def test_calculate_api_token_usage_single_turn(self) -> None: """Test calculate_api_token_usage with single turn.""" turn = TurnData( turn_id="turn1", @@ -786,7 +753,7 @@ def test_calculate_api_token_usage_single_turn(self): assert result["total_api_output_tokens"] == 50 assert result["total_api_tokens"] == 150 - def test_calculate_api_token_usage_multiple_turns(self): + def test_calculate_api_token_usage_multiple_turns(self) -> None: """Test calculate_api_token_usage with multiple turns.""" turns = [ TurnData( @@ -813,7 +780,7 @@ def test_calculate_api_token_usage_multiple_turns(self): assert result["total_api_output_tokens"] == 125 assert result["total_api_tokens"] == 375 - def test_calculate_api_token_usage_multiple_conversations(self): + def test_calculate_api_token_usage_multiple_conversations(self) -> None: """Test calculate_api_token_usage with multiple conversations.""" eval_data1 = EvaluationData( conversation_group_id="conv1", @@ -844,7 +811,7 @@ def test_calculate_api_token_usage_multiple_conversations(self): assert result["total_api_output_tokens"] == 150 assert result["total_api_tokens"] == 450 - def test_calculate_api_token_usage_zero_tokens(self): + def test_calculate_api_token_usage_zero_tokens(self) -> None: """Test calculate_api_token_usage with zero token values.""" turn = TurnData( turn_id="turn1", @@ -866,7 +833,7 @@ def test_calculate_api_token_usage_zero_tokens(self): class TestCalculateBasicStatsWithTokens: """Tests for calculate_basic_stats token tracking fields.""" - def test_basic_stats_includes_token_fields(self): + def test_basic_stats_includes_token_fields(self) -> None: """Test that basic stats includes token fields.""" results = [ EvaluationResult( @@ -885,7 +852,7 @@ def test_basic_stats_includes_token_fields(self): assert "total_judge_llm_output_tokens" in stats assert "total_judge_llm_tokens" in stats - def test_basic_stats_sums_token_values(self): + def test_basic_stats_sums_token_values(self) -> None: """Test that basic stats correctly sums token values.""" results = [ EvaluationResult( @@ -914,7 +881,7 @@ def test_basic_stats_sums_token_values(self): assert stats["total_judge_llm_output_tokens"] == 150 assert stats["total_judge_llm_tokens"] == 450 - def test_basic_stats_zero_tokens_by_default(self): + def test_basic_stats_zero_tokens_by_default(self) -> None: """Test that results without tokens default to zero.""" results = [ EvaluationResult( @@ -931,7 +898,7 @@ def test_basic_stats_zero_tokens_by_default(self): assert stats["total_judge_llm_output_tokens"] == 0 assert stats["total_judge_llm_tokens"] == 0 - def 
test_basic_stats_empty_results_zero_tokens(self): + def test_basic_stats_empty_results_zero_tokens(self) -> None: """Test that empty results have zero tokens.""" stats = calculate_basic_stats([]) assert stats["total_judge_llm_input_tokens"] == 0 diff --git a/tests/unit/core/script/test_manager.py b/tests/unit/core/script/test_manager.py index f44c83e9..33b72350 100644 --- a/tests/unit/core/script/test_manager.py +++ b/tests/unit/core/script/test_manager.py @@ -2,6 +2,7 @@ import tempfile from pathlib import Path +import os import pytest @@ -12,7 +13,7 @@ class TestScriptExecutionManager: """Unit tests for ScriptExecutionManager.""" - def test_run_script_success(self): + def test_run_script_success(self) -> None: """Test running a successful script.""" # Create a simple script that exits successfully script_content = "#!/bin/bash\nexit 0\n" @@ -32,7 +33,7 @@ def test_run_script_success(self): finally: script_path.unlink() - def test_run_script_failure(self): + def test_run_script_failure(self) -> None: """Test running a script that fails.""" # Create a script that exits with error code script_content = "#!/bin/bash\nexit 1\n" @@ -51,14 +52,14 @@ def test_run_script_failure(self): finally: script_path.unlink() - def test_run_script_not_found(self): + def test_run_script_not_found(self) -> None: """Test running non-existent script raises error.""" manager = ScriptExecutionManager() with pytest.raises(ScriptExecutionError, match="not found"): manager.run_script("/nonexistent/script.sh") - def test_run_script_not_executable(self): + def test_run_script_not_executable(self) -> None: """Test running non-executable file raises error.""" script_content = "#!/bin/bash\nexit 0\n" @@ -77,7 +78,7 @@ def test_run_script_not_executable(self): finally: script_path.unlink() - def test_run_script_not_a_file(self): + def test_run_script_not_a_file(self) -> None: """Test running a directory raises error.""" with tempfile.TemporaryDirectory() as tmpdir: manager = ScriptExecutionManager() @@ -85,7 +86,7 @@ def test_run_script_not_a_file(self): with pytest.raises(ScriptExecutionError, match="not a file"): manager.run_script(tmpdir) - def test_run_script_with_output(self): + def test_run_script_with_output(self) -> None: """Test script with stdout output.""" script_content = '#!/bin/bash\necho "Test output"\nexit 0\n' @@ -103,7 +104,7 @@ def test_run_script_with_output(self): finally: script_path.unlink() - def test_run_script_with_stderr(self): + def test_run_script_with_stderr(self) -> None: """Test script with stderr output.""" script_content = '#!/bin/bash\necho "Error message" >&2\nexit 1\n' @@ -121,7 +122,7 @@ def test_run_script_with_stderr(self): finally: script_path.unlink() - def test_run_script_accepts_string_path(self): + def test_run_script_accepts_string_path(self) -> None: """Test that run_script accepts string path.""" script_content = "#!/bin/bash\nexit 0\n" @@ -139,7 +140,7 @@ def test_run_script_accepts_string_path(self): finally: Path(script_path).unlink() - def test_run_script_resolves_relative_path(self): + def test_run_script_resolves_relative_path(self) -> None: """Test that relative paths are resolved.""" script_content = "#!/bin/bash\nexit 0\n" @@ -149,8 +150,6 @@ def test_run_script_resolves_relative_path(self): script_path.chmod(0o755) # Use relative path - import os - original_cwd = os.getcwd() try: os.chdir(tmpdir) @@ -160,7 +159,7 @@ def test_run_script_resolves_relative_path(self): finally: os.chdir(original_cwd) - def test_run_script_timeout(self): + def 
test_run_script_timeout(self) -> None: """Test script timeout raises error.""" # Create a script that sleeps script_content = "#!/bin/bash\nsleep 10\nexit 0\n" diff --git a/tests/unit/core/script/test_manager_additional.py b/tests/unit/core/script/test_manager_additional.py index ce42bbe8..642e2e42 100644 --- a/tests/unit/core/script/test_manager_additional.py +++ b/tests/unit/core/script/test_manager_additional.py @@ -1,8 +1,10 @@ """Additional tests for script manager to increase coverage.""" +from pathlib import Path import subprocess - +import logging import pytest +from pytest_mock import MockFixture from lightspeed_evaluation.core.script.manager import ScriptExecutionManager from lightspeed_evaluation.core.system.exceptions import ScriptExecutionError @@ -11,7 +13,9 @@ class TestScriptExecutionManagerAdditional: """Additional tests for ScriptExecutionManager.""" - def test_run_script_timeout_error(self, tmp_path, mocker): + def test_run_script_timeout_error( + self, tmp_path: Path, mocker: MockFixture + ) -> None: """Test script execution with timeout.""" # Create a script file script = tmp_path / "test_script.sh" @@ -27,7 +31,9 @@ def test_run_script_timeout_error(self, tmp_path, mocker): with pytest.raises(ScriptExecutionError, match="timeout"): manager.run_script(script) - def test_run_script_subprocess_error(self, tmp_path, mocker): + def test_run_script_subprocess_error( + self, tmp_path: Path, mocker: MockFixture + ) -> None: """Test script execution with subprocess error.""" script = tmp_path / "test_script.sh" script.write_text("#!/bin/bash\necho 'test'\n") @@ -42,7 +48,9 @@ def test_run_script_subprocess_error(self, tmp_path, mocker): with pytest.raises(ScriptExecutionError, match="Error running script"): manager.run_script(script) - def test_run_script_unexpected_error(self, tmp_path, mocker): + def test_run_script_unexpected_error( + self, tmp_path: Path, mocker: MockFixture + ) -> None: """Test script execution with unexpected error.""" script = tmp_path / "test_script.sh" script.write_text("#!/bin/bash\necho 'test'\n") @@ -57,7 +65,9 @@ def test_run_script_unexpected_error(self, tmp_path, mocker): with pytest.raises(ScriptExecutionError, match="Unexpected error"): manager.run_script(script) - def test_run_script_with_path_object(self, tmp_path, mocker): + def test_run_script_with_path_object( + self, tmp_path: Path, mocker: MockFixture + ) -> None: """Test run_script accepts Path objects.""" script = tmp_path / "test_script.sh" script.write_text("#!/bin/bash\necho 'test'\n") @@ -73,7 +83,7 @@ def test_run_script_with_path_object(self, tmp_path, mocker): assert result is True - def test_script_not_file_error(self, tmp_path): + def test_script_not_file_error(self, tmp_path: Path) -> None: """Test error when script path is not a file.""" # Create a directory instead of file script_dir = tmp_path / "script_dir" @@ -84,9 +94,10 @@ def test_script_not_file_error(self, tmp_path): with pytest.raises(ScriptExecutionError, match="not a file"): manager.run_script(script_dir) - def test_script_output_logging(self, tmp_path, mocker, caplog): + def test_script_output_logging( + self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture + ) -> None: """Test that script output is logged.""" - import logging caplog.set_level(logging.DEBUG) @@ -108,9 +119,10 @@ def test_script_output_logging(self, tmp_path, mocker, caplog): # Check that output was logged assert "test output" in caplog.text or "completed successfully" in caplog.text - def 
test_script_stderr_logging_on_failure(self, tmp_path, mocker, caplog): + def test_script_stderr_logging_on_failure( + self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture + ) -> None: """Test that stderr is logged as error on failure.""" - import logging caplog.set_level(logging.ERROR) @@ -131,9 +143,10 @@ def test_script_stderr_logging_on_failure(self, tmp_path, mocker, caplog): assert result is False - def test_script_stderr_logging_on_success(self, tmp_path, mocker, caplog): + def test_script_stderr_logging_on_success( + self, tmp_path: Path, mocker: MockFixture, caplog: pytest.LogCaptureFixture + ) -> None: """Test that stderr is logged as debug on success.""" - import logging caplog.set_level(logging.DEBUG) diff --git a/tests/unit/core/system/test_env_validator.py b/tests/unit/core/system/test_env_validator.py index e28fa578..ff67a363 100644 --- a/tests/unit/core/system/test_env_validator.py +++ b/tests/unit/core/system/test_env_validator.py @@ -1,6 +1,7 @@ """Unit tests for environment validator.""" import pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.core.system.env_validator import ( validate_anthropic_env, @@ -19,21 +20,21 @@ class TestProviderValidators: """Tests for individual provider validators.""" - def test_validate_openai_env_success(self, mocker): + def test_validate_openai_env_success(self, mocker: MockerFixture) -> None: """Test OpenAI validation succeeds with API key.""" mocker.patch.dict("os.environ", {"OPENAI_API_KEY": "test_key"}) # Should not raise validate_openai_env() - def test_validate_openai_env_failure(self, mocker): + def test_validate_openai_env_failure(self, mocker: MockerFixture) -> None: """Test OpenAI validation fails without API key.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="OPENAI_API_KEY"): validate_openai_env() - def test_validate_azure_env_success(self, mocker): + def test_validate_azure_env_success(self, mocker: MockerFixture) -> None: """Test Azure validation succeeds with required vars.""" mocker.patch.dict( "os.environ", @@ -45,14 +46,14 @@ def test_validate_azure_env_success(self, mocker): validate_azure_env() - def test_validate_azure_env_failure(self, mocker): + def test_validate_azure_env_failure(self, mocker: MockerFixture) -> None: """Test Azure validation fails without required vars.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="Azure"): validate_azure_env() - def test_validate_watsonx_env_success(self, mocker): + def test_validate_watsonx_env_success(self, mocker: MockerFixture) -> None: """Test Watsonx validation succeeds with required vars.""" mocker.patch.dict( "os.environ", @@ -65,46 +66,50 @@ def test_validate_watsonx_env_success(self, mocker): validate_watsonx_env() - def test_validate_watsonx_env_failure(self, mocker): + def test_validate_watsonx_env_failure(self, mocker: MockerFixture) -> None: """Test Watsonx validation fails without required vars.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="Watsonx"): validate_watsonx_env() - def test_validate_anthropic_env_success(self, mocker): + def test_validate_anthropic_env_success(self, mocker: MockerFixture) -> None: """Test Anthropic validation succeeds with API key.""" mocker.patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test_key"}) validate_anthropic_env() - def test_validate_anthropic_env_failure(self, mocker): + def test_validate_anthropic_env_failure(self, mocker: MockerFixture) -> None: """Test 
Anthropic validation fails without API key.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="ANTHROPIC_API_KEY"): validate_anthropic_env() - def test_validate_gemini_env_with_google_api_key(self, mocker): + def test_validate_gemini_env_with_google_api_key( + self, mocker: MockerFixture + ) -> None: """Test Gemini validation succeeds with GOOGLE_API_KEY.""" mocker.patch.dict("os.environ", {"GOOGLE_API_KEY": "test_key"}) validate_gemini_env() - def test_validate_gemini_env_with_gemini_api_key(self, mocker): + def test_validate_gemini_env_with_gemini_api_key( + self, mocker: MockerFixture + ) -> None: """Test Gemini validation succeeds with GEMINI_API_KEY.""" mocker.patch.dict("os.environ", {"GEMINI_API_KEY": "test_key"}) validate_gemini_env() - def test_validate_gemini_env_failure(self, mocker): + def test_validate_gemini_env_failure(self, mocker: MockerFixture) -> None: """Test Gemini validation fails without API keys.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="GOOGLE_API_KEY or GEMINI_API_KEY"): validate_gemini_env() - def test_validate_vertex_env_success(self, mocker): + def test_validate_vertex_env_success(self, mocker: MockerFixture) -> None: """Test Vertex AI validation succeeds with credentials.""" mocker.patch.dict( "os.environ", {"GOOGLE_APPLICATION_CREDENTIALS": "/path/to/creds.json"} @@ -112,21 +117,23 @@ def test_validate_vertex_env_success(self, mocker): validate_vertex_env() - def test_validate_vertex_env_failure(self, mocker): + def test_validate_vertex_env_failure(self, mocker: MockerFixture) -> None: """Test Vertex AI validation fails without credentials.""" mocker.patch.dict("os.environ", {}, clear=True) with pytest.raises(LLMError, match="GOOGLE_APPLICATION_CREDENTIALS"): validate_vertex_env() - def test_validate_ollama_env_with_host(self, mocker): + def test_validate_ollama_env_with_host(self, mocker: MockerFixture) -> None: """Test Ollama validation with OLLAMA_HOST set.""" mocker.patch.dict("os.environ", {"OLLAMA_HOST": "http://localhost:11434"}) # Should not raise or print warning validate_ollama_env() - def test_validate_ollama_env_without_host(self, mocker, capsys): + def test_validate_ollama_env_without_host( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test Ollama validation without OLLAMA_HOST prints info.""" mocker.patch.dict("os.environ", {}, clear=True) @@ -135,7 +142,7 @@ def test_validate_ollama_env_without_host(self, mocker, capsys): captured = capsys.readouterr() assert "OLLAMA_HOST" in captured.out or "localhost" in captured.out - def test_validate_hosted_vllm_env_success(self, mocker): + def test_validate_hosted_vllm_env_success(self, mocker: MockerFixture) -> None: """Test hosted vLLM validation succeeds with required vars.""" mocker.patch.dict( "os.environ", @@ -147,7 +154,7 @@ def test_validate_hosted_vllm_env_success(self, mocker): validate_hosted_vllm_env() - def test_validate_hosted_vllm_env_failure(self, mocker): + def test_validate_hosted_vllm_env_failure(self, mocker: MockerFixture) -> None: """Test hosted vLLM validation fails without required vars.""" mocker.patch.dict("os.environ", {}, clear=True) @@ -158,13 +165,13 @@ def test_validate_hosted_vllm_env_failure(self, mocker): class TestValidateProviderEnv: """Tests for validate_provider_env dispatcher.""" - def test_validate_provider_openai(self, mocker): + def test_validate_provider_openai(self, mocker: MockerFixture) -> None: """Test provider validation dispatches to OpenAI 
validator.""" mocker.patch.dict("os.environ", {"OPENAI_API_KEY": "test"}) validate_provider_env("openai") - def test_validate_provider_azure(self, mocker): + def test_validate_provider_azure(self, mocker: MockerFixture) -> None: """Test provider validation dispatches to Azure validator.""" mocker.patch.dict( "os.environ", @@ -176,7 +183,7 @@ def test_validate_provider_azure(self, mocker): validate_provider_env("azure") - def test_validate_provider_watsonx(self, mocker): + def test_validate_provider_watsonx(self, mocker: MockerFixture) -> None: """Test provider validation dispatches to Watsonx validator.""" mocker.patch.dict( "os.environ", @@ -189,24 +196,24 @@ def test_validate_provider_watsonx(self, mocker): validate_provider_env("watsonx") - def test_validate_provider_unknown(self, mocker): + def test_validate_provider_unknown(self) -> None: """Test unknown provider doesn't raise error.""" # Unknown providers should be handled gracefully validate_provider_env("unknown_provider") - def test_validate_provider_anthropic(self, mocker): + def test_validate_provider_anthropic(self, mocker: MockerFixture) -> None: """Test provider validation for Anthropic.""" mocker.patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test"}) validate_provider_env("anthropic") - def test_validate_provider_gemini(self, mocker): + def test_validate_provider_gemini(self, mocker: MockerFixture) -> None: """Test provider validation for Gemini.""" mocker.patch.dict("os.environ", {"GOOGLE_API_KEY": "test"}) validate_provider_env("gemini") - def test_validate_provider_vertex(self, mocker): + def test_validate_provider_vertex(self, mocker: MockerFixture) -> None: """Test provider validation for Vertex AI.""" mocker.patch.dict( "os.environ", {"GOOGLE_APPLICATION_CREDENTIALS": "/path/to/creds"} @@ -214,13 +221,13 @@ def test_validate_provider_vertex(self, mocker): validate_provider_env("vertex") - def test_validate_provider_ollama(self, mocker): + def test_validate_provider_ollama(self, mocker: MockerFixture) -> None: """Test provider validation for Ollama.""" mocker.patch.dict("os.environ", {}) validate_provider_env("ollama") - def test_validate_provider_hosted_vllm(self, mocker): + def test_validate_provider_hosted_vllm(self, mocker: MockerFixture) -> None: """Test provider validation for hosted vLLM.""" mocker.patch.dict( "os.environ", diff --git a/tests/unit/core/system/test_lazy_import.py b/tests/unit/core/system/test_lazy_import.py index 522f9f29..7a26f702 100644 --- a/tests/unit/core/system/test_lazy_import.py +++ b/tests/unit/core/system/test_lazy_import.py @@ -8,7 +8,7 @@ class TestCreateLazyGetattr: """Tests for create_lazy_getattr function.""" - def test_lazy_import_success(self): + def test_lazy_import_success(self) -> None: """Test successful lazy import.""" lazy_imports = { "EvaluationResult": ( @@ -24,7 +24,7 @@ def test_lazy_import_success(self): assert result_class is not None assert result_class.__name__ == "EvaluationResult" - def test_lazy_import_unknown_attribute(self): + def test_lazy_import_unknown_attribute(self) -> None: """Test lazy import with unknown attribute.""" lazy_imports = { "KnownClass": ("lightspeed_evaluation.core.models", "EvaluationResult"), @@ -35,7 +35,7 @@ def test_lazy_import_unknown_attribute(self): with pytest.raises(AttributeError, match="has no attribute 'UnknownClass'"): __getattr__("UnknownClass") - def test_lazy_import_failed_import(self): + def test_lazy_import_failed_import(self) -> None: """Test lazy import with invalid module path.""" lazy_imports = { "FakeClass": 
("nonexistent.module", "FakeClass"), @@ -46,7 +46,7 @@ def test_lazy_import_failed_import(self): with pytest.raises(ImportError, match="Failed to import"): __getattr__("FakeClass") - def test_lazy_import_multiple_classes(self): + def test_lazy_import_multiple_classes(self) -> None: """Test lazy importing multiple classes.""" lazy_imports = { "EvaluationResult": ( @@ -64,9 +64,9 @@ def test_lazy_import_multiple_classes(self): assert result_class.__name__ == "EvaluationResult" assert config_class.__name__ == "SystemConfig" - def test_lazy_import_preserves_module_name_in_error(self): + def test_lazy_import_preserves_module_name_in_error(self) -> None: """Test that module name appears in error messages.""" - lazy_imports = {} + lazy_imports: dict[str, tuple[str, str]] = {} __getattr__ = create_lazy_getattr(lazy_imports, "my_custom_module") diff --git a/tests/unit/core/system/test_loader.py b/tests/unit/core/system/test_loader.py index 5d87ee19..5b452dd6 100644 --- a/tests/unit/core/system/test_loader.py +++ b/tests/unit/core/system/test_loader.py @@ -17,7 +17,7 @@ class TestPopulateMetricMappings: """Unit tests for populate_metric_mappings function.""" - def test_populate_metric_mappings_turn_level(self): + def test_populate_metric_mappings_turn_level(self) -> None: """Test populating turn-level metrics.""" config = SystemConfig() config.default_turn_metrics_metadata = { @@ -31,7 +31,7 @@ def test_populate_metric_mappings_turn_level(self): assert "ragas:faithfulness" in TURN_LEVEL_METRICS assert "custom:answer_correctness" in TURN_LEVEL_METRICS - def test_populate_metric_mappings_conversation_level(self): + def test_populate_metric_mappings_conversation_level(self) -> None: """Test populating conversation-level metrics.""" config = SystemConfig() config.default_turn_metrics_metadata = {} @@ -45,7 +45,7 @@ def test_populate_metric_mappings_conversation_level(self): assert "deepeval:conversation_completeness" in CONVERSATION_LEVEL_METRICS assert "deepeval:conversation_relevancy" in CONVERSATION_LEVEL_METRICS - def test_populate_metric_mappings_clears_previous(self): + def test_populate_metric_mappings_clears_previous(self) -> None: """Test that populate clears previous mappings.""" config1 = SystemConfig() config1.default_turn_metrics_metadata = {"metric1": {}} @@ -70,14 +70,14 @@ def test_populate_metric_mappings_clears_previous(self): class TestConfigLoader: """Unit tests for ConfigLoader.""" - def test_load_system_config_file_not_found(self): + def test_load_system_config_file_not_found(self) -> None: """Test loading non-existent config file raises error.""" loader = ConfigLoader() with pytest.raises(ValueError, match="file not found"): loader.load_system_config("/nonexistent/config.yaml") - def test_load_system_config_invalid_yaml(self): + def test_load_system_config_invalid_yaml(self) -> None: """Test loading invalid YAML raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("invalid: yaml: [[[") @@ -90,7 +90,7 @@ def test_load_system_config_invalid_yaml(self): finally: Path(temp_path).unlink() - def test_load_system_config_empty_file(self): + def test_load_system_config_empty_file(self) -> None: """Test loading empty YAML raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("") @@ -103,7 +103,7 @@ def test_load_system_config_empty_file(self): finally: Path(temp_path).unlink() - def test_load_system_config_not_dict(self): + def test_load_system_config_not_dict(self) -> None: """Test loading 
YAML with non-dict root raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("- item1\n- item2\n") @@ -116,7 +116,7 @@ def test_load_system_config_not_dict(self): finally: Path(temp_path).unlink() - def test_load_system_config_minimal_valid(self): + def test_load_system_config_minimal_valid(self) -> None: """Test loading minimal valid config.""" yaml_content = """ llm: @@ -148,7 +148,7 @@ def test_load_system_config_minimal_valid(self): finally: Path(temp_path).unlink() - def test_load_system_config_with_all_sections(self): + def test_load_system_config_with_all_sections(self) -> None: """Test loading config with all sections.""" yaml_content = """ core: @@ -215,7 +215,7 @@ def test_load_system_config_with_all_sections(self): finally: Path(temp_path).unlink() - def test_load_system_config_populates_metrics(self): + def test_load_system_config_populates_metrics(self) -> None: """Test that loading config populates global metric mappings.""" yaml_content = """ llm: @@ -260,7 +260,7 @@ def test_load_system_config_populates_metrics(self): finally: Path(temp_path).unlink() - def test_load_system_config_with_defaults(self): + def test_load_system_config_with_defaults(self) -> None: """Test that missing sections use defaults.""" yaml_content = """ llm: @@ -288,7 +288,7 @@ def test_load_system_config_with_defaults(self): finally: Path(temp_path).unlink() - def test_create_system_config_missing_metrics_metadata(self): + def test_create_system_config_missing_metrics_metadata(self) -> None: """Test creating config when metrics_metadata is missing.""" yaml_content = """ llm: @@ -305,12 +305,12 @@ def test_create_system_config_missing_metrics_metadata(self): config = loader.load_system_config(temp_path) # Should handle missing metrics_metadata gracefully - assert config.default_turn_metrics_metadata == {} - assert config.default_conversation_metrics_metadata == {} + assert not config.default_turn_metrics_metadata + assert not config.default_conversation_metrics_metadata finally: Path(temp_path).unlink() - def test_create_system_config_partial_metrics_metadata(self): + def test_create_system_config_partial_metrics_metadata(self) -> None: """Test creating config with partial metrics_metadata.""" yaml_content = """ llm: @@ -335,11 +335,11 @@ def test_create_system_config_partial_metrics_metadata(self): # Should handle missing conversation_level assert len(config.default_turn_metrics_metadata) > 0 - assert config.default_conversation_metrics_metadata == {} + assert not config.default_conversation_metrics_metadata finally: Path(temp_path).unlink() - def test_load_system_config_empty_sections(self): + def test_load_system_config_empty_sections(self) -> None: """Test loading config with empty sections.""" yaml_content = """ llm: diff --git a/tests/unit/core/system/test_setup.py b/tests/unit/core/system/test_setup.py index 5fa25789..c3e8dc0e 100644 --- a/tests/unit/core/system/test_setup.py +++ b/tests/unit/core/system/test_setup.py @@ -3,6 +3,9 @@ import logging import os +from pytest_mock import MockerFixture +from _pytest.capture import CaptureFixture + from lightspeed_evaluation.core.models import LoggingConfig from lightspeed_evaluation.core.system.setup import ( setup_environment_variables, @@ -13,9 +16,9 @@ class TestSetupEnvironmentVariables: """Tests for environment variable setup.""" - def test_setup_default_environment_variables(self, mocker): + def test_setup_default_environment_variables(self, mocker: MockerFixture) -> None: """Test setting up 
default environment variables.""" - config_data = {} + config_data: dict = {} # Use mocker to patch os.environ mocker.patch.dict(os.environ, {}, clear=True) @@ -34,7 +37,7 @@ def test_setup_default_environment_variables(self, mocker): assert os.environ["SSL_CERTIFI_BUNDLE"] == "/path/to/certifi/cacert.pem" mock_where.assert_called_once() - def test_setup_custom_environment_variables(self, mocker): + def test_setup_custom_environment_variables(self, mocker: MockerFixture) -> None: """Test setting up custom environment variables.""" config_data = { "environment": { @@ -58,7 +61,9 @@ def test_setup_custom_environment_variables(self, mocker): assert os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] == "YES" assert os.environ["SSL_CERTIFI_BUNDLE"] == "/path/to/certifi/cacert.pem" - def test_setup_environment_variables_override_defaults(self, mocker): + def test_setup_environment_variables_override_defaults( + self, mocker: MockerFixture + ) -> None: """Test overriding default environment variables.""" config_data = {"environment": {"LITELLM_LOG": "DEBUG"}} @@ -74,7 +79,9 @@ def test_setup_environment_variables_override_defaults(self, mocker): assert os.environ["LITELLM_LOG"] == "DEBUG" assert os.environ["SSL_CERTIFI_BUNDLE"] == "/path/to/certifi/cacert.pem" - def test_setup_environment_variables_handles_key_error(self, mocker, capsys): + def test_setup_environment_variables_handles_key_error( + self, mocker: MockerFixture, capsys: CaptureFixture + ) -> None: """Test handling of KeyError during environment setup.""" config_data = {"environment": None} # This will cause issues @@ -95,7 +102,9 @@ def test_setup_environment_variables_handles_key_error(self, mocker, capsys): captured = capsys.readouterr() assert "Warning" in captured.out or "fallback" in captured.out - def test_setup_environment_variables_handles_type_error(self, mocker, capsys): + def test_setup_environment_variables_handles_type_error( + self, mocker: MockerFixture + ) -> None: """Test handling of TypeError during environment setup.""" config_data = {"environment": "invalid_type"} @@ -111,7 +120,9 @@ def test_setup_environment_variables_handles_type_error(self, mocker, capsys): assert os.environ["RAGAS_DO_NOT_TRACK"] == "true" assert os.environ["SSL_CERTIFI_BUNDLE"] == "/path/to/certifi/cacert.pem" - def test_setup_ssl_certifi_bundle_set_when_ssl_cert_file_is_none(self, mocker): + def test_setup_ssl_certifi_bundle_set_when_ssl_cert_file_is_none( + self, mocker: MockerFixture + ) -> None: """Test SSL_CERTIFI_BUNDLE is still set even when ssl_cert_file is None.""" config_data = {"llm": {"ssl_verify": True, "ssl_cert_file": None}} mocker.patch.dict(os.environ, {}, clear=True) @@ -129,7 +140,7 @@ def test_setup_ssl_certifi_bundle_set_when_ssl_cert_file_is_none(self, mocker): class TestSetupLogging: """Tests for logging setup.""" - def test_setup_logging_basic(self): + def test_setup_logging_basic(self) -> None: """Test basic logging setup.""" logging_config = LoggingConfig( source_level="INFO", @@ -143,7 +154,7 @@ def test_setup_logging_basic(self): assert logger.name == "lightspeed_evaluation" assert logger.level == logging.INFO - def test_setup_logging_debug_level(self): + def test_setup_logging_debug_level(self) -> None: """Test logging setup with DEBUG level.""" logging_config = LoggingConfig( source_level="DEBUG", @@ -155,7 +166,7 @@ def test_setup_logging_debug_level(self): assert logger.level == logging.DEBUG - def test_setup_logging_with_package_overrides(self): + def test_setup_logging_with_package_overrides(self) -> None: """Test 
logging setup with package overrides.""" logging_config = LoggingConfig( source_level="INFO", @@ -179,7 +190,7 @@ def test_setup_logging_with_package_overrides(self): urllib3_logger = logging.getLogger("urllib3") assert urllib3_logger.level == logging.CRITICAL - def test_setup_logging_sets_default_noisy_packages(self): + def test_setup_logging_sets_default_noisy_packages(self) -> None: """Test that noisy packages get default levels set.""" logging_config = LoggingConfig( source_level="INFO", @@ -193,7 +204,9 @@ def test_setup_logging_sets_default_noisy_packages(self): matplotlib_logger = logging.getLogger("matplotlib") assert matplotlib_logger.level == logging.ERROR - def test_setup_logging_handles_invalid_override_level(self, capsys): + def test_setup_logging_handles_invalid_override_level( + self, capsys: CaptureFixture + ) -> None: """Test handling of invalid log level in overrides.""" logging_config = LoggingConfig( source_level="INFO", @@ -211,7 +224,7 @@ def test_setup_logging_handles_invalid_override_level(self, capsys): captured = capsys.readouterr() assert "Warning" in captured.out or "Invalid" in captured.out - def test_setup_logging_error_level(self): + def test_setup_logging_error_level(self) -> None: """Test logging setup with ERROR level.""" logging_config = LoggingConfig( source_level="ERROR", @@ -223,7 +236,7 @@ def test_setup_logging_error_level(self): assert logger.level == logging.ERROR - def test_setup_logging_custom_format(self): + def test_setup_logging_custom_format(self) -> None: """Test logging with custom format.""" custom_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" logging_config = LoggingConfig( @@ -238,7 +251,7 @@ def test_setup_logging_custom_format(self): # Format is applied to root logger, not easy to verify directly # but at least verify it doesn't crash - def test_setup_logging_warning_level(self): + def test_setup_logging_warning_level(self) -> None: """Test logging setup with WARNING level.""" logging_config = LoggingConfig( source_level="WARNING", @@ -250,7 +263,7 @@ def test_setup_logging_warning_level(self): assert logger.level == logging.WARNING - def test_setup_logging_applies_to_all_default_packages(self): + def test_setup_logging_applies_to_all_default_packages(self) -> None: """Test that all default noisy packages get configured.""" logging_config = LoggingConfig( source_level="INFO", diff --git a/tests/unit/core/system/test_ssl_certifi.py b/tests/unit/core/system/test_ssl_certifi.py index 93cbd6ea..d92bf73c 100644 --- a/tests/unit/core/system/test_ssl_certifi.py +++ b/tests/unit/core/system/test_ssl_certifi.py @@ -2,6 +2,8 @@ from pathlib import Path +from pytest_mock import MockerFixture + from lightspeed_evaluation.core.system.ssl_certifi import ( create_ssl_certifi_bundle, get_ssl_cert_files_paths_from_system_yaml, @@ -13,7 +15,7 @@ class TestGetSslCertFilesPathsFromSystemYaml: """Tests for extracting SSL cert paths from config data.""" - def test_extracts_cert_when_ssl_verify_true(self): + def test_extracts_cert_when_ssl_verify_true(self) -> None: """Test extracting SSL cert when ssl_verify is True.""" config = { "ssl_verify": True, @@ -24,7 +26,7 @@ def test_extracts_cert_when_ssl_verify_true(self): assert result == ["/path/to/cert.pem"] - def test_ignores_cert_when_ssl_verify_false(self): + def test_ignores_cert_when_ssl_verify_false(self) -> None: """Test that ssl_cert_file is ignored when ssl_verify is False.""" config = { "ssl_verify": False, @@ -33,9 +35,9 @@ def test_ignores_cert_when_ssl_verify_false(self): result 
= get_ssl_cert_files_paths_from_system_yaml(config) - assert result == [] + assert not result - def test_nested_configs(self): + def test_nested_configs(self) -> None: """Test extracting SSL certs from nested configuration.""" config = { "service_a": { @@ -67,7 +69,7 @@ def test_nested_configs(self): class TestGetSystemSslCertFile: """Tests for getting system SSL cert file from environment.""" - def test_returns_cert_file_when_env_set(self, mocker): + def test_returns_cert_file_when_env_set(self, mocker: MockerFixture) -> None: """Test when SSL_CERT_FILE environment variable is set.""" mocker.patch.dict("os.environ", {"SSL_CERT_FILE": "/system/cert.pem"}) @@ -75,19 +77,19 @@ def test_returns_cert_file_when_env_set(self, mocker): assert result == ["/system/cert.pem"] - def test_returns_empty_when_env_not_set(self, mocker): + def test_returns_empty_when_env_not_set(self, mocker: MockerFixture) -> None: """Test when SSL_CERT_FILE environment variable is not set.""" mocker.patch.dict("os.environ", {}, clear=True) result = get_system_ssl_cert_file() - assert result == [] + assert not result class TestGetUniqueSslCertPaths: """Tests for getting unique SSL certificate paths.""" - def test_returns_unique_paths(self): + def test_returns_unique_paths(self) -> None: """Test that duplicate paths are removed.""" cert_paths = [ "/path/to/cert_a.pem", @@ -99,17 +101,19 @@ def test_returns_unique_paths(self): assert set(result) == {"/path/to/cert_a.pem", "/path/to/cert_b.pem"} - def test_returns_empty_list_when_no_paths(self): + def test_returns_empty_list_when_no_paths(self) -> None: """Test that an empty list is returned when no paths are provided.""" result = _get_unique_ssl_cert_paths([]) - assert result == [] + assert not result class TestCreateSslCertifiBundle: """Tests for creating combined SSL certificate bundle.""" - def test_returns_certifi_bundle_when_no_custom_certs(self, mocker): + def test_returns_certifi_bundle_when_no_custom_certs( + self, mocker: MockerFixture + ) -> None: """Test that certifi bundle is returned when no custom certs exist.""" mocker.patch.dict("os.environ", {}, clear=True) @@ -122,7 +126,9 @@ def test_returns_certifi_bundle_when_no_custom_certs(self, mocker): assert result == "/path/to/certifi/cacert.pem" - def test_combines_certifi_with_custom_cert(self, mocker, tmp_path): + def test_combines_certifi_with_custom_cert( + self, mocker: MockerFixture, tmp_path: Path + ) -> None: """Test combining certifi bundle with custom cert from config.""" mocker.patch.dict("os.environ", {}, clear=True) @@ -143,12 +149,14 @@ def test_combines_certifi_with_custom_cert(self, mocker, tmp_path): mock_where.return_value = str(certifi_bundle) result = create_ssl_certifi_bundle(config) - content = Path(result).read_text() + content = Path(result).read_text(encoding="utf-8") assert "CERTIFI BUNDLE" in content assert "CUSTOM CERT" in content - def test_combines_config_and_env_certs(self, mocker, tmp_path): + def test_combines_config_and_env_certs( + self, mocker: MockerFixture, tmp_path: Path + ) -> None: """Test combining certs from both config and environment.""" certifi_bundle = tmp_path / "certifi.pem" certifi_bundle.write_text("CERTIFI BUNDLE\n") @@ -172,13 +180,15 @@ def test_combines_config_and_env_certs(self, mocker, tmp_path): mock_where.return_value = str(certifi_bundle) result = create_ssl_certifi_bundle(config) - content = Path(result).read_text() + content = Path(result).read_text(encoding="utf-8") assert "CERTIFI BUNDLE" in content assert "CONFIG CERT" in content assert "ENV CERT" 
in content - def test_registers_atexit_cleanup(self, mocker, tmp_path): + def test_registers_atexit_cleanup( + self, mocker: MockerFixture, tmp_path: Path + ) -> None: """Test that atexit cleanup is registered for temp bundle.""" mocker.patch.dict("os.environ", {}) diff --git a/tests/unit/core/system/test_validator.py b/tests/unit/core/system/test_validator.py index b39c4ba9..95799a6d 100644 --- a/tests/unit/core/system/test_validator.py +++ b/tests/unit/core/system/test_validator.py @@ -4,6 +4,9 @@ from pathlib import Path import pytest +from pytest_mock import MockerFixture + +from pydantic import ValidationError from lightspeed_evaluation.core.models import EvaluationData, TurnData from lightspeed_evaluation.core.system.exceptions import DataValidationError @@ -11,13 +14,12 @@ DataValidator, format_pydantic_error, ) -from pydantic import ValidationError class TestFormatPydanticError: """Unit tests for format_pydantic_error helper function.""" - def test_format_single_error(self): + def test_format_single_error(self) -> None: """Test formatting a single Pydantic validation error.""" try: TurnData(turn_id="1", query="", response="Valid") @@ -26,7 +28,7 @@ def test_format_single_error(self): assert "query" in formatted assert "at least 1 character" in formatted - def test_format_multiple_errors(self): + def test_format_multiple_errors(self) -> None: """Test formatting multiple validation errors.""" try: TurnData(turn_id="", query="", response="") @@ -40,7 +42,7 @@ def test_format_multiple_errors(self): class TestDataValidator: """Unit tests for DataValidator.""" - def test_validate_evaluation_data_valid(self): + def test_validate_evaluation_data_valid(self) -> None: """Test validation passes with valid data.""" validator = DataValidator(api_enabled=False) @@ -53,12 +55,18 @@ def test_validate_evaluation_data_valid(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is True assert len(validator.validation_errors) == 0 - def test_validate_metrics_availability_unknown_turn_metric(self, mocker): + def test_validate_metrics_availability_unknown_turn_metric( + self, mocker: MockerFixture + ) -> None: """Test validation fails for unknown turn metric.""" # Mock the global metrics sets mocker.patch( @@ -76,7 +84,11 @@ def test_validate_metrics_availability_unknown_turn_metric(self, mocker): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert len(validator.validation_errors) > 0 @@ -84,7 +96,9 @@ def test_validate_metrics_availability_unknown_turn_metric(self, mocker): "Unknown turn metric" in error for error in validator.validation_errors ) - def test_validate_metrics_availability_unknown_conversation_metric(self, mocker): + def test_validate_metrics_availability_unknown_conversation_metric( + self, mocker: MockerFixture + ) -> None: """Test validation fails for unknown conversation metric.""" mocker.patch( "lightspeed_evaluation.core.system.validator.CONVERSATION_LEVEL_METRICS", @@ -100,7 +114,11 @@ def test_validate_metrics_availability_unknown_conversation_metric(self, mocker) conversation_metrics=["unknown:conversation_metric"], ) - result = 
validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any( @@ -108,7 +126,7 @@ def test_validate_metrics_availability_unknown_conversation_metric(self, mocker) for error in validator.validation_errors ) - def test_validate_metric_requirements_missing_response(self): + def test_validate_metric_requirements_missing_response(self) -> None: """Test validation fails when required response field is missing.""" validator = DataValidator(api_enabled=False) @@ -120,12 +138,16 @@ def test_validate_metric_requirements_missing_response(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any("response" in error.lower() for error in validator.validation_errors) - def test_validate_metric_requirements_missing_contexts(self): + def test_validate_metric_requirements_missing_contexts(self) -> None: """Test validation fails when required contexts are missing.""" validator = DataValidator(api_enabled=False) @@ -138,14 +160,18 @@ def test_validate_metric_requirements_missing_contexts(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any("contexts" in error.lower() for error in validator.validation_errors) def test_validate_metric_requirements_api_enabled_allows_missing_response( - self, mocker - ): + self, mocker: MockerFixture + ) -> None: """Test that missing response is allowed when API is enabled.""" # Mock the global metrics sets mocker.patch( @@ -163,12 +189,16 @@ def test_validate_metric_requirements_api_enabled_allows_missing_response( ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) # Should pass because API will populate response assert result is True - def test_validate_metric_requirements_expected_response_missing(self): + def test_validate_metric_requirements_expected_response_missing(self) -> None: """Test validation fails when expected_response is required but missing.""" validator = DataValidator(api_enabled=False) @@ -182,7 +212,11 @@ def test_validate_metric_requirements_expected_response_missing(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any( @@ -190,7 +224,7 @@ def test_validate_metric_requirements_expected_response_missing(self): for error in validator.validation_errors ) - def test_validate_metric_requirements_tool_eval_missing_fields(self): + def test_validate_metric_requirements_tool_eval_missing_fields(self) -> None: """Test validation fails when tool_eval required fields are missing.""" validator = DataValidator(api_enabled=False) @@ -204,14 +238,20 @@ def test_validate_metric_requirements_tool_eval_missing_fields(self): ) conv_data = 
EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any( "tool_calls" in error.lower() for error in validator.validation_errors ) - def test_validate_metric_requirements_skip_script_when_api_disabled(self, mocker): + def test_validate_metric_requirements_skip_script_when_api_disabled( + self, mocker: MockerFixture + ) -> None: """Test script metrics validation is skipped when API is disabled.""" # Mock the global metrics sets mocker.patch( @@ -231,19 +271,23 @@ def test_validate_metric_requirements_skip_script_when_api_disabled(self, mocker conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) # Should not validate script requirements when API disabled - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) # Should pass because script validation is skipped assert result is True - def test_load_evaluation_data_file_not_found(self): + def test_load_evaluation_data_file_not_found(self) -> None: """Test loading non-existent file raises error.""" validator = DataValidator() with pytest.raises(DataValidationError, match="file not found"): validator.load_evaluation_data("/nonexistent/file.yaml") - def test_load_evaluation_data_invalid_yaml(self): + def test_load_evaluation_data_invalid_yaml(self) -> None: """Test loading invalid YAML raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("invalid: yaml: content: [") @@ -256,7 +300,7 @@ def test_load_evaluation_data_invalid_yaml(self): finally: Path(temp_path).unlink() - def test_load_evaluation_data_empty_file(self): + def test_load_evaluation_data_empty_file(self) -> None: """Test loading empty YAML file raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("") @@ -269,7 +313,7 @@ def test_load_evaluation_data_empty_file(self): finally: Path(temp_path).unlink() - def test_load_evaluation_data_not_list(self): + def test_load_evaluation_data_not_list(self) -> None: """Test loading YAML with non-list root raises error.""" with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write("conversation_group_id: test\n") @@ -282,7 +326,7 @@ def test_load_evaluation_data_not_list(self): finally: Path(temp_path).unlink() - def test_load_evaluation_data_valid(self, mocker): + def test_load_evaluation_data_valid(self, mocker: MockerFixture) -> None: """Test loading valid evaluation data file.""" yaml_content = """ - conversation_group_id: test_conv @@ -290,7 +334,7 @@ def test_load_evaluation_data_valid(self, mocker): - turn_id: "1" query: "What is Python?" response: "Python is a programming language." 
-""" + """ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: f.write(yaml_content) @@ -316,7 +360,7 @@ def test_load_evaluation_data_valid(self, mocker): finally: Path(temp_path).unlink() - def test_check_metric_requirements_missing_contexts(self): + def test_check_metric_requirements_missing_contexts(self) -> None: """Test validation fails for missing contexts when required.""" validator = DataValidator(api_enabled=False) @@ -329,12 +373,16 @@ def test_check_metric_requirements_missing_contexts(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False assert any("contexts" in error.lower() for error in validator.validation_errors) - def test_check_metric_requirements_whitespace_only_string(self): + def test_check_metric_requirements_whitespace_only_string(self) -> None: """Test validation fails for whitespace-only required string.""" validator = DataValidator(api_enabled=False) @@ -346,11 +394,15 @@ def test_check_metric_requirements_whitespace_only_string(self): ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = validator._validate_evaluation_data([conv_data]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv_data] + ) + ) assert result is False - def test_validate_multiple_conversations(self): + def test_validate_multiple_conversations(self) -> None: """Test validating multiple conversations.""" validator = DataValidator(api_enabled=False) @@ -360,11 +412,17 @@ def test_validate_multiple_conversations(self): conv1 = EvaluationData(conversation_group_id="conv1", turns=[turn1]) conv2 = EvaluationData(conversation_group_id="conv2", turns=[turn2]) - result = validator._validate_evaluation_data([conv1, conv2]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv1, conv2] + ) + ) assert result is True - def test_validate_evaluation_data_accumulates_errors(self, mocker): + def test_validate_evaluation_data_accumulates_errors( + self, mocker: MockerFixture + ) -> None: """Test that validation accumulates multiple errors.""" mocker.patch( "lightspeed_evaluation.core.system.validator.TURN_LEVEL_METRICS", @@ -389,7 +447,11 @@ def test_validate_evaluation_data_accumulates_errors(self, mocker): conv = EvaluationData(conversation_group_id="test", turns=[turn1, turn2]) - result = validator._validate_evaluation_data([conv]) + result = ( + validator._validate_evaluation_data( # pylint: disable=protected-access + [conv] + ) + ) assert result is False # Should have errors for both issues @@ -399,7 +461,7 @@ def test_validate_evaluation_data_accumulates_errors(self, mocker): class TestFilterByScope: """Unit test for filter by scope.""" - def test_filter_by_scope_no_filter(self): + def test_filter_by_scope_no_filter(self) -> None: """Test no filtering when both tags and conv_ids are None.""" validator = DataValidator() data = [ @@ -412,10 +474,10 @@ def test_filter_by_scope_no_filter(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data) + result = validator._filter_by_scope(data) # pylint: disable=protected-access assert len(result) == 2 - def test_filter_by_scope_tags_only(self): + def test_filter_by_scope_tags_only(self) -> None: """Test filtering by tags only.""" validator = 
DataValidator() data = [ @@ -435,11 +497,13 @@ def test_filter_by_scope_tags_only(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data, tags=["basic"]) + result = validator._filter_by_scope( # pylint: disable=protected-access + data, tags=["basic"] + ) assert len(result) == 2 assert all(c.tag == "basic" for c in result) - def test_filter_by_scope_conv_ids_only(self): + def test_filter_by_scope_conv_ids_only(self) -> None: """Test filtering by conversation IDs only.""" validator = DataValidator() data = [ @@ -456,11 +520,13 @@ def test_filter_by_scope_conv_ids_only(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data, conv_ids=["conv_1", "conv_3"]) + result = validator._filter_by_scope( # pylint: disable=protected-access + data, conv_ids=["conv_1", "conv_3"] + ) assert len(result) == 2 assert {c.conversation_group_id for c in result} == {"conv_1", "conv_3"} - def test_filter_by_scope_tags_and_conv_ids(self): + def test_filter_by_scope_tags_and_conv_ids(self) -> None: """Test filtering by both tags and conv_ids uses OR logic.""" validator = DataValidator() data = [ @@ -480,10 +546,12 @@ def test_filter_by_scope_tags_and_conv_ids(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data, tags=["basic"], conv_ids=["conv_3"]) + result = validator._filter_by_scope( # pylint: disable=protected-access + data, tags=["basic"], conv_ids=["conv_3"] + ) assert len(result) == 2 # conv_1 (basic tag) + conv_3 (by ID) - def test_filter_by_scope_no_match_returns_empty(self): + def test_filter_by_scope_no_match_returns_empty(self) -> None: """Test filtering with no matching criteria returns empty list.""" validator = DataValidator() data = [ @@ -493,5 +561,7 @@ def test_filter_by_scope_no_match_returns_empty(self): turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data, tags=["nonexistent"]) + result = validator._filter_by_scope( # pylint: disable=protected-access + data, tags=["nonexistent"] + ) assert len(result) == 0 diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py new file mode 100644 index 00000000..a1131cd5 --- /dev/null +++ b/tests/unit/pipeline/evaluation/conftest.py @@ -0,0 +1,223 @@ +"""Pytest configuration and fixtures for evaluation tests.""" + +import pytest +from pytest_mock import MockerFixture + +from lightspeed_evaluation.core.models import ( + EvaluationData, + SystemConfig, + TurnData, +) +from lightspeed_evaluation.core.system.loader import ConfigLoader +from lightspeed_evaluation.core.metrics.manager import MetricManager +from lightspeed_evaluation.core.script import ScriptExecutionManager +from lightspeed_evaluation.core.models import EvaluationResult, EvaluationRequest +from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender +from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler +from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator +from lightspeed_evaluation.pipeline.evaluation.processor import ( + ProcessorComponents, + ConversationProcessor, +) + + +@pytest.fixture +def config_loader(mocker: MockerFixture) -> ConfigLoader: + """Create a mock config loader with system config.""" + loader = mocker.Mock(spec=ConfigLoader) + + config = SystemConfig() + config.default_turn_metrics_metadata = { + "ragas:faithfulness": {"threshold": 0.7, 
"default": True}, + "custom:answer_correctness": {"threshold": 0.8, "default": False}, + } + config.default_conversation_metrics_metadata = { + "deepeval:conversation_completeness": {"threshold": 0.6, "default": True}, + } + config.api.enabled = True + + loader.system_config = config + return loader + + +@pytest.fixture +def mock_metric_manager(mocker: MockerFixture) -> MetricManager: + """Create a mock metric manager.""" + manager = mocker.Mock(spec=MetricManager) + + def get_threshold( + metric_id: str, + _level: str, + _conv_data: EvaluationData | None = None, + _turn_data: TurnData | None = None, + ) -> float: + thresholds = { + "ragas:faithfulness": 0.7, + "custom:answer_correctness": 0.8, + "deepeval:conversation_completeness": 0.6, + } + return thresholds.get(metric_id, 0.5) + + manager.get_effective_threshold.side_effect = get_threshold + # Mock get_metric_metadata to return None (no metadata) to support iteration + # in _extract_metadata_for_csv + manager.get_metric_metadata.return_value = None + return manager + + +@pytest.fixture +def mock_script_manager(mocker: MockerFixture) -> ScriptExecutionManager: + """Create a mock script execution manager.""" + manager = mocker.Mock(spec=ScriptExecutionManager) + return manager + + +@pytest.fixture +def mock_config_loader(mocker: MockerFixture) -> ConfigLoader: + """Create a mock config loader with system config.""" + loader = mocker.Mock(spec=ConfigLoader) + + config = SystemConfig() + config.api.enabled = False + config.output.output_dir = "/tmp/test_output" + config.output.base_filename = "test" + config.core.max_threads = 2 + + loader.system_config = config + return loader + + +@pytest.fixture +def sample_evaluation_data() -> list[EvaluationData]: + """Create sample evaluation data.""" + turn1 = TurnData( + turn_id="turn1", + query="What is Python?", + response="Python is a programming language.", + contexts=["Python context"], + turn_metrics=["ragas:faithfulness"], + ) + conv_data = EvaluationData( + conversation_group_id="conv1", + turns=[turn1], + ) + return [conv_data] + + +@pytest.fixture +def processor_components(mocker: MockerFixture) -> ProcessorComponents: + """Create processor components.""" + metrics_evaluator = mocker.Mock(spec=MetricsEvaluator) + api_amender = mocker.Mock(spec=APIDataAmender) + error_handler = mocker.Mock(spec=EvaluationErrorHandler) + metric_manager = mocker.Mock(spec=MetricManager) + script_manager = mocker.Mock(spec=ScriptExecutionManager) + + # Default behavior for metric resolution + metric_manager.resolve_metrics.return_value = ["ragas:faithfulness"] + + return ProcessorComponents( + metrics_evaluator=metrics_evaluator, + api_amender=api_amender, + error_handler=error_handler, + metric_manager=metric_manager, + script_manager=script_manager, + ) + + +@pytest.fixture +def sample_conv_data() -> EvaluationData: + """Create sample conversation data.""" + turn1 = TurnData( + turn_id="turn1", + query="What is Python?", + response="Python is a programming language.", + contexts=["Context"], + turn_metrics=["ragas:faithfulness"], + ) + return EvaluationData( + conversation_group_id="conv1", + turns=[turn1], + ) + + +@pytest.fixture +def mock_metrics_evaluator(mocker: MockerFixture) -> MetricsEvaluator: + """Create a mock metrics evaluator.""" + evaluator = mocker.Mock(spec=MetricsEvaluator) + + def evaluate_metric(request: EvaluationRequest) -> EvaluationResult: + """Mock evaluate_metric that returns a result based on metric.""" + return EvaluationResult( + 
+            conversation_group_id=request.conv_data.conversation_group_id,
+            turn_id=request.turn_id,
+            metric_identifier=request.metric_identifier,
+            result="PASS",
+            score=0.85,
+            reason="Test evaluation",
+            threshold=0.7,
+        )
+
+    evaluator.evaluate_metric.side_effect = evaluate_metric
+    return evaluator
+
+
+@pytest.fixture
+def mock_api_amender(mocker: MockerFixture) -> APIDataAmender:
+    """Create a mock API data amender."""
+    amender = mocker.Mock(spec=APIDataAmender)
+    return amender
+
+
+@pytest.fixture
+def mock_error_handler(mocker: MockerFixture) -> EvaluationErrorHandler:
+    """Create a mock error handler."""
+    handler = mocker.Mock(spec=EvaluationErrorHandler)
+
+    # Configure create_error_result to return a proper EvaluationResult
+    def create_error_result_side_effect(
+        conv_id: str,
+        metric_id: str,
+        reason: str,
+        *,
+        turn_id: str | None = None,
+        query: str = "",
+    ) -> EvaluationResult:
+        return EvaluationResult(
+            conversation_group_id=conv_id,
+            turn_id=turn_id,
+            metric_identifier=metric_id,
+            result="ERROR",
+            reason=reason,
+            query=query,
+        )
+
+    handler.create_error_result.side_effect = create_error_result_side_effect
+    return handler
+
+
+@pytest.fixture
+def processor_components_pr(
+    mock_metrics_evaluator: MetricsEvaluator,  # pylint: disable=redefined-outer-name
+    mock_api_amender: APIDataAmender,  # pylint: disable=redefined-outer-name
+    mock_error_handler: EvaluationErrorHandler,  # pylint: disable=redefined-outer-name
+    mock_metric_manager: MetricManager,  # pylint: disable=redefined-outer-name
+    mock_script_manager: ScriptExecutionManager,  # pylint: disable=redefined-outer-name
+) -> ProcessorComponents:
+    """Create processor components fixture for PR tests."""
+    return ProcessorComponents(
+        metrics_evaluator=mock_metrics_evaluator,
+        api_amender=mock_api_amender,
+        error_handler=mock_error_handler,
+        metric_manager=mock_metric_manager,
+        script_manager=mock_script_manager,
+    )
+
+
+@pytest.fixture
+def processor(
+    config_loader: ConfigLoader,  # pylint: disable=redefined-outer-name
+    processor_components_pr: ProcessorComponents,  # pylint: disable=redefined-outer-name
+) -> ConversationProcessor:
+    """Create ConversationProcessor instance for PR tests."""
+    return ConversationProcessor(config_loader, processor_components_pr)
diff --git a/tests/unit/pipeline/evaluation/test_amender.py b/tests/unit/pipeline/evaluation/test_amender.py
index 39bbd77d..bc2168df 100644
--- a/tests/unit/pipeline/evaluation/test_amender.py
+++ b/tests/unit/pipeline/evaluation/test_amender.py
@@ -1,5 +1,7 @@
 """Unit tests for pipeline evaluation amender module."""
 
+from pytest_mock import MockerFixture
+
 from lightspeed_evaluation.core.models import APIResponse, TurnData
 from lightspeed_evaluation.core.system.exceptions import APIError
 from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender
@@ -8,7 +10,7 @@
 class TestAPIDataAmender:
     """Unit tests for APIDataAmender."""
 
-    def test_amend_single_turn_no_client(self):
+    def test_amend_single_turn_no_client(self) -> None:
         """Test amendment returns None when no API client is available."""
         amender = APIDataAmender(None)
 
@@ -20,7 +22,7 @@ def test_amend_single_turn_no_client(self):
         assert conversation_id is None
         assert turn.response is None  # Not modified
 
-    def test_amend_single_turn_success(self, mocker):
+    def test_amend_single_turn_success(self, mocker: MockerFixture) -> None:
         """Test amending single turn data successfully."""
         mock_client = mocker.Mock()
         api_response = APIResponse(
@@ -51,7 +53,9 @@ def
test_amend_single_turn_success(self, mocker): assert turn.conversation_id == "conv_123" assert turn.contexts == ["Context 1", "Context 2"] - def test_amend_single_turn_with_conversation_id(self, mocker): + def test_amend_single_turn_with_conversation_id( + self, mocker: MockerFixture + ) -> None: """Test amending turn with existing conversation ID.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -82,7 +86,7 @@ def test_amend_single_turn_with_conversation_id(self, mocker): assert turn.conversation_id == "conv_123" assert turn.contexts == ["Context 3"] - def test_amend_single_turn_with_tool_calls(self, mocker): + def test_amend_single_turn_with_tool_calls(self, mocker: MockerFixture) -> None: """Test amending turn data with tool calls.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -107,7 +111,7 @@ def test_amend_single_turn_with_tool_calls(self, mocker): assert turn.response == "Tool response" assert turn.tool_calls == [[{"tool": "test_tool", "args": {"param": "value"}}]] - def test_amend_single_turn_with_attachments(self, mocker): + def test_amend_single_turn_with_attachments(self, mocker: MockerFixture) -> None: """Test amending turn data with attachments.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -144,7 +148,7 @@ def test_amend_single_turn_with_attachments(self, mocker): assert turn.response == "Attachment response" assert turn.contexts == ["Attachment context"] - def test_amend_single_turn_api_error(self, mocker): + def test_amend_single_turn_api_error(self, mocker: MockerFixture) -> None: """Test handling API error during turn amendment.""" mock_client = mocker.Mock() mock_client.query.side_effect = APIError("Connection failed") @@ -163,7 +167,9 @@ def test_amend_single_turn_api_error(self, mocker): assert turn.response is None assert turn.conversation_id is None - def test_amend_single_turn_no_contexts_in_response(self, mocker): + def test_amend_single_turn_no_contexts_in_response( + self, mocker: MockerFixture + ) -> None: """Test amending turn when API response has no contexts.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -184,11 +190,14 @@ def test_amend_single_turn_no_contexts_in_response(self, mocker): assert error_msg is None assert conversation_id == "conv_no_ctx" - # Turn data should be amended (contexts should remain None since API response has empty contexts) + # Turn data should be amended (contexts should remain None since API response + # has empty contexts) assert turn.response == "No context response" assert turn.contexts is None - def test_amend_single_turn_no_tool_calls_in_response(self, mocker): + def test_amend_single_turn_no_tool_calls_in_response( + self, mocker: MockerFixture + ) -> None: """Test amending turn when API response has no tool calls.""" mock_client = mocker.Mock() api_response = APIResponse( @@ -209,6 +218,7 @@ def test_amend_single_turn_no_tool_calls_in_response(self, mocker): assert error_msg is None assert conversation_id == "conv_no_tools" - # Turn data should be amended (tool_calls should remain None since API response has empty tool_calls) + # Turn data should be amended (tool_calls should remain None since API response + # has empty tool_calls) assert turn.response == "No tools response" assert turn.tool_calls is None diff --git a/tests/unit/pipeline/evaluation/test_errors.py b/tests/unit/pipeline/evaluation/test_errors.py index 011c21b3..b8477c08 100644 --- a/tests/unit/pipeline/evaluation/test_errors.py +++ b/tests/unit/pipeline/evaluation/test_errors.py @@ -7,7 +7,7 @@ class 
TestEvaluationErrorHandler: """Unit tests for EvaluationErrorHandler.""" - def test_mark_all_metrics_as_error_with_turn_metrics(self): + def test_mark_all_metrics_as_error_with_turn_metrics(self) -> None: """Test marking all metrics as error with turn metrics.""" handler = EvaluationErrorHandler() @@ -21,7 +21,7 @@ def test_mark_all_metrics_as_error_with_turn_metrics(self): ["ragas:faithfulness", "custom:answer_correctness"], ["ragas:response_relevancy"], ] - resolved_conversation_metrics = [] + resolved_conversation_metrics: list = [] results = handler.mark_all_metrics_as_error( conv_data, @@ -51,14 +51,14 @@ def test_mark_all_metrics_as_error_with_turn_metrics(self): assert results[2].metric_identifier == "ragas:response_relevancy" assert results[2].query == "Query 2" - def test_mark_all_metrics_as_error_with_conversation_metrics(self): + def test_mark_all_metrics_as_error_with_conversation_metrics(self) -> None: """Test marking conversation-level metrics as error.""" handler = EvaluationErrorHandler() turn = TurnData(turn_id="1", query="Query", response="Response") conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - resolved_turn_metrics = [[]] + resolved_turn_metrics: list[list[str]] = [[]] resolved_conversation_metrics = [ "deepeval:conversation_completeness", "deepeval:conversation_relevancy", @@ -83,7 +83,7 @@ def test_mark_all_metrics_as_error_with_conversation_metrics(self): assert results[1].turn_id is None assert results[1].metric_identifier == "deepeval:conversation_relevancy" - def test_mark_all_metrics_as_error_mixed(self): + def test_mark_all_metrics_as_error_mixed(self) -> None: """Test marking both turn and conversation metrics as error.""" handler = EvaluationErrorHandler() @@ -111,15 +111,15 @@ def test_mark_all_metrics_as_error_mixed(self): assert results[1].turn_id is None assert results[1].metric_identifier == "deepeval:conversation_completeness" - def test_mark_all_metrics_as_error_empty_metrics(self): + def test_mark_all_metrics_as_error_empty_metrics(self) -> None: """Test marking with no metrics to mark.""" handler = EvaluationErrorHandler() turn = TurnData(turn_id="1", query="Query", response="Response") conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - resolved_turn_metrics = [[]] - resolved_conversation_metrics = [] + resolved_turn_metrics: list[list[str]] = [[]] + resolved_conversation_metrics: list[str] = [] results = handler.mark_all_metrics_as_error( conv_data, "Error", resolved_turn_metrics, resolved_conversation_metrics @@ -128,7 +128,7 @@ def test_mark_all_metrics_as_error_empty_metrics(self): # Should have no results assert len(results) == 0 - def test_mark_turn_metrics_as_error(self): + def test_mark_turn_metrics_as_error(self) -> None: """Test marking metrics for a single turn as error.""" handler = EvaluationErrorHandler() @@ -166,7 +166,7 @@ def test_mark_turn_metrics_as_error(self): assert results[1].result == "ERROR" assert results[1].reason == error_reason - def test_mark_cascade_error(self): + def test_mark_cascade_error(self) -> None: """Test marking remaining turns and conversation metrics as error after API failure.""" handler = EvaluationErrorHandler() diff --git a/tests/unit/pipeline/evaluation/test_evaluator.py b/tests/unit/pipeline/evaluation/test_evaluator.py index 6e24ce88..01d44109 100644 --- a/tests/unit/pipeline/evaluation/test_evaluator.py +++ b/tests/unit/pipeline/evaluation/test_evaluator.py @@ -1,74 +1,31 @@ """Unit tests for pipeline evaluation evaluator module.""" import 
pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.core.llm.custom import TokenTracker from lightspeed_evaluation.core.models import ( EvaluationData, EvaluationRequest, EvaluationScope, - SystemConfig, TurnData, ) from lightspeed_evaluation.core.system.loader import ConfigLoader +from lightspeed_evaluation.core.metrics.manager import MetricManager +from lightspeed_evaluation.core.script import ScriptExecutionManager from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator -@pytest.fixture -def config_loader(mocker): - """Create a mock config loader with system config.""" - loader = mocker.Mock(spec=ConfigLoader) - - config = SystemConfig() - config.default_turn_metrics_metadata = { - "ragas:faithfulness": {"threshold": 0.7, "default": True}, - "custom:answer_correctness": {"threshold": 0.8, "default": False}, - } - config.default_conversation_metrics_metadata = { - "deepeval:conversation_completeness": {"threshold": 0.6, "default": True}, - } - config.api.enabled = True - - loader.system_config = config - return loader - - -@pytest.fixture -def mock_metric_manager(mocker): - """Create a mock metric manager.""" - from lightspeed_evaluation.core.metrics.manager import MetricManager - - manager = mocker.Mock(spec=MetricManager) - - def get_threshold(metric_id, level, conv_data=None, turn_data=None): - thresholds = { - "ragas:faithfulness": 0.7, - "custom:answer_correctness": 0.8, - "deepeval:conversation_completeness": 0.6, - } - return thresholds.get(metric_id, 0.5) - - manager.get_effective_threshold.side_effect = get_threshold - # Mock get_metric_metadata to return None (no metadata) to support iteration in _extract_metadata_for_csv - manager.get_metric_metadata.return_value = None - return manager - - -@pytest.fixture -def mock_script_manager(mocker): - """Create a mock script execution manager.""" - from lightspeed_evaluation.core.script import ScriptExecutionManager - - manager = mocker.Mock(spec=ScriptExecutionManager) - return manager - - class TestMetricsEvaluator: """Unit tests for MetricsEvaluator.""" def test_initialization( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluator initialization.""" # Mock the metric handlers mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") @@ -98,8 +55,10 @@ def test_initialization( ) # ragas, deepeval, geval, custom, script, nlp def test_initialization_raises_error_without_config( - self, mock_metric_manager, mock_script_manager - ): + self, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + ) -> None: """Test initialization fails without system config.""" loader = ConfigLoader() loader.system_config = None @@ -108,8 +67,12 @@ def test_initialization_raises_error_without_config( MetricsEvaluator(loader, mock_metric_manager, mock_script_manager) def test_evaluate_metric_turn_level_pass( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluating turn-level metric that passes.""" # Mock the handlers mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") @@ -165,8 +128,12 @@ def test_evaluate_metric_turn_level_pass( assert 
result.contexts == '["Context"]' def test_evaluate_metric_turn_level_fail( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluating turn-level metric that fails.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -210,8 +177,12 @@ def test_evaluate_metric_turn_level_fail( assert result.threshold == 0.7 def test_evaluate_metric_conversation_level( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluating conversation-level metric.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -250,8 +221,12 @@ def test_evaluate_metric_conversation_level( assert result.turn_id is None # Conversation-level def test_evaluate_metric_unsupported_framework( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test evaluating metric with unsupported framework.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -285,8 +260,12 @@ def test_evaluate_metric_unsupported_framework( assert "Unsupported framework" in result.reason def test_evaluate_metric_returns_none_score( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test handling when metric evaluation returns None score.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -328,8 +307,12 @@ def test_evaluate_metric_returns_none_score( assert result.reason == "Evaluation failed" def test_evaluate_metric_exception_handling( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test exception handling during metric evaluation.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -376,9 +359,14 @@ def test_evaluate_metric_exception_handling( assert result.expected_response is None def test_evaluate_metric_skip_script_when_api_disabled( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test script metrics are skipped when API is disabled.""" + assert config_loader.system_config is not None config_loader.system_config.api.enabled = False mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") @@ -413,8 +401,12 @@ def test_evaluate_metric_skip_script_when_api_disabled( assert result is None def test_determine_status_with_threshold( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: 
MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test _determine_status method.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -436,15 +428,28 @@ def test_determine_status_with_threshold( ) # Test PASS - assert evaluator._determine_status(0.8, 0.7) == "PASS" - assert evaluator._determine_status(0.7, 0.7) == "PASS" # Equal passes + assert ( + evaluator._determine_status(0.8, 0.7) # pylint: disable=protected-access + == "PASS" + ) + assert ( + evaluator._determine_status(0.7, 0.7) # pylint: disable=protected-access + == "PASS" + ) # Equal passes # Test FAIL - assert evaluator._determine_status(0.6, 0.7) == "FAIL" + assert ( + evaluator._determine_status(0.6, 0.7) # pylint: disable=protected-access + == "FAIL" + ) def test_determine_status_without_threshold( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test _determine_status uses default 0.5 when threshold is None.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.LLMManager") mocker.patch( @@ -466,17 +471,23 @@ def test_determine_status_without_threshold( ) # Should use 0.5 as default - assert evaluator._determine_status(0.6, None) == "PASS" - assert evaluator._determine_status(0.4, None) == "FAIL" + assert ( + evaluator._determine_status(0.6, None) # pylint: disable=protected-access + == "PASS" + ) + assert ( + evaluator._determine_status(0.4, None) # pylint: disable=protected-access + == "FAIL" + ) - def _setup_evaluate_test( + def _setup_evaluate_test( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - mock_return, - ): + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + mock_return: tuple[float, str] | list[tuple[float, str]], + ) -> tuple[MetricsEvaluator, dict]: """Helper to setup common mocks for _evaluate_wrapper() tests. 
Returns: @@ -489,7 +500,10 @@ def _setup_evaluate_test( ) # Create a helper to setup mock with return values - def create_mock_handler(mocker, mock_return): + def create_mock_handler( # type: ignore[no-untyped-def] + mocker: MockerFixture, + mock_return: tuple[float, str] | list[tuple[float, str]], + ): mock = mocker.Mock() if isinstance(mock_return, list): mock.evaluate.side_effect = mock_return @@ -544,14 +558,14 @@ def create_mock_handler(mocker, mock_return): "metric_identifier", ["ragas:context_recall", "custom:answer_correctness", "nlp:rouge"], ) - def test_evaluate_with_expected_response_list( + def test_evaluate_with_expected_response_list( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - metric_identifier, - ): + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + metric_identifier: str, + ) -> None: """Test _evaluate_wrapper() with list expected_response for metric that requires it.""" evaluator, mock_handlers = self._setup_evaluate_test( config_loader, @@ -572,7 +586,9 @@ def test_evaluate_with_expected_response_list( request = EvaluationRequest.for_turn(conv_data, metric_identifier, 0, turn_data) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) + metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access + request, scope, 0.7 + ) assert metric_result.score == 0.85 assert metric_result.reason == "High score" @@ -583,8 +599,12 @@ def test_evaluate_with_expected_response_list( assert mock_handlers[framework].evaluate.call_count == 2 def test_evaluate_with_expected_response_list_fail( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test _evaluate_wrapper() with list expected_response for metric that requires it.""" scores_reasons = [(0.3, "Score 1"), (0.65, "Score 2"), (0.45, "Score 3")] evaluator, mock_handlers = self._setup_evaluate_test( @@ -608,7 +628,9 @@ def test_evaluate_with_expected_response_list_fail( ) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) + metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access + request, scope, 0.7 + ) reason_combined = "\n".join( [f"{score}; {reason}" for score, reason in scores_reasons] ) @@ -619,8 +641,12 @@ def test_evaluate_with_expected_response_list_fail( assert mock_handlers["ragas"].evaluate.call_count == 3 def test_evaluate_with_expected_response_string( - self, config_loader, mock_metric_manager, mock_script_manager, mocker - ): + self, + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + ) -> None: """Test _evaluate_wrapper() with string expected_response.""" evaluator, mock_handlers = self._setup_evaluate_test( config_loader, @@ -639,7 +665,9 @@ def test_evaluate_with_expected_response_string( ) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) + metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access + 
request, scope, 0.7 + ) assert metric_result.score == 0.85 assert metric_result.reason == "Good score" @@ -654,15 +682,15 @@ def test_evaluate_with_expected_response_string( [None, "string", ["string1", "string2"]], ids=["none", "string", "string_list"], ) - def test_evaluate_with_expected_response_not_needed( + def test_evaluate_with_expected_response_not_needed( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - metric_identifier, - expected_response, - ): + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, + metric_identifier: str, + expected_response: str | list[str] | None, + ) -> None: """Test _evaluate_wrapper() with metric that does not require expected_response.""" evaluator, mock_handlers = self._setup_evaluate_test( config_loader, @@ -683,7 +711,9 @@ def test_evaluate_with_expected_response_not_needed( request = EvaluationRequest.for_turn(conv_data, metric_identifier, 0, turn_data) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) + metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access + request, scope, 0.7 + ) assert metric_result.score == 0.3 assert metric_result.reason == "Low score" @@ -697,21 +727,21 @@ def test_evaluate_with_expected_response_not_needed( class TestTokenTracker: """Unit tests for TokenTracker class.""" - def test_token_tracker_initialization(self): + def test_token_tracker_initialization(self) -> None: """Test TokenTracker initializes with zero counts.""" tracker = TokenTracker() input_tokens, output_tokens = tracker.get_counts() assert input_tokens == 0 assert output_tokens == 0 - def test_token_tracker_get_counts_returns_tuple(self): + def test_token_tracker_get_counts_returns_tuple(self) -> None: """Test get_counts returns a tuple.""" tracker = TokenTracker() result = tracker.get_counts() assert isinstance(result, tuple) assert len(result) == 2 - def test_token_tracker_reset(self): + def test_token_tracker_reset(self) -> None: """Test reset clears token counts.""" tracker = TokenTracker() tracker.input_tokens = 100 @@ -719,31 +749,31 @@ def test_token_tracker_reset(self): tracker.reset() assert tracker.get_counts() == (0, 0) - def test_token_tracker_start_stop(self): + def test_token_tracker_start_stop(self) -> None: """Test start and stop methods.""" tracker = TokenTracker() tracker.start() - assert tracker._callback_registered is True + assert tracker._callback_registered is True # pylint: disable=protected-access tracker.stop() - assert tracker._callback_registered is False + assert tracker._callback_registered is False # pylint: disable=protected-access - def test_token_tracker_double_start(self): + def test_token_tracker_double_start(self) -> None: """Test calling start twice doesn't register callback twice.""" tracker = TokenTracker() tracker.start() tracker.start() # Should not fail - assert tracker._callback_registered is True + assert tracker._callback_registered is True # pylint: disable=protected-access tracker.stop() - def test_token_tracker_double_stop(self): + def test_token_tracker_double_stop(self) -> None: """Test calling stop twice doesn't fail.""" tracker = TokenTracker() tracker.start() tracker.stop() tracker.stop() # Should not fail - assert tracker._callback_registered is False + assert tracker._callback_registered is False # 
pylint: disable=protected-access - def test_token_tracker_independent_instances(self): + def test_token_tracker_independent_instances(self) -> None: """Test multiple TokenTracker instances are independent.""" tracker1 = TokenTracker() tracker2 = TokenTracker() diff --git a/tests/unit/pipeline/evaluation/test_pipeline.py b/tests/unit/pipeline/evaluation/test_pipeline.py index a922a87e..aeaba1fc 100644 --- a/tests/unit/pipeline/evaluation/test_pipeline.py +++ b/tests/unit/pipeline/evaluation/test_pipeline.py @@ -1,53 +1,22 @@ """Unit tests for EvaluationPipeline.""" import pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.core.models import ( EvaluationData, EvaluationResult, - SystemConfig, - TurnData, ) from lightspeed_evaluation.core.system.loader import ConfigLoader from lightspeed_evaluation.pipeline.evaluation.pipeline import EvaluationPipeline -@pytest.fixture -def mock_config_loader(mocker): - """Create a mock config loader with system config.""" - loader = mocker.Mock(spec=ConfigLoader) - - config = SystemConfig() - config.api.enabled = False - config.output.output_dir = "/tmp/test_output" - config.output.base_filename = "test" - config.core.max_threads = 2 - - loader.system_config = config - return loader - - -@pytest.fixture -def sample_evaluation_data(): - """Create sample evaluation data.""" - turn1 = TurnData( - turn_id="turn1", - query="What is Python?", - response="Python is a programming language.", - contexts=["Python context"], - turn_metrics=["ragas:faithfulness"], - ) - conv_data = EvaluationData( - conversation_group_id="conv1", - turns=[turn1], - ) - return [conv_data] - - class TestEvaluationPipeline: """Unit tests for EvaluationPipeline.""" - def test_initialization_success(self, mock_config_loader, mocker): + def test_initialization_success( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test successful pipeline initialization.""" # Mock components mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -74,7 +43,7 @@ def test_initialization_success(self, mock_config_loader, mocker): assert pipeline.system_config is not None assert pipeline.output_dir == "/tmp/test_output" - def test_initialization_without_config(self, mocker): + def test_initialization_without_config(self, mocker: MockerFixture) -> None: """Test initialization fails without system config.""" loader = mocker.Mock(spec=ConfigLoader) loader.system_config = None @@ -82,8 +51,11 @@ def test_initialization_without_config(self, mocker): with pytest.raises(ValueError, match="SystemConfig must be loaded"): EvaluationPipeline(loader) - def test_create_api_client_when_enabled(self, mock_config_loader, mocker): + def test_create_api_client_when_enabled( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test API client creation when enabled.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True mock_config_loader.system_config.api.api_base = "http://test.com" mock_config_loader.system_config.api.endpoint_type = "test" @@ -113,8 +85,11 @@ def test_create_api_client_when_enabled(self, mock_config_loader, mocker): assert pipeline.api_client is not None mock_api_client.assert_called_once() - def test_create_api_client_when_disabled(self, mock_config_loader, mocker): + def test_create_api_client_when_disabled( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test no API client when disabled.""" + assert 
mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = False mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -139,8 +114,11 @@ def test_create_api_client_when_disabled(self, mock_config_loader, mocker): assert pipeline.api_client is None def test_run_evaluation_success( - self, mock_config_loader, sample_evaluation_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + sample_evaluation_data: list[EvaluationData], + mocker: MockerFixture, + ) -> None: """Test successful evaluation run.""" # Mock all components mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -182,9 +160,13 @@ def test_run_evaluation_success( assert results[0].result == "PASS" def test_run_evaluation_saves_amended_data_when_api_enabled( - self, mock_config_loader, sample_evaluation_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + sample_evaluation_data: list[EvaluationData], + mocker: MockerFixture, + ) -> None: """Test amended data is saved when API is enabled.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -220,9 +202,13 @@ def test_run_evaluation_saves_amended_data_when_api_enabled( mock_save.assert_called_once() def test_save_amended_data_handles_exception( - self, mock_config_loader, sample_evaluation_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + sample_evaluation_data: list[EvaluationData], + mocker: MockerFixture, + ) -> None: """Test save amended data handles exceptions gracefully.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -259,8 +245,11 @@ def test_save_amended_data_handles_exception( assert results is not None - def test_close_with_api_client(self, mock_config_loader, mocker): + def test_close_with_api_client( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test close method with API client.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True mock_config_loader.system_config.api.api_base = "http://test.com" mock_config_loader.system_config.api.endpoint_type = "test" @@ -302,8 +291,11 @@ def test_close_with_api_client(self, mock_config_loader, mocker): mock_api_client.close.assert_called_once() - def test_close_without_api_client(self, mock_config_loader, mocker): + def test_close_without_api_client( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test close method without API client.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = False mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") @@ -332,7 +324,9 @@ def test_close_without_api_client(self, mock_config_loader, mocker): # Should not raise any errors pipeline.close() - def test_output_dir_override(self, mock_config_loader, mocker): + def test_output_dir_override( + self, mock_config_loader: ConfigLoader, mocker: MockerFixture + ) -> None: """Test output directory can be overridden.""" mocker.patch("lightspeed_evaluation.pipeline.evaluation.pipeline.MetricManager") mocker.patch( diff --git a/tests/unit/pipeline/evaluation/test_processor.py b/tests/unit/pipeline/evaluation/test_processor.py index c3f0d033..16b3d4f5 100644 
--- a/tests/unit/pipeline/evaluation/test_processor.py +++ b/tests/unit/pipeline/evaluation/test_processor.py @@ -1,8 +1,13 @@ """Unit tests for ConversationProcessor.""" +from typing import Callable +import logging + import pytest +from _pytest.logging import LogCaptureFixture +from pytest_mock import MockerFixture -from lightspeed_evaluation.core.metrics.manager import MetricManager +from lightspeed_evaluation.core.metrics.manager import MetricLevel from lightspeed_evaluation.core.models import ( EvaluationData, EvaluationRequest, @@ -10,13 +15,8 @@ SystemConfig, TurnData, ) -from lightspeed_evaluation.core.script import ( - ScriptExecutionError, - ScriptExecutionManager, -) +from lightspeed_evaluation.core.script import ScriptExecutionError from lightspeed_evaluation.core.system.loader import ConfigLoader -from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender -from lightspeed_evaluation.pipeline.evaluation.errors import EvaluationErrorHandler from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator from lightspeed_evaluation.pipeline.evaluation.processor import ( ConversationProcessor, @@ -24,57 +24,14 @@ ) -@pytest.fixture -def mock_config_loader(mocker): - """Create a mock config loader.""" - loader = mocker.Mock(spec=ConfigLoader) - config = SystemConfig() - config.api.enabled = False - loader.system_config = config - return loader - - -@pytest.fixture -def processor_components(mocker): - """Create processor components.""" - metrics_evaluator = mocker.Mock(spec=MetricsEvaluator) - api_amender = mocker.Mock(spec=APIDataAmender) - error_handler = mocker.Mock(spec=EvaluationErrorHandler) - metric_manager = mocker.Mock(spec=MetricManager) - script_manager = mocker.Mock(spec=ScriptExecutionManager) - - # Default behavior for metric resolution - metric_manager.resolve_metrics.return_value = ["ragas:faithfulness"] - - return ProcessorComponents( - metrics_evaluator=metrics_evaluator, - api_amender=api_amender, - error_handler=error_handler, - metric_manager=metric_manager, - script_manager=script_manager, - ) - - -@pytest.fixture -def sample_conv_data(): - """Create sample conversation data.""" - turn1 = TurnData( - turn_id="turn1", - query="What is Python?", - response="Python is a programming language.", - contexts=["Context"], - turn_metrics=["ragas:faithfulness"], - ) - return EvaluationData( - conversation_group_id="conv1", - turns=[turn1], - ) - - class TestConversationProcessor: """Unit tests for ConversationProcessor.""" - def test_initialization(self, mock_config_loader, processor_components): + def test_initialization( + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + ) -> None: """Test processor initialization.""" processor = ConversationProcessor(mock_config_loader, processor_components) @@ -83,8 +40,12 @@ def test_initialization(self, mock_config_loader, processor_components): assert processor.components == processor_components def test_process_conversation_skips_when_no_metrics( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing skips when no metrics specified.""" # Mock metric manager to return empty lists processor_components.metric_manager.resolve_metrics.return_value = [] @@ -95,15 +56,16 @@ def test_process_conversation_skips_when_no_metrics( 
assert len(results) == 0 def test_process_conversation_turn_metrics( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing with turn-level metrics.""" - from lightspeed_evaluation.core.models import EvaluationResult # Configure metric manager to return turn metrics and empty conversation metrics - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return ["ragas:faithfulness"] return [] @@ -134,10 +96,12 @@ def resolve_side_effect(metrics, level): assert all(r.result == "PASS" for r in results) def test_process_conversation_conversation_metrics( - self, mock_config_loader, processor_components, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing with conversation-level metrics.""" - from lightspeed_evaluation.core.models import EvaluationResult turn1 = TurnData(turn_id="turn1", query="Q", response="R") conv_data = EvaluationData( @@ -147,9 +111,7 @@ def test_process_conversation_conversation_metrics( ) # Mock metric resolution - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return [] return ["deepeval:conversation_completeness"] @@ -178,18 +140,20 @@ def resolve_side_effect(metrics, level): assert results[0].turn_id is None # Conversation-level def test_process_conversation_with_setup_script_success( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing with successful setup script.""" - from lightspeed_evaluation.core.models import EvaluationResult sample_conv_data.setup_script = "setup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True # Configure metric manager to return turn metrics and empty conversation metrics - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return ["ragas:faithfulness"] return [] @@ -225,10 +189,15 @@ def resolve_side_effect(metrics, level): assert len(results) > 0 def test_process_conversation_with_setup_script_failure( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test processing handles setup script failure.""" sample_conv_data.setup_script = "setup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True processor_components.script_manager.run_script.side_effect = ( @@ -242,18 +211,20 @@ def 
test_process_conversation_with_setup_script_failure( processor_components.error_handler.mark_all_metrics_as_error.assert_called_once() def test_process_conversation_with_cleanup_script( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test cleanup script is always called.""" - from lightspeed_evaluation.core.models import EvaluationResult sample_conv_data.cleanup_script = "cleanup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True # Configure metric manager to return turn metrics and empty conversation metrics - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return ["ragas:faithfulness"] return [] @@ -290,17 +261,19 @@ def resolve_side_effect(metrics, level): assert any("cleanup.sh" in str(call) for call in calls) def test_process_conversation_with_api_amendment( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test API amendment during turn processing.""" - from lightspeed_evaluation.core.models import EvaluationResult + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True # Configure metric manager to return turn metrics and empty conversation metrics - def resolve_side_effect(metrics, level): - from lightspeed_evaluation.core.metrics.manager import MetricLevel - + def resolve_side_effect(_metrics: list[str], level: MetricLevel) -> list[str]: if level == MetricLevel.TURN: return ["ragas:faithfulness"] return [] @@ -335,9 +308,13 @@ def resolve_side_effect(metrics, level): assert len(results) > 0 def test_process_conversation_with_api_error_cascade( - self, mock_config_loader, processor_components, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test API error causes cascade failure.""" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True # Create multi-turn conversation @@ -373,10 +350,13 @@ def test_process_conversation_with_api_error_cascade( processor_components.error_handler.mark_cascade_error.assert_called_once() def test_evaluate_turn( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test _evaluate_turn method.""" - from lightspeed_evaluation.core.models import EvaluationResult mock_result = EvaluationResult( conversation_group_id="conv1", @@ -392,7 +372,7 @@ def test_evaluate_turn( ) processor = ConversationProcessor(mock_config_loader, processor_components) - results = processor._evaluate_turn( + results = processor._evaluate_turn( # pylint: disable=protected-access sample_conv_data, 0, sample_conv_data.turns[0], ["ragas:faithfulness"] ) @@ 
-400,10 +380,13 @@ def test_evaluate_turn( assert results[0].result == "PASS" def test_evaluate_conversation( - self, mock_config_loader, processor_components, sample_conv_data, mocker - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + mocker: MockerFixture, # pylint: disable=unused-argument + ) -> None: """Test _evaluate_conversation method.""" - from lightspeed_evaluation.core.models import EvaluationResult mock_result = EvaluationResult( conversation_group_id="conv1", @@ -419,7 +402,7 @@ def test_evaluate_conversation( ) processor = ConversationProcessor(mock_config_loader, processor_components) - results = processor._evaluate_conversation( + results = processor._evaluate_conversation( # pylint: disable=protected-access sample_conv_data, ["deepeval:conversation_completeness"] ) @@ -427,46 +410,67 @@ def test_evaluate_conversation( assert results[0].turn_id is None def test_run_setup_script_skips_when_api_disabled( - self, mock_config_loader, processor_components, sample_conv_data - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + ) -> None: """Test setup script is skipped when API disabled.""" sample_conv_data.setup_script = "setup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = False processor = ConversationProcessor(mock_config_loader, processor_components) - error = processor._run_setup_script(sample_conv_data) + error = processor._run_setup_script( # pylint: disable=protected-access + sample_conv_data + ) assert error is None processor_components.script_manager.run_script.assert_not_called() def test_run_cleanup_script_skips_when_api_disabled( - self, mock_config_loader, processor_components, sample_conv_data - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + ) -> None: """Test cleanup script is skipped when API disabled.""" sample_conv_data.cleanup_script = "cleanup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = False processor = ConversationProcessor(mock_config_loader, processor_components) - processor._run_cleanup_script(sample_conv_data) + processor._run_cleanup_script( # pylint: disable=protected-access + sample_conv_data + ) processor_components.script_manager.run_script.assert_not_called() def test_run_cleanup_script_logs_warning_on_failure( - self, mock_config_loader, processor_components, sample_conv_data - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + ) -> None: """Test cleanup script failure is logged as warning.""" sample_conv_data.cleanup_script = "cleanup.sh" + assert mock_config_loader.system_config is not None mock_config_loader.system_config.api.enabled = True processor_components.script_manager.run_script.return_value = False processor = ConversationProcessor(mock_config_loader, processor_components) # Should not raise, just log warning - processor._run_cleanup_script(sample_conv_data) + processor._run_cleanup_script( # pylint: disable=protected-access + sample_conv_data + ) def test_get_metrics_summary( - self, mock_config_loader, processor_components, sample_conv_data - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + sample_conv_data: EvaluationData, + ) -> None: """Test 
get_metrics_summary method.""" processor_components.metric_manager.count_metrics_for_conversation.return_value = { "turn_metrics": 2, @@ -480,118 +484,12 @@ def test_get_metrics_summary( assert summary["conversation_metrics"] == 1 -# Fixtures for TestConversationProcessorEvaluateTurn -@pytest.fixture -def config_loader(mocker): - """Create a mock config loader with system config.""" - loader = mocker.Mock(spec=ConfigLoader) - - config = SystemConfig() - config.default_turn_metrics_metadata = { - "ragas:faithfulness": {"threshold": 0.7, "default": True}, - "custom:answer_correctness": {"threshold": 0.8, "default": False}, - } - config.default_conversation_metrics_metadata = { - "deepeval:conversation_completeness": {"threshold": 0.6, "default": True}, - } - config.api.enabled = False - - loader.system_config = config - return loader - - -@pytest.fixture -def mock_metrics_evaluator(mocker): - """Create a mock metrics evaluator.""" - evaluator = mocker.Mock(spec=MetricsEvaluator) - - def evaluate_metric(request): - """Mock evaluate_metric that returns a result based on metric.""" - return EvaluationResult( - conversation_group_id=request.conv_data.conversation_group_id, - turn_id=request.turn_id, - metric_identifier=request.metric_identifier, - result="PASS", - score=0.85, - reason="Test evaluation", - threshold=0.7, - ) - - evaluator.evaluate_metric.side_effect = evaluate_metric - return evaluator - - -@pytest.fixture -def mock_api_amender(mocker): - """Create a mock API data amender.""" - amender = mocker.Mock(spec=APIDataAmender) - return amender - - -@pytest.fixture -def mock_error_handler(mocker): - """Create a mock error handler.""" - handler = mocker.Mock(spec=EvaluationErrorHandler) - - # Configure create_error_result to return a proper EvaluationResult - def create_error_result_side_effect( - conv_id, metric_id, reason, *, turn_id=None, query="" - ): - return EvaluationResult( - conversation_group_id=conv_id, - turn_id=turn_id, - metric_identifier=metric_id, - result="ERROR", - reason=reason, - query=query, - ) - - handler.create_error_result.side_effect = create_error_result_side_effect - return handler - - -@pytest.fixture -def mock_metric_manager(mocker): - """Create a mock metric manager.""" - manager = mocker.Mock(spec=MetricManager) - return manager - - -@pytest.fixture -def mock_script_manager(mocker): - """Create a mock script execution manager.""" - manager = mocker.Mock(spec=ScriptExecutionManager) - return manager - - -@pytest.fixture -def processor_components_pr( - mock_metrics_evaluator, - mock_api_amender, - mock_error_handler, - mock_metric_manager, - mock_script_manager, -): - """Create processor components fixture for PR tests.""" - return ProcessorComponents( - metrics_evaluator=mock_metrics_evaluator, - api_amender=mock_api_amender, - error_handler=mock_error_handler, - metric_manager=mock_metric_manager, - script_manager=mock_script_manager, - ) - - -@pytest.fixture -def processor(config_loader, processor_components_pr): - """Create ConversationProcessor instance for PR tests.""" - return ConversationProcessor(config_loader, processor_components_pr) - - class TestConversationProcessorEvaluateTurn: """Unit tests for ConversationProcessor._evaluate_turn method.""" - def test_evaluate_turn_with_valid_metrics(self, processor, mock_metrics_evaluator): + def test_evaluate_turn_with_valid_metrics( + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn with all valid metrics.""" turn_data = TurnData( 
turn_id="1", @@ -603,7 +501,9 @@ def test_evaluate_turn_with_valid_metrics(self, processor, mock_metrics_evaluato turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"] - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should evaluate both metrics assert len(results) == 2 @@ -618,10 +518,12 @@ def test_evaluate_turn_with_valid_metrics(self, processor, mock_metrics_evaluato assert calls[1][0][0].metric_identifier == "custom:answer_correctness" def test_evaluate_turn_with_invalid_metric( - self, processor, mock_metrics_evaluator, caplog - ): + self, + processor: ConversationProcessor, + mock_metrics_evaluator: MetricsEvaluator, + caplog: LogCaptureFixture, + ) -> None: """Test _evaluate_turn with an invalid metric - creates ERROR result and logs error.""" - import logging turn_data = TurnData( turn_id="1", @@ -637,7 +539,9 @@ def test_evaluate_turn_with_invalid_metric( turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"] with caplog.at_level(logging.ERROR): - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should get 2 results: 1 ERROR for invalid metric, 1 PASS for valid metric assert len(results) == 2 @@ -658,10 +562,12 @@ def test_evaluate_turn_with_invalid_metric( assert "check Validation Errors" in caplog.text def test_evaluate_turn_with_all_invalid_metrics( - self, processor, mock_metrics_evaluator, caplog - ): + self, + processor: ConversationProcessor, + mock_metrics_evaluator: MetricsEvaluator, + caplog: LogCaptureFixture, + ) -> None: """Test _evaluate_turn with all metrics invalid - returns ERROR results.""" - import logging turn_data = TurnData( turn_id="1", @@ -678,7 +584,9 @@ def test_evaluate_turn_with_all_invalid_metrics( turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"] with caplog.at_level(logging.ERROR): - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should return ERROR results for both invalid metrics assert len(results) == 2 @@ -694,10 +602,12 @@ def test_evaluate_turn_with_all_invalid_metrics( assert "Invalid turn metric 'custom:answer_correctness'" in caplog.text def test_evaluate_turn_with_mixed_valid_invalid_metrics( - self, processor, mock_metrics_evaluator, caplog - ): + self, + processor: ConversationProcessor, + mock_metrics_evaluator: MetricsEvaluator, + caplog: LogCaptureFixture, + ) -> None: """Test _evaluate_turn with mix of valid and invalid metrics.""" - import logging turn_data = TurnData( turn_id="1", @@ -717,7 +627,9 @@ def test_evaluate_turn_with_mixed_valid_invalid_metrics( ] with caplog.at_level(logging.ERROR): - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should get 3 results: 2 valid metrics (PASS) and 1 invalid metric (ERROR) assert len(results) == 3 @@ -734,7 +646,9 @@ def test_evaluate_turn_with_mixed_valid_invalid_metrics( # Verify error was logged for invalid metric assert "Invalid turn metric 'custom:answer_correctness'" in caplog.text - def test_evaluate_turn_with_empty_metrics(self, processor, mock_metrics_evaluator): + def 
test_evaluate_turn_with_empty_metrics( + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn with empty metrics list.""" turn_data = TurnData( turn_id="1", @@ -743,9 +657,11 @@ def test_evaluate_turn_with_empty_metrics(self, processor, mock_metrics_evaluato ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) - turn_metrics = [] + turn_metrics: list[str] = [] - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should return empty results assert len(results) == 0 @@ -754,8 +670,8 @@ def test_evaluate_turn_with_empty_metrics(self, processor, mock_metrics_evaluato assert mock_metrics_evaluator.evaluate_metric.call_count == 0 def test_evaluate_turn_creates_correct_request( - self, processor, mock_metrics_evaluator - ): + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn creates correct EvaluationRequest.""" turn_data = TurnData( turn_id="turn_123", @@ -767,7 +683,9 @@ def test_evaluate_turn_creates_correct_request( turn_metrics = ["ragas:faithfulness"] - processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Verify the request structure assert mock_metrics_evaluator.evaluate_metric.call_count == 1 @@ -780,8 +698,8 @@ def test_evaluate_turn_creates_correct_request( assert call_args.turn_idx == 0 def test_evaluate_turn_handles_evaluator_returning_none( - self, processor, mock_metrics_evaluator - ): + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn handles when evaluator returns None.""" turn_data = TurnData( turn_id="1", @@ -796,7 +714,9 @@ def test_evaluate_turn_handles_evaluator_returning_none( turn_metrics = ["ragas:faithfulness"] - results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + results = processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Should return empty results when evaluator returns None assert len(results) == 0 @@ -805,8 +725,8 @@ def test_evaluate_turn_handles_evaluator_returning_none( assert mock_metrics_evaluator.evaluate_metric.call_count == 1 def test_evaluate_turn_multiple_turns_correct_index( - self, processor, mock_metrics_evaluator - ): + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test _evaluate_turn uses correct turn index.""" turn_data_1 = TurnData(turn_id="1", query="Q1", response="R1") turn_data_2 = TurnData(turn_id="2", query="Q2", response="R2") @@ -820,7 +740,9 @@ def test_evaluate_turn_multiple_turns_correct_index( turn_metrics = ["ragas:faithfulness"] # Evaluate second turn (index 1) - processor._evaluate_turn(conv_data, 1, turn_data_2, turn_metrics) + processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 1, turn_data_2, turn_metrics + ) # Verify correct turn index call_args = mock_metrics_evaluator.evaluate_metric.call_args[0][0] @@ -828,8 +750,8 @@ def test_evaluate_turn_multiple_turns_correct_index( assert call_args.turn_id == "2" def test_evaluate_turn_preserves_metric_order( - self, processor, mock_metrics_evaluator - ): + self, processor: ConversationProcessor, mock_metrics_evaluator: MetricsEvaluator + ) -> None: """Test 
_evaluate_turn evaluates metrics in the order provided.""" turn_data = TurnData( turn_id="1", @@ -844,7 +766,9 @@ def test_evaluate_turn_preserves_metric_order( "ragas:context_recall", ] - processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics) + processor._evaluate_turn( # pylint: disable=protected-access + conv_data, 0, turn_data, turn_metrics + ) # Verify metrics were evaluated in order assert mock_metrics_evaluator.evaluate_metric.call_count == 3 @@ -854,7 +778,7 @@ def test_evaluate_turn_preserves_metric_order( assert calls[1][0][0].metric_identifier == "ragas:faithfulness" assert calls[2][0][0].metric_identifier == "ragas:context_recall" - def test_is_metric_invalid_functionality(self): + def test_is_metric_invalid_functionality(self) -> None: """Test TurnData.is_metric_invalid and add_invalid_metric methods.""" turn_data = TurnData(turn_id="1", query="Q", response="R") @@ -886,7 +810,7 @@ class TestSkipOnFailure: """Unit tests for skip_on_failure feature.""" @pytest.fixture - def multi_turn_conv_data(self): + def multi_turn_conv_data(self) -> EvaluationData: """Create conversation data with multiple turns.""" turns = [ TurnData( @@ -904,10 +828,12 @@ def multi_turn_conv_data(self): ) @pytest.fixture - def config_loader_factory(self, mocker): + def config_loader_factory( + self, mocker: MockerFixture + ) -> Callable[[bool], ConfigLoader]: """Factory to create config loader with configurable skip_on_failure.""" - def _create(skip_on_failure: bool): + def _create(skip_on_failure: bool) -> ConfigLoader: loader = mocker.Mock(spec=ConfigLoader) config = SystemConfig() config.api.enabled = False @@ -926,14 +852,14 @@ def _create(skip_on_failure: bool): (True, False, False), # System enabled, conv disables ], ) - def test_is_skip_on_failure_enabled( + def test_is_skip_on_failure_enabled( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader_factory, - processor_components, - system_skip, - conv_skip, - expected, - ): + config_loader_factory: Callable[[bool], ConfigLoader], + processor_components: ProcessorComponents, + system_skip: bool, + conv_skip: bool, + expected: bool, + ) -> None: """Test skip_on_failure resolution from system config and conversation override.""" conv_data = EvaluationData( conversation_group_id="test", @@ -943,7 +869,12 @@ def test_is_skip_on_failure_enabled( processor = ConversationProcessor( config_loader_factory(system_skip), processor_components ) - assert processor._is_skip_on_failure_enabled(conv_data) is expected + assert ( + processor._is_skip_on_failure_enabled( # pylint: disable=protected-access + conv_data + ) + is expected + ) @pytest.mark.parametrize( "results_status,expected", @@ -954,8 +885,12 @@ def test_is_skip_on_failure_enabled( ], ) def test_has_failure( - self, mock_config_loader, processor_components, results_status, expected - ): + self, + mock_config_loader: ConfigLoader, + processor_components: ProcessorComponents, + results_status: list[str], + expected: bool, + ) -> None: """Test _has_failure detection for FAIL and ERROR results.""" processor = ConversationProcessor(mock_config_loader, processor_components) results = [ @@ -964,17 +899,20 @@ def test_has_failure( ) for i, status in enumerate(results_status) ] - assert processor._has_failure(results) is expected + assert ( + processor._has_failure(results) # pylint: disable=protected-access + is expected + ) @pytest.mark.parametrize("skip_enabled,expect_skip", [(True, True), (False, False)]) - def test_skip_on_failure_behavior( + def 
test_skip_on_failure_behavior( # pylint: disable=too-many-arguments, too-many-positional-arguments self, - config_loader_factory, - processor_components, - multi_turn_conv_data, - skip_enabled, - expect_skip, - ): + config_loader_factory: Callable[[bool], ConfigLoader], + processor_components: ProcessorComponents, + multi_turn_conv_data: EvaluationData, + skip_enabled: bool, + expect_skip: bool, + ) -> None: """Test skip_on_failure skips remaining turns when enabled, continues when disabled.""" # Configure metric manager processor_components.metric_manager.resolve_metrics.side_effect = [ diff --git a/tests/unit/runner/test_evaluation.py b/tests/unit/runner/test_evaluation.py index 056d5118..a9a18fcf 100644 --- a/tests/unit/runner/test_evaluation.py +++ b/tests/unit/runner/test_evaluation.py @@ -1,13 +1,15 @@ """Unit tests for runner/evaluation.py.""" import argparse +from typing import Any import pytest +from pytest_mock import MockerFixture from lightspeed_evaluation.runner.evaluation import main, run_evaluation -def _make_eval_args(**kwargs) -> argparse.Namespace: +def _make_eval_args(**kwargs: Any) -> argparse.Namespace: """Helper to create eval_args namespace with defaults.""" defaults = { "system_config": "config/system.yaml", @@ -23,7 +25,11 @@ def _make_eval_args(**kwargs) -> argparse.Namespace: class TestRunEvaluation: """Unit tests for run_evaluation function.""" - def test_run_evaluation_success(self, mocker, capsys): + def test_run_evaluation_success( + self, + mocker: MockerFixture, + capsys: pytest.CaptureFixture, # pylint: disable=unused-argument + ) -> None: """Test successful evaluation run.""" # Mock ConfigLoader mock_loader = mocker.Mock() @@ -89,7 +95,11 @@ def test_run_evaluation_success(self, mocker, capsys): assert result["PASS"] == 1 mock_pipeline.close.assert_called_once() - def test_run_evaluation_with_output_dir_override(self, mocker, capsys): + def test_run_evaluation_with_output_dir_override( + self, + mocker: MockerFixture, + capsys: pytest.CaptureFixture, # pylint: disable=unused-argument + ) -> None: """Test evaluation with custom output directory.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -145,7 +155,9 @@ def test_run_evaluation_with_output_dir_override(self, mocker, capsys): call_args = mock_pipeline_class.call_args assert call_args[0][1] == "/custom/output" - def test_run_evaluation_file_not_found(self, mocker, capsys): + def test_run_evaluation_file_not_found( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test evaluation handles FileNotFoundError.""" mock_config_loader = mocker.patch( "lightspeed_evaluation.runner.evaluation.ConfigLoader" @@ -160,7 +172,9 @@ def test_run_evaluation_file_not_found(self, mocker, capsys): captured = capsys.readouterr() assert "Evaluation failed" in captured.out - def test_run_evaluation_value_error(self, mocker, capsys): + def test_run_evaluation_value_error( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test evaluation handles ValueError.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -186,7 +200,9 @@ def test_run_evaluation_value_error(self, mocker, capsys): captured = capsys.readouterr() assert "Evaluation failed" in captured.out - def test_run_evaluation_with_errors_in_results(self, mocker, capsys): + def test_run_evaluation_with_errors_in_results( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test evaluation reports errors in results.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() 
@@ -237,11 +253,16 @@ def test_run_evaluation_with_errors_in_results(self, mocker, capsys): result = run_evaluation(_make_eval_args()) + assert result is not None assert result["ERROR"] == 3 captured = capsys.readouterr() assert "3 evaluations had errors" in captured.out - def test_run_evaluation_closes_pipeline_on_exception(self, mocker, capsys): + def test_run_evaluation_closes_pipeline_on_exception( + self, + mocker: MockerFixture, + capsys: pytest.CaptureFixture, # pylint: disable=unused-argument + ) -> None: """Test pipeline is closed even if evaluation fails.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -275,7 +296,9 @@ def test_run_evaluation_closes_pipeline_on_exception(self, mocker, capsys): mock_pipeline.close.assert_called_once() assert result is None - def test_run_evaluation_with_empty_filter_result(self, mocker, capsys): + def test_run_evaluation_with_empty_filter_result( + self, mocker: MockerFixture, capsys: pytest.CaptureFixture + ) -> None: """Test evaluation returns empty result when filter matches nothing.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -295,6 +318,7 @@ def test_run_evaluation_with_empty_filter_result(self, mocker, capsys): result = run_evaluation(_make_eval_args(tags=["nonexistent"])) + assert result is not None assert result["TOTAL"] == 0 mock_validator.return_value.load_evaluation_data.assert_called_once_with( "config/evaluation_data.yaml", tags=["nonexistent"], conv_ids=None @@ -304,7 +328,7 @@ def test_run_evaluation_with_empty_filter_result(self, mocker, capsys): captured = capsys.readouterr() assert "No conversation groups matched the filter criteria" in captured.out - def test_run_evaluation_with_filter_parameters(self, mocker): + def test_run_evaluation_with_filter_parameters(self, mocker: MockerFixture) -> None: """Test that filter parameters are correctly passed to DataValidator.""" mock_loader = mocker.Mock() mock_config = mocker.Mock() @@ -365,7 +389,7 @@ def test_run_evaluation_with_filter_parameters(self, mocker): class TestMain: """Unit tests for main CLI function.""" - def test_main_default_args(self, mocker): + def test_main_default_args(self, mocker: MockerFixture) -> None: """Test main with default arguments.""" mocker.patch( "sys.argv", @@ -392,7 +416,7 @@ def test_main_default_args(self, mocker): assert args.eval_data == "config/evaluation_data.yaml" assert args.output_dir is None - def test_main_custom_args(self, mocker): + def test_main_custom_args(self, mocker: MockerFixture) -> None: """Test main with custom arguments.""" mocker.patch( "sys.argv", @@ -427,7 +451,7 @@ def test_main_custom_args(self, mocker): assert args.eval_data == "custom/eval.yaml" assert args.output_dir == "/custom/output" - def test_main_returns_error_on_failure(self, mocker): + def test_main_returns_error_on_failure(self, mocker: MockerFixture) -> None: """Test main returns error code on failure.""" mocker.patch( "sys.argv", @@ -455,7 +479,13 @@ def test_main_returns_error_on_failure(self, mocker): ), ], ) - def test_main_with_filters(self, mocker, args, expected_tags, expected_conv_ids): + def test_main_with_filters( + self, + mocker: MockerFixture, + args: list[str], + expected_tags: list[str] | None, + expected_conv_ids: list[str] | None, + ) -> None: """Test main with filter arguments.""" mocker.patch("sys.argv", ["lightspeed-eval"] + args) From f08c041a280f8fc95ac601eed3a5fab414ffdb52 Mon Sep 17 00:00:00 2001 From: Eva Micankova Date: Thu, 29 Jan 2026 10:49:32 +0100 Subject: [PATCH 2/3] Moving pylint disable to file 
level --- tests/script/conftest.py | 4 +- tests/script/test_compare_evaluations.py | 66 ++++--------- tests/script/test_run_multi_provider_eval.py | 76 ++++++-------- tests/unit/core/api/test_client.py | 36 +++---- tests/unit/core/llm/test_custom.py | 8 +- tests/unit/core/metrics/conftest.py | 6 +- tests/unit/core/metrics/test_geval.py | 34 +++---- tests/unit/core/metrics/test_manager.py | 4 +- tests/unit/core/metrics/test_nlp.py | 6 +- tests/unit/core/output/test_final_coverage.py | 10 +- tests/unit/core/output/test_generator.py | 64 ++++-------- tests/unit/core/system/test_validator.py | 98 ++++--------------- tests/unit/pipeline/evaluation/conftest.py | 16 +-- .../pipeline/evaluation/test_evaluator.py | 57 ++++------- .../pipeline/evaluation/test_processor.py | 90 ++++++----------- tests/unit/runner/test_evaluation.py | 8 +- 16 files changed, 196 insertions(+), 387 deletions(-) diff --git a/tests/script/conftest.py b/tests/script/conftest.py index 8ab273da..752800a2 100644 --- a/tests/script/conftest.py +++ b/tests/script/conftest.py @@ -1,3 +1,5 @@ +# pylint: disable=redefined-outer-name + """Pytest configuration and fixtures for script tests.""" from pathlib import Path @@ -126,7 +128,7 @@ def temp_config_files(tmp_path: Path) -> dict: @pytest.fixture -def runner( # pylint: disable=redefined-outer-name +def runner( temp_config_files: dict, ) -> MultiProviderEvaluationRunner: """Create a MultiProviderEvaluationRunner instance for testing.""" diff --git a/tests/script/test_compare_evaluations.py b/tests/script/test_compare_evaluations.py index e03bebdb..56bca05c 100755 --- a/tests/script/test_compare_evaluations.py +++ b/tests/script/test_compare_evaluations.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +# pylint: disable=protected-access + """Pytest tests to verify the compare_evaluations.py script works correctly.""" import json @@ -144,9 +146,7 @@ def test_compare_score_distributions_basic( scores1 = [0.8, 0.9, 0.7, 0.85, 0.75, 0.88, 0.82, 0.79, 0.86, 0.81] scores2 = [0.6, 0.65, 0.55, 0.62, 0.58, 0.63, 0.59, 0.61, 0.64, 0.57] - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) # Check structure assert "run1_stats" in result assert "run2_stats" in result @@ -180,9 +180,7 @@ def test_compare_score_distributions_scipy_example( scores1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] scores2 = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0] - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) # The means should be 5.5 and 6.5 respectively assert abs(result["run1_stats"]["mean"] - 5.5) < 0.01 @@ -200,9 +198,7 @@ def test_compare_score_distributions_identical_data( scores1 = [0.8, 0.8, 0.8, 0.8, 0.8] scores2 = [0.8, 0.8, 0.8, 0.8, 0.8] - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) assert result["run1_stats"]["mean"] == result["run2_stats"]["mean"] assert result["mean_difference"] == 0.0 @@ -224,9 +220,7 @@ def test_perform_pass_rate_tests_basic( "total2": 20, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Check that 
tests were performed assert "tests" in comparison @@ -252,9 +246,7 @@ def test_perform_pass_rate_tests_scipy_chisquare_example( "total2": 20, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Verify structure assert "tests" in comparison @@ -289,9 +281,7 @@ def test_perform_pass_rate_tests_edge_cases( "total2": 15, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Should handle gracefully (no tests performed or error recorded) assert "tests" in comparison @@ -303,9 +293,7 @@ def test_check_confidence_interval_overlap_no_overlap( ci1 = {"low": 0.1, "high": 0.3, "mean": 0.2, "confidence_level": 0.95} ci2 = {"low": 0.7, "high": 0.9, "mean": 0.8, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - ci1, ci2 - ) + result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) assert "intervals_overlap" in result assert "significant" in result @@ -319,9 +307,7 @@ def test_check_confidence_interval_overlap_with_overlap( ci1 = {"low": 0.2, "high": 0.6, "mean": 0.4, "confidence_level": 0.95} ci2 = {"low": 0.4, "high": 0.8, "mean": 0.6, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - ci1, ci2 - ) + result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) assert "intervals_overlap" in result assert "significant" in result @@ -332,9 +318,7 @@ def test_check_confidence_interval_overlap_none_inputs( self, comparison_instance: EvaluationComparison ) -> None: """Test _check_confidence_interval_overlap with None inputs.""" - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - None, None - ) + result = comparison_instance._check_confidence_interval_overlap(None, None) assert "test_performed" in result # Should handle None inputs gracefully - might not perform test @@ -345,9 +329,7 @@ def test_check_confidence_interval_overlap_partial_none( """Test _check_confidence_interval_overlap with one None input.""" ci1 = {"low": 0.2, "high": 0.6, "mean": 0.4, "confidence_level": 0.95} - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - ci1, None - ) + result = comparison_instance._check_confidence_interval_overlap(ci1, None) assert "test_performed" in result # Should handle partial None inputs gracefully @@ -360,9 +342,7 @@ def test_compare_score_distributions_known_statistical_results( scores1 = [1.0, 1.1, 1.2, 1.3, 1.4] # Mean ≈ 1.2, low variance scores2 = [2.0, 2.1, 2.2, 2.3, 2.4] # Mean ≈ 2.2, low variance - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) # These should be significantly different assert abs(result["mean_difference"] - 1.0) < 0.01 @@ -394,9 +374,7 @@ def test_perform_pass_rate_tests_known_chi_square_result( "total2": 20, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Verify the chi-square test was performed and has reasonable results if "chi_square" in 
comparison["tests"]: @@ -425,9 +403,7 @@ def test_perform_pass_rate_tests_fisher_exact_small_sample( "total2": 5, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Verify Fisher exact test results if "fisher_exact" in comparison["tests"]: @@ -450,9 +426,7 @@ def test_check_confidence_interval_overlap_exact_boundaries( "confidence_level": 0.95, } - result = comparison_instance._check_confidence_interval_overlap( # pylint: disable=protected-access - ci1, ci2 - ) + result = comparison_instance._check_confidence_interval_overlap(ci1, ci2) # Touching at boundary might be considered overlap or not, depending on implementation assert "intervals_overlap" in result @@ -467,9 +441,7 @@ def test_compare_score_distributions_single_values( scores1 = [0.8] scores2 = [0.6] - result = comparison_instance._compare_score_distributions( # pylint: disable=protected-access - scores1, scores2 - ) + result = comparison_instance._compare_score_distributions(scores1, scores2) # Should handle single values gracefully assert result["run1_stats"]["count"] == 1 @@ -498,9 +470,7 @@ def test_perform_pass_rate_tests_extreme_ratios( "total2": 10, } - comparison_instance._perform_pass_rate_tests( # pylint: disable=protected-access - comparison, test_data - ) + comparison_instance._perform_pass_rate_tests(comparison, test_data) # Should handle extreme cases assert "tests" in comparison diff --git a/tests/script/test_run_multi_provider_eval.py b/tests/script/test_run_multi_provider_eval.py index ef0057dc..a65b235a 100644 --- a/tests/script/test_run_multi_provider_eval.py +++ b/tests/script/test_run_multi_provider_eval.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +# pylint: disable=protected-access,too-few-public-methods + """Pytest tests for run_multi_provider_eval.py script.""" import json @@ -241,9 +243,7 @@ def test_load_valid_yaml( self, runner: MultiProviderEvaluationRunner, temp_config_files: dict[str, Path] ) -> None: """Test loading a valid YAML file.""" - config = runner._load_yaml( # pylint: disable=protected-access - temp_config_files["providers_config"] - ) + config = runner._load_yaml(temp_config_files["providers_config"]) assert isinstance(config, dict) assert "providers" in config assert "openai" in config["providers"] @@ -259,7 +259,7 @@ def test_load_invalid_yaml( f.write("invalid: yaml: content: [") with pytest.raises(ValueError, match="Error parsing YAML file"): - runner._load_yaml(invalid_yaml) # pylint: disable=protected-access + runner._load_yaml(invalid_yaml) def test_load_yaml_non_dict_type( self, runner: MultiProviderEvaluationRunner, tmp_path: Path @@ -270,19 +270,17 @@ def test_load_yaml_non_dict_type( yaml.dump(["item1", "item2", "item3"], f) with pytest.raises(ValueError, match="must be a mapping, got list"): - runner._load_yaml(list_yaml) # pylint: disable=protected-access + runner._load_yaml(list_yaml) -class TestCreateProviderModelConfigs: # pylint: disable=too-few-public-methods +class TestCreateProviderModelConfigs: """Tests for _create_provider_model_configs method.""" def test_create_configs_multiple_providers( self, runner: MultiProviderEvaluationRunner ) -> None: """Test creating configs with multiple providers.""" - configs = ( - runner._create_provider_model_configs() # pylint: disable=protected-access - ) + configs = runner._create_provider_model_configs() assert len(configs) == 3 # 2 openai models + 1 watsonx model @@ -311,11 +309,9 @@ def 
test_llm_config_stays_constant( original_llm_provider = runner.system_config["llm"]["provider"] original_llm_model = runner.system_config["llm"]["model"] - modified = ( - runner._create_modified_system_config( # pylint: disable=protected-access - provider_id="watsonx", - model="ibm/granite-13b-chat-v2", - ) + modified = runner._create_modified_system_config( + provider_id="watsonx", + model="ibm/granite-13b-chat-v2", ) # LLM judge should remain unchanged @@ -348,11 +344,9 @@ def test_api_config_is_modified(self, temp_config_files: dict[str, Path]) -> Non eval_data_path=str(temp_config_files["eval_data"]), ) - modified = ( - runner._create_modified_system_config( # pylint: disable=protected-access - provider_id="watsonx", - model="ibm/granite-13b-chat-v2", - ) + modified = runner._create_modified_system_config( + provider_id="watsonx", + model="ibm/granite-13b-chat-v2", ) # API config should be modified with provider and model only @@ -372,11 +366,9 @@ def test_create_temp_config_file( self, runner: MultiProviderEvaluationRunner ) -> None: """Test that a temporary config file is created.""" - temp_path = ( - runner._create_temp_system_config( # pylint: disable=protected-access - provider_id="openai", - model="gpt-4o-mini", - ) + temp_path = runner._create_temp_system_config( + provider_id="openai", + model="gpt-4o-mini", ) try: @@ -424,7 +416,7 @@ def track_temp_file(*args: Any, **kwargs: Any) -> Any: side_effect=Exception("YAML dump failed"), ): with pytest.raises(Exception, match="YAML dump failed"): - runner._create_temp_system_config( # pylint: disable=protected-access + runner._create_temp_system_config( provider_id="openai", model="gpt-4o-mini", ) @@ -441,11 +433,9 @@ def test_temp_config_sanitizes_special_characters( self, runner: MultiProviderEvaluationRunner ) -> None: """Test that special characters in provider_id and model are sanitized.""" - temp_path = ( - runner._create_temp_system_config( # pylint: disable=protected-access - provider_id="open..ai//test", - model="gpt:4o-mini/special", - ) + temp_path = runner._create_temp_system_config( + provider_id="open..ai//test", + model="gpt:4o-mini/special", ) try: @@ -483,7 +473,7 @@ def test_path_traversal_blocked_in_provider_id( return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, ): # Attempt path traversal in provider_id - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( provider_name="malicious", provider_id="../../etc", model="test", @@ -510,7 +500,7 @@ def test_path_traversal_blocked_in_model( return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, ): # Attempt path traversal in model - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( provider_name="openai", provider_id="openai", model="../../../etc/passwd", @@ -540,7 +530,7 @@ def test_run_single_evaluation_success( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2, "ERROR": 0}, ) as mock_run_eval: - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -560,7 +550,7 @@ def test_run_single_evaluation_failure( """Test evaluation failure handling.""" # Mock run_evaluation to return None (failure) with patch("script.run_multi_provider_eval.run_evaluation", return_value=None): - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( 
provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -578,7 +568,7 @@ def test_run_single_evaluation_invalid_summary( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2}, # Missing ERROR key ): - result = runner._run_single_evaluation( # pylint: disable=protected-access + result = runner._run_single_evaluation( provider_name="openai", provider_id="openai", model="gpt-4o-mini", @@ -589,7 +579,7 @@ def test_run_single_evaluation_invalid_summary( assert "summary" not in result -class TestRunEvaluations: # pylint: disable=too-few-public-methods +class TestRunEvaluations: """Tests for run_evaluations method.""" def test_run_evaluations_sequential( @@ -614,7 +604,7 @@ def test_run_evaluations_sequential( assert mock_single_eval.call_count == 3 -class TestGenerateSummary: # pylint: disable=too-few-public-methods +class TestGenerateSummary: """Tests for generate_summary method.""" def test_generate_summary_mixed_results( @@ -675,9 +665,7 @@ def test_percentage_to_decimal_conversion( self, runner: MultiProviderEvaluationRunner, sample_evaluation_summary: dict ) -> None: """Test that percentage rates (80.0) convert to decimals (0.8).""" - stats = runner._analyze_single_model( # pylint: disable=protected-access - "test/model", sample_evaluation_summary - ) + stats = runner._analyze_single_model("test/model", sample_evaluation_summary) # Verify percentage conversion assert abs(stats["overall"]["pass_rate"] - 0.8) < 0.01 @@ -686,15 +674,11 @@ def test_percentage_to_decimal_conversion( def test_composite_score(self, runner: MultiProviderEvaluationRunner) -> None: """Test composite score calculation.""" # Perfect model should get score of 1.0 - perfect = runner._calculate_composite_score( # pylint: disable=protected-access - 1.0, 0.0, 1.0, 1.0 - ) + perfect = runner._calculate_composite_score(1.0, 0.0, 1.0, 1.0) assert abs(perfect - 1.0) < 0.0001 # Poor model should get score of 0.0 - poor = runner._calculate_composite_score( # pylint: disable=protected-access - 0.0, 1.0, 0.0, 0.0 - ) + poor = runner._calculate_composite_score(0.0, 1.0, 0.0, 0.0) assert poor == 0.0 def test_model_ranking(self, runner: MultiProviderEvaluationRunner) -> None: diff --git a/tests/unit/core/api/test_client.py b/tests/unit/core/api/test_client.py index caa7d2b3..67117604 100644 --- a/tests/unit/core/api/test_client.py +++ b/tests/unit/core/api/test_client.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access + """Unit tests for core API client module.""" from pathlib import Path @@ -248,9 +250,7 @@ def test_handle_response_errors_non_200( mock_response.read.return_value = b'{"detail": "Not found"}' with pytest.raises(httpx.HTTPStatusError): - client._handle_response_errors( # pylint: disable=protected-access - mock_response - ) + client._handle_response_errors(mock_response) def test_extract_error_message_with_detail( self, api_config: APIConfig, mocker: MockerFixture @@ -263,9 +263,7 @@ def test_extract_error_message_with_detail( mock_response = mocker.Mock() mock_response.read.return_value = b'{"detail": "Error message"}' - error_msg = client._extract_error_message( # pylint: disable=protected-access - mock_response - ) + error_msg = client._extract_error_message(mock_response) assert "Error message" in error_msg def test_extract_error_message_with_nested_detail( @@ -281,9 +279,7 @@ def test_extract_error_message_with_nested_detail( b'{"detail": {"response": "Error", "cause": "Reason"}}' ) - error_msg = client._extract_error_message( # pylint: disable=protected-access - 
mock_response - ) + error_msg = client._extract_error_message(mock_response) assert "Error" in error_msg assert "Reason" in error_msg @@ -405,9 +401,7 @@ def test_prepare_request_basic( mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client") client = APIClient(basic_api_config) - request = client._prepare_request( # pylint: disable=protected-access - "What is Python?" - ) + request = client._prepare_request("What is Python?") assert request.query == "What is Python?" assert request.provider == "openai" @@ -420,9 +414,7 @@ def test_prepare_request_with_conversation_id( mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client") client = APIClient(basic_api_config) - request = client._prepare_request( # pylint: disable=protected-access - "Follow-up", conversation_id="conv_123" - ) + request = client._prepare_request("Follow-up", conversation_id="conv_123") assert request.query == "Follow-up" assert request.conversation_id == "conv_123" @@ -434,7 +426,7 @@ def test_prepare_request_with_attachments( mocker.patch("lightspeed_evaluation.core.api.client.httpx.Client") client = APIClient(basic_api_config) - request = client._prepare_request( # pylint: disable=protected-access + request = client._prepare_request( "Analyze this", attachments=["file1.txt", "file2.pdf"] ) @@ -478,15 +470,11 @@ def test_get_cache_key_generates_consistent_hash( client = APIClient(config) # Create identical requests - request1 = client._prepare_request( # pylint: disable=protected-access - "test query" - ) - request2 = client._prepare_request( # pylint: disable=protected-access - "test query" - ) + request1 = client._prepare_request("test query") + request2 = client._prepare_request("test query") - key1 = client._get_cache_key(request1) # pylint: disable=protected-access - key2 = client._get_cache_key(request2) # pylint: disable=protected-access + key1 = client._get_cache_key(request1) + key2 = client._get_cache_key(request2) # Same request should generate same cache key assert key1 == key2 diff --git a/tests/unit/core/llm/test_custom.py b/tests/unit/core/llm/test_custom.py index bbd9d3ca..774ce120 100644 --- a/tests/unit/core/llm/test_custom.py +++ b/tests/unit/core/llm/test_custom.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access,disable=too-few-public-methods + """Unit tests for custom LLM classes.""" import pytest @@ -7,7 +9,7 @@ from lightspeed_evaluation.core.system.exceptions import LLMError -class TestTokenTracker: # pylint: disable=too-few-public-methods +class TestTokenTracker: """Tests for TokenTracker.""" def test_token_callback_accumulates_tokens(self, mocker: MockerFixture) -> None: @@ -20,9 +22,7 @@ def test_token_callback_accumulates_tokens(self, mocker: MockerFixture) -> None: mock_response.usage.prompt_tokens = 10 mock_response.usage.completion_tokens = 20 - tracker._token_callback( # pylint: disable=protected-access - {}, mock_response, 0.0, 0.0 - ) + tracker._token_callback({}, mock_response, 0.0, 0.0) input_tokens, output_tokens = tracker.get_counts() assert input_tokens == 10 diff --git a/tests/unit/core/metrics/conftest.py b/tests/unit/core/metrics/conftest.py index 6938d5ff..8018a2d2 100644 --- a/tests/unit/core/metrics/conftest.py +++ b/tests/unit/core/metrics/conftest.py @@ -1,3 +1,5 @@ +# pylint: disable=redefined-outer-name + """Pytest configuration and fixtures for metrics tests.""" import sys @@ -67,7 +69,7 @@ def sample_turn_data() -> TurnData: @pytest.fixture -def sample_scope( # pylint: disable=redefined-outer-name +def sample_scope( sample_turn_data: TurnData, ) -> 
EvaluationScope: """Create sample EvaluationScope for turn-level evaluation.""" @@ -79,7 +81,7 @@ def sample_scope( # pylint: disable=redefined-outer-name @pytest.fixture -def conversation_scope( # pylint: disable=redefined-outer-name +def conversation_scope( sample_turn_data: TurnData, ) -> EvaluationScope: """Create sample EvaluationScope for conversation-level evaluation.""" diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py index 79ac617f..d228b9f4 100644 --- a/tests/unit/core/metrics/test_geval.py +++ b/tests/unit/core/metrics/test_geval.py @@ -1,3 +1,5 @@ +# pylint: disable=too-many-public-methods,protected-access + """Tests for GEval metrics handler.""" from unittest.mock import MagicMock, patch @@ -9,7 +11,7 @@ from lightspeed_evaluation.core.metrics.manager import MetricLevel -class TestGEvalHandler: # pylint: disable=too-many-public-methods +class TestGEvalHandler: """Test cases for GEvalHandler class.""" @pytest.fixture @@ -50,9 +52,7 @@ def test_initialization( def test_convert_evaluation_params_field_names(self, handler: GEvalHandler) -> None: """Test conversion of evaluation data field names to LLMTestCaseParams enum.""" params = ["query", "response", "expected_response"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) assert result is not None assert len(result) == 3 @@ -65,9 +65,7 @@ def test_convert_evaluation_params_with_contexts( ) -> None: """Test conversion including contexts and retrieval_context fields.""" params = ["query", "response", "contexts", "retrieval_context"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) assert result is not None assert len(result) == 4 @@ -81,9 +79,7 @@ def test_convert_evaluation_params_enum_values_backward_compat( ) -> None: """Test conversion with direct enum value strings (backward compatibility).""" params = ["INPUT", "ACTUAL_OUTPUT", "EXPECTED_OUTPUT"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) assert result is not None assert len(result) == 3 @@ -96,9 +92,7 @@ def test_convert_evaluation_params_invalid_returns_none( ) -> None: """Test that invalid params return None to allow GEval auto-detection.""" params = ["invalid_param", "another_invalid"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) assert result is None @@ -106,9 +100,7 @@ def test_convert_evaluation_params_empty_returns_none( self, handler: GEvalHandler ) -> None: """Test that empty params list returns None.""" - result = handler._convert_evaluation_params( # pylint: disable=protected-access - [] - ) + result = handler._convert_evaluation_params([]) assert result is None def test_convert_evaluation_params_mixed_invalid_returns_none( @@ -116,9 +108,7 @@ def test_convert_evaluation_params_mixed_invalid_returns_none( ) -> None: """Test that any invalid param causes None return.""" params = ["query", "invalid_param", "response"] - result = handler._convert_evaluation_params( # pylint: disable=protected-access - params - ) + result = handler._convert_evaluation_params(params) # Should return None because of the invalid param assert result is None @@ -135,7 +125,7 @@ def test_get_geval_config_uses_metric_manager( 
mock_metric_manager.get_metric_metadata.return_value = expected_config conv_data = MagicMock() - config = handler._get_geval_config( # pylint: disable=protected-access + config = handler._get_geval_config( metric_name="test_metric", conv_data=conv_data, turn_data=None, @@ -160,7 +150,7 @@ def test_get_geval_config_turn_level( conv_data = MagicMock() turn_data = MagicMock() - config = handler._get_geval_config( # pylint: disable=protected-access + config = handler._get_geval_config( metric_name="turn_metric", conv_data=conv_data, turn_data=turn_data, @@ -182,7 +172,7 @@ def test_get_geval_config_returns_none_when_not_found( mock_metric_manager.get_metric_metadata.return_value = None conv_data = MagicMock() - config = handler._get_geval_config( # pylint: disable=protected-access + config = handler._get_geval_config( metric_name="nonexistent_metric", conv_data=conv_data, turn_data=None, diff --git a/tests/unit/core/metrics/test_manager.py b/tests/unit/core/metrics/test_manager.py index 756f2c8e..d525c80c 100644 --- a/tests/unit/core/metrics/test_manager.py +++ b/tests/unit/core/metrics/test_manager.py @@ -1,3 +1,5 @@ +# pylint: disable=too-many-public-methods + """Unit tests for core metrics manager module.""" from lightspeed_evaluation.core.metrics.manager import MetricLevel, MetricManager @@ -8,7 +10,7 @@ ) -class TestMetricManager: # pylint: disable=too-many-public-methods +class TestMetricManager: """Unit tests for MetricManager.""" def test_resolve_metrics_with_none_uses_defaults( diff --git a/tests/unit/core/metrics/test_nlp.py b/tests/unit/core/metrics/test_nlp.py index 453cb27c..5e2ac427 100644 --- a/tests/unit/core/metrics/test_nlp.py +++ b/tests/unit/core/metrics/test_nlp.py @@ -1,3 +1,5 @@ +# pylint: disable=too-many-arguments,too-many-positional-arguments,disable=too-few-public-methods + """Tests for NLP metrics module. 
This module tests the NLP-based evaluation metrics: @@ -26,7 +28,7 @@ from lightspeed_evaluation.core.system.exceptions import MetricError -class TestNLPMetricsInit: # pylint: disable=too-few-public-methods +class TestNLPMetricsInit: """Test NLPMetrics initialization.""" def test_initialization(self, nlp_metrics: NLPMetrics) -> None: @@ -292,7 +294,7 @@ def test_bleu_failure_raises_metric_error( ), ], ) - def test_ragas_metric_failure_raises_metric_error( # pylint: disable=too-many-arguments,too-many-positional-arguments + def test_ragas_metric_failure_raises_metric_error( self, nlp_metrics: NLPMetrics, sample_scope: EvaluationScope, diff --git a/tests/unit/core/output/test_final_coverage.py b/tests/unit/core/output/test_final_coverage.py index bbf9c8e8..b482fbd4 100644 --- a/tests/unit/core/output/test_final_coverage.py +++ b/tests/unit/core/output/test_final_coverage.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access,too-few-public-methods + """Additional tests to boost coverage towards 75%.""" from pathlib import Path @@ -97,7 +99,7 @@ def test_calculate_stats_with_single_result(self, tmp_path: Path) -> None: ) ] - stats = handler._calculate_stats(results) # pylint: disable=protected-access + stats = handler._calculate_stats(results) assert stats["basic"]["TOTAL"] == 1 assert stats["basic"]["PASS"] == 1 @@ -123,9 +125,7 @@ def test_generate_csv_with_minimal_columns( ) ] - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - results, "test" - ) + csv_file = handler._generate_csv_report(results, "test") assert csv_file.exists() content = csv_file.read_text() @@ -134,7 +134,7 @@ def test_generate_csv_with_minimal_columns( assert "PASS" in content -class TestSystemLoaderEdgeCases: # pylint: disable=too-few-public-methods +class TestSystemLoaderEdgeCases: """Edge case tests for system loader.""" def test_validate_metrics_with_mixed_valid_invalid(self) -> None: diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py index 5b4b2ef8..a6252e8b 100644 --- a/tests/unit/core/output/test_generator.py +++ b/tests/unit/core/output/test_generator.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access + """Unit tests for output generator.""" import json @@ -26,9 +28,7 @@ def test_calculate_stats_with_results( ) -> None: """Test statistics calculation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) + stats = handler._calculate_stats(sample_results) assert stats["basic"]["TOTAL"] == 2 assert stats["basic"]["PASS"] == 1 @@ -38,7 +38,7 @@ def test_calculate_stats_with_results( def test_calculate_stats_empty(self, tmp_path: Path) -> None: """Test statistics with empty results.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats([]) # pylint: disable=protected-access + stats = handler._calculate_stats([]) assert stats["basic"]["TOTAL"] == 0 assert not stats["detailed"]["by_metric"] @@ -55,9 +55,7 @@ def test_generate_csv_report( system_config=mock_system_config, ) - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - sample_results, "test" - ) + csv_file = handler._generate_csv_report(sample_results, "test") assert csv_file.exists() assert csv_file.suffix == ".csv" @@ -72,9 +70,7 @@ def test_generate_json_summary( ) -> None: """Test JSON summary generation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats( # pylint: disable=protected-access - 
sample_results - ) + stats = handler._calculate_stats(sample_results) api_tokens = { "total_api_input_tokens": 100, "total_api_output_tokens": 200, @@ -82,7 +78,7 @@ def test_generate_json_summary( } streaming_stats: dict = {} - json_file = handler._generate_json_summary( # pylint: disable=protected-access + json_file = handler._generate_json_summary( sample_results, "test", stats["basic"], @@ -108,9 +104,7 @@ def test_generate_text_summary( ) -> None: """Test text summary generation.""" handler = OutputHandler(output_dir=str(tmp_path)) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) + stats = handler._calculate_stats(sample_results) api_tokens = { "total_api_input_tokens": 100, "total_api_output_tokens": 200, @@ -118,7 +112,7 @@ def test_generate_text_summary( } streaming_stats: dict = {} - txt_file = handler._generate_text_summary( # pylint: disable=protected-access + txt_file = handler._generate_text_summary( sample_results, "test", stats["basic"], @@ -202,13 +196,9 @@ def test_generate_individual_reports_csv_only( config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) + stats = handler._calculate_stats(sample_results) - handler._generate_individual_reports( # pylint: disable=protected-access - sample_results, "test", ["csv"], stats - ) + handler._generate_individual_reports(sample_results, "test", ["csv"], stats) assert (tmp_path / "test_detailed.csv").exists() @@ -225,13 +215,9 @@ def test_generate_individual_reports_json_only( config.model_fields.keys.return_value = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) + stats = handler._calculate_stats(sample_results) - handler._generate_individual_reports( # pylint: disable=protected-access - sample_results, "test", ["json"], stats - ) + handler._generate_individual_reports(sample_results, "test", ["json"], stats) assert (tmp_path / "test_summary.json").exists() @@ -248,12 +234,8 @@ def test_generate_individual_reports_txt_only( config.model_fields.keys.return_value = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - stats = handler._calculate_stats( # pylint: disable=protected-access - sample_results - ) - handler._generate_individual_reports( # pylint: disable=protected-access - sample_results, "test", ["txt"], stats - ) + stats = handler._calculate_stats(sample_results) + handler._generate_individual_reports(sample_results, "test", ["txt"], stats) assert (tmp_path / "test_summary.txt").exists() @@ -279,9 +261,7 @@ def test_csv_with_all_columns( config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=config) - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - sample_results, "test" - ) + csv_file = handler._generate_csv_report(sample_results, "test") content = csv_file.read_text() assert "query" in content @@ -409,9 +389,7 @@ def test_generate_csv_with_specific_results( mocker.patch("builtins.print") handler = OutputHandler(output_dir=str(tmp_path)) - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - results, "test_eval" - ) + csv_file = handler._generate_csv_report(results, "test_eval") assert csv_file.exists() assert csv_file.suffix == ".csv" @@ -461,9 +439,7 @@ def test_csv_columns_configuration( 
system_config.visualization.enabled_graphs = [] handler = OutputHandler(output_dir=str(tmp_path), system_config=system_config) - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - results, "test_eval" - ) + csv_file = handler._generate_csv_report(results, "test_eval") with open(csv_file, encoding="utf-8") as f: reader = csv_module.reader(f) @@ -487,9 +463,7 @@ def test_filename_timestamp_format( ) mock_datetime.now.return_value.strftime.return_value = "20240101_120000" - csv_file = handler._generate_csv_report( # pylint: disable=protected-access - results, "test_20240101_120000" - ) + csv_file = handler._generate_csv_report(results, "test_20240101_120000") assert "test_20240101_120000" in csv_file.name assert csv_file.suffix == ".csv" diff --git a/tests/unit/core/system/test_validator.py b/tests/unit/core/system/test_validator.py index 95799a6d..1a9a0c33 100644 --- a/tests/unit/core/system/test_validator.py +++ b/tests/unit/core/system/test_validator.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access + """Unit tests for core system validator module.""" import tempfile @@ -55,11 +57,7 @@ def test_validate_evaluation_data_valid(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is True assert len(validator.validation_errors) == 0 @@ -84,11 +82,7 @@ def test_validate_metrics_availability_unknown_turn_metric( ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert len(validator.validation_errors) > 0 @@ -114,11 +108,7 @@ def test_validate_metrics_availability_unknown_conversation_metric( conversation_metrics=["unknown:conversation_metric"], ) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any( @@ -138,11 +128,7 @@ def test_validate_metric_requirements_missing_response(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any("response" in error.lower() for error in validator.validation_errors) @@ -160,11 +146,7 @@ def test_validate_metric_requirements_missing_contexts(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any("contexts" in error.lower() for error in validator.validation_errors) @@ -189,11 +171,7 @@ def test_validate_metric_requirements_api_enabled_allows_missing_response( ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) # Should pass because API will populate response assert result is True @@ -212,11 
+190,7 @@ def test_validate_metric_requirements_expected_response_missing(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any( @@ -238,11 +212,7 @@ def test_validate_metric_requirements_tool_eval_missing_fields(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any( @@ -271,11 +241,7 @@ def test_validate_metric_requirements_skip_script_when_api_disabled( conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) # Should not validate script requirements when API disabled - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) # Should pass because script validation is skipped assert result is True @@ -373,11 +339,7 @@ def test_check_metric_requirements_missing_contexts(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False assert any("contexts" in error.lower() for error in validator.validation_errors) @@ -394,11 +356,7 @@ def test_check_metric_requirements_whitespace_only_string(self) -> None: ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv_data] - ) - ) + result = validator._validate_evaluation_data([conv_data]) assert result is False @@ -412,11 +370,7 @@ def test_validate_multiple_conversations(self) -> None: conv1 = EvaluationData(conversation_group_id="conv1", turns=[turn1]) conv2 = EvaluationData(conversation_group_id="conv2", turns=[turn2]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv1, conv2] - ) - ) + result = validator._validate_evaluation_data([conv1, conv2]) assert result is True @@ -447,11 +401,7 @@ def test_validate_evaluation_data_accumulates_errors( conv = EvaluationData(conversation_group_id="test", turns=[turn1, turn2]) - result = ( - validator._validate_evaluation_data( # pylint: disable=protected-access - [conv] - ) - ) + result = validator._validate_evaluation_data([conv]) assert result is False # Should have errors for both issues @@ -474,7 +424,7 @@ def test_filter_by_scope_no_filter(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope(data) # pylint: disable=protected-access + result = validator._filter_by_scope(data) assert len(result) == 2 def test_filter_by_scope_tags_only(self) -> None: @@ -497,9 +447,7 @@ def test_filter_by_scope_tags_only(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope( # pylint: disable=protected-access - data, tags=["basic"] - ) + result = validator._filter_by_scope(data, tags=["basic"]) assert len(result) == 2 assert all(c.tag == "basic" for c in result) @@ -520,9 +468,7 @@ def 
test_filter_by_scope_conv_ids_only(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope( # pylint: disable=protected-access - data, conv_ids=["conv_1", "conv_3"] - ) + result = validator._filter_by_scope(data, conv_ids=["conv_1", "conv_3"]) assert len(result) == 2 assert {c.conversation_group_id for c in result} == {"conv_1", "conv_3"} @@ -546,9 +492,7 @@ def test_filter_by_scope_tags_and_conv_ids(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope( # pylint: disable=protected-access - data, tags=["basic"], conv_ids=["conv_3"] - ) + result = validator._filter_by_scope(data, tags=["basic"], conv_ids=["conv_3"]) assert len(result) == 2 # conv_1 (basic tag) + conv_3 (by ID) def test_filter_by_scope_no_match_returns_empty(self) -> None: @@ -561,7 +505,5 @@ def test_filter_by_scope_no_match_returns_empty(self) -> None: turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] - result = validator._filter_by_scope( # pylint: disable=protected-access - data, tags=["nonexistent"] - ) + result = validator._filter_by_scope(data, tags=["nonexistent"]) assert len(result) == 0 diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py index a1131cd5..09a2e18c 100644 --- a/tests/unit/pipeline/evaluation/conftest.py +++ b/tests/unit/pipeline/evaluation/conftest.py @@ -1,3 +1,5 @@ +# pylint: disable=redefined-outer-name + """Pytest configuration and fixtures for evaluation tests.""" import pytest @@ -198,11 +200,11 @@ def create_error_result_side_effect( @pytest.fixture def processor_components_pr( - mock_metrics_evaluator: MetricsEvaluator, # pylint: disable=redefined-outer-name - mock_api_amender: APIDataAmender, # pylint: disable=redefined-outer-name - mock_error_handler: EvaluationErrorHandler, # pylint: disable=redefined-outer-name - mock_metric_manager: MetricManager, # pylint: disable=redefined-outer-name - mock_script_manager: ScriptExecutionManager, # pylint: disable=redefined-outer-name + mock_metrics_evaluator: MetricsEvaluator, + mock_api_amender: APIDataAmender, + mock_error_handler: EvaluationErrorHandler, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, ) -> ProcessorComponents: """Create processor components fixture for PR tests.""" return ProcessorComponents( @@ -216,8 +218,8 @@ def processor_components_pr( @pytest.fixture def processor( - config_loader: ConfigLoader, # pylint: disable=redefined-outer-name - processor_components_pr: ProcessorComponents, # pylint: disable=redefined-outer-name + config_loader: ConfigLoader, + processor_components_pr: ProcessorComponents, ) -> ConversationProcessor: """Create ConversationProcessor instance for PR tests.""" return ConversationProcessor(config_loader, processor_components_pr) diff --git a/tests/unit/pipeline/evaluation/test_evaluator.py b/tests/unit/pipeline/evaluation/test_evaluator.py index 01d44109..92061aba 100644 --- a/tests/unit/pipeline/evaluation/test_evaluator.py +++ b/tests/unit/pipeline/evaluation/test_evaluator.py @@ -1,3 +1,5 @@ +# pylint: disable=protected-access,redefined-outer-name,too-many-arguments,too-many-positional-arguments + """Unit tests for pipeline evaluation evaluator module.""" import pytest @@ -428,20 +430,11 @@ def test_determine_status_with_threshold( ) # Test PASS - assert ( - evaluator._determine_status(0.8, 0.7) # pylint: disable=protected-access - == "PASS" - ) - assert ( - evaluator._determine_status(0.7, 
0.7) # pylint: disable=protected-access - == "PASS" - ) # Equal passes + assert evaluator._determine_status(0.8, 0.7) == "PASS" + assert evaluator._determine_status(0.7, 0.7) == "PASS" # Equal passes # Test FAIL - assert ( - evaluator._determine_status(0.6, 0.7) # pylint: disable=protected-access - == "FAIL" - ) + assert evaluator._determine_status(0.6, 0.7) == "FAIL" def test_determine_status_without_threshold( self, @@ -471,16 +464,10 @@ def test_determine_status_without_threshold( ) # Should use 0.5 as default - assert ( - evaluator._determine_status(0.6, None) # pylint: disable=protected-access - == "PASS" - ) - assert ( - evaluator._determine_status(0.4, None) # pylint: disable=protected-access - == "FAIL" - ) + assert evaluator._determine_status(0.6, None) == "PASS" + assert evaluator._determine_status(0.4, None) == "FAIL" - def _setup_evaluate_test( # pylint: disable=too-many-arguments, too-many-positional-arguments + def _setup_evaluate_test( self, config_loader: ConfigLoader, mock_metric_manager: MetricManager, @@ -558,7 +545,7 @@ def create_mock_handler( # type: ignore[no-untyped-def] "metric_identifier", ["ragas:context_recall", "custom:answer_correctness", "nlp:rouge"], ) - def test_evaluate_with_expected_response_list( # pylint: disable=too-many-arguments, too-many-positional-arguments + def test_evaluate_with_expected_response_list( self, config_loader: ConfigLoader, mock_metric_manager: MetricManager, @@ -586,9 +573,7 @@ def test_evaluate_with_expected_response_list( # pylint: disable=too-many-argum request = EvaluationRequest.for_turn(conv_data, metric_identifier, 0, turn_data) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access - request, scope, 0.7 - ) + metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) assert metric_result.score == 0.85 assert metric_result.reason == "High score" @@ -628,9 +613,7 @@ def test_evaluate_with_expected_response_list_fail( ) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access - request, scope, 0.7 - ) + metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) reason_combined = "\n".join( [f"{score}; {reason}" for score, reason in scores_reasons] ) @@ -665,9 +648,7 @@ def test_evaluate_with_expected_response_string( ) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access - request, scope, 0.7 - ) + metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) assert metric_result.score == 0.85 assert metric_result.reason == "Good score" @@ -682,7 +663,7 @@ def test_evaluate_with_expected_response_string( [None, "string", ["string1", "string2"]], ids=["none", "string", "string_list"], ) - def test_evaluate_with_expected_response_not_needed( # pylint: disable=too-many-arguments, too-many-positional-arguments + def test_evaluate_with_expected_response_not_needed( self, config_loader: ConfigLoader, mock_metric_manager: MetricManager, @@ -711,9 +692,7 @@ def test_evaluate_with_expected_response_not_needed( # pylint: disable=too-many request = EvaluationRequest.for_turn(conv_data, metric_identifier, 0, turn_data) scope = EvaluationScope(turn_idx=0, turn_data=turn_data, is_conversation=False) - metric_result = evaluator._evaluate_wrapper( # pylint: disable=protected-access - request, scope, 0.7 - ) + 
metric_result = evaluator._evaluate_wrapper(request, scope, 0.7) assert metric_result.score == 0.3 assert metric_result.reason == "Low score" @@ -753,16 +732,16 @@ def test_token_tracker_start_stop(self) -> None: """Test start and stop methods.""" tracker = TokenTracker() tracker.start() - assert tracker._callback_registered is True # pylint: disable=protected-access + assert tracker._callback_registered is True tracker.stop() - assert tracker._callback_registered is False # pylint: disable=protected-access + assert tracker._callback_registered is False def test_token_tracker_double_start(self) -> None: """Test calling start twice doesn't register callback twice.""" tracker = TokenTracker() tracker.start() tracker.start() # Should not fail - assert tracker._callback_registered is True # pylint: disable=protected-access + assert tracker._callback_registered is True tracker.stop() def test_token_tracker_double_stop(self) -> None: @@ -771,7 +750,7 @@ def test_token_tracker_double_stop(self) -> None: tracker.start() tracker.stop() tracker.stop() # Should not fail - assert tracker._callback_registered is False # pylint: disable=protected-access + assert tracker._callback_registered is False def test_token_tracker_independent_instances(self) -> None: """Test multiple TokenTracker instances are independent.""" diff --git a/tests/unit/pipeline/evaluation/test_processor.py b/tests/unit/pipeline/evaluation/test_processor.py index 16b3d4f5..6b18ef84 100644 --- a/tests/unit/pipeline/evaluation/test_processor.py +++ b/tests/unit/pipeline/evaluation/test_processor.py @@ -1,3 +1,5 @@ +# pylint: disable=unused-argument,protected-access,too-many-arguments, too-many-positional-arguments + """Unit tests for ConversationProcessor.""" from typing import Callable @@ -44,7 +46,7 @@ def test_process_conversation_skips_when_no_metrics( mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, sample_conv_data: EvaluationData, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing skips when no metrics specified.""" # Mock metric manager to return empty lists @@ -60,7 +62,7 @@ def test_process_conversation_turn_metrics( mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, sample_conv_data: EvaluationData, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing with turn-level metrics.""" @@ -99,7 +101,7 @@ def test_process_conversation_conversation_metrics( self, mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing with conversation-level metrics.""" @@ -144,7 +146,7 @@ def test_process_conversation_with_setup_script_success( mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, sample_conv_data: EvaluationData, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing with successful setup script.""" @@ -193,7 +195,7 @@ def test_process_conversation_with_setup_script_failure( mock_config_loader: ConfigLoader, processor_components: ProcessorComponents, sample_conv_data: EvaluationData, - mocker: MockerFixture, # pylint: disable=unused-argument + mocker: MockerFixture, ) -> None: """Test processing handles setup script failure.""" sample_conv_data.setup_script = "setup.sh" @@ -215,7 +217,7 @@ def test_process_conversation_with_cleanup_script( 
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
         sample_conv_data: EvaluationData,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test cleanup script is always called."""
 
@@ -265,7 +267,7 @@ def test_process_conversation_with_api_amendment(
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
         sample_conv_data: EvaluationData,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test API amendment during turn processing."""
 
@@ -311,7 +313,7 @@ def test_process_conversation_with_api_error_cascade(
         self,
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test API error causes cascade failure."""
         assert mock_config_loader.system_config is not None
@@ -354,7 +356,7 @@ def test_evaluate_turn(
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
         sample_conv_data: EvaluationData,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test _evaluate_turn method."""
 
@@ -372,7 +374,7 @@ def test_evaluate_turn(
         )
 
         processor = ConversationProcessor(mock_config_loader, processor_components)
-        results = processor._evaluate_turn(  # pylint: disable=protected-access
+        results = processor._evaluate_turn(
             sample_conv_data, 0, sample_conv_data.turns[0], ["ragas:faithfulness"]
         )
 
@@ -384,7 +386,7 @@ def test_evaluate_conversation(
         mock_config_loader: ConfigLoader,
         processor_components: ProcessorComponents,
         sample_conv_data: EvaluationData,
-        mocker: MockerFixture,  # pylint: disable=unused-argument
+        mocker: MockerFixture,
     ) -> None:
         """Test _evaluate_conversation method."""
 
@@ -402,7 +404,7 @@ def test_evaluate_conversation(
         )
 
         processor = ConversationProcessor(mock_config_loader, processor_components)
-        results = processor._evaluate_conversation(  # pylint: disable=protected-access
+        results = processor._evaluate_conversation(
             sample_conv_data, ["deepeval:conversation_completeness"]
         )
 
@@ -421,9 +423,7 @@ def test_run_setup_script_skips_when_api_disabled(
         mock_config_loader.system_config.api.enabled = False
 
         processor = ConversationProcessor(mock_config_loader, processor_components)
-        error = processor._run_setup_script(  # pylint: disable=protected-access
-            sample_conv_data
-        )
+        error = processor._run_setup_script(sample_conv_data)
 
         assert error is None
         processor_components.script_manager.run_script.assert_not_called()
@@ -440,9 +440,7 @@ def test_run_cleanup_script_skips_when_api_disabled(
         mock_config_loader.system_config.api.enabled = False
 
         processor = ConversationProcessor(mock_config_loader, processor_components)
-        processor._run_cleanup_script(  # pylint: disable=protected-access
-            sample_conv_data
-        )
+        processor._run_cleanup_script(sample_conv_data)
 
         processor_components.script_manager.run_script.assert_not_called()
 
@@ -461,9 +459,7 @@ def test_run_cleanup_script_logs_warning_on_failure(
         processor = ConversationProcessor(mock_config_loader, processor_components)
 
         # Should not raise, just log warning
-        processor._run_cleanup_script(  # pylint: disable=protected-access
-            sample_conv_data
-        )
+        processor._run_cleanup_script(sample_conv_data)
 
     def test_get_metrics_summary(
         self,
@@ -501,9 +497,7 @@ def test_evaluate_turn_with_valid_metrics(
 
         turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"]
 
-        results = processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should evaluate both metrics
         assert len(results) == 2
@@ -539,9 +533,7 @@ def test_evaluate_turn_with_invalid_metric(
         turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"]
 
         with caplog.at_level(logging.ERROR):
-            results = processor._evaluate_turn(  # pylint: disable=protected-access
-                conv_data, 0, turn_data, turn_metrics
-            )
+            results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should get 2 results: 1 ERROR for invalid metric, 1 PASS for valid metric
         assert len(results) == 2
@@ -584,9 +576,7 @@ def test_evaluate_turn_with_all_invalid_metrics(
         turn_metrics = ["ragas:faithfulness", "custom:answer_correctness"]
 
         with caplog.at_level(logging.ERROR):
-            results = processor._evaluate_turn(  # pylint: disable=protected-access
-                conv_data, 0, turn_data, turn_metrics
-            )
+            results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should return ERROR results for both invalid metrics
         assert len(results) == 2
@@ -627,9 +617,7 @@ def test_evaluate_turn_with_mixed_valid_invalid_metrics(
         ]
 
         with caplog.at_level(logging.ERROR):
-            results = processor._evaluate_turn(  # pylint: disable=protected-access
-                conv_data, 0, turn_data, turn_metrics
-            )
+            results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should get 3 results: 2 valid metrics (PASS) and 1 invalid metric (ERROR)
         assert len(results) == 3
@@ -659,9 +647,7 @@ def test_evaluate_turn_with_empty_metrics(
 
         turn_metrics: list[str] = []
 
-        results = processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should return empty results
         assert len(results) == 0
@@ -683,9 +669,7 @@ def test_evaluate_turn_creates_correct_request(
 
         turn_metrics = ["ragas:faithfulness"]
 
-        processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Verify the request structure
         assert mock_metrics_evaluator.evaluate_metric.call_count == 1
@@ -714,9 +698,7 @@ def test_evaluate_turn_handles_evaluator_returning_none(
 
         turn_metrics = ["ragas:faithfulness"]
 
-        results = processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        results = processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Should return empty results when evaluator returns None
         assert len(results) == 0
@@ -740,9 +722,7 @@ def test_evaluate_turn_multiple_turns_correct_index(
         turn_metrics = ["ragas:faithfulness"]
 
         # Evaluate second turn (index 1)
-        processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 1, turn_data_2, turn_metrics
-        )
+        processor._evaluate_turn(conv_data, 1, turn_data_2, turn_metrics)
 
         # Verify correct turn index
         call_args = mock_metrics_evaluator.evaluate_metric.call_args[0][0]
@@ -766,9 +746,7 @@ def test_evaluate_turn_preserves_metric_order(
             "ragas:context_recall",
         ]
 
-        processor._evaluate_turn(  # pylint: disable=protected-access
-            conv_data, 0, turn_data, turn_metrics
-        )
+        processor._evaluate_turn(conv_data, 0, turn_data, turn_metrics)
 
         # Verify metrics were evaluated in order
         assert mock_metrics_evaluator.evaluate_metric.call_count == 3
@@ -852,7 +830,7 @@ def _create(skip_on_failure: bool) -> ConfigLoader:
             (True, False, False),  # System enabled, conv disables
         ],
     )
-    def test_is_skip_on_failure_enabled(  # pylint: disable=too-many-arguments, too-many-positional-arguments
+    def test_is_skip_on_failure_enabled(
         self,
         config_loader_factory: Callable[[bool], ConfigLoader],
         processor_components: ProcessorComponents,
@@ -869,12 +847,7 @@ def test_is_skip_on_failure_enabled(  # pylint: disable=too-many-arguments, too-
         processor = ConversationProcessor(
            config_loader_factory(system_skip), processor_components
         )
-        assert (
-            processor._is_skip_on_failure_enabled(  # pylint: disable=protected-access
-                conv_data
-            )
-            is expected
-        )
+        assert processor._is_skip_on_failure_enabled(conv_data) is expected
 
     @pytest.mark.parametrize(
         "results_status,expected",
@@ -899,13 +872,10 @@ def test_has_failure(
             )
             for i, status in enumerate(results_status)
         ]
-        assert (
-            processor._has_failure(results)  # pylint: disable=protected-access
-            is expected
-        )
+        assert processor._has_failure(results) is expected
 
     @pytest.mark.parametrize("skip_enabled,expect_skip", [(True, True), (False, False)])
-    def test_skip_on_failure_behavior(  # pylint: disable=too-many-arguments, too-many-positional-arguments
+    def test_skip_on_failure_behavior(
         self,
         config_loader_factory: Callable[[bool], ConfigLoader],
         processor_components: ProcessorComponents,
diff --git a/tests/unit/runner/test_evaluation.py b/tests/unit/runner/test_evaluation.py
index a9a18fcf..8931f137 100644
--- a/tests/unit/runner/test_evaluation.py
+++ b/tests/unit/runner/test_evaluation.py
@@ -1,3 +1,5 @@
+# pylint: disable=unused-argument
+
 """Unit tests for runner/evaluation.py."""
 
 import argparse
@@ -28,7 +30,7 @@ class TestRunEvaluation:
     def test_run_evaluation_success(
         self,
         mocker: MockerFixture,
-        capsys: pytest.CaptureFixture,  # pylint: disable=unused-argument
+        capsys: pytest.CaptureFixture,
     ) -> None:
         """Test successful evaluation run."""
         # Mock ConfigLoader
@@ -98,7 +100,7 @@ def test_run_evaluation_success(
     def test_run_evaluation_with_output_dir_override(
         self,
         mocker: MockerFixture,
-        capsys: pytest.CaptureFixture,  # pylint: disable=unused-argument
+        capsys: pytest.CaptureFixture,
     ) -> None:
         """Test evaluation with custom output directory."""
         mock_loader = mocker.Mock()
@@ -261,7 +263,7 @@ def test_run_evaluation_with_errors_in_results(
     def test_run_evaluation_closes_pipeline_on_exception(
         self,
         mocker: MockerFixture,
-        capsys: pytest.CaptureFixture,  # pylint: disable=unused-argument
+        capsys: pytest.CaptureFixture,
     ) -> None:
         """Test pipeline is closed even if evaluation fails."""
         mock_loader = mocker.Mock()

From c3ebe9c341c8a243e420f44e239835611287ab0a Mon Sep 17 00:00:00 2001
From: Eva Micankova
Date: Thu, 29 Jan 2026 15:50:58 +0100
Subject: [PATCH 3/3] Removing unittest.mock

---
 tests/script/test_run_multi_provider_eval.py | 200 ++---
 tests/unit/core/metrics/test_geval.py        | 767 ++++++++++---------
 2 files changed, 504 insertions(+), 463 deletions(-)

diff --git a/tests/script/test_run_multi_provider_eval.py b/tests/script/test_run_multi_provider_eval.py
index a65b235a..0b271c62 100644
--- a/tests/script/test_run_multi_provider_eval.py
+++ b/tests/script/test_run_multi_provider_eval.py
@@ -6,13 +6,13 @@
 import json
 from pathlib import Path
 from typing import Any
-from unittest.mock import patch
 import tempfile as temp_module
 import logging
 import multiprocessing
 import shutil
 
 import pytest
+from pytest_mock import MockerFixture
 import yaml
 
 from script.run_multi_provider_eval import MultiProviderEvaluationRunner
@@ -390,6 +390,7 @@ def test_create_temp_config_file(
     def test_temp_config_cleanup_on_yaml_dump_failure(
         self,
         runner: MultiProviderEvaluationRunner,
+ mocker: MockerFixture, ) -> None: """Test that temp file is cleaned up when yaml.dump() fails.""" @@ -406,28 +407,26 @@ def track_temp_file(*args: Any, **kwargs: Any) -> Any: return temp_file # Mock NamedTemporaryFile to track the created file - with patch( + mocker.patch( "script.run_multi_provider_eval.tempfile.NamedTemporaryFile", side_effect=track_temp_file, - ): - # Mock yaml.dump to raise an exception - with patch( - "script.run_multi_provider_eval.yaml.dump", - side_effect=Exception("YAML dump failed"), - ): - with pytest.raises(Exception, match="YAML dump failed"): - runner._create_temp_system_config( - provider_id="openai", - model="gpt-4o-mini", - ) - - # Verify the temp file was cleaned up after the exception - assert ( - created_temp_path is not None - ), "Temp file should have been created" - assert ( - not created_temp_path.exists() - ), "Temp file should have been cleaned up" + ) + + # Mock yaml.dump to raise an exception + mocker.patch( + "script.run_multi_provider_eval.yaml.dump", + side_effect=Exception("YAML dump failed"), + ) + + with pytest.raises(Exception, match="YAML dump failed"): + runner._create_temp_system_config( + provider_id="openai", + model="gpt-4o-mini", + ) + + # Verify the temp file was cleaned up after the exception + assert created_temp_path is not None, "Temp file should have been created" + assert not created_temp_path.exists(), "Temp file should have been cleaned up" def test_temp_config_sanitizes_special_characters( self, runner: MultiProviderEvaluationRunner @@ -465,131 +464,134 @@ def runner( ) def test_path_traversal_blocked_in_provider_id( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test that path traversal in provider_id is sanitized.""" - with patch( + mocker.patch( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, - ): - # Attempt path traversal in provider_id - result = runner._run_single_evaluation( - provider_name="malicious", - provider_id="../../etc", - model="test", - ) + ) + + # Attempt path traversal in provider_id + result = runner._run_single_evaluation( + provider_name="malicious", + provider_id="../../etc", + model="test", + ) - # Verify that the output path is sanitized and stays within base - output_path = Path(result["output_dir"]) - base_path = runner.output_base.resolve() - assert output_path.resolve().is_relative_to(base_path) - # Verify dangerous characters are removed - assert ".." not in str(output_path) - assert "/" not in str(output_path.relative_to(base_path).parts[0]) + # Verify that the output path is sanitized and stays within base + output_path = Path(result["output_dir"]) + base_path = runner.output_base.resolve() + assert output_path.resolve().is_relative_to(base_path) + # Verify dangerous characters are removed + assert ".." 
not in str(output_path) + assert "/" not in str(output_path.relative_to(base_path).parts[0]) - # Cleanup - if output_path.exists(): - shutil.rmtree(output_path.parent, ignore_errors=True) + # Cleanup + if output_path.exists(): + shutil.rmtree(output_path.parent, ignore_errors=True) def test_path_traversal_blocked_in_model( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test that path traversal in model name is sanitized.""" - with patch( + mocker.patch( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 0, "FAIL": 0, "ERROR": 1}, - ): - # Attempt path traversal in model - result = runner._run_single_evaluation( - provider_name="openai", - provider_id="openai", - model="../../../etc/passwd", - ) + ) - # Verify that the output path is sanitized and stays within base - output_path = Path(result["output_dir"]) - base_path = runner.output_base.resolve() - assert output_path.resolve().is_relative_to(base_path) - # Verify dangerous characters are removed - assert ".." not in str(output_path) + # Attempt path traversal in model + result = runner._run_single_evaluation( + provider_name="openai", + provider_id="openai", + model="../../../etc/passwd", + ) - # Cleanup - if output_path.exists(): - shutil.rmtree(output_path.parent.parent, ignore_errors=True) + # Verify that the output path is sanitized and stays within base + output_path = Path(result["output_dir"]) + base_path = runner.output_base.resolve() + assert output_path.resolve().is_relative_to(base_path) + # Verify dangerous characters are removed + assert ".." not in str(output_path) + + # Cleanup + if output_path.exists(): + shutil.rmtree(output_path.parent.parent, ignore_errors=True) class TestRunSingleEvaluation: """Tests for _run_single_evaluation method.""" def test_run_single_evaluation_success( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test successful single evaluation.""" # Mock run_evaluation to return a successful summary - with patch( + mock_run_eval = mocker.patch( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2, "ERROR": 0}, - ) as mock_run_eval: - result = runner._run_single_evaluation( - provider_name="openai", - provider_id="openai", - model="gpt-4o-mini", - ) + ) + + result = runner._run_single_evaluation( + provider_name="openai", + provider_id="openai", + model="gpt-4o-mini", + ) - assert result["success"] is True - assert result["provider_id"] == "openai" - assert result["model"] == "gpt-4o-mini" - assert result["summary"]["PASS"] == 5 - assert result["error"] is None - assert "duration_seconds" in result - mock_run_eval.assert_called_once() + assert result["success"] is True + assert result["provider_id"] == "openai" + assert result["model"] == "gpt-4o-mini" + assert result["summary"]["PASS"] == 5 + assert result["error"] is None + assert "duration_seconds" in result + mock_run_eval.assert_called_once() def test_run_single_evaluation_failure( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test evaluation failure handling.""" # Mock run_evaluation to return None (failure) - with patch("script.run_multi_provider_eval.run_evaluation", return_value=None): - result = runner._run_single_evaluation( - provider_name="openai", - provider_id="openai", - model="gpt-4o-mini", - ) + 
mocker.patch("script.run_multi_provider_eval.run_evaluation", return_value=None) + result = runner._run_single_evaluation( + provider_name="openai", + provider_id="openai", + model="gpt-4o-mini", + ) - assert result["success"] is False - assert result["error"] == "Evaluation returned None (failed)" + assert result["success"] is False + assert result["error"] == "Evaluation returned None (failed)" def test_run_single_evaluation_invalid_summary( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test evaluation with invalid summary structure.""" # Mock run_evaluation to return a summary missing required keys - with patch( + mocker.patch( "script.run_multi_provider_eval.run_evaluation", return_value={"PASS": 5, "FAIL": 2}, # Missing ERROR key - ): - result = runner._run_single_evaluation( - provider_name="openai", - provider_id="openai", - model="gpt-4o-mini", - ) + ) + result = runner._run_single_evaluation( + provider_name="openai", + provider_id="openai", + model="gpt-4o-mini", + ) - assert result["success"] is False - assert "Invalid summary structure" in result["error"] - assert "summary" not in result + assert result["success"] is False + assert "Invalid summary structure" in result["error"] + assert "summary" not in result class TestRunEvaluations: """Tests for run_evaluations method.""" def test_run_evaluations_sequential( - self, runner: MultiProviderEvaluationRunner + self, runner: MultiProviderEvaluationRunner, mocker: MockerFixture ) -> None: """Test sequential evaluation execution.""" # Force sequential mode runner.max_workers = 1 - with patch.object( + mock_single_eval = mocker.patch.object( runner, "_run_single_evaluation", return_value={ @@ -597,11 +599,11 @@ def test_run_evaluations_sequential( "provider_id": "test", "model": "test-model", }, - ) as mock_single_eval: - results = runner.run_evaluations() + ) + results = runner.run_evaluations() - assert len(results) == 3 # 2 openai + 1 watsonx - assert mock_single_eval.call_count == 3 + assert len(results) == 3 # 2 openai + 1 watsonx + assert mock_single_eval.call_count == 3 class TestGenerateSummary: diff --git a/tests/unit/core/metrics/test_geval.py b/tests/unit/core/metrics/test_geval.py index d228b9f4..6247b2a5 100644 --- a/tests/unit/core/metrics/test_geval.py +++ b/tests/unit/core/metrics/test_geval.py @@ -1,10 +1,10 @@ # pylint: disable=too-many-public-methods,protected-access """Tests for GEval metrics handler.""" - -from unittest.mock import MagicMock, patch +from typing import Any import pytest +from pytest_mock import MockerFixture from deepeval.test_case import LLMTestCaseParams from lightspeed_evaluation.core.metrics.geval import GEvalHandler @@ -15,22 +15,20 @@ class TestGEvalHandler: """Test cases for GEvalHandler class.""" @pytest.fixture - def mock_llm_manager(self) -> MagicMock: + def mock_llm_manager(self, mocker: MockerFixture) -> Any: """Create a mock DeepEvalLLMManager.""" - mock_manager = MagicMock() - mock_llm = MagicMock() + mock_manager = mocker.MagicMock() + mock_llm = mocker.MagicMock() mock_manager.get_llm.return_value = mock_llm return mock_manager @pytest.fixture - def mock_metric_manager(self) -> MagicMock: + def mock_metric_manager(self, mocker: MockerFixture) -> Any: """Create a mock MetricManager.""" - return MagicMock() + return mocker.MagicMock() @pytest.fixture - def handler( - self, mock_llm_manager: MagicMock, mock_metric_manager: MagicMock - ) -> GEvalHandler: + def handler(self, mock_llm_manager: Any, 
mock_metric_manager: Any) -> GEvalHandler: """Create a GEvalHandler instance with mocked dependencies.""" return GEvalHandler( deepeval_llm_manager=mock_llm_manager, @@ -38,7 +36,7 @@ def handler( ) def test_initialization( - self, mock_llm_manager: MagicMock, mock_metric_manager: MagicMock + self, mock_llm_manager: Any, mock_metric_manager: Any ) -> None: """Test GEvalHandler initialization with required dependencies.""" handler = GEvalHandler( @@ -114,7 +112,10 @@ def test_convert_evaluation_params_mixed_invalid_returns_none( assert result is None def test_get_geval_config_uses_metric_manager( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that _get_geval_config delegates to MetricManager.""" expected_config = { @@ -124,7 +125,7 @@ def test_get_geval_config_uses_metric_manager( } mock_metric_manager.get_metric_metadata.return_value = expected_config - conv_data = MagicMock() + conv_data = mocker.MagicMock() config = handler._get_geval_config( metric_name="test_metric", conv_data=conv_data, @@ -141,14 +142,17 @@ def test_get_geval_config_uses_metric_manager( ) def test_get_geval_config_turn_level( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test retrieving turn-level config uses correct MetricLevel.""" expected_config = {"criteria": "Turn criteria", "threshold": 0.9} mock_metric_manager.get_metric_metadata.return_value = expected_config - conv_data = MagicMock() - turn_data = MagicMock() + conv_data = mocker.MagicMock() + turn_data = mocker.MagicMock() config = handler._get_geval_config( metric_name="turn_metric", @@ -166,12 +170,15 @@ def test_get_geval_config_turn_level( ) def test_get_geval_config_returns_none_when_not_found( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that None is returned when MetricManager finds no config.""" mock_metric_manager.get_metric_metadata.return_value = None - conv_data = MagicMock() + conv_data = mocker.MagicMock() config = handler._get_geval_config( metric_name="nonexistent_metric", conv_data=conv_data, @@ -182,12 +189,15 @@ def test_get_geval_config_returns_none_when_not_found( assert config is None def test_evaluate_missing_config( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that evaluate returns error when config is not found.""" mock_metric_manager.get_metric_metadata.return_value = None - conv_data = MagicMock() + conv_data = mocker.MagicMock() score, reason = handler.evaluate( metric_name="nonexistent", conv_data=conv_data, @@ -200,7 +210,10 @@ def test_evaluate_missing_config( assert "configuration not found" in reason.lower() def test_evaluate_missing_criteria( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that evaluate requires 'criteria' in config.""" mock_metric_manager.get_metric_metadata.return_value = { @@ -209,7 +222,7 @@ def test_evaluate_missing_criteria( # Missing 'criteria' } - conv_data = MagicMock() + conv_data = mocker.MagicMock() score, reason = handler.evaluate( metric_name="test_metric", conv_data=conv_data, @@ -222,14 +235,17 @@ def 
test_evaluate_missing_criteria( assert "criteria" in reason.lower() def test_evaluate_turn_missing_turn_data( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that turn-level evaluation requires turn_data.""" mock_metric_manager.get_metric_metadata.return_value = { "criteria": "Test criteria" } - conv_data = MagicMock() + conv_data = mocker.MagicMock() score, reason = handler.evaluate( metric_name="test_metric", conv_data=conv_data, @@ -242,389 +258,412 @@ def test_evaluate_turn_missing_turn_data( assert "turn data required" in reason.lower() def test_evaluate_turn_success( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test successful turn-level evaluation.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - # Mock GEval metric instance - mock_metric = MagicMock() - mock_metric.score = 0.85 - mock_metric.reason = "Test passed" - mock_geval_class.return_value = mock_metric - - # Setup metric manager to return config - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "evaluation_params": ["query", "response"], - "evaluation_steps": ["Step 1", "Step 2"], - "threshold": 0.7, - } - - # Mock turn data - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = None - turn_data.contexts = None - - conv_data = MagicMock() - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - assert score == 0.85 - assert reason == "Test passed" - mock_metric.measure.assert_called_once() + ) + # Mock GEval metric instance + mock_metric = mocker.MagicMock() + mock_metric.score = 0.85 + mock_metric.reason = "Test passed" + mock_geval_class.return_value = mock_metric + + # Setup metric manager to return config + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "evaluation_params": ["query", "response"], + "evaluation_steps": ["Step 1", "Step 2"], + "threshold": 0.7, + } + + # Mock turn data + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = None + turn_data.contexts = None + + conv_data = mocker.MagicMock() + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + assert score == 0.85 + assert reason == "Test passed" + mock_metric.measure.assert_called_once() def test_evaluate_turn_with_optional_fields( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test turn-level evaluation includes optional fields when present.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - with patch( - "lightspeed_evaluation.core.metrics.geval.LLMTestCase" - ) as mock_test_case_class: - mock_metric = MagicMock() - mock_metric.score = 0.75 - mock_metric.reason = "Good match" - mock_geval_class.return_value = mock_metric - - mock_test_case = MagicMock() - mock_test_case_class.return_value = mock_test_case - - # 
Setup metric manager - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Compare against expected", - "evaluation_params": ["query", "response", "expected_response"], - "threshold": 0.7, - } - - # Mock turn data with all optional fields - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = "Expected response" - turn_data.contexts = ["Context 1", "Context 2"] - - conv_data = MagicMock() - - handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - # Verify test case was created with optional fields - call_kwargs = mock_test_case_class.call_args[1] - assert call_kwargs["input"] == "Test query" - assert call_kwargs["actual_output"] == "Test response" - assert call_kwargs["expected_output"] == "Expected response" - assert call_kwargs["context"] == ["Context 1", "Context 2"] + ) + mock_test_case_class = mocker.patch( + "lightspeed_evaluation.core.metrics.geval.LLMTestCase" + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.75 + mock_metric.reason = "Good match" + mock_geval_class.return_value = mock_metric + + mock_test_case = mocker.MagicMock() + mock_test_case_class.return_value = mock_test_case + + # Setup metric manager + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Compare against expected", + "evaluation_params": ["query", "response", "expected_response"], + "threshold": 0.7, + } + + # Mock turn data with all optional fields + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = "Expected response" + turn_data.contexts = ["Context 1", "Context 2"] + + conv_data = mocker.MagicMock() + + handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + # Verify test case was created with optional fields + call_kwargs = mock_test_case_class.call_args[1] + assert call_kwargs["input"] == "Test query" + assert call_kwargs["actual_output"] == "Test response" + assert call_kwargs["expected_output"] == "Expected response" + assert call_kwargs["context"] == ["Context 1", "Context 2"] def test_evaluate_turn_none_score_returns_zero( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that None score from metric is converted to 0.0.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.score = None - mock_metric.reason = "Could not evaluate" - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "threshold": 0.7, - } - - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = None - turn_data.contexts = None - - conv_data = MagicMock() - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - # Should return 0.0 when score is None - assert score == 0.0 - assert reason == "Could not evaluate" + ) + mock_metric = mocker.MagicMock() + mock_metric.score = None + mock_metric.reason = "Could not evaluate" + mock_geval_class.return_value = mock_metric + + 
mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "threshold": 0.7, + } + + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = None + turn_data.contexts = None + + conv_data = mocker.MagicMock() + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + # Should return 0.0 when score is None + assert score == 0.0 + assert reason == "Could not evaluate" def test_evaluate_turn_handles_exceptions( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that turn evaluation handles exceptions gracefully.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.measure.side_effect = ValueError("Test error") - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "threshold": 0.7, - } - - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = None - turn_data.contexts = None - - conv_data = MagicMock() - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - assert score is None - assert "evaluation error" in reason.lower() - assert "Test error" in reason + ) + mock_metric = mocker.MagicMock() + mock_metric.measure.side_effect = ValueError("Test error") + mock_geval_class.return_value = mock_metric + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "threshold": 0.7, + } + + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = None + turn_data.contexts = None + + conv_data = mocker.MagicMock() + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + assert score is None + assert "evaluation error" in reason.lower() + assert "Test error" in reason def test_evaluate_turn_uses_default_params_when_none_provided( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that default evaluation_params are used when none provided.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.score = 0.8 - mock_metric.reason = "Good" - mock_geval_class.return_value = mock_metric - - # Config with no evaluation_params - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "threshold": 0.7, - } - - turn_data = MagicMock() - turn_data.query = "Test query" - turn_data.response = "Test response" - turn_data.expected_response = None - turn_data.contexts = None - - conv_data = MagicMock() - - handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=0, - turn_data=turn_data, - is_conversation=False, - ) - - # Verify GEval was called with default params - call_kwargs = mock_geval_class.call_args[1] - assert 
call_kwargs["evaluation_params"] == [ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ] + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.8 + mock_metric.reason = "Good" + mock_geval_class.return_value = mock_metric + + # Config with no evaluation_params + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "threshold": 0.7, + } + + turn_data = mocker.MagicMock() + turn_data.query = "Test query" + turn_data.response = "Test response" + turn_data.expected_response = None + turn_data.contexts = None + + conv_data = mocker.MagicMock() + + handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=0, + turn_data=turn_data, + is_conversation=False, + ) + + # Verify GEval was called with default params + call_kwargs = mock_geval_class.call_args[1] + assert call_kwargs["evaluation_params"] == [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ] def test_evaluate_conversation_success( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test successful conversation-level evaluation.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.score = 0.90 - mock_metric.reason = "Conversation coherent" - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Conversation criteria", - "evaluation_params": ["query", "response"], - "threshold": 0.6, - } - - # Mock conversation data with multiple turns - turn1 = MagicMock() - turn1.query = "Query 1" - turn1.response = "Response 1" - - turn2 = MagicMock() - turn2.query = "Query 2" - turn2.response = "Response 2" - - conv_data = MagicMock() - conv_data.turns = [turn1, turn2] - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=None, - turn_data=None, - is_conversation=True, - ) - - assert score == 0.90 - assert reason == "Conversation coherent" - mock_metric.measure.assert_called_once() + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.90 + mock_metric.reason = "Conversation coherent" + mock_geval_class.return_value = mock_metric + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Conversation criteria", + "evaluation_params": ["query", "response"], + "threshold": 0.6, + } + + # Mock conversation data with multiple turns + turn1 = mocker.MagicMock() + turn1.query = "Query 1" + turn1.response = "Response 1" + + turn2 = mocker.MagicMock() + turn2.query = "Query 2" + turn2.response = "Response 2" + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1, turn2] + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) + + assert score == 0.90 + assert reason == "Conversation coherent" + mock_metric.measure.assert_called_once() def test_evaluate_conversation_aggregates_turns( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that conversation evaluation properly aggregates turn data.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - with patch( - "lightspeed_evaluation.core.metrics.geval.LLMTestCase" - ) 
as mock_test_case_class: - mock_metric = MagicMock() - mock_metric.score = 0.85 - mock_metric.reason = "Good conversation" - mock_geval_class.return_value = mock_metric - - mock_test_case = MagicMock() - mock_test_case_class.return_value = mock_test_case - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Conversation flow", - "threshold": 0.7, - } - - # Create multiple turns including one with None response - turn1 = MagicMock() - turn1.query = "First question" - turn1.response = "First answer" - - turn2 = MagicMock() - turn2.query = "Second question" - turn2.response = "Second answer" - - turn3 = MagicMock() - turn3.query = "Third question" - turn3.response = None # Test None response handling - - conv_data = MagicMock() - conv_data.turns = [turn1, turn2, turn3] - - handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=None, - turn_data=None, - is_conversation=True, - ) - - # Verify test case was created with aggregated input/output - call_kwargs = mock_test_case_class.call_args[1] - assert "Turn 1 - User: First question" in call_kwargs["input"] - assert "Turn 2 - User: Second question" in call_kwargs["input"] - assert "Turn 3 - User: Third question" in call_kwargs["input"] - assert ( - "Turn 1 - Assistant: First answer" in call_kwargs["actual_output"] - ) - assert ( - "Turn 2 - Assistant: Second answer" in call_kwargs["actual_output"] - ) - assert "Turn 3 - Assistant:" in call_kwargs["actual_output"] + ) + mock_test_case_class = mocker.patch( + "lightspeed_evaluation.core.metrics.geval.LLMTestCase" + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.85 + mock_metric.reason = "Good conversation" + mock_geval_class.return_value = mock_metric + + mock_test_case = mocker.MagicMock() + mock_test_case_class.return_value = mock_test_case + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Conversation flow", + "threshold": 0.7, + } + + # Create multiple turns including one with None response + turn1 = mocker.MagicMock() + turn1.query = "First question" + turn1.response = "First answer" + + turn2 = mocker.MagicMock() + turn2.query = "Second question" + turn2.response = "Second answer" + + turn3 = mocker.MagicMock() + turn3.query = "Third question" + turn3.response = None # Test None response handling + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1, turn2, turn3] + + handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) + + # Verify test case was created with aggregated input/output + call_kwargs = mock_test_case_class.call_args[1] + assert "Turn 1 - User: First question" in call_kwargs["input"] + assert "Turn 2 - User: Second question" in call_kwargs["input"] + assert "Turn 3 - User: Third question" in call_kwargs["input"] + assert "Turn 1 - Assistant: First answer" in call_kwargs["actual_output"] + assert "Turn 2 - Assistant: Second answer" in call_kwargs["actual_output"] + assert "Turn 3 - Assistant:" in call_kwargs["actual_output"] def test_evaluate_conversation_with_evaluation_steps( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that evaluation_steps are passed to GEval when provided.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.score = 0.88 - mock_metric.reason = "Follows 
steps" - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Multi-step evaluation", - "evaluation_params": ["query", "response"], - "evaluation_steps": [ - "Check coherence", - "Verify context", - "Assess relevance", - ], - "threshold": 0.7, - } - - turn1 = MagicMock() - turn1.query = "Query 1" - turn1.response = "Response 1" - - conv_data = MagicMock() - conv_data.turns = [turn1] - - handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=None, - turn_data=None, - is_conversation=True, - ) - - # Verify evaluation_steps were passed to GEval - call_kwargs = mock_geval_class.call_args[1] - assert call_kwargs["evaluation_steps"] == [ + ) + mock_metric = mocker.MagicMock() + mock_metric.score = 0.88 + mock_metric.reason = "Follows steps" + mock_geval_class.return_value = mock_metric + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Multi-step evaluation", + "evaluation_params": ["query", "response"], + "evaluation_steps": [ "Check coherence", "Verify context", "Assess relevance", - ] + ], + "threshold": 0.7, + } + + turn1 = mocker.MagicMock() + turn1.query = "Query 1" + turn1.response = "Response 1" + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1] + + handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) + + # Verify evaluation_steps were passed to GEval + call_kwargs = mock_geval_class.call_args[1] + assert call_kwargs["evaluation_steps"] == [ + "Check coherence", + "Verify context", + "Assess relevance", + ] def test_evaluate_conversation_handles_exceptions( - self, handler: GEvalHandler, mock_metric_manager: MagicMock + self, + handler: GEvalHandler, + mock_metric_manager: Any, + mocker: MockerFixture, ) -> None: """Test that conversation evaluation handles exceptions gracefully.""" - with patch( + mock_geval_class = mocker.patch( "lightspeed_evaluation.core.metrics.geval.GEval" - ) as mock_geval_class: - mock_metric = MagicMock() - mock_metric.measure.side_effect = RuntimeError("API error") - mock_geval_class.return_value = mock_metric - - mock_metric_manager.get_metric_metadata.return_value = { - "criteria": "Test criteria", - "threshold": 0.7, - } - - turn1 = MagicMock() - turn1.query = "Query 1" - turn1.response = "Response 1" - - conv_data = MagicMock() - conv_data.turns = [turn1] - - score, reason = handler.evaluate( - metric_name="test_metric", - conv_data=conv_data, - _turn_idx=None, - turn_data=None, - is_conversation=True, - ) - - assert score is None - assert "evaluation error" in reason.lower() - assert "API error" in reason + ) + mock_metric = mocker.MagicMock() + mock_metric.measure.side_effect = RuntimeError("API error") + mock_geval_class.return_value = mock_metric + + mock_metric_manager.get_metric_metadata.return_value = { + "criteria": "Test criteria", + "threshold": 0.7, + } + + turn1 = mocker.MagicMock() + turn1.query = "Query 1" + turn1.response = "Response 1" + + conv_data = mocker.MagicMock() + conv_data.turns = [turn1] + + score, reason = handler.evaluate( + metric_name="test_metric", + conv_data=conv_data, + _turn_idx=None, + turn_data=None, + is_conversation=True, + ) + + assert score is None + assert "evaluation error" in reason.lower() + assert "API error" in reason