6 changes: 3 additions & 3 deletions Makefile
@@ -39,7 +39,7 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed
uv sync --group dev

check-types: ## Checks type hints in sources
- uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/
+ uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests

black-check:
uv run black . --check
@@ -73,10 +73,10 @@ help: ## Show this help screen

pylint:
uv run pylint src
- uv run pylint lsc_agent_eval/src
+ uv run pylint --disable=R0801 lsc_agent_eval/src tests

pyright:
- uv run pyright src lsc_agent_eval/src
+ uv run pyright src lsc_agent_eval/src tests

docstyle:
uv run pydocstyle -v .
1 change: 1 addition & 0 deletions pyproject.toml
@@ -89,6 +89,7 @@ warn_required_dynamic_aliases = true

[tool.pylint.MASTER]
load-plugins = ["pylint_pydantic"]
+ init-hook = "import sys; sys.path.append('.')"

[tool.ruff]
[tool.ruff.lint.flake8-tidy-imports]
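The added init-hook is equivalent to running the following before pylint starts analyzing files; it is what lets pylint resolve first-party imports such as script.* when the Makefile invokes it from the repository root (a restatement of the hook above, not new behavior):

import sys

# Same effect as the init-hook: put the repository root on the import path so
# pylint can resolve the "script" and "tests" packages during analysis.
sys.path.append(".")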
12 changes: 12 additions & 0 deletions pyrightconfig.json
@@ -0,0 +1,12 @@
{
"reportAttributeAccessIssue": "warning",
"executionEnvironments": [
{
"root": "tests",
"reportAttributeAccessIssue": "none",
"extraPaths": [
"."
]
}
]
}
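For illustration, a hypothetical test pattern that the relaxed tests/ execution environment is presumably meant to allow; the class and test names below are made up, and whether pyright flags this exact code in the repository is an assumption. Assigning and reading an attribute that is not declared on a class normally trips reportAttributeAccessIssue, which is set to "none" under tests/:

class FakeResult:
    """Hypothetical minimal container used only in a test."""

    def __init__(self) -> None:
        self.score = 0.9


def test_dynamic_attribute_access() -> None:
    """Attach an attribute that is not declared on the class."""
    result = FakeResult()
    result.note = "set only inside this test"  # undeclared attribute
    assert result.note == "set only inside this test"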
1 change: 1 addition & 0 deletions script/__init__.py
@@ -0,0 +1 @@
"""Script utilities for lightspeed-evaluation."""
2 changes: 1 addition & 1 deletion script/compare_evaluations.py
@@ -421,7 +421,7 @@ def _check_confidence_interval_overlap(
Returns:
Dictionary containing overlap test results
"""
- result = {
+ result: dict[str, Any] = {
"test_performed": False,
"intervals_overlap": None,
"significant": None,
8 changes: 4 additions & 4 deletions script/run_multi_provider_eval.py
@@ -318,7 +318,7 @@ def _create_provider_model_configs(self) -> list[dict[str, Any]]:
Returns:
List of dictionaries with provider, model, and settings
"""
- configs = []
+ configs: list[dict[str, Any]] = []

# Get providers from the config
providers = self.providers_config.get("providers", {})
@@ -781,7 +781,7 @@ def _analyze_single_model(

# Calculate score statistics
if all_scores:
- score_stats = {
+ score_stats: dict[str, Any] = {
"mean": float(np.mean(all_scores)),
"median": float(np.median(all_scores)),
"std": float(np.std(all_scores)),
@@ -818,10 +818,10 @@ def _analyze_single_model(
logger.warning(
"scipy not available, skipping confidence interval calculation"
)
score_stats["confidence_interval"] = None
score_stats["confidence_interval"] = None # type: ignore[assignment]
else:
# Single score - no confidence interval
score_stats["confidence_interval"] = None
score_stats["confidence_interval"] = None # type: ignore[assignment]
else:
score_stats = {
"mean": 0.0,
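The two # type: ignore[assignment] comments above presumably work around the value type mypy infers or requires for the stats dictionary. A hypothetical alternative, not what this PR does, is to give the container an explicitly optional field, for example via a TypedDict, so assigning None needs no suppression; every name below is illustrative:

from typing import Optional, TypedDict


class ScoreStatsSketch(TypedDict, total=False):
    """Illustrative stats container with an explicitly optional confidence interval."""

    mean: float
    median: float
    std: float
    count: int
    confidence_interval: Optional[dict[str, float]]


stats: ScoreStatsSketch = {"mean": 0.0}
stats["confidence_interval"] = None  # accepted without a suppression comment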
9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,9 @@
"""Pytest configuration and fixtures for lightspeed-evaluation tests."""

import sys
from pathlib import Path

# Add project root to Python path so we can import from script directory
project_root = Path(__file__).parent.parent
if str(project_root) not in sys.path:
sys.path.insert(0, str(project_root))
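With the repository root on sys.path, test modules can import first-party code from the script/ directory directly; a minimal hypothetical example (the test module and assertion are illustrative; the import mirrors the one in tests/script/conftest.py below):

# Hypothetical file: tests/script/test_imports.py
from script.run_multi_provider_eval import MultiProviderEvaluationRunner


def test_script_package_importable() -> None:
    """First-party script imports resolve because conftest.py extended sys.path."""
    assert MultiProviderEvaluationRunner is not None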
214 changes: 214 additions & 0 deletions tests/script/conftest.py
@@ -0,0 +1,214 @@
# pylint: disable=redefined-outer-name
Review comment (Contributor):

⚠️ Potential issue | 🟠 Major

Remove the pylint suppression for redefined-outer-name.

Line 1 disables a lint rule; please fix the underlying naming issue instead of suppressing it.
As per coding guidelines: Do not disable lint warnings with # noqa, # type: ignore, or # pylint: disable comments - fix the underlying issue instead.

🤖 Prompt for AI Agents
In `@tests/script/conftest.py` at line 1, Remove the top-line pylint suppression
for redefined-outer-name and fix the underlying naming conflicts in conftest.py:
locate any fixture or helper function names in conftest.py that shadow
outer/module-level names (e.g., fixture functions or variables with the same
identifier as imports or globals), rename those functions/fixtures to unique
descriptive names, update all test imports/uses to the new names, and ensure no
identifiers in conftest.py duplicate outer scope names so the
redefined-outer-name lint is no longer triggered.
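
One way to satisfy this without the suppression is pytest's name= argument, so the fixture function's own name no longer matches the parameter name that tests and other fixtures request. A rough sketch under that assumption, reusing the runner fixture defined further down in this file (MultiProviderEvaluationRunner and temp_config_files come from this conftest; the function name fixture_runner is illustrative, and the same pattern would have to be applied to the other fixtures requested by name):

import pytest


# Sketch only: tests still request "runner", but the module-level function is
# named fixture_runner, so parameters named "runner" no longer shadow it.
@pytest.fixture(name="runner")
def fixture_runner(temp_config_files: dict) -> MultiProviderEvaluationRunner:
    """Create a MultiProviderEvaluationRunner instance for testing."""
    return MultiProviderEvaluationRunner(
        providers_config_path=str(temp_config_files["providers_config"]),
        system_config_path=str(temp_config_files["system_config"]),
        eval_data_path=str(temp_config_files["eval_data"]),
    )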


"""Pytest configuration and fixtures for script tests."""

from pathlib import Path
from typing import Any

import pytest
import yaml

from script.run_multi_provider_eval import MultiProviderEvaluationRunner


@pytest.fixture
def script_path() -> Path:
"""Return the path to the compare_evaluations.py script."""
# Test is in tests/script/, script is in project_root/script/
return Path(__file__).parent.parent.parent / "script" / "compare_evaluations.py"


@pytest.fixture
def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
"""Return sample evaluation data for testing."""
sample_results1 = [
{
"conversation_group_id": "conv1",
"turn_id": "1",
"metric_identifier": "ragas:faithfulness",
"result": "PASS",
"score": 0.8,
"threshold": 0.7,
"execution_time": 1.0,
},
{
"conversation_group_id": "conv1",
"turn_id": "2",
"metric_identifier": "ragas:faithfulness",
"result": "PASS",
"score": 0.9,
"threshold": 0.7,
"execution_time": 1.2,
},
]

sample_results2 = [
{
"conversation_group_id": "conv1",
"turn_id": "1",
"metric_identifier": "ragas:faithfulness",
"result": "PASS",
"score": 0.85,
"threshold": 0.7,
"execution_time": 1.1,
},
{
"conversation_group_id": "conv1",
"turn_id": "2",
"metric_identifier": "ragas:faithfulness",
"result": "FAIL",
"score": 0.6,
"threshold": 0.7,
"execution_time": 1.0,
},
]

return sample_results1, sample_results2


@pytest.fixture
def temp_config_files(tmp_path: Path) -> dict:
"""Create temporary configuration files for testing."""
# Create multi_eval_config.yaml
providers_config = {
"providers": {
"openai": {
"models": ["gpt-4o-mini", "gpt-4-turbo"],
},
"watsonx": {
"models": ["ibm/granite-13b-chat-v2"],
},
},
"settings": {"output_base": str(tmp_path / "eval_output")},
}
providers_path = tmp_path / "multi_eval_config.yaml"
with open(providers_path, "w", encoding="utf-8") as f:
yaml.dump(providers_config, f)

# Create system.yaml
system_config = {
"llm": {
"provider": "openai",
"model": "gpt-4o-mini",
"temperature": 0.0,
},
"api": {"enabled": False},
"output": {"output_dir": "./eval_output"},
}
system_path = tmp_path / "system.yaml"
with open(system_path, "w", encoding="utf-8") as f:
yaml.dump(system_config, f)

# Create evaluation_data.yaml
eval_data = [
{
"conversation_group_id": "test_conv",
"turns": [
{
"turn_id": "turn_1",
"query": "Test query",
"response": "Test response",
"contexts": ["Context 1"],
"expected_response": "Expected",
"turn_metrics": ["ragas:response_relevancy"],
}
],
}
]
eval_path = tmp_path / "evaluation_data.yaml"
with open(eval_path, "w", encoding="utf-8") as f:
yaml.dump(eval_data, f)

return {
"providers_config": providers_path,
"system_config": system_path,
"eval_data": eval_path,
"output_dir": tmp_path / "eval_output",
}


@pytest.fixture
def runner(
temp_config_files: dict,
) -> MultiProviderEvaluationRunner:
"""Create a MultiProviderEvaluationRunner instance for testing."""
return MultiProviderEvaluationRunner(
providers_config_path=str(temp_config_files["providers_config"]),
system_config_path=str(temp_config_files["system_config"]),
eval_data_path=str(temp_config_files["eval_data"]),
)


@pytest.fixture
def sample_evaluation_summary() -> dict[str, Any]:
"""Create a sample evaluation summary JSON for testing analysis."""
return {
"timestamp": "2025-01-01T12:00:00",
"total_evaluations": 10,
"summary_stats": {
"overall": {
"TOTAL": 10,
"PASS": 8,
"FAIL": 2,
"ERROR": 0,
"pass_rate": 80.0, # Percentage format
"fail_rate": 20.0,
"error_rate": 0.0,
},
"by_metric": {
"ragas:faithfulness": {
"pass": 4,
"fail": 0,
"error": 0,
"pass_rate": 100.0,
"fail_rate": 0.0,
"error_rate": 0.0,
"score_statistics": {
"mean": 0.95,
"median": 0.95,
"std": 0.02,
"min": 0.92,
"max": 0.98,
"count": 4,
},
},
"ragas:response_relevancy": {
"pass": 4,
"fail": 2,
"error": 0,
"pass_rate": 66.67,
"fail_rate": 33.33,
"error_rate": 0.0,
"score_statistics": {
"mean": 0.75,
"median": 0.78,
"std": 0.12,
"min": 0.55,
"max": 0.88,
"count": 6,
},
},
},
},
"results": [
{
"conversation_group_id": "conv1",
"turn_id": "turn1",
"metric_identifier": "ragas:faithfulness",
"result": "PASS",
"score": 0.95,
"threshold": 0.8,
"execution_time": 1.0,
},
{
"conversation_group_id": "conv1",
"turn_id": "turn2",
"metric_identifier": "ragas:response_relevancy",
"result": "PASS",
"score": 0.85,
"threshold": 0.7,
"execution_time": 1.2,
},
]
* 5, # Repeat to get 10 results
}