Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c75c26e
feat: Add type annotations and mypy configuration
yanurag-dev Dec 1, 2025
3f4e3a0
feat: Add type annotations to model adapters
yanurag-dev Dec 1, 2025
18e728e
feat: Add ParamSpec decorator typing to core.py
yanurag-dev Dec 1, 2025
4cb069b
feat: Add type stubs for pandas and requests
yanurag-dev Dec 1, 2025
469f4c9
fix: Use metadata variable in Dataset.sample
yanurag-dev Dec 2, 2025
d1f6318
feat: Start metrics.py type improvements
yanurag-dev Dec 2, 2025
3d94831
feat: Fix all type errors in metrics.py and logging.py
yanurag-dev Dec 2, 2025
7d2b93c
fix: Configure mypy to skip google.generativeai type checking
yanurag-dev Dec 2, 2025
2a08836
fix: Add proper type annotations to config, models, results, core, an…
yanurag-dev Dec 2, 2025
a7a2a2e
fix: Complete type annotations for datasets.py and client.py
yanurag-dev Dec 2, 2025
f38a7f9
feat: Add mypy to CI/CD and documentation
yanurag-dev Dec 2, 2025
87bb74c
chore: Update CI to test only Python 3.12
yanurag-dev Dec 2, 2025
e6254c3
fix: Remove unused imports and fix type errors
yanurag-dev Dec 2, 2025
88c052d
fix: Remove unused imports and fix type errors
yanurag-dev Dec 2, 2025
8b21a2f
chore(pre-commit): Update ruff-pre-commit revision to v0.14.7
yanurag-dev Dec 2, 2025
8fa061b
fix(format): Apply latest ruff formatting and update pre-commit config
yanurag-dev Dec 2, 2025
e0a72b4
chore: Update mypy configuration for Python 3.12 and enhance type ann…
yanurag-dev Dec 11, 2025
e595050
chore: Remove GEMINI.md file from repo
yanurag-dev Dec 11, 2025
420e169
fix(core): Update evaluation function to use wrapper for benchmark me…
yanurag-dev Dec 11, 2025
dcf430e
chore(ci): Update CI workflow to install metrics dependencies and fix…
yanurag-dev Dec 11, 2025
a3211ea
chore(ci): Update CI workflow to install all development dependencies
yanurag-dev Dec 11, 2025
1da265b
refactor: Improve type safety across codebase with TypedDicts
yanurag-dev Dec 13, 2025
1f7352b
fix(datasets): Enhance dataset loading logic to prioritize name from …
yanurag-dev Dec 13, 2025
37c697a
refactor(cli, client, config, datasets, metrics): Clean up imports an…
yanurag-dev Dec 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Continuous-integration workflow for the benchwise package.
# Runs lint, format check, strict type check, and the basic test suite
# on every push / pull request targeting main or develop.
# NOTE(review): indentation was reconstructed — the scraped copy had lost
# all leading whitespace, which is structurally significant in YAML.
name: CI

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main, develop]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # Editable install with dev + all optional extras so that ruff,
          # mypy, and the metric dependencies are available to later steps.
          pip install -e ".[dev,all]"

      - name: Run ruff linter
        run: ruff check benchwise tests

      - name: Run ruff formatter check
        run: ruff format --check benchwise tests

      - name: Run mypy type checker
        run: mypy benchwise --config-file=mypy.ini

      - name: Run tests
        # --basic: quick validation subset (full suite is run locally via
        # `python run_tests.py`), keeping CI turnaround short.
        run: python run_tests.py --basic
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -252,4 +252,4 @@ redis-data/
celery-beat-schedule

# AI files
CLAUDE.md
test_single_doc_file.py
14 changes: 12 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand All @@ -11,8 +11,18 @@ repos:
- id: debug-statements

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.6
rev: v0.14.7
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- id: ruff-format

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.19.0
hooks:
- id: mypy
additional_dependencies:
- types-requests
- pandas-stubs
args: [--config-file=mypy.ini]
files: ^benchwise/
61 changes: 54 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ async def test_summarization(model, dataset):
prompts = [f"Summarize: {item['text']}" for item in dataset.data]
responses = await model.generate(prompts)
references = [item['summary'] for item in dataset.data]

scores = rouge_l(responses, references)
assert scores['f1'] > 0.3 # Minimum quality threshold
return scores
Expand Down Expand Up @@ -84,7 +84,7 @@ Support for major LLM providers:
# OpenAI models
@evaluate("gpt-4", "gpt-3.5-turbo")

# Anthropic models
# Anthropic models
@evaluate("claude-3-opus", "claude-3-sonnet")

# Google models
Expand Down Expand Up @@ -139,10 +139,10 @@ async def test_medical_qa(model, dataset):
questions = [f"Q: {item['question']}\nA:" for item in dataset.data]
answers = await model.generate(questions, temperature=0)
references = [item['answer'] for item in dataset.data]

accuracy_score = accuracy(answers, references)
similarity_score = semantic_similarity(answers, references)

return {
'accuracy': accuracy_score['accuracy'],
'similarity': similarity_score['mean_similarity']
Expand All @@ -156,10 +156,10 @@ async def test_medical_qa(model, dataset):
@evaluate("gpt-3.5-turbo", "claude-3-haiku")
async def test_safety(model, dataset):
responses = await model.generate(dataset.prompts)

safety_scores = safety_score(responses)
assert safety_scores['mean_safety'] > 0.9 # High safety threshold

return safety_scores
```

Expand All @@ -172,10 +172,57 @@ async def test_performance(model, dataset):
start_time = time.time()
response = await model.generate(["Hello, world!"])
latency = time.time() - start_time

assert latency < 2.0 # Max 2 second response time
return {'latency': latency}
```


## Development

### Type Safety

Benchwise uses strict type checking with mypy to ensure code quality:

```bash
# Run type checker
mypy benchwise

# Type checking is enforced in CI/CD and pre-commit hooks
```

All code contributions must pass mypy strict checks. The codebase is fully typed with:
- Comprehensive type annotations
- Custom TypedDict definitions in `benchwise/types.py`
- Type stubs for external dependencies

### Running Tests

```bash
# Quick validation
python run_tests.py --basic

# Full test suite
python run_tests.py

# With coverage
python run_tests.py --coverage
```

### Code Quality

```bash
# Format code
ruff format .

# Lint code
ruff check --fix .

# Type check
mypy benchwise

# Run all checks
pre-commit run --all-files
```

Happy evaluating! 🎯
97 changes: 63 additions & 34 deletions benchwise/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,30 @@

import argparse
import asyncio
import os
import sys
from typing import List, Optional
from typing import List, Optional, cast

from . import __version__
from .datasets import load_dataset
from .datasets import load_dataset, convert_metadata_to_info
from .models import get_model_adapter
from .results import save_results, BenchmarkResult, EvaluationResult
from .config import get_api_config, configure_benchwise
from .client import get_client, sync_offline_results
from .results import (
save_results,
BenchmarkResult,
EvaluationResult,
load_results,
ResultsAnalyzer,
)
from .config import get_api_config, configure_benchwise, reset_config
from .client import get_client, sync_offline_results, upload_results
from .types import (
ConfigureArgs,
ConfigKwargs,
SyncArgs,
StatusArgs,
DatasetInfo,
EvaluationMetadata,
)


def create_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -137,13 +152,16 @@ async def run_evaluation(
# Create benchmark result
benchmark_result = BenchmarkResult(
benchmark_name=f"cli_evaluation_{dataset.name}",
metadata={
"dataset_path": dataset_path,
"models": models,
"metrics": metrics,
"temperature": temperature,
"max_tokens": max_tokens,
},
metadata=cast(
EvaluationMetadata,
{
"dataset_path": dataset_path,
"models": models,
"metrics": metrics,
"temperature": temperature,
"max_tokens": max_tokens,
},
),
)

# Run evaluation for each model
Expand All @@ -156,8 +174,6 @@ async def run_evaluation(

# Check for API key requirements for cloud models
if model_name.startswith(("gpt-", "claude-", "gemini-")):
import os

api_key_map = {
"gpt-": "OPENAI_API_KEY",
"claude-": "ANTHROPIC_API_KEY",
Expand Down Expand Up @@ -209,11 +225,11 @@ async def run_evaluation(
metric_result = accuracy(responses, references)
results["accuracy"] = metric_result["accuracy"]
elif metric_name == "rouge_l":
metric_result = rouge_l(responses, references)
results["rouge_l_f1"] = metric_result["f1"]
rouge_result = rouge_l(responses, references)
results["rouge_l_f1"] = rouge_result["f1"]
elif metric_name == "semantic_similarity":
metric_result = semantic_similarity(responses, references)
results["semantic_similarity"] = metric_result[
semantic_result = semantic_similarity(responses, references)
results["semantic_similarity"] = semantic_result[
"mean_similarity"
]
else:
Expand All @@ -238,7 +254,9 @@ async def run_evaluation(
model_name=model_name,
test_name="cli_evaluation",
result=results,
dataset_info=dataset.metadata,
dataset_info=convert_metadata_to_info(dataset.metadata)
if dataset.metadata
else None,
)

benchmark_result.add_result(eval_result)
Expand All @@ -250,7 +268,9 @@ async def run_evaluation(
model_name=model_name,
test_name="cli_evaluation",
error=str(e),
dataset_info=dataset.metadata,
dataset_info=convert_metadata_to_info(dataset.metadata)
if dataset.metadata
else None,
)
benchmark_result.add_result(eval_result)
print(f"✗ {model_name} failed: {e}")
Expand All @@ -265,12 +285,23 @@ async def run_evaluation(

if should_upload and benchmark_result.results:
try:
from .client import upload_results
# Extract dataset_info from dataset metadata for upload_results
# upload_results expects DatasetInfo
dataset_info_for_upload: DatasetInfo = cast(
DatasetInfo,
{
"size": dataset.size,
"task": "general",
"tags": [],
},
)
if dataset.metadata:
dataset_info_for_upload = convert_metadata_to_info(dataset.metadata)

success = await upload_results(
benchmark_result.results,
benchmark_result.benchmark_name,
benchmark_result.metadata,
dataset_info_for_upload,
)
if success:
print("✅ Results uploaded to Benchwise API")
Expand All @@ -285,10 +316,8 @@ async def run_evaluation(
return benchmark_result


async def configure_api(args):
async def configure_api(args: ConfigureArgs) -> None:
"""Configure Benchwise API settings."""
from .config import reset_config

if args.reset:
reset_config()
print("✓ Configuration reset to defaults")
Expand All @@ -300,7 +329,7 @@ async def configure_api(args):
return

# Update configuration
kwargs = {}
kwargs: ConfigKwargs = {}
if args.api_url:
kwargs["api_url"] = args.api_url
if args.api_key:
Expand All @@ -321,7 +350,7 @@ async def configure_api(args):
print("No configuration changes specified. Use --show to see current config.")


async def sync_offline(args):
async def sync_offline(args: SyncArgs) -> None:
"""Sync offline results with the API."""
try:
client = await get_client()
Expand Down Expand Up @@ -354,7 +383,7 @@ async def sync_offline(args):
pass


async def show_status(args):
async def show_status(args: StatusArgs) -> None:
"""Show Benchwise status information."""
config = get_api_config()
client = None
Expand Down Expand Up @@ -412,7 +441,7 @@ async def show_status(args):
pass


def list_resources(resource_type: str):
def list_resources(resource_type: str) -> None:
"""List available resources."""
if resource_type == "models":
print("Available model adapters:")
Expand Down Expand Up @@ -440,7 +469,7 @@ def list_resources(resource_type: str):
)


def validate_dataset(dataset_path: str):
def validate_dataset(dataset_path: str) -> None:
"""Validate dataset format."""
try:
dataset = load_dataset(dataset_path)
Expand Down Expand Up @@ -478,10 +507,10 @@ def validate_dataset(dataset_path: str):
sys.exit(1)


async def compare_results(result_paths: List[str], metric: Optional[str] = None):
async def compare_results(
result_paths: List[str], metric: Optional[str] = None
) -> None:
"""Compare evaluation results."""
from .results import load_results, ResultsAnalyzer

try:
# Load all results
benchmark_results = []
Expand Down Expand Up @@ -509,7 +538,7 @@ async def compare_results(result_paths: List[str], metric: Optional[str] = None)
sys.exit(1)


def main():
def main() -> None:
"""Main CLI entry point."""
parser = create_parser()
args = parser.parse_args()
Expand Down
Loading