From c75c26eb0499fa17df62def8ae4321d9e2a441fc Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Mon, 1 Dec 2025 17:27:51 +0530 Subject: [PATCH 01/24] feat: Add type annotations and mypy configuration - Add benchwise/types.py with TypedDict, Protocol, and Literal definitions - Add mypy.ini with strict type checking - Add py.typed marker for PEP 561 compliance - Improve type annotations in exceptions, config, and datasets modules --- benchwise/config.py | 34 ++--- benchwise/datasets.py | 12 +- benchwise/exceptions.py | 6 +- benchwise/py.typed | 0 benchwise/types.py | 291 ++++++++++++++++++++++++++++++++++++++++ mypy.ini | 66 +++++++++ 6 files changed, 387 insertions(+), 22 deletions(-) create mode 100644 benchwise/py.typed create mode 100644 benchwise/types.py create mode 100644 mypy.ini diff --git a/benchwise/config.py b/benchwise/config.py index 62d5f6a..4d7c74b 100644 --- a/benchwise/config.py +++ b/benchwise/config.py @@ -7,10 +7,12 @@ import os from pathlib import Path -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List from dataclasses import dataclass, field import json +from benchwise.types import ConfigDict + @dataclass class BenchwiseConfig: @@ -52,16 +54,16 @@ class BenchwiseConfig: verbose: bool = False # User preferences - default_models: list = field(default_factory=list) - default_metrics: list = field(default_factory=list) + default_models: List[str] = field(default_factory=list) + default_metrics: List[str] = field(default_factory=list) - def __post_init__(self): + def __post_init__(self) -> None: """Load configuration from environment variables and config file.""" self._load_from_env() self._load_from_file() self._validate_config() - def _load_from_env(self): + def _load_from_env(self) -> None: """Load configuration from environment variables.""" # API Configuration @@ -113,7 +115,7 @@ def _load_from_env(self): if verbose_env in ("true", "1", "yes", "on"): self.verbose = True - def _load_from_file(self): + def 
_load_from_file(self) -> None: """Load configuration from config file.""" config_paths = [ Path.cwd() / ".benchwise.json", @@ -140,7 +142,7 @@ def _load_from_file(self): if self.verbose: print(f"⚠️ Failed to load config from {config_path}: {e}") - def _validate_config(self): + def _validate_config(self) -> None: """Validate configuration values.""" # Validate API URL @@ -169,7 +171,7 @@ def _validate_config(self): ) self.cache_enabled = False - def save_to_file(self, file_path: Optional[Path] = None): + def save_to_file(self, file_path: Optional[Path] = None) -> None: """ Save current configuration to file. @@ -212,7 +214,7 @@ def save_to_file(self, file_path: Optional[Path] = None): except OSError as e: print(f"Failed to save configuration: {e}") - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> ConfigDict: """Convert configuration to dictionary.""" return { "api_url": self.api_url, @@ -230,7 +232,7 @@ def to_dict(self) -> Dict[str, Any]: "default_metrics": self.default_metrics, } - def print_config(self): + def print_config(self) -> None: """Print current configuration in a readable format.""" print("🔧 Benchwise Configuration:") print("=" * 30) @@ -258,7 +260,7 @@ def get_api_config() -> BenchwiseConfig: return _global_config -def set_api_config(config: BenchwiseConfig): +def set_api_config(config: BenchwiseConfig) -> None: """ Set the global Benchwise configuration. @@ -275,7 +277,7 @@ def configure_benchwise( upload_enabled: Optional[bool] = None, cache_enabled: Optional[bool] = None, debug: Optional[bool] = None, - **kwargs, + **kwargs: Any, ) -> BenchwiseConfig: """ Configure Benchwise settings programmatically. 
@@ -315,7 +317,7 @@ def configure_benchwise( return config -def reset_config(): +def reset_config() -> None: """Reset configuration to default values.""" global _global_config _global_config = None @@ -409,10 +411,10 @@ def validate_api_connection(config: BenchwiseConfig) -> bool: import asyncio import httpx - async def check_connection(): + async def check_connection() -> bool: async with httpx.AsyncClient(timeout=5.0) as client: response = await client.get(f"{config.api_url}/health") - return response.status_code == 200 + return bool(response.status_code == 200) return asyncio.run(check_connection()) @@ -482,7 +484,7 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]: return results -def print_configuration_status(config: BenchwiseConfig): +def print_configuration_status(config: BenchwiseConfig) -> None: """ NEW: Print comprehensive configuration status. diff --git a/benchwise/datasets.py b/benchwise/datasets.py index 2d1c416..194b5c2 100644 --- a/benchwise/datasets.py +++ b/benchwise/datasets.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional, Union +from typing import List, Dict, Any, Optional, Union, Callable import json import pandas as pd from pathlib import Path @@ -6,6 +6,8 @@ from dataclasses import dataclass import hashlib +from benchwise.types import DatasetItem, DatasetMetadata + @dataclass class Dataset: @@ -24,7 +26,7 @@ class Dataset: metadata: Optional[Dict[str, Any]] = None schema: Optional[Dict[str, Any]] = None - def __post_init__(self): + def __post_init__(self) -> None: if self.metadata is None: self.metadata = {} @@ -78,13 +80,14 @@ def references(self) -> List[str]: references.append(str(ref)) return references - def filter(self, condition: callable) -> "Dataset": + def filter(self, condition: Callable[[Dict[str, Any]], bool]) -> "Dataset": """Filter dataset items based on condition.""" filtered_data = [item for item in self.data if condition(item)] + metadata = self.metadata or {} return Dataset( 
name=f"{self.name}_filtered", data=filtered_data, - metadata={**self.metadata, "filtered": True, "original_size": self.size}, + metadata={**metadata, "filtered": True, "original_size": self.size}, ) def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset": @@ -95,6 +98,7 @@ def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset": random.seed(random_state) sampled_data = random.sample(self.data, min(n, len(self.data))) + metadata = self.metadata or {} return Dataset( name=f"{self.name}_sample_{n}", data=sampled_data, diff --git a/benchwise/exceptions.py b/benchwise/exceptions.py index 498bd02..d5a9888 100644 --- a/benchwise/exceptions.py +++ b/benchwise/exceptions.py @@ -4,6 +4,8 @@ Provides specific exception types for better error handling. """ +from typing import Optional + class BenchwiseError(Exception): """Base exception for all Benchwise errors.""" @@ -17,8 +19,8 @@ class AuthenticationError(BenchwiseError): class RateLimitError(BenchwiseError): """Raised when API rate limit is exceeded.""" - - def __init__(self, message: str = "Rate limit exceeded", retry_after: int = None): + + def __init__(self, message: str = "Rate limit exceeded", retry_after: Optional[int] = None) -> None: super().__init__(message) self.retry_after = retry_after diff --git a/benchwise/py.typed b/benchwise/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/benchwise/types.py b/benchwise/types.py new file mode 100644 index 0000000..5818a62 --- /dev/null +++ b/benchwise/types.py @@ -0,0 +1,291 @@ +""" +Type definitions for BenchWise. + +This module contains TypedDict definitions, Protocols, Literal types, and type variables +used throughout the BenchWise codebase for improved type safety and IDE support. 
+""" + +from typing import Any, Dict, List, Literal, Optional, Protocol, TypeVar, ParamSpec, Tuple, TypedDict + +# Type Variables +T = TypeVar('T') +R = TypeVar('R') +P = ParamSpec('P') +ModelT = TypeVar('ModelT') +DatasetT = TypeVar('DatasetT') + +# Literal Types +HttpMethod = Literal["GET", "POST", "PUT", "DELETE", "PATCH"] +ModelProvider = Literal["openai", "anthropic", "google", "huggingface", "custom"] +ExportFormat = Literal["json", "csv", "markdown"] + + +# Model Configuration Types +class ModelConfig(TypedDict, total=False): + """Configuration options for model adapters.""" + api_key: str + temperature: float + max_tokens: int + top_p: float + frequency_penalty: float + presence_penalty: float + timeout: float + max_retries: int + + +class PricingInfo(TypedDict): + """Pricing information for a model.""" + input: float # Cost per 1K input tokens + output: float # Cost per 1K output tokens + + +# Metric Return Types +class RougeScores(TypedDict, total=False): + """Return type for ROUGE metric scores.""" + precision: float + recall: float + f1: float + rouge1_f1: float + rouge2_f1: float + rougeL_f1: float + std_precision: float + std_recall: float + std_f1: float + scores: Dict[str, List[float]] + # Optional confidence intervals + f1_confidence_interval: Tuple[float, float] + precision_confidence_interval: Tuple[float, float] + recall_confidence_interval: Tuple[float, float] + + +class BleuScores(TypedDict, total=False): + """Return type for BLEU metric scores.""" + bleu: float + bleu1: float + bleu2: float + bleu3: float + bleu4: float + brevity_penalty: float + length_ratio: float + std_bleu: float + scores: List[float] + # Optional confidence intervals + bleu_confidence_interval: Tuple[float, float] + + +class BertScoreResults(TypedDict, total=False): + """Return type for BERT-Score metric.""" + precision: float + recall: float + f1: float + std_precision: float + std_recall: float + std_f1: float + scores: Dict[str, List[float]] + # Optional confidence 
intervals + f1_confidence_interval: Tuple[float, float] + precision_confidence_interval: Tuple[float, float] + recall_confidence_interval: Tuple[float, float] + + +class AccuracyResults(TypedDict, total=False): + """Return type for accuracy metric.""" + accuracy: float + correct: int + total: int + std_accuracy: float + scores: List[float] + # Optional confidence interval + accuracy_confidence_interval: Tuple[float, float] + + +class SemanticSimilarityResults(TypedDict, total=False): + """Return type for semantic similarity metric.""" + similarity: float + std_similarity: float + scores: List[float] + # Optional confidence interval + similarity_confidence_interval: Tuple[float, float] + + +class CoherenceResults(TypedDict, total=False): + """Return type for coherence score metric.""" + coherence: float + std_coherence: float + scores: List[float] + # Optional confidence interval + coherence_confidence_interval: Tuple[float, float] + + +class SafetyResults(TypedDict, total=False): + """Return type for safety score metric.""" + safety: float + is_safe: bool + flagged_categories: List[str] + std_safety: float + scores: List[float] + # Optional confidence interval + safety_confidence_interval: Tuple[float, float] + + +class FactualCorrectnessResults(TypedDict, total=False): + """Return type for factual correctness metric.""" + correctness: float + is_correct: bool + std_correctness: float + scores: List[float] + # Optional confidence interval + correctness_confidence_interval: Tuple[float, float] + + +# Dataset Types +class DatasetItem(TypedDict, total=False): + """A single item in a dataset.""" + # Common field names + prompt: str + input: str + question: str + text: str + # Reference/target fields + reference: str + output: str + answer: str + target: str + summary: str + # Additional fields + id: str + metadata: Dict[str, Any] + + +class DatasetMetadata(TypedDict, total=False): + """Metadata for a dataset.""" + name: str + description: str + source: str + version: 
str + size: int + created_at: str + tags: List[str] + + +class DatasetSchema(TypedDict, total=False): + """Schema definition for a dataset.""" + prompt_field: str + reference_field: str + required_fields: List[str] + optional_fields: List[str] + + +# Configuration Types +class ConfigDict(TypedDict, total=False): + """Configuration dictionary for BenchWise.""" + api_url: str + api_key: Optional[str] + upload_enabled: bool + auto_sync: bool + cache_enabled: bool + cache_dir: str + timeout: float + max_retries: int + offline_mode: bool + debug: bool + verbose: bool + default_models: List[str] + default_metrics: List[str] + + +# Results Types +class EvaluationResultDict(TypedDict, total=False): + """Serialized evaluation result.""" + model: str + prompt: str + response: str + score: float + scores: Dict[str, float] + metadata: Dict[str, Any] + timestamp: str + success: bool + error: Optional[str] + + +class BenchmarkResultDict(TypedDict, total=False): + """Serialized benchmark result.""" + benchmark_name: str + benchmark_description: str + results: List[EvaluationResultDict] + summary: Dict[str, Any] + timestamp: str + + +class ComparisonResult(TypedDict): + """Result of model comparison.""" + best_model: str + best_score: float + rankings: List[Tuple[str, float]] + scores: Dict[str, float] + + +# API Response Types +class LoginResponse(TypedDict): + """Response from login endpoint.""" + token: Dict[str, str] + user: Dict[str, Any] + + +class UserInfo(TypedDict, total=False): + """User information from API.""" + id: int + username: str + email: str + full_name: Optional[str] + is_active: bool + + +class UploadResultsResponse(TypedDict): + """Response from upload results endpoint.""" + id: int + benchmark_id: int + model_ids: List[int] + results_count: int + message: str + + +# Protocols +class SupportsGenerate(Protocol): + """Protocol for objects that support text generation.""" + + async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]: + """Generate 
text completions for the given prompts.""" + ... + + def get_token_count(self, text: str) -> int: + """Get the token count for the given text.""" + ... + + def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float: + """Estimate the cost for the given token counts.""" + ... + + +class SupportsCache(Protocol): + """Protocol for objects that support caching.""" + + def save(self, key: str, value: Any) -> None: + """Save a value to the cache.""" + ... + + def load(self, key: str) -> Optional[Any]: + """Load a value from the cache.""" + ... + + def exists(self, key: str) -> bool: + """Check if a key exists in the cache.""" + ... + + +class SupportsMetrics(Protocol): + """Protocol for objects that support metric evaluation.""" + + def evaluate(self, predictions: List[str], references: List[str], **kwargs: Any) -> Dict[str, float]: + """Evaluate predictions against references.""" + ... diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..4149f2b --- /dev/null +++ b/mypy.ini @@ -0,0 +1,66 @@ +[mypy] +python_version = 3.11 +files = benchwise + +# Strict type checking +strict = True +disallow_untyped_defs = True +disallow_any_generics = True +disallow_subclassing_any = True +disallow_untyped_calls = True +disallow_incomplete_defs = True +check_untyped_defs = True +disallow_untyped_decorators = True +warn_redundant_casts = True +warn_unused_ignores = True +warn_return_any = True +warn_unreachable = True +no_implicit_optional = True +strict_optional = True +strict_equality = True + +# Show error codes for easier suppression +show_error_codes = True + +# Third-party library ignores (no stubs available) +[mypy-rouge_score.*] +ignore_missing_imports = True + +[mypy-bert_score.*] +ignore_missing_imports = True + +[mypy-nltk.*] +ignore_missing_imports = True + +[mypy-transformers.*] +ignore_missing_imports = True + +[mypy-torch.*] +ignore_missing_imports = True + +[mypy-sentence_transformers.*] +ignore_missing_imports = True + +[mypy-sklearn.*] 
+ignore_missing_imports = True + +[mypy-httpx.*] +ignore_missing_imports = True + +[mypy-requests.*] +ignore_missing_imports = True + +[mypy-openai.*] +ignore_missing_imports = True + +[mypy-anthropic.*] +ignore_missing_imports = True + +[mypy-google.generativeai.*] +ignore_missing_imports = True + +[mypy-fuzzywuzzy.*] +ignore_missing_imports = True + +[mypy-pandas.*] +ignore_missing_imports = True From 3f4e3a026056b24b22dc7c625c48fe2d34ac879a Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Mon, 1 Dec 2025 17:38:48 +0530 Subject: [PATCH 02/24] feat: Add type annotations to model adapters - Update ModelAdapter and all implementations with ModelConfig TypedDict - Add PricingInfo TypedDict for pricing dictionaries - Add proper type annotations to all __init__ and generate methods - Fix type issues in cost estimation calculations --- benchwise/models.py | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/benchwise/models.py b/benchwise/models.py index cd5c88a..78a50b9 100644 --- a/benchwise/models.py +++ b/benchwise/models.py @@ -1,16 +1,18 @@ from abc import ABC, abstractmethod from typing import List, Dict, Any, Optional +from benchwise.types import ModelConfig, PricingInfo + class ModelAdapter(ABC): """Abstract base class for model adapters.""" - def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): + def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None: self.model_name = model_name - self.config = config or {} + self.config: ModelConfig = config or {} @abstractmethod - async def generate(self, prompts: List[str], **kwargs) -> List[str]: + async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]: """Generate responses for a list of prompts.""" pass @@ -28,7 +30,7 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float: class OpenAIAdapter(ModelAdapter): """Adapter for OpenAI models.""" - def __init__(self, 
model_name: str, config: Optional[Dict[str, Any]] = None): + def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None: super().__init__(model_name, config) try: import openai @@ -42,16 +44,16 @@ def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): ) # Model pricing (per 1K tokens) - self.pricing = { + self.pricing: Dict[str, PricingInfo] = { "gpt-4": {"input": 0.03, "output": 0.06}, "gpt-4-turbo": {"input": 0.01, "output": 0.03}, "gpt-3.5-turbo": {"input": 0.001, "output": 0.002}, "gpt-4o": {"input": 0.005, "output": 0.015}, } - async def generate(self, prompts: List[str], **kwargs) -> List[str]: + async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]: """Generate responses using OpenAI API.""" - responses = [] + responses: List[str] = [] # Default parameters - exclude api_key from generation params generation_params = { @@ -85,15 +87,15 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float: model_pricing = self.pricing.get( self.model_name, {"input": 0.01, "output": 0.03} ) - input_cost = (input_tokens / 1000) * model_pricing["input"] - output_cost = (output_tokens / 1000) * model_pricing["output"] + input_cost = (input_tokens / 1000) * float(model_pricing["input"]) + output_cost = (output_tokens / 1000) * float(model_pricing["output"]) return input_cost + output_cost class AnthropicAdapter(ModelAdapter): """Adapter for Anthropic Claude models.""" - def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): + def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None: super().__init__(model_name, config) try: import anthropic @@ -107,16 +109,16 @@ def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): ) # Model pricing (per 1K tokens) - self.pricing = { + self.pricing: Dict[str, PricingInfo] = { "claude-3-opus": {"input": 0.015, "output": 0.075}, "claude-3-sonnet": {"input": 0.003, "output": 0.015}, 
"claude-3-haiku": {"input": 0.00025, "output": 0.00125}, "claude-3.5-sonnet": {"input": 0.003, "output": 0.015}, } - async def generate(self, prompts: List[str], **kwargs) -> List[str]: + async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]: """Generate responses using Anthropic API.""" - responses = [] + responses: List[str] = [] # Default parameters - exclude api_key from generation params generation_params = { @@ -150,15 +152,15 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float: model_pricing = self.pricing.get( self.model_name, {"input": 0.003, "output": 0.015} ) - input_cost = (input_tokens / 1000) * model_pricing["input"] - output_cost = (output_tokens / 1000) * model_pricing["output"] + input_cost = (input_tokens / 1000) * float(model_pricing["input"]) + output_cost = (output_tokens / 1000) * float(model_pricing["output"]) return input_cost + output_cost class GoogleAdapter(ModelAdapter): """Adapter for Google Gemini models.""" - def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): + def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None: super().__init__(model_name, config) try: import google.generativeai as genai @@ -172,9 +174,9 @@ def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): "Google Generative AI package not installed. 
Please install it with: pip install 'benchwise[llm-apis]' or pip install google-generativeai" ) - async def generate(self, prompts: List[str], **kwargs) -> List[str]: + async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]: """Generate responses using Google Gemini API.""" - responses = [] + responses: List[str] = [] for prompt in prompts: try: @@ -206,7 +208,7 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float: class HuggingFaceAdapter(ModelAdapter): """Adapter for Hugging Face models.""" - def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): + def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None: super().__init__(model_name, config) try: from transformers import AutoTokenizer, AutoModelForCausalLM @@ -218,9 +220,9 @@ def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): "Transformers package not installed. Please install it with: pip install 'benchwise[transformers]' or pip install transformers torch" ) - async def generate(self, prompts: List[str], **kwargs) -> List[str]: + async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]: """Generate responses using Hugging Face models.""" - responses = [] + responses: List[str] = [] for prompt in prompts: try: @@ -251,10 +253,10 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float: class MockAdapter(ModelAdapter): """Mock adapter for testing without API dependencies.""" - def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None): + def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None: super().__init__(model_name, config) - async def generate(self, prompts: List[str], **kwargs) -> List[str]: + async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]: """Generate mock responses.""" return [ f"Mock response from {self.model_name} for: {prompt[:50]}..." 
@@ -271,7 +273,7 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float: def get_model_adapter( - model_name: str, config: Optional[Dict[str, Any]] = None + model_name: str, config: Optional[ModelConfig] = None ) -> ModelAdapter: """Factory function to get the appropriate model adapter.""" From 18e728e81ebedf12be2b5f3657b288493979c2f8 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Mon, 1 Dec 2025 18:06:38 +0530 Subject: [PATCH 03/24] feat: Add ParamSpec decorator typing to core.py - Add proper Callable type signatures for evaluate, benchmark, and stress_test decorators - Use ParamSpec and TypeVar for generic decorator typing - Add Awaitable type annotations for async functions - Type all **kwargs parameters as Any - Update _run_evaluation with specific tuple[str, ...] type - Add type ignore comments for dynamic _benchmark_metadata attributes - All 124 tests passing (2 pre-existing benchmark test failures unchanged) --- benchwise/core.py | 60 ++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/benchwise/core.py b/benchwise/core.py index 8d64c15..b682cb0 100644 --- a/benchwise/core.py +++ b/benchwise/core.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Callable, Optional +from typing import List, Dict, Any, Callable, Optional, Union, ParamSpec, TypeVar, Awaitable from functools import wraps import asyncio import time @@ -10,10 +10,16 @@ from .config import get_api_config from .client import upload_results +# Type variables for decorator typing +P = ParamSpec('P') +R = TypeVar('R') + logger = logging.getLogger("benchwise") -def evaluate(*models: str, upload: bool = None, **kwargs) -> Callable: +def evaluate( + *models: str, upload: Optional[bool] = None, **kwargs: Any +) -> Callable[[Callable[..., Awaitable[Any]]], Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]]]: """ Decorator for creating LLM evaluations. 
@@ -35,7 +41,7 @@ async def test_qa(model, dataset): return accuracy(responses, dataset.references) """ - def decorator(test_func: Callable) -> Callable: + def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]]: if not inspect.iscoroutinefunction(test_func): raise TypeError( f"{test_func.__name__} must be an async function. " @@ -47,17 +53,17 @@ async def wrapper(dataset: Dataset, **test_kwargs) -> List[EvaluationResult]: return await _run_evaluation(test_func, dataset, models, upload, kwargs, test_kwargs) if hasattr(test_func, "_benchmark_metadata"): - wrapper._benchmark_metadata = test_func._benchmark_metadata - + wrapper._benchmark_metadata = test_func._benchmark_metadata # type: ignore[attr-defined] + return wrapper return decorator async def _run_evaluation( - test_func: Callable, + test_func: Callable[..., Awaitable[Any]], dataset: Dataset, - models: tuple, + models: tuple[str, ...], upload: Optional[bool], decorator_kwargs: Dict[str, Any], test_kwargs: Dict[str, Any], @@ -127,7 +133,7 @@ async def _run_evaluation( return results -def benchmark(name: str, description: str = "", **kwargs) -> Callable: +def benchmark(name: str, description: str = "", **kwargs: Any) -> Callable[[Callable[P, R]], Callable[P, R]]: """ Decorator for creating benchmarks. 
@@ -137,8 +143,8 @@ async def medical_qa_test(model, dataset): pass """ - def decorator(test_func: Callable) -> Callable: - test_func._benchmark_metadata = { + def decorator(test_func: Callable[P, R]) -> Callable[P, R]: + test_func._benchmark_metadata = { # type: ignore[attr-defined] "name": name, "description": description, **kwargs, @@ -148,10 +154,10 @@ def decorator(test_func: Callable) -> Callable: return decorator -def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable: +def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, Exception]]]]]: """ Decorator for stress testing LLMs. - + NOTE: WIP feature - may not be fully functional. Usage: @@ -160,12 +166,12 @@ async def load_test(model, dataset): pass """ - def decorator(test_func: Callable) -> Callable: + def decorator(test_func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[List[Union[R, Exception]]]]: @wraps(test_func) - async def wrapper(*args, **kwargs): + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> List[Union[R, Exception]]: logger.info(f"Starting stress test: {concurrent_requests} concurrent requests for {duration}s") - - tasks = [] + + tasks: List[Union[R, Exception]] = [] start_time = time.time() while time.time() - start_time < duration: @@ -191,17 +197,17 @@ async def wrapper(*args, **kwargs): class EvaluationRunner: """Main class for running evaluations.""" - def __init__(self, config: Optional[Dict[str, Any]] = None): - self.config = config or {} - self.results_cache = {} + def __init__(self, config: Optional[Dict[str, Any]] = None) -> None: + self.config: Dict[str, Any] = config or {} + self.results_cache: Dict[str, Any] = {} self.logger = logging.getLogger("benchwise.runner") async def run_evaluation( - self, test_func: Callable, dataset: Dataset, models: List[str] + self, test_func: Callable[..., Awaitable[Any]], dataset: Dataset, models: List[str] ) -> 
List[EvaluationResult]: """Run evaluation on multiple models.""" - results = [] - + results: List[EvaluationResult] = [] + self.logger.info(f"Running evaluation on {len(models)} models") for model_name in models: @@ -215,7 +221,7 @@ async def run_evaluation( return results def compare_models( - self, results: List[EvaluationResult], metric_name: str = None + self, results: List[EvaluationResult], metric_name: Optional[str] = None ) -> Dict[str, Any]: """Compare model performance.""" successful_results = [r for r in results if r.success] @@ -267,17 +273,17 @@ def compare_models( def run_benchmark( - benchmark_func: Callable, dataset: Dataset, models: List[str] + benchmark_func: Callable[..., Awaitable[Any]], dataset: Dataset, models: List[str] ) -> List[EvaluationResult]: """Run a benchmark on multiple models.""" runner = EvaluationRunner() return asyncio.run(runner.run_evaluation(benchmark_func, dataset, models)) -async def quick_eval(prompt: str, models: List[str], metric: Callable) -> Dict[str, float]: +async def quick_eval(prompt: str, models: List[str], metric: Callable[[str], float]) -> Dict[str, Optional[float]]: """Quick evaluation with a single prompt.""" - results = {} - + results: Dict[str, Optional[float]] = {} + logger.info(f"Running quick eval on {len(models)} models") for model_name in models: From 4cb069b2c33021bfec9fe3e7b59d478fb8d8ec22 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Mon, 1 Dec 2025 18:42:51 +0530 Subject: [PATCH 04/24] feat: Add type stubs for pandas and requests --- mypy.ini | 6 +----- pyproject.toml | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/mypy.ini b/mypy.ini index 4149f2b..6aa31f6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -47,8 +47,7 @@ ignore_missing_imports = True [mypy-httpx.*] ignore_missing_imports = True -[mypy-requests.*] -ignore_missing_imports = True +# Note: pandas and requests have type stubs installed (pandas-stubs, types-requests) [mypy-openai.*] ignore_missing_imports = True @@ -61,6 
+60,3 @@ ignore_missing_imports = True [mypy-fuzzywuzzy.*] ignore_missing_imports = True - -[mypy-pandas.*] -ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index 0076abb..c280da3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,8 @@ lint = [ "ruff>=0.1.6", "pre-commit>=3.0.0", "mypy>=1.0.0", + "pandas-stubs>=2.0.0", + "types-requests>=2.28.0", ] dev = [ From 469f4c9a4b0342d14c44a46c89f7fef34295b638 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Tue, 2 Dec 2025 11:57:34 +0530 Subject: [PATCH 05/24] fix: Use metadata variable in Dataset.sample --- benchwise/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchwise/datasets.py b/benchwise/datasets.py index 194b5c2..b4fca30 100644 --- a/benchwise/datasets.py +++ b/benchwise/datasets.py @@ -102,7 +102,7 @@ def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset": return Dataset( name=f"{self.name}_sample_{n}", data=sampled_data, - metadata={**self.metadata, "sampled": True, "sample_size": n}, + metadata={**metadata, "sampled": True, "sample_size": n}, ) def split( From d1f6318f7490d6c0200d6f7d4027544e7c39da4a Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Tue, 2 Dec 2025 11:58:18 +0530 Subject: [PATCH 06/24] feat: Start metrics.py type improvements --- benchwise/metrics.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/benchwise/metrics.py b/benchwise/metrics.py index e616d91..dd27312 100644 --- a/benchwise/metrics.py +++ b/benchwise/metrics.py @@ -1,5 +1,7 @@ -from typing import List, Dict, Any, Tuple, Optional +from typing import List, Dict, Any, Tuple, Optional, Union import numpy as np +from numpy.typing import NDArray +from benchwise.types import RougeScores, BleuScores, BertScoreResults, AccuracyResults from rouge_score import rouge_scorer from sacrebleu import BLEU import bert_score @@ -26,7 +28,7 @@ def _bootstrap_confidence_interval( ) -> Tuple[float, float]: """Calculate bootstrap 
confidence interval for a list of scores.""" if len(scores) < 2: - return (np.mean(scores), np.mean(scores)) + return (float(np.mean(scores)), float(np.mean(scores))) bootstrap_means = [] for _ in range(n_bootstrap): @@ -38,8 +40,8 @@ def _bootstrap_confidence_interval( upper_percentile = (1 - alpha / 2) * 100 return ( - np.percentile(bootstrap_means, lower_percentile), - np.percentile(bootstrap_means, upper_percentile), + float(np.percentile(bootstrap_means, lower_percentile)), + float(np.percentile(bootstrap_means, upper_percentile)), ) @@ -65,7 +67,7 @@ def rouge_l( use_stemmer: bool = True, alpha: float = 0.5, return_confidence: bool = True, -) -> Dict[str, float]: +) -> RougeScores: """ Calculate enhanced ROUGE-L scores for predictions vs references. @@ -96,7 +98,7 @@ def rouge_l( scorer = rouge_scorer.RougeScorer( ["rougeL", "rouge1", "rouge2"], use_stemmer=use_stemmer ) - scores = {"precision": [], "recall": [], "f1": [], "rouge1_f1": [], "rouge2_f1": []} + scores: Dict[str, List[float]] = {"precision": [], "recall": [], "f1": [], "rouge1_f1": [], "rouge2_f1": []} for pred, ref in zip(predictions, references): # Handle empty strings gracefully From 3d94831c3dcd4b822cb98f1f6caef4c85c11ade6 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Tue, 2 Dec 2025 17:14:29 +0530 Subject: [PATCH 07/24] feat: Fix all type errors in metrics.py and logging.py --- benchwise/logging.py | 2 +- benchwise/metrics.py | 82 ++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/benchwise/logging.py b/benchwise/logging.py index 3f6b6a5..3b2bd31 100644 --- a/benchwise/logging.py +++ b/benchwise/logging.py @@ -68,7 +68,7 @@ def get_logger(name: str = "benchwise") -> logging.Logger: return logging.getLogger(name) -def set_log_level(level: str): +def set_log_level(level: str) -> None: """ Change the log level for all Benchwise loggers. 
diff --git a/benchwise/metrics.py b/benchwise/metrics.py index dd27312..25ceb3a 100644 --- a/benchwise/metrics.py +++ b/benchwise/metrics.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Tuple, Optional, Union +from typing import List, Dict, Any, Tuple, Optional, Union, Callable import numpy as np from numpy.typing import NDArray from benchwise.types import RougeScores, BleuScores, BertScoreResults, AccuracyResults @@ -132,15 +132,15 @@ def rouge_l( scores["rouge1_f1"].append(score["rouge1"].fmeasure) scores["rouge2_f1"].append(score["rouge2"].fmeasure) - result = { - "precision": np.mean(scores["precision"]), - "recall": np.mean(scores["recall"]), - "f1": np.mean(scores["f1"]), - "rouge1_f1": np.mean(scores["rouge1_f1"]), - "rouge2_f1": np.mean(scores["rouge2_f1"]), - "std_precision": np.std(scores["precision"]), - "std_recall": np.std(scores["recall"]), - "std_f1": np.std(scores["f1"]), + result: RougeScores = { + "precision": float(np.mean(scores["precision"])), + "recall": float(np.mean(scores["recall"])), + "f1": float(np.mean(scores["f1"])), + "rouge1_f1": float(np.mean(scores["rouge1_f1"])), + "rouge2_f1": float(np.mean(scores["rouge2_f1"])), + "std_precision": float(np.std(scores["precision"])), + "std_recall": float(np.std(scores["recall"])), + "std_f1": float(np.std(scores["f1"])), "scores": scores, } @@ -168,7 +168,7 @@ def bleu_score( smooth_method: str = "exp", return_confidence: bool = True, max_n: int = 4, -) -> Dict[str, float]: +) -> Dict[str, Any]: """ Calculate enhanced BLEU scores for predictions vs references. 
@@ -206,7 +206,7 @@ def bleu_score( # Calculate sentence-level BLEU with improved handling sentence_scores = [] - ngram_precisions = {f"bleu_{i}": [] for i in range(1, max_n + 1)} + ngram_precisions: Dict[str, List[float]] = {f"bleu_{i}": [] for i in range(1, max_n + 1)} for pred, ref in zip(predictions, references): try: @@ -284,23 +284,23 @@ def bleu_score( return result -def _get_smoothing_function(smooth_method: str): +def _get_smoothing_function(smooth_method: str) -> Optional[Callable[..., Any]]: """Get NLTK smoothing function based on method name.""" from nltk.translate.bleu_score import SmoothingFunction smoothing = SmoothingFunction() if smooth_method == "exp": - return smoothing.method1 + return smoothing.method1 # type: ignore[no-any-return] elif smooth_method == "floor": - return smoothing.method2 + return smoothing.method2 # type: ignore[no-any-return] elif smooth_method == "add-k": - return smoothing.method3 + return smoothing.method3 # type: ignore[no-any-return] else: return None -def _get_weights(n: int) -> tuple: +def _get_weights(n: int) -> Tuple[float, ...]: """Get n-gram weights for BLEU calculation.""" weights = [0.0] * 4 weights[n - 1] = 1.0 @@ -313,7 +313,7 @@ def bert_score_metric( model_type: str = "distilbert-base-uncased", return_confidence: bool = True, batch_size: int = 64, -) -> Dict[str, float]: +) -> Dict[str, Any]: """ Calculate enhanced BERTScore for predictions vs references. @@ -444,7 +444,7 @@ def accuracy( fuzzy_match: bool = False, fuzzy_threshold: float = 0.8, return_confidence: bool = True, -) -> Dict[str, float]: +) -> Dict[str, Any]: """ Calculate enhanced exact match accuracy with multiple matching strategies. @@ -557,7 +557,7 @@ def semantic_similarity( batch_size: int = 32, return_confidence: bool = True, similarity_threshold: float = 0.5, -) -> Dict[str, float]: +) -> Dict[str, Any]: """ Calculate enhanced semantic similarity using sentence embeddings. 
@@ -678,7 +678,7 @@ def semantic_similarity( return result -def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, float]: +def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, Any]: """ Calculate perplexity of generated text. @@ -697,7 +697,7 @@ def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, fl "transformers and torch packages not installed. Please install them with: pip install 'benchwise[transformers]' or pip install transformers torch" ) - tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) # type: ignore[no-untyped-call] model = AutoModelForCausalLM.from_pretrained(model_name) perplexities = [] @@ -714,8 +714,8 @@ def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, fl perplexities.append(perplexity) return { - "mean_perplexity": np.mean(perplexities), - "median_perplexity": np.median(perplexities), + "mean_perplexity": float(np.mean(perplexities)), + "median_perplexity": float(np.median(perplexities)), "scores": perplexities, } @@ -787,7 +787,7 @@ def factual_correctness( # Calculate overall correctness score overall_score = np.mean(list(factual_analysis.values())) - correctness_scores.append(overall_score) + correctness_scores.append(float(overall_score)) detailed_results.append(factual_analysis) # Compile results @@ -834,7 +834,7 @@ def factual_correctness( def _analyze_factual_correctness( - prediction: str, reference: str, nlp_model=None, use_named_entities: bool = True + prediction: str, reference: str, nlp_model: Any = None, use_named_entities: bool = True ) -> Dict[str, float]: """ Analyze factual correctness using multiple approaches. 
@@ -870,7 +870,7 @@ def _analyze_factual_correctness( } -def _calculate_entity_overlap(prediction: str, reference: str, nlp_model) -> float: +def _calculate_entity_overlap(prediction: str, reference: str, nlp_model: Any) -> float: """ Calculate overlap between named entities in prediction and reference. """ @@ -915,7 +915,7 @@ def _calculate_enhanced_keyword_overlap(prediction: str, reference: str) -> floa } # Extract important words from reference - important_ref_words = set() + important_ref_words: set[str] = set() " ".join(ref_words) for pattern_type, pattern in important_patterns.items(): @@ -1029,7 +1029,7 @@ def coherence_score( return {"mean_coherence": 1.0, "scores": []} coherence_scores = [] - component_scores = { + component_scores: Dict[str, List[float]] = { "sentence_consistency": [], "lexical_diversity": [], "flow_continuity": [], @@ -1048,7 +1048,7 @@ def coherence_score( # Calculate overall coherence score overall_coherence = np.mean(list(coherence_components.values())) - coherence_scores.append(overall_coherence) + coherence_scores.append(float(overall_coherence)) # Store component scores for component, score in coherence_components.items(): @@ -1111,7 +1111,7 @@ def _analyze_text_coherence(text: str) -> Dict[str, float]: if np.mean(sentence_lengths) > 0 else 1 ) - sentence_consistency = max(0, 1 - (length_cv / 2)) # Normalize to 0-1 + sentence_consistency = float(max(0, 1 - (length_cv / 2))) # Normalize to 0-1 else: sentence_consistency = 1.0 if sentence_lengths else 0.0 @@ -1254,18 +1254,18 @@ def _calculate_topic_consistency(sentences: List[str]) -> float: overlaps.append(overlap) # Return average overlap - return np.mean(overlaps) if overlaps else 1.0 + return float(np.mean(overlaps)) if overlaps else 1.0 def safety_score( predictions: List[str], - safety_categories: List[str] = None, + safety_categories: Optional[List[str]] = None, use_perspective_api: bool = False, - perspective_api_key: str = None, + perspective_api_key: Optional[str] = None, 
return_confidence: bool = True, detailed_categories: bool = True, context_aware: bool = True, -) -> Dict[str, float]: +) -> Dict[str, Any]: """ Evaluate safety of generated text with enhanced detection methods. @@ -1343,8 +1343,8 @@ def safety_score( } safety_scores = [] - category_scores = {cat: [] for cat in safety_categories} - violation_details = [] + category_scores: Dict[str, List[float]] = {cat: [] for cat in safety_categories} + violation_details: List[List[str]] = [] for i, text in enumerate(predictions): if not text.strip(): @@ -1484,15 +1484,15 @@ def _check_keyword_in_context( class MetricCollection: """Collection of evaluation metrics that can be run together.""" - def __init__(self): - self.metrics = {} + def __init__(self) -> None: + self.metrics: Dict[str, Tuple[Callable[..., Any], Dict[str, Any]]] = {} - def add_metric(self, name: str, metric_func: callable, **kwargs): + def add_metric(self, name: str, metric_func: Callable[..., Any], **kwargs: Any) -> None: """Add a metric to the collection.""" self.metrics[name] = (metric_func, kwargs) def evaluate( - self, predictions: List[str], references: List[str] = None + self, predictions: List[str], references: Optional[List[str]] = None ) -> Dict[str, Any]: """Run all metrics in the collection.""" results = {} From 7d2b93cfad9cff83c7e8169b8e9c834e68474ea2 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Tue, 2 Dec 2025 22:42:34 +0530 Subject: [PATCH 08/24] fix: Configure mypy to skip google.generativeai type checking --- GEMINI.md | 1 + benchwise/config.py | 6 +- mypy.ini | 1 + mypy_baseline.txt | Bin 0 -> 30682 bytes test_single_doc_file.py | 424 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 429 insertions(+), 3 deletions(-) create mode 100644 GEMINI.md create mode 100644 mypy_baseline.txt create mode 100644 test_single_doc_file.py diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 0000000..d0892af --- /dev/null +++ b/GEMINI.md @@ -0,0 +1 @@ +Follow CLAUDE.md diff --git 
a/benchwise/config.py b/benchwise/config.py index 4d7c74b..10b07f6 100644 --- a/benchwise/config.py +++ b/benchwise/config.py @@ -442,8 +442,8 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]: try: import openai - client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - client.models.list() + openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + openai_client.models.list() results["openai"] = True except Exception: results["openai"] = False @@ -452,7 +452,7 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]: try: import anthropic - client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) # Note: Anthropic doesn't have a simple test endpoint results["anthropic"] = True # Assume valid if key exists except Exception: diff --git a/mypy.ini b/mypy.ini index 6aa31f6..1ccb1bb 100644 --- a/mypy.ini +++ b/mypy.ini @@ -57,6 +57,7 @@ ignore_missing_imports = True [mypy-google.generativeai.*] ignore_missing_imports = True +follow_imports = skip [mypy-fuzzywuzzy.*] ignore_missing_imports = True diff --git a/mypy_baseline.txt b/mypy_baseline.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d9e1f7460f915c81b2bd4dd3cbe892c25259929 GIT binary patch literal 30682 zcmeI1PjA~c6u^7#ryz7nf!MKZId;6YL(r|j4(*DyI~j&SOSH{KA_a;{s$d^}kCZLS znH4K>()93pD3VBieEhxt&#zwLKO}$zkN^@u0!RP}AOR$R1dsp{Kmter2_OL^fCP{L z56karKFmWKSd~$lxn_+8Bt8%FR7^efeV=bqMR?e3q>J=WIA7D1Y$NVy4KZ5Le&*h;2JzhrP5m$ zY;cfa;LOr0XF4)+@eT6gZiaMEOJr@+<8ZV$h4=@|^i(P2{K8Rq544GvWD~k%xea=6 zsie&yT2U@o{P3xB4y+||Ewu+$OKQRfXm}G1zMF6E;RYKEZxVI$I;k%!lhc}2^4DL8-MU=To=A!Bz;xHdS$co0;%Vt>)GCE`xmmV%i~kU;GwaDdf4+jYR^Iy4@+ zI`o5%yr_9xbKx%27!`W1*m8a&RRp#BWDORF25Ha33CWnA{%Qcdn~}Gq+wIyc)T)!| z4SE&$W|d5939vz>2R2H*z@@pD^Df^X9G`2p;dWHc@%uxkNvra+6I+!I^(MWHv6gMQ z-#>Q--%$V}h32ugz1KFFpfRRPri*z2b#-tpT?8g7z$MTc??x4^FoFi52`d;E9HlJiKlAk{QlVWGGAb@ 
zWwwk%H=PT`D1?LA3Xo~-x26Cb1g;X_QxM90P&~9A3<8(_nx$WJ8QbA}9x+XMT&sWo zdo(FG1&=I$!7~rLzCEA}6XTg)=N+-hlJKQ#v?Iwf6HD94nwFcJhO89SI=20d%Vzy^ zw`JPt%$pk{(Al+w`wzG)+3Zk`YR2}T=`C^|!=@_hn$&5080L+V6s(?jM4$lCvILpM z4gBT49ZqP ztPN$)cgHa1c9s3$q;p?7+X}5kW%CVS^UL%c_7_XOdZ>k}gWR0cd{C-jh&%3}LCp)= z<>nTc21B=qcru|uZNIZ$=p)M126qs+IDT^3<3oFaBlNy-F7#|`V34a=eLW{jt1X=NYZ{cJTocbYZ_ zSsBMUpo9_Vk>NZ%cWr9(Nm(Jnxm+$|SdKpWAlGW$bI@Ae%wa38b8w1>EnK(gMs3g~ vu;y)dzz3O&i1Y{N<=2m9YfpTiEO`u_>`Az0;T4SEexGD8<%X<$@aFY@U*SAf literal 0 HcmV?d00001 diff --git a/test_single_doc_file.py b/test_single_doc_file.py new file mode 100644 index 0000000..5420d7b --- /dev/null +++ b/test_single_doc_file.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python3 +""" +Test all code examples from a documentation file with REAL models. + +This script extracts all Python code blocks from a documentation markdown file +and runs each one as a separate test with real OpenAI and Google models. + +The script can find documentation files in multiple ways: +- Absolute path: /path/to/file.md +- Relative to project root: docs/docs/examples/classification.md +- Just filename (searches docs/ tree): classification.md + +Usage: + # Using just filename (searches in docs/ directory) + python test_single_doc_file.py classification.md + + # Using relative path from project root + python test_single_doc_file.py docs/docs/examples/classification.md + python test_single_doc_file.py README.md + python test_single_doc_file.py docs/docs/getting-started/quickstart.md + + # Syntax check only (no API calls) + python test_single_doc_file.py --syntax-only classification.md + + # Save test results to files + python test_single_doc_file.py --save-results classification.md +""" + +import re +import subprocess +import sys +import tempfile +import time +from pathlib import Path +from typing import List, Tuple + + +def extract_code_blocks(markdown_file: Path) -> List[Tuple[str, int, int]]: + """ + Extract all Python code blocks from a markdown file. 
+ Returns list of (code, block_number, line_number) tuples. + """ + with open(markdown_file, 'r', encoding='utf-8') as f: + content = f.read() + + pattern = r'```python\n(.*?)```' + matches = re.finditer(pattern, content, re.DOTALL) + + code_blocks = [] + for i, match in enumerate(matches, 1): + code = match.group(1) + line_number = content[:match.start()].count('\n') + 1 + code_blocks.append((code, i, line_number)) + + return code_blocks + + +def prepare_code_for_real_models(code: str) -> str: + """ + Replace model names with real OpenAI and Google models. + Ensures we use exactly 2 models: gpt-3.5-turbo and gemini-2.5-flash + """ + import re + + # Find all @evaluate decorators and replace models to ensure diversity + def replace_evaluate_models(match): + decorator = match.group(0) + + # Extract the content inside @evaluate(...) + content = re.search(r'@evaluate\((.*)\)', decorator, re.DOTALL) + if not content: + return decorator + + params = content.group(1) + + # Split by comma, but be careful with nested structures + # Extract all quoted strings (model names) + model_pattern = r'"([^"]+)"' + models = re.findall(model_pattern, params) + + if not models: + return decorator + + # Always use exactly 2 models: gpt-3.5-turbo and gemini-2.5-flash + # Take first N models and replace them, but cap at 2 + num_models = min(len(models), 2) + new_models = ['"gpt-3.5-turbo"', '"gemini-2.5-flash"'][:num_models] + + # If there was only 1 model originally, keep it as 1 model + if len(models) == 1: + new_models = ['"gpt-3.5-turbo"'] + + # Find any kwargs (parameters with =) + # Split params and identify non-string parts (kwargs) + kwargs = [] + # Remove all quoted strings and see what's left + params_without_strings = re.sub(r'"[^"]*"', '', params) + if '=' in params_without_strings: + # Extract kwargs + kwargs_match = re.search(r',?\s*(\w+\s*=\s*[^,)]+(?:,\s*\w+\s*=\s*[^,)]+)*)\s*$', params) + if kwargs_match: + kwargs.append(kwargs_match.group(1)) + + # Reconstruct the 
decorator + result = '@evaluate(' + ', '.join(new_models) + if kwargs: + result += ', ' + ', '.join(kwargs) + result += ')' + + return result + + # Replace all @evaluate decorators + modified_code = re.sub(r'@evaluate\([^)]+\)', replace_evaluate_models, code) + + # Replace placeholder dataset loading with actual datasets + if 'load_dataset("data/qa_1000.json")' in modified_code: + # Add import if not present + if 'from benchwise' in modified_code and 'create_qa_dataset' not in modified_code: + modified_code = modified_code.replace( + 'from benchwise import', + 'from benchwise import create_qa_dataset,' + ) + modified_code = modified_code.replace( + 'load_dataset("data/qa_1000.json")', + 'create_qa_dataset(questions=["What is AI?", "What is ML?"], answers=["Artificial Intelligence", "Machine Learning"], name="qa_test")' + ) + + if 'load_dataset("data/news_articles.json")' in modified_code: + # Add import if not present + if 'from benchwise' in modified_code and 'create_summarization_dataset' not in modified_code: + modified_code = modified_code.replace( + 'from benchwise import', + 'from benchwise import create_summarization_dataset,' + ) + modified_code = modified_code.replace( + 'load_dataset("data/news_articles.json")', + 'create_summarization_dataset(documents=["Article about AI.", "Article about ML."], summaries=["AI summary", "ML summary"], name="news")' + ) + + return modified_code + + +def check_syntax(code: str) -> Tuple[bool, str]: + """Check if Python code has valid syntax.""" + import ast + try: + ast.parse(code) + return True, None + except SyntaxError as e: + return False, f"SyntaxError at line {e.lineno}: {e.msg}" + except Exception as e: + return False, f"Parse error: {str(e)}" + + +def run_code_sync(code: str, timeout: int = 90) -> Tuple[bool, str, str]: + """Run code in subprocess and capture output.""" + try: + # Create temp file + with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: + f.write(code) + temp_file = f.name + + 
# Run in subprocess + result = subprocess.run( + ['python', temp_file], + capture_output=True, + text=True, + timeout=timeout, + cwd=Path(__file__).parent + ) + + # Cleanup + import os + os.unlink(temp_file) + + output = result.stdout + error = result.stderr + + if result.returncode == 0: + return True, output, None + else: + return False, output, error + + except subprocess.TimeoutExpired: + return False, "", f"Timeout after {timeout}s" + except Exception as e: + return False, "", f"Error: {str(e)}" + + +def test_code_block(code: str, block_num: int, line_num: int, syntax_only: bool = False) -> Tuple[bool, str]: + """Test a single code block.""" + # Check syntax + syntax_valid, syntax_error = check_syntax(code) + if not syntax_valid: + print(f"โŒ SYNTAX ERROR") + return False, f"Syntax Error: {syntax_error}" + + if syntax_only: + print(f"โœ… SYNTAX VALID") + return True, None + + # Prepare code with real models + prepared_code = prepare_code_for_real_models(code) + + # Skip incomplete examples (just function definitions without execution) + if '@evaluate(' in prepared_code and 'asyncio.run' not in prepared_code: + print(f"โญ๏ธ SKIPPED (incomplete example - defines functions only)") + return True, "Skipped: Incomplete example" + + # Run the code + print(f"โณ Running test...", end=" ", flush=True) + start_time = time.time() + success, output, error = run_code_sync(prepared_code, timeout=90) + duration = time.time() - start_time + + if success: + print(f"โœ… PASSED ({duration:.2f}s)") + return True, output + else: + print(f"โŒ FAILED ({duration:.2f}s)") + return False, error or output + + +def main(): + import argparse + import json + from datetime import datetime + + parser = argparse.ArgumentParser(description="Test Python code examples from a documentation file") + parser.add_argument('file', help='Documentation file to test. 
Can be:\n' + ' - Relative path from project root (e.g., docs/docs/examples/classification.md)\n' + ' - Absolute path (e.g., /path/to/file.md)\n' + ' - Just filename (will search in docs/ directory tree)') + parser.add_argument('--syntax-only', action='store_true', help='Only check syntax') + parser.add_argument('--save-results', action='store_true', help='Save test results to files') + args = parser.parse_args() + + # Find the documentation file + project_root = Path(__file__).parent + file_arg = Path(args.file) + + # Try different strategies to find the file + doc_file = None + + # Strategy 1: Absolute path + if file_arg.is_absolute() and file_arg.exists(): + doc_file = file_arg + + # Strategy 2: Relative to project root + elif (project_root / file_arg).exists(): + doc_file = project_root / file_arg + + # Strategy 3: Search in docs directory tree + else: + docs_dir = project_root / 'docs' + if docs_dir.exists(): + # Search for the file in docs directory tree + for candidate in docs_dir.rglob(file_arg.name if file_arg.name else args.file): + if candidate.is_file(): + doc_file = candidate + break + + if doc_file is None or not doc_file.exists(): + print(f"โŒ Error: File not found: {args.file}") + print(f"\nSearched in:") + print(f" - Absolute path: {file_arg if file_arg.is_absolute() else 'N/A'}") + print(f" - Relative to project: {project_root / file_arg}") + print(f" - In docs/ directory tree") + return 1 + + # Get relative path for display + try: + display_path = doc_file.relative_to(project_root) + except ValueError: + display_path = doc_file + + print(f"\n๐Ÿงช Testing Documentation Examples") + print(f"๐Ÿ“„ File: {display_path}") + + if args.syntax_only: + print("โš™๏ธ Mode: Syntax check only") + else: + print("โš™๏ธ Mode: Full execution with REAL models") + print("๐Ÿค– Models: gpt-3.5-turbo, gemini-2.5-flash") + print("โš ๏ธ Note: This will make actual API calls and incur costs") + + # Extract code blocks + code_blocks = extract_code_blocks(doc_file) + + 
if not code_blocks: + print(f"\nโŒ No Python code blocks found in {args.file}") + return 1 + + print(f"๐Ÿ“ Total code blocks: {len(code_blocks)}\n") + print("=" * 80) + + # Test each code block + results = [] + for code, block_num, line_num in code_blocks: + print(f"\n{'=' * 80}") + print(f"TEST {block_num}/{len(code_blocks)}: Block {block_num} (Line {line_num})") + print("=" * 80) + + success, output_or_error = test_code_block(code, block_num, line_num, args.syntax_only) + results.append((block_num, success, output_or_error)) + + # Show output + if success and output_or_error and output_or_error.strip() and not args.syntax_only: + print("\n๐Ÿ“‹ OUTPUT:") + print("-" * 80) + output_lines = output_or_error.strip().split('\n') + for line in output_lines[:50]: # Show first 50 lines + print(f" {line}") + if len(output_lines) > 50: + print(f" ... ({len(output_lines) - 50} more lines)") + print("-" * 80) + elif not success and output_or_error: + print("\nโŒ ERROR:") + print("-" * 80) + error_lines = output_or_error.split('\n') + for line in error_lines[:40]: # Show first 40 lines + print(f" {line}") + if len(error_lines) > 40: + print(f" ... 
({len(error_lines) - 40} more lines)") + print("-" * 80) + + # Summary + print(f"\n{'=' * 80}") + print("SUMMARY") + print("=" * 80) + + total = len(results) + passed = sum(1 for _, success, _ in results if success) + failed = total - passed + + print(f"\nFile: {display_path}") + print(f"Total: {total} code blocks") + print(f"โœ… Passed: {passed}") + print(f"โŒ Failed: {failed}") + print(f"Success Rate: {passed/total*100:.1f}%") + + # Show failures + if failed > 0: + print(f"\n{'-' * 80}") + print("FAILED TESTS") + print("-" * 80) + for block_num, success, output_or_error in results: + if not success: + print(f"\nโŒ Block {block_num}") + if output_or_error: + print(f" {output_or_error[:200]}") + + print(f"\n{'=' * 80}\n") + + # Save results if requested + if args.save_results: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + # Create a clean base name from the file path + base_name = doc_file.stem # Gets filename without extension + + # Create results directory + results_dir = Path(__file__).parent / 'test_results' + results_dir.mkdir(exist_ok=True) + + # Save JSON results (detailed) + json_file = results_dir / f"{base_name}_{timestamp}.json" + json_data = { + "file": str(display_path), + "full_path": str(doc_file), + "timestamp": datetime.now().isoformat(), + "total": total, + "passed": passed, + "failed": failed, + "success_rate": passed/total*100, + "syntax_only": args.syntax_only, + "results": [ + { + "block": block_num, + "success": success, + "output": output_or_error[:500] if output_or_error else None, # Truncate long outputs + } + for block_num, success, output_or_error in results + ] + } + + with open(json_file, 'w') as f: + json.dump(json_data, f, indent=2) + + print(f"๐Ÿ’พ JSON results saved to: {json_file}") + + # Save Markdown summary + md_file = results_dir / f"{base_name}_{timestamp}.md" + with open(md_file, 'w') as f: + f.write(f"# Test Results: {display_path}\n\n") + f.write(f"**File:** `{doc_file}`\n\n") + f.write(f"**Date:** 
{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") + f.write(f"**Models:** gpt-3.5-turbo, gemini-2.5-flash\n\n") + f.write(f"## Summary\n\n") + f.write(f"- Total Tests: {total}\n") + f.write(f"- โœ… Passed: {passed}\n") + f.write(f"- โŒ Failed: {failed}\n") + f.write(f"- Success Rate: {passed/total*100:.1f}%\n\n") + + if failed > 0: + f.write(f"## Failed Tests\n\n") + for block_num, success, output_or_error in results: + if not success: + f.write(f"### Block {block_num}\n\n") + f.write(f"```\n{output_or_error[:300] if output_or_error else 'No error details'}\n```\n\n") + + print(f"๐Ÿ“ Markdown summary saved to: {md_file}") + + # Save to latest file (overwrite) + latest_json = results_dir / f"{base_name}_latest.json" + with open(latest_json, 'w') as f: + json.dump(json_data, f, indent=2) + + print(f"๐Ÿ“Œ Latest results: {latest_json}") + + return 0 if failed == 0 else 1 + + +if __name__ == '__main__': + sys.exit(main()) From 2a08836689974bd617ccd3a90ed46c4b8daed646 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Tue, 2 Dec 2025 23:10:25 +0530 Subject: [PATCH 09/24] fix: Add proper type annotations to config, models, results, core, and cli --- benchwise/cli.py | 18 +++++++++--------- benchwise/core.py | 16 ++++++++-------- benchwise/metrics.py | 2 +- benchwise/results.py | 26 +++++++++++++------------- mypy.ini | 1 + 5 files changed, 32 insertions(+), 31 deletions(-) diff --git a/benchwise/cli.py b/benchwise/cli.py index e4905bc..a799987 100644 --- a/benchwise/cli.py +++ b/benchwise/cli.py @@ -5,7 +5,7 @@ import argparse import asyncio import sys -from typing import List, Optional +from typing import List, Optional, Any, Dict from . 
import __version__ from .datasets import load_dataset @@ -206,7 +206,7 @@ async def run_evaluation( for metric_name in metrics: try: if metric_name == "accuracy": - metric_result = accuracy(responses, references) + metric_result: Any = accuracy(responses, references) results["accuracy"] = metric_result["accuracy"] elif metric_name == "rouge_l": metric_result = rouge_l(responses, references) @@ -285,7 +285,7 @@ async def run_evaluation( return benchmark_result -async def configure_api(args): +async def configure_api(args: Any) -> None: """Configure Benchwise API settings.""" from .config import reset_config @@ -321,7 +321,7 @@ async def configure_api(args): print("No configuration changes specified. Use --show to see current config.") -async def sync_offline(args): +async def sync_offline(args: Any) -> None: """Sync offline results with the API.""" try: client = await get_client() @@ -354,7 +354,7 @@ async def sync_offline(args): pass -async def show_status(args): +async def show_status(args: Any) -> None: """Show Benchwise status information.""" config = get_api_config() client = None @@ -412,7 +412,7 @@ async def show_status(args): pass -def list_resources(resource_type: str): +def list_resources(resource_type: str) -> None: """List available resources.""" if resource_type == "models": print("Available model adapters:") @@ -440,7 +440,7 @@ def list_resources(resource_type: str): ) -def validate_dataset(dataset_path: str): +def validate_dataset(dataset_path: str) -> None: """Validate dataset format.""" try: dataset = load_dataset(dataset_path) @@ -478,7 +478,7 @@ def validate_dataset(dataset_path: str): sys.exit(1) -async def compare_results(result_paths: List[str], metric: Optional[str] = None): +async def compare_results(result_paths: List[str], metric: Optional[str] = None) -> None: """Compare evaluation results.""" from .results import load_results, ResultsAnalyzer @@ -509,7 +509,7 @@ async def compare_results(result_paths: List[str], metric: Optional[str] = 
None) sys.exit(1) -def main(): +def main() -> None: """Main CLI entry point.""" parser = create_parser() args = parser.parse_args() diff --git a/benchwise/core.py b/benchwise/core.py index b682cb0..fe34af5 100644 --- a/benchwise/core.py +++ b/benchwise/core.py @@ -41,17 +41,17 @@ async def test_qa(model, dataset): return accuracy(responses, dataset.references) """ - def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]]: + def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[..., Awaitable[List[EvaluationResult]]]: if not inspect.iscoroutinefunction(test_func): raise TypeError( f"{test_func.__name__} must be an async function. " f"Use: async def {test_func.__name__}(model, dataset):" ) - + @wraps(test_func) - async def wrapper(dataset: Dataset, **test_kwargs) -> List[EvaluationResult]: + async def wrapper(dataset: Dataset, **test_kwargs: Any) -> List[EvaluationResult]: return await _run_evaluation(test_func, dataset, models, upload, kwargs, test_kwargs) - + if hasattr(test_func, "_benchmark_metadata"): wrapper._benchmark_metadata = test_func._benchmark_metadata # type: ignore[attr-defined] @@ -154,7 +154,7 @@ def decorator(test_func: Callable[P, R]) -> Callable[P, R]: return decorator -def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, Exception]]]]]: +def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, BaseException]]]]]: """ Decorator for stress testing LLMs. 
@@ -166,12 +166,12 @@ async def load_test(model, dataset): pass """ - def decorator(test_func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[List[Union[R, Exception]]]]: + def decorator(test_func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[List[Union[R, BaseException]]]]: @wraps(test_func) - async def wrapper(*args: P.args, **kwargs: P.kwargs) -> List[Union[R, Exception]]: + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> List[Union[R, BaseException]]: logger.info(f"Starting stress test: {concurrent_requests} concurrent requests for {duration}s") - tasks: List[Union[R, Exception]] = [] + tasks: List[Union[R, BaseException]] = [] start_time = time.time() while time.time() - start_time < duration: diff --git a/benchwise/metrics.py b/benchwise/metrics.py index 25ceb3a..b63ff67 100644 --- a/benchwise/metrics.py +++ b/benchwise/metrics.py @@ -697,7 +697,7 @@ def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, An "transformers and torch packages not installed. Please install them with: pip install 'benchwise[transformers]' or pip install transformers torch" ) - tokenizer = AutoTokenizer.from_pretrained(model_name) # type: ignore[no-untyped-call] + tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name) perplexities = [] diff --git a/benchwise/results.py b/benchwise/results.py index 397a8b8..330a73e 100644 --- a/benchwise/results.py +++ b/benchwise/results.py @@ -57,7 +57,7 @@ def to_dict(self) -> Dict[str, Any]: "success": self.success, } - def get_score(self, metric_name: str = None) -> Union[float, Any]: + def get_score(self, metric_name: Optional[str] = None) -> Union[float, Any]: """ Extract a specific score from the result. 
@@ -93,7 +93,7 @@ class BenchmarkResult: metadata: Dict[str, Any] = field(default_factory=dict) timestamp: datetime = field(default_factory=datetime.now) - def add_result(self, result: EvaluationResult): + def add_result(self, result: EvaluationResult) -> None: """Add an evaluation result to the benchmark.""" self.results.append(result) @@ -119,7 +119,7 @@ def success_rate(self) -> float: return 0.0 return len(self.successful_results) / len(self.results) - def get_best_model(self, metric_name: str = None) -> Optional[EvaluationResult]: + def get_best_model(self, metric_name: Optional[str] = None) -> Optional[EvaluationResult]: """ Get the best performing model result. @@ -135,7 +135,7 @@ def get_best_model(self, metric_name: str = None) -> Optional[EvaluationResult]: return max(successful_results, key=lambda r: r.get_score(metric_name) or 0) - def get_worst_model(self, metric_name: str = None) -> Optional[EvaluationResult]: + def get_worst_model(self, metric_name: Optional[str] = None) -> Optional[EvaluationResult]: """ Get the worst performing model result. @@ -153,7 +153,7 @@ def get_worst_model(self, metric_name: str = None) -> Optional[EvaluationResult] successful_results, key=lambda r: r.get_score(metric_name) or float("inf") ) - def compare_models(self, metric_name: str = None) -> Dict[str, Any]: + def compare_models(self, metric_name: Optional[str] = None) -> Dict[str, Any]: """ Compare all models in the benchmark. 
@@ -241,12 +241,12 @@ def to_dataframe(self) -> pd.DataFrame: return pd.DataFrame(data) - def save_to_json(self, file_path: Union[str, Path]): + def save_to_json(self, file_path: Union[str, Path]) -> None: """Save benchmark results to JSON file.""" with open(file_path, "w") as f: json.dump(self.to_dict(), f, indent=2, default=str) - def save_to_csv(self, file_path: Union[str, Path]): + def save_to_csv(self, file_path: Union[str, Path]) -> None: """Save benchmark results to CSV file.""" df = self.to_dataframe() df.to_csv(file_path, index=False) @@ -257,7 +257,7 @@ class ResultsAnalyzer: @staticmethod def compare_benchmarks( - benchmark_results: List[BenchmarkResult], metric_name: str = None + benchmark_results: List[BenchmarkResult], metric_name: Optional[str] = None ) -> Dict[str, Any]: """ Compare results across multiple benchmarks. @@ -269,7 +269,7 @@ def compare_benchmarks( Returns: Dictionary with cross-benchmark comparison """ - comparison = {"benchmarks": [], "models": set(), "cross_benchmark_scores": {}} + comparison: Dict[str, Any] = {"benchmarks": [], "models": set(), "cross_benchmark_scores": {}} for benchmark in benchmark_results: benchmark_info = { @@ -300,7 +300,7 @@ def compare_benchmarks( @staticmethod def analyze_model_performance( - results: List[EvaluationResult], metric_name: str = None + results: List[EvaluationResult], metric_name: Optional[str] = None ) -> Dict[str, Any]: """ Analyze performance of a single model across multiple evaluations. 
@@ -480,7 +480,7 @@ def _get_cache_key(self, model_name: str, test_name: str, dataset_hash: str) -> key_data = f"{model_name}_{test_name}_{dataset_hash}" return hashlib.md5(key_data.encode()).hexdigest() - def save_result(self, result: EvaluationResult, dataset_hash: str): + def save_result(self, result: EvaluationResult, dataset_hash: str) -> None: """Save evaluation result to cache.""" cache_key = self._get_cache_key( result.model_name, result.test_name, dataset_hash @@ -516,7 +516,7 @@ def load_result( except Exception: return None - def clear_cache(self): + def clear_cache(self) -> None: """Clear all cached results.""" for cache_file in self.cache_dir.glob("*.json"): cache_file.unlink() @@ -546,7 +546,7 @@ def list_cached_results(self) -> List[Dict[str, Any]]: def save_results( benchmark_result: BenchmarkResult, file_path: Union[str, Path], format: str = "json" -): +) -> None: """ Save benchmark results to file. diff --git a/mypy.ini b/mypy.ini index 1ccb1bb..23d3a7c 100644 --- a/mypy.ini +++ b/mypy.ini @@ -34,6 +34,7 @@ ignore_missing_imports = True [mypy-transformers.*] ignore_missing_imports = True +follow_imports = skip [mypy-torch.*] ignore_missing_imports = True From a7a2a2ed58568262f52704823b819f7598b3b16c Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Tue, 2 Dec 2025 23:44:03 +0530 Subject: [PATCH 10/24] fix: Complete type annotations for datasets.py and client.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All mypy errors resolved (82 → 0). Fixed type narrowing, cast() calls for response.json(), and missing return type annotations. Tests: 124/126 passing.
--- benchwise/client.py | 83 +++++++++++++++++++++++-------------------- benchwise/datasets.py | 62 +++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 59 deletions(-) diff --git a/benchwise/client.py b/benchwise/client.py index fc0156a..00a9651 100644 --- a/benchwise/client.py +++ b/benchwise/client.py @@ -2,7 +2,7 @@ import asyncio import uuid import logging -from typing import Dict, Any, Optional, List +from typing import Dict, Any, Optional, List, cast from datetime import datetime from contextvars import ContextVar @@ -19,7 +19,7 @@ class BenchwiseAPIError(Exception): """Enhanced exception with error codes and retry info.""" - def __init__(self, message: str, status_code: int = None, retry_after: int = None, request_id: str = None): + def __init__(self, message: str, status_code: Optional[int] = None, retry_after: Optional[int] = None, request_id: Optional[str] = None): super().__init__(message) self.status_code = status_code self.retry_after = retry_after @@ -56,21 +56,21 @@ def __init__(self, api_url: Optional[str] = None, api_key: Optional[str] = None) self.benchmark_cache: Dict[str, int] = {} # Offline queue for storing results when API is unavailable - self.offline_queue = [] + self.offline_queue: List[Dict[str, Any]] = [] self.offline_mode = False # Track if client is closed self._closed = False - + logger.debug(f"BenchwiseClient initialized with API URL: {self.api_url}") - async def __aenter__(self): + async def __aenter__(self) -> "BenchwiseClient": return self - async def __aexit__(self, exc_type, exc_val, exc_tb): + async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: await self.close() - async def close(self): + async def close(self) -> None: """Close the HTTP client.""" if not self._closed: await self.client.aclose() @@ -78,7 +78,7 @@ async def close(self): logger.debug("BenchwiseClient closed") async def _make_request_with_retry( - self, method: str, url: str, **kwargs + self, method: str, url: str, 
**kwargs: Any ) -> httpx.Response: """Make HTTP request with automatic retry logic and request ID tracking.""" max_retries = 3 @@ -138,7 +138,7 @@ async def _make_request_with_retry( raise BenchwiseAPIError("Max retries exceeded", request_id=request_id) - def _set_auth_header(self): + def _set_auth_header(self) -> None: """Set JWT authorization header if token is available.""" if self.jwt_token: self.client.headers["Authorization"] = f"Bearer {self.jwt_token}" @@ -225,7 +225,7 @@ async def register( if response.status_code == 201: logger.info(f"Registration successful for user: {username}") - return response.json() + return cast(Dict[str, Any], response.json()) elif response.status_code == 400: error_detail = response.json().get("detail", "Registration failed") logger.error(f"Registration failed: {error_detail}") @@ -252,7 +252,7 @@ async def get_current_user(self) -> Dict[str, Any]: response = await self.client.get("/api/v1/users/me") if response.status_code == 200: - return response.json() + return cast(Dict[str, Any], response.json()) elif response.status_code == 401: logger.warning("Authentication expired") raise BenchwiseAPIError("Authentication expired - please login again") @@ -322,8 +322,8 @@ async def register_model( response = await self.client.post("/api/v1/models", json=model_data) if response.status_code == 201: - model_info = response.json() - model_db_id = model_info["id"] + model_info = cast(Dict[str, Any], response.json()) + model_db_id = cast(int, model_info["id"]) self.model_cache[cache_key] = model_db_id logger.info(f"Model registered successfully with ID: {model_db_id}") return model_db_id @@ -349,14 +349,15 @@ async def _get_existing_model(self, provider: str, model_id: str) -> int: ) if response.status_code == 200: - models = response.json() + models = cast(List[Dict[str, Any]], response.json()) # Filter in Python since backend doesn't support model_id parameter for model in models: if model["provider"] == provider and model["model_id"] == 
model_id: cache_key = f"{provider}:{model_id}" - self.model_cache[cache_key] = model["id"] - logger.debug(f"Found existing model with ID: {model['id']}") - return model["id"] + model_id_value = cast(int, model["id"]) + self.model_cache[cache_key] = model_id_value + logger.debug(f"Found existing model with ID: {model_id_value}") + return model_id_value raise BenchwiseAPIError(f"Model {provider}:{model_id} not found") else: @@ -410,8 +411,8 @@ async def register_benchmark( response = await self.client.post("/api/v1/benchmarks", json=benchmark_data) if response.status_code == 201: - benchmark_info = response.json() - benchmark_db_id = benchmark_info["id"] + benchmark_info = cast(Dict[str, Any], response.json()) + benchmark_db_id = cast(int, benchmark_info["id"]) self.benchmark_cache[benchmark_name] = benchmark_db_id logger.info(f"Benchmark registered successfully with ID: {benchmark_db_id}") return benchmark_db_id @@ -437,20 +438,22 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int: ) if response.status_code == 200: - benchmarks = response.json() + benchmarks = cast(List[Dict[str, Any]], response.json()) # Look for exact name match first, then partial match for benchmark in benchmarks: if benchmark["name"] == benchmark_name: - self.benchmark_cache[benchmark_name] = benchmark["id"] - logger.debug(f"Found existing benchmark with ID: {benchmark['id']}") - return benchmark["id"] + benchmark_id_value = cast(int, benchmark["id"]) + self.benchmark_cache[benchmark_name] = benchmark_id_value + logger.debug(f"Found existing benchmark with ID: {benchmark_id_value}") + return benchmark_id_value # If no exact match, try partial match for benchmark in benchmarks: if benchmark_name.lower() in benchmark["name"].lower(): - self.benchmark_cache[benchmark_name] = benchmark["id"] - logger.debug(f"Found similar benchmark with ID: {benchmark['id']}") - return benchmark["id"] + benchmark_id_value = cast(int, benchmark["id"]) + self.benchmark_cache[benchmark_name] = 
benchmark_id_value + logger.debug(f"Found similar benchmark with ID: {benchmark_id_value}") + return benchmark_id_value raise BenchwiseAPIError(f"Benchmark {benchmark_name} not found") else: @@ -466,7 +469,7 @@ async def create_evaluation( name: str, benchmark_id: int, model_ids: List[int], - metadata: Optional[Dict] = None, + metadata: Optional[Dict[str, Any]] = None, ) -> int: """ Create evaluation with correct backend format. @@ -495,9 +498,10 @@ async def create_evaluation( ) if response.status_code == 201: - evaluation_info = response.json() - logger.info(f"Evaluation created successfully with ID: {evaluation_info['id']}") - return evaluation_info["id"] + evaluation_info = cast(Dict[str, Any], response.json()) + evaluation_id = cast(int, evaluation_info["id"]) + logger.info(f"Evaluation created successfully with ID: {evaluation_id}") + return evaluation_id elif response.status_code == 401: raise BenchwiseAPIError( "Authentication required for creating evaluations" @@ -709,7 +713,7 @@ async def get_benchmarks( ) if response.status_code == 200: - return response.json() + return cast(List[Dict[str, Any]], response.json()) else: raise BenchwiseAPIError( f"Failed to retrieve benchmarks: {response.status_code}" @@ -728,7 +732,7 @@ async def get_evaluations( ) if response.status_code == 200: - return response.json() + return cast(List[Dict[str, Any]], response.json()) else: raise BenchwiseAPIError( f"Failed to retrieve evaluations: {response.status_code}" @@ -737,7 +741,7 @@ async def get_evaluations( except httpx.RequestError as e: raise BenchwiseAPIError(f"Network error retrieving evaluations: {e}") - async def _add_to_offline_queue(self, data: Dict[str, Any]): + async def _add_to_offline_queue(self, data: Dict[str, Any]) -> None: """Add data to offline queue for later upload.""" self.offline_queue.append( {"data": data, "timestamp": datetime.now().isoformat()} @@ -818,9 +822,10 @@ async def upload_dataset_for_benchmark( ) if response.status_code == 200: - result = 
response.json() + result = cast(Dict[str, Any], response.json()) + file_url = cast(str, result["file_info"]["url"]) logger.info("Dataset uploaded successfully") - return result["file_info"]["url"] + return file_url else: raise BenchwiseAPIError( f"Failed to upload dataset: {response.status_code}" @@ -859,8 +864,8 @@ async def create_benchmark_with_dataset( f"Failed to create benchmark: {response.status_code}" ) - benchmark = response.json() - benchmark_id = benchmark["id"] + benchmark = cast(Dict[str, Any], response.json()) + benchmark_id = cast(int, benchmark["id"]) # 2. Upload dataset try: @@ -899,10 +904,10 @@ async def get_client() -> BenchwiseClient: return client -async def close_client(): +async def close_client() -> None: """Close the context-local client.""" client = _client_context.get() - + if client and not client._closed: try: await client.close() diff --git a/benchwise/datasets.py b/benchwise/datasets.py index b4fca30..2d8d1eb 100644 --- a/benchwise/datasets.py +++ b/benchwise/datasets.py @@ -124,13 +124,13 @@ def split( train_dataset = Dataset( name=f"{self.name}_train", data=train_data, - metadata={**self.metadata, "split": "train", "train_ratio": train_ratio}, + metadata={**(self.metadata or {}), "split": "train", "train_ratio": train_ratio}, ) test_dataset = Dataset( name=f"{self.name}_test", data=test_data, - metadata={**self.metadata, "split": "test", "train_ratio": train_ratio}, + metadata={**(self.metadata or {}), "split": "test", "train_ratio": train_ratio}, ) return train_dataset, test_dataset @@ -154,7 +154,7 @@ def to_json(self, file_path: Optional[str] = None) -> str: return json_data - def to_csv(self, file_path: str): + def to_csv(self, file_path: str) -> None: """Export dataset to CSV format.""" df = pd.DataFrame(self.data) df.to_csv(file_path, index=False) @@ -175,14 +175,15 @@ def validate_schema(self) -> bool: def get_statistics(self) -> Dict[str, Any]: """Get dataset statistics.""" - stats = { + fields: List[str] = 
list(self.data[0].keys()) if self.data else [] + stats: Dict[str, Any] = { "size": self.size, - "fields": list(self.data[0].keys()) if self.data else [], + "fields": fields, "metadata": self.metadata, } if self.data: - for field in stats["fields"]: + for field in fields: values = [item.get(field) for item in self.data if field in item] if values: if all(isinstance(v, str) for v in values): @@ -190,14 +191,16 @@ def get_statistics(self) -> Dict[str, Any]: len(str(v)) for v in values ) / len(values) elif all(isinstance(v, (int, float)) for v in values): - stats[f"{field}_mean"] = sum(values) / len(values) - stats[f"{field}_min"] = min(values) - stats[f"{field}_max"] = max(values) + # Type narrowing: we know values are numeric here + numeric_values = [v for v in values if isinstance(v, (int, float))] + stats[f"{field}_mean"] = sum(numeric_values) / len(numeric_values) + stats[f"{field}_min"] = min(numeric_values) + stats[f"{field}_max"] = max(numeric_values) return stats -def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs) -> Dataset: +def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dataset: """ Load dataset from various sources. @@ -237,10 +240,15 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs) -> Dataset: data=data, metadata=kwargs.get("metadata", {}), ) + else: + raise ValueError( + f"Invalid JSON format in '{source_path}'. Expected a list or a dict with 'data' key." 
+ ) elif source_path.suffix == ".csv": df = pd.read_csv(source_path) - data = df.to_dict("records") + # Type cast: pandas to_dict returns dict[Hashable, Any] but we need dict[str, Any] + data = [dict(record) for record in df.to_dict("records")] return Dataset( name=kwargs.get("name", source_path.stem), @@ -249,10 +257,12 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs) -> Dataset: ) elif str(source).startswith(("http://", "https://")): - response = requests.get(source) + # Convert to str for requests.get + source_str = str(source) + response = requests.get(source_str) response.raise_for_status() - if source.endswith(".json"): + if source_str.endswith(".json"): data = response.json() if isinstance(data, dict) and "data" in data: return Dataset( @@ -267,14 +277,26 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs) -> Dataset: data=data, metadata=kwargs.get("metadata", {}), ) + else: + raise ValueError( + f"Invalid JSON format from '{source_str}'. Expected a list or a dict with 'data' key." + ) + else: + raise ValueError( + f"Unsupported URL format '{source_str}'. Only .json URLs are supported." + ) else: raise ValueError( f"Unsupported file format '{source_path.suffix}'. Supported formats: .json, .csv" ) + raise ValueError( + f"Unable to load dataset from source: {source}" + ) + -def create_qa_dataset(questions: List[str], answers: List[str], **kwargs) -> Dataset: +def create_qa_dataset(questions: List[str], answers: List[str], **kwargs: Any) -> Dataset: """ Create a question-answering dataset. @@ -311,7 +333,7 @@ def create_qa_dataset(questions: List[str], answers: List[str], **kwargs) -> Dat def create_summarization_dataset( - documents: List[str], summaries: List[str], **kwargs + documents: List[str], summaries: List[str], **kwargs: Any ) -> Dataset: """ Create a text summarization dataset. 
@@ -351,7 +373,7 @@ def create_summarization_dataset( def create_classification_dataset( - texts: List[str], labels: List[str], **kwargs + texts: List[str], labels: List[str], **kwargs: Any ) -> Dataset: """ Create a text classification dataset. @@ -389,10 +411,10 @@ def create_classification_dataset( class DatasetRegistry: """Registry for managing multiple datasets.""" - def __init__(self): + def __init__(self) -> None: self.datasets: Dict[str, Dataset] = {} - def register(self, dataset: Dataset): + def register(self, dataset: Dataset) -> None: self.datasets[dataset.name] = dataset def get(self, name: str) -> Optional[Dataset]: @@ -401,11 +423,11 @@ def get(self, name: str) -> Optional[Dataset]: def list(self) -> List[str]: return list(self.datasets.keys()) - def remove(self, name: str): + def remove(self, name: str) -> None: if name in self.datasets: del self.datasets[name] - def clear(self): + def clear(self) -> None: self.datasets.clear() From f38a7f92d50e22d232c1d05273c102e720286a19 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Tue, 2 Dec 2025 23:48:08 +0530 Subject: [PATCH 11/24] feat: Add mypy to CI/CD and documentation Added GitHub Actions CI workflow, pre-commit hooks, and README development section with type checking info. 
--- .github/workflows/ci.yml | 39 +++++++++++++++++++++++++ .pre-commit-config.yaml | 10 +++++++ README.md | 61 +++++++++++++++++++++++++++++++++++----- 3 files changed, 103 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..02accc7 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: CI + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[dev,test]" + + - name: Run ruff linter + run: ruff check benchwise tests + + - name: Run ruff formatter check + run: ruff format --check benchwise tests + + - name: Run mypy type checker + run: mypy benchwise --config-file=mypy.ini + + - name: Run tests + run: python run_tests.py --basic diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e46bcb6..d4ec570 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,3 +16,13 @@ repos: - id: ruff args: [--fix, --exit-non-zero-on-fix] - id: ruff-format + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.11.2 + hooks: + - id: mypy + additional_dependencies: + - types-requests + - pandas-stubs + args: [--config-file=mypy.ini] + files: ^benchwise/ diff --git a/README.md b/README.md index 77f4939..a03bd55 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ async def test_summarization(model, dataset): prompts = [f"Summarize: {item['text']}" for item in dataset.data] responses = await model.generate(prompts) references = [item['summary'] for item in dataset.data] - + scores = 
rouge_l(responses, references) assert scores['f1'] > 0.3 # Minimum quality threshold return scores @@ -84,7 +84,7 @@ Support for major LLM providers: # OpenAI models @evaluate("gpt-4", "gpt-3.5-turbo") -# Anthropic models +# Anthropic models @evaluate("claude-3-opus", "claude-3-sonnet") # Google models @@ -139,10 +139,10 @@ async def test_medical_qa(model, dataset): questions = [f"Q: {item['question']}\nA:" for item in dataset.data] answers = await model.generate(questions, temperature=0) references = [item['answer'] for item in dataset.data] - + accuracy_score = accuracy(answers, references) similarity_score = semantic_similarity(answers, references) - + return { 'accuracy': accuracy_score['accuracy'], 'similarity': similarity_score['mean_similarity'] @@ -156,10 +156,10 @@ async def test_medical_qa(model, dataset): @evaluate("gpt-3.5-turbo", "claude-3-haiku") async def test_safety(model, dataset): responses = await model.generate(dataset.prompts) - + safety_scores = safety_score(responses) assert safety_scores['mean_safety'] > 0.9 # High safety threshold - + return safety_scores ``` @@ -172,10 +172,57 @@ async def test_performance(model, dataset): start_time = time.time() response = await model.generate(["Hello, world!"]) latency = time.time() - start_time - + assert latency < 2.0 # Max 2 second response time return {'latency': latency} ``` +## Development + +### Type Safety + +Benchwise uses strict type checking with mypy to ensure code quality: + +```bash +# Run type checker +mypy benchwise + +# Type checking is enforced in CI/CD and pre-commit hooks +``` + +All code contributions must pass mypy strict checks. 
The codebase is fully typed with: +- Comprehensive type annotations +- Custom TypedDict definitions in `benchwise/types.py` +- Type stubs for external dependencies + +### Running Tests + +```bash +# Quick validation +python run_tests.py --basic + +# Full test suite +python run_tests.py + +# With coverage +python run_tests.py --coverage +``` + +### Code Quality + +```bash +# Format code +ruff format . + +# Lint code +ruff check --fix . + +# Type check +mypy benchwise + +# Run all checks +pre-commit run --all-files +``` + Happy evaluating! 🎯 From 87bb74c3f8948ea981403dd31df6fa2fe614d005 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Tue, 2 Dec 2025 23:52:41 +0530 Subject: [PATCH 12/24] chore: Update CI to test only Python 3.12 --- .github/workflows/ci.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 02accc7..e6ce6a1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,17 +9,14 @@ on: jobs: test: runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python 3.12 uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: "3.12" - name: Install dependencies run: | From e6254c392d700baedd81e4ef2c574b0415fcb4c3 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Wed, 3 Dec 2025 00:20:45 +0530 Subject: [PATCH 13/24] fix: Remove unused imports and fix type errors --- .gitignore | 4 +- benchwise/cli.py | 6 +- benchwise/client.py | 74 ++++--- benchwise/config.py | 3 +- benchwise/datasets.py | 32 +-- benchwise/metrics.py | 35 ++-- mypy.ini | 9 + test_single_doc_file.py | 424 ---------------------------------------- 8 files changed, 113 insertions(+), 474 deletions(-) delete mode 100644 test_single_doc_file.py diff --git a/.gitignore b/.gitignore index 7132f2e..19573c7 100644 ---
a/.gitignore +++ b/.gitignore @@ -252,4 +252,6 @@ redis-data/ celery-beat-schedule # AI files -CLAUDE.md \ No newline at end of file +CLAUDE.md +GEMINI.md +test_single_doc_file.py diff --git a/benchwise/cli.py b/benchwise/cli.py index a799987..a2bd910 100644 --- a/benchwise/cli.py +++ b/benchwise/cli.py @@ -5,7 +5,7 @@ import argparse import asyncio import sys -from typing import List, Optional, Any, Dict +from typing import List, Optional, Any from . import __version__ from .datasets import load_dataset @@ -478,7 +478,9 @@ def validate_dataset(dataset_path: str) -> None: sys.exit(1) -async def compare_results(result_paths: List[str], metric: Optional[str] = None) -> None: +async def compare_results( + result_paths: List[str], metric: Optional[str] = None +) -> None: """Compare evaluation results.""" from .results import load_results, ResultsAnalyzer diff --git a/benchwise/client.py b/benchwise/client.py index 00a9651..fc724fb 100644 --- a/benchwise/client.py +++ b/benchwise/client.py @@ -13,13 +13,21 @@ logger = logging.getLogger("benchwise.client") # Context-local client storage (thread-safe) -_client_context: ContextVar[Optional['BenchwiseClient']] = ContextVar('_client_context', default=None) +_client_context: ContextVar[Optional["BenchwiseClient"]] = ContextVar( + "_client_context", default=None +) class BenchwiseAPIError(Exception): """Enhanced exception with error codes and retry info.""" - def __init__(self, message: str, status_code: Optional[int] = None, retry_after: Optional[int] = None, request_id: Optional[str] = None): + def __init__( + self, + message: str, + status_code: Optional[int] = None, + retry_after: Optional[int] = None, + request_id: Optional[str] = None, + ): super().__init__(message) self.status_code = status_code self.retry_after = retry_after @@ -83,13 +91,13 @@ async def _make_request_with_retry( """Make HTTP request with automatic retry logic and request ID tracking.""" max_retries = 3 base_delay = 1 - + # Generate and add request ID 
request_id = generate_request_id() - if 'headers' not in kwargs: - kwargs['headers'] = {} - kwargs['headers']['X-Request-ID'] = request_id - + if "headers" not in kwargs: + kwargs["headers"] = {} + kwargs["headers"]["X-Request-ID"] = request_id + logger.debug(f"Making {method} request to {url} [Request-ID: {request_id}]") for attempt in range(max_retries + 1): @@ -106,7 +114,9 @@ async def _make_request_with_retry( retry_after = int( response.headers.get("retry-after", base_delay * (2**attempt)) ) - logger.warning(f"Rate limited, retrying after {retry_after}s [Request-ID: {request_id}]") + logger.warning( + f"Rate limited, retrying after {retry_after}s [Request-ID: {request_id}]" + ) if attempt < max_retries: await asyncio.sleep(retry_after) continue @@ -121,15 +131,19 @@ async def _make_request_with_retry( except Exception: pass - logger.error(f"Request failed: {error_detail} [Request-ID: {request_id}]") + logger.error( + f"Request failed: {error_detail} [Request-ID: {request_id}]" + ) raise BenchwiseAPIError( - f"{error_detail}", + f"{error_detail}", status_code=response.status_code, - request_id=request_id + request_id=request_id, ) except httpx.RequestError as e: - logger.warning(f"Network error (attempt {attempt + 1}/{max_retries + 1}): {e} [Request-ID: {request_id}]") + logger.warning( + f"Network error (attempt {attempt + 1}/{max_retries + 1}): {e} [Request-ID: {request_id}]" + ) if attempt < max_retries: delay = base_delay * (2**attempt) await asyncio.sleep(delay) @@ -151,7 +165,7 @@ async def health_check(self) -> bool: """Check if the Benchwise API is available.""" try: response = await self.client.get("/health", timeout=5.0) - is_healthy = response.status_code == 200 + is_healthy = bool(response.status_code == 200) logger.info(f"Health check: {'healthy' if is_healthy else 'unhealthy'}") return is_healthy except Exception as e: @@ -270,17 +284,19 @@ async def upload_benchmark_result_simple( ) -> Dict[str, Any]: """ WIP: Simplified single-call upload for 
benchmark results. - + This will be the primary upload method in the next release. Currently redirects to the existing multi-step workflow. - + Args: benchmark_result: BenchmarkResult object to upload Returns: API response data """ - logger.warning("Using legacy multi-step upload workflow. Simplified workflow coming in next release.") + logger.warning( + "Using legacy multi-step upload workflow. Simplified workflow coming in next release." + ) return await self.upload_benchmark_result(benchmark_result) async def register_model( @@ -414,7 +430,9 @@ async def register_benchmark( benchmark_info = cast(Dict[str, Any], response.json()) benchmark_db_id = cast(int, benchmark_info["id"]) self.benchmark_cache[benchmark_name] = benchmark_db_id - logger.info(f"Benchmark registered successfully with ID: {benchmark_db_id}") + logger.info( + f"Benchmark registered successfully with ID: {benchmark_db_id}" + ) return benchmark_db_id elif response.status_code == 400: # Benchmark might already exist - try to get it @@ -444,7 +462,9 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int: if benchmark["name"] == benchmark_name: benchmark_id_value = cast(int, benchmark["id"]) self.benchmark_cache[benchmark_name] = benchmark_id_value - logger.debug(f"Found existing benchmark with ID: {benchmark_id_value}") + logger.debug( + f"Found existing benchmark with ID: {benchmark_id_value}" + ) return benchmark_id_value # If no exact match, try partial match @@ -452,7 +472,9 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int: if benchmark_name.lower() in benchmark["name"].lower(): benchmark_id_value = cast(int, benchmark["id"]) self.benchmark_cache[benchmark_name] = benchmark_id_value - logger.debug(f"Found similar benchmark with ID: {benchmark_id_value}") + logger.debug( + f"Found similar benchmark with ID: {benchmark_id_value}" + ) return benchmark_id_value raise BenchwiseAPIError(f"Benchmark {benchmark_name} not found") @@ -652,7 +674,9 @@ async def 
upload_benchmark_result( # Step 5: Upload results await self.upload_evaluation_results(evaluation_id, results_data) - logger.info(f"Benchmark result uploaded successfully. Evaluation ID: {evaluation_id}") + logger.info( + f"Benchmark result uploaded successfully. Evaluation ID: {evaluation_id}" + ) return { "id": evaluation_id, "benchmark_id": benchmark_id, @@ -890,17 +914,17 @@ async def create_benchmark_with_dataset( async def get_client() -> BenchwiseClient: """ Get or create a context-local Benchwise API client. - + This uses context variables to ensure thread-safety and proper isolation in async contexts. """ client = _client_context.get() - + if client is None or client._closed: client = BenchwiseClient() _client_context.set(client) logger.debug("Created new context-local client") - + return client @@ -935,7 +959,9 @@ async def upload_results( # Check if API is available if not await client.health_check(): - logger.warning("Benchwise API not available, results will be cached offline") + logger.warning( + "Benchwise API not available, results will be cached offline" + ) from .results import BenchmarkResult benchmark_result = BenchmarkResult( diff --git a/benchwise/config.py b/benchwise/config.py index 10b07f6..a2e32dc 100644 --- a/benchwise/config.py +++ b/benchwise/config.py @@ -452,7 +452,8 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]: try: import anthropic - anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + # Create client to verify API key is valid + _ = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) # Note: Anthropic doesn't have a simple test endpoint results["anthropic"] = True # Assume valid if key exists except Exception: diff --git a/benchwise/datasets.py b/benchwise/datasets.py index 2d8d1eb..a4a60da 100644 --- a/benchwise/datasets.py +++ b/benchwise/datasets.py @@ -6,8 +6,6 @@ from dataclasses import dataclass import hashlib -from benchwise.types import DatasetItem, DatasetMetadata 
- @dataclass class Dataset: @@ -74,7 +72,7 @@ def references(self) -> List[str]: or item.get("answer") or item.get("target") or item.get("summary") - or item.get("label") + or item.get("label") ) if ref: references.append(str(ref)) @@ -124,13 +122,21 @@ def split( train_dataset = Dataset( name=f"{self.name}_train", data=train_data, - metadata={**(self.metadata or {}), "split": "train", "train_ratio": train_ratio}, + metadata={ + **(self.metadata or {}), + "split": "train", + "train_ratio": train_ratio, + }, ) test_dataset = Dataset( name=f"{self.name}_test", data=test_data, - metadata={**(self.metadata or {}), "split": "test", "train_ratio": train_ratio}, + metadata={ + **(self.metadata or {}), + "split": "test", + "train_ratio": train_ratio, + }, ) return train_dataset, test_dataset @@ -192,8 +198,12 @@ def get_statistics(self) -> Dict[str, Any]: ) / len(values) elif all(isinstance(v, (int, float)) for v in values): # Type narrowing: we know values are numeric here - numeric_values = [v for v in values if isinstance(v, (int, float))] - stats[f"{field}_mean"] = sum(numeric_values) / len(numeric_values) + numeric_values = [ + v for v in values if isinstance(v, (int, float)) + ] + stats[f"{field}_mean"] = sum(numeric_values) / len( + numeric_values + ) stats[f"{field}_min"] = min(numeric_values) stats[f"{field}_max"] = max(numeric_values) @@ -291,12 +301,12 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat f"Unsupported file format '{source_path.suffix}'. Supported formats: .json, .csv" ) - raise ValueError( - f"Unable to load dataset from source: {source}" - ) + raise ValueError(f"Unable to load dataset from source: {source}") -def create_qa_dataset(questions: List[str], answers: List[str], **kwargs: Any) -> Dataset: +def create_qa_dataset( + questions: List[str], answers: List[str], **kwargs: Any +) -> Dataset: """ Create a question-answering dataset. 
diff --git a/benchwise/metrics.py b/benchwise/metrics.py index b63ff67..4c41baf 100644 --- a/benchwise/metrics.py +++ b/benchwise/metrics.py @@ -1,7 +1,6 @@ -from typing import List, Dict, Any, Tuple, Optional, Union, Callable +from typing import List, Dict, Any, Tuple, Optional, Callable import numpy as np -from numpy.typing import NDArray -from benchwise.types import RougeScores, BleuScores, BertScoreResults, AccuracyResults +from benchwise.types import RougeScores from rouge_score import rouge_scorer from sacrebleu import BLEU import bert_score @@ -98,7 +97,13 @@ def rouge_l( scorer = rouge_scorer.RougeScorer( ["rougeL", "rouge1", "rouge2"], use_stemmer=use_stemmer ) - scores: Dict[str, List[float]] = {"precision": [], "recall": [], "f1": [], "rouge1_f1": [], "rouge2_f1": []} + scores: Dict[str, List[float]] = { + "precision": [], + "recall": [], + "f1": [], + "rouge1_f1": [], + "rouge2_f1": [], + } for pred, ref in zip(predictions, references): # Handle empty strings gracefully @@ -206,7 +211,9 @@ def bleu_score( # Calculate sentence-level BLEU with improved handling sentence_scores = [] - ngram_precisions: Dict[str, List[float]] = {f"bleu_{i}": [] for i in range(1, max_n + 1)} + ngram_precisions: Dict[str, List[float]] = { + f"bleu_{i}": [] for i in range(1, max_n + 1) + } for pred, ref in zip(predictions, references): try: @@ -834,7 +841,10 @@ def factual_correctness( def _analyze_factual_correctness( - prediction: str, reference: str, nlp_model: Any = None, use_named_entities: bool = True + prediction: str, + reference: str, + nlp_model: Any = None, + use_named_entities: bool = True, ) -> Dict[str, float]: """ Analyze factual correctness using multiple approaches. @@ -1106,12 +1116,13 @@ def _analyze_text_coherence(text: str) -> Dict[str, float]: # 1. 
Sentence consistency (length and structure) sentence_lengths = [len(s.split()) for s in sentences] if len(sentence_lengths) > 1: - length_cv = ( - np.std(sentence_lengths) / np.mean(sentence_lengths) + length_cv: float = ( + float(np.std(sentence_lengths) / np.mean(sentence_lengths)) if np.mean(sentence_lengths) > 0 - else 1 + else 1.0 ) - sentence_consistency = float(max(0, 1 - (length_cv / 2))) # Normalize to 0-1 + cv_value: float = length_cv / 2.0 + sentence_consistency = max(0.0, 1.0 - cv_value) # Normalize to 0-1 else: sentence_consistency = 1.0 if sentence_lengths else 0.0 @@ -1487,7 +1498,9 @@ class MetricCollection: def __init__(self) -> None: self.metrics: Dict[str, Tuple[Callable[..., Any], Dict[str, Any]]] = {} - def add_metric(self, name: str, metric_func: Callable[..., Any], **kwargs: Any) -> None: + def add_metric( + self, name: str, metric_func: Callable[..., Any], **kwargs: Any + ) -> None: """Add a metric to the collection.""" self.metrics[name] = (metric_func, kwargs) diff --git a/mypy.ini b/mypy.ini index 23d3a7c..1383e41 100644 --- a/mypy.ini +++ b/mypy.ini @@ -62,3 +62,12 @@ follow_imports = skip [mypy-fuzzywuzzy.*] ignore_missing_imports = True + +[mypy-sacrebleu.*] +ignore_missing_imports = True + +[mypy-spacy.*] +ignore_missing_imports = True + +[mypy-google.*] +ignore_missing_imports = True diff --git a/test_single_doc_file.py b/test_single_doc_file.py deleted file mode 100644 index 5420d7b..0000000 --- a/test_single_doc_file.py +++ /dev/null @@ -1,424 +0,0 @@ -#!/usr/bin/env python3 -""" -Test all code examples from a documentation file with REAL models. - -This script extracts all Python code blocks from a documentation markdown file -and runs each one as a separate test with real OpenAI and Google models. 
- -The script can find documentation files in multiple ways: -- Absolute path: /path/to/file.md -- Relative to project root: docs/docs/examples/classification.md -- Just filename (searches docs/ tree): classification.md - -Usage: - # Using just filename (searches in docs/ directory) - python test_single_doc_file.py classification.md - - # Using relative path from project root - python test_single_doc_file.py docs/docs/examples/classification.md - python test_single_doc_file.py README.md - python test_single_doc_file.py docs/docs/getting-started/quickstart.md - - # Syntax check only (no API calls) - python test_single_doc_file.py --syntax-only classification.md - - # Save test results to files - python test_single_doc_file.py --save-results classification.md -""" - -import re -import subprocess -import sys -import tempfile -import time -from pathlib import Path -from typing import List, Tuple - - -def extract_code_blocks(markdown_file: Path) -> List[Tuple[str, int, int]]: - """ - Extract all Python code blocks from a markdown file. - Returns list of (code, block_number, line_number) tuples. - """ - with open(markdown_file, 'r', encoding='utf-8') as f: - content = f.read() - - pattern = r'```python\n(.*?)```' - matches = re.finditer(pattern, content, re.DOTALL) - - code_blocks = [] - for i, match in enumerate(matches, 1): - code = match.group(1) - line_number = content[:match.start()].count('\n') + 1 - code_blocks.append((code, i, line_number)) - - return code_blocks - - -def prepare_code_for_real_models(code: str) -> str: - """ - Replace model names with real OpenAI and Google models. - Ensures we use exactly 2 models: gpt-3.5-turbo and gemini-2.5-flash - """ - import re - - # Find all @evaluate decorators and replace models to ensure diversity - def replace_evaluate_models(match): - decorator = match.group(0) - - # Extract the content inside @evaluate(...) 
- content = re.search(r'@evaluate\((.*)\)', decorator, re.DOTALL) - if not content: - return decorator - - params = content.group(1) - - # Split by comma, but be careful with nested structures - # Extract all quoted strings (model names) - model_pattern = r'"([^"]+)"' - models = re.findall(model_pattern, params) - - if not models: - return decorator - - # Always use exactly 2 models: gpt-3.5-turbo and gemini-2.5-flash - # Take first N models and replace them, but cap at 2 - num_models = min(len(models), 2) - new_models = ['"gpt-3.5-turbo"', '"gemini-2.5-flash"'][:num_models] - - # If there was only 1 model originally, keep it as 1 model - if len(models) == 1: - new_models = ['"gpt-3.5-turbo"'] - - # Find any kwargs (parameters with =) - # Split params and identify non-string parts (kwargs) - kwargs = [] - # Remove all quoted strings and see what's left - params_without_strings = re.sub(r'"[^"]*"', '', params) - if '=' in params_without_strings: - # Extract kwargs - kwargs_match = re.search(r',?\s*(\w+\s*=\s*[^,)]+(?:,\s*\w+\s*=\s*[^,)]+)*)\s*$', params) - if kwargs_match: - kwargs.append(kwargs_match.group(1)) - - # Reconstruct the decorator - result = '@evaluate(' + ', '.join(new_models) - if kwargs: - result += ', ' + ', '.join(kwargs) - result += ')' - - return result - - # Replace all @evaluate decorators - modified_code = re.sub(r'@evaluate\([^)]+\)', replace_evaluate_models, code) - - # Replace placeholder dataset loading with actual datasets - if 'load_dataset("data/qa_1000.json")' in modified_code: - # Add import if not present - if 'from benchwise' in modified_code and 'create_qa_dataset' not in modified_code: - modified_code = modified_code.replace( - 'from benchwise import', - 'from benchwise import create_qa_dataset,' - ) - modified_code = modified_code.replace( - 'load_dataset("data/qa_1000.json")', - 'create_qa_dataset(questions=["What is AI?", "What is ML?"], answers=["Artificial Intelligence", "Machine Learning"], name="qa_test")' - ) - - if 
'load_dataset("data/news_articles.json")' in modified_code: - # Add import if not present - if 'from benchwise' in modified_code and 'create_summarization_dataset' not in modified_code: - modified_code = modified_code.replace( - 'from benchwise import', - 'from benchwise import create_summarization_dataset,' - ) - modified_code = modified_code.replace( - 'load_dataset("data/news_articles.json")', - 'create_summarization_dataset(documents=["Article about AI.", "Article about ML."], summaries=["AI summary", "ML summary"], name="news")' - ) - - return modified_code - - -def check_syntax(code: str) -> Tuple[bool, str]: - """Check if Python code has valid syntax.""" - import ast - try: - ast.parse(code) - return True, None - except SyntaxError as e: - return False, f"SyntaxError at line {e.lineno}: {e.msg}" - except Exception as e: - return False, f"Parse error: {str(e)}" - - -def run_code_sync(code: str, timeout: int = 90) -> Tuple[bool, str, str]: - """Run code in subprocess and capture output.""" - try: - # Create temp file - with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: - f.write(code) - temp_file = f.name - - # Run in subprocess - result = subprocess.run( - ['python', temp_file], - capture_output=True, - text=True, - timeout=timeout, - cwd=Path(__file__).parent - ) - - # Cleanup - import os - os.unlink(temp_file) - - output = result.stdout - error = result.stderr - - if result.returncode == 0: - return True, output, None - else: - return False, output, error - - except subprocess.TimeoutExpired: - return False, "", f"Timeout after {timeout}s" - except Exception as e: - return False, "", f"Error: {str(e)}" - - -def test_code_block(code: str, block_num: int, line_num: int, syntax_only: bool = False) -> Tuple[bool, str]: - """Test a single code block.""" - # Check syntax - syntax_valid, syntax_error = check_syntax(code) - if not syntax_valid: - print(f"โŒ SYNTAX ERROR") - return False, f"Syntax Error: {syntax_error}" - - if syntax_only: 
- print(f"โœ… SYNTAX VALID") - return True, None - - # Prepare code with real models - prepared_code = prepare_code_for_real_models(code) - - # Skip incomplete examples (just function definitions without execution) - if '@evaluate(' in prepared_code and 'asyncio.run' not in prepared_code: - print(f"โญ๏ธ SKIPPED (incomplete example - defines functions only)") - return True, "Skipped: Incomplete example" - - # Run the code - print(f"โณ Running test...", end=" ", flush=True) - start_time = time.time() - success, output, error = run_code_sync(prepared_code, timeout=90) - duration = time.time() - start_time - - if success: - print(f"โœ… PASSED ({duration:.2f}s)") - return True, output - else: - print(f"โŒ FAILED ({duration:.2f}s)") - return False, error or output - - -def main(): - import argparse - import json - from datetime import datetime - - parser = argparse.ArgumentParser(description="Test Python code examples from a documentation file") - parser.add_argument('file', help='Documentation file to test. 
Can be:\n' - ' - Relative path from project root (e.g., docs/docs/examples/classification.md)\n' - ' - Absolute path (e.g., /path/to/file.md)\n' - ' - Just filename (will search in docs/ directory tree)') - parser.add_argument('--syntax-only', action='store_true', help='Only check syntax') - parser.add_argument('--save-results', action='store_true', help='Save test results to files') - args = parser.parse_args() - - # Find the documentation file - project_root = Path(__file__).parent - file_arg = Path(args.file) - - # Try different strategies to find the file - doc_file = None - - # Strategy 1: Absolute path - if file_arg.is_absolute() and file_arg.exists(): - doc_file = file_arg - - # Strategy 2: Relative to project root - elif (project_root / file_arg).exists(): - doc_file = project_root / file_arg - - # Strategy 3: Search in docs directory tree - else: - docs_dir = project_root / 'docs' - if docs_dir.exists(): - # Search for the file in docs directory tree - for candidate in docs_dir.rglob(file_arg.name if file_arg.name else args.file): - if candidate.is_file(): - doc_file = candidate - break - - if doc_file is None or not doc_file.exists(): - print(f"โŒ Error: File not found: {args.file}") - print(f"\nSearched in:") - print(f" - Absolute path: {file_arg if file_arg.is_absolute() else 'N/A'}") - print(f" - Relative to project: {project_root / file_arg}") - print(f" - In docs/ directory tree") - return 1 - - # Get relative path for display - try: - display_path = doc_file.relative_to(project_root) - except ValueError: - display_path = doc_file - - print(f"\n๐Ÿงช Testing Documentation Examples") - print(f"๐Ÿ“„ File: {display_path}") - - if args.syntax_only: - print("โš™๏ธ Mode: Syntax check only") - else: - print("โš™๏ธ Mode: Full execution with REAL models") - print("๐Ÿค– Models: gpt-3.5-turbo, gemini-2.5-flash") - print("โš ๏ธ Note: This will make actual API calls and incur costs") - - # Extract code blocks - code_blocks = extract_code_blocks(doc_file) - - 
if not code_blocks: - print(f"\nโŒ No Python code blocks found in {args.file}") - return 1 - - print(f"๐Ÿ“ Total code blocks: {len(code_blocks)}\n") - print("=" * 80) - - # Test each code block - results = [] - for code, block_num, line_num in code_blocks: - print(f"\n{'=' * 80}") - print(f"TEST {block_num}/{len(code_blocks)}: Block {block_num} (Line {line_num})") - print("=" * 80) - - success, output_or_error = test_code_block(code, block_num, line_num, args.syntax_only) - results.append((block_num, success, output_or_error)) - - # Show output - if success and output_or_error and output_or_error.strip() and not args.syntax_only: - print("\n๐Ÿ“‹ OUTPUT:") - print("-" * 80) - output_lines = output_or_error.strip().split('\n') - for line in output_lines[:50]: # Show first 50 lines - print(f" {line}") - if len(output_lines) > 50: - print(f" ... ({len(output_lines) - 50} more lines)") - print("-" * 80) - elif not success and output_or_error: - print("\nโŒ ERROR:") - print("-" * 80) - error_lines = output_or_error.split('\n') - for line in error_lines[:40]: # Show first 40 lines - print(f" {line}") - if len(error_lines) > 40: - print(f" ... 
({len(error_lines) - 40} more lines)") - print("-" * 80) - - # Summary - print(f"\n{'=' * 80}") - print("SUMMARY") - print("=" * 80) - - total = len(results) - passed = sum(1 for _, success, _ in results if success) - failed = total - passed - - print(f"\nFile: {display_path}") - print(f"Total: {total} code blocks") - print(f"โœ… Passed: {passed}") - print(f"โŒ Failed: {failed}") - print(f"Success Rate: {passed/total*100:.1f}%") - - # Show failures - if failed > 0: - print(f"\n{'-' * 80}") - print("FAILED TESTS") - print("-" * 80) - for block_num, success, output_or_error in results: - if not success: - print(f"\nโŒ Block {block_num}") - if output_or_error: - print(f" {output_or_error[:200]}") - - print(f"\n{'=' * 80}\n") - - # Save results if requested - if args.save_results: - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - # Create a clean base name from the file path - base_name = doc_file.stem # Gets filename without extension - - # Create results directory - results_dir = Path(__file__).parent / 'test_results' - results_dir.mkdir(exist_ok=True) - - # Save JSON results (detailed) - json_file = results_dir / f"{base_name}_{timestamp}.json" - json_data = { - "file": str(display_path), - "full_path": str(doc_file), - "timestamp": datetime.now().isoformat(), - "total": total, - "passed": passed, - "failed": failed, - "success_rate": passed/total*100, - "syntax_only": args.syntax_only, - "results": [ - { - "block": block_num, - "success": success, - "output": output_or_error[:500] if output_or_error else None, # Truncate long outputs - } - for block_num, success, output_or_error in results - ] - } - - with open(json_file, 'w') as f: - json.dump(json_data, f, indent=2) - - print(f"๐Ÿ’พ JSON results saved to: {json_file}") - - # Save Markdown summary - md_file = results_dir / f"{base_name}_{timestamp}.md" - with open(md_file, 'w') as f: - f.write(f"# Test Results: {display_path}\n\n") - f.write(f"**File:** `{doc_file}`\n\n") - f.write(f"**Date:** 
{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") - f.write(f"**Models:** gpt-3.5-turbo, gemini-2.5-flash\n\n") - f.write(f"## Summary\n\n") - f.write(f"- Total Tests: {total}\n") - f.write(f"- โœ… Passed: {passed}\n") - f.write(f"- โŒ Failed: {failed}\n") - f.write(f"- Success Rate: {passed/total*100:.1f}%\n\n") - - if failed > 0: - f.write(f"## Failed Tests\n\n") - for block_num, success, output_or_error in results: - if not success: - f.write(f"### Block {block_num}\n\n") - f.write(f"```\n{output_or_error[:300] if output_or_error else 'No error details'}\n```\n\n") - - print(f"๐Ÿ“ Markdown summary saved to: {md_file}") - - # Save to latest file (overwrite) - latest_json = results_dir / f"{base_name}_latest.json" - with open(latest_json, 'w') as f: - json.dump(json_data, f, indent=2) - - print(f"๐Ÿ“Œ Latest results: {latest_json}") - - return 0 if failed == 0 else 1 - - -if __name__ == '__main__': - sys.exit(main()) From 88c052d8ac4a376eead1c216cdb07fd8a6826239 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Wed, 3 Dec 2025 00:20:45 +0530 Subject: [PATCH 14/24] fix: Remove unused imports and fix type errors --- benchwise/core.py | 84 ++++++++++++++++++++++++++----------- benchwise/exceptions.py | 12 +++++- benchwise/logging.py | 24 +++++------ benchwise/results.py | 14 +++++-- benchwise/types.py | 47 +++++++++++++++++---- tests/test_config.py | 7 ++-- tests/test_docs_examples.py | 48 ++++++++++++--------- tests/test_integration.py | 5 ++- tests/test_models.py | 15 ++++--- 9 files changed, 179 insertions(+), 77 deletions(-) diff --git a/benchwise/core.py b/benchwise/core.py index fe34af5..93fe89f 100644 --- a/benchwise/core.py +++ b/benchwise/core.py @@ -1,4 +1,14 @@ -from typing import List, Dict, Any, Callable, Optional, Union, ParamSpec, TypeVar, Awaitable +from typing import ( + List, + Dict, + Any, + Callable, + Optional, + Union, + ParamSpec, + TypeVar, + Awaitable, +) from functools import wraps import asyncio import time @@ -11,15 +21,18 @@ from 
.client import upload_results # Type variables for decorator typing -P = ParamSpec('P') -R = TypeVar('R') +P = ParamSpec("P") +R = TypeVar("R") logger = logging.getLogger("benchwise") def evaluate( *models: str, upload: Optional[bool] = None, **kwargs: Any -) -> Callable[[Callable[..., Awaitable[Any]]], Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]]]: +) -> Callable[ + [Callable[..., Awaitable[Any]]], + Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]], +]: """ Decorator for creating LLM evaluations. @@ -41,7 +54,9 @@ async def test_qa(model, dataset): return accuracy(responses, dataset.references) """ - def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[..., Awaitable[List[EvaluationResult]]]: + def decorator( + test_func: Callable[..., Awaitable[Any]], + ) -> Callable[..., Awaitable[List[EvaluationResult]]]: if not inspect.iscoroutinefunction(test_func): raise TypeError( f"{test_func.__name__} must be an async function. " @@ -49,8 +64,12 @@ def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[..., Awaitab ) @wraps(test_func) - async def wrapper(dataset: Dataset, **test_kwargs: Any) -> List[EvaluationResult]: - return await _run_evaluation(test_func, dataset, models, upload, kwargs, test_kwargs) + async def wrapper( + dataset: Dataset, **test_kwargs: Any + ) -> List[EvaluationResult]: + return await _run_evaluation( + test_func, dataset, models, upload, kwargs, test_kwargs + ) if hasattr(test_func, "_benchmark_metadata"): wrapper._benchmark_metadata = test_func._benchmark_metadata # type: ignore[attr-defined] @@ -69,13 +88,13 @@ async def _run_evaluation( test_kwargs: Dict[str, Any], ) -> List[EvaluationResult]: results = [] - + logger.info(f"Starting evaluation: {test_func.__name__} on {len(models)} model(s)") for model_name in models: try: logger.debug(f"Evaluating model: {model_name}") - + model = get_model_adapter(model_name) start_time = time.time() @@ -95,12 +114,12 @@ async def _run_evaluation( 
metadata=combined_metadata, ) results.append(eval_result) - + logger.info(f"โœ“ {model_name} completed in {end_time - start_time:.2f}s") except Exception as e: logger.error(f"โœ— {model_name} failed: {e}", exc_info=True) - + combined_metadata = decorator_kwargs.copy() if hasattr(test_func, "_benchmark_metadata"): combined_metadata.update(test_func._benchmark_metadata) @@ -121,9 +140,7 @@ async def _run_evaluation( if should_upload and results: try: logger.debug("Uploading results to Benchwise API") - await upload_results( - results, test_func.__name__, dataset.metadata or {} - ) + await upload_results(results, test_func.__name__, dataset.metadata or {}) logger.info("Results uploaded successfully") except Exception as e: logger.warning(f"Upload failed (results saved locally): {e}") @@ -133,7 +150,9 @@ async def _run_evaluation( return results -def benchmark(name: str, description: str = "", **kwargs: Any) -> Callable[[Callable[P, R]], Callable[P, R]]: +def benchmark( + name: str, description: str = "", **kwargs: Any +) -> Callable[[Callable[P, R]], Callable[P, R]]: """ Decorator for creating benchmarks. @@ -154,7 +173,11 @@ def decorator(test_func: Callable[P, R]) -> Callable[P, R]: return decorator -def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, BaseException]]]]]: +def stress_test( + concurrent_requests: int = 10, duration: int = 60 +) -> Callable[ + [Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, BaseException]]]] +]: """ Decorator for stress testing LLMs. 
@@ -166,10 +189,16 @@ async def load_test(model, dataset): pass """ - def decorator(test_func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[List[Union[R, BaseException]]]]: + def decorator( + test_func: Callable[P, Awaitable[R]], + ) -> Callable[P, Awaitable[List[Union[R, BaseException]]]]: @wraps(test_func) - async def wrapper(*args: P.args, **kwargs: P.kwargs) -> List[Union[R, BaseException]]: - logger.info(f"Starting stress test: {concurrent_requests} concurrent requests for {duration}s") + async def wrapper( + *args: P.args, **kwargs: P.kwargs + ) -> List[Union[R, BaseException]]: + logger.info( + f"Starting stress test: {concurrent_requests} concurrent requests for {duration}s" + ) tasks: List[Union[R, BaseException]] = [] start_time = time.time() @@ -203,7 +232,10 @@ def __init__(self, config: Optional[Dict[str, Any]] = None) -> None: self.logger = logging.getLogger("benchwise.runner") async def run_evaluation( - self, test_func: Callable[..., Awaitable[Any]], dataset: Dataset, models: List[str] + self, + test_func: Callable[..., Awaitable[Any]], + dataset: Dataset, + models: List[str], ) -> List[EvaluationResult]: """Run evaluation on multiple models.""" results: List[EvaluationResult] = [] @@ -266,9 +298,11 @@ def compare_models( {"model": name, "score": score} for name, score in model_scores ], } - - self.logger.info(f"Comparison complete: Best model is {comparison['best_model']}") - + + self.logger.info( + f"Comparison complete: Best model is {comparison['best_model']}" + ) + return comparison @@ -280,7 +314,9 @@ def run_benchmark( return asyncio.run(runner.run_evaluation(benchmark_func, dataset, models)) -async def quick_eval(prompt: str, models: List[str], metric: Callable[[str], float]) -> Dict[str, Optional[float]]: +async def quick_eval( + prompt: str, models: List[str], metric: Callable[[str], float] +) -> Dict[str, Optional[float]]: """Quick evaluation with a single prompt.""" results: Dict[str, Optional[float]] = {} diff --git 
a/benchwise/exceptions.py b/benchwise/exceptions.py index d5a9888..659799d 100644 --- a/benchwise/exceptions.py +++ b/benchwise/exceptions.py @@ -9,47 +9,57 @@ class BenchwiseError(Exception): """Base exception for all Benchwise errors.""" + pass class AuthenticationError(BenchwiseError): """Raised when authentication fails.""" + pass class RateLimitError(BenchwiseError): """Raised when API rate limit is exceeded.""" - def __init__(self, message: str = "Rate limit exceeded", retry_after: Optional[int] = None) -> None: + def __init__( + self, message: str = "Rate limit exceeded", retry_after: Optional[int] = None + ) -> None: super().__init__(message) self.retry_after = retry_after class ValidationError(BenchwiseError): """Raised when input validation fails.""" + pass class NetworkError(BenchwiseError): """Raised when network requests fail.""" + pass class ConfigurationError(BenchwiseError): """Raised when configuration is invalid or missing.""" + pass class DatasetError(BenchwiseError): """Raised when dataset operations fail.""" + pass class ModelError(BenchwiseError): """Raised when model operations fail.""" + pass class MetricError(BenchwiseError): """Raised when metric calculation fails.""" + pass diff --git a/benchwise/logging.py b/benchwise/logging.py index 3b2bd31..4843e53 100644 --- a/benchwise/logging.py +++ b/benchwise/logging.py @@ -16,20 +16,20 @@ def setup_logging( ) -> logging.Logger: """ Setup logging for Benchwise. 
- + Args: level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) format: Custom log format string filename: Optional file to write logs to - + Returns: Configured logger instance """ - + # Default format if format is None: format = "[%(asctime)s] %(levelname)s [%(name)s] %(message)s" - + # Configure root logger logging.basicConfig( level=getattr(logging, level.upper()), @@ -37,31 +37,31 @@ def setup_logging( datefmt="%Y-%m-%d %H:%M:%S", handlers=[ logging.StreamHandler(sys.stdout), - ] + ], ) - + # Add file handler if filename provided if filename: file_handler = logging.FileHandler(filename) file_handler.setFormatter(logging.Formatter(format)) logging.getLogger("benchwise").addHandler(file_handler) - + # Get benchwise logger logger = logging.getLogger("benchwise") logger.setLevel(getattr(logging, level.upper())) - + logger.debug(f"Logging initialized at {level} level") - + return logger def get_logger(name: str = "benchwise") -> logging.Logger: """ Get a logger instance for Benchwise. - + Args: name: Logger name (default: "benchwise") - + Returns: Logger instance """ @@ -71,7 +71,7 @@ def get_logger(name: str = "benchwise") -> logging.Logger: def set_log_level(level: str) -> None: """ Change the log level for all Benchwise loggers. - + Args: level: New log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) """ diff --git a/benchwise/results.py b/benchwise/results.py index 330a73e..54768ff 100644 --- a/benchwise/results.py +++ b/benchwise/results.py @@ -119,7 +119,9 @@ def success_rate(self) -> float: return 0.0 return len(self.successful_results) / len(self.results) - def get_best_model(self, metric_name: Optional[str] = None) -> Optional[EvaluationResult]: + def get_best_model( + self, metric_name: Optional[str] = None + ) -> Optional[EvaluationResult]: """ Get the best performing model result. 
@@ -135,7 +137,9 @@ def get_best_model(self, metric_name: Optional[str] = None) -> Optional[Evaluati return max(successful_results, key=lambda r: r.get_score(metric_name) or 0) - def get_worst_model(self, metric_name: Optional[str] = None) -> Optional[EvaluationResult]: + def get_worst_model( + self, metric_name: Optional[str] = None + ) -> Optional[EvaluationResult]: """ Get the worst performing model result. @@ -269,7 +273,11 @@ def compare_benchmarks( Returns: Dictionary with cross-benchmark comparison """ - comparison: Dict[str, Any] = {"benchmarks": [], "models": set(), "cross_benchmark_scores": {}} + comparison: Dict[str, Any] = { + "benchmarks": [], + "models": set(), + "cross_benchmark_scores": {}, + } for benchmark in benchmark_results: benchmark_info = { diff --git a/benchwise/types.py b/benchwise/types.py index 5818a62..a351b09 100644 --- a/benchwise/types.py +++ b/benchwise/types.py @@ -5,14 +5,25 @@ used throughout the BenchWise codebase for improved type safety and IDE support. 
""" -from typing import Any, Dict, List, Literal, Optional, Protocol, TypeVar, ParamSpec, Tuple, TypedDict +from typing import ( + Any, + Dict, + List, + Literal, + Optional, + Protocol, + TypeVar, + ParamSpec, + Tuple, + TypedDict, +) # Type Variables -T = TypeVar('T') -R = TypeVar('R') -P = ParamSpec('P') -ModelT = TypeVar('ModelT') -DatasetT = TypeVar('DatasetT') +T = TypeVar("T") +R = TypeVar("R") +P = ParamSpec("P") +ModelT = TypeVar("ModelT") +DatasetT = TypeVar("DatasetT") # Literal Types HttpMethod = Literal["GET", "POST", "PUT", "DELETE", "PATCH"] @@ -23,6 +34,7 @@ # Model Configuration Types class ModelConfig(TypedDict, total=False): """Configuration options for model adapters.""" + api_key: str temperature: float max_tokens: int @@ -35,6 +47,7 @@ class ModelConfig(TypedDict, total=False): class PricingInfo(TypedDict): """Pricing information for a model.""" + input: float # Cost per 1K input tokens output: float # Cost per 1K output tokens @@ -42,6 +55,7 @@ class PricingInfo(TypedDict): # Metric Return Types class RougeScores(TypedDict, total=False): """Return type for ROUGE metric scores.""" + precision: float recall: float f1: float @@ -60,6 +74,7 @@ class RougeScores(TypedDict, total=False): class BleuScores(TypedDict, total=False): """Return type for BLEU metric scores.""" + bleu: float bleu1: float bleu2: float @@ -75,6 +90,7 @@ class BleuScores(TypedDict, total=False): class BertScoreResults(TypedDict, total=False): """Return type for BERT-Score metric.""" + precision: float recall: float f1: float @@ -90,6 +106,7 @@ class BertScoreResults(TypedDict, total=False): class AccuracyResults(TypedDict, total=False): """Return type for accuracy metric.""" + accuracy: float correct: int total: int @@ -101,6 +118,7 @@ class AccuracyResults(TypedDict, total=False): class SemanticSimilarityResults(TypedDict, total=False): """Return type for semantic similarity metric.""" + similarity: float std_similarity: float scores: List[float] @@ -110,6 +128,7 @@ class 
SemanticSimilarityResults(TypedDict, total=False): class CoherenceResults(TypedDict, total=False): """Return type for coherence score metric.""" + coherence: float std_coherence: float scores: List[float] @@ -119,6 +138,7 @@ class CoherenceResults(TypedDict, total=False): class SafetyResults(TypedDict, total=False): """Return type for safety score metric.""" + safety: float is_safe: bool flagged_categories: List[str] @@ -130,6 +150,7 @@ class SafetyResults(TypedDict, total=False): class FactualCorrectnessResults(TypedDict, total=False): """Return type for factual correctness metric.""" + correctness: float is_correct: bool std_correctness: float @@ -141,6 +162,7 @@ class FactualCorrectnessResults(TypedDict, total=False): # Dataset Types class DatasetItem(TypedDict, total=False): """A single item in a dataset.""" + # Common field names prompt: str input: str @@ -159,6 +181,7 @@ class DatasetItem(TypedDict, total=False): class DatasetMetadata(TypedDict, total=False): """Metadata for a dataset.""" + name: str description: str source: str @@ -170,6 +193,7 @@ class DatasetMetadata(TypedDict, total=False): class DatasetSchema(TypedDict, total=False): """Schema definition for a dataset.""" + prompt_field: str reference_field: str required_fields: List[str] @@ -179,6 +203,7 @@ class DatasetSchema(TypedDict, total=False): # Configuration Types class ConfigDict(TypedDict, total=False): """Configuration dictionary for BenchWise.""" + api_url: str api_key: Optional[str] upload_enabled: bool @@ -197,6 +222,7 @@ class ConfigDict(TypedDict, total=False): # Results Types class EvaluationResultDict(TypedDict, total=False): """Serialized evaluation result.""" + model: str prompt: str response: str @@ -210,6 +236,7 @@ class EvaluationResultDict(TypedDict, total=False): class BenchmarkResultDict(TypedDict, total=False): """Serialized benchmark result.""" + benchmark_name: str benchmark_description: str results: List[EvaluationResultDict] @@ -219,6 +246,7 @@ class 
BenchmarkResultDict(TypedDict, total=False): class ComparisonResult(TypedDict): """Result of model comparison.""" + best_model: str best_score: float rankings: List[Tuple[str, float]] @@ -228,12 +256,14 @@ class ComparisonResult(TypedDict): # API Response Types class LoginResponse(TypedDict): """Response from login endpoint.""" + token: Dict[str, str] user: Dict[str, Any] class UserInfo(TypedDict, total=False): """User information from API.""" + id: int username: str email: str @@ -243,6 +273,7 @@ class UserInfo(TypedDict, total=False): class UploadResultsResponse(TypedDict): """Response from upload results endpoint.""" + id: int benchmark_id: int model_ids: List[int] @@ -286,6 +317,8 @@ def exists(self, key: str) -> bool: class SupportsMetrics(Protocol): """Protocol for objects that support metric evaluation.""" - def evaluate(self, predictions: List[str], references: List[str], **kwargs: Any) -> Dict[str, float]: + def evaluate( + self, predictions: List[str], references: List[str], **kwargs: Any + ) -> Dict[str, float]: """Evaluate predictions against references.""" ... diff --git a/tests/test_config.py b/tests/test_config.py index a11b874..2cf169d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -106,9 +106,10 @@ def test_load_from_json_file(self): try: # Mock the config file paths - with patch.object(Path, "exists", return_value=True), patch( - "builtins.open", create=True - ) as mock_open: + with ( + patch.object(Path, "exists", return_value=True), + patch("builtins.open", create=True) as mock_open, + ): import json mock_open.return_value.__enter__.return_value.read.return_value = ( diff --git a/tests/test_docs_examples.py b/tests/test_docs_examples.py index d998cf2..4747fec 100644 --- a/tests/test_docs_examples.py +++ b/tests/test_docs_examples.py @@ -17,16 +17,16 @@ def extract_code_blocks_from_md(markdown_file: Path) -> List[tuple]: Extract all Python code blocks from a markdown file. Returns list of (code, block_number, line_number) tuples. 
""" - with open(markdown_file, 'r', encoding='utf-8') as f: + with open(markdown_file, "r", encoding="utf-8") as f: content = f.read() - pattern = r'```python\n(.*?)```' + pattern = r"```python\n(.*?)```" matches = re.finditer(pattern, content, re.DOTALL) code_blocks = [] for i, match in enumerate(matches, 1): code = match.group(1) - line_number = content[:match.start()].count('\n') + 1 + line_number = content[: match.start()].count("\n") + 1 code_blocks.append((code, i, line_number)) return code_blocks @@ -34,12 +34,12 @@ def extract_code_blocks_from_md(markdown_file: Path) -> List[tuple]: def get_doc_files() -> List[Path]: """Get all markdown documentation files with code examples.""" - docs_dir = Path(__file__).parent.parent / 'docs' / 'docs' / 'examples' + docs_dir = Path(__file__).parent.parent / "docs" / "docs" / "examples" if not docs_dir.exists(): return [] - return sorted(docs_dir.glob('*.md')) + return sorted(docs_dir.glob("*.md")) def prepare_code_for_testing(code: str) -> str: @@ -67,11 +67,11 @@ def prepare_code_for_testing(code: str) -> str: if 'load_dataset("data/' in modified_code: modified_code = modified_code.replace( 'load_dataset("data/qa_1000.json")', - 'create_qa_dataset(questions=["Q1?"], answers=["A1"], name="test")' + 'create_qa_dataset(questions=["Q1?"], answers=["A1"], name="test")', ) modified_code = modified_code.replace( 'load_dataset("data/news_articles.json")', - 'create_summarization_dataset(documents=["Doc1"], summaries=["Sum1"], name="news")' + 'create_summarization_dataset(documents=["Doc1"], summaries=["Sum1"], name="news")', ) return modified_code @@ -87,8 +87,11 @@ def prepare_code_for_testing(code: str) -> str: test_params.append((doc_file.name, block_num, line_num, code)) -@pytest.mark.parametrize("filename,block_num,line_num,code", test_params, - ids=[f"{f}:block_{b}:L{l}" for f, b, l, _ in test_params]) +@pytest.mark.parametrize( + "filename,block_num,line_num,code", + test_params, + ids=[f"{f}:block_{b}:L{line}" for f, b, 
line, _ in test_params], +) def test_documentation_code_syntax(filename, block_num, line_num, code): """ Test that all code examples in documentation have valid Python syntax. @@ -107,8 +110,11 @@ def test_documentation_code_syntax(filename, block_num, line_num, code): @pytest.mark.slow @pytest.mark.mock -@pytest.mark.parametrize("filename,block_num,line_num,code", test_params, - ids=[f"{f}:block_{b}:L{l}" for f, b, l, _ in test_params]) +@pytest.mark.parametrize( + "filename,block_num,line_num,code", + test_params, + ids=[f"{f}:block_{b}:L{line}" for f, b, line, _ in test_params], +) def test_documentation_code_execution(filename, block_num, line_num, code): """ Test that code examples can be executed without errors (using mock models). @@ -117,11 +123,13 @@ def test_documentation_code_execution(filename, block_num, line_num, code): and will be skipped. """ # Skip examples that are just function definitions without execution - if '@evaluate(' in code and 'asyncio.run' not in code: + if "@evaluate(" in code and "asyncio.run" not in code: pytest.skip("Incomplete example (defines functions only)") # Skip examples that require external data files - if 'load_dataset("data/' in code and 'create_' not in prepare_code_for_testing(code): + if 'load_dataset("data/' in code and "create_" not in prepare_code_for_testing( + code + ): pytest.skip("Requires external data files") # Prepare code with mock models @@ -129,7 +137,7 @@ def test_documentation_code_execution(filename, block_num, line_num, code): # Execute the code try: - exec_globals = {'__name__': '__main__'} + exec_globals = {"__name__": "__main__"} exec(prepared_code, exec_globals) except Exception as e: pytest.fail( @@ -141,11 +149,11 @@ def test_documentation_code_execution(filename, block_num, line_num, code): @pytest.mark.smoke def test_documentation_examples_exist(): """Verify that documentation example files exist and contain code blocks.""" - docs_dir = Path(__file__).parent.parent / 'docs' / 'docs' / 
'examples' + docs_dir = Path(__file__).parent.parent / "docs" / "docs" / "examples" assert docs_dir.exists(), f"Documentation examples directory not found: {docs_dir}" - doc_files = list(docs_dir.glob('*.md')) + doc_files = list(docs_dir.glob("*.md")) assert len(doc_files) > 0, "No documentation markdown files found" total_blocks = 0 @@ -154,9 +162,11 @@ def test_documentation_examples_exist(): total_blocks += len(blocks) assert total_blocks > 0, "No Python code blocks found in documentation" - print(f"\nFound {len(doc_files)} documentation files with {total_blocks} code blocks") + print( + f"\nFound {len(doc_files)} documentation files with {total_blocks} code blocks" + ) -if __name__ == '__main__': +if __name__ == "__main__": # Run just the smoke test - pytest.main([__file__, '-k', 'test_documentation_examples_exist', '-v']) + pytest.main([__file__, "-k", "test_documentation_examples_exist", "-v"]) diff --git a/tests/test_integration.py b/tests/test_integration.py index f88e8be..a64f87b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -109,8 +109,9 @@ def test_model_factory_integration(self): assert adapter.__class__.__name__ == expected_type assert adapter.model_name == model_name - with patch("transformers.AutoTokenizer"), patch( - "transformers.AutoModelForCausalLM" + with ( + patch("transformers.AutoTokenizer"), + patch("transformers.AutoModelForCausalLM"), ): adapter = get_model_adapter("test/unknown-model") assert adapter.__class__.__name__ == "HuggingFaceAdapter" diff --git a/tests/test_models.py b/tests/test_models.py index 53fd01c..f3f6add 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -80,8 +80,9 @@ def test_get_mock_adapter(self): def test_get_huggingface_adapter_default(self): # Use a mock model name that won't trigger real HuggingFace download - with patch("transformers.AutoTokenizer"), patch( - "transformers.AutoModelForCausalLM" + with ( + patch("transformers.AutoTokenizer"), + 
patch("transformers.AutoModelForCausalLM"), ): adapter = get_model_adapter("test/unknown-model-name") assert isinstance(adapter, HuggingFaceAdapter) @@ -197,15 +198,17 @@ def test_google_import_error(self): class TestHuggingFaceAdapter: def test_huggingface_adapter_creation(self): - with patch("transformers.AutoTokenizer"), patch( - "transformers.AutoModelForCausalLM" + with ( + patch("transformers.AutoTokenizer"), + patch("transformers.AutoModelForCausalLM"), ): adapter = HuggingFaceAdapter("gpt2") assert adapter.model_name == "gpt2" def test_huggingface_cost_estimate(self): - with patch("transformers.AutoTokenizer"), patch( - "transformers.AutoModelForCausalLM" + with ( + patch("transformers.AutoTokenizer"), + patch("transformers.AutoModelForCausalLM"), ): adapter = HuggingFaceAdapter("gpt2") cost = adapter.get_cost_estimate(1000, 500) From 8b21a2f1bf4479068d643b7330d57984daea17b7 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Wed, 3 Dec 2025 00:59:40 +0530 Subject: [PATCH 15/24] chore(pre-commit): Update ruff-pre-commit revision to v0.14.7 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4ec570..2c8a49c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - id: debug-statements - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.6 + rev: v0.14.7 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] From 8fa061b67f6ff82b6aba78eaa3ce43037b0c8b07 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Wed, 3 Dec 2025 01:00:47 +0530 Subject: [PATCH 16/24] fix(format): Apply latest ruff formatting and update pre-commit config --- .pre-commit-config.yaml | 4 +-- benchwise/config.py | 6 ++-- benchwise/metrics.py | 12 ++++---- tests/test_memory_large_datasets.py | 48 ++++++++++++++--------------- tests/test_results.py | 6 ++-- 5 files changed, 38 insertions(+), 38 deletions(-) diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index 2c8a49c..0605bc9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -18,7 +18,7 @@ repos: - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.2 + rev: v1.19.0 hooks: - id: mypy additional_dependencies: diff --git a/benchwise/config.py b/benchwise/config.py index a2e32dc..7f673ad 100644 --- a/benchwise/config.py +++ b/benchwise/config.py @@ -200,9 +200,9 @@ def save_to_file(self, file_path: Optional[Path] = None) -> None: # Don't save sensitive information like API key if self.api_key and not os.getenv("BENCHWISE_SAVE_API_KEY"): - config_dict[ - "_note" - ] = "API key not saved for security. Set BENCHWISE_API_KEY environment variable." + config_dict["_note"] = ( + "API key not saved for security. Set BENCHWISE_API_KEY environment variable." + ) try: with open(file_path, "w") as f: diff --git a/benchwise/metrics.py b/benchwise/metrics.py index 4c41baf..f6a322d 100644 --- a/benchwise/metrics.py +++ b/benchwise/metrics.py @@ -282,9 +282,9 @@ def bleu_score( # Add confidence intervals if requested if return_confidence and len(sentence_scores) > 1: try: - result[ - "sentence_bleu_confidence_interval" - ] = _bootstrap_confidence_interval(sentence_scores) + result["sentence_bleu_confidence_interval"] = ( + _bootstrap_confidence_interval(sentence_scores) + ) except Exception as e: warnings.warn(f"Could not calculate BLEU confidence intervals: {e}") @@ -414,9 +414,9 @@ def bert_score_metric( result["f1_confidence_interval"] = _bootstrap_confidence_interval( F1_scores ) - result[ - "precision_confidence_interval" - ] = _bootstrap_confidence_interval(P_scores) + result["precision_confidence_interval"] = ( + _bootstrap_confidence_interval(P_scores) + ) result["recall_confidence_interval"] = _bootstrap_confidence_interval( R_scores ) diff 
--git a/tests/test_memory_large_datasets.py b/tests/test_memory_large_datasets.py index 2a896fb..b5142bd 100644 --- a/tests/test_memory_large_datasets.py +++ b/tests/test_memory_large_datasets.py @@ -29,9 +29,9 @@ async def test_large_dataset_memory_usage(self): current_memory = self.get_memory_usage() memory_increase = current_memory - initial_memory - assert ( - memory_increase < 100 - ), f"Memory usage too high: {memory_increase}MB for {size} items" + assert memory_increase < 100, ( + f"Memory usage too high: {memory_increase}MB for {size} items" + ) sampled = dataset.sample(100) filtered = dataset.filter(lambda x: len(x["question"]) > 10) @@ -57,9 +57,9 @@ async def memory_test_evaluation(model, dataset): generation_memory = after_generation - before_generation # Memory increase should be reasonable - assert ( - generation_memory < 50 - ), f"Generation used too much memory: {generation_memory}MB" + assert generation_memory < 50, ( + f"Generation used too much memory: {generation_memory}MB" + ) return {"response_count": len(responses), "memory_used": generation_memory} @@ -69,9 +69,9 @@ async def memory_test_evaluation(model, dataset): total_memory_increase = final_memory - initial_memory # Total memory increase should be reasonable - assert ( - total_memory_increase < 100 - ), f"Total memory increase too high: {total_memory_increase}MB" + assert total_memory_increase < 100, ( + f"Total memory increase too high: {total_memory_increase}MB" + ) assert len(results) == 1 assert results[0].success @@ -100,9 +100,9 @@ async def test_dataset_chunking_memory_efficiency(self): # Memory shouldn't grow significantly per chunk current_memory = self.get_memory_usage() memory_per_chunk = (current_memory - initial_memory) / processed_chunks - assert ( - memory_per_chunk < 10 - ), f"Memory per chunk too high: {memory_per_chunk}MB" + assert memory_per_chunk < 10, ( + f"Memory per chunk too high: {memory_per_chunk}MB" + ) del chunk_dataset, chunk_data, prompts gc.collect() @@ 
-135,9 +135,9 @@ def dataset_generator(size): memory_used = current_memory - initial_memory max_memory_used = max(max_memory_used, memory_used) - assert ( - memory_used < 50 - ), f"Streaming memory too high: {memory_used}MB at {processed_items} items" + assert memory_used < 50, ( + f"Streaming memory too high: {memory_used}MB at {processed_items} items" + ) assert processed_items == 5000 assert max_memory_used < 50, f"Max memory usage too high: {max_memory_used}MB" @@ -164,9 +164,9 @@ async def cleanup_test(model, dataset): # Memory should return close to baseline current_memory = self.get_memory_usage() memory_diff = current_memory - baseline_memory - assert ( - memory_diff < 30 - ), f"Memory not cleaned up properly: {memory_diff}MB after iteration {i}" + assert memory_diff < 30, ( + f"Memory not cleaned up properly: {memory_diff}MB after iteration {i}" + ) async def test_large_dataset_file_operations(self, tmp_path): initial_memory = self.get_memory_usage() @@ -184,9 +184,9 @@ async def test_large_dataset_file_operations(self, tmp_path): # Memory shouldn't increase significantly during file operations after_save_memory = self.get_memory_usage() save_memory_increase = after_save_memory - initial_memory - assert ( - save_memory_increase < 100 - ), f"Save operation used too much memory: {save_memory_increase}MB" + assert save_memory_increase < 100, ( + f"Save operation used too much memory: {save_memory_increase}MB" + ) # Test loading from file del large_dataset @@ -198,9 +198,9 @@ async def test_large_dataset_file_operations(self, tmp_path): # Memory after loading should be reasonable after_load_memory = self.get_memory_usage() load_memory_increase = after_load_memory - initial_memory - assert ( - load_memory_increase < 150 - ), f"Load operation used too much memory: {load_memory_increase}MB" + assert load_memory_increase < 150, ( + f"Load operation used too much memory: {load_memory_increase}MB" + ) # Verify file sizes are reasonable json_size = 
json_file.stat().st_size / 1024 / 1024 # MB diff --git a/tests/test_results.py b/tests/test_results.py index 8a63d84..6646f62 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -184,9 +184,9 @@ def test_cache_save_and_load(self, temp_cache_dir, sample_evaluation_result): cache_files = list(Path(temp_cache_dir).glob("*.json")) assert len(cache_files) > 0, f"No cache files created in {temp_cache_dir}" - assert ( - loaded is not None - ), f"Failed to load cached result. Cache files: {cache_files}" + assert loaded is not None, ( + f"Failed to load cached result. Cache files: {cache_files}" + ) assert loaded.model_name == sample_evaluation_result.model_name assert loaded.test_name == sample_evaluation_result.test_name From e0a72b45823bb18a6a66e1ad8bc09d1cd59b5fa8 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Thu, 11 Dec 2025 16:01:59 +0530 Subject: [PATCH 17/24] chore: Update mypy configuration for Python 3.12 and enhance type annotations in cli.py, client.py, core.py, and metrics.py --- benchwise/cli.py | 21 ++-- benchwise/client.py | 13 ++- benchwise/core.py | 2 +- benchwise/metrics.py | 256 +++++++++++++++++++++++-------------------- benchwise/types.py | 198 +++++++++++++++++++++++++++++---- mypy.ini | 2 +- 6 files changed, 340 insertions(+), 152 deletions(-) diff --git a/benchwise/cli.py b/benchwise/cli.py index a2bd910..2dfe56b 100644 --- a/benchwise/cli.py +++ b/benchwise/cli.py @@ -5,7 +5,7 @@ import argparse import asyncio import sys -from typing import List, Optional, Any +from typing import List, Optional from . 
import __version__ from .datasets import load_dataset @@ -13,6 +13,7 @@ from .results import save_results, BenchmarkResult, EvaluationResult from .config import get_api_config, configure_benchwise from .client import get_client, sync_offline_results +from .types import ConfigureArgs, ConfigKwargs, SyncArgs, StatusArgs def create_parser() -> argparse.ArgumentParser: @@ -206,14 +207,14 @@ async def run_evaluation( for metric_name in metrics: try: if metric_name == "accuracy": - metric_result: Any = accuracy(responses, references) + metric_result = accuracy(responses, references) results["accuracy"] = metric_result["accuracy"] elif metric_name == "rouge_l": - metric_result = rouge_l(responses, references) - results["rouge_l_f1"] = metric_result["f1"] + rouge_result = rouge_l(responses, references) + results["rouge_l_f1"] = rouge_result["f1"] elif metric_name == "semantic_similarity": - metric_result = semantic_similarity(responses, references) - results["semantic_similarity"] = metric_result[ + semantic_result = semantic_similarity(responses, references) + results["semantic_similarity"] = semantic_result[ "mean_similarity" ] else: @@ -285,7 +286,7 @@ async def run_evaluation( return benchmark_result -async def configure_api(args: Any) -> None: +async def configure_api(args: ConfigureArgs) -> None: """Configure Benchwise API settings.""" from .config import reset_config @@ -300,7 +301,7 @@ async def configure_api(args: Any) -> None: return # Update configuration - kwargs = {} + kwargs: ConfigKwargs = {} if args.api_url: kwargs["api_url"] = args.api_url if args.api_key: @@ -321,7 +322,7 @@ async def configure_api(args: Any) -> None: print("No configuration changes specified. 
Use --show to see current config.") -async def sync_offline(args: Any) -> None: +async def sync_offline(args: SyncArgs) -> None: """Sync offline results with the API.""" try: client = await get_client() @@ -354,7 +355,7 @@ async def sync_offline(args: Any) -> None: pass -async def show_status(args: Any) -> None: +async def show_status(args: StatusArgs) -> None: """Show Benchwise status information.""" config = get_api_config() client = None diff --git a/benchwise/client.py b/benchwise/client.py index fc724fb..3d5700e 100644 --- a/benchwise/client.py +++ b/benchwise/client.py @@ -2,12 +2,14 @@ import asyncio import uuid import logging -from typing import Dict, Any, Optional, List, cast +import types +from typing import Dict, Any, Optional, List, Type, cast from datetime import datetime from contextvars import ContextVar from .config import get_api_config from .results import EvaluationResult, BenchmarkResult +from .types import OfflineQueueItem # Set up logger logger = logging.getLogger("benchwise.client") @@ -64,7 +66,7 @@ def __init__(self, api_url: Optional[str] = None, api_key: Optional[str] = None) self.benchmark_cache: Dict[str, int] = {} # Offline queue for storing results when API is unavailable - self.offline_queue: List[Dict[str, Any]] = [] + self.offline_queue: List[OfflineQueueItem] = [] self.offline_mode = False # Track if client is closed @@ -75,7 +77,12 @@ def __init__(self, api_url: Optional[str] = None, api_key: Optional[str] = None) async def __aenter__(self) -> "BenchwiseClient": return self - async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + async def __aexit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[types.TracebackType], + ) -> None: await self.close() async def close(self) -> None: diff --git a/benchwise/core.py b/benchwise/core.py index 93fe89f..67db3bb 100644 --- a/benchwise/core.py +++ b/benchwise/core.py @@ -31,7 +31,7 @@ def evaluate( *models: 
str, upload: Optional[bool] = None, **kwargs: Any ) -> Callable[ [Callable[..., Awaitable[Any]]], - Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]], + Callable[[Dataset], Awaitable[List[EvaluationResult]]], ]: """ Decorator for creating LLM evaluations. diff --git a/benchwise/metrics.py b/benchwise/metrics.py index f6a322d..aac4f1c 100644 --- a/benchwise/metrics.py +++ b/benchwise/metrics.py @@ -1,6 +1,16 @@ -from typing import List, Dict, Any, Tuple, Optional, Callable +from typing import List, Dict, Any, Tuple, Optional, Callable, cast import numpy as np -from benchwise.types import RougeScores +from benchwise.types import ( + RougeScores, + BleuScores, + BertScoreResults, + AccuracyResults, + SemanticSimilarityResults, + PerplexityResults, + FactualCorrectnessResults, + CoherenceResults, + SafetyResults, +) from rouge_score import rouge_scorer from sacrebleu import BLEU import bert_score @@ -173,7 +183,7 @@ def bleu_score( smooth_method: str = "exp", return_confidence: bool = True, max_n: int = 4, -) -> Dict[str, Any]: +) -> BleuScores: """ Calculate enhanced BLEU scores for predictions vs references. 
@@ -265,30 +275,31 @@ def bleu_score( for i in range(1, max_n + 1): ngram_precisions[f"bleu_{i}"].append(0.0) - result = { + # Build result dict dynamically, then cast to BleuScores + result_dict: Dict[str, Any] = { "corpus_bleu": corpus_bleu, - "sentence_bleu": np.mean(sentence_scores), - "std_sentence_bleu": np.std(sentence_scores), - "median_sentence_bleu": np.median(sentence_scores), + "sentence_bleu": float(np.mean(sentence_scores)), + "std_sentence_bleu": float(np.std(sentence_scores)), + "median_sentence_bleu": float(np.median(sentence_scores)), "scores": sentence_scores, } # Add n-gram precision scores for key, scores in ngram_precisions.items(): if scores: # Only add if we have scores - result[key] = np.mean(scores) - result[f"{key}_std"] = np.std(scores) + result_dict[key] = float(np.mean(scores)) + result_dict[f"{key}_std"] = float(np.std(scores)) # Add confidence intervals if requested if return_confidence and len(sentence_scores) > 1: try: - result["sentence_bleu_confidence_interval"] = ( + result_dict["sentence_bleu_confidence_interval"] = ( _bootstrap_confidence_interval(sentence_scores) ) except Exception as e: warnings.warn(f"Could not calculate BLEU confidence intervals: {e}") - return result + return cast(BleuScores, result_dict) def _get_smoothing_function(smooth_method: str) -> Optional[Callable[..., Any]]: @@ -320,7 +331,7 @@ def bert_score_metric( model_type: str = "distilbert-base-uncased", return_confidence: bool = True, batch_size: int = 64, -) -> Dict[str, Any]: +) -> BertScoreResults: """ Calculate enhanced BERTScore for predictions vs references. 
@@ -340,12 +351,15 @@ def bert_score_metric( ) if not predictions or not references: - return { - "precision": 0.0, - "recall": 0.0, - "f1": 0.0, - "scores": {"precision": [], "recall": [], "f1": []}, - } + return cast( + BertScoreResults, + { + "precision": 0.0, + "recall": 0.0, + "f1": 0.0, + "scores": {"precision": [], "recall": [], "f1": []}, + }, + ) try: # Handle empty strings gracefully @@ -394,16 +408,16 @@ def bert_score_metric( R_scores[idx] = r F1_scores[idx] = f1 - result = { - "precision": np.mean(P_scores), - "recall": np.mean(R_scores), - "f1": np.mean(F1_scores), - "std_precision": np.std(P_scores), - "std_recall": np.std(R_scores), - "std_f1": np.std(F1_scores), - "min_f1": np.min(F1_scores), - "max_f1": np.max(F1_scores), - "median_f1": np.median(F1_scores), + result_dict: Dict[str, Any] = { + "precision": float(np.mean(P_scores)), + "recall": float(np.mean(R_scores)), + "f1": float(np.mean(F1_scores)), + "std_precision": float(np.std(P_scores)), + "std_recall": float(np.std(R_scores)), + "std_f1": float(np.std(F1_scores)), + "min_f1": float(np.min(F1_scores)), + "max_f1": float(np.max(F1_scores)), + "median_f1": float(np.median(F1_scores)), "model_used": model_type, "scores": {"precision": P_scores, "recall": R_scores, "f1": F1_scores}, } @@ -411,36 +425,39 @@ def bert_score_metric( # Add confidence intervals if requested if return_confidence and len(F1_scores) > 1: try: - result["f1_confidence_interval"] = _bootstrap_confidence_interval( + result_dict["f1_confidence_interval"] = _bootstrap_confidence_interval( F1_scores ) - result["precision_confidence_interval"] = ( + result_dict["precision_confidence_interval"] = ( _bootstrap_confidence_interval(P_scores) ) - result["recall_confidence_interval"] = _bootstrap_confidence_interval( - R_scores + result_dict["recall_confidence_interval"] = ( + _bootstrap_confidence_interval(R_scores) ) except Exception as e: warnings.warn( f"Could not calculate BERTScore confidence intervals: {e}" ) - return result 
+ return cast(BertScoreResults, result_dict) except Exception as e: warnings.warn(f"BERTScore calculation failed: {e}") # Return fallback scores - return { - "precision": 0.0, - "recall": 0.0, - "f1": 0.0, - "error": str(e), - "scores": { - "precision": [0.0] * len(predictions), - "recall": [0.0] * len(predictions), - "f1": [0.0] * len(predictions), + return cast( + BertScoreResults, + { + "precision": 0.0, + "recall": 0.0, + "f1": 0.0, + "error": str(e), + "scores": { + "precision": [0.0] * len(predictions), + "recall": [0.0] * len(predictions), + "f1": [0.0] * len(predictions), + }, }, - } + ) def accuracy( @@ -451,7 +468,7 @@ def accuracy( fuzzy_match: bool = False, fuzzy_threshold: float = 0.8, return_confidence: bool = True, -) -> Dict[str, Any]: +) -> AccuracyResults: """ Calculate enhanced exact match accuracy with multiple matching strategies. @@ -473,7 +490,7 @@ def accuracy( ) if not predictions or not references: - return {"accuracy": 0.0, "correct": 0, "total": 0} + return cast(AccuracyResults, {"accuracy": 0.0, "correct": 0, "total": 0}) correct_exact = 0 correct_fuzzy = 0 @@ -533,28 +550,28 @@ def accuracy( exact_accuracy = correct_exact / total if total > 0 else 0.0 fuzzy_accuracy = correct_fuzzy / total if total > 0 else 0.0 - result = { + result_dict: Dict[str, Any] = { "accuracy": exact_accuracy, "exact_accuracy": exact_accuracy, "fuzzy_accuracy": fuzzy_accuracy if fuzzy_match else exact_accuracy, "correct": correct_exact, "correct_fuzzy": correct_fuzzy if fuzzy_match else correct_exact, "total": total, - "mean_score": np.mean(individual_scores), - "std_score": np.std(individual_scores), + "mean_score": float(np.mean(individual_scores)), + "std_score": float(np.std(individual_scores)), "individual_scores": individual_scores, "match_types": match_types, } if return_confidence and len(individual_scores) > 1: try: - result["accuracy_confidence_interval"] = _bootstrap_confidence_interval( - individual_scores + 
result_dict["accuracy_confidence_interval"] = ( + _bootstrap_confidence_interval(individual_scores) ) except Exception as e: warnings.warn(f"Could not calculate accuracy confidence intervals: {e}") - return result + return cast(AccuracyResults, result_dict) def semantic_similarity( @@ -564,7 +581,7 @@ def semantic_similarity( batch_size: int = 32, return_confidence: bool = True, similarity_threshold: float = 0.5, -) -> Dict[str, Any]: +) -> SemanticSimilarityResults: """ Calculate enhanced semantic similarity using sentence embeddings. @@ -585,7 +602,7 @@ def semantic_similarity( ) if not predictions or not references: - return {"mean_similarity": 0.0, "scores": []} + return cast(SemanticSimilarityResults, {"mean_similarity": 0.0, "scores": []}) try: from sentence_transformers import SentenceTransformer, util @@ -655,37 +672,38 @@ def semantic_similarity( # Calculate enhanced statistics similarities_array = np.array(similarities) - result = { - "mean_similarity": np.mean(similarities), - "median_similarity": np.median(similarities), - "std_similarity": np.std(similarities), - "min_similarity": np.min(similarities), - "max_similarity": np.max(similarities), - "similarity_above_threshold": np.sum(similarities_array >= similarity_threshold) - / len(similarities), + result_dict: Dict[str, Any] = { + "mean_similarity": float(np.mean(similarities)), + "median_similarity": float(np.median(similarities)), + "std_similarity": float(np.std(similarities)), + "min_similarity": float(np.min(similarities)), + "max_similarity": float(np.max(similarities)), + "similarity_above_threshold": float( + np.sum(similarities_array >= similarity_threshold) / len(similarities) + ), "scores": similarities, "model_used": model_type, } - result["percentile_25"] = np.percentile(similarities, 25) - result["percentile_75"] = np.percentile(similarities, 75) - result["percentile_90"] = np.percentile(similarities, 90) + result_dict["percentile_25"] = float(np.percentile(similarities, 25)) + 
result_dict["percentile_75"] = float(np.percentile(similarities, 75)) + result_dict["percentile_90"] = float(np.percentile(similarities, 90)) # Add confidence intervals if requested if return_confidence and len(similarities) > 1: try: - result["similarity_confidence_interval"] = _bootstrap_confidence_interval( - similarities + result_dict["similarity_confidence_interval"] = ( + _bootstrap_confidence_interval(similarities) ) except Exception as e: warnings.warn( f"Could not calculate semantic similarity confidence intervals: {e}" ) - return result + return cast(SemanticSimilarityResults, result_dict) -def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, Any]: +def perplexity(predictions: List[str], model_name: str = "gpt2") -> PerplexityResults: """ Calculate perplexity of generated text. @@ -720,11 +738,14 @@ def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, An perplexity = torch.exp(loss).item() perplexities.append(perplexity) - return { - "mean_perplexity": float(np.mean(perplexities)), - "median_perplexity": float(np.median(perplexities)), - "scores": perplexities, - } + return cast( + PerplexityResults, + { + "mean_perplexity": float(np.mean(perplexities)), + "median_perplexity": float(np.median(perplexities)), + "scores": perplexities, + }, + ) def factual_correctness( @@ -734,7 +755,7 @@ def factual_correctness( use_named_entities: bool = True, return_confidence: bool = True, detailed_analysis: bool = True, -) -> Dict[str, Any]: +) -> FactualCorrectnessResults: """ Evaluate factual correctness of predictions using enhanced fact-checking methods. 
@@ -755,7 +776,7 @@ def factual_correctness( ) if not predictions or not references: - return {"mean_correctness": 0.0, "scores": []} + return cast(FactualCorrectnessResults, {"mean_correctness": 0.0, "scores": []}) correctness_scores = [] detailed_results = [] @@ -798,12 +819,12 @@ def factual_correctness( detailed_results.append(factual_analysis) # Compile results - result = { - "mean_correctness": np.mean(correctness_scores), - "median_correctness": np.median(correctness_scores), - "std_correctness": np.std(correctness_scores), - "min_correctness": np.min(correctness_scores), - "max_correctness": np.max(correctness_scores), + result_dict: Dict[str, Any] = { + "mean_correctness": float(np.mean(correctness_scores)), + "median_correctness": float(np.median(correctness_scores)), + "std_correctness": float(np.std(correctness_scores)), + "min_correctness": float(np.min(correctness_scores)), + "max_correctness": float(np.max(correctness_scores)), "scores": correctness_scores, } @@ -811,33 +832,33 @@ def factual_correctness( if detailed_analysis: # Aggregate component scores components = ["entity_overlap", "keyword_overlap", "semantic_overlap"] - result["components"] = {} + result_dict["components"] = {} for component in components: component_scores = [ detail.get(component, 0.0) for detail in detailed_results ] if component_scores: - result["components"][component] = { - "mean": np.mean(component_scores), - "std": np.std(component_scores), + result_dict["components"][component] = { + "mean": float(np.mean(component_scores)), + "std": float(np.std(component_scores)), "scores": component_scores, } - result["detailed_results"] = detailed_results + result_dict["detailed_results"] = detailed_results # Add confidence intervals if requested if return_confidence and len(correctness_scores) > 1: try: - result["correctness_confidence_interval"] = _bootstrap_confidence_interval( - correctness_scores + result_dict["correctness_confidence_interval"] = ( + 
_bootstrap_confidence_interval(correctness_scores) ) except Exception as e: warnings.warn( f"Could not calculate factual correctness confidence intervals: {e}" ) - return result + return cast(FactualCorrectnessResults, result_dict) def _analyze_factual_correctness( @@ -926,7 +947,6 @@ def _calculate_enhanced_keyword_overlap(prediction: str, reference: str) -> floa # Extract important words from reference important_ref_words: set[str] = set() - " ".join(ref_words) for pattern_type, pattern in important_patterns.items(): matches = re.findall(pattern, reference, re.IGNORECASE) @@ -1023,7 +1043,7 @@ def coherence_score( predictions: List[str], return_confidence: bool = True, detailed_analysis: bool = True, -) -> Dict[str, Any]: +) -> CoherenceResults: """ Evaluate text coherence using enhanced linguistic and statistical metrics. @@ -1036,7 +1056,7 @@ def coherence_score( Dictionary with enhanced coherence scores and analysis """ if not predictions: - return {"mean_coherence": 1.0, "scores": []} + return cast(CoherenceResults, {"mean_coherence": 1.0, "scores": []}) coherence_scores = [] component_scores: Dict[str, List[float]] = { @@ -1066,36 +1086,36 @@ def coherence_score( component_scores[component].append(score) # Compile results - result = { - "mean_coherence": np.mean(coherence_scores), - "median_coherence": np.median(coherence_scores), - "std_coherence": np.std(coherence_scores), - "min_coherence": np.min(coherence_scores), - "max_coherence": np.max(coherence_scores), + result_dict: Dict[str, Any] = { + "mean_coherence": float(np.mean(coherence_scores)), + "median_coherence": float(np.median(coherence_scores)), + "std_coherence": float(np.std(coherence_scores)), + "min_coherence": float(np.min(coherence_scores)), + "max_coherence": float(np.max(coherence_scores)), "scores": coherence_scores, } # Add detailed component analysis if requested if detailed_analysis: - result["components"] = {} + result_dict["components"] = {} for component, scores in 
component_scores.items(): if scores: # Only add if we have scores - result["components"][component] = { - "mean": np.mean(scores), - "std": np.std(scores), + result_dict["components"][component] = { + "mean": float(np.mean(scores)), + "std": float(np.std(scores)), "scores": scores, } # Add confidence intervals if requested if return_confidence and len(coherence_scores) > 1: try: - result["coherence_confidence_interval"] = _bootstrap_confidence_interval( - coherence_scores + result_dict["coherence_confidence_interval"] = ( + _bootstrap_confidence_interval(coherence_scores) ) except Exception as e: warnings.warn(f"Could not calculate coherence confidence intervals: {e}") - return result + return cast(CoherenceResults, result_dict) def _analyze_text_coherence(text: str) -> Dict[str, float]: @@ -1276,7 +1296,7 @@ def safety_score( return_confidence: bool = True, detailed_categories: bool = True, context_aware: bool = True, -) -> Dict[str, Any]: +) -> SafetyResults: """ Evaluate safety of generated text with enhanced detection methods. 
@@ -1293,7 +1313,7 @@ def safety_score( Dictionary with enhanced safety scores and category breakdowns """ if not predictions: - return {"mean_safety": 1.0, "scores": []} + return cast(SafetyResults, {"mean_safety": 1.0, "scores": []}) if safety_categories is None: safety_categories = [ @@ -1417,11 +1437,11 @@ def safety_score( violation_details.append(violations) # Compile results - result = { - "mean_safety": np.mean(safety_scores), - "median_safety": np.median(safety_scores), - "std_safety": np.std(safety_scores), - "min_safety": np.min(safety_scores), + result_dict: Dict[str, Any] = { + "mean_safety": float(np.mean(safety_scores)), + "median_safety": float(np.median(safety_scores)), + "std_safety": float(np.std(safety_scores)), + "min_safety": float(np.min(safety_scores)), "unsafe_count": len( [s for s in safety_scores if s < 0.8] ), # Threshold for "unsafe" @@ -1431,25 +1451,25 @@ def safety_score( # Add detailed category scores if requested if detailed_categories: - result["category_scores"] = {} + result_dict["category_scores"] = {} for cat in safety_categories: if category_scores[cat]: # Only add if we have scores - result["category_scores"][cat] = { - "mean": np.mean(category_scores[cat]), - "violation_rate": 1.0 - np.mean(category_scores[cat]), + result_dict["category_scores"][cat] = { + "mean": float(np.mean(category_scores[cat])), + "violation_rate": float(1.0 - np.mean(category_scores[cat])), "scores": category_scores[cat], } # Add confidence intervals if requested if return_confidence and len(safety_scores) > 1: try: - result["safety_confidence_interval"] = _bootstrap_confidence_interval( + result_dict["safety_confidence_interval"] = _bootstrap_confidence_interval( safety_scores ) except Exception as e: warnings.warn(f"Could not calculate safety confidence intervals: {e}") - return result + return cast(SafetyResults, result_dict) def _check_keyword_in_context( diff --git a/benchwise/types.py b/benchwise/types.py index a351b09..38cf447 100644 --- 
a/benchwise/types.py +++ b/benchwise/types.py @@ -75,43 +75,81 @@ class RougeScores(TypedDict, total=False): class BleuScores(TypedDict, total=False): """Return type for BLEU metric scores.""" - bleu: float - bleu1: float - bleu2: float - bleu3: float - bleu4: float - brevity_penalty: float - length_ratio: float - std_bleu: float + # Required fields + corpus_bleu: float + sentence_bleu: float + std_sentence_bleu: float + median_sentence_bleu: float scores: List[float] - # Optional confidence intervals - bleu_confidence_interval: Tuple[float, float] + + # N-gram precision scores (dynamically added based on max_n) + bleu_1: float + bleu_1_std: float + bleu_2: float + bleu_2_std: float + bleu_3: float + bleu_3_std: float + bleu_4: float + bleu_4_std: float + + # Optional confidence interval + sentence_bleu_confidence_interval: Tuple[float, float] class BertScoreResults(TypedDict, total=False): """Return type for BERT-Score metric.""" + # Main scores precision: float recall: float f1: float + + # Standard deviations std_precision: float std_recall: float std_f1: float + + # Additional statistics + min_f1: float + max_f1: float + median_f1: float + + # Metadata + model_used: str + + # Individual scores per sample scores: Dict[str, List[float]] + # Optional confidence intervals f1_confidence_interval: Tuple[float, float] precision_confidence_interval: Tuple[float, float] recall_confidence_interval: Tuple[float, float] + # Error field (when calculation fails) + error: str + class AccuracyResults(TypedDict, total=False): """Return type for accuracy metric.""" + # Main accuracy metrics accuracy: float + exact_accuracy: float + fuzzy_accuracy: float + + # Counts correct: int + correct_fuzzy: int total: int - std_accuracy: float - scores: List[float] + + # Statistical measures + mean_score: float + std_score: float + + # Individual scores and match information + individual_scores: List[float] + match_types: List[str] + # Optional confidence interval 
accuracy_confidence_interval: Tuple[float, float] @@ -119,42 +157,126 @@ class AccuracyResults(TypedDict, total=False): class SemanticSimilarityResults(TypedDict, total=False): """Return type for semantic similarity metric.""" - similarity: float + # Main similarity metrics + mean_similarity: float + median_similarity: float std_similarity: float + min_similarity: float + max_similarity: float + + # Threshold-based metrics + similarity_above_threshold: float + + # Percentiles + percentile_25: float + percentile_75: float + percentile_90: float + + # Metadata + model_used: str + + # Individual scores scores: List[float] + # Optional confidence interval similarity_confidence_interval: Tuple[float, float] +class PerplexityResults(TypedDict, total=False): + """Return type for perplexity metric.""" + + # Perplexity metrics + mean_perplexity: float + median_perplexity: float + + # Individual scores + scores: List[float] + + +class ComponentAnalysis(TypedDict, total=False): + """Component analysis for factual correctness.""" + + mean: float + std: float + scores: List[float] + + class CoherenceResults(TypedDict, total=False): """Return type for coherence score metric.""" - coherence: float + # Main coherence metrics + mean_coherence: float + median_coherence: float std_coherence: float + min_coherence: float + max_coherence: float + + # Individual scores scores: List[float] + + # Optional detailed component analysis + components: Dict[str, ComponentAnalysis] + # Optional confidence interval coherence_confidence_interval: Tuple[float, float] +class SafetyCategoryScore(TypedDict, total=False): + """Per-category safety score analysis.""" + + mean: float + violation_rate: float + scores: List[float] + + class SafetyResults(TypedDict, total=False): """Return type for safety score metric.""" - safety: float - is_safe: bool - flagged_categories: List[str] + # Main safety metrics + mean_safety: float + median_safety: float std_safety: float + min_safety: float + unsafe_count: int 
+ + # Individual scores scores: List[float] + + # Violation details per prediction + violation_details: List[List[str]] + + # Optional detailed category analysis + category_scores: Dict[str, SafetyCategoryScore] + # Optional confidence interval safety_confidence_interval: Tuple[float, float] +class DetailedFactualAnalysis(TypedDict, total=False): + """Detailed factual analysis for a single prediction-reference pair.""" + + entity_overlap: float + keyword_overlap: float + semantic_overlap: float + + class FactualCorrectnessResults(TypedDict, total=False): """Return type for factual correctness metric.""" - correctness: float - is_correct: bool + # Main correctness metrics + mean_correctness: float + median_correctness: float std_correctness: float + min_correctness: float + max_correctness: float + + # Individual scores scores: List[float] + + # Optional detailed analysis + components: Dict[str, ComponentAnalysis] + detailed_results: List[DetailedFactualAnalysis] + # Optional confidence interval correctness_confidence_interval: Tuple[float, float] @@ -322,3 +444,41 @@ def evaluate( ) -> Dict[str, float]: """Evaluate predictions against references.""" ... 
+ + +class ConfigureArgs(Protocol): + """Arguments for configuring Benchwise.""" + + reset: bool + show: bool + api_url: str | None + api_key: str | None + upload: str | None + + +class SyncArgs(Protocol): + """Arguments for sync command.""" + + dry_run: bool + + +class StatusArgs(Protocol): + """Arguments for status command.""" + + api: bool + auth: bool + + +class ConfigKwargs(TypedDict, total=False): + """Kwargs for configure_benchwise function.""" + + api_url: str + api_key: str + upload_enabled: bool + + +class OfflineQueueItem(TypedDict): + """Item in offline queue.""" + + data: Dict[str, Any] + timestamp: str diff --git a/mypy.ini b/mypy.ini index 1383e41..f311680 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,5 +1,5 @@ [mypy] -python_version = 3.11 +python_version = 3.12 files = benchwise # Strict type checking From e59505070ddab34ba60d16c53a264c630ca32b61 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Thu, 11 Dec 2025 16:14:30 +0530 Subject: [PATCH 18/24] chore: Remove GEMINI.md file from repo --- GEMINI.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 GEMINI.md diff --git a/GEMINI.md b/GEMINI.md deleted file mode 100644 index d0892af..0000000 --- a/GEMINI.md +++ /dev/null @@ -1 +0,0 @@ -Follow CLAUDE.md From 420e1691fa512a17e11552a0d091d49e311eceaf Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Thu, 11 Dec 2025 17:51:33 +0530 Subject: [PATCH 19/24] fix(core): Update evaluation function to use wrapper for benchmark metadata --- benchwise/core.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/benchwise/core.py b/benchwise/core.py index 67db3bb..5c830ce 100644 --- a/benchwise/core.py +++ b/benchwise/core.py @@ -68,7 +68,7 @@ async def wrapper( dataset: Dataset, **test_kwargs: Any ) -> List[EvaluationResult]: return await _run_evaluation( - test_func, dataset, models, upload, kwargs, test_kwargs + test_func, wrapper, dataset, models, upload, kwargs, test_kwargs ) if hasattr(test_func, "_benchmark_metadata"): @@ -81,6 
+81,7 @@ async def wrapper( async def _run_evaluation( test_func: Callable[..., Awaitable[Any]], + wrapper_func: Callable[..., Awaitable[Any]], dataset: Dataset, models: tuple[str, ...], upload: Optional[bool], @@ -102,8 +103,8 @@ async def _run_evaluation( end_time = time.time() combined_metadata = decorator_kwargs.copy() - if hasattr(test_func, "_benchmark_metadata"): - combined_metadata.update(test_func._benchmark_metadata) + if hasattr(wrapper_func, "_benchmark_metadata"): + combined_metadata.update(wrapper_func._benchmark_metadata) eval_result = EvaluationResult( model_name=model_name, @@ -121,8 +122,8 @@ async def _run_evaluation( logger.error(f"โœ— {model_name} failed: {e}", exc_info=True) combined_metadata = decorator_kwargs.copy() - if hasattr(test_func, "_benchmark_metadata"): - combined_metadata.update(test_func._benchmark_metadata) + if hasattr(wrapper_func, "_benchmark_metadata"): + combined_metadata.update(wrapper_func._benchmark_metadata) eval_result = EvaluationResult( model_name=model_name, From dcf430ea7bf1c1351fd4a19051e7c43a87208d01 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Thu, 11 Dec 2025 18:03:06 +0530 Subject: [PATCH 20/24] chore(ci): Update CI workflow to install metrics dependencies and fix test gpt adapter --- .github/workflows/ci.yml | 2 +- tests/test_models.py | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e6ce6a1..acb16d8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e ".[dev,test]" + pip install -e ".[dev,test,metrics]" - name: Run ruff linter run: ruff check benchwise tests diff --git a/tests/test_models.py b/tests/test_models.py index f3f6add..bdaf260 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -59,9 +59,10 @@ def test_mock_cost_estimate(self): class TestGetModelAdapter: def 
test_get_gpt_adapter(self): - adapter = get_model_adapter("gpt-3.5-turbo") - assert isinstance(adapter, OpenAIAdapter) - assert adapter.model_name == "gpt-3.5-turbo" + with patch("openai.AsyncOpenAI"): + adapter = get_model_adapter("gpt-3.5-turbo") + assert isinstance(adapter, OpenAIAdapter) + assert adapter.model_name == "gpt-3.5-turbo" def test_get_claude_adapter(self): adapter = get_model_adapter("claude-3-haiku") @@ -242,10 +243,11 @@ class TestModelNaming: def test_gpt_variants(self): models = ["gpt-3.5-turbo", "gpt-4", "gpt-4o"] - for model in models: - adapter = get_model_adapter(model) - assert isinstance(adapter, OpenAIAdapter) - assert adapter.model_name == model + with patch("openai.AsyncOpenAI"): + for model in models: + adapter = get_model_adapter(model) + assert isinstance(adapter, OpenAIAdapter) + assert adapter.model_name == model def test_claude_variants(self): models = ["claude-3-opus", "claude-3-sonnet", "claude-3-haiku"] From a3211ea9520d8255478693996f7c267b8f50f7ec Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Thu, 11 Dec 2025 18:10:38 +0530 Subject: [PATCH 21/24] chore(ci): Update CI workflow to install all development dependencies --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index acb16d8..d39e2df 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e ".[dev,test,metrics]" + pip install -e ". 
[dev,all]" - name: Run ruff linter run: ruff check benchwise tests From 1da265b2480209c45c02a2b289d33b84da9e6db9 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Sat, 13 Dec 2025 17:38:52 +0530 Subject: [PATCH 22/24] refactor: Improve type safety across codebase with TypedDicts --- .github/workflows/ci.yml | 2 +- .gitignore | 2 - benchwise/cli.py | 47 +++- benchwise/client.py | 196 +++++++++----- benchwise/core.py | 94 +++++-- benchwise/datasets.py | 539 +++++++++++++++++++++++++++----------- benchwise/results.py | 189 +++++++------ benchwise/types.py | 266 +++++++++++++++++-- demo.py | 21 +- docs/test_load_dataset.py | 2 + 10 files changed, 1003 insertions(+), 355 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d39e2df..a3ed4c0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e ". [dev,all]" + pip install -e ".[dev,all]" - name: Run ruff linter run: ruff check benchwise tests diff --git a/.gitignore b/.gitignore index 19573c7..aacac31 100644 --- a/.gitignore +++ b/.gitignore @@ -252,6 +252,4 @@ redis-data/ celery-beat-schedule # AI files -CLAUDE.md -GEMINI.md test_single_doc_file.py diff --git a/benchwise/cli.py b/benchwise/cli.py index 2dfe56b..d71bab5 100644 --- a/benchwise/cli.py +++ b/benchwise/cli.py @@ -8,12 +8,12 @@ from typing import List, Optional from . 
import __version__ -from .datasets import load_dataset +from .datasets import load_dataset, convert_metadata_to_info from .models import get_model_adapter from .results import save_results, BenchmarkResult, EvaluationResult from .config import get_api_config, configure_benchwise from .client import get_client, sync_offline_results -from .types import ConfigureArgs, ConfigKwargs, SyncArgs, StatusArgs +from .types import ConfigureArgs, ConfigKwargs, SyncArgs, StatusArgs, DatasetInfo def create_parser() -> argparse.ArgumentParser: @@ -136,15 +136,21 @@ async def run_evaluation( sys.exit(1) # Create benchmark result + from .types import EvaluationMetadata + from typing import cast + benchmark_result = BenchmarkResult( benchmark_name=f"cli_evaluation_{dataset.name}", - metadata={ - "dataset_path": dataset_path, - "models": models, - "metrics": metrics, - "temperature": temperature, - "max_tokens": max_tokens, - }, + metadata=cast( + EvaluationMetadata, + { + "dataset_path": dataset_path, + "models": models, + "metrics": metrics, + "temperature": temperature, + "max_tokens": max_tokens, + }, + ), ) # Run evaluation for each model @@ -239,7 +245,9 @@ async def run_evaluation( model_name=model_name, test_name="cli_evaluation", result=results, - dataset_info=dataset.metadata, + dataset_info=convert_metadata_to_info(dataset.metadata) + if dataset.metadata + else None, ) benchmark_result.add_result(eval_result) @@ -251,7 +259,9 @@ async def run_evaluation( model_name=model_name, test_name="cli_evaluation", error=str(e), - dataset_info=dataset.metadata, + dataset_info=convert_metadata_to_info(dataset.metadata) + if dataset.metadata + else None, ) benchmark_result.add_result(eval_result) print(f"โœ— {model_name} failed: {e}") @@ -268,10 +278,23 @@ async def run_evaluation( try: from .client import upload_results + # Extract dataset_info from dataset metadata for upload_results + # upload_results expects DatasetInfo + dataset_info_for_upload: DatasetInfo = cast( + DatasetInfo, + 
{ + "size": dataset.size, + "task": "general", + "tags": [], + }, + ) + if dataset.metadata: + dataset_info_for_upload = convert_metadata_to_info(dataset.metadata) + success = await upload_results( benchmark_result.results, benchmark_result.benchmark_name, - benchmark_result.metadata, + dataset_info_for_upload, ) if success: print("โœ… Results uploaded to Benchwise API") diff --git a/benchwise/client.py b/benchwise/client.py index 3d5700e..6adc153 100644 --- a/benchwise/client.py +++ b/benchwise/client.py @@ -9,7 +9,21 @@ from .config import get_api_config from .results import EvaluationResult, BenchmarkResult -from .types import OfflineQueueItem +from .types import ( + OfflineQueueItem, + LoginResponse, + UserInfo, + ModelInfo, + BenchmarkInfo, + BenchmarkRegistrationData, + EvaluationInfo, + DatasetInfo, + EvaluationMetadata, + EvaluationResultDict, + UploadBenchmarkResponse, + FileUploadResponse, + TokenData, +) # Set up logger logger = logging.getLogger("benchwise.client") @@ -179,7 +193,7 @@ async def health_check(self) -> bool: logger.warning(f"Health check failed: {e}") return False - async def login(self, username: str, password: str) -> Dict[str, Any]: + async def login(self, username: str, password: str) -> LoginResponse: """ Login with username/password to get JWT token. 
@@ -197,14 +211,18 @@ async def login(self, username: str, password: str) -> Dict[str, Any]: ) if response.status_code == 200: - token_data = response.json() + token_data = cast(TokenData, response.json()) self.jwt_token = token_data["access_token"] self._set_auth_header() # Get user info user_info = await self.get_current_user() logger.info(f"Login successful for user: {username}") - return {"token": token_data, "user": user_info} + login_response: LoginResponse = { + "token": token_data, + "user": user_info, + } + return login_response elif response.status_code == 401: logger.error("Login failed: Invalid credentials") raise BenchwiseAPIError("Invalid username or password") @@ -218,7 +236,7 @@ async def login(self, username: str, password: str) -> Dict[str, Any]: async def register( self, username: str, email: str, password: str, full_name: Optional[str] = None - ) -> Dict[str, Any]: + ) -> UserInfo: """ Register a new user account. @@ -246,7 +264,7 @@ async def register( if response.status_code == 201: logger.info(f"Registration successful for user: {username}") - return cast(Dict[str, Any], response.json()) + return cast(UserInfo, response.json()) elif response.status_code == 400: error_detail = response.json().get("detail", "Registration failed") logger.error(f"Registration failed: {error_detail}") @@ -259,7 +277,7 @@ async def register( logger.error(f"Network error during registration: {e}") raise BenchwiseAPIError(f"Network error during registration: {e}") - async def get_current_user(self) -> Dict[str, Any]: + async def get_current_user(self) -> UserInfo: """ Get current authenticated user information. 
@@ -273,7 +291,7 @@ async def get_current_user(self) -> Dict[str, Any]: response = await self.client.get("/api/v1/users/me") if response.status_code == 200: - return cast(Dict[str, Any], response.json()) + return cast(UserInfo, response.json()) elif response.status_code == 401: logger.warning("Authentication expired") raise BenchwiseAPIError("Authentication expired - please login again") @@ -288,7 +306,7 @@ async def get_current_user(self) -> Dict[str, Any]: # WIP: Simplified upload workflow (to be completed in future release) async def upload_benchmark_result_simple( self, benchmark_result: BenchmarkResult - ) -> Dict[str, Any]: + ) -> UploadBenchmarkResponse: """ WIP: Simplified single-call upload for benchmark results. @@ -345,8 +363,8 @@ async def register_model( response = await self.client.post("/api/v1/models", json=model_data) if response.status_code == 201: - model_info = cast(Dict[str, Any], response.json()) - model_db_id = cast(int, model_info["id"]) + model_info = cast(ModelInfo, response.json()) + model_db_id = model_info["id"] self.model_cache[cache_key] = model_db_id logger.info(f"Model registered successfully with ID: {model_db_id}") return model_db_id @@ -372,12 +390,15 @@ async def _get_existing_model(self, provider: str, model_id: str) -> int: ) if response.status_code == 200: - models = cast(List[Dict[str, Any]], response.json()) + models = cast(List[ModelInfo], response.json()) # Filter in Python since backend doesn't support model_id parameter for model in models: - if model["provider"] == provider and model["model_id"] == model_id: + if ( + model.get("provider") == provider + and model.get("model_id") == model_id + ): cache_key = f"{provider}:{model_id}" - model_id_value = cast(int, model["id"]) + model_id_value: int = model["id"] self.model_cache[cache_key] = model_id_value logger.debug(f"Found existing model with ID: {model_id_value}") return model_id_value @@ -392,7 +413,7 @@ async def _get_existing_model(self, provider: str, model_id: 
str) -> int: raise BenchwiseAPIError(f"Network error searching models: {e}") async def register_benchmark( - self, benchmark_name: str, description: str, dataset_info: Dict[str, Any] + self, benchmark_name: str, description: str, dataset_info: DatasetInfo ) -> int: """ Register a benchmark and return its database ID. @@ -419,11 +440,11 @@ async def register_benchmark( logger.info(f"Registering benchmark: {benchmark_name}") try: - benchmark_data = { + benchmark_data: BenchmarkRegistrationData = { "name": benchmark_name, "description": description, - "category": dataset_info.get("task", "general"), - "tags": dataset_info.get("tags", []), + "category": dataset_info.get("task", "general") or "general", + "tags": dataset_info.get("tags", []) or [], "difficulty": dataset_info.get("difficulty"), "dataset_url": dataset_info.get("source"), "config": {}, @@ -434,8 +455,8 @@ async def register_benchmark( response = await self.client.post("/api/v1/benchmarks", json=benchmark_data) if response.status_code == 201: - benchmark_info = cast(Dict[str, Any], response.json()) - benchmark_db_id = cast(int, benchmark_info["id"]) + benchmark_info = cast(BenchmarkInfo, response.json()) + benchmark_db_id = benchmark_info["id"] self.benchmark_cache[benchmark_name] = benchmark_db_id logger.info( f"Benchmark registered successfully with ID: {benchmark_db_id}" @@ -463,11 +484,11 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int: ) if response.status_code == 200: - benchmarks = cast(List[Dict[str, Any]], response.json()) + benchmarks = cast(List[BenchmarkInfo], response.json()) # Look for exact name match first, then partial match for benchmark in benchmarks: - if benchmark["name"] == benchmark_name: - benchmark_id_value = cast(int, benchmark["id"]) + if benchmark.get("name") == benchmark_name: + benchmark_id_value: int = benchmark["id"] self.benchmark_cache[benchmark_name] = benchmark_id_value logger.debug( f"Found existing benchmark with ID: {benchmark_id_value}" @@ 
-476,8 +497,9 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int: # If no exact match, try partial match for benchmark in benchmarks: - if benchmark_name.lower() in benchmark["name"].lower(): - benchmark_id_value = cast(int, benchmark["id"]) + benchmark_name_val = benchmark.get("name", "") + if benchmark_name.lower() in benchmark_name_val.lower(): + benchmark_id_value = benchmark["id"] self.benchmark_cache[benchmark_name] = benchmark_id_value logger.debug( f"Found similar benchmark with ID: {benchmark_id_value}" @@ -498,7 +520,7 @@ async def create_evaluation( name: str, benchmark_id: int, model_ids: List[int], - metadata: Optional[Dict[str, Any]] = None, + metadata: Optional[EvaluationMetadata] = None, ) -> int: """ Create evaluation with correct backend format. @@ -527,8 +549,8 @@ async def create_evaluation( ) if response.status_code == 201: - evaluation_info = cast(Dict[str, Any], response.json()) - evaluation_id = cast(int, evaluation_info["id"]) + evaluation_info = cast(EvaluationInfo, response.json()) + evaluation_id = evaluation_info["id"] logger.info(f"Evaluation created successfully with ID: {evaluation_id}") return evaluation_id elif response.status_code == 401: @@ -555,7 +577,7 @@ async def create_evaluation( raise e async def upload_evaluation_results( - self, evaluation_id: int, results: List[Dict[str, Any]] + self, evaluation_id: int, results: List[EvaluationResultDict] ) -> bool: """ Upload results to an existing evaluation using the correct endpoint. @@ -603,7 +625,7 @@ async def upload_evaluation_results( async def upload_benchmark_result( self, benchmark_result: BenchmarkResult - ) -> Dict[str, Any]: + ) -> UploadBenchmarkResponse: """ Upload a complete benchmark result using correct workflow. 
@@ -620,12 +642,24 @@ async def upload_benchmark_result( try: # Step 1: Register benchmark if needed benchmark_name = benchmark_result.benchmark_name + description_value: Any = benchmark_result.metadata.get( + "description", f"Benchmark: {benchmark_name}" + ) + description_str: str = ( + description_value + if isinstance(description_value, str) + else f"Benchmark: {benchmark_name}" + ) + dataset_info_value: Any = benchmark_result.metadata.get("dataset", {}) + dataset_info_typed: DatasetInfo = ( + cast(DatasetInfo, dataset_info_value) + if isinstance(dataset_info_value, dict) + else cast(DatasetInfo, {}) + ) benchmark_id = await self.register_benchmark( benchmark_name=benchmark_name, - description=benchmark_result.metadata.get( - "description", f"Benchmark: {benchmark_name}" - ), - dataset_info=benchmark_result.metadata.get("dataset", {}), + description=description_str, + dataset_info=dataset_info_typed, ) # Step 2: Register models and collect their IDs @@ -661,22 +695,11 @@ async def upload_benchmark_result( ) # Step 4: Prepare and upload results - results_data = [] + results_data: List[EvaluationResultDict] = [] for result in benchmark_result.results: if result.success and result.model_name in model_name_to_id: - result_data = { - "model_id": model_name_to_id[result.model_name], - "metrics": result.result - if isinstance(result.result, dict) - else {"score": result.result}, - "outputs": {}, # Could include sample outputs if needed - "metadata": { - "duration": result.duration, - "timestamp": result.timestamp.isoformat(), - **result.metadata, - }, - } - results_data.append(result_data) + # result.to_dict() already returns EvaluationResultDict + results_data.append(result.to_dict()) # Step 5: Upload results await self.upload_evaluation_results(evaluation_id, results_data) @@ -684,13 +707,21 @@ async def upload_benchmark_result( logger.info( f"Benchmark result uploaded successfully. 
Evaluation ID: {evaluation_id}" ) - return { + # Build response with explicit types matching UploadBenchmarkResponse + # All values are properly typed: + # - evaluation_id: int (from create_evaluation) + # - benchmark_id: int (from register_benchmark) + # - model_ids: List[int] (from register_model) + # - len(results_data): int + # - message: str + response: UploadBenchmarkResponse = { "id": evaluation_id, "benchmark_id": benchmark_id, "model_ids": model_ids, "results_count": len(results_data), "message": "Evaluation uploaded successfully", } + return response except Exception as e: # Add to offline queue for later sync @@ -736,7 +767,7 @@ def _get_model_provider(self, model_name: str) -> str: async def get_benchmarks( self, limit: int = 50, skip: int = 0 - ) -> List[Dict[str, Any]]: + ) -> List[BenchmarkInfo]: """Get available benchmarks from the API.""" try: response = await self.client.get( @@ -744,7 +775,7 @@ async def get_benchmarks( ) if response.status_code == 200: - return cast(List[Dict[str, Any]], response.json()) + return cast(List[BenchmarkInfo], response.json()) else: raise BenchwiseAPIError( f"Failed to retrieve benchmarks: {response.status_code}" @@ -755,7 +786,7 @@ async def get_benchmarks( async def get_evaluations( self, limit: int = 50, skip: int = 0 - ) -> List[Dict[str, Any]]: + ) -> List[EvaluationInfo]: """Get evaluations from the API.""" try: response = await self.client.get( @@ -763,7 +794,7 @@ async def get_evaluations( ) if response.status_code == 200: - return cast(List[Dict[str, Any]], response.json()) + return cast(List[EvaluationInfo], response.json()) else: raise BenchwiseAPIError( f"Failed to retrieve evaluations: {response.status_code}" @@ -774,9 +805,11 @@ async def get_evaluations( async def _add_to_offline_queue(self, data: Dict[str, Any]) -> None: """Add data to offline queue for later upload.""" - self.offline_queue.append( - {"data": data, "timestamp": datetime.now().isoformat()} - ) + queue_item: OfflineQueueItem = { + 
"data": data, + "timestamp": datetime.now().isoformat(), + } + self.offline_queue.append(queue_item) self.offline_mode = True logger.info(f"Added to offline queue (size: {len(self.offline_queue)})") @@ -792,21 +825,33 @@ async def sync_offline_queue(self) -> int: for item in self.offline_queue: try: - data = item["data"] - data_type = data.get("type") + queue_data: Dict[str, Any] = item["data"] + data_type: Any = queue_data.get("type") if data_type == "full_benchmark_result": # Reconstruct BenchmarkResult and upload from .results import BenchmarkResult - benchmark_result = BenchmarkResult(**data["benchmark_result"]) + benchmark_result_dict: Dict[str, Any] = queue_data.get( + "benchmark_result", {} + ) + benchmark_result = BenchmarkResult(**benchmark_result_dict) await self.upload_benchmark_result(benchmark_result) elif data_type == "create_evaluation": - await self.create_evaluation(**data["data"]) + evaluation_data: Dict[str, Any] = queue_data.get("data", {}) + await self.create_evaluation(**evaluation_data) elif data_type == "upload_results": - await self.upload_evaluation_results( - data["evaluation_id"], data["results"] - ) + evaluation_id_value: Any = queue_data.get("evaluation_id") + results_value: Any = queue_data.get("results") + if isinstance(evaluation_id_value, int) and isinstance( + results_value, list + ): + results_list: List[EvaluationResultDict] = cast( + List[EvaluationResultDict], results_value + ) + await self.upload_evaluation_results( + evaluation_id_value, results_list + ) synced_count += 1 logger.info(f"Synced item from {item['timestamp']}") @@ -853,8 +898,8 @@ async def upload_dataset_for_benchmark( ) if response.status_code == 200: - result = cast(Dict[str, Any], response.json()) - file_url = cast(str, result["file_info"]["url"]) + result = cast(FileUploadResponse, response.json()) + file_url = result["file_info"]["url"] logger.info("Dataset uploaded successfully") return file_url else: @@ -895,8 +940,8 @@ async def 
create_benchmark_with_dataset( f"Failed to create benchmark: {response.status_code}" ) - benchmark = cast(Dict[str, Any], response.json()) - benchmark_id = cast(int, benchmark["id"]) + benchmark = cast(BenchmarkInfo, response.json()) + benchmark_id = benchmark["id"] # 2. Upload dataset try: @@ -948,7 +993,7 @@ async def close_client() -> None: async def upload_results( - results: List[EvaluationResult], test_name: str, dataset_info: Dict[str, Any] + results: List[EvaluationResult], test_name: str, dataset_info: DatasetInfo ) -> bool: """ Convenience function to upload evaluation results. @@ -974,7 +1019,10 @@ async def upload_results( benchmark_result = BenchmarkResult( benchmark_name=test_name, results=results, - metadata={"dataset": dataset_info}, + metadata=cast( + EvaluationMetadata, + {"dataset": dataset_info}, + ), ) await client._add_to_offline_queue( { @@ -992,7 +1040,10 @@ async def upload_results( benchmark_result = BenchmarkResult( benchmark_name=test_name, results=results, - metadata={"dataset": dataset_info}, + metadata=cast( + EvaluationMetadata, + {"dataset": dataset_info}, + ), ) await client._add_to_offline_queue( { @@ -1008,7 +1059,10 @@ async def upload_results( benchmark_result = BenchmarkResult( benchmark_name=test_name, results=results, - metadata={"dataset": dataset_info}, + metadata=cast( + EvaluationMetadata, + {"dataset": dataset_info}, + ), ) response = await client.upload_benchmark_result(benchmark_result) diff --git a/benchwise/core.py b/benchwise/core.py index 5c830ce..df9c724 100644 --- a/benchwise/core.py +++ b/benchwise/core.py @@ -8,6 +8,7 @@ ParamSpec, TypeVar, Awaitable, + cast, ) from functools import wraps import asyncio @@ -15,10 +16,18 @@ import inspect import logging from .models import get_model_adapter -from .datasets import Dataset +from .datasets import Dataset, convert_metadata_to_info from .results import EvaluationResult from .config import get_api_config from .client import upload_results +from .types import ( + 
RunnerConfig, + ModelComparisonResult, + EvaluationResultDict, + EvaluationMetadata, + DatasetInfo, + CallableWithBenchmarkMetadata, +) # Type variables for decorator typing P = ParamSpec("P") @@ -71,8 +80,15 @@ async def wrapper( test_func, wrapper, dataset, models, upload, kwargs, test_kwargs ) + # Copy benchmark metadata if it exists if hasattr(test_func, "_benchmark_metadata"): - wrapper._benchmark_metadata = test_func._benchmark_metadata # type: ignore[attr-defined] + # Type narrowing: test_func has _benchmark_metadata after hasattr check + benchmark_func = cast(CallableWithBenchmarkMetadata, test_func) + # Type the wrapper as having the metadata attribute + wrapper_with_metadata = cast(CallableWithBenchmarkMetadata, wrapper) + wrapper_with_metadata._benchmark_metadata = ( + benchmark_func._benchmark_metadata + ) return wrapper @@ -104,15 +120,19 @@ async def _run_evaluation( combined_metadata = decorator_kwargs.copy() if hasattr(wrapper_func, "_benchmark_metadata"): - combined_metadata.update(wrapper_func._benchmark_metadata) + # Type narrowing: wrapper_func has _benchmark_metadata after hasattr check + benchmark_func = cast(CallableWithBenchmarkMetadata, wrapper_func) + combined_metadata.update(benchmark_func._benchmark_metadata) eval_result = EvaluationResult( model_name=model_name, test_name=test_func.__name__, result=result, duration=end_time - start_time, - dataset_info=dataset.metadata, - metadata=combined_metadata, + dataset_info=convert_metadata_to_info(dataset.metadata) + if dataset.metadata + else None, + metadata=cast(EvaluationMetadata, combined_metadata), ) results.append(eval_result) @@ -123,15 +143,19 @@ async def _run_evaluation( combined_metadata = decorator_kwargs.copy() if hasattr(wrapper_func, "_benchmark_metadata"): - combined_metadata.update(wrapper_func._benchmark_metadata) + # Type narrowing: wrapper_func has _benchmark_metadata after hasattr check + benchmark_func = cast(CallableWithBenchmarkMetadata, wrapper_func) + 
combined_metadata.update(benchmark_func._benchmark_metadata) eval_result = EvaluationResult( model_name=model_name, test_name=test_func.__name__, error=str(e), duration=0, - dataset_info=dataset.metadata, - metadata=combined_metadata, + dataset_info=convert_metadata_to_info(dataset.metadata) + if dataset.metadata + else None, + metadata=cast(EvaluationMetadata, combined_metadata), ) results.append(eval_result) @@ -141,7 +165,14 @@ async def _run_evaluation( if should_upload and results: try: logger.debug("Uploading results to Benchwise API") - await upload_results(results, test_func.__name__, dataset.metadata or {}) + dataset_info_for_upload: DatasetInfo = ( + convert_metadata_to_info(dataset.metadata) + if dataset.metadata + else cast( + DatasetInfo, {"size": dataset.size, "task": "general", "tags": []} + ) + ) + await upload_results(results, test_func.__name__, dataset_info_for_upload) logger.info("Results uploaded successfully") except Exception as e: logger.warning(f"Upload failed (results saved locally): {e}") @@ -164,7 +195,9 @@ async def medical_qa_test(model, dataset): """ def decorator(test_func: Callable[P, R]) -> Callable[P, R]: - test_func._benchmark_metadata = { # type: ignore[attr-defined] + # Add benchmark metadata to the function + benchmark_func = cast(CallableWithBenchmarkMetadata, test_func) + benchmark_func._benchmark_metadata = { "name": name, "description": description, **kwargs, @@ -227,9 +260,9 @@ async def wrapper( class EvaluationRunner: """Main class for running evaluations.""" - def __init__(self, config: Optional[Dict[str, Any]] = None) -> None: - self.config: Dict[str, Any] = config or {} - self.results_cache: Dict[str, Any] = {} + def __init__(self, config: Optional[RunnerConfig] = None) -> None: + self.config: RunnerConfig = config or cast(RunnerConfig, {}) + self.results_cache: Dict[str, EvaluationResultDict] = {} self.logger = logging.getLogger("benchwise.runner") async def run_evaluation( @@ -255,13 +288,15 @@ async def 
run_evaluation( def compare_models( self, results: List[EvaluationResult], metric_name: Optional[str] = None - ) -> Dict[str, Any]: + ) -> ModelComparisonResult: """Compare model performance.""" successful_results = [r for r in results if r.success] if not successful_results: self.logger.warning("No successful results to compare") - return {"error": "No successful results to compare"} + return cast( + ModelComparisonResult, {"error": "No successful results to compare"} + ) model_scores = [] for r in successful_results: @@ -286,19 +321,28 @@ def compare_models( model_scores.append((r.model_name, score if score is not None else 0)) if not model_scores: - return {"error": "No comparable scores found"} + return cast(ModelComparisonResult, {"error": "No comparable scores found"}) model_scores.sort(key=lambda x: x[1], reverse=True) - comparison = { - "models": [r.model_name for r in successful_results], - "scores": [score for _, score in model_scores], - "best_model": model_scores[0][0], - "worst_model": model_scores[-1][0], - "ranking": [ - {"model": name, "score": score} for name, score in model_scores - ], - } + comparison = cast( + ModelComparisonResult, + { + "ranking": [ + {"model": name, "score": float(score)} + for name, score in model_scores + ], + "best_model": model_scores[0][0], + "best_score": float(model_scores[0][1]), + "worst_model": model_scores[-1][0], + "worst_score": float(model_scores[-1][1]), + "mean_score": float( + sum(score for _, score in model_scores) / len(model_scores) + ), + "std_score": 0.0, # Could calculate if needed + "total_models": len(model_scores), + }, + ) self.logger.info( f"Comparison complete: Best model is {comparison['best_model']}" diff --git a/benchwise/datasets.py b/benchwise/datasets.py index a4a60da..3459b97 100644 --- a/benchwise/datasets.py +++ b/benchwise/datasets.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Any, Optional, Union, Callable +from typing import List, Dict, Any, Optional, Union, Callable, cast import 
json import pandas as pd from pathlib import Path @@ -6,6 +6,101 @@ from dataclasses import dataclass import hashlib +from .types import ( + DatasetItem, + DatasetMetadata, + DatasetSchema, + DatasetDict, + DatasetInfo, +) + + +def _validate_dataset_item(item: Any) -> DatasetItem: + """ + Validate and convert a dictionary to DatasetItem. + + Args: + item: Dictionary or any value to validate + + Returns: + Validated DatasetItem + + Raises: + ValueError: If item is not a dictionary + """ + if not isinstance(item, dict): + raise ValueError(f"Expected dict for DatasetItem, got {type(item).__name__}") + return cast(DatasetItem, item) + + +def _validate_dataset_items(items: Any) -> List[DatasetItem]: + """ + Validate and convert a list of dictionaries to List[DatasetItem]. + + Args: + items: List of dictionaries or any value to validate + + Returns: + Validated List[DatasetItem] + + Raises: + ValueError: If items is not a list or contains non-dict items + """ + if not isinstance(items, list): + raise ValueError(f"Expected list for dataset data, got {type(items).__name__}") + + validated_items: List[DatasetItem] = [] + for i, item in enumerate(items): + if not isinstance(item, dict): + raise ValueError( + f"Expected dict for dataset item at index {i}, got {type(item).__name__}" + ) + validated_items.append(cast(DatasetItem, item)) + + return validated_items + + +def _validate_dataset_metadata(metadata: Any) -> Optional[DatasetMetadata]: + """ + Validate and convert metadata to DatasetMetadata. + + Args: + metadata: Dictionary or None to validate + + Returns: + Validated DatasetMetadata or None + """ + if metadata is None: + return None + + if not isinstance(metadata, dict): + raise ValueError( + f"Expected dict or None for DatasetMetadata, got {type(metadata).__name__}" + ) + + return cast(DatasetMetadata, metadata) + + +def _validate_dataset_schema(schema: Any) -> Optional[DatasetSchema]: + """ + Validate and convert schema to DatasetSchema. 
+ + Args: + schema: Dictionary or None to validate + + Returns: + Validated DatasetSchema or None + """ + if schema is None: + return None + + if not isinstance(schema, dict): + raise ValueError( + f"Expected dict or None for DatasetSchema, got {type(schema).__name__}" + ) + + return cast(DatasetSchema, schema) + @dataclass class Dataset: @@ -20,20 +115,23 @@ class Dataset: """ name: str - data: List[Dict[str, Any]] - metadata: Optional[Dict[str, Any]] = None - schema: Optional[Dict[str, Any]] = None + data: List[DatasetItem] + metadata: Optional[DatasetMetadata] = None + schema: Optional[DatasetSchema] = None def __post_init__(self) -> None: if self.metadata is None: - self.metadata = {} + self.metadata = cast(DatasetMetadata, {}) if not self.metadata: - self.metadata = { - "size": len(self.data), - "created_at": pd.Timestamp.now().isoformat(), - "hash": self._compute_hash(), - } + self.metadata = cast( + DatasetMetadata, + { + "size": len(self.data), + "created_at": pd.Timestamp.now().isoformat(), + "hash": self._compute_hash(), + }, + ) def _compute_hash(self) -> str: """Compute hash of dataset for versioning.""" @@ -78,14 +176,17 @@ def references(self) -> List[str]: references.append(str(ref)) return references - def filter(self, condition: Callable[[Dict[str, Any]], bool]) -> "Dataset": + def filter(self, condition: Callable[[DatasetItem], bool]) -> "Dataset": """Filter dataset items based on condition.""" filtered_data = [item for item in self.data if condition(item)] - metadata = self.metadata or {} + metadata = self.metadata or cast(DatasetMetadata, {}) return Dataset( name=f"{self.name}_filtered", data=filtered_data, - metadata={**metadata, "filtered": True, "original_size": self.size}, + metadata=cast( + DatasetMetadata, + {**metadata, "filtered": True, "original_size": self.size}, + ), ) def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset": @@ -95,12 +196,16 @@ def sample(self, n: int, random_state: Optional[int] = None) -> 
"Dataset": if random_state: random.seed(random_state) - sampled_data = random.sample(self.data, min(n, len(self.data))) - metadata = self.metadata or {} + sampled_data: List[DatasetItem] = random.sample( + self.data, min(n, len(self.data)) + ) + metadata = self.metadata or cast(DatasetMetadata, {}) return Dataset( name=f"{self.name}_sample_{n}", data=sampled_data, - metadata={**metadata, "sampled": True, "sample_size": n}, + metadata=cast( + DatasetMetadata, {**metadata, "sampled": True, "sample_size": n} + ), ) def split( @@ -112,43 +217,52 @@ def split( if random_state: random.seed(random_state) - shuffled_data = self.data.copy() + shuffled_data: List[DatasetItem] = self.data.copy() random.shuffle(shuffled_data) split_idx = int(len(shuffled_data) * train_ratio) - train_data = shuffled_data[:split_idx] - test_data = shuffled_data[split_idx:] + train_data: List[DatasetItem] = shuffled_data[:split_idx] + test_data: List[DatasetItem] = shuffled_data[split_idx:] train_dataset = Dataset( name=f"{self.name}_train", data=train_data, - metadata={ - **(self.metadata or {}), - "split": "train", - "train_ratio": train_ratio, - }, + metadata=cast( + DatasetMetadata, + { + **(self.metadata or cast(DatasetMetadata, {})), + "split": "train", + "train_ratio": train_ratio, + }, + ), ) test_dataset = Dataset( name=f"{self.name}_test", data=test_data, - metadata={ - **(self.metadata or {}), - "split": "test", - "train_ratio": train_ratio, - }, + metadata=cast( + DatasetMetadata, + { + **(self.metadata or cast(DatasetMetadata, {})), + "split": "test", + "test_ratio": 1 - train_ratio, + }, + ), ) return train_dataset, test_dataset - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> DatasetDict: """Convert dataset to dictionary format.""" - return { - "name": self.name, - "data": self.data, - "metadata": self.metadata, - "schema": self.schema, - } + return cast( + DatasetDict, + { + "name": self.name, + "data": self.data, + "metadata": self.metadata, + "schema": self.schema, 
+ }, + ) def to_json(self, file_path: Optional[str] = None) -> str: """Export dataset to JSON format.""" @@ -170,7 +284,14 @@ def validate_schema(self) -> bool: if not self.schema: return True - required_fields = self.schema.get("required", []) + # Support both "required" and "required_fields" for backward compatibility + # Check if "required" key exists first, then fall back to "required_fields" + if "required" in self.schema: + required_fields = self.schema["required"] + elif "required_fields" in self.schema: + required_fields = self.schema["required_fields"] + else: + required_fields = [] for item in self.data: for field in required_fields: @@ -210,7 +331,7 @@ def get_statistics(self) -> Dict[str, Any]: return stats -def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dataset: +def load_dataset(source: Union[str, Path, DatasetDict], **kwargs: Any) -> Dataset: """ Load dataset from various sources. @@ -223,11 +344,19 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat """ if isinstance(source, dict): + # Type narrowing: after isinstance check, treat as DatasetDict + # Note: .get() on TypedDict with total=False returns Any for optional keys, + # but we know the structure from DatasetDict, so we use proper type annotations + dataset_dict: DatasetDict = source + data: List[DatasetItem] = dataset_dict.get("data", []) + metadata: Optional[DatasetMetadata] = dataset_dict.get("metadata") + schema: Optional[DatasetSchema] = dataset_dict.get("schema") + return Dataset( name=kwargs.get("name", "custom_dataset"), - data=source.get("data", []), - metadata=source.get("metadata", {}), - schema=source.get("schema"), + data=_validate_dataset_items(data), + metadata=_validate_dataset_metadata(metadata), + schema=_validate_dataset_schema(schema), ) elif isinstance(source, (str, Path)): @@ -235,20 +364,24 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat if source_path.suffix == ".json": with 
open(source_path, "r") as f: - data = json.load(f) + json_data = json.load(f) - if isinstance(data, dict) and "data" in data: + if isinstance(json_data, dict) and "data" in json_data: return Dataset( - name=data.get("name", source_path.stem), - data=data["data"], - metadata=data.get("metadata", {}), - schema=data.get("schema"), + name=json_data.get("name", source_path.stem) + if isinstance(json_data.get("name"), str) + else source_path.stem, + data=_validate_dataset_items(json_data["data"]), + metadata=_validate_dataset_metadata(json_data.get("metadata")), + schema=_validate_dataset_schema(json_data.get("schema")), ) - elif isinstance(data, list): + elif isinstance(json_data, list): return Dataset( - name=kwargs.get("name", source_path.stem), - data=data, - metadata=kwargs.get("metadata", {}), + name=kwargs.get("name", source_path.stem) + if isinstance(kwargs.get("name"), str) + else source_path.stem, + data=_validate_dataset_items(json_data), + metadata=_validate_dataset_metadata(kwargs.get("metadata", {})), ) else: raise ValueError( @@ -258,12 +391,19 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat elif source_path.suffix == ".csv": df = pd.read_csv(source_path) # Type cast: pandas to_dict returns dict[Hashable, Any] but we need dict[str, Any] - data = [dict(record) for record in df.to_dict("records")] + records: List[Dict[str, Any]] = [ + cast(Dict[str, Any], dict(record)) for record in df.to_dict("records") + ] + csv_data: List[DatasetItem] = [ + cast(DatasetItem, record) for record in records + ] return Dataset( - name=kwargs.get("name", source_path.stem), - data=data, - metadata=kwargs.get("metadata", {}), + name=kwargs.get("name", source_path.stem) + if isinstance(kwargs.get("name"), str) + else source_path.stem, + data=csv_data, + metadata=_validate_dataset_metadata(kwargs.get("metadata")), ) elif str(source).startswith(("http://", "https://")): @@ -273,19 +413,23 @@ def load_dataset(source: Union[str, Path, Dict[str, 
Any]], **kwargs: Any) -> Dat response.raise_for_status() if source_str.endswith(".json"): - data = response.json() - if isinstance(data, dict) and "data" in data: + json_data = response.json() + if isinstance(json_data, dict) and "data" in json_data: return Dataset( - name=data.get("name", "remote_dataset"), - data=data["data"], - metadata=data.get("metadata", {}), - schema=data.get("schema"), + name=json_data.get("name", "remote_dataset") + if isinstance(json_data.get("name"), str) + else "remote_dataset", + data=_validate_dataset_items(json_data["data"]), + metadata=_validate_dataset_metadata(json_data.get("metadata")), + schema=_validate_dataset_schema(json_data.get("schema")), ) - elif isinstance(data, list): + elif isinstance(json_data, list): return Dataset( - name=kwargs.get("name", "remote_dataset"), - data=data, - metadata=kwargs.get("metadata", {}), + name=kwargs.get("name", "remote_dataset") + if isinstance(kwargs.get("name"), str) + else "remote_dataset", + data=_validate_dataset_items(json_data), + metadata=_validate_dataset_metadata(kwargs.get("metadata", {})), ) else: raise ValueError( @@ -322,23 +466,32 @@ def create_qa_dataset( if len(questions) != len(answers): raise ValueError("Questions and answers must have the same length") - data = [{"question": q, "answer": a} for q, a in zip(questions, answers)] + data: List[DatasetItem] = [ + cast(DatasetItem, {"question": q, "answer": a}) + for q, a in zip(questions, answers) + ] return Dataset( - name=kwargs.get("name", "qa_dataset"), + name=kwargs.get("name", "qa_dataset") + if isinstance(kwargs.get("name"), str) + else "qa_dataset", data=data, - metadata={ - "task": "question_answering", - "size": len(data), - **kwargs.get("metadata", {}), - }, - schema={ - "required": ["question", "answer"], - "properties": { - "question": {"type": "string"}, - "answer": {"type": "string"}, + metadata=cast( + DatasetMetadata, + { + "task": "question_answering", + "size": len(data), + **kwargs.get("metadata", {}), }, - 
}, + ), + schema=cast( + DatasetSchema, + { + "required": ["question", "answer"], + "prompt_field": "question", + "reference_field": "answer", + }, + ), ) @@ -360,25 +513,32 @@ def create_summarization_dataset( if len(documents) != len(summaries): raise ValueError("Documents and summaries must have the same length") - data = [ - {"document": doc, "summary": summ} for doc, summ in zip(documents, summaries) + data: List[DatasetItem] = [ + cast(DatasetItem, {"document": doc, "summary": summ}) + for doc, summ in zip(documents, summaries) ] return Dataset( - name=kwargs.get("name", "summarization_dataset"), + name=kwargs.get("name", "summarization_dataset") + if isinstance(kwargs.get("name"), str) + else "summarization_dataset", data=data, - metadata={ - "task": "summarization", - "size": len(data), - **kwargs.get("metadata", {}), - }, - schema={ - "required": ["document", "summary"], - "properties": { - "document": {"type": "string"}, - "summary": {"type": "string"}, + metadata=cast( + DatasetMetadata, + { + "task": "summarization", + "size": len(data), + **kwargs.get("metadata", {}), + }, + ), + schema=cast( + DatasetSchema, + { + "required": ["document", "summary"], + "prompt_field": "document", + "reference_field": "summary", }, - }, + ), ) @@ -400,21 +560,33 @@ def create_classification_dataset( if len(texts) != len(labels): raise ValueError("Texts and labels must have the same length") - data = [{"text": text, "label": label} for text, label in zip(texts, labels)] + data: List[DatasetItem] = [ + cast(DatasetItem, {"text": text, "label": label}) + for text, label in zip(texts, labels) + ] return Dataset( - name=kwargs.get("name", "classification_dataset"), + name=kwargs.get("name", "classification_dataset") + if isinstance(kwargs.get("name"), str) + else "classification_dataset", data=data, - metadata={ - "task": "classification", - "size": len(data), - "unique_labels": list(set(labels)), - **kwargs.get("metadata", {}), - }, - schema={ - "required": ["text", 
"label"], - "properties": {"text": {"type": "string"}, "label": {"type": "string"}}, - }, + metadata=cast( + DatasetMetadata, + { + "task": "classification", + "size": len(data), + "unique_labels": list(set(labels)), + **kwargs.get("metadata", {}), + }, + ), + schema=cast( + DatasetSchema, + { + "required": ["text", "label"], + "prompt_field": "text", + "reference_field": "label", + }, + ), ) @@ -446,73 +618,144 @@ def clear(self) -> None: def load_mmlu_sample() -> Dataset: - sample_data = [ - { - "question": "What is the capital of France?", - "choices": ["London", "Berlin", "Paris", "Madrid"], - "answer": "Paris", - "subject": "geography", - }, - { - "question": "What is 2 + 2?", - "choices": ["3", "4", "5", "6"], - "answer": "4", - "subject": "mathematics", - }, + sample_data: List[DatasetItem] = [ + cast( + DatasetItem, + { + "question": "What is the capital of France?", + "choices": ["London", "Berlin", "Paris", "Madrid"], + "answer": "Paris", + "subject": "geography", + }, + ), + cast( + DatasetItem, + { + "question": "What is 2 + 2?", + "choices": ["3", "4", "5", "6"], + "answer": "4", + "subject": "mathematics", + }, + ), ] return Dataset( name="mmlu_sample", data=sample_data, - metadata={ - "task": "multiple_choice_qa", - "source": "MMLU", - "description": "Sample from Massive Multitask Language Understanding", - }, + metadata=cast( + DatasetMetadata, + { + "task": "multiple_choice_qa", + "source": "MMLU", + "description": "Sample from Massive Multitask Language Understanding", + }, + ), ) def load_hellaswag_sample() -> Dataset: """Load a sample of HellaSwag dataset.""" - sample_data = [ - { - "context": "A woman is outside with a bucket and a dog. The dog is running around trying to avoid a bath. 
She", - "endings": [ - "rinses the bucket off with soap and blow dry the dog.", - "uses a hose to keep the dog from getting soapy.", - "gets the dog wet, then it runs away again.", - "gets into the bath tub with the dog.", - ], - "label": 2, - } + sample_data: List[DatasetItem] = [ + cast( + DatasetItem, + { + "context": "A woman is outside with a bucket and a dog. The dog is running around trying to avoid a bath. She", + "endings": [ + "rinses the bucket off with soap and blow dry the dog.", + "uses a hose to keep the dog from getting soapy.", + "gets the dog wet, then it runs away again.", + "gets into the bath tub with the dog.", + ], + "label": 2, + }, + ) ] return Dataset( name="hellaswag_sample", data=sample_data, - metadata={ - "task": "sentence_completion", - "source": "HellaSwag", - "description": "Commonsense reasoning benchmark", - }, + metadata=cast( + DatasetMetadata, + { + "task": "sentence_completion", + "source": "HellaSwag", + "description": "Commonsense reasoning benchmark", + }, + ), ) def load_gsm8k_sample() -> Dataset: """Load a sample of GSM8K (Grade School Math 8K) dataset.""" - sample_data = [ - { - "question": "Janet's ducks lay 16 eggs per day. She eats 3 for breakfast every morning and bakes 4 into muffins for her friends every day. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much money does she make every day at the farmers' market?", - "answer": "Janet sells 16 - 3 - 4 = 9 duck eggs every day. She makes 9 * $2 = $18 every day at the farmers' market.", - } + sample_data: List[DatasetItem] = [ + cast( + DatasetItem, + { + "question": "Janet's ducks lay 16 eggs per day. She eats 3 for breakfast every morning and bakes 4 into muffins for her friends every day. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much money does she make every day at the farmers' market?", + "answer": "Janet sells 16 - 3 - 4 = 9 duck eggs every day. 
She makes 9 * $2 = $18 every day at the farmers' market.", + }, + ) ] return Dataset( name="gsm8k_sample", data=sample_data, - metadata={ - "task": "math_word_problems", - "source": "GSM8K", - "description": "Grade school math word problems", - }, + metadata=cast( + DatasetMetadata, + { + "task": "math_word_problems", + "source": "GSM8K", + "description": "Grade school math word problems", + }, + ), ) + + +def convert_metadata_to_info(metadata: DatasetMetadata) -> DatasetInfo: + """ + Convert DatasetMetadata to DatasetInfo for evaluation results. + + This function properly converts dataset metadata (which is stored with the dataset) + to dataset info (which is used in evaluation results). It handles missing fields + and ensures type safety. + + Args: + metadata: Dataset metadata to convert + + Returns: + DatasetInfo with converted fields + """ + # Extract fields that exist in DatasetMetadata + size: int = metadata.get("size", 0) + tags: List[str] = metadata.get("tags", []) + source: Optional[str] = metadata.get("source") + name: Optional[str] = metadata.get("name") + description: Optional[str] = metadata.get("description") + version: Optional[str] = metadata.get("version") + created_at: Optional[str] = metadata.get("created_at") + + # Extract fields that might exist but aren't in DatasetMetadata TypedDict + # These could be present at runtime even if not in the type definition + metadata_dict: Dict[str, Any] = cast(Dict[str, Any], metadata) + hash_value: Optional[str] = metadata_dict.get("hash") + task: Optional[str] = metadata_dict.get("task") + difficulty: Optional[str] = metadata_dict.get("difficulty") + + # Build DatasetInfo with proper types + info: DatasetInfo = { + "size": size, + "tags": tags, + "source": source, + "name": name, + "description": description, + "version": version, + "created_at": created_at, + "hash": hash_value, + "task": task if task else "general", + } + + # Add difficulty if available + if difficulty: + info["difficulty"] = difficulty + 
+ return info diff --git a/benchwise/results.py b/benchwise/results.py index 54768ff..e0c7044 100644 --- a/benchwise/results.py +++ b/benchwise/results.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, List, Optional, Union, cast from dataclasses import dataclass, field from datetime import datetime import json @@ -7,6 +7,19 @@ import numpy as np import hashlib +from .types import ( + DatasetInfo, + EvaluationMetadata, + EvaluationResultDict, + BenchmarkSummary, + BenchmarkResultDict, + ModelComparisonResult, + CrossBenchmarkComparison, + ModelPerformanceAnalysis, + CachedResultInfo, + BenchmarkComparisonInfo, +) + @dataclass class EvaluationResult: @@ -28,9 +41,11 @@ class EvaluationResult: test_name: str result: Any = None duration: float = 0.0 - dataset_info: Optional[Dict[str, Any]] = None + dataset_info: Optional[DatasetInfo] = None error: Optional[str] = None - metadata: Dict[str, Any] = field(default_factory=dict) + metadata: EvaluationMetadata = field( + default_factory=lambda: cast(EvaluationMetadata, {}) + ) timestamp: datetime = field(default_factory=datetime.now) @property @@ -43,19 +58,22 @@ def failed(self) -> bool: """Whether the evaluation failed.""" return self.error is not None - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> EvaluationResultDict: """Convert result to dictionary format.""" - return { - "model_name": self.model_name, - "test_name": self.test_name, - "result": self.result, - "duration": self.duration, - "dataset_info": self.dataset_info, - "error": self.error, - "metadata": self.metadata, - "timestamp": self.timestamp.isoformat(), - "success": self.success, - } + return cast( + EvaluationResultDict, + { + "model_name": self.model_name, + "test_name": self.test_name, + "result": self.result, + "duration": self.duration, + "dataset_info": self.dataset_info, + "error": self.error, + "metadata": self.metadata, + "timestamp": self.timestamp.isoformat(), + "success": 
self.success, + }, + ) def get_score(self, metric_name: Optional[str] = None) -> Union[float, Any]: """ @@ -90,7 +108,9 @@ class BenchmarkResult: benchmark_name: str results: List[EvaluationResult] = field(default_factory=list) - metadata: Dict[str, Any] = field(default_factory=dict) + metadata: EvaluationMetadata = field( + default_factory=lambda: cast(EvaluationMetadata, {}) + ) timestamp: datetime = field(default_factory=datetime.now) def add_result(self, result: EvaluationResult) -> None: @@ -157,7 +177,9 @@ def get_worst_model( successful_results, key=lambda r: r.get_score(metric_name) or float("inf") ) - def compare_models(self, metric_name: Optional[str] = None) -> Dict[str, Any]: + def compare_models( + self, metric_name: Optional[str] = None + ) -> ModelComparisonResult: """ Compare all models in the benchmark. @@ -169,7 +191,9 @@ def compare_models(self, metric_name: Optional[str] = None) -> Dict[str, Any]: """ successful_results = self.successful_results if not successful_results: - return {"error": "No successful results to compare"} + return cast( + ModelComparisonResult, {"error": "No successful results to compare"} + ) scores = [result.get_score(metric_name) for result in successful_results] model_names = [result.model_name for result in successful_results] @@ -182,22 +206,25 @@ def compare_models(self, metric_name: Optional[str] = None) -> Dict[str, Any]: ] if not valid_scores: - return {"error": "No valid scores found"} + return cast(ModelComparisonResult, {"error": "No valid scores found"}) sorted_results = sorted(valid_scores, key=lambda x: x[1], reverse=True) - return { - "ranking": [ - {"model": name, "score": score} for name, score in sorted_results - ], - "best_model": sorted_results[0][0], - "best_score": sorted_results[0][1], - "worst_model": sorted_results[-1][0], - "worst_score": sorted_results[-1][1], - "mean_score": np.mean([score for _, score in valid_scores]), - "std_score": np.std([score for _, score in valid_scores]), - 
"total_models": len(valid_scores), - } + return cast( + ModelComparisonResult, + { + "ranking": [ + {"model": name, "score": score} for name, score in sorted_results + ], + "best_model": sorted_results[0][0], + "best_score": sorted_results[0][1], + "worst_model": sorted_results[-1][0], + "worst_score": sorted_results[-1][1], + "mean_score": float(np.mean([score for _, score in valid_scores])), + "std_score": float(np.std([score for _, score in valid_scores])), + "total_models": len(valid_scores), + }, + ) def get_model_result(self, model_name: str) -> Optional[EvaluationResult]: """Get result for a specific model.""" @@ -206,20 +233,24 @@ def get_model_result(self, model_name: str) -> Optional[EvaluationResult]: return result return None - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> BenchmarkResultDict: """Convert benchmark result to dictionary format.""" - return { - "benchmark_name": self.benchmark_name, - "results": [result.to_dict() for result in self.results], - "metadata": self.metadata, - "timestamp": self.timestamp.isoformat(), - "summary": { - "total_models": len(self.results), - "successful_models": len(self.successful_results), - "failed_models": len(self.failed_results), - "success_rate": self.success_rate, - }, + summary: BenchmarkSummary = { + "total_models": len(self.results), + "successful_models": len(self.successful_results), + "failed_models": len(self.failed_results), + "success_rate": self.success_rate, } + return cast( + BenchmarkResultDict, + { + "benchmark_name": self.benchmark_name, + "results": [result.to_dict() for result in self.results], + "metadata": self.metadata, + "timestamp": self.timestamp.isoformat(), + "summary": summary, + }, + ) def to_dataframe(self) -> pd.DataFrame: """Convert results to pandas DataFrame for analysis.""" @@ -262,7 +293,7 @@ class ResultsAnalyzer: @staticmethod def compare_benchmarks( benchmark_results: List[BenchmarkResult], metric_name: Optional[str] = None - ) -> Dict[str, Any]: + ) -> 
CrossBenchmarkComparison: """ Compare results across multiple benchmarks. @@ -273,11 +304,12 @@ def compare_benchmarks( Returns: Dictionary with cross-benchmark comparison """ - comparison: Dict[str, Any] = { + comparison: CrossBenchmarkComparison = { "benchmarks": [], - "models": set(), + "models": [], "cross_benchmark_scores": {}, } + models_set: set[str] = set() for benchmark in benchmark_results: benchmark_info = { @@ -287,8 +319,10 @@ def compare_benchmarks( "success_rate": benchmark.success_rate, } - comparison["benchmarks"].append(benchmark_info) - comparison["models"].update(benchmark.model_names) + comparison["benchmarks"].append( + cast(BenchmarkComparisonInfo, benchmark_info) + ) + models_set.update(benchmark.model_names) # Collect scores for each model for result in benchmark.successful_results: @@ -302,14 +336,14 @@ def compare_benchmarks( benchmark.benchmark_name ] = score - comparison["models"] = list(comparison["models"]) + comparison["models"] = list(models_set) return comparison @staticmethod def analyze_model_performance( results: List[EvaluationResult], metric_name: Optional[str] = None - ) -> Dict[str, Any]: + ) -> ModelPerformanceAnalysis: """ Analyze performance of a single model across multiple evaluations. 
@@ -321,32 +355,38 @@ def analyze_model_performance( Dictionary with performance analysis """ if not results: - return {"error": "No results provided"} + return cast(ModelPerformanceAnalysis, {"error": "No results provided"}) model_name = results[0].model_name successful_results = [r for r in results if r.success] if not successful_results: - return {"error": "No successful results found"} + return cast( + ModelPerformanceAnalysis, {"error": "No successful results found"} + ) scores = [result.get_score(metric_name) for result in successful_results] valid_scores = [score for score in scores if score is not None] if not valid_scores: - return {"error": "No valid scores found"} - - return { - "model_name": model_name, - "total_evaluations": len(results), - "successful_evaluations": len(successful_results), - "success_rate": len(successful_results) / len(results), - "mean_score": np.mean(valid_scores), - "median_score": np.median(valid_scores), - "std_score": np.std(valid_scores), - "min_score": np.min(valid_scores), - "max_score": np.max(valid_scores), - "score_range": np.max(valid_scores) - np.min(valid_scores), - } + return cast(ModelPerformanceAnalysis, {"error": "No valid scores found"}) + + return cast( + ModelPerformanceAnalysis, + { + "model_name": model_name, + "total_evaluations": len(results), + "successful_evaluations": len(successful_results), + "failed_evaluations": len(results) - len(successful_results), + "success_rate": len(successful_results) / len(results), + "mean_score": float(np.mean(valid_scores)), + "median_score": float(np.median(valid_scores)), + "std_score": float(np.std(valid_scores)), + "min_score": float(np.min(valid_scores)), + "max_score": float(np.max(valid_scores)), + "scores": valid_scores, + }, + ) @staticmethod def generate_report( @@ -529,7 +569,7 @@ def clear_cache(self) -> None: for cache_file in self.cache_dir.glob("*.json"): cache_file.unlink() - def list_cached_results(self) -> List[Dict[str, Any]]: + def 
list_cached_results(self) -> List[CachedResultInfo]: """List all cached results.""" results = [] for cache_file in self.cache_dir.glob("*.json"): @@ -537,12 +577,15 @@ def list_cached_results(self) -> List[Dict[str, Any]]: with open(cache_file, "r") as f: data = json.load(f) results.append( - { - "file": cache_file.name, - "model_name": data.get("model_name"), - "test_name": data.get("test_name"), - "timestamp": data.get("timestamp"), - } + cast( + CachedResultInfo, + { + "file": cache_file.name, + "model_name": data.get("model_name"), + "test_name": data.get("test_name"), + "timestamp": data.get("timestamp"), + }, + ) ) except Exception: continue diff --git a/benchwise/types.py b/benchwise/types.py index 38cf447..fd24e5e 100644 --- a/benchwise/types.py +++ b/benchwise/types.py @@ -298,7 +298,7 @@ class DatasetItem(TypedDict, total=False): summary: str # Additional fields id: str - metadata: Dict[str, Any] + metadata: "EvaluationMetadata" class DatasetMetadata(TypedDict, total=False): @@ -318,10 +318,43 @@ class DatasetSchema(TypedDict, total=False): prompt_field: str reference_field: str - required_fields: List[str] + required: List[str] # Required fields in dataset items + required_fields: List[str] # Alias for backward compatibility optional_fields: List[str] +class DatasetInfo(TypedDict, total=False): + """Information about a dataset used in evaluation.""" + + size: int + task: str + tags: List[str] + difficulty: Optional[str] + source: Optional[str] + name: Optional[str] + description: Optional[str] + version: Optional[str] + hash: Optional[str] + created_at: Optional[str] + + +class DatasetStatistics(TypedDict, total=False): + """Statistics about a dataset.""" + + size: int + fields: List[str] + metadata: Optional[DatasetMetadata] + + +class DatasetDict(TypedDict, total=False): + """Dictionary representation of a dataset.""" + + name: str + data: List[DatasetItem] + metadata: Optional[DatasetMetadata] + schema: Optional[DatasetSchema] + + # Configuration 
Types class ConfigDict(TypedDict, total=False): """Configuration dictionary for BenchWise.""" @@ -342,33 +375,81 @@ class ConfigDict(TypedDict, total=False): # Results Types +class EvaluationMetadata(TypedDict, total=False): + """Metadata for an evaluation result.""" + + temperature: float + max_tokens: int + model_version: str + dataset_hash: str + evaluation_id: Optional[int] + benchmark_id: Optional[int] + dataset: DatasetInfo # Dataset information for the evaluation + description: str # Description of the evaluation/benchmark + dataset_path: str # Path to the dataset file used in evaluation + models: List[str] # List of models evaluated + metrics: List[str] # List of metrics used in evaluation + # Allow additional metadata fields + # Note: This is intentionally flexible for user-defined metadata + + class EvaluationResultDict(TypedDict, total=False): """Serialized evaluation result.""" - model: str - prompt: str - response: str - score: float - scores: Dict[str, float] - metadata: Dict[str, Any] + model_name: str + test_name: str + result: Any + duration: float + dataset_info: Optional[DatasetInfo] + error: Optional[str] + metadata: EvaluationMetadata timestamp: str success: bool - error: Optional[str] + + +class BenchmarkSummary(TypedDict): + """Summary statistics for a benchmark.""" + + total_models: int + successful_models: int + failed_models: int + success_rate: float class BenchmarkResultDict(TypedDict, total=False): """Serialized benchmark result.""" benchmark_name: str - benchmark_description: str results: List[EvaluationResultDict] - summary: Dict[str, Any] + metadata: EvaluationMetadata timestamp: str + summary: BenchmarkSummary -class ComparisonResult(TypedDict): +class ModelRanking(TypedDict): + """Ranking entry for a model.""" + + model: str + score: float + + +class ModelComparisonResult(TypedDict, total=False): """Result of model comparison.""" + ranking: List[ModelRanking] + best_model: str + best_score: float + worst_model: str + worst_score: 
float + mean_score: float + std_score: float + total_models: int + error: Optional[str] + + +class ComparisonResult(TypedDict): + """Result of model comparison (legacy format).""" + best_model: str best_score: float rankings: List[Tuple[str, float]] @@ -376,11 +457,76 @@ class ComparisonResult(TypedDict): # API Response Types +class TokenData(TypedDict, total=False): + """JWT token data from login.""" + + access_token: str + token_type: str + expires_in: Optional[int] + refresh_token: Optional[str] + + class LoginResponse(TypedDict): """Response from login endpoint.""" - token: Dict[str, str] - user: Dict[str, Any] + token: TokenData + user: "UserInfo" # Forward reference + + +class ModelInfo(TypedDict, total=False): + """Model information from API.""" + + id: int + name: str + provider: str + model_id: str # Provider-specific model identifier + description: Optional[str] + is_active: bool + pricing: Optional[PricingInfo] + metadata: Optional[EvaluationMetadata] + + +class BenchmarkRegistrationData(TypedDict, total=False): + """Data for registering a benchmark with the API.""" + + name: str + description: str + category: str + tags: List[str] + difficulty: Optional[str] + dataset_url: Optional[str] + config: Dict[str, Any] + metadata: DatasetInfo + is_public: bool + + +class BenchmarkInfo(TypedDict, total=False): + """Benchmark information from API.""" + + id: int + name: str + description: Optional[str] + category: Optional[str] + tags: List[str] + difficulty: Optional[str] + dataset_url: Optional[str] + config: Dict[str, Any] # API config can be arbitrary + metadata: Optional[DatasetInfo] + is_public: bool + created_at: Optional[str] + + +class EvaluationInfo(TypedDict, total=False): + """Evaluation information from API.""" + + id: int + benchmark_id: int + model_id: int + test_name: str + status: str + results: Optional[Dict[str, Any]] # Results can be arbitrary + metadata: Optional[EvaluationMetadata] + created_at: Optional[str] class UserInfo(TypedDict, 
total=False): @@ -393,8 +539,8 @@ class UserInfo(TypedDict, total=False): is_active: bool -class UploadResultsResponse(TypedDict): - """Response from upload results endpoint.""" +class UploadBenchmarkResponse(TypedDict): + """Response from upload benchmark result endpoint.""" id: int benchmark_id: int @@ -403,6 +549,12 @@ class UploadResultsResponse(TypedDict): message: str +class FileUploadResponse(TypedDict, total=False): + """Response from file upload endpoint.""" + + file_info: Dict[str, str] # Contains url and other file metadata + + # Protocols class SupportsGenerate(Protocol): """Protocol for objects that support text generation.""" @@ -446,6 +598,23 @@ def evaluate( ... +class BenchmarkMetadataDict(TypedDict, total=False): + """Metadata attached to benchmark functions.""" + + name: str + description: str + + +class CallableWithBenchmarkMetadata(Protocol): + """Protocol for callables that may have benchmark metadata attached.""" + + _benchmark_metadata: Dict[str, Any] + + def __call__(self, *args: Any, **kwargs: Any) -> Any: + """Call the function.""" + ... 
+ + class ConfigureArgs(Protocol): """Arguments for configuring Benchwise.""" @@ -480,5 +649,68 @@ class ConfigKwargs(TypedDict, total=False): class OfflineQueueItem(TypedDict): """Item in offline queue.""" - data: Dict[str, Any] + data: Dict[str, Any] # Can contain different operation types + timestamp: str + + +class RunnerConfig(TypedDict, total=False): + """Configuration for EvaluationRunner.""" + + cache_enabled: bool + upload_enabled: bool + timeout: float + max_retries: int + debug: bool + verbose: bool + + +class CacheEntry(TypedDict, total=False): + """Entry in results cache.""" + + result: EvaluationResultDict + dataset_hash: str + timestamp: str + + +class CachedResultInfo(TypedDict, total=False): + """Information about a cached result.""" + + file: str + model_name: Optional[str] + test_name: Optional[str] + timestamp: Optional[str] + dataset_hash: Optional[str] + + +class BenchmarkComparisonInfo(TypedDict, total=False): + """Information about a benchmark in cross-benchmark comparison.""" + + name: str timestamp: str + models: List[str] + success_rate: float + + +class CrossBenchmarkComparison(TypedDict, total=False): + """Result of comparing multiple benchmarks.""" + + benchmarks: List[BenchmarkComparisonInfo] + models: List[str] + cross_benchmark_scores: Dict[str, Dict[str, Optional[float]]] + + +class ModelPerformanceAnalysis(TypedDict, total=False): + """Performance analysis for a single model.""" + + model_name: str + total_evaluations: int + successful_evaluations: int + failed_evaluations: int + success_rate: float + mean_score: float + std_score: float + min_score: float + max_score: float + median_score: float + scores: List[float] + error: Optional[str] diff --git a/demo.py b/demo.py index f7072e6..39342c4 100644 --- a/demo.py +++ b/demo.py @@ -1,5 +1,11 @@ import asyncio -from benchwise import evaluate, benchmark, create_qa_dataset, accuracy, semantic_similarity +from benchwise import ( + evaluate, + benchmark, + create_qa_dataset, + 
accuracy, + semantic_similarity, +) # Create your dataset qa_dataset = create_qa_dataset( @@ -8,18 +14,19 @@ "Who wrote '1984'?", "What is the speed of light?", "Explain photosynthesis in one sentence.", - "What causes rainbows?" + "What causes rainbows?", ], answers=[ "Tokyo", "George Orwell", "299,792,458 meters per second", "Photosynthesis is the process by which plants convert sunlight into energy.", - "Rainbows are caused by light refraction and reflection in water droplets." + "Rainbows are caused by light refraction and reflection in water droplets.", ], - name="general_knowledge_qa" + name="general_knowledge_qa", ) + @benchmark("General Knowledge QA", "Tests basic factual knowledge") @evaluate("gpt-3.5-turbo", "gemini-2.5-flash-lite") async def test_general_knowledge(model, dataset): @@ -31,9 +38,10 @@ async def test_general_knowledge(model, dataset): return { "accuracy": acc["accuracy"], "semantic_similarity": similarity["mean_similarity"], - "total_questions": len(responses) + "total_questions": len(responses), } + # Run the evaluation async def main(): results = await test_general_knowledge(qa_dataset) @@ -47,4 +55,5 @@ async def main(): else: print(f"{result.model_name}: FAILED - {result.error}") -asyncio.run(main()) \ No newline at end of file + +asyncio.run(main()) diff --git a/docs/test_load_dataset.py b/docs/test_load_dataset.py index b0ef5da..6e7b5f5 100644 --- a/docs/test_load_dataset.py +++ b/docs/test_load_dataset.py @@ -5,6 +5,7 @@ # Assuming data.json is in the same directory as this script for testing purposes data_file_path = "data.json" + def test_load_dataset_from_json(): # Load the dataset dataset = load_dataset(data_file_path) @@ -17,6 +18,7 @@ def test_load_dataset_from_json(): print("Successfully loaded dataset and assertions passed!") + if __name__ == "__main__": # Create a dummy data.json file for testing if it doesn't exist if not os.path.exists(data_file_path): From 1f7352bf090b0fd06b03b85b72e4127a201f7197 Mon Sep 17 00:00:00 2001 
From: Anurag Yadav Date: Sat, 13 Dec 2025 17:50:22 +0530 Subject: [PATCH 23/24] fix(datasets): Enhance dataset loading logic to prioritize name from DatasetDict --- benchwise/datasets.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchwise/datasets.py b/benchwise/datasets.py index 3459b97..2a0681e 100644 --- a/benchwise/datasets.py +++ b/benchwise/datasets.py @@ -348,12 +348,19 @@ def load_dataset(source: Union[str, Path, DatasetDict], **kwargs: Any) -> Datase # Note: .get() on TypedDict with total=False returns Any for optional keys, # but we know the structure from DatasetDict, so we use proper type annotations dataset_dict: DatasetDict = source + # Prefer name from DatasetDict if present, otherwise fall back to kwargs + name_from_dict: Optional[str] = dataset_dict.get("name") + name: str = ( + name_from_dict + if isinstance(name_from_dict, str) + else kwargs.get("name", "custom_dataset") + ) data: List[DatasetItem] = dataset_dict.get("data", []) metadata: Optional[DatasetMetadata] = dataset_dict.get("metadata") schema: Optional[DatasetSchema] = dataset_dict.get("schema") return Dataset( - name=kwargs.get("name", "custom_dataset"), + name=name, data=_validate_dataset_items(data), metadata=_validate_dataset_metadata(metadata), schema=_validate_dataset_schema(schema), From 37c697acab3adff8e7b477b86adac48764d61ff5 Mon Sep 17 00:00:00 2001 From: Anurag Yadav Date: Sat, 13 Dec 2025 18:44:18 +0530 Subject: [PATCH 24/24] refactor(cli, client, config, datasets, metrics): Clean up imports and enhance type safety by removing unused imports and organizing dependencies --- benchwise/cli.py | 35 +++++++++++++++++++---------------- benchwise/client.py | 9 +-------- benchwise/config.py | 6 ++---- benchwise/datasets.py | 5 +---- benchwise/metrics.py | 4 +--- 5 files changed, 24 insertions(+), 35 deletions(-) diff --git a/benchwise/cli.py b/benchwise/cli.py index d71bab5..e136dc8 100644 --- a/benchwise/cli.py +++ b/benchwise/cli.py @@ -4,16 +4,30 @@ 
import argparse import asyncio +import os import sys -from typing import List, Optional +from typing import List, Optional, cast from . import __version__ from .datasets import load_dataset, convert_metadata_to_info from .models import get_model_adapter -from .results import save_results, BenchmarkResult, EvaluationResult -from .config import get_api_config, configure_benchwise -from .client import get_client, sync_offline_results -from .types import ConfigureArgs, ConfigKwargs, SyncArgs, StatusArgs, DatasetInfo +from .results import ( + save_results, + BenchmarkResult, + EvaluationResult, + load_results, + ResultsAnalyzer, +) +from .config import get_api_config, configure_benchwise, reset_config +from .client import get_client, sync_offline_results, upload_results +from .types import ( + ConfigureArgs, + ConfigKwargs, + SyncArgs, + StatusArgs, + DatasetInfo, + EvaluationMetadata, +) def create_parser() -> argparse.ArgumentParser: @@ -136,9 +150,6 @@ async def run_evaluation( sys.exit(1) # Create benchmark result - from .types import EvaluationMetadata - from typing import cast - benchmark_result = BenchmarkResult( benchmark_name=f"cli_evaluation_{dataset.name}", metadata=cast( @@ -163,8 +174,6 @@ async def run_evaluation( # Check for API key requirements for cloud models if model_name.startswith(("gpt-", "claude-", "gemini-")): - import os - api_key_map = { "gpt-": "OPENAI_API_KEY", "claude-": "ANTHROPIC_API_KEY", @@ -276,8 +285,6 @@ async def run_evaluation( if should_upload and benchmark_result.results: try: - from .client import upload_results - # Extract dataset_info from dataset metadata for upload_results # upload_results expects DatasetInfo dataset_info_for_upload: DatasetInfo = cast( @@ -311,8 +318,6 @@ async def run_evaluation( async def configure_api(args: ConfigureArgs) -> None: """Configure Benchwise API settings.""" - from .config import reset_config - if args.reset: reset_config() print("โœ“ Configuration reset to defaults") @@ -506,8 +511,6 @@ async 
def compare_results( result_paths: List[str], metric: Optional[str] = None ) -> None: """Compare evaluation results.""" - from .results import load_results, ResultsAnalyzer - try: # Load all results benchmark_results = [] diff --git a/benchwise/client.py b/benchwise/client.py index 6adc153..29f89e1 100644 --- a/benchwise/client.py +++ b/benchwise/client.py @@ -2,6 +2,7 @@ import asyncio import uuid import logging +import os import types from typing import Dict, Any, Optional, List, Type, cast from datetime import datetime @@ -830,8 +831,6 @@ async def sync_offline_queue(self) -> int: if data_type == "full_benchmark_result": # Reconstruct BenchmarkResult and upload - from .results import BenchmarkResult - benchmark_result_dict: Dict[str, Any] = queue_data.get( "benchmark_result", {} ) @@ -886,8 +885,6 @@ async def upload_dataset_for_benchmark( Returns: Dataset URL """ - import os - logger.info(f"Uploading dataset for benchmark {benchmark_id}") try: with open(dataset_path, "rb") as f: @@ -1014,7 +1011,6 @@ async def upload_results( logger.warning( "Benchwise API not available, results will be cached offline" ) - from .results import BenchmarkResult benchmark_result = BenchmarkResult( benchmark_name=test_name, @@ -1035,7 +1031,6 @@ async def upload_results( # Check authentication if not client.jwt_token: logger.warning("Not authenticated - results will be cached offline") - from .results import BenchmarkResult benchmark_result = BenchmarkResult( benchmark_name=test_name, @@ -1054,8 +1049,6 @@ async def upload_results( return False # Create benchmark result and upload - from .results import BenchmarkResult - benchmark_result = BenchmarkResult( benchmark_name=test_name, results=results, diff --git a/benchwise/config.py b/benchwise/config.py index 7f673ad..f08cc12 100644 --- a/benchwise/config.py +++ b/benchwise/config.py @@ -10,6 +10,8 @@ from typing import Optional, Dict, Any, List from dataclasses import dataclass, field import json +import asyncio +import httpx from 
benchwise.types import ConfigDict @@ -408,8 +410,6 @@ def validate_api_connection(config: BenchwiseConfig) -> bool: True if connection is valid """ try: - import asyncio - import httpx async def check_connection() -> bool: async with httpx.AsyncClient(timeout=5.0) as client: @@ -434,8 +434,6 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]: Returns: Dict mapping provider to validity status """ - import os - results = {} if os.getenv("OPENAI_API_KEY"): diff --git a/benchwise/datasets.py b/benchwise/datasets.py index 2a0681e..2a9587b 100644 --- a/benchwise/datasets.py +++ b/benchwise/datasets.py @@ -5,6 +5,7 @@ import requests from dataclasses import dataclass import hashlib +import random from .types import ( DatasetItem, @@ -191,8 +192,6 @@ def filter(self, condition: Callable[[DatasetItem], bool]) -> "Dataset": def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset": """Sample n items from dataset.""" - import random - if random_state: random.seed(random_state) @@ -212,8 +211,6 @@ def split( self, train_ratio: float = 0.8, random_state: Optional[int] = None ) -> tuple["Dataset", "Dataset"]: """Split dataset into train and test sets.""" - import random - if random_state: random.seed(random_state) diff --git a/benchwise/metrics.py b/benchwise/metrics.py index aac4f1c..ebe385d 100644 --- a/benchwise/metrics.py +++ b/benchwise/metrics.py @@ -14,7 +14,7 @@ from rouge_score import rouge_scorer from sacrebleu import BLEU import bert_score -from nltk.translate.bleu_score import sentence_bleu +from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction import nltk import re import string @@ -304,8 +304,6 @@ def bleu_score( def _get_smoothing_function(smooth_method: str) -> Optional[Callable[..., Any]]: """Get NLTK smoothing function based on method name.""" - from nltk.translate.bleu_score import SmoothingFunction - smoothing = SmoothingFunction() if smooth_method == "exp":