From c75c26eb0499fa17df62def8ae4321d9e2a441fc Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Mon, 1 Dec 2025 17:27:51 +0530
Subject: [PATCH 01/24] feat: Add type annotations and mypy configuration

- Add benchwise/types.py with TypedDict, Protocol, and Literal definitions
- Add mypy.ini with strict type checking
- Add py.typed marker for PEP 561 compliance
- Improve type annotations in exceptions, config, and datasets modules
---
 benchwise/config.py     |  34 ++---
 benchwise/datasets.py   |  12 +-
 benchwise/exceptions.py |   6 +-
 benchwise/py.typed      |   0
 benchwise/types.py      | 291 ++++++++++++++++++++++++++++++++++++++++
 mypy.ini                |  66 +++++++++
 6 files changed, 387 insertions(+), 22 deletions(-)
 create mode 100644 benchwise/py.typed
 create mode 100644 benchwise/types.py
 create mode 100644 mypy.ini

diff --git a/benchwise/config.py b/benchwise/config.py
index 62d5f6a..4d7c74b 100644
--- a/benchwise/config.py
+++ b/benchwise/config.py
@@ -7,10 +7,12 @@
 
 import os
 from pathlib import Path
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, List
 from dataclasses import dataclass, field
 import json
 
+from benchwise.types import ConfigDict
+
 
 @dataclass
 class BenchwiseConfig:
@@ -52,16 +54,16 @@ class BenchwiseConfig:
     verbose: bool = False
 
     # User preferences
-    default_models: list = field(default_factory=list)
-    default_metrics: list = field(default_factory=list)
+    default_models: List[str] = field(default_factory=list)
+    default_metrics: List[str] = field(default_factory=list)
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         """Load configuration from environment variables and config file."""
         self._load_from_env()
         self._load_from_file()
         self._validate_config()
 
-    def _load_from_env(self):
+    def _load_from_env(self) -> None:
         """Load configuration from environment variables."""
 
         # API Configuration
@@ -113,7 +115,7 @@ def _load_from_env(self):
         if verbose_env in ("true", "1", "yes", "on"):
             self.verbose = True
 
-    def _load_from_file(self):
+    def _load_from_file(self) -> None:
         """Load configuration from config file."""
         config_paths = [
             Path.cwd() / ".benchwise.json",
@@ -140,7 +142,7 @@ def _load_from_file(self):
                     if self.verbose:
                         print(f"⚠️ Failed to load config from {config_path}: {e}")
 
-    def _validate_config(self):
+    def _validate_config(self) -> None:
         """Validate configuration values."""
 
         # Validate API URL
@@ -169,7 +171,7 @@ def _validate_config(self):
                 )
                 self.cache_enabled = False
 
-    def save_to_file(self, file_path: Optional[Path] = None):
+    def save_to_file(self, file_path: Optional[Path] = None) -> None:
         """
         Save current configuration to file.
 
@@ -212,7 +214,7 @@ def save_to_file(self, file_path: Optional[Path] = None):
         except OSError as e:
             print(f"Failed to save configuration: {e}")
 
-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> ConfigDict:
         """Convert configuration to dictionary."""
         return {
             "api_url": self.api_url,
@@ -230,7 +232,7 @@ def to_dict(self) -> Dict[str, Any]:
             "default_metrics": self.default_metrics,
         }
 
-    def print_config(self):
+    def print_config(self) -> None:
         """Print current configuration in a readable format."""
         print("🔧 Benchwise Configuration:")
         print("=" * 30)
@@ -258,7 +260,7 @@ def get_api_config() -> BenchwiseConfig:
     return _global_config
 
 
-def set_api_config(config: BenchwiseConfig):
+def set_api_config(config: BenchwiseConfig) -> None:
     """
     Set the global Benchwise configuration.
 
@@ -275,7 +277,7 @@ def configure_benchwise(
     upload_enabled: Optional[bool] = None,
     cache_enabled: Optional[bool] = None,
     debug: Optional[bool] = None,
-    **kwargs,
+    **kwargs: Any,
 ) -> BenchwiseConfig:
     """
     Configure Benchwise settings programmatically.
@@ -315,7 +317,7 @@ def configure_benchwise(
     return config
 
 
-def reset_config():
+def reset_config() -> None:
     """Reset configuration to default values."""
     global _global_config
     _global_config = None
@@ -409,10 +411,10 @@ def validate_api_connection(config: BenchwiseConfig) -> bool:
         import asyncio
         import httpx
 
-        async def check_connection():
+        async def check_connection() -> bool:
             async with httpx.AsyncClient(timeout=5.0) as client:
                 response = await client.get(f"{config.api_url}/health")
-                return response.status_code == 200
+                return bool(response.status_code == 200)
 
         return asyncio.run(check_connection())
 
@@ -482,7 +484,7 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]:
     return results
 
 
-def print_configuration_status(config: BenchwiseConfig):
+def print_configuration_status(config: BenchwiseConfig) -> None:
     """
     NEW: Print comprehensive configuration status.
 
diff --git a/benchwise/datasets.py b/benchwise/datasets.py
index 2d1c416..194b5c2 100644
--- a/benchwise/datasets.py
+++ b/benchwise/datasets.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, Optional, Union
+from typing import List, Dict, Any, Optional, Union, Callable
 import json
 import pandas as pd
 from pathlib import Path
@@ -6,6 +6,8 @@
 from dataclasses import dataclass
 import hashlib
 
+from benchwise.types import DatasetItem, DatasetMetadata
+
 
 @dataclass
 class Dataset:
@@ -24,7 +26,7 @@ class Dataset:
     metadata: Optional[Dict[str, Any]] = None
     schema: Optional[Dict[str, Any]] = None
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         if self.metadata is None:
             self.metadata = {}
 
@@ -78,13 +80,14 @@ def references(self) -> List[str]:
                 references.append(str(ref))
         return references
 
-    def filter(self, condition: callable) -> "Dataset":
+    def filter(self, condition: Callable[[Dict[str, Any]], bool]) -> "Dataset":
         """Filter dataset items based on condition."""
         filtered_data = [item for item in self.data if condition(item)]
+        metadata = self.metadata or {}
         return Dataset(
             name=f"{self.name}_filtered",
             data=filtered_data,
-            metadata={**self.metadata, "filtered": True, "original_size": self.size},
+            metadata={**metadata, "filtered": True, "original_size": self.size},
         )
 
     def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset":
@@ -95,6 +98,7 @@ def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset":
             random.seed(random_state)
 
         sampled_data = random.sample(self.data, min(n, len(self.data)))
+        metadata = self.metadata or {}
         return Dataset(
             name=f"{self.name}_sample_{n}",
             data=sampled_data,
diff --git a/benchwise/exceptions.py b/benchwise/exceptions.py
index 498bd02..d5a9888 100644
--- a/benchwise/exceptions.py
+++ b/benchwise/exceptions.py
@@ -4,6 +4,8 @@
 Provides specific exception types for better error handling.
 """
 
+from typing import Optional
+
 
 class BenchwiseError(Exception):
     """Base exception for all Benchwise errors."""
@@ -17,8 +19,8 @@ class AuthenticationError(BenchwiseError):
 
 class RateLimitError(BenchwiseError):
     """Raised when API rate limit is exceeded."""
-    
-    def __init__(self, message: str = "Rate limit exceeded", retry_after: int = None):
+
+    def __init__(self, message: str = "Rate limit exceeded", retry_after: Optional[int] = None) -> None:
         super().__init__(message)
         self.retry_after = retry_after
 
diff --git a/benchwise/py.typed b/benchwise/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/benchwise/types.py b/benchwise/types.py
new file mode 100644
index 0000000..5818a62
--- /dev/null
+++ b/benchwise/types.py
@@ -0,0 +1,291 @@
+"""
+Type definitions for BenchWise.
+
+This module contains TypedDict definitions, Protocols, Literal types, and type variables
+used throughout the BenchWise codebase for improved type safety and IDE support.
+"""
+
+from typing import Any, Dict, List, Literal, Optional, Protocol, TypeVar, ParamSpec, Tuple, TypedDict
+
+# Type Variables
+T = TypeVar('T')
+R = TypeVar('R')
+P = ParamSpec('P')
+ModelT = TypeVar('ModelT')
+DatasetT = TypeVar('DatasetT')
+
+# Literal Types
+HttpMethod = Literal["GET", "POST", "PUT", "DELETE", "PATCH"]
+ModelProvider = Literal["openai", "anthropic", "google", "huggingface", "custom"]
+ExportFormat = Literal["json", "csv", "markdown"]
+
+
+# Model Configuration Types
+class ModelConfig(TypedDict, total=False):
+    """Configuration options for model adapters."""
+    api_key: str
+    temperature: float
+    max_tokens: int
+    top_p: float
+    frequency_penalty: float
+    presence_penalty: float
+    timeout: float
+    max_retries: int
+
+
+class PricingInfo(TypedDict):
+    """Pricing information for a model."""
+    input: float  # Cost per 1K input tokens
+    output: float  # Cost per 1K output tokens
+
+
+# Metric Return Types
+class RougeScores(TypedDict, total=False):
+    """Return type for ROUGE metric scores."""
+    precision: float
+    recall: float
+    f1: float
+    rouge1_f1: float
+    rouge2_f1: float
+    rougeL_f1: float
+    std_precision: float
+    std_recall: float
+    std_f1: float
+    scores: Dict[str, List[float]]
+    # Optional confidence intervals
+    f1_confidence_interval: Tuple[float, float]
+    precision_confidence_interval: Tuple[float, float]
+    recall_confidence_interval: Tuple[float, float]
+
+
+class BleuScores(TypedDict, total=False):
+    """Return type for BLEU metric scores."""
+    bleu: float
+    bleu1: float
+    bleu2: float
+    bleu3: float
+    bleu4: float
+    brevity_penalty: float
+    length_ratio: float
+    std_bleu: float
+    scores: List[float]
+    # Optional confidence intervals
+    bleu_confidence_interval: Tuple[float, float]
+
+
+class BertScoreResults(TypedDict, total=False):
+    """Return type for BERT-Score metric."""
+    precision: float
+    recall: float
+    f1: float
+    std_precision: float
+    std_recall: float
+    std_f1: float
+    scores: Dict[str, List[float]]
+    # Optional confidence intervals
+    f1_confidence_interval: Tuple[float, float]
+    precision_confidence_interval: Tuple[float, float]
+    recall_confidence_interval: Tuple[float, float]
+
+
+class AccuracyResults(TypedDict, total=False):
+    """Return type for accuracy metric."""
+    accuracy: float
+    correct: int
+    total: int
+    std_accuracy: float
+    scores: List[float]
+    # Optional confidence interval
+    accuracy_confidence_interval: Tuple[float, float]
+
+
+class SemanticSimilarityResults(TypedDict, total=False):
+    """Return type for semantic similarity metric."""
+    similarity: float
+    std_similarity: float
+    scores: List[float]
+    # Optional confidence interval
+    similarity_confidence_interval: Tuple[float, float]
+
+
+class CoherenceResults(TypedDict, total=False):
+    """Return type for coherence score metric."""
+    coherence: float
+    std_coherence: float
+    scores: List[float]
+    # Optional confidence interval
+    coherence_confidence_interval: Tuple[float, float]
+
+
+class SafetyResults(TypedDict, total=False):
+    """Return type for safety score metric."""
+    safety: float
+    is_safe: bool
+    flagged_categories: List[str]
+    std_safety: float
+    scores: List[float]
+    # Optional confidence interval
+    safety_confidence_interval: Tuple[float, float]
+
+
+class FactualCorrectnessResults(TypedDict, total=False):
+    """Return type for factual correctness metric."""
+    correctness: float
+    is_correct: bool
+    std_correctness: float
+    scores: List[float]
+    # Optional confidence interval
+    correctness_confidence_interval: Tuple[float, float]
+
+
+# Dataset Types
+class DatasetItem(TypedDict, total=False):
+    """A single item in a dataset."""
+    # Common field names
+    prompt: str
+    input: str
+    question: str
+    text: str
+    # Reference/target fields
+    reference: str
+    output: str
+    answer: str
+    target: str
+    summary: str
+    # Additional fields
+    id: str
+    metadata: Dict[str, Any]
+
+
+class DatasetMetadata(TypedDict, total=False):
+    """Metadata for a dataset."""
+    name: str
+    description: str
+    source: str
+    version: str
+    size: int
+    created_at: str
+    tags: List[str]
+
+
+class DatasetSchema(TypedDict, total=False):
+    """Schema definition for a dataset."""
+    prompt_field: str
+    reference_field: str
+    required_fields: List[str]
+    optional_fields: List[str]
+
+
+# Configuration Types
+class ConfigDict(TypedDict, total=False):
+    """Configuration dictionary for BenchWise."""
+    api_url: str
+    api_key: Optional[str]
+    upload_enabled: bool
+    auto_sync: bool
+    cache_enabled: bool
+    cache_dir: str
+    timeout: float
+    max_retries: int
+    offline_mode: bool
+    debug: bool
+    verbose: bool
+    default_models: List[str]
+    default_metrics: List[str]
+
+
+# Results Types
+class EvaluationResultDict(TypedDict, total=False):
+    """Serialized evaluation result."""
+    model: str
+    prompt: str
+    response: str
+    score: float
+    scores: Dict[str, float]
+    metadata: Dict[str, Any]
+    timestamp: str
+    success: bool
+    error: Optional[str]
+
+
+class BenchmarkResultDict(TypedDict, total=False):
+    """Serialized benchmark result."""
+    benchmark_name: str
+    benchmark_description: str
+    results: List[EvaluationResultDict]
+    summary: Dict[str, Any]
+    timestamp: str
+
+
+class ComparisonResult(TypedDict):
+    """Result of model comparison."""
+    best_model: str
+    best_score: float
+    rankings: List[Tuple[str, float]]
+    scores: Dict[str, float]
+
+
+# API Response Types
+class LoginResponse(TypedDict):
+    """Response from login endpoint."""
+    token: Dict[str, str]
+    user: Dict[str, Any]
+
+
+class UserInfo(TypedDict, total=False):
+    """User information from API."""
+    id: int
+    username: str
+    email: str
+    full_name: Optional[str]
+    is_active: bool
+
+
+class UploadResultsResponse(TypedDict):
+    """Response from upload results endpoint."""
+    id: int
+    benchmark_id: int
+    model_ids: List[int]
+    results_count: int
+    message: str
+
+
+# Protocols
+class SupportsGenerate(Protocol):
+    """Protocol for objects that support text generation."""
+
+    async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]:
+        """Generate text completions for the given prompts."""
+        ...
+
+    def get_token_count(self, text: str) -> int:
+        """Get the token count for the given text."""
+        ...
+
+    def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float:
+        """Estimate the cost for the given token counts."""
+        ...
+
+
+class SupportsCache(Protocol):
+    """Protocol for objects that support caching."""
+
+    def save(self, key: str, value: Any) -> None:
+        """Save a value to the cache."""
+        ...
+
+    def load(self, key: str) -> Optional[Any]:
+        """Load a value from the cache."""
+        ...
+
+    def exists(self, key: str) -> bool:
+        """Check if a key exists in the cache."""
+        ...
+
+
+class SupportsMetrics(Protocol):
+    """Protocol for objects that support metric evaluation."""
+
+    def evaluate(self, predictions: List[str], references: List[str], **kwargs: Any) -> Dict[str, float]:
+        """Evaluate predictions against references."""
+        ...
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..4149f2b
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,66 @@
+[mypy]
+python_version = 3.11
+files = benchwise
+
+# Strict type checking
+strict = True
+disallow_untyped_defs = True
+disallow_any_generics = True
+disallow_subclassing_any = True
+disallow_untyped_calls = True
+disallow_incomplete_defs = True
+check_untyped_defs = True
+disallow_untyped_decorators = True
+warn_redundant_casts = True
+warn_unused_ignores = True
+warn_return_any = True
+warn_unreachable = True
+no_implicit_optional = True
+strict_optional = True
+strict_equality = True
+
+# Show error codes for easier suppression
+show_error_codes = True
+
+# Third-party library ignores (no stubs available)
+[mypy-rouge_score.*]
+ignore_missing_imports = True
+
+[mypy-bert_score.*]
+ignore_missing_imports = True
+
+[mypy-nltk.*]
+ignore_missing_imports = True
+
+[mypy-transformers.*]
+ignore_missing_imports = True
+
+[mypy-torch.*]
+ignore_missing_imports = True
+
+[mypy-sentence_transformers.*]
+ignore_missing_imports = True
+
+[mypy-sklearn.*]
+ignore_missing_imports = True
+
+[mypy-httpx.*]
+ignore_missing_imports = True
+
+[mypy-requests.*]
+ignore_missing_imports = True
+
+[mypy-openai.*]
+ignore_missing_imports = True
+
+[mypy-anthropic.*]
+ignore_missing_imports = True
+
+[mypy-google.generativeai.*]
+ignore_missing_imports = True
+
+[mypy-fuzzywuzzy.*]
+ignore_missing_imports = True
+
+[mypy-pandas.*]
+ignore_missing_imports = True

From 3f4e3a026056b24b22dc7c625c48fe2d34ac879a Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Mon, 1 Dec 2025 17:38:48 +0530
Subject: [PATCH 02/24] feat: Add type annotations to model adapters

- Update ModelAdapter and all implementations with ModelConfig TypedDict
- Add PricingInfo TypedDict for pricing dictionaries
- Add proper type annotations to all __init__ and generate methods
- Fix type issues in cost estimation calculations
---
 benchwise/models.py | 50 +++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/benchwise/models.py b/benchwise/models.py
index cd5c88a..78a50b9 100644
--- a/benchwise/models.py
+++ b/benchwise/models.py
@@ -1,16 +1,18 @@
 from abc import ABC, abstractmethod
 from typing import List, Dict, Any, Optional
 
+from benchwise.types import ModelConfig, PricingInfo
+
 
 class ModelAdapter(ABC):
     """Abstract base class for model adapters."""
 
-    def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
+    def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None:
         self.model_name = model_name
-        self.config = config or {}
+        self.config: ModelConfig = config or {}
 
     @abstractmethod
-    async def generate(self, prompts: List[str], **kwargs) -> List[str]:
+    async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]:
         """Generate responses for a list of prompts."""
         pass
 
@@ -28,7 +30,7 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float:
 class OpenAIAdapter(ModelAdapter):
     """Adapter for OpenAI models."""
 
-    def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
+    def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None:
         super().__init__(model_name, config)
         try:
             import openai
@@ -42,16 +44,16 @@ def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
             )
 
         # Model pricing (per 1K tokens)
-        self.pricing = {
+        self.pricing: Dict[str, PricingInfo] = {
             "gpt-4": {"input": 0.03, "output": 0.06},
             "gpt-4-turbo": {"input": 0.01, "output": 0.03},
             "gpt-3.5-turbo": {"input": 0.001, "output": 0.002},
             "gpt-4o": {"input": 0.005, "output": 0.015},
         }
 
-    async def generate(self, prompts: List[str], **kwargs) -> List[str]:
+    async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]:
         """Generate responses using OpenAI API."""
-        responses = []
+        responses: List[str] = []
 
         # Default parameters - exclude api_key from generation params
         generation_params = {
@@ -85,15 +87,15 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float:
         model_pricing = self.pricing.get(
             self.model_name, {"input": 0.01, "output": 0.03}
         )
-        input_cost = (input_tokens / 1000) * model_pricing["input"]
-        output_cost = (output_tokens / 1000) * model_pricing["output"]
+        input_cost = (input_tokens / 1000) * float(model_pricing["input"])
+        output_cost = (output_tokens / 1000) * float(model_pricing["output"])
         return input_cost + output_cost
 
 
 class AnthropicAdapter(ModelAdapter):
     """Adapter for Anthropic Claude models."""
 
-    def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
+    def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None:
         super().__init__(model_name, config)
         try:
             import anthropic
@@ -107,16 +109,16 @@ def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
             )
 
         # Model pricing (per 1K tokens)
-        self.pricing = {
+        self.pricing: Dict[str, PricingInfo] = {
             "claude-3-opus": {"input": 0.015, "output": 0.075},
             "claude-3-sonnet": {"input": 0.003, "output": 0.015},
             "claude-3-haiku": {"input": 0.00025, "output": 0.00125},
             "claude-3.5-sonnet": {"input": 0.003, "output": 0.015},
         }
 
-    async def generate(self, prompts: List[str], **kwargs) -> List[str]:
+    async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]:
         """Generate responses using Anthropic API."""
-        responses = []
+        responses: List[str] = []
 
         # Default parameters - exclude api_key from generation params
         generation_params = {
@@ -150,15 +152,15 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float:
         model_pricing = self.pricing.get(
             self.model_name, {"input": 0.003, "output": 0.015}
         )
-        input_cost = (input_tokens / 1000) * model_pricing["input"]
-        output_cost = (output_tokens / 1000) * model_pricing["output"]
+        input_cost = (input_tokens / 1000) * float(model_pricing["input"])
+        output_cost = (output_tokens / 1000) * float(model_pricing["output"])
         return input_cost + output_cost
 
 
 class GoogleAdapter(ModelAdapter):
     """Adapter for Google Gemini models."""
 
-    def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
+    def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None:
         super().__init__(model_name, config)
         try:
             import google.generativeai as genai
@@ -172,9 +174,9 @@ def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
                 "Google Generative AI package not installed. Please install it with: pip install 'benchwise[llm-apis]' or pip install google-generativeai"
             )
 
-    async def generate(self, prompts: List[str], **kwargs) -> List[str]:
+    async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]:
         """Generate responses using Google Gemini API."""
-        responses = []
+        responses: List[str] = []
 
         for prompt in prompts:
             try:
@@ -206,7 +208,7 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float:
 class HuggingFaceAdapter(ModelAdapter):
     """Adapter for Hugging Face models."""
 
-    def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
+    def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None:
         super().__init__(model_name, config)
         try:
             from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -218,9 +220,9 @@ def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
                 "Transformers package not installed. Please install it with: pip install 'benchwise[transformers]' or pip install transformers torch"
             )
 
-    async def generate(self, prompts: List[str], **kwargs) -> List[str]:
+    async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]:
         """Generate responses using Hugging Face models."""
-        responses = []
+        responses: List[str] = []
 
         for prompt in prompts:
             try:
@@ -251,10 +253,10 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float:
 class MockAdapter(ModelAdapter):
     """Mock adapter for testing without API dependencies."""
 
-    def __init__(self, model_name: str, config: Optional[Dict[str, Any]] = None):
+    def __init__(self, model_name: str, config: Optional[ModelConfig] = None) -> None:
         super().__init__(model_name, config)
 
-    async def generate(self, prompts: List[str], **kwargs) -> List[str]:
+    async def generate(self, prompts: List[str], **kwargs: Any) -> List[str]:
         """Generate mock responses."""
         return [
             f"Mock response from {self.model_name} for: {prompt[:50]}..."
@@ -271,7 +273,7 @@ def get_cost_estimate(self, input_tokens: int, output_tokens: int) -> float:
 
 
 def get_model_adapter(
-    model_name: str, config: Optional[Dict[str, Any]] = None
+    model_name: str, config: Optional[ModelConfig] = None
 ) -> ModelAdapter:
     """Factory function to get the appropriate model adapter."""
 

From 18e728e81ebedf12be2b5f3657b288493979c2f8 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Mon, 1 Dec 2025 18:06:38 +0530
Subject: [PATCH 03/24] feat: Add ParamSpec decorator typing to core.py

- Add proper Callable type signatures for evaluate, benchmark, and stress_test decorators
- Use ParamSpec and TypeVar for generic decorator typing
- Add Awaitable type annotations for async functions
- Type all **kwargs parameters as Any
- Update _run_evaluation with specific tuple[str, ...] type
- Add type ignore comments for dynamic _benchmark_metadata attributes
- No new test failures: 124 tests passing; 2 pre-existing benchmark test failures unchanged
---
 benchwise/core.py | 60 ++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/benchwise/core.py b/benchwise/core.py
index 8d64c15..b682cb0 100644
--- a/benchwise/core.py
+++ b/benchwise/core.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, Callable, Optional
+from typing import List, Dict, Any, Callable, Optional, Union, ParamSpec, TypeVar, Awaitable
 from functools import wraps
 import asyncio
 import time
@@ -10,10 +10,16 @@
 from .config import get_api_config
 from .client import upload_results
 
+# Type variables for decorator typing
+P = ParamSpec('P')
+R = TypeVar('R')
+
 logger = logging.getLogger("benchwise")
 
 
-def evaluate(*models: str, upload: bool = None, **kwargs) -> Callable:
+def evaluate(
+    *models: str, upload: Optional[bool] = None, **kwargs: Any
+) -> Callable[[Callable[..., Awaitable[Any]]], Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]]]:
     """
     Decorator for creating LLM evaluations.
 
@@ -35,7 +41,7 @@ async def test_qa(model, dataset):
             return accuracy(responses, dataset.references)
     """
 
-    def decorator(test_func: Callable) -> Callable:
+    def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]]:
         if not inspect.iscoroutinefunction(test_func):
             raise TypeError(
                 f"{test_func.__name__} must be an async function. "
@@ -47,17 +53,17 @@ async def wrapper(dataset: Dataset, **test_kwargs) -> List[EvaluationResult]:
             return await _run_evaluation(test_func, dataset, models, upload, kwargs, test_kwargs)
         
         if hasattr(test_func, "_benchmark_metadata"):
-            wrapper._benchmark_metadata = test_func._benchmark_metadata
-        
+            wrapper._benchmark_metadata = test_func._benchmark_metadata  # type: ignore[attr-defined]
+
         return wrapper
 
     return decorator
 
 
 async def _run_evaluation(
-    test_func: Callable,
+    test_func: Callable[..., Awaitable[Any]],
     dataset: Dataset,
-    models: tuple,
+    models: tuple[str, ...],
     upload: Optional[bool],
     decorator_kwargs: Dict[str, Any],
     test_kwargs: Dict[str, Any],
@@ -127,7 +133,7 @@ async def _run_evaluation(
     return results
 
 
-def benchmark(name: str, description: str = "", **kwargs) -> Callable:
+def benchmark(name: str, description: str = "", **kwargs: Any) -> Callable[[Callable[P, R]], Callable[P, R]]:
     """
     Decorator for creating benchmarks.
 
@@ -137,8 +143,8 @@ async def medical_qa_test(model, dataset):
             pass
     """
 
-    def decorator(test_func: Callable) -> Callable:
-        test_func._benchmark_metadata = {
+    def decorator(test_func: Callable[P, R]) -> Callable[P, R]:
+        test_func._benchmark_metadata = {  # type: ignore[attr-defined]
             "name": name,
             "description": description,
             **kwargs,
@@ -148,10 +154,10 @@ def decorator(test_func: Callable) -> Callable:
     return decorator
 
 
-def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable:
+def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, Exception]]]]]:
     """
     Decorator for stress testing LLMs.
-    
+
     NOTE: WIP feature - may not be fully functional.
 
     Usage:
@@ -160,12 +166,12 @@ async def load_test(model, dataset):
             pass
     """
 
-    def decorator(test_func: Callable) -> Callable:
+    def decorator(test_func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[List[Union[R, Exception]]]]:
         @wraps(test_func)
-        async def wrapper(*args, **kwargs):
+        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> List[Union[R, Exception]]:
             logger.info(f"Starting stress test: {concurrent_requests} concurrent requests for {duration}s")
-            
-            tasks = []
+
+            tasks: List[Union[R, Exception]] = []
             start_time = time.time()
 
             while time.time() - start_time < duration:
@@ -191,17 +197,17 @@ async def wrapper(*args, **kwargs):
 class EvaluationRunner:
     """Main class for running evaluations."""
 
-    def __init__(self, config: Optional[Dict[str, Any]] = None):
-        self.config = config or {}
-        self.results_cache = {}
+    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+        self.config: Dict[str, Any] = config or {}
+        self.results_cache: Dict[str, Any] = {}
         self.logger = logging.getLogger("benchwise.runner")
 
     async def run_evaluation(
-        self, test_func: Callable, dataset: Dataset, models: List[str]
+        self, test_func: Callable[..., Awaitable[Any]], dataset: Dataset, models: List[str]
     ) -> List[EvaluationResult]:
         """Run evaluation on multiple models."""
-        results = []
-        
+        results: List[EvaluationResult] = []
+
         self.logger.info(f"Running evaluation on {len(models)} models")
 
         for model_name in models:
@@ -215,7 +221,7 @@ async def run_evaluation(
         return results
 
     def compare_models(
-        self, results: List[EvaluationResult], metric_name: str = None
+        self, results: List[EvaluationResult], metric_name: Optional[str] = None
     ) -> Dict[str, Any]:
         """Compare model performance."""
         successful_results = [r for r in results if r.success]
@@ -267,17 +273,17 @@ def compare_models(
 
 
 def run_benchmark(
-    benchmark_func: Callable, dataset: Dataset, models: List[str]
+    benchmark_func: Callable[..., Awaitable[Any]], dataset: Dataset, models: List[str]
 ) -> List[EvaluationResult]:
     """Run a benchmark on multiple models."""
     runner = EvaluationRunner()
     return asyncio.run(runner.run_evaluation(benchmark_func, dataset, models))
 
 
-async def quick_eval(prompt: str, models: List[str], metric: Callable) -> Dict[str, float]:
+async def quick_eval(prompt: str, models: List[str], metric: Callable[[str], float]) -> Dict[str, Optional[float]]:
     """Quick evaluation with a single prompt."""
-    results = {}
-    
+    results: Dict[str, Optional[float]] = {}
+
     logger.info(f"Running quick eval on {len(models)} models")
 
     for model_name in models:

From 4cb069b2c33021bfec9fe3e7b59d478fb8d8ec22 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Mon, 1 Dec 2025 18:42:51 +0530
Subject: [PATCH 04/24] feat: Add type stubs for pandas and requests

---
 mypy.ini       | 6 +-----
 pyproject.toml | 2 ++
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/mypy.ini b/mypy.ini
index 4149f2b..6aa31f6 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -47,8 +47,7 @@ ignore_missing_imports = True
 [mypy-httpx.*]
 ignore_missing_imports = True
 
-[mypy-requests.*]
-ignore_missing_imports = True
+# Note: pandas and requests have type stubs installed (pandas-stubs, types-requests)
 
 [mypy-openai.*]
 ignore_missing_imports = True
@@ -61,6 +60,3 @@ ignore_missing_imports = True
 
 [mypy-fuzzywuzzy.*]
 ignore_missing_imports = True
-
-[mypy-pandas.*]
-ignore_missing_imports = True
diff --git a/pyproject.toml b/pyproject.toml
index 0076abb..c280da3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,6 +70,8 @@ lint = [
     "ruff>=0.1.6",
     "pre-commit>=3.0.0",
     "mypy>=1.0.0",
+    "pandas-stubs>=2.0.0",
+    "types-requests>=2.28.0",
 ]
 
 dev = [

From 469f4c9a4b0342d14c44a46c89f7fef34295b638 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Tue, 2 Dec 2025 11:57:34 +0530
Subject: [PATCH 05/24] fix: Use metadata variable in Dataset.sample

---
 benchwise/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchwise/datasets.py b/benchwise/datasets.py
index 194b5c2..b4fca30 100644
--- a/benchwise/datasets.py
+++ b/benchwise/datasets.py
@@ -102,7 +102,7 @@ def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset":
         return Dataset(
             name=f"{self.name}_sample_{n}",
             data=sampled_data,
-            metadata={**self.metadata, "sampled": True, "sample_size": n},
+            metadata={**metadata, "sampled": True, "sample_size": n},
         )
 
     def split(

From d1f6318f7490d6c0200d6f7d4027544e7c39da4a Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Tue, 2 Dec 2025 11:58:18 +0530
Subject: [PATCH 06/24] feat: Start metrics.py type improvements

---
 benchwise/metrics.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/benchwise/metrics.py b/benchwise/metrics.py
index e616d91..dd27312 100644
--- a/benchwise/metrics.py
+++ b/benchwise/metrics.py
@@ -1,5 +1,7 @@
-from typing import List, Dict, Any, Tuple, Optional
+from typing import List, Dict, Any, Tuple, Optional, Union
 import numpy as np
+from numpy.typing import NDArray
+from benchwise.types import RougeScores, BleuScores, BertScoreResults, AccuracyResults
 from rouge_score import rouge_scorer
 from sacrebleu import BLEU
 import bert_score
@@ -26,7 +28,7 @@ def _bootstrap_confidence_interval(
 ) -> Tuple[float, float]:
     """Calculate bootstrap confidence interval for a list of scores."""
     if len(scores) < 2:
-        return (np.mean(scores), np.mean(scores))
+        return (float(np.mean(scores)), float(np.mean(scores)))
 
     bootstrap_means = []
     for _ in range(n_bootstrap):
@@ -38,8 +40,8 @@ def _bootstrap_confidence_interval(
     upper_percentile = (1 - alpha / 2) * 100
 
     return (
-        np.percentile(bootstrap_means, lower_percentile),
-        np.percentile(bootstrap_means, upper_percentile),
+        float(np.percentile(bootstrap_means, lower_percentile)),
+        float(np.percentile(bootstrap_means, upper_percentile)),
     )
 
 
@@ -65,7 +67,7 @@ def rouge_l(
     use_stemmer: bool = True,
     alpha: float = 0.5,
     return_confidence: bool = True,
-) -> Dict[str, float]:
+) -> RougeScores:
     """
     Calculate enhanced ROUGE-L scores for predictions vs references.
 
@@ -96,7 +98,7 @@ def rouge_l(
     scorer = rouge_scorer.RougeScorer(
         ["rougeL", "rouge1", "rouge2"], use_stemmer=use_stemmer
     )
-    scores = {"precision": [], "recall": [], "f1": [], "rouge1_f1": [], "rouge2_f1": []}
+    scores: Dict[str, List[float]] = {"precision": [], "recall": [], "f1": [], "rouge1_f1": [], "rouge2_f1": []}
 
     for pred, ref in zip(predictions, references):
         # Handle empty strings gracefully

From 3d94831c3dcd4b822cb98f1f6caef4c85c11ade6 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Tue, 2 Dec 2025 17:14:29 +0530
Subject: [PATCH 07/24] feat: Fix all type errors in metrics.py and logging.py

---
 benchwise/logging.py |  2 +-
 benchwise/metrics.py | 82 ++++++++++++++++++++++----------------------
 2 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/benchwise/logging.py b/benchwise/logging.py
index 3f6b6a5..3b2bd31 100644
--- a/benchwise/logging.py
+++ b/benchwise/logging.py
@@ -68,7 +68,7 @@ def get_logger(name: str = "benchwise") -> logging.Logger:
     return logging.getLogger(name)
 
 
-def set_log_level(level: str):
+def set_log_level(level: str) -> None:
     """
     Change the log level for all Benchwise loggers.
     
diff --git a/benchwise/metrics.py b/benchwise/metrics.py
index dd27312..25ceb3a 100644
--- a/benchwise/metrics.py
+++ b/benchwise/metrics.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, Tuple, Optional, Union
+from typing import List, Dict, Any, Tuple, Optional, Union, Callable
 import numpy as np
 from numpy.typing import NDArray
 from benchwise.types import RougeScores, BleuScores, BertScoreResults, AccuracyResults
@@ -132,15 +132,15 @@ def rouge_l(
             scores["rouge1_f1"].append(score["rouge1"].fmeasure)
             scores["rouge2_f1"].append(score["rouge2"].fmeasure)
 
-    result = {
-        "precision": np.mean(scores["precision"]),
-        "recall": np.mean(scores["recall"]),
-        "f1": np.mean(scores["f1"]),
-        "rouge1_f1": np.mean(scores["rouge1_f1"]),
-        "rouge2_f1": np.mean(scores["rouge2_f1"]),
-        "std_precision": np.std(scores["precision"]),
-        "std_recall": np.std(scores["recall"]),
-        "std_f1": np.std(scores["f1"]),
+    result: RougeScores = {
+        "precision": float(np.mean(scores["precision"])),
+        "recall": float(np.mean(scores["recall"])),
+        "f1": float(np.mean(scores["f1"])),
+        "rouge1_f1": float(np.mean(scores["rouge1_f1"])),
+        "rouge2_f1": float(np.mean(scores["rouge2_f1"])),
+        "std_precision": float(np.std(scores["precision"])),
+        "std_recall": float(np.std(scores["recall"])),
+        "std_f1": float(np.std(scores["f1"])),
         "scores": scores,
     }
 
@@ -168,7 +168,7 @@ def bleu_score(
     smooth_method: str = "exp",
     return_confidence: bool = True,
     max_n: int = 4,
-) -> Dict[str, float]:
+) -> Dict[str, Any]:
     """
     Calculate enhanced BLEU scores for predictions vs references.
 
@@ -206,7 +206,7 @@ def bleu_score(
 
     # Calculate sentence-level BLEU with improved handling
     sentence_scores = []
-    ngram_precisions = {f"bleu_{i}": [] for i in range(1, max_n + 1)}
+    ngram_precisions: Dict[str, List[float]] = {f"bleu_{i}": [] for i in range(1, max_n + 1)}
 
     for pred, ref in zip(predictions, references):
         try:
@@ -284,23 +284,23 @@ def bleu_score(
     return result
 
 
-def _get_smoothing_function(smooth_method: str):
+def _get_smoothing_function(smooth_method: str) -> Optional[Callable[..., Any]]:
     """Get NLTK smoothing function based on method name."""
     from nltk.translate.bleu_score import SmoothingFunction
 
     smoothing = SmoothingFunction()
 
     if smooth_method == "exp":
-        return smoothing.method1
+        return smoothing.method1  # type: ignore[no-any-return]
     elif smooth_method == "floor":
-        return smoothing.method2
+        return smoothing.method2  # type: ignore[no-any-return]
     elif smooth_method == "add-k":
-        return smoothing.method3
+        return smoothing.method3  # type: ignore[no-any-return]
     else:
         return None
 
 
-def _get_weights(n: int) -> tuple:
+def _get_weights(n: int) -> Tuple[float, ...]:
     """Get n-gram weights for BLEU calculation."""
     weights = [0.0] * 4
     weights[n - 1] = 1.0
@@ -313,7 +313,7 @@ def bert_score_metric(
     model_type: str = "distilbert-base-uncased",
     return_confidence: bool = True,
     batch_size: int = 64,
-) -> Dict[str, float]:
+) -> Dict[str, Any]:
     """
     Calculate enhanced BERTScore for predictions vs references.
 
@@ -444,7 +444,7 @@ def accuracy(
     fuzzy_match: bool = False,
     fuzzy_threshold: float = 0.8,
     return_confidence: bool = True,
-) -> Dict[str, float]:
+) -> Dict[str, Any]:
     """
     Calculate enhanced exact match accuracy with multiple matching strategies.
 
@@ -557,7 +557,7 @@ def semantic_similarity(
     batch_size: int = 32,
     return_confidence: bool = True,
     similarity_threshold: float = 0.5,
-) -> Dict[str, float]:
+) -> Dict[str, Any]:
     """
     Calculate enhanced semantic similarity using sentence embeddings.
 
@@ -678,7 +678,7 @@ def semantic_similarity(
     return result
 
 
-def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, float]:
+def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, Any]:
     """
     Calculate perplexity of generated text.
 
@@ -697,7 +697,7 @@ def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, fl
             "transformers and torch packages not installed. Please install them with: pip install 'benchwise[transformers]' or pip install transformers torch"
         )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)  # type: ignore[no-untyped-call]
     model = AutoModelForCausalLM.from_pretrained(model_name)
 
     perplexities = []
@@ -714,8 +714,8 @@ def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, fl
             perplexities.append(perplexity)
 
     return {
-        "mean_perplexity": np.mean(perplexities),
-        "median_perplexity": np.median(perplexities),
+        "mean_perplexity": float(np.mean(perplexities)),
+        "median_perplexity": float(np.median(perplexities)),
         "scores": perplexities,
     }
 
@@ -787,7 +787,7 @@ def factual_correctness(
 
         # Calculate overall correctness score
         overall_score = np.mean(list(factual_analysis.values()))
-        correctness_scores.append(overall_score)
+        correctness_scores.append(float(overall_score))
         detailed_results.append(factual_analysis)
 
     # Compile results
@@ -834,7 +834,7 @@ def factual_correctness(
 
 
 def _analyze_factual_correctness(
-    prediction: str, reference: str, nlp_model=None, use_named_entities: bool = True
+    prediction: str, reference: str, nlp_model: Any = None, use_named_entities: bool = True
 ) -> Dict[str, float]:
     """
     Analyze factual correctness using multiple approaches.
@@ -870,7 +870,7 @@ def _analyze_factual_correctness(
     }
 
 
-def _calculate_entity_overlap(prediction: str, reference: str, nlp_model) -> float:
+def _calculate_entity_overlap(prediction: str, reference: str, nlp_model: Any) -> float:
     """
     Calculate overlap between named entities in prediction and reference.
     """
@@ -915,7 +915,7 @@ def _calculate_enhanced_keyword_overlap(prediction: str, reference: str) -> floa
     }
 
     # Extract important words from reference
-    important_ref_words = set()
+    important_ref_words: set[str] = set()
     " ".join(ref_words)
 
     for pattern_type, pattern in important_patterns.items():
@@ -1029,7 +1029,7 @@ def coherence_score(
         return {"mean_coherence": 1.0, "scores": []}
 
     coherence_scores = []
-    component_scores = {
+    component_scores: Dict[str, List[float]] = {
         "sentence_consistency": [],
         "lexical_diversity": [],
         "flow_continuity": [],
@@ -1048,7 +1048,7 @@ def coherence_score(
 
         # Calculate overall coherence score
         overall_coherence = np.mean(list(coherence_components.values()))
-        coherence_scores.append(overall_coherence)
+        coherence_scores.append(float(overall_coherence))
 
         # Store component scores
         for component, score in coherence_components.items():
@@ -1111,7 +1111,7 @@ def _analyze_text_coherence(text: str) -> Dict[str, float]:
             if np.mean(sentence_lengths) > 0
             else 1
         )
-        sentence_consistency = max(0, 1 - (length_cv / 2))  # Normalize to 0-1
+        sentence_consistency = float(max(0, 1 - (length_cv / 2)))  # Normalize to 0-1
     else:
         sentence_consistency = 1.0 if sentence_lengths else 0.0
 
@@ -1254,18 +1254,18 @@ def _calculate_topic_consistency(sentences: List[str]) -> float:
         overlaps.append(overlap)
 
     # Return average overlap
-    return np.mean(overlaps) if overlaps else 1.0
+    return float(np.mean(overlaps)) if overlaps else 1.0
 
 
 def safety_score(
     predictions: List[str],
-    safety_categories: List[str] = None,
+    safety_categories: Optional[List[str]] = None,
     use_perspective_api: bool = False,
-    perspective_api_key: str = None,
+    perspective_api_key: Optional[str] = None,
     return_confidence: bool = True,
     detailed_categories: bool = True,
     context_aware: bool = True,
-) -> Dict[str, float]:
+) -> Dict[str, Any]:
     """
     Evaluate safety of generated text with enhanced detection methods.
 
@@ -1343,8 +1343,8 @@ def safety_score(
     }
 
     safety_scores = []
-    category_scores = {cat: [] for cat in safety_categories}
-    violation_details = []
+    category_scores: Dict[str, List[float]] = {cat: [] for cat in safety_categories}
+    violation_details: List[List[str]] = []
 
     for i, text in enumerate(predictions):
         if not text.strip():
@@ -1484,15 +1484,15 @@ def _check_keyword_in_context(
 class MetricCollection:
     """Collection of evaluation metrics that can be run together."""
 
-    def __init__(self):
-        self.metrics = {}
+    def __init__(self) -> None:
+        self.metrics: Dict[str, Tuple[Callable[..., Any], Dict[str, Any]]] = {}
 
-    def add_metric(self, name: str, metric_func: callable, **kwargs):
+    def add_metric(self, name: str, metric_func: Callable[..., Any], **kwargs: Any) -> None:
         """Add a metric to the collection."""
         self.metrics[name] = (metric_func, kwargs)
 
     def evaluate(
-        self, predictions: List[str], references: List[str] = None
+        self, predictions: List[str], references: Optional[List[str]] = None
     ) -> Dict[str, Any]:
         """Run all metrics in the collection."""
         results = {}

From 7d2b93cfad9cff83c7e8169b8e9c834e68474ea2 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Tue, 2 Dec 2025 22:42:34 +0530
Subject: [PATCH 08/24] fix: Configure mypy to skip google.generativeai type
 checking

---
 GEMINI.md               |   1 +
 benchwise/config.py     |   6 +-
 mypy.ini                |   1 +
 mypy_baseline.txt       | Bin 0 -> 30682 bytes
 test_single_doc_file.py | 424 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 429 insertions(+), 3 deletions(-)
 create mode 100644 GEMINI.md
 create mode 100644 mypy_baseline.txt
 create mode 100644 test_single_doc_file.py

diff --git a/GEMINI.md b/GEMINI.md
new file mode 100644
index 0000000..d0892af
--- /dev/null
+++ b/GEMINI.md
@@ -0,0 +1 @@
+Follow CLAUDE.md
diff --git a/benchwise/config.py b/benchwise/config.py
index 4d7c74b..10b07f6 100644
--- a/benchwise/config.py
+++ b/benchwise/config.py
@@ -442,8 +442,8 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]:
         try:
             import openai
 
-            client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-            client.models.list()
+            openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+            openai_client.models.list()
             results["openai"] = True
         except Exception:
             results["openai"] = False
@@ -452,7 +452,7 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]:
         try:
             import anthropic
 
-            client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+            anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
             # Note: Anthropic doesn't have a simple test endpoint
             results["anthropic"] = True  # Assume valid if key exists
         except Exception:
diff --git a/mypy.ini b/mypy.ini
index 6aa31f6..1ccb1bb 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -57,6 +57,7 @@ ignore_missing_imports = True
 
 [mypy-google.generativeai.*]
 ignore_missing_imports = True
+follow_imports = skip
 
 [mypy-fuzzywuzzy.*]
 ignore_missing_imports = True
diff --git a/mypy_baseline.txt b/mypy_baseline.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9d9e1f7460f915c81b2bd4dd3cbe892c25259929
GIT binary patch
literal 30682
zcmeI1PjA~c6u^7#ryz7nf!MKZId;6YL(r|j4(*DyI~j&SOSH{KA_a;{s$d^}kCZLS
znH4K>()93pD3VBieEhxt&#zwLKO}$zkN^@u0!RP}AOR$R1dsp{Kmter2_OL^fCP{L
z5<mh-00|%gB!C2v01`j~NB{{S0VIF~kN^@u0!RP}AOR$R1dsp{Kmter2_OL^fCP{L
z5<mh-00|%gB!C2v01`j~NB{{S0VIF~kN^@u0!RP}AOR$R1dsp{Kmter2_OL^fCP{L
z5<mh-00|%gB=F1x7EFZe8=kSVglWaYteX~-{>6karKFmWKSd~$lxn_+8Bt8<N|0+B
z=ZxqgW#nWfHSu(w#%#KbCA=qA(@RmzPRY)%Gw+nJ?<os4i+0}Myf^dS5Hc<0cdY!`
z>%FR7^efeV=b<J{XjPB_S<{U0x~^6|FVo7K5HEsPr||PBiFu}{pV&21vzhlxO{dM9
zR%;J*F(2u0&5|ZX9HUPRE@jW7LP!k|q##SFh?j*@vCN3OrmC}f%Ay95{@BT^U9NxN
za<wZRb)$Z7RIzQol2|F4%u~fep20m^U#>qMR?<J@BD9crMiQQ7;53A8z3VPWG8M9u
z3-jlw6R~9jyg|>e<bT}Aw4zc93Ny`AW^J-!f`Olx_9{Ks*vGuup=#A6eK4-1*G%&Q
zei!r~rHOIxW86Kc<%Al5;DB$MD5?dXjjkgDFBqLyrucHj)6{y8pmA|)e)5+2?QL;m
z{)5O*WRyeV0zcy-(=?7*bV?SvuG}u>3q>J=WIA7D1Y$NVy4KZ5Le&*h;2JzhrP5m$
zY;cfa;LOr0XF4)+@eT6gZiaMEOJr@+<8ZV$h4=@|^i(P2{K8Rq544GvWD~k%xea=6
zsie&yT2U@o<Po_f-es=kS9!$*zh%k;bJ+-uNGOG7-!<5wmIgwQ-C&53@A6E|QF{jn
z>{P3xB4y+||Ewu+$OKQRfXm}G1zMF6E;RYKEZxVI$I;k%!lhc}2^4DL8-MU<nliDm
zx^X<O$u!MK>=To=A!Bz;xHdS$co0;%Vt>)GCE`xmmV%i~kU;GwaDdf4+jYR^Iy4@+
zI`o5%yr_9xbKx%27!`W1*m8a&RRp#BWDORF25Ha33CWnA{%Qcdn~}Gq+wIyc)T)!|
z4SE&$W|d5939vz>2R2H*z@@pD^Df^X9G`2p;dWHc@%uxkNvra+6I+!I^(MWHv6gMQ
z-#>Q--%$V}h32ugz1KFFpfRRPri*z<HCsu=O^<xCebcn^HY~kQ;2bbGR3CFsY{7z_
zrkoonw2&)dQcOc0DAj}BoK48*KR%OS9FSDWHD7QItwEB;Jmi}Em71=U#$9q5LgyYq
zKM@zFBq`Fu=BRK@#R__T^P#P_kkN#kZ$-@IM#S5%Drfsgn7T6mWQ?q}PP56`*^29R
zzUV?woUJKT&5Q~zzR|ON;!Pr@P%#r2HcT-rNS;Dt9x+nLTtU%{*n(!w3b-k%g|SlT
zoz3XRui2cYe164>2b#-tpT?8g7z$MTc??x4^FoFi52`d;E9HlJiKlAk{QlVWGGAb@
zWwwk%H=PT`D1?LA3Xo~-x26Cb1g;X_QxM90P&~9A3<8(_nx$WJ8QbA}9x+XMT&sWo
zdo(FG1&=I$!7~rLzCEA}6XTg)=N+-hlJKQ#v?Iwf6HD94nwFcJhO89SI=20d%Vzy^
zw`JPt%$pk{(Al+w`wzG)+3Zk`YR2}T=`C^|!=@_hn$&5080L+V6s(?jM4$lCvILpM
z4gBT49<w}Sld_mv(=;<lm?R8(E!cGxW<c$lX=rm245ExicIc_j|E<X=7cK#`$kfo;
zF$EjFXRoJ$(d%bP(=}yYCQT1}&PvCn$)WGkq;Z-31N29`s(mkEoniNAM~ut<u>ZqP
ztPN$)cgHa1c9s3$q;p?7+X}5kW%CVS^UL%c_7_XOdZ>k}gWR0cd{C-jh&%3}LCp)=
z<>nTc21B=qcru|uZNIZ$=p)M126qs+IDT^3<3oFaBlNy-F<gzqeY<S2-`*%Tw_m21
zl_oSjX5IET%U~+YF74jbEZmyEK!NGd_7->7#|`V34a=eLW{jt1X=NYZ{cJTocbYZ_
zSsBMUpo9_Vk>NZ%cWr9(Nm(Jnxm+$|SdKpWAlGW$bI@Ae%wa38b8w1>EnK(gMs3g~
vu;y)dzz3O&i1Y{N<=2m9YfpTiEO`u_>`Az0;T4SEexGD8<%X<$@aFY@U*SAf

literal 0
HcmV?d00001

diff --git a/test_single_doc_file.py b/test_single_doc_file.py
new file mode 100644
index 0000000..5420d7b
--- /dev/null
+++ b/test_single_doc_file.py
@@ -0,0 +1,424 @@
+#!/usr/bin/env python3
+"""
+Test all code examples from a documentation file with REAL models.
+
+This script extracts all Python code blocks from a documentation markdown file
+and runs each one as a separate test with real OpenAI and Google models.
+
+The script can find documentation files in multiple ways:
+- Absolute path: /path/to/file.md
+- Relative to project root: docs/docs/examples/classification.md
+- Just filename (searches docs/ tree): classification.md
+
+Usage:
+    # Using just filename (searches in docs/ directory)
+    python test_single_doc_file.py classification.md
+
+    # Using relative path from project root
+    python test_single_doc_file.py docs/docs/examples/classification.md
+    python test_single_doc_file.py README.md
+    python test_single_doc_file.py docs/docs/getting-started/quickstart.md
+
+    # Syntax check only (no API calls)
+    python test_single_doc_file.py --syntax-only classification.md
+
+    # Save test results to files
+    python test_single_doc_file.py --save-results classification.md
+"""
+
+import re
+import subprocess
+import sys
+import tempfile
+import time
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+
+def extract_code_blocks(markdown_file: Path) -> List[Tuple[str, int, int]]:
+    """
+    Collect every fenced ```python block found in a markdown file.
+
+    Each returned tuple is (code, block_number, line_number): the text
+    between the fences, the block's 1-based position in the file, and the
+    1-based line on which its opening fence sits.
+    """
+    content = markdown_file.read_text(encoding='utf-8')
+
+    # Non-greedy DOTALL match so each block ends at its own closing fence.
+    fence_re = re.compile(r'```python\n(.*?)```', re.DOTALL)
+
+    # Line number = count of newlines before the opening fence, plus one.
+    return [
+        (found.group(1), index, content[:found.start()].count('\n') + 1)
+        for index, found in enumerate(fence_re.finditer(content), 1)
+    ]
+
+
+def prepare_code_for_real_models(code: str) -> str:
+    """
+    Rewrite @evaluate(...) model names to real OpenAI/Google models (at most
+    two: gpt-3.5-turbo, gemini-2.5-flash) and swap placeholder datasets.
+    """
+    import re
+
+    # Callback for re.sub below: rebuilds one matched @evaluate(...) decorator
+    def replace_evaluate_models(match):
+        decorator = match.group(0)
+
+        # Extract the content inside @evaluate(...)
+        content = re.search(r'@evaluate\((.*)\)', decorator, re.DOTALL)
+        if not content:
+            return decorator
+
+        params = content.group(1)
+
+        # Every double-quoted string in the arguments is counted as a model name
+        # (a quoted keyword value is counted too; single quotes are not matched)
+        model_pattern = r'"([^"]+)"'
+        models = re.findall(model_pattern, params)
+
+        if not models:
+            return decorator
+
+        # Substitute the real model names, keeping the original count but
+        # capping it at two (gpt-3.5-turbo first, then gemini-2.5-flash)
+        num_models = min(len(models), 2)
+        new_models = ['"gpt-3.5-turbo"', '"gemini-2.5-flash"'][:num_models]
+
+        # Single-model case; the slice above already yields this same list
+        if len(models) == 1:
+            new_models = ['"gpt-3.5-turbo"']
+
+        # Detect keyword arguments: strip the quoted strings and look for
+        # an '=' in whatever remains of the argument list
+        kwargs = []
+        # Remove all quoted strings and see what's left
+        params_without_strings = re.sub(r'"[^"]*"', '', params)
+        if '=' in params_without_strings:
+            # Capture the trailing run of name=value pairs as one string
+            kwargs_match = re.search(r',?\s*(\w+\s*=\s*[^,)]+(?:,\s*\w+\s*=\s*[^,)]+)*)\s*$', params)
+            if kwargs_match:
+                kwargs.append(kwargs_match.group(1))
+
+        # Reconstruct the decorator
+        result = '@evaluate(' + ', '.join(new_models)
+        if kwargs:
+            result += ', ' + ', '.join(kwargs)
+        result += ')'
+
+        return result
+
+    # NOTE: [^)]+ stops at the first ')', so arguments containing parentheses are only partly matched
+    modified_code = re.sub(r'@evaluate\([^)]+\)', replace_evaluate_models, code)
+
+    # Replace placeholder dataset loading with small inline datasets
+    if 'load_dataset("data/qa_1000.json")' in modified_code:
+        # Add the factory to every 'from benchwise import' unless its name already appears
+        if 'from benchwise' in modified_code and 'create_qa_dataset' not in modified_code:
+            modified_code = modified_code.replace(
+                'from benchwise import',
+                'from benchwise import create_qa_dataset,'
+            )
+        modified_code = modified_code.replace(
+            'load_dataset("data/qa_1000.json")',
+            'create_qa_dataset(questions=["What is AI?", "What is ML?"], answers=["Artificial Intelligence", "Machine Learning"], name="qa_test")'
+        )
+
+    if 'load_dataset("data/news_articles.json")' in modified_code:
+        # Add the factory to every 'from benchwise import' unless its name already appears
+        if 'from benchwise' in modified_code and 'create_summarization_dataset' not in modified_code:
+            modified_code = modified_code.replace(
+                'from benchwise import',
+                'from benchwise import create_summarization_dataset,'
+            )
+        modified_code = modified_code.replace(
+            'load_dataset("data/news_articles.json")',
+            'create_summarization_dataset(documents=["Article about AI.", "Article about ML."], summaries=["AI summary", "ML summary"], name="news")'
+        )
+
+    return modified_code
+
+
+def check_syntax(code: str) -> Tuple[bool, Optional[str]]:
+    """Return (True, None) if code parses as Python, else (False, message)."""
+    import ast
+    try:
+        ast.parse(code)
+        return True, None
+    except SyntaxError as e:
+        return False, f"SyntaxError at line {e.lineno}: {e.msg}"
+    except Exception as e:
+        return False, f"Parse error: {str(e)}"
+
+
+def run_code_sync(code: str, timeout: int = 90) -> Tuple[bool, str, Optional[str]]:
+    """Run ``code`` as a script in a subprocess and capture its output.
+
+    Returns ``(success, stdout, error)``; ``error`` is ``None`` on success,
+    the captured stderr on a non-zero exit, or a short message when the run
+    times out or cannot be started. The temporary script is always removed.
+    """
+    temp_path: Optional[Path] = None
+    try:
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
+            f.write(code)
+            temp_path = Path(f.name)
+
+        # Use the current interpreter so the child sees the same environment.
+        result = subprocess.run(
+            [sys.executable, str(temp_path)],
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+            cwd=Path(__file__).parent,
+        )
+    except subprocess.TimeoutExpired:
+        return False, "", f"Timeout after {timeout}s"
+    except Exception as e:
+        return False, "", f"Error: {str(e)}"
+    finally:
+        # Runs on every path, including timeouts, so no temp files leak.
+        if temp_path is not None:
+            temp_path.unlink(missing_ok=True)
+
+    if result.returncode == 0:
+        return True, result.stdout, None
+    return False, result.stdout, result.stderr
+
+
+def test_code_block(code: str, block_num: int, line_num: int, syntax_only: bool = False) -> Tuple[bool, Optional[str]]:
+    """Check one code block; return (passed, output or error text, None if syntax-only)."""
+    # Check syntax
+    syntax_valid, syntax_error = check_syntax(code)
+    if not syntax_valid:
+        print("❌ SYNTAX ERROR")
+        return False, f"Syntax Error: {syntax_error}"
+
+    if syntax_only:
+        print("✅ SYNTAX VALID")
+        return True, None
+
+    # Prepare code with real models
+    prepared_code = prepare_code_for_real_models(code)
+
+    # Skip incomplete examples (just function definitions without execution)
+    if '@evaluate(' in prepared_code and 'asyncio.run' not in prepared_code:
+        print("⏭️  SKIPPED (incomplete example - defines functions only)")
+        return True, "Skipped: Incomplete example"
+
+    # Run the code
+    print("⏳ Running test...", end=" ", flush=True)
+    start_time = time.time()
+    success, output, error = run_code_sync(prepared_code, timeout=90)
+    duration = time.time() - start_time
+
+    if success:
+        print(f"✅ PASSED ({duration:.2f}s)")
+        return True, output
+    else:
+        print(f"❌ FAILED ({duration:.2f}s)")
+        return False, error or output
+
+
+def main():
+    import argparse
+    import json
+    from datetime import datetime
+
+    parser = argparse.ArgumentParser(description="Test Python code examples from a documentation file")
+    parser.add_argument('file', help='Documentation file to test. Can be:\n'
+                                     '  - Relative path from project root (e.g., docs/docs/examples/classification.md)\n'
+                                     '  - Absolute path (e.g., /path/to/file.md)\n'
+                                     '  - Just filename (will search in docs/ directory tree)')
+    parser.add_argument('--syntax-only', action='store_true', help='Only check syntax')
+    parser.add_argument('--save-results', action='store_true', help='Save test results to files')
+    args = parser.parse_args()
+
+    # Find the documentation file
+    project_root = Path(__file__).parent
+    file_arg = Path(args.file)
+
+    # Try different strategies to find the file
+    doc_file = None
+
+    # Strategy 1: Absolute path
+    if file_arg.is_absolute() and file_arg.exists():
+        doc_file = file_arg
+
+    # Strategy 2: Relative to project root
+    elif (project_root / file_arg).exists():
+        doc_file = project_root / file_arg
+
+    # Strategy 3: Search in docs directory tree
+    else:
+        docs_dir = project_root / 'docs'
+        if docs_dir.exists():
+            # Search for the file in docs directory tree
+            for candidate in docs_dir.rglob(file_arg.name if file_arg.name else args.file):
+                if candidate.is_file():
+                    doc_file = candidate
+                    break
+
+    if doc_file is None or not doc_file.exists():
+        print(f"❌ Error: File not found: {args.file}")
+        print(f"\nSearched in:")
+        print(f"  - Absolute path: {file_arg if file_arg.is_absolute() else 'N/A'}")
+        print(f"  - Relative to project: {project_root / file_arg}")
+        print(f"  - In docs/ directory tree")
+        return 1
+
+    # Get relative path for display
+    try:
+        display_path = doc_file.relative_to(project_root)
+    except ValueError:
+        display_path = doc_file
+
+    print(f"\n🧪 Testing Documentation Examples")
+    print(f"📄 File: {display_path}")
+
+    if args.syntax_only:
+        print("⚙️  Mode: Syntax check only")
+    else:
+        print("⚙️  Mode: Full execution with REAL models")
+        print("🤖 Models: gpt-3.5-turbo, gemini-2.5-flash")
+        print("⚠️  Note: This will make actual API calls and incur costs")
+
+    # Extract code blocks
+    code_blocks = extract_code_blocks(doc_file)
+
+    if not code_blocks:
+        print(f"\n❌ No Python code blocks found in {args.file}")
+        return 1
+
+    print(f"📝 Total code blocks: {len(code_blocks)}\n")
+    print("=" * 80)
+
+    # Test each code block
+    results = []
+    for code, block_num, line_num in code_blocks:
+        print(f"\n{'=' * 80}")
+        print(f"TEST {block_num}/{len(code_blocks)}: Block {block_num} (Line {line_num})")
+        print("=" * 80)
+
+        success, output_or_error = test_code_block(code, block_num, line_num, args.syntax_only)
+        results.append((block_num, success, output_or_error))
+
+        # Show output
+        if success and output_or_error and output_or_error.strip() and not args.syntax_only:
+            print("\n📋 OUTPUT:")
+            print("-" * 80)
+            output_lines = output_or_error.strip().split('\n')
+            for line in output_lines[:50]:  # Show first 50 lines
+                print(f"  {line}")
+            if len(output_lines) > 50:
+                print(f"  ... ({len(output_lines) - 50} more lines)")
+            print("-" * 80)
+        elif not success and output_or_error:
+            print("\n❌ ERROR:")
+            print("-" * 80)
+            error_lines = output_or_error.split('\n')
+            for line in error_lines[:40]:  # Show first 40 lines
+                print(f"  {line}")
+            if len(error_lines) > 40:
+                print(f"  ... ({len(error_lines) - 40} more lines)")
+            print("-" * 80)
+
+    # Summary
+    print(f"\n{'=' * 80}")
+    print("SUMMARY")
+    print("=" * 80)
+
+    total = len(results)
+    passed = sum(1 for _, success, _ in results if success)
+    failed = total - passed
+
+    print(f"\nFile: {display_path}")
+    print(f"Total: {total} code blocks")
+    print(f"✅ Passed: {passed}")
+    print(f"❌ Failed: {failed}")
+    print(f"Success Rate: {passed/total*100:.1f}%")
+
+    # Show failures
+    if failed > 0:
+        print(f"\n{'-' * 80}")
+        print("FAILED TESTS")
+        print("-" * 80)
+        for block_num, success, output_or_error in results:
+            if not success:
+                print(f"\n❌ Block {block_num}")
+                if output_or_error:
+                    print(f"   {output_or_error[:200]}")
+
+    print(f"\n{'=' * 80}\n")
+
+    # Save results if requested
+    if args.save_results:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        # Create a clean base name from the file path
+        base_name = doc_file.stem  # Gets filename without extension
+
+        # Create results directory
+        results_dir = Path(__file__).parent / 'test_results'
+        results_dir.mkdir(exist_ok=True)
+
+        # Save JSON results (detailed)
+        json_file = results_dir / f"{base_name}_{timestamp}.json"
+        json_data = {
+            "file": str(display_path),
+            "full_path": str(doc_file),
+            "timestamp": datetime.now().isoformat(),
+            "total": total,
+            "passed": passed,
+            "failed": failed,
+            "success_rate": passed/total*100,
+            "syntax_only": args.syntax_only,
+            "results": [
+                {
+                    "block": block_num,
+                    "success": success,
+                    "output": output_or_error[:500] if output_or_error else None,  # Truncate long outputs
+                }
+                for block_num, success, output_or_error in results
+            ]
+        }
+
+        with open(json_file, 'w') as f:
+            json.dump(json_data, f, indent=2)
+
+        print(f"💾 JSON results saved to: {json_file}")
+
+        # Save Markdown summary
+        md_file = results_dir / f"{base_name}_{timestamp}.md"
+        with open(md_file, 'w') as f:
+            f.write(f"# Test Results: {display_path}\n\n")
+            f.write(f"**File:** `{doc_file}`\n\n")
+            f.write(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+            f.write(f"**Models:** gpt-3.5-turbo, gemini-2.5-flash\n\n")
+            f.write(f"## Summary\n\n")
+            f.write(f"- Total Tests: {total}\n")
+            f.write(f"- ✅ Passed: {passed}\n")
+            f.write(f"- ❌ Failed: {failed}\n")
+            f.write(f"- Success Rate: {passed/total*100:.1f}%\n\n")
+
+            if failed > 0:
+                f.write(f"## Failed Tests\n\n")
+                for block_num, success, output_or_error in results:
+                    if not success:
+                        f.write(f"### Block {block_num}\n\n")
+                        f.write(f"```\n{output_or_error[:300] if output_or_error else 'No error details'}\n```\n\n")
+
+        print(f"📝 Markdown summary saved to: {md_file}")
+
+        # Save to latest file (overwrite)
+        latest_json = results_dir / f"{base_name}_latest.json"
+        with open(latest_json, 'w') as f:
+            json.dump(json_data, f, indent=2)
+
+        print(f"📌 Latest results: {latest_json}")
+
+    return 0 if failed == 0 else 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())

From 2a08836689974bd617ccd3a90ed46c4b8daed646 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Tue, 2 Dec 2025 23:10:25 +0530
Subject: [PATCH 09/24] fix: Add proper type annotations to config, models,
 results, core, and cli

---
 benchwise/cli.py     | 18 +++++++++---------
 benchwise/core.py    | 16 ++++++++--------
 benchwise/metrics.py |  2 +-
 benchwise/results.py | 26 +++++++++++++-------------
 mypy.ini             |  1 +
 5 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/benchwise/cli.py b/benchwise/cli.py
index e4905bc..a799987 100644
--- a/benchwise/cli.py
+++ b/benchwise/cli.py
@@ -5,7 +5,7 @@
 import argparse
 import asyncio
 import sys
-from typing import List, Optional
+from typing import List, Optional, Any, Dict
 
 from . import __version__
 from .datasets import load_dataset
@@ -206,7 +206,7 @@ async def run_evaluation(
             for metric_name in metrics:
                 try:
                     if metric_name == "accuracy":
-                        metric_result = accuracy(responses, references)
+                        metric_result: Any = accuracy(responses, references)
                         results["accuracy"] = metric_result["accuracy"]
                     elif metric_name == "rouge_l":
                         metric_result = rouge_l(responses, references)
@@ -285,7 +285,7 @@ async def run_evaluation(
     return benchmark_result
 
 
-async def configure_api(args):
+async def configure_api(args: Any) -> None:
     """Configure Benchwise API settings."""
     from .config import reset_config
 
@@ -321,7 +321,7 @@ async def configure_api(args):
         print("No configuration changes specified. Use --show to see current config.")
 
 
-async def sync_offline(args):
+async def sync_offline(args: Any) -> None:
     """Sync offline results with the API."""
     try:
         client = await get_client()
@@ -354,7 +354,7 @@ async def sync_offline(args):
         pass
 
 
-async def show_status(args):
+async def show_status(args: Any) -> None:
     """Show Benchwise status information."""
     config = get_api_config()
     client = None
@@ -412,7 +412,7 @@ async def show_status(args):
         pass
 
 
-def list_resources(resource_type: str):
+def list_resources(resource_type: str) -> None:
     """List available resources."""
     if resource_type == "models":
         print("Available model adapters:")
@@ -440,7 +440,7 @@ def list_resources(resource_type: str):
         )
 
 
-def validate_dataset(dataset_path: str):
+def validate_dataset(dataset_path: str) -> None:
     """Validate dataset format."""
     try:
         dataset = load_dataset(dataset_path)
@@ -478,7 +478,7 @@ def validate_dataset(dataset_path: str):
         sys.exit(1)
 
 
-async def compare_results(result_paths: List[str], metric: Optional[str] = None):
+async def compare_results(result_paths: List[str], metric: Optional[str] = None) -> None:
     """Compare evaluation results."""
     from .results import load_results, ResultsAnalyzer
 
@@ -509,7 +509,7 @@ async def compare_results(result_paths: List[str], metric: Optional[str] = None)
         sys.exit(1)
 
 
-def main():
+def main() -> None:
     """Main CLI entry point."""
     parser = create_parser()
     args = parser.parse_args()
diff --git a/benchwise/core.py b/benchwise/core.py
index b682cb0..fe34af5 100644
--- a/benchwise/core.py
+++ b/benchwise/core.py
@@ -41,17 +41,17 @@ async def test_qa(model, dataset):
             return accuracy(responses, dataset.references)
     """
 
-    def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]]:
+    def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[..., Awaitable[List[EvaluationResult]]]:
         if not inspect.iscoroutinefunction(test_func):
             raise TypeError(
                 f"{test_func.__name__} must be an async function. "
                 f"Use: async def {test_func.__name__}(model, dataset):"
             )
-        
+
         @wraps(test_func)
-        async def wrapper(dataset: Dataset, **test_kwargs) -> List[EvaluationResult]:
+        async def wrapper(dataset: Dataset, **test_kwargs: Any) -> List[EvaluationResult]:
             return await _run_evaluation(test_func, dataset, models, upload, kwargs, test_kwargs)
-        
+
         if hasattr(test_func, "_benchmark_metadata"):
             wrapper._benchmark_metadata = test_func._benchmark_metadata  # type: ignore[attr-defined]
 
@@ -154,7 +154,7 @@ def decorator(test_func: Callable[P, R]) -> Callable[P, R]:
     return decorator
 
 
-def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, Exception]]]]]:
+def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, BaseException]]]]]:
     """
     Decorator for stress testing LLMs.
 
@@ -166,12 +166,12 @@ async def load_test(model, dataset):
             pass
     """
 
-    def decorator(test_func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[List[Union[R, Exception]]]]:
+    def decorator(test_func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[List[Union[R, BaseException]]]]:
         @wraps(test_func)
-        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> List[Union[R, Exception]]:
+        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> List[Union[R, BaseException]]:
             logger.info(f"Starting stress test: {concurrent_requests} concurrent requests for {duration}s")
 
-            tasks: List[Union[R, Exception]] = []
+            tasks: List[Union[R, BaseException]] = []
             start_time = time.time()
 
             while time.time() - start_time < duration:
diff --git a/benchwise/metrics.py b/benchwise/metrics.py
index 25ceb3a..b63ff67 100644
--- a/benchwise/metrics.py
+++ b/benchwise/metrics.py
@@ -697,7 +697,7 @@ def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, An
             "transformers and torch packages not installed. Please install them with: pip install 'benchwise[transformers]' or pip install transformers torch"
         )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name)  # type: ignore[no-untyped-call]
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
 
     perplexities = []
diff --git a/benchwise/results.py b/benchwise/results.py
index 397a8b8..330a73e 100644
--- a/benchwise/results.py
+++ b/benchwise/results.py
@@ -57,7 +57,7 @@ def to_dict(self) -> Dict[str, Any]:
             "success": self.success,
         }
 
-    def get_score(self, metric_name: str = None) -> Union[float, Any]:
+    def get_score(self, metric_name: Optional[str] = None) -> Union[float, Any]:
         """
         Extract a specific score from the result.
 
@@ -93,7 +93,7 @@ class BenchmarkResult:
     metadata: Dict[str, Any] = field(default_factory=dict)
     timestamp: datetime = field(default_factory=datetime.now)
 
-    def add_result(self, result: EvaluationResult):
+    def add_result(self, result: EvaluationResult) -> None:
         """Add an evaluation result to the benchmark."""
         self.results.append(result)
 
@@ -119,7 +119,7 @@ def success_rate(self) -> float:
             return 0.0
         return len(self.successful_results) / len(self.results)
 
-    def get_best_model(self, metric_name: str = None) -> Optional[EvaluationResult]:
+    def get_best_model(self, metric_name: Optional[str] = None) -> Optional[EvaluationResult]:
         """
         Get the best performing model result.
 
@@ -135,7 +135,7 @@ def get_best_model(self, metric_name: str = None) -> Optional[EvaluationResult]:
 
         return max(successful_results, key=lambda r: r.get_score(metric_name) or 0)
 
-    def get_worst_model(self, metric_name: str = None) -> Optional[EvaluationResult]:
+    def get_worst_model(self, metric_name: Optional[str] = None) -> Optional[EvaluationResult]:
         """
         Get the worst performing model result.
 
@@ -153,7 +153,7 @@ def get_worst_model(self, metric_name: str = None) -> Optional[EvaluationResult]
             successful_results, key=lambda r: r.get_score(metric_name) or float("inf")
         )
 
-    def compare_models(self, metric_name: str = None) -> Dict[str, Any]:
+    def compare_models(self, metric_name: Optional[str] = None) -> Dict[str, Any]:
         """
         Compare all models in the benchmark.
 
@@ -241,12 +241,12 @@ def to_dataframe(self) -> pd.DataFrame:
 
         return pd.DataFrame(data)
 
-    def save_to_json(self, file_path: Union[str, Path]):
+    def save_to_json(self, file_path: Union[str, Path]) -> None:
         """Save benchmark results to JSON file."""
         with open(file_path, "w") as f:
             json.dump(self.to_dict(), f, indent=2, default=str)
 
-    def save_to_csv(self, file_path: Union[str, Path]):
+    def save_to_csv(self, file_path: Union[str, Path]) -> None:
         """Save benchmark results to CSV file."""
         df = self.to_dataframe()
         df.to_csv(file_path, index=False)
@@ -257,7 +257,7 @@ class ResultsAnalyzer:
 
     @staticmethod
     def compare_benchmarks(
-        benchmark_results: List[BenchmarkResult], metric_name: str = None
+        benchmark_results: List[BenchmarkResult], metric_name: Optional[str] = None
     ) -> Dict[str, Any]:
         """
         Compare results across multiple benchmarks.
@@ -269,7 +269,7 @@ def compare_benchmarks(
         Returns:
             Dictionary with cross-benchmark comparison
         """
-        comparison = {"benchmarks": [], "models": set(), "cross_benchmark_scores": {}}
+        comparison: Dict[str, Any] = {"benchmarks": [], "models": set(), "cross_benchmark_scores": {}}
 
         for benchmark in benchmark_results:
             benchmark_info = {
@@ -300,7 +300,7 @@ def compare_benchmarks(
 
     @staticmethod
     def analyze_model_performance(
-        results: List[EvaluationResult], metric_name: str = None
+        results: List[EvaluationResult], metric_name: Optional[str] = None
     ) -> Dict[str, Any]:
         """
         Analyze performance of a single model across multiple evaluations.
@@ -480,7 +480,7 @@ def _get_cache_key(self, model_name: str, test_name: str, dataset_hash: str) ->
         key_data = f"{model_name}_{test_name}_{dataset_hash}"
         return hashlib.md5(key_data.encode()).hexdigest()
 
-    def save_result(self, result: EvaluationResult, dataset_hash: str):
+    def save_result(self, result: EvaluationResult, dataset_hash: str) -> None:
         """Save evaluation result to cache."""
         cache_key = self._get_cache_key(
             result.model_name, result.test_name, dataset_hash
@@ -516,7 +516,7 @@ def load_result(
         except Exception:
             return None
 
-    def clear_cache(self):
+    def clear_cache(self) -> None:
         """Clear all cached results."""
         for cache_file in self.cache_dir.glob("*.json"):
             cache_file.unlink()
@@ -546,7 +546,7 @@ def list_cached_results(self) -> List[Dict[str, Any]]:
 
 def save_results(
     benchmark_result: BenchmarkResult, file_path: Union[str, Path], format: str = "json"
-):
+) -> None:
     """
     Save benchmark results to file.
 
diff --git a/mypy.ini b/mypy.ini
index 1ccb1bb..23d3a7c 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -34,6 +34,7 @@ ignore_missing_imports = True
 
 [mypy-transformers.*]
 ignore_missing_imports = True
+follow_imports = skip
 
 [mypy-torch.*]
 ignore_missing_imports = True

From a7a2a2ed58568262f52704823b819f7598b3b16c Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Tue, 2 Dec 2025 23:44:03 +0530
Subject: [PATCH 10/24] fix: Complete type annotations for datasets.py and
 client.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All mypy errors resolved (82 → 0): fixed type narrowing, added cast() calls around response.json() results, and added missing return type annotations. Tests: 124/126 passing.
---
 benchwise/client.py   | 83 +++++++++++++++++++++++--------------------
 benchwise/datasets.py | 62 +++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 59 deletions(-)

diff --git a/benchwise/client.py b/benchwise/client.py
index fc0156a..00a9651 100644
--- a/benchwise/client.py
+++ b/benchwise/client.py
@@ -2,7 +2,7 @@
 import asyncio
 import uuid
 import logging
-from typing import Dict, Any, Optional, List
+from typing import Dict, Any, Optional, List, cast
 from datetime import datetime
 from contextvars import ContextVar
 
@@ -19,7 +19,7 @@
 class BenchwiseAPIError(Exception):
     """Enhanced exception with error codes and retry info."""
 
-    def __init__(self, message: str, status_code: int = None, retry_after: int = None, request_id: str = None):
+    def __init__(self, message: str, status_code: Optional[int] = None, retry_after: Optional[int] = None, request_id: Optional[str] = None):
         super().__init__(message)
         self.status_code = status_code
         self.retry_after = retry_after
@@ -56,21 +56,21 @@ def __init__(self, api_url: Optional[str] = None, api_key: Optional[str] = None)
         self.benchmark_cache: Dict[str, int] = {}
 
         # Offline queue for storing results when API is unavailable
-        self.offline_queue = []
+        self.offline_queue: List[Dict[str, Any]] = []
         self.offline_mode = False
 
         # Track if client is closed
         self._closed = False
-        
+
         logger.debug(f"BenchwiseClient initialized with API URL: {self.api_url}")
 
-    async def __aenter__(self):
+    async def __aenter__(self) -> "BenchwiseClient":
         return self
 
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
+    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
         await self.close()
 
-    async def close(self):
+    async def close(self) -> None:
         """Close the HTTP client."""
         if not self._closed:
             await self.client.aclose()
@@ -78,7 +78,7 @@ async def close(self):
             logger.debug("BenchwiseClient closed")
 
     async def _make_request_with_retry(
-        self, method: str, url: str, **kwargs
+        self, method: str, url: str, **kwargs: Any
     ) -> httpx.Response:
         """Make HTTP request with automatic retry logic and request ID tracking."""
         max_retries = 3
@@ -138,7 +138,7 @@ async def _make_request_with_retry(
 
         raise BenchwiseAPIError("Max retries exceeded", request_id=request_id)
 
-    def _set_auth_header(self):
+    def _set_auth_header(self) -> None:
         """Set JWT authorization header if token is available."""
         if self.jwt_token:
             self.client.headers["Authorization"] = f"Bearer {self.jwt_token}"
@@ -225,7 +225,7 @@ async def register(
 
             if response.status_code == 201:
                 logger.info(f"Registration successful for user: {username}")
-                return response.json()
+                return cast(Dict[str, Any], response.json())
             elif response.status_code == 400:
                 error_detail = response.json().get("detail", "Registration failed")
                 logger.error(f"Registration failed: {error_detail}")
@@ -252,7 +252,7 @@ async def get_current_user(self) -> Dict[str, Any]:
             response = await self.client.get("/api/v1/users/me")
 
             if response.status_code == 200:
-                return response.json()
+                return cast(Dict[str, Any], response.json())
             elif response.status_code == 401:
                 logger.warning("Authentication expired")
                 raise BenchwiseAPIError("Authentication expired - please login again")
@@ -322,8 +322,8 @@ async def register_model(
             response = await self.client.post("/api/v1/models", json=model_data)
 
             if response.status_code == 201:
-                model_info = response.json()
-                model_db_id = model_info["id"]
+                model_info = cast(Dict[str, Any], response.json())
+                model_db_id = cast(int, model_info["id"])
                 self.model_cache[cache_key] = model_db_id
                 logger.info(f"Model registered successfully with ID: {model_db_id}")
                 return model_db_id
@@ -349,14 +349,15 @@ async def _get_existing_model(self, provider: str, model_id: str) -> int:
             )
 
             if response.status_code == 200:
-                models = response.json()
+                models = cast(List[Dict[str, Any]], response.json())
                 # Filter in Python since backend doesn't support model_id parameter
                 for model in models:
                     if model["provider"] == provider and model["model_id"] == model_id:
                         cache_key = f"{provider}:{model_id}"
-                        self.model_cache[cache_key] = model["id"]
-                        logger.debug(f"Found existing model with ID: {model['id']}")
-                        return model["id"]
+                        model_id_value = cast(int, model["id"])
+                        self.model_cache[cache_key] = model_id_value
+                        logger.debug(f"Found existing model with ID: {model_id_value}")
+                        return model_id_value
 
                 raise BenchwiseAPIError(f"Model {provider}:{model_id} not found")
             else:
@@ -410,8 +411,8 @@ async def register_benchmark(
             response = await self.client.post("/api/v1/benchmarks", json=benchmark_data)
 
             if response.status_code == 201:
-                benchmark_info = response.json()
-                benchmark_db_id = benchmark_info["id"]
+                benchmark_info = cast(Dict[str, Any], response.json())
+                benchmark_db_id = cast(int, benchmark_info["id"])
                 self.benchmark_cache[benchmark_name] = benchmark_db_id
                 logger.info(f"Benchmark registered successfully with ID: {benchmark_db_id}")
                 return benchmark_db_id
@@ -437,20 +438,22 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int:
             )
 
             if response.status_code == 200:
-                benchmarks = response.json()
+                benchmarks = cast(List[Dict[str, Any]], response.json())
                 # Look for exact name match first, then partial match
                 for benchmark in benchmarks:
                     if benchmark["name"] == benchmark_name:
-                        self.benchmark_cache[benchmark_name] = benchmark["id"]
-                        logger.debug(f"Found existing benchmark with ID: {benchmark['id']}")
-                        return benchmark["id"]
+                        benchmark_id_value = cast(int, benchmark["id"])
+                        self.benchmark_cache[benchmark_name] = benchmark_id_value
+                        logger.debug(f"Found existing benchmark with ID: {benchmark_id_value}")
+                        return benchmark_id_value
 
                 # If no exact match, try partial match
                 for benchmark in benchmarks:
                     if benchmark_name.lower() in benchmark["name"].lower():
-                        self.benchmark_cache[benchmark_name] = benchmark["id"]
-                        logger.debug(f"Found similar benchmark with ID: {benchmark['id']}")
-                        return benchmark["id"]
+                        benchmark_id_value = cast(int, benchmark["id"])
+                        self.benchmark_cache[benchmark_name] = benchmark_id_value
+                        logger.debug(f"Found similar benchmark with ID: {benchmark_id_value}")
+                        return benchmark_id_value
 
                 raise BenchwiseAPIError(f"Benchmark {benchmark_name} not found")
             else:
@@ -466,7 +469,7 @@ async def create_evaluation(
         name: str,
         benchmark_id: int,
         model_ids: List[int],
-        metadata: Optional[Dict] = None,
+        metadata: Optional[Dict[str, Any]] = None,
     ) -> int:
         """
         Create evaluation with correct backend format.
@@ -495,9 +498,10 @@ async def create_evaluation(
             )
 
             if response.status_code == 201:
-                evaluation_info = response.json()
-                logger.info(f"Evaluation created successfully with ID: {evaluation_info['id']}")
-                return evaluation_info["id"]
+                evaluation_info = cast(Dict[str, Any], response.json())
+                evaluation_id = cast(int, evaluation_info["id"])
+                logger.info(f"Evaluation created successfully with ID: {evaluation_id}")
+                return evaluation_id
             elif response.status_code == 401:
                 raise BenchwiseAPIError(
                     "Authentication required for creating evaluations"
@@ -709,7 +713,7 @@ async def get_benchmarks(
             )
 
             if response.status_code == 200:
-                return response.json()
+                return cast(List[Dict[str, Any]], response.json())
             else:
                 raise BenchwiseAPIError(
                     f"Failed to retrieve benchmarks: {response.status_code}"
@@ -728,7 +732,7 @@ async def get_evaluations(
             )
 
             if response.status_code == 200:
-                return response.json()
+                return cast(List[Dict[str, Any]], response.json())
             else:
                 raise BenchwiseAPIError(
                     f"Failed to retrieve evaluations: {response.status_code}"
@@ -737,7 +741,7 @@ async def get_evaluations(
         except httpx.RequestError as e:
             raise BenchwiseAPIError(f"Network error retrieving evaluations: {e}")
 
-    async def _add_to_offline_queue(self, data: Dict[str, Any]):
+    async def _add_to_offline_queue(self, data: Dict[str, Any]) -> None:
         """Add data to offline queue for later upload."""
         self.offline_queue.append(
             {"data": data, "timestamp": datetime.now().isoformat()}
@@ -818,9 +822,10 @@ async def upload_dataset_for_benchmark(
                 )
 
             if response.status_code == 200:
-                result = response.json()
+                result = cast(Dict[str, Any], response.json())
+                file_url = cast(str, result["file_info"]["url"])
                 logger.info("Dataset uploaded successfully")
-                return result["file_info"]["url"]
+                return file_url
             else:
                 raise BenchwiseAPIError(
                     f"Failed to upload dataset: {response.status_code}"
@@ -859,8 +864,8 @@ async def create_benchmark_with_dataset(
                 f"Failed to create benchmark: {response.status_code}"
             )
 
-        benchmark = response.json()
-        benchmark_id = benchmark["id"]
+        benchmark = cast(Dict[str, Any], response.json())
+        benchmark_id = cast(int, benchmark["id"])
 
         # 2. Upload dataset
         try:
@@ -899,10 +904,10 @@ async def get_client() -> BenchwiseClient:
     return client
 
 
-async def close_client():
+async def close_client() -> None:
     """Close the context-local client."""
     client = _client_context.get()
-    
+
     if client and not client._closed:
         try:
             await client.close()
diff --git a/benchwise/datasets.py b/benchwise/datasets.py
index b4fca30..2d8d1eb 100644
--- a/benchwise/datasets.py
+++ b/benchwise/datasets.py
@@ -124,13 +124,13 @@ def split(
         train_dataset = Dataset(
             name=f"{self.name}_train",
             data=train_data,
-            metadata={**self.metadata, "split": "train", "train_ratio": train_ratio},
+            metadata={**(self.metadata or {}), "split": "train", "train_ratio": train_ratio},
         )
 
         test_dataset = Dataset(
             name=f"{self.name}_test",
             data=test_data,
-            metadata={**self.metadata, "split": "test", "train_ratio": train_ratio},
+            metadata={**(self.metadata or {}), "split": "test", "train_ratio": train_ratio},
         )
 
         return train_dataset, test_dataset
@@ -154,7 +154,7 @@ def to_json(self, file_path: Optional[str] = None) -> str:
 
         return json_data
 
-    def to_csv(self, file_path: str):
+    def to_csv(self, file_path: str) -> None:
         """Export dataset to CSV format."""
         df = pd.DataFrame(self.data)
         df.to_csv(file_path, index=False)
@@ -175,14 +175,15 @@ def validate_schema(self) -> bool:
 
     def get_statistics(self) -> Dict[str, Any]:
         """Get dataset statistics."""
-        stats = {
+        fields: List[str] = list(self.data[0].keys()) if self.data else []
+        stats: Dict[str, Any] = {
             "size": self.size,
-            "fields": list(self.data[0].keys()) if self.data else [],
+            "fields": fields,
             "metadata": self.metadata,
         }
 
         if self.data:
-            for field in stats["fields"]:
+            for field in fields:
                 values = [item.get(field) for item in self.data if field in item]
                 if values:
                     if all(isinstance(v, str) for v in values):
@@ -190,14 +191,16 @@ def get_statistics(self) -> Dict[str, Any]:
                             len(str(v)) for v in values
                         ) / len(values)
                     elif all(isinstance(v, (int, float)) for v in values):
-                        stats[f"{field}_mean"] = sum(values) / len(values)
-                        stats[f"{field}_min"] = min(values)
-                        stats[f"{field}_max"] = max(values)
+                        # Type narrowing: we know values are numeric here
+                        numeric_values = [v for v in values if isinstance(v, (int, float))]
+                        stats[f"{field}_mean"] = sum(numeric_values) / len(numeric_values)
+                        stats[f"{field}_min"] = min(numeric_values)
+                        stats[f"{field}_max"] = max(numeric_values)
 
         return stats
 
 
-def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs) -> Dataset:
+def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dataset:
     """
     Load dataset from various sources.
 
@@ -237,10 +240,15 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs) -> Dataset:
                     data=data,
                     metadata=kwargs.get("metadata", {}),
                 )
+            else:
+                raise ValueError(
+                    f"Invalid JSON format in '{source_path}'. Expected a list or a dict with 'data' key."
+                )
 
         elif source_path.suffix == ".csv":
             df = pd.read_csv(source_path)
-            data = df.to_dict("records")
+            # Type cast: pandas to_dict returns dict[Hashable, Any] but we need dict[str, Any]
+            data = [dict(record) for record in df.to_dict("records")]
 
             return Dataset(
                 name=kwargs.get("name", source_path.stem),
@@ -249,10 +257,12 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs) -> Dataset:
             )
 
         elif str(source).startswith(("http://", "https://")):
-            response = requests.get(source)
+            # Convert to str for requests.get
+            source_str = str(source)
+            response = requests.get(source_str)
             response.raise_for_status()
 
-            if source.endswith(".json"):
+            if source_str.endswith(".json"):
                 data = response.json()
                 if isinstance(data, dict) and "data" in data:
                     return Dataset(
@@ -267,14 +277,26 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs) -> Dataset:
                         data=data,
                         metadata=kwargs.get("metadata", {}),
                     )
+                else:
+                    raise ValueError(
+                        f"Invalid JSON format from '{source_str}'. Expected a list or a dict with 'data' key."
+                    )
+            else:
+                raise ValueError(
+                    f"Unsupported URL format '{source_str}'. Only .json URLs are supported."
+                )
 
         else:
             raise ValueError(
                 f"Unsupported file format '{source_path.suffix}'. Supported formats: .json, .csv"
             )
 
+    raise ValueError(
+        f"Unable to load dataset from source: {source}"
+    )
+
 
-def create_qa_dataset(questions: List[str], answers: List[str], **kwargs) -> Dataset:
+def create_qa_dataset(questions: List[str], answers: List[str], **kwargs: Any) -> Dataset:
     """
     Create a question-answering dataset.
 
@@ -311,7 +333,7 @@ def create_qa_dataset(questions: List[str], answers: List[str], **kwargs) -> Dat
 
 
 def create_summarization_dataset(
-    documents: List[str], summaries: List[str], **kwargs
+    documents: List[str], summaries: List[str], **kwargs: Any
 ) -> Dataset:
     """
     Create a text summarization dataset.
@@ -351,7 +373,7 @@ def create_summarization_dataset(
 
 
 def create_classification_dataset(
-    texts: List[str], labels: List[str], **kwargs
+    texts: List[str], labels: List[str], **kwargs: Any
 ) -> Dataset:
     """
     Create a text classification dataset.
@@ -389,10 +411,10 @@ def create_classification_dataset(
 class DatasetRegistry:
     """Registry for managing multiple datasets."""
 
-    def __init__(self):
+    def __init__(self) -> None:
         self.datasets: Dict[str, Dataset] = {}
 
-    def register(self, dataset: Dataset):
+    def register(self, dataset: Dataset) -> None:
         self.datasets[dataset.name] = dataset
 
     def get(self, name: str) -> Optional[Dataset]:
@@ -401,11 +423,11 @@ def get(self, name: str) -> Optional[Dataset]:
     def list(self) -> List[str]:
         return list(self.datasets.keys())
 
-    def remove(self, name: str):
+    def remove(self, name: str) -> None:
         if name in self.datasets:
             del self.datasets[name]
 
-    def clear(self):
+    def clear(self) -> None:
         self.datasets.clear()
 
 

From f38a7f92d50e22d232c1d05273c102e720286a19 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Tue, 2 Dec 2025 23:48:08 +0530
Subject: [PATCH 11/24] feat: Add mypy to CI/CD and documentation

Add a GitHub Actions CI workflow, a mypy pre-commit hook, and a README "Development" section that documents type checking, tests, and code-quality commands.
---
 .github/workflows/ci.yml | 39 +++++++++++++++++++++++++
 .pre-commit-config.yaml  | 10 +++++++
 README.md                | 61 +++++++++++++++++++++++++++++++++++-----
 3 files changed, 103 insertions(+), 7 deletions(-)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..02accc7
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,39 @@
+name: CI
+
+on:
+  push:
+    branches: [main, develop]
+  pull_request:
+    branches: [main, develop]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev,test]"
+
+      - name: Run ruff linter
+        run: ruff check benchwise tests
+
+      - name: Run ruff formatter check
+        run: ruff format --check benchwise tests
+
+      - name: Run mypy type checker
+        run: mypy benchwise --config-file=mypy.ini
+
+      - name: Run tests
+        run: python run_tests.py --basic
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e46bcb6..d4ec570 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,3 +16,13 @@ repos:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
       - id: ruff-format
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.11.2
+    hooks:
+      - id: mypy
+        additional_dependencies:
+          - types-requests
+          - pandas-stubs
+        args: [--config-file=mypy.ini]
+        files: ^benchwise/
diff --git a/README.md b/README.md
index 77f4939..a03bd55 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ async def test_summarization(model, dataset):
     prompts = [f"Summarize: {item['text']}" for item in dataset.data]
     responses = await model.generate(prompts)
     references = [item['summary'] for item in dataset.data]
-    
+
     scores = rouge_l(responses, references)
     assert scores['f1'] > 0.3  # Minimum quality threshold
     return scores
@@ -84,7 +84,7 @@ Support for major LLM providers:
 # OpenAI models
 @evaluate("gpt-4", "gpt-3.5-turbo")
 
-# Anthropic models  
+# Anthropic models
 @evaluate("claude-3-opus", "claude-3-sonnet")
 
 # Google models
@@ -139,10 +139,10 @@ async def test_medical_qa(model, dataset):
     questions = [f"Q: {item['question']}\nA:" for item in dataset.data]
     answers = await model.generate(questions, temperature=0)
     references = [item['answer'] for item in dataset.data]
-    
+
     accuracy_score = accuracy(answers, references)
     similarity_score = semantic_similarity(answers, references)
-    
+
     return {
         'accuracy': accuracy_score['accuracy'],
         'similarity': similarity_score['mean_similarity']
@@ -156,10 +156,10 @@ async def test_medical_qa(model, dataset):
 @evaluate("gpt-3.5-turbo", "claude-3-haiku")
 async def test_safety(model, dataset):
     responses = await model.generate(dataset.prompts)
-    
+
     safety_scores = safety_score(responses)
     assert safety_scores['mean_safety'] > 0.9  # High safety threshold
-    
+
     return safety_scores
 ```
 
@@ -172,10 +172,57 @@ async def test_performance(model, dataset):
     start_time = time.time()
     response = await model.generate(["Hello, world!"])
     latency = time.time() - start_time
-    
+
     assert latency < 2.0  # Max 2 second response time
     return {'latency': latency}
 ```
 
 
+## Development
+
+### Type Safety
+
+Benchwise uses strict type checking with mypy to ensure code quality:
+
+```bash
+# Run type checker
+mypy benchwise
+
+# Type checking is enforced in CI/CD and pre-commit hooks
+```
+
+All code contributions must pass mypy strict checks. The codebase is fully typed with:
+- Comprehensive type annotations
+- Custom TypedDict definitions in `benchwise/types.py`
+- Type stubs for external dependencies
+
+### Running Tests
+
+```bash
+# Quick validation
+python run_tests.py --basic
+
+# Full test suite
+python run_tests.py
+
+# With coverage
+python run_tests.py --coverage
+```
+
+### Code Quality
+
+```bash
+# Format code
+ruff format .
+
+# Lint code
+ruff check --fix .
+
+# Type check
+mypy benchwise
+
+# Run all checks
+pre-commit run --all-files
+```
+
 Happy evaluating! 🎯

From 87bb74c3f8948ea981403dd31df6fa2fe614d005 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Tue, 2 Dec 2025 23:52:41 +0530
Subject: [PATCH 12/24] chore: Update CI to test only Python 3.12

---
 .github/workflows/ci.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 02accc7..e6ce6a1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,17 +9,14 @@ on:
 jobs:
   test:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.12
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.12"
 
       - name: Install dependencies
         run: |

From e6254c392d700baedd81e4ef2c574b0415fcb4c3 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Wed, 3 Dec 2025 00:20:45 +0530
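Subject: [PATCH 13/24] fix: Remove unused imports and fix type errors

Also wrap long lines and normalize string quotes, add mypy ignore entries for sacrebleu, spacy and google, and delete test_single_doc_file.py (now listed in .gitignore together with GEMINI.md).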
---
 .gitignore              |   4 +-
 benchwise/cli.py        |   6 +-
 benchwise/client.py     |  74 ++++---
 benchwise/config.py     |   3 +-
 benchwise/datasets.py   |  32 +--
 benchwise/metrics.py    |  35 ++--
 mypy.ini                |   9 +
 test_single_doc_file.py | 424 ----------------------------------------
 8 files changed, 113 insertions(+), 474 deletions(-)
 delete mode 100644 test_single_doc_file.py

diff --git a/.gitignore b/.gitignore
index 7132f2e..19573c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -252,4 +252,6 @@ redis-data/
 celery-beat-schedule
 
 # AI files
-CLAUDE.md
\ No newline at end of file
+CLAUDE.md
+GEMINI.md
+test_single_doc_file.py
diff --git a/benchwise/cli.py b/benchwise/cli.py
index a799987..a2bd910 100644
--- a/benchwise/cli.py
+++ b/benchwise/cli.py
@@ -5,7 +5,7 @@
 import argparse
 import asyncio
 import sys
-from typing import List, Optional, Any, Dict
+from typing import List, Optional, Any
 
 from . import __version__
 from .datasets import load_dataset
@@ -478,7 +478,9 @@ def validate_dataset(dataset_path: str) -> None:
         sys.exit(1)
 
 
-async def compare_results(result_paths: List[str], metric: Optional[str] = None) -> None:
+async def compare_results(
+    result_paths: List[str], metric: Optional[str] = None
+) -> None:
     """Compare evaluation results."""
     from .results import load_results, ResultsAnalyzer
 
diff --git a/benchwise/client.py b/benchwise/client.py
index 00a9651..fc724fb 100644
--- a/benchwise/client.py
+++ b/benchwise/client.py
@@ -13,13 +13,21 @@
 logger = logging.getLogger("benchwise.client")
 
 # Context-local client storage (thread-safe)
-_client_context: ContextVar[Optional['BenchwiseClient']] = ContextVar('_client_context', default=None)
+_client_context: ContextVar[Optional["BenchwiseClient"]] = ContextVar(
+    "_client_context", default=None
+)
 
 
 class BenchwiseAPIError(Exception):
     """Enhanced exception with error codes and retry info."""
 
-    def __init__(self, message: str, status_code: Optional[int] = None, retry_after: Optional[int] = None, request_id: Optional[str] = None):
+    def __init__(
+        self,
+        message: str,
+        status_code: Optional[int] = None,
+        retry_after: Optional[int] = None,
+        request_id: Optional[str] = None,
+    ):
         super().__init__(message)
         self.status_code = status_code
         self.retry_after = retry_after
@@ -83,13 +91,13 @@ async def _make_request_with_retry(
         """Make HTTP request with automatic retry logic and request ID tracking."""
         max_retries = 3
         base_delay = 1
-        
+
         # Generate and add request ID
         request_id = generate_request_id()
-        if 'headers' not in kwargs:
-            kwargs['headers'] = {}
-        kwargs['headers']['X-Request-ID'] = request_id
-        
+        if "headers" not in kwargs:
+            kwargs["headers"] = {}
+        kwargs["headers"]["X-Request-ID"] = request_id
+
         logger.debug(f"Making {method} request to {url} [Request-ID: {request_id}]")
 
         for attempt in range(max_retries + 1):
@@ -106,7 +114,9 @@ async def _make_request_with_retry(
                     retry_after = int(
                         response.headers.get("retry-after", base_delay * (2**attempt))
                     )
-                    logger.warning(f"Rate limited, retrying after {retry_after}s [Request-ID: {request_id}]")
+                    logger.warning(
+                        f"Rate limited, retrying after {retry_after}s [Request-ID: {request_id}]"
+                    )
                     if attempt < max_retries:
                         await asyncio.sleep(retry_after)
                         continue
@@ -121,15 +131,19 @@ async def _make_request_with_retry(
                 except Exception:
                     pass
 
-                logger.error(f"Request failed: {error_detail} [Request-ID: {request_id}]")
+                logger.error(
+                    f"Request failed: {error_detail} [Request-ID: {request_id}]"
+                )
                 raise BenchwiseAPIError(
-                    f"{error_detail}", 
+                    f"{error_detail}",
                     status_code=response.status_code,
-                    request_id=request_id
+                    request_id=request_id,
                 )
 
             except httpx.RequestError as e:
-                logger.warning(f"Network error (attempt {attempt + 1}/{max_retries + 1}): {e} [Request-ID: {request_id}]")
+                logger.warning(
+                    f"Network error (attempt {attempt + 1}/{max_retries + 1}): {e} [Request-ID: {request_id}]"
+                )
                 if attempt < max_retries:
                     delay = base_delay * (2**attempt)
                     await asyncio.sleep(delay)
@@ -151,7 +165,7 @@ async def health_check(self) -> bool:
         """Check if the Benchwise API is available."""
         try:
             response = await self.client.get("/health", timeout=5.0)
-            is_healthy = response.status_code == 200
+            is_healthy = bool(response.status_code == 200)
             logger.info(f"Health check: {'healthy' if is_healthy else 'unhealthy'}")
             return is_healthy
         except Exception as e:
@@ -270,17 +284,19 @@ async def upload_benchmark_result_simple(
     ) -> Dict[str, Any]:
         """
         WIP: Simplified single-call upload for benchmark results.
-        
+
         This will be the primary upload method in the next release.
         Currently redirects to the existing multi-step workflow.
-        
+
         Args:
             benchmark_result: BenchmarkResult object to upload
 
         Returns:
             API response data
         """
-        logger.warning("Using legacy multi-step upload workflow. Simplified workflow coming in next release.")
+        logger.warning(
+            "Using legacy multi-step upload workflow. Simplified workflow coming in next release."
+        )
         return await self.upload_benchmark_result(benchmark_result)
 
     async def register_model(
@@ -414,7 +430,9 @@ async def register_benchmark(
                 benchmark_info = cast(Dict[str, Any], response.json())
                 benchmark_db_id = cast(int, benchmark_info["id"])
                 self.benchmark_cache[benchmark_name] = benchmark_db_id
-                logger.info(f"Benchmark registered successfully with ID: {benchmark_db_id}")
+                logger.info(
+                    f"Benchmark registered successfully with ID: {benchmark_db_id}"
+                )
                 return benchmark_db_id
             elif response.status_code == 400:
                 # Benchmark might already exist - try to get it
@@ -444,7 +462,9 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int:
                     if benchmark["name"] == benchmark_name:
                         benchmark_id_value = cast(int, benchmark["id"])
                         self.benchmark_cache[benchmark_name] = benchmark_id_value
-                        logger.debug(f"Found existing benchmark with ID: {benchmark_id_value}")
+                        logger.debug(
+                            f"Found existing benchmark with ID: {benchmark_id_value}"
+                        )
                         return benchmark_id_value
 
                 # If no exact match, try partial match
@@ -452,7 +472,9 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int:
                     if benchmark_name.lower() in benchmark["name"].lower():
                         benchmark_id_value = cast(int, benchmark["id"])
                         self.benchmark_cache[benchmark_name] = benchmark_id_value
-                        logger.debug(f"Found similar benchmark with ID: {benchmark_id_value}")
+                        logger.debug(
+                            f"Found similar benchmark with ID: {benchmark_id_value}"
+                        )
                         return benchmark_id_value
 
                 raise BenchwiseAPIError(f"Benchmark {benchmark_name} not found")
@@ -652,7 +674,9 @@ async def upload_benchmark_result(
             # Step 5: Upload results
             await self.upload_evaluation_results(evaluation_id, results_data)
 
-            logger.info(f"Benchmark result uploaded successfully. Evaluation ID: {evaluation_id}")
+            logger.info(
+                f"Benchmark result uploaded successfully. Evaluation ID: {evaluation_id}"
+            )
             return {
                 "id": evaluation_id,
                 "benchmark_id": benchmark_id,
@@ -890,17 +914,17 @@ async def create_benchmark_with_dataset(
 async def get_client() -> BenchwiseClient:
     """
     Get or create a context-local Benchwise API client.
-    
+
     This uses context variables to ensure thread-safety and proper
     isolation in async contexts.
     """
     client = _client_context.get()
-    
+
     if client is None or client._closed:
         client = BenchwiseClient()
         _client_context.set(client)
         logger.debug("Created new context-local client")
-    
+
     return client
 
 
@@ -935,7 +959,9 @@ async def upload_results(
 
         # Check if API is available
         if not await client.health_check():
-            logger.warning("Benchwise API not available, results will be cached offline")
+            logger.warning(
+                "Benchwise API not available, results will be cached offline"
+            )
             from .results import BenchmarkResult
 
             benchmark_result = BenchmarkResult(
diff --git a/benchwise/config.py b/benchwise/config.py
index 10b07f6..a2e32dc 100644
--- a/benchwise/config.py
+++ b/benchwise/config.py
@@ -452,7 +452,8 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]:
         try:
             import anthropic
 
-            anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+            # Construct the client as a smoke test; this alone does not confirm the key is valid
+            _ = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
             # Note: Anthropic doesn't have a simple test endpoint
             results["anthropic"] = True  # Assume valid if key exists
         except Exception:
diff --git a/benchwise/datasets.py b/benchwise/datasets.py
index 2d8d1eb..a4a60da 100644
--- a/benchwise/datasets.py
+++ b/benchwise/datasets.py
@@ -6,8 +6,6 @@
 from dataclasses import dataclass
 import hashlib
 
-from benchwise.types import DatasetItem, DatasetMetadata
-
 
 @dataclass
 class Dataset:
@@ -74,7 +72,7 @@ def references(self) -> List[str]:
                 or item.get("answer")
                 or item.get("target")
                 or item.get("summary")
-                or item.get("label")  
+                or item.get("label")
             )
             if ref:
                 references.append(str(ref))
@@ -124,13 +122,21 @@ def split(
         train_dataset = Dataset(
             name=f"{self.name}_train",
             data=train_data,
-            metadata={**(self.metadata or {}), "split": "train", "train_ratio": train_ratio},
+            metadata={
+                **(self.metadata or {}),
+                "split": "train",
+                "train_ratio": train_ratio,
+            },
         )
 
         test_dataset = Dataset(
             name=f"{self.name}_test",
             data=test_data,
-            metadata={**(self.metadata or {}), "split": "test", "train_ratio": train_ratio},
+            metadata={
+                **(self.metadata or {}),
+                "split": "test",
+                "train_ratio": train_ratio,
+            },
         )
 
         return train_dataset, test_dataset
@@ -192,8 +198,12 @@ def get_statistics(self) -> Dict[str, Any]:
                         ) / len(values)
                     elif all(isinstance(v, (int, float)) for v in values):
                         # Type narrowing: we know values are numeric here
-                        numeric_values = [v for v in values if isinstance(v, (int, float))]
-                        stats[f"{field}_mean"] = sum(numeric_values) / len(numeric_values)
+                        numeric_values = [
+                            v for v in values if isinstance(v, (int, float))
+                        ]
+                        stats[f"{field}_mean"] = sum(numeric_values) / len(
+                            numeric_values
+                        )
                         stats[f"{field}_min"] = min(numeric_values)
                         stats[f"{field}_max"] = max(numeric_values)
 
@@ -291,12 +301,12 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat
                 f"Unsupported file format '{source_path.suffix}'. Supported formats: .json, .csv"
             )
 
-    raise ValueError(
-        f"Unable to load dataset from source: {source}"
-    )
+    raise ValueError(f"Unable to load dataset from source: {source}")
 
 
-def create_qa_dataset(questions: List[str], answers: List[str], **kwargs: Any) -> Dataset:
+def create_qa_dataset(
+    questions: List[str], answers: List[str], **kwargs: Any
+) -> Dataset:
     """
     Create a question-answering dataset.
 
diff --git a/benchwise/metrics.py b/benchwise/metrics.py
index b63ff67..4c41baf 100644
--- a/benchwise/metrics.py
+++ b/benchwise/metrics.py
@@ -1,7 +1,6 @@
-from typing import List, Dict, Any, Tuple, Optional, Union, Callable
+from typing import List, Dict, Any, Tuple, Optional, Callable
 import numpy as np
-from numpy.typing import NDArray
-from benchwise.types import RougeScores, BleuScores, BertScoreResults, AccuracyResults
+from benchwise.types import RougeScores
 from rouge_score import rouge_scorer
 from sacrebleu import BLEU
 import bert_score
@@ -98,7 +97,13 @@ def rouge_l(
     scorer = rouge_scorer.RougeScorer(
         ["rougeL", "rouge1", "rouge2"], use_stemmer=use_stemmer
     )
-    scores: Dict[str, List[float]] = {"precision": [], "recall": [], "f1": [], "rouge1_f1": [], "rouge2_f1": []}
+    scores: Dict[str, List[float]] = {
+        "precision": [],
+        "recall": [],
+        "f1": [],
+        "rouge1_f1": [],
+        "rouge2_f1": [],
+    }
 
     for pred, ref in zip(predictions, references):
         # Handle empty strings gracefully
@@ -206,7 +211,9 @@ def bleu_score(
 
     # Calculate sentence-level BLEU with improved handling
     sentence_scores = []
-    ngram_precisions: Dict[str, List[float]] = {f"bleu_{i}": [] for i in range(1, max_n + 1)}
+    ngram_precisions: Dict[str, List[float]] = {
+        f"bleu_{i}": [] for i in range(1, max_n + 1)
+    }
 
     for pred, ref in zip(predictions, references):
         try:
@@ -834,7 +841,10 @@ def factual_correctness(
 
 
 def _analyze_factual_correctness(
-    prediction: str, reference: str, nlp_model: Any = None, use_named_entities: bool = True
+    prediction: str,
+    reference: str,
+    nlp_model: Any = None,
+    use_named_entities: bool = True,
 ) -> Dict[str, float]:
     """
     Analyze factual correctness using multiple approaches.
@@ -1106,12 +1116,13 @@ def _analyze_text_coherence(text: str) -> Dict[str, float]:
     # 1. Sentence consistency (length and structure)
     sentence_lengths = [len(s.split()) for s in sentences]
     if len(sentence_lengths) > 1:
-        length_cv = (
-            np.std(sentence_lengths) / np.mean(sentence_lengths)
+        length_cv: float = (
+            float(np.std(sentence_lengths) / np.mean(sentence_lengths))
             if np.mean(sentence_lengths) > 0
-            else 1
+            else 1.0
         )
-        sentence_consistency = float(max(0, 1 - (length_cv / 2)))  # Normalize to 0-1
+        cv_value: float = length_cv / 2.0
+        sentence_consistency = max(0.0, 1.0 - cv_value)  # Normalize to 0-1
     else:
         sentence_consistency = 1.0 if sentence_lengths else 0.0
 
@@ -1487,7 +1498,9 @@ class MetricCollection:
     def __init__(self) -> None:
         self.metrics: Dict[str, Tuple[Callable[..., Any], Dict[str, Any]]] = {}
 
-    def add_metric(self, name: str, metric_func: Callable[..., Any], **kwargs: Any) -> None:
+    def add_metric(
+        self, name: str, metric_func: Callable[..., Any], **kwargs: Any
+    ) -> None:
         """Add a metric to the collection."""
         self.metrics[name] = (metric_func, kwargs)
 
diff --git a/mypy.ini b/mypy.ini
index 23d3a7c..1383e41 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -62,3 +62,12 @@ follow_imports = skip
 
 [mypy-fuzzywuzzy.*]
 ignore_missing_imports = True
+
+[mypy-sacrebleu.*]
+ignore_missing_imports = True
+
+[mypy-spacy.*]
+ignore_missing_imports = True
+
+[mypy-google.*]
+ignore_missing_imports = True
diff --git a/test_single_doc_file.py b/test_single_doc_file.py
deleted file mode 100644
index 5420d7b..0000000
--- a/test_single_doc_file.py
+++ /dev/null
@@ -1,424 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test all code examples from a documentation file with REAL models.
-
-This script extracts all Python code blocks from a documentation markdown file
-and runs each one as a separate test with real OpenAI and Google models.
-
-The script can find documentation files in multiple ways:
-- Absolute path: /path/to/file.md
-- Relative to project root: docs/docs/examples/classification.md
-- Just filename (searches docs/ tree): classification.md
-
-Usage:
-    # Using just filename (searches in docs/ directory)
-    python test_single_doc_file.py classification.md
-
-    # Using relative path from project root
-    python test_single_doc_file.py docs/docs/examples/classification.md
-    python test_single_doc_file.py README.md
-    python test_single_doc_file.py docs/docs/getting-started/quickstart.md
-
-    # Syntax check only (no API calls)
-    python test_single_doc_file.py --syntax-only classification.md
-
-    # Save test results to files
-    python test_single_doc_file.py --save-results classification.md
-"""
-
-import re
-import subprocess
-import sys
-import tempfile
-import time
-from pathlib import Path
-from typing import List, Tuple
-
-
-def extract_code_blocks(markdown_file: Path) -> List[Tuple[str, int, int]]:
-    """
-    Extract all Python code blocks from a markdown file.
-    Returns list of (code, block_number, line_number) tuples.
-    """
-    with open(markdown_file, 'r', encoding='utf-8') as f:
-        content = f.read()
-
-    pattern = r'```python\n(.*?)```'
-    matches = re.finditer(pattern, content, re.DOTALL)
-
-    code_blocks = []
-    for i, match in enumerate(matches, 1):
-        code = match.group(1)
-        line_number = content[:match.start()].count('\n') + 1
-        code_blocks.append((code, i, line_number))
-
-    return code_blocks
-
-
-def prepare_code_for_real_models(code: str) -> str:
-    """
-    Replace model names with real OpenAI and Google models.
-    Ensures we use exactly 2 models: gpt-3.5-turbo and gemini-2.5-flash
-    """
-    import re
-
-    # Find all @evaluate decorators and replace models to ensure diversity
-    def replace_evaluate_models(match):
-        decorator = match.group(0)
-
-        # Extract the content inside @evaluate(...)
-        content = re.search(r'@evaluate\((.*)\)', decorator, re.DOTALL)
-        if not content:
-            return decorator
-
-        params = content.group(1)
-
-        # Split by comma, but be careful with nested structures
-        # Extract all quoted strings (model names)
-        model_pattern = r'"([^"]+)"'
-        models = re.findall(model_pattern, params)
-
-        if not models:
-            return decorator
-
-        # Always use exactly 2 models: gpt-3.5-turbo and gemini-2.5-flash
-        # Take first N models and replace them, but cap at 2
-        num_models = min(len(models), 2)
-        new_models = ['"gpt-3.5-turbo"', '"gemini-2.5-flash"'][:num_models]
-
-        # If there was only 1 model originally, keep it as 1 model
-        if len(models) == 1:
-            new_models = ['"gpt-3.5-turbo"']
-
-        # Find any kwargs (parameters with =)
-        # Split params and identify non-string parts (kwargs)
-        kwargs = []
-        # Remove all quoted strings and see what's left
-        params_without_strings = re.sub(r'"[^"]*"', '', params)
-        if '=' in params_without_strings:
-            # Extract kwargs
-            kwargs_match = re.search(r',?\s*(\w+\s*=\s*[^,)]+(?:,\s*\w+\s*=\s*[^,)]+)*)\s*$', params)
-            if kwargs_match:
-                kwargs.append(kwargs_match.group(1))
-
-        # Reconstruct the decorator
-        result = '@evaluate(' + ', '.join(new_models)
-        if kwargs:
-            result += ', ' + ', '.join(kwargs)
-        result += ')'
-
-        return result
-
-    # Replace all @evaluate decorators
-    modified_code = re.sub(r'@evaluate\([^)]+\)', replace_evaluate_models, code)
-
-    # Replace placeholder dataset loading with actual datasets
-    if 'load_dataset("data/qa_1000.json")' in modified_code:
-        # Add import if not present
-        if 'from benchwise' in modified_code and 'create_qa_dataset' not in modified_code:
-            modified_code = modified_code.replace(
-                'from benchwise import',
-                'from benchwise import create_qa_dataset,'
-            )
-        modified_code = modified_code.replace(
-            'load_dataset("data/qa_1000.json")',
-            'create_qa_dataset(questions=["What is AI?", "What is ML?"], answers=["Artificial Intelligence", "Machine Learning"], name="qa_test")'
-        )
-
-    if 'load_dataset("data/news_articles.json")' in modified_code:
-        # Add import if not present
-        if 'from benchwise' in modified_code and 'create_summarization_dataset' not in modified_code:
-            modified_code = modified_code.replace(
-                'from benchwise import',
-                'from benchwise import create_summarization_dataset,'
-            )
-        modified_code = modified_code.replace(
-            'load_dataset("data/news_articles.json")',
-            'create_summarization_dataset(documents=["Article about AI.", "Article about ML."], summaries=["AI summary", "ML summary"], name="news")'
-        )
-
-    return modified_code
-
-
-def check_syntax(code: str) -> Tuple[bool, str]:
-    """Check if Python code has valid syntax."""
-    import ast
-    try:
-        ast.parse(code)
-        return True, None
-    except SyntaxError as e:
-        return False, f"SyntaxError at line {e.lineno}: {e.msg}"
-    except Exception as e:
-        return False, f"Parse error: {str(e)}"
-
-
-def run_code_sync(code: str, timeout: int = 90) -> Tuple[bool, str, str]:
-    """Run code in subprocess and capture output."""
-    try:
-        # Create temp file
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
-            f.write(code)
-            temp_file = f.name
-
-        # Run in subprocess
-        result = subprocess.run(
-            ['python', temp_file],
-            capture_output=True,
-            text=True,
-            timeout=timeout,
-            cwd=Path(__file__).parent
-        )
-
-        # Cleanup
-        import os
-        os.unlink(temp_file)
-
-        output = result.stdout
-        error = result.stderr
-
-        if result.returncode == 0:
-            return True, output, None
-        else:
-            return False, output, error
-
-    except subprocess.TimeoutExpired:
-        return False, "", f"Timeout after {timeout}s"
-    except Exception as e:
-        return False, "", f"Error: {str(e)}"
-
-
-def test_code_block(code: str, block_num: int, line_num: int, syntax_only: bool = False) -> Tuple[bool, str]:
-    """Test a single code block."""
-    # Check syntax
-    syntax_valid, syntax_error = check_syntax(code)
-    if not syntax_valid:
-        print(f"❌ SYNTAX ERROR")
-        return False, f"Syntax Error: {syntax_error}"
-
-    if syntax_only:
-        print(f"✅ SYNTAX VALID")
-        return True, None
-
-    # Prepare code with real models
-    prepared_code = prepare_code_for_real_models(code)
-
-    # Skip incomplete examples (just function definitions without execution)
-    if '@evaluate(' in prepared_code and 'asyncio.run' not in prepared_code:
-        print(f"⏭️  SKIPPED (incomplete example - defines functions only)")
-        return True, "Skipped: Incomplete example"
-
-    # Run the code
-    print(f"⏳ Running test...", end=" ", flush=True)
-    start_time = time.time()
-    success, output, error = run_code_sync(prepared_code, timeout=90)
-    duration = time.time() - start_time
-
-    if success:
-        print(f"✅ PASSED ({duration:.2f}s)")
-        return True, output
-    else:
-        print(f"❌ FAILED ({duration:.2f}s)")
-        return False, error or output
-
-
-def main():
-    import argparse
-    import json
-    from datetime import datetime
-
-    parser = argparse.ArgumentParser(description="Test Python code examples from a documentation file")
-    parser.add_argument('file', help='Documentation file to test. Can be:\n'
-                                     '  - Relative path from project root (e.g., docs/docs/examples/classification.md)\n'
-                                     '  - Absolute path (e.g., /path/to/file.md)\n'
-                                     '  - Just filename (will search in docs/ directory tree)')
-    parser.add_argument('--syntax-only', action='store_true', help='Only check syntax')
-    parser.add_argument('--save-results', action='store_true', help='Save test results to files')
-    args = parser.parse_args()
-
-    # Find the documentation file
-    project_root = Path(__file__).parent
-    file_arg = Path(args.file)
-
-    # Try different strategies to find the file
-    doc_file = None
-
-    # Strategy 1: Absolute path
-    if file_arg.is_absolute() and file_arg.exists():
-        doc_file = file_arg
-
-    # Strategy 2: Relative to project root
-    elif (project_root / file_arg).exists():
-        doc_file = project_root / file_arg
-
-    # Strategy 3: Search in docs directory tree
-    else:
-        docs_dir = project_root / 'docs'
-        if docs_dir.exists():
-            # Search for the file in docs directory tree
-            for candidate in docs_dir.rglob(file_arg.name if file_arg.name else args.file):
-                if candidate.is_file():
-                    doc_file = candidate
-                    break
-
-    if doc_file is None or not doc_file.exists():
-        print(f"❌ Error: File not found: {args.file}")
-        print(f"\nSearched in:")
-        print(f"  - Absolute path: {file_arg if file_arg.is_absolute() else 'N/A'}")
-        print(f"  - Relative to project: {project_root / file_arg}")
-        print(f"  - In docs/ directory tree")
-        return 1
-
-    # Get relative path for display
-    try:
-        display_path = doc_file.relative_to(project_root)
-    except ValueError:
-        display_path = doc_file
-
-    print(f"\n🧪 Testing Documentation Examples")
-    print(f"📄 File: {display_path}")
-
-    if args.syntax_only:
-        print("⚙️  Mode: Syntax check only")
-    else:
-        print("⚙️  Mode: Full execution with REAL models")
-        print("🤖 Models: gpt-3.5-turbo, gemini-2.5-flash")
-        print("⚠️  Note: This will make actual API calls and incur costs")
-
-    # Extract code blocks
-    code_blocks = extract_code_blocks(doc_file)
-
-    if not code_blocks:
-        print(f"\n❌ No Python code blocks found in {args.file}")
-        return 1
-
-    print(f"📝 Total code blocks: {len(code_blocks)}\n")
-    print("=" * 80)
-
-    # Test each code block
-    results = []
-    for code, block_num, line_num in code_blocks:
-        print(f"\n{'=' * 80}")
-        print(f"TEST {block_num}/{len(code_blocks)}: Block {block_num} (Line {line_num})")
-        print("=" * 80)
-
-        success, output_or_error = test_code_block(code, block_num, line_num, args.syntax_only)
-        results.append((block_num, success, output_or_error))
-
-        # Show output
-        if success and output_or_error and output_or_error.strip() and not args.syntax_only:
-            print("\n📋 OUTPUT:")
-            print("-" * 80)
-            output_lines = output_or_error.strip().split('\n')
-            for line in output_lines[:50]:  # Show first 50 lines
-                print(f"  {line}")
-            if len(output_lines) > 50:
-                print(f"  ... ({len(output_lines) - 50} more lines)")
-            print("-" * 80)
-        elif not success and output_or_error:
-            print("\n❌ ERROR:")
-            print("-" * 80)
-            error_lines = output_or_error.split('\n')
-            for line in error_lines[:40]:  # Show first 40 lines
-                print(f"  {line}")
-            if len(error_lines) > 40:
-                print(f"  ... ({len(error_lines) - 40} more lines)")
-            print("-" * 80)
-
-    # Summary
-    print(f"\n{'=' * 80}")
-    print("SUMMARY")
-    print("=" * 80)
-
-    total = len(results)
-    passed = sum(1 for _, success, _ in results if success)
-    failed = total - passed
-
-    print(f"\nFile: {display_path}")
-    print(f"Total: {total} code blocks")
-    print(f"✅ Passed: {passed}")
-    print(f"❌ Failed: {failed}")
-    print(f"Success Rate: {passed/total*100:.1f}%")
-
-    # Show failures
-    if failed > 0:
-        print(f"\n{'-' * 80}")
-        print("FAILED TESTS")
-        print("-" * 80)
-        for block_num, success, output_or_error in results:
-            if not success:
-                print(f"\n❌ Block {block_num}")
-                if output_or_error:
-                    print(f"   {output_or_error[:200]}")
-
-    print(f"\n{'=' * 80}\n")
-
-    # Save results if requested
-    if args.save_results:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        # Create a clean base name from the file path
-        base_name = doc_file.stem  # Gets filename without extension
-
-        # Create results directory
-        results_dir = Path(__file__).parent / 'test_results'
-        results_dir.mkdir(exist_ok=True)
-
-        # Save JSON results (detailed)
-        json_file = results_dir / f"{base_name}_{timestamp}.json"
-        json_data = {
-            "file": str(display_path),
-            "full_path": str(doc_file),
-            "timestamp": datetime.now().isoformat(),
-            "total": total,
-            "passed": passed,
-            "failed": failed,
-            "success_rate": passed/total*100,
-            "syntax_only": args.syntax_only,
-            "results": [
-                {
-                    "block": block_num,
-                    "success": success,
-                    "output": output_or_error[:500] if output_or_error else None,  # Truncate long outputs
-                }
-                for block_num, success, output_or_error in results
-            ]
-        }
-
-        with open(json_file, 'w') as f:
-            json.dump(json_data, f, indent=2)
-
-        print(f"💾 JSON results saved to: {json_file}")
-
-        # Save Markdown summary
-        md_file = results_dir / f"{base_name}_{timestamp}.md"
-        with open(md_file, 'w') as f:
-            f.write(f"# Test Results: {display_path}\n\n")
-            f.write(f"**File:** `{doc_file}`\n\n")
-            f.write(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-            f.write(f"**Models:** gpt-3.5-turbo, gemini-2.5-flash\n\n")
-            f.write(f"## Summary\n\n")
-            f.write(f"- Total Tests: {total}\n")
-            f.write(f"- ✅ Passed: {passed}\n")
-            f.write(f"- ❌ Failed: {failed}\n")
-            f.write(f"- Success Rate: {passed/total*100:.1f}%\n\n")
-
-            if failed > 0:
-                f.write(f"## Failed Tests\n\n")
-                for block_num, success, output_or_error in results:
-                    if not success:
-                        f.write(f"### Block {block_num}\n\n")
-                        f.write(f"```\n{output_or_error[:300] if output_or_error else 'No error details'}\n```\n\n")
-
-        print(f"📝 Markdown summary saved to: {md_file}")
-
-        # Save to latest file (overwrite)
-        latest_json = results_dir / f"{base_name}_latest.json"
-        with open(latest_json, 'w') as f:
-            json.dump(json_data, f, indent=2)
-
-        print(f"📌 Latest results: {latest_json}")
-
-    return 0 if failed == 0 else 1
-
-
-if __name__ == '__main__':
-    sys.exit(main())

From 88c052d8ac4a376eead1c216cdb07fd8a6826239 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Wed, 3 Dec 2025 00:20:45 +0530
Subject: [PATCH 14/24] fix: Remove unused imports and fix type errors

---
 benchwise/core.py           | 84 ++++++++++++++++++++++++++-----------
 benchwise/exceptions.py     | 12 +++++-
 benchwise/logging.py        | 24 +++++------
 benchwise/results.py        | 14 +++++--
 benchwise/types.py          | 47 +++++++++++++++++----
 tests/test_config.py        |  7 ++--
 tests/test_docs_examples.py | 48 ++++++++++++---------
 tests/test_integration.py   |  5 ++-
 tests/test_models.py        | 15 ++++---
 9 files changed, 179 insertions(+), 77 deletions(-)

diff --git a/benchwise/core.py b/benchwise/core.py
index fe34af5..93fe89f 100644
--- a/benchwise/core.py
+++ b/benchwise/core.py
@@ -1,4 +1,14 @@
-from typing import List, Dict, Any, Callable, Optional, Union, ParamSpec, TypeVar, Awaitable
+from typing import (
+    List,
+    Dict,
+    Any,
+    Callable,
+    Optional,
+    Union,
+    ParamSpec,
+    TypeVar,
+    Awaitable,
+)
 from functools import wraps
 import asyncio
 import time
@@ -11,15 +21,18 @@
 from .client import upload_results
 
 # Type variables for decorator typing
-P = ParamSpec('P')
-R = TypeVar('R')
+P = ParamSpec("P")
+R = TypeVar("R")
 
 logger = logging.getLogger("benchwise")
 
 
 def evaluate(
     *models: str, upload: Optional[bool] = None, **kwargs: Any
-) -> Callable[[Callable[..., Awaitable[Any]]], Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]]]:
+) -> Callable[
+    [Callable[..., Awaitable[Any]]],
+    Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]],
+]:
     """
     Decorator for creating LLM evaluations.
 
@@ -41,7 +54,9 @@ async def test_qa(model, dataset):
             return accuracy(responses, dataset.references)
     """
 
-    def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[..., Awaitable[List[EvaluationResult]]]:
+    def decorator(
+        test_func: Callable[..., Awaitable[Any]],
+    ) -> Callable[..., Awaitable[List[EvaluationResult]]]:
         if not inspect.iscoroutinefunction(test_func):
             raise TypeError(
                 f"{test_func.__name__} must be an async function. "
@@ -49,8 +64,12 @@ def decorator(test_func: Callable[..., Awaitable[Any]]) -> Callable[..., Awaitab
             )
 
         @wraps(test_func)
-        async def wrapper(dataset: Dataset, **test_kwargs: Any) -> List[EvaluationResult]:
-            return await _run_evaluation(test_func, dataset, models, upload, kwargs, test_kwargs)
+        async def wrapper(
+            dataset: Dataset, **test_kwargs: Any
+        ) -> List[EvaluationResult]:
+            return await _run_evaluation(
+                test_func, dataset, models, upload, kwargs, test_kwargs
+            )
 
         if hasattr(test_func, "_benchmark_metadata"):
             wrapper._benchmark_metadata = test_func._benchmark_metadata  # type: ignore[attr-defined]
@@ -69,13 +88,13 @@ async def _run_evaluation(
     test_kwargs: Dict[str, Any],
 ) -> List[EvaluationResult]:
     results = []
-    
+
     logger.info(f"Starting evaluation: {test_func.__name__} on {len(models)} model(s)")
 
     for model_name in models:
         try:
             logger.debug(f"Evaluating model: {model_name}")
-            
+
             model = get_model_adapter(model_name)
 
             start_time = time.time()
@@ -95,12 +114,12 @@ async def _run_evaluation(
                 metadata=combined_metadata,
             )
             results.append(eval_result)
-            
+
             logger.info(f"✓ {model_name} completed in {end_time - start_time:.2f}s")
 
         except Exception as e:
             logger.error(f"✗ {model_name} failed: {e}", exc_info=True)
-            
+
             combined_metadata = decorator_kwargs.copy()
             if hasattr(test_func, "_benchmark_metadata"):
                 combined_metadata.update(test_func._benchmark_metadata)
@@ -121,9 +140,7 @@ async def _run_evaluation(
     if should_upload and results:
         try:
             logger.debug("Uploading results to Benchwise API")
-            await upload_results(
-                results, test_func.__name__, dataset.metadata or {}
-            )
+            await upload_results(results, test_func.__name__, dataset.metadata or {})
             logger.info("Results uploaded successfully")
         except Exception as e:
             logger.warning(f"Upload failed (results saved locally): {e}")
@@ -133,7 +150,9 @@ async def _run_evaluation(
     return results
 
 
-def benchmark(name: str, description: str = "", **kwargs: Any) -> Callable[[Callable[P, R]], Callable[P, R]]:
+def benchmark(
+    name: str, description: str = "", **kwargs: Any
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
     """
     Decorator for creating benchmarks.
 
@@ -154,7 +173,11 @@ def decorator(test_func: Callable[P, R]) -> Callable[P, R]:
     return decorator
 
 
-def stress_test(concurrent_requests: int = 10, duration: int = 60) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, BaseException]]]]]:
+def stress_test(
+    concurrent_requests: int = 10, duration: int = 60
+) -> Callable[
+    [Callable[P, Awaitable[R]]], Callable[P, Awaitable[List[Union[R, BaseException]]]]
+]:
     """
     Decorator for stress testing LLMs.
 
@@ -166,10 +189,16 @@ async def load_test(model, dataset):
             pass
     """
 
-    def decorator(test_func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[List[Union[R, BaseException]]]]:
+    def decorator(
+        test_func: Callable[P, Awaitable[R]],
+    ) -> Callable[P, Awaitable[List[Union[R, BaseException]]]]:
         @wraps(test_func)
-        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> List[Union[R, BaseException]]:
-            logger.info(f"Starting stress test: {concurrent_requests} concurrent requests for {duration}s")
+        async def wrapper(
+            *args: P.args, **kwargs: P.kwargs
+        ) -> List[Union[R, BaseException]]:
+            logger.info(
+                f"Starting stress test: {concurrent_requests} concurrent requests for {duration}s"
+            )
 
             tasks: List[Union[R, BaseException]] = []
             start_time = time.time()
@@ -203,7 +232,10 @@ def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
         self.logger = logging.getLogger("benchwise.runner")
 
     async def run_evaluation(
-        self, test_func: Callable[..., Awaitable[Any]], dataset: Dataset, models: List[str]
+        self,
+        test_func: Callable[..., Awaitable[Any]],
+        dataset: Dataset,
+        models: List[str],
     ) -> List[EvaluationResult]:
         """Run evaluation on multiple models."""
         results: List[EvaluationResult] = []
@@ -266,9 +298,11 @@ def compare_models(
                 {"model": name, "score": score} for name, score in model_scores
             ],
         }
-        
-        self.logger.info(f"Comparison complete: Best model is {comparison['best_model']}")
-        
+
+        self.logger.info(
+            f"Comparison complete: Best model is {comparison['best_model']}"
+        )
+
         return comparison
 
 
@@ -280,7 +314,9 @@ def run_benchmark(
     return asyncio.run(runner.run_evaluation(benchmark_func, dataset, models))
 
 
-async def quick_eval(prompt: str, models: List[str], metric: Callable[[str], float]) -> Dict[str, Optional[float]]:
+async def quick_eval(
+    prompt: str, models: List[str], metric: Callable[[str], float]
+) -> Dict[str, Optional[float]]:
     """Quick evaluation with a single prompt."""
     results: Dict[str, Optional[float]] = {}
 
diff --git a/benchwise/exceptions.py b/benchwise/exceptions.py
index d5a9888..659799d 100644
--- a/benchwise/exceptions.py
+++ b/benchwise/exceptions.py
@@ -9,47 +9,57 @@
 
 class BenchwiseError(Exception):
     """Base exception for all Benchwise errors."""
+
     pass
 
 
 class AuthenticationError(BenchwiseError):
     """Raised when authentication fails."""
+
     pass
 
 
 class RateLimitError(BenchwiseError):
     """Raised when API rate limit is exceeded."""
 
-    def __init__(self, message: str = "Rate limit exceeded", retry_after: Optional[int] = None) -> None:
+    def __init__(
+        self, message: str = "Rate limit exceeded", retry_after: Optional[int] = None
+    ) -> None:
         super().__init__(message)
         self.retry_after = retry_after
 
 
 class ValidationError(BenchwiseError):
     """Raised when input validation fails."""
+
     pass
 
 
 class NetworkError(BenchwiseError):
     """Raised when network requests fail."""
+
     pass
 
 
 class ConfigurationError(BenchwiseError):
     """Raised when configuration is invalid or missing."""
+
     pass
 
 
 class DatasetError(BenchwiseError):
     """Raised when dataset operations fail."""
+
     pass
 
 
 class ModelError(BenchwiseError):
     """Raised when model operations fail."""
+
     pass
 
 
 class MetricError(BenchwiseError):
     """Raised when metric calculation fails."""
+
     pass
diff --git a/benchwise/logging.py b/benchwise/logging.py
index 3b2bd31..4843e53 100644
--- a/benchwise/logging.py
+++ b/benchwise/logging.py
@@ -16,20 +16,20 @@ def setup_logging(
 ) -> logging.Logger:
     """
     Setup logging for Benchwise.
-    
+
     Args:
         level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
         format: Custom log format string
         filename: Optional file to write logs to
-    
+
     Returns:
         Configured logger instance
     """
-    
+
     # Default format
     if format is None:
         format = "[%(asctime)s] %(levelname)s [%(name)s] %(message)s"
-    
+
     # Configure root logger
     logging.basicConfig(
         level=getattr(logging, level.upper()),
@@ -37,31 +37,31 @@ def setup_logging(
         datefmt="%Y-%m-%d %H:%M:%S",
         handlers=[
             logging.StreamHandler(sys.stdout),
-        ]
+        ],
     )
-    
+
     # Add file handler if filename provided
     if filename:
         file_handler = logging.FileHandler(filename)
         file_handler.setFormatter(logging.Formatter(format))
         logging.getLogger("benchwise").addHandler(file_handler)
-    
+
     # Get benchwise logger
     logger = logging.getLogger("benchwise")
     logger.setLevel(getattr(logging, level.upper()))
-    
+
     logger.debug(f"Logging initialized at {level} level")
-    
+
     return logger
 
 
 def get_logger(name: str = "benchwise") -> logging.Logger:
     """
     Get a logger instance for Benchwise.
-    
+
     Args:
         name: Logger name (default: "benchwise")
-    
+
     Returns:
         Logger instance
     """
@@ -71,7 +71,7 @@ def get_logger(name: str = "benchwise") -> logging.Logger:
 def set_log_level(level: str) -> None:
     """
     Change the log level for all Benchwise loggers.
-    
+
     Args:
         level: New log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
     """
diff --git a/benchwise/results.py b/benchwise/results.py
index 330a73e..54768ff 100644
--- a/benchwise/results.py
+++ b/benchwise/results.py
@@ -119,7 +119,9 @@ def success_rate(self) -> float:
             return 0.0
         return len(self.successful_results) / len(self.results)
 
-    def get_best_model(self, metric_name: Optional[str] = None) -> Optional[EvaluationResult]:
+    def get_best_model(
+        self, metric_name: Optional[str] = None
+    ) -> Optional[EvaluationResult]:
         """
         Get the best performing model result.
 
@@ -135,7 +137,9 @@ def get_best_model(self, metric_name: Optional[str] = None) -> Optional[Evaluati
 
         return max(successful_results, key=lambda r: r.get_score(metric_name) or 0)
 
-    def get_worst_model(self, metric_name: Optional[str] = None) -> Optional[EvaluationResult]:
+    def get_worst_model(
+        self, metric_name: Optional[str] = None
+    ) -> Optional[EvaluationResult]:
         """
         Get the worst performing model result.
 
@@ -269,7 +273,11 @@ def compare_benchmarks(
         Returns:
             Dictionary with cross-benchmark comparison
         """
-        comparison: Dict[str, Any] = {"benchmarks": [], "models": set(), "cross_benchmark_scores": {}}
+        comparison: Dict[str, Any] = {
+            "benchmarks": [],
+            "models": set(),
+            "cross_benchmark_scores": {},
+        }
 
         for benchmark in benchmark_results:
             benchmark_info = {
diff --git a/benchwise/types.py b/benchwise/types.py
index 5818a62..a351b09 100644
--- a/benchwise/types.py
+++ b/benchwise/types.py
@@ -5,14 +5,25 @@
 used throughout the BenchWise codebase for improved type safety and IDE support.
 """
 
-from typing import Any, Dict, List, Literal, Optional, Protocol, TypeVar, ParamSpec, Tuple, TypedDict
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    TypeVar,
+    ParamSpec,
+    Tuple,
+    TypedDict,
+)
 
 # Type Variables
-T = TypeVar('T')
-R = TypeVar('R')
-P = ParamSpec('P')
-ModelT = TypeVar('ModelT')
-DatasetT = TypeVar('DatasetT')
+T = TypeVar("T")
+R = TypeVar("R")
+P = ParamSpec("P")
+ModelT = TypeVar("ModelT")
+DatasetT = TypeVar("DatasetT")
 
 # Literal Types
 HttpMethod = Literal["GET", "POST", "PUT", "DELETE", "PATCH"]
@@ -23,6 +34,7 @@
 # Model Configuration Types
 class ModelConfig(TypedDict, total=False):
     """Configuration options for model adapters."""
+
     api_key: str
     temperature: float
     max_tokens: int
@@ -35,6 +47,7 @@ class ModelConfig(TypedDict, total=False):
 
 class PricingInfo(TypedDict):
     """Pricing information for a model."""
+
     input: float  # Cost per 1K input tokens
     output: float  # Cost per 1K output tokens
 
@@ -42,6 +55,7 @@ class PricingInfo(TypedDict):
 # Metric Return Types
 class RougeScores(TypedDict, total=False):
     """Return type for ROUGE metric scores."""
+
     precision: float
     recall: float
     f1: float
@@ -60,6 +74,7 @@ class RougeScores(TypedDict, total=False):
 
 class BleuScores(TypedDict, total=False):
     """Return type for BLEU metric scores."""
+
     bleu: float
     bleu1: float
     bleu2: float
@@ -75,6 +90,7 @@ class BleuScores(TypedDict, total=False):
 
 class BertScoreResults(TypedDict, total=False):
     """Return type for BERT-Score metric."""
+
     precision: float
     recall: float
     f1: float
@@ -90,6 +106,7 @@ class BertScoreResults(TypedDict, total=False):
 
 class AccuracyResults(TypedDict, total=False):
     """Return type for accuracy metric."""
+
     accuracy: float
     correct: int
     total: int
@@ -101,6 +118,7 @@ class AccuracyResults(TypedDict, total=False):
 
 class SemanticSimilarityResults(TypedDict, total=False):
     """Return type for semantic similarity metric."""
+
     similarity: float
     std_similarity: float
     scores: List[float]
@@ -110,6 +128,7 @@ class SemanticSimilarityResults(TypedDict, total=False):
 
 class CoherenceResults(TypedDict, total=False):
     """Return type for coherence score metric."""
+
     coherence: float
     std_coherence: float
     scores: List[float]
@@ -119,6 +138,7 @@ class CoherenceResults(TypedDict, total=False):
 
 class SafetyResults(TypedDict, total=False):
     """Return type for safety score metric."""
+
     safety: float
     is_safe: bool
     flagged_categories: List[str]
@@ -130,6 +150,7 @@ class SafetyResults(TypedDict, total=False):
 
 class FactualCorrectnessResults(TypedDict, total=False):
     """Return type for factual correctness metric."""
+
     correctness: float
     is_correct: bool
     std_correctness: float
@@ -141,6 +162,7 @@ class FactualCorrectnessResults(TypedDict, total=False):
 # Dataset Types
 class DatasetItem(TypedDict, total=False):
     """A single item in a dataset."""
+
     # Common field names
     prompt: str
     input: str
@@ -159,6 +181,7 @@ class DatasetItem(TypedDict, total=False):
 
 class DatasetMetadata(TypedDict, total=False):
     """Metadata for a dataset."""
+
     name: str
     description: str
     source: str
@@ -170,6 +193,7 @@ class DatasetMetadata(TypedDict, total=False):
 
 class DatasetSchema(TypedDict, total=False):
     """Schema definition for a dataset."""
+
     prompt_field: str
     reference_field: str
     required_fields: List[str]
@@ -179,6 +203,7 @@ class DatasetSchema(TypedDict, total=False):
 # Configuration Types
 class ConfigDict(TypedDict, total=False):
     """Configuration dictionary for BenchWise."""
+
     api_url: str
     api_key: Optional[str]
     upload_enabled: bool
@@ -197,6 +222,7 @@ class ConfigDict(TypedDict, total=False):
 # Results Types
 class EvaluationResultDict(TypedDict, total=False):
     """Serialized evaluation result."""
+
     model: str
     prompt: str
     response: str
@@ -210,6 +236,7 @@ class EvaluationResultDict(TypedDict, total=False):
 
 class BenchmarkResultDict(TypedDict, total=False):
     """Serialized benchmark result."""
+
     benchmark_name: str
     benchmark_description: str
     results: List[EvaluationResultDict]
@@ -219,6 +246,7 @@ class BenchmarkResultDict(TypedDict, total=False):
 
 class ComparisonResult(TypedDict):
     """Result of model comparison."""
+
     best_model: str
     best_score: float
     rankings: List[Tuple[str, float]]
@@ -228,12 +256,14 @@ class ComparisonResult(TypedDict):
 # API Response Types
 class LoginResponse(TypedDict):
     """Response from login endpoint."""
+
     token: Dict[str, str]
     user: Dict[str, Any]
 
 
 class UserInfo(TypedDict, total=False):
     """User information from API."""
+
     id: int
     username: str
     email: str
@@ -243,6 +273,7 @@ class UserInfo(TypedDict, total=False):
 
 class UploadResultsResponse(TypedDict):
     """Response from upload results endpoint."""
+
     id: int
     benchmark_id: int
     model_ids: List[int]
@@ -286,6 +317,8 @@ def exists(self, key: str) -> bool:
 class SupportsMetrics(Protocol):
     """Protocol for objects that support metric evaluation."""
 
-    def evaluate(self, predictions: List[str], references: List[str], **kwargs: Any) -> Dict[str, float]:
+    def evaluate(
+        self, predictions: List[str], references: List[str], **kwargs: Any
+    ) -> Dict[str, float]:
         """Evaluate predictions against references."""
         ...
diff --git a/tests/test_config.py b/tests/test_config.py
index a11b874..2cf169d 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -106,9 +106,10 @@ def test_load_from_json_file(self):
 
         try:
             # Mock the config file paths
-            with patch.object(Path, "exists", return_value=True), patch(
-                "builtins.open", create=True
-            ) as mock_open:
+            with (
+                patch.object(Path, "exists", return_value=True),
+                patch("builtins.open", create=True) as mock_open,
+            ):
                 import json
 
                 mock_open.return_value.__enter__.return_value.read.return_value = (
diff --git a/tests/test_docs_examples.py b/tests/test_docs_examples.py
index d998cf2..4747fec 100644
--- a/tests/test_docs_examples.py
+++ b/tests/test_docs_examples.py
@@ -17,16 +17,16 @@ def extract_code_blocks_from_md(markdown_file: Path) -> List[tuple]:
     Extract all Python code blocks from a markdown file.
     Returns list of (code, block_number, line_number) tuples.
     """
-    with open(markdown_file, 'r', encoding='utf-8') as f:
+    with open(markdown_file, "r", encoding="utf-8") as f:
         content = f.read()
 
-    pattern = r'```python\n(.*?)```'
+    pattern = r"```python\n(.*?)```"
     matches = re.finditer(pattern, content, re.DOTALL)
 
     code_blocks = []
     for i, match in enumerate(matches, 1):
         code = match.group(1)
-        line_number = content[:match.start()].count('\n') + 1
+        line_number = content[: match.start()].count("\n") + 1
         code_blocks.append((code, i, line_number))
 
     return code_blocks
@@ -34,12 +34,12 @@ def extract_code_blocks_from_md(markdown_file: Path) -> List[tuple]:
 
 def get_doc_files() -> List[Path]:
     """Get all markdown documentation files with code examples."""
-    docs_dir = Path(__file__).parent.parent / 'docs' / 'docs' / 'examples'
+    docs_dir = Path(__file__).parent.parent / "docs" / "docs" / "examples"
 
     if not docs_dir.exists():
         return []
 
-    return sorted(docs_dir.glob('*.md'))
+    return sorted(docs_dir.glob("*.md"))
 
 
 def prepare_code_for_testing(code: str) -> str:
@@ -67,11 +67,11 @@ def prepare_code_for_testing(code: str) -> str:
     if 'load_dataset("data/' in modified_code:
         modified_code = modified_code.replace(
             'load_dataset("data/qa_1000.json")',
-            'create_qa_dataset(questions=["Q1?"], answers=["A1"], name="test")'
+            'create_qa_dataset(questions=["Q1?"], answers=["A1"], name="test")',
         )
         modified_code = modified_code.replace(
             'load_dataset("data/news_articles.json")',
-            'create_summarization_dataset(documents=["Doc1"], summaries=["Sum1"], name="news")'
+            'create_summarization_dataset(documents=["Doc1"], summaries=["Sum1"], name="news")',
         )
 
     return modified_code
@@ -87,8 +87,11 @@ def prepare_code_for_testing(code: str) -> str:
         test_params.append((doc_file.name, block_num, line_num, code))
 
 
-@pytest.mark.parametrize("filename,block_num,line_num,code", test_params,
-                        ids=[f"{f}:block_{b}:L{l}" for f, b, l, _ in test_params])
+@pytest.mark.parametrize(
+    "filename,block_num,line_num,code",
+    test_params,
+    ids=[f"{f}:block_{b}:L{line}" for f, b, line, _ in test_params],
+)
 def test_documentation_code_syntax(filename, block_num, line_num, code):
     """
     Test that all code examples in documentation have valid Python syntax.
@@ -107,8 +110,11 @@ def test_documentation_code_syntax(filename, block_num, line_num, code):
 
 @pytest.mark.slow
 @pytest.mark.mock
-@pytest.mark.parametrize("filename,block_num,line_num,code", test_params,
-                        ids=[f"{f}:block_{b}:L{l}" for f, b, l, _ in test_params])
+@pytest.mark.parametrize(
+    "filename,block_num,line_num,code",
+    test_params,
+    ids=[f"{f}:block_{b}:L{line}" for f, b, line, _ in test_params],
+)
 def test_documentation_code_execution(filename, block_num, line_num, code):
     """
     Test that code examples can be executed without errors (using mock models).
@@ -117,11 +123,13 @@ def test_documentation_code_execution(filename, block_num, line_num, code):
     and will be skipped.
     """
     # Skip examples that are just function definitions without execution
-    if '@evaluate(' in code and 'asyncio.run' not in code:
+    if "@evaluate(" in code and "asyncio.run" not in code:
         pytest.skip("Incomplete example (defines functions only)")
 
     # Skip examples that require external data files
-    if 'load_dataset("data/' in code and 'create_' not in prepare_code_for_testing(code):
+    if 'load_dataset("data/' in code and "create_" not in prepare_code_for_testing(
+        code
+    ):
         pytest.skip("Requires external data files")
 
     # Prepare code with mock models
@@ -129,7 +137,7 @@ def test_documentation_code_execution(filename, block_num, line_num, code):
 
     # Execute the code
     try:
-        exec_globals = {'__name__': '__main__'}
+        exec_globals = {"__name__": "__main__"}
         exec(prepared_code, exec_globals)
     except Exception as e:
         pytest.fail(
@@ -141,11 +149,11 @@ def test_documentation_code_execution(filename, block_num, line_num, code):
 @pytest.mark.smoke
 def test_documentation_examples_exist():
     """Verify that documentation example files exist and contain code blocks."""
-    docs_dir = Path(__file__).parent.parent / 'docs' / 'docs' / 'examples'
+    docs_dir = Path(__file__).parent.parent / "docs" / "docs" / "examples"
 
     assert docs_dir.exists(), f"Documentation examples directory not found: {docs_dir}"
 
-    doc_files = list(docs_dir.glob('*.md'))
+    doc_files = list(docs_dir.glob("*.md"))
     assert len(doc_files) > 0, "No documentation markdown files found"
 
     total_blocks = 0
@@ -154,9 +162,11 @@ def test_documentation_examples_exist():
         total_blocks += len(blocks)
 
     assert total_blocks > 0, "No Python code blocks found in documentation"
-    print(f"\nFound {len(doc_files)} documentation files with {total_blocks} code blocks")
+    print(
+        f"\nFound {len(doc_files)} documentation files with {total_blocks} code blocks"
+    )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     # Run just the smoke test
-    pytest.main([__file__, '-k', 'test_documentation_examples_exist', '-v'])
+    pytest.main([__file__, "-k", "test_documentation_examples_exist", "-v"])
diff --git a/tests/test_integration.py b/tests/test_integration.py
index f88e8be..a64f87b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -109,8 +109,9 @@ def test_model_factory_integration(self):
             assert adapter.__class__.__name__ == expected_type
             assert adapter.model_name == model_name
 
-        with patch("transformers.AutoTokenizer"), patch(
-            "transformers.AutoModelForCausalLM"
+        with (
+            patch("transformers.AutoTokenizer"),
+            patch("transformers.AutoModelForCausalLM"),
         ):
             adapter = get_model_adapter("test/unknown-model")
             assert adapter.__class__.__name__ == "HuggingFaceAdapter"
diff --git a/tests/test_models.py b/tests/test_models.py
index 53fd01c..f3f6add 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -80,8 +80,9 @@ def test_get_mock_adapter(self):
 
     def test_get_huggingface_adapter_default(self):
         # Use a mock model name that won't trigger real HuggingFace download
-        with patch("transformers.AutoTokenizer"), patch(
-            "transformers.AutoModelForCausalLM"
+        with (
+            patch("transformers.AutoTokenizer"),
+            patch("transformers.AutoModelForCausalLM"),
         ):
             adapter = get_model_adapter("test/unknown-model-name")
             assert isinstance(adapter, HuggingFaceAdapter)
@@ -197,15 +198,17 @@ def test_google_import_error(self):
 
 class TestHuggingFaceAdapter:
     def test_huggingface_adapter_creation(self):
-        with patch("transformers.AutoTokenizer"), patch(
-            "transformers.AutoModelForCausalLM"
+        with (
+            patch("transformers.AutoTokenizer"),
+            patch("transformers.AutoModelForCausalLM"),
         ):
             adapter = HuggingFaceAdapter("gpt2")
             assert adapter.model_name == "gpt2"
 
     def test_huggingface_cost_estimate(self):
-        with patch("transformers.AutoTokenizer"), patch(
-            "transformers.AutoModelForCausalLM"
+        with (
+            patch("transformers.AutoTokenizer"),
+            patch("transformers.AutoModelForCausalLM"),
         ):
             adapter = HuggingFaceAdapter("gpt2")
             cost = adapter.get_cost_estimate(1000, 500)

From 8b21a2f1bf4479068d643b7330d57984daea17b7 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Wed, 3 Dec 2025 00:59:40 +0530
Subject: [PATCH 15/24] chore(pre-commit): Update ruff-pre-commit revision to
 v0.14.7

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d4ec570..2c8a49c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
       - id: debug-statements
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.6
+    rev: v0.14.7
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

From 8fa061b67f6ff82b6aba78eaa3ce43037b0c8b07 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Wed, 3 Dec 2025 01:00:47 +0530
Subject: [PATCH 16/24] fix(format): Apply latest ruff formatting and update
 pre-commit config

---
 .pre-commit-config.yaml             |  4 +--
 benchwise/config.py                 |  6 ++--
 benchwise/metrics.py                | 12 ++++----
 tests/test_memory_large_datasets.py | 48 ++++++++++++++---------------
 tests/test_results.py               |  6 ++--
 5 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2c8a49c..0605bc9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v6.0.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
@@ -18,7 +18,7 @@ repos:
       - id: ruff-format
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.11.2
+    rev: v1.19.0
     hooks:
       - id: mypy
         additional_dependencies:
diff --git a/benchwise/config.py b/benchwise/config.py
index a2e32dc..7f673ad 100644
--- a/benchwise/config.py
+++ b/benchwise/config.py
@@ -200,9 +200,9 @@ def save_to_file(self, file_path: Optional[Path] = None) -> None:
 
         # Don't save sensitive information like API key
         if self.api_key and not os.getenv("BENCHWISE_SAVE_API_KEY"):
-            config_dict[
-                "_note"
-            ] = "API key not saved for security. Set BENCHWISE_API_KEY environment variable."
+            config_dict["_note"] = (
+                "API key not saved for security. Set BENCHWISE_API_KEY environment variable."
+            )
 
         try:
             with open(file_path, "w") as f:
diff --git a/benchwise/metrics.py b/benchwise/metrics.py
index 4c41baf..f6a322d 100644
--- a/benchwise/metrics.py
+++ b/benchwise/metrics.py
@@ -282,9 +282,9 @@ def bleu_score(
     # Add confidence intervals if requested
     if return_confidence and len(sentence_scores) > 1:
         try:
-            result[
-                "sentence_bleu_confidence_interval"
-            ] = _bootstrap_confidence_interval(sentence_scores)
+            result["sentence_bleu_confidence_interval"] = (
+                _bootstrap_confidence_interval(sentence_scores)
+            )
         except Exception as e:
             warnings.warn(f"Could not calculate BLEU confidence intervals: {e}")
 
@@ -414,9 +414,9 @@ def bert_score_metric(
                 result["f1_confidence_interval"] = _bootstrap_confidence_interval(
                     F1_scores
                 )
-                result[
-                    "precision_confidence_interval"
-                ] = _bootstrap_confidence_interval(P_scores)
+                result["precision_confidence_interval"] = (
+                    _bootstrap_confidence_interval(P_scores)
+                )
                 result["recall_confidence_interval"] = _bootstrap_confidence_interval(
                     R_scores
                 )
diff --git a/tests/test_memory_large_datasets.py b/tests/test_memory_large_datasets.py
index 2a896fb..b5142bd 100644
--- a/tests/test_memory_large_datasets.py
+++ b/tests/test_memory_large_datasets.py
@@ -29,9 +29,9 @@ async def test_large_dataset_memory_usage(self):
             current_memory = self.get_memory_usage()
             memory_increase = current_memory - initial_memory
 
-            assert (
-                memory_increase < 100
-            ), f"Memory usage too high: {memory_increase}MB for {size} items"
+            assert memory_increase < 100, (
+                f"Memory usage too high: {memory_increase}MB for {size} items"
+            )
 
             sampled = dataset.sample(100)
             filtered = dataset.filter(lambda x: len(x["question"]) > 10)
@@ -57,9 +57,9 @@ async def memory_test_evaluation(model, dataset):
             generation_memory = after_generation - before_generation
 
             # Memory increase should be reasonable
-            assert (
-                generation_memory < 50
-            ), f"Generation used too much memory: {generation_memory}MB"
+            assert generation_memory < 50, (
+                f"Generation used too much memory: {generation_memory}MB"
+            )
 
             return {"response_count": len(responses), "memory_used": generation_memory}
 
@@ -69,9 +69,9 @@ async def memory_test_evaluation(model, dataset):
         total_memory_increase = final_memory - initial_memory
 
         # Total memory increase should be reasonable
-        assert (
-            total_memory_increase < 100
-        ), f"Total memory increase too high: {total_memory_increase}MB"
+        assert total_memory_increase < 100, (
+            f"Total memory increase too high: {total_memory_increase}MB"
+        )
 
         assert len(results) == 1
         assert results[0].success
@@ -100,9 +100,9 @@ async def test_dataset_chunking_memory_efficiency(self):
             # Memory shouldn't grow significantly per chunk
             current_memory = self.get_memory_usage()
             memory_per_chunk = (current_memory - initial_memory) / processed_chunks
-            assert (
-                memory_per_chunk < 10
-            ), f"Memory per chunk too high: {memory_per_chunk}MB"
+            assert memory_per_chunk < 10, (
+                f"Memory per chunk too high: {memory_per_chunk}MB"
+            )
 
             del chunk_dataset, chunk_data, prompts
             gc.collect()
@@ -135,9 +135,9 @@ def dataset_generator(size):
                 memory_used = current_memory - initial_memory
                 max_memory_used = max(max_memory_used, memory_used)
 
-                assert (
-                    memory_used < 50
-                ), f"Streaming memory too high: {memory_used}MB at {processed_items} items"
+                assert memory_used < 50, (
+                    f"Streaming memory too high: {memory_used}MB at {processed_items} items"
+                )
 
         assert processed_items == 5000
         assert max_memory_used < 50, f"Max memory usage too high: {max_memory_used}MB"
@@ -164,9 +164,9 @@ async def cleanup_test(model, dataset):
             # Memory should return close to baseline
             current_memory = self.get_memory_usage()
             memory_diff = current_memory - baseline_memory
-            assert (
-                memory_diff < 30
-            ), f"Memory not cleaned up properly: {memory_diff}MB after iteration {i}"
+            assert memory_diff < 30, (
+                f"Memory not cleaned up properly: {memory_diff}MB after iteration {i}"
+            )
 
     async def test_large_dataset_file_operations(self, tmp_path):
         initial_memory = self.get_memory_usage()
@@ -184,9 +184,9 @@ async def test_large_dataset_file_operations(self, tmp_path):
         # Memory shouldn't increase significantly during file operations
         after_save_memory = self.get_memory_usage()
         save_memory_increase = after_save_memory - initial_memory
-        assert (
-            save_memory_increase < 100
-        ), f"Save operation used too much memory: {save_memory_increase}MB"
+        assert save_memory_increase < 100, (
+            f"Save operation used too much memory: {save_memory_increase}MB"
+        )
 
         # Test loading from file
         del large_dataset
@@ -198,9 +198,9 @@ async def test_large_dataset_file_operations(self, tmp_path):
         # Memory after loading should be reasonable
         after_load_memory = self.get_memory_usage()
         load_memory_increase = after_load_memory - initial_memory
-        assert (
-            load_memory_increase < 150
-        ), f"Load operation used too much memory: {load_memory_increase}MB"
+        assert load_memory_increase < 150, (
+            f"Load operation used too much memory: {load_memory_increase}MB"
+        )
 
         # Verify file sizes are reasonable
         json_size = json_file.stat().st_size / 1024 / 1024  # MB
diff --git a/tests/test_results.py b/tests/test_results.py
index 8a63d84..6646f62 100644
--- a/tests/test_results.py
+++ b/tests/test_results.py
@@ -184,9 +184,9 @@ def test_cache_save_and_load(self, temp_cache_dir, sample_evaluation_result):
         cache_files = list(Path(temp_cache_dir).glob("*.json"))
         assert len(cache_files) > 0, f"No cache files created in {temp_cache_dir}"
 
-        assert (
-            loaded is not None
-        ), f"Failed to load cached result. Cache files: {cache_files}"
+        assert loaded is not None, (
+            f"Failed to load cached result. Cache files: {cache_files}"
+        )
         assert loaded.model_name == sample_evaluation_result.model_name
         assert loaded.test_name == sample_evaluation_result.test_name
 

From e0a72b45823bb18a6a66e1ad8bc09d1cd59b5fa8 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Thu, 11 Dec 2025 16:01:59 +0530
Subject: [PATCH 17/24] chore: Update mypy configuration for Python 3.12 and
 enhance type annotations in cli.py, client.py, core.py, and metrics.py

---
 benchwise/cli.py     |  21 ++--
 benchwise/client.py  |  13 ++-
 benchwise/core.py    |   2 +-
 benchwise/metrics.py | 256 +++++++++++++++++++++++--------------------
 benchwise/types.py   | 198 +++++++++++++++++++++++++++++----
 mypy.ini             |   2 +-
 6 files changed, 340 insertions(+), 152 deletions(-)

diff --git a/benchwise/cli.py b/benchwise/cli.py
index a2bd910..2dfe56b 100644
--- a/benchwise/cli.py
+++ b/benchwise/cli.py
@@ -5,7 +5,7 @@
 import argparse
 import asyncio
 import sys
-from typing import List, Optional, Any
+from typing import List, Optional
 
 from . import __version__
 from .datasets import load_dataset
@@ -13,6 +13,7 @@
 from .results import save_results, BenchmarkResult, EvaluationResult
 from .config import get_api_config, configure_benchwise
 from .client import get_client, sync_offline_results
+from .types import ConfigureArgs, ConfigKwargs, SyncArgs, StatusArgs
 
 
 def create_parser() -> argparse.ArgumentParser:
@@ -206,14 +207,14 @@ async def run_evaluation(
             for metric_name in metrics:
                 try:
                     if metric_name == "accuracy":
-                        metric_result: Any = accuracy(responses, references)
+                        metric_result = accuracy(responses, references)
                         results["accuracy"] = metric_result["accuracy"]
                     elif metric_name == "rouge_l":
-                        metric_result = rouge_l(responses, references)
-                        results["rouge_l_f1"] = metric_result["f1"]
+                        rouge_result = rouge_l(responses, references)
+                        results["rouge_l_f1"] = rouge_result["f1"]
                     elif metric_name == "semantic_similarity":
-                        metric_result = semantic_similarity(responses, references)
-                        results["semantic_similarity"] = metric_result[
+                        semantic_result = semantic_similarity(responses, references)
+                        results["semantic_similarity"] = semantic_result[
                             "mean_similarity"
                         ]
                     else:
@@ -285,7 +286,7 @@ async def run_evaluation(
     return benchmark_result
 
 
-async def configure_api(args: Any) -> None:
+async def configure_api(args: ConfigureArgs) -> None:
     """Configure Benchwise API settings."""
     from .config import reset_config
 
@@ -300,7 +301,7 @@ async def configure_api(args: Any) -> None:
         return
 
     # Update configuration
-    kwargs = {}
+    kwargs: ConfigKwargs = {}
     if args.api_url:
         kwargs["api_url"] = args.api_url
     if args.api_key:
@@ -321,7 +322,7 @@ async def configure_api(args: Any) -> None:
         print("No configuration changes specified. Use --show to see current config.")
 
 
-async def sync_offline(args: Any) -> None:
+async def sync_offline(args: SyncArgs) -> None:
     """Sync offline results with the API."""
     try:
         client = await get_client()
@@ -354,7 +355,7 @@ async def sync_offline(args: Any) -> None:
         pass
 
 
-async def show_status(args: Any) -> None:
+async def show_status(args: StatusArgs) -> None:
     """Show Benchwise status information."""
     config = get_api_config()
     client = None
diff --git a/benchwise/client.py b/benchwise/client.py
index fc724fb..3d5700e 100644
--- a/benchwise/client.py
+++ b/benchwise/client.py
@@ -2,12 +2,14 @@
 import asyncio
 import uuid
 import logging
-from typing import Dict, Any, Optional, List, cast
+import types
+from typing import Dict, Any, Optional, List, Type, cast
 from datetime import datetime
 from contextvars import ContextVar
 
 from .config import get_api_config
 from .results import EvaluationResult, BenchmarkResult
+from .types import OfflineQueueItem
 
 # Set up logger
 logger = logging.getLogger("benchwise.client")
@@ -64,7 +66,7 @@ def __init__(self, api_url: Optional[str] = None, api_key: Optional[str] = None)
         self.benchmark_cache: Dict[str, int] = {}
 
         # Offline queue for storing results when API is unavailable
-        self.offline_queue: List[Dict[str, Any]] = []
+        self.offline_queue: List[OfflineQueueItem] = []
         self.offline_mode = False
 
         # Track if client is closed
@@ -75,7 +77,12 @@ def __init__(self, api_url: Optional[str] = None, api_key: Optional[str] = None)
     async def __aenter__(self) -> "BenchwiseClient":
         return self
 
-    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+    async def __aexit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> None:
         await self.close()
 
     async def close(self) -> None:
diff --git a/benchwise/core.py b/benchwise/core.py
index 93fe89f..67db3bb 100644
--- a/benchwise/core.py
+++ b/benchwise/core.py
@@ -31,7 +31,7 @@ def evaluate(
     *models: str, upload: Optional[bool] = None, **kwargs: Any
 ) -> Callable[
     [Callable[..., Awaitable[Any]]],
-    Callable[[Dataset, Any], Awaitable[List[EvaluationResult]]],
+    Callable[[Dataset], Awaitable[List[EvaluationResult]]],
 ]:
     """
     Decorator for creating LLM evaluations.
diff --git a/benchwise/metrics.py b/benchwise/metrics.py
index f6a322d..aac4f1c 100644
--- a/benchwise/metrics.py
+++ b/benchwise/metrics.py
@@ -1,6 +1,16 @@
-from typing import List, Dict, Any, Tuple, Optional, Callable
+from typing import List, Dict, Any, Tuple, Optional, Callable, cast
 import numpy as np
-from benchwise.types import RougeScores
+from benchwise.types import (
+    RougeScores,
+    BleuScores,
+    BertScoreResults,
+    AccuracyResults,
+    SemanticSimilarityResults,
+    PerplexityResults,
+    FactualCorrectnessResults,
+    CoherenceResults,
+    SafetyResults,
+)
 from rouge_score import rouge_scorer
 from sacrebleu import BLEU
 import bert_score
@@ -173,7 +183,7 @@ def bleu_score(
     smooth_method: str = "exp",
     return_confidence: bool = True,
     max_n: int = 4,
-) -> Dict[str, Any]:
+) -> BleuScores:
     """
     Calculate enhanced BLEU scores for predictions vs references.
 
@@ -265,30 +275,31 @@ def bleu_score(
             for i in range(1, max_n + 1):
                 ngram_precisions[f"bleu_{i}"].append(0.0)
 
-    result = {
+    # Build result dict dynamically, then cast to BleuScores
+    result_dict: Dict[str, Any] = {
         "corpus_bleu": corpus_bleu,
-        "sentence_bleu": np.mean(sentence_scores),
-        "std_sentence_bleu": np.std(sentence_scores),
-        "median_sentence_bleu": np.median(sentence_scores),
+        "sentence_bleu": float(np.mean(sentence_scores)),
+        "std_sentence_bleu": float(np.std(sentence_scores)),
+        "median_sentence_bleu": float(np.median(sentence_scores)),
         "scores": sentence_scores,
     }
 
     # Add n-gram precision scores
     for key, scores in ngram_precisions.items():
         if scores:  # Only add if we have scores
-            result[key] = np.mean(scores)
-            result[f"{key}_std"] = np.std(scores)
+            result_dict[key] = float(np.mean(scores))
+            result_dict[f"{key}_std"] = float(np.std(scores))
 
     # Add confidence intervals if requested
     if return_confidence and len(sentence_scores) > 1:
         try:
-            result["sentence_bleu_confidence_interval"] = (
+            result_dict["sentence_bleu_confidence_interval"] = (
                 _bootstrap_confidence_interval(sentence_scores)
             )
         except Exception as e:
             warnings.warn(f"Could not calculate BLEU confidence intervals: {e}")
 
-    return result
+    return cast(BleuScores, result_dict)
 
 
 def _get_smoothing_function(smooth_method: str) -> Optional[Callable[..., Any]]:
@@ -320,7 +331,7 @@ def bert_score_metric(
     model_type: str = "distilbert-base-uncased",
     return_confidence: bool = True,
     batch_size: int = 64,
-) -> Dict[str, Any]:
+) -> BertScoreResults:
     """
     Calculate enhanced BERTScore for predictions vs references.
 
@@ -340,12 +351,15 @@ def bert_score_metric(
         )
 
     if not predictions or not references:
-        return {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1": 0.0,
-            "scores": {"precision": [], "recall": [], "f1": []},
-        }
+        return cast(
+            BertScoreResults,
+            {
+                "precision": 0.0,
+                "recall": 0.0,
+                "f1": 0.0,
+                "scores": {"precision": [], "recall": [], "f1": []},
+            },
+        )
 
     try:
         # Handle empty strings gracefully
@@ -394,16 +408,16 @@ def bert_score_metric(
             R_scores[idx] = r
             F1_scores[idx] = f1
 
-        result = {
-            "precision": np.mean(P_scores),
-            "recall": np.mean(R_scores),
-            "f1": np.mean(F1_scores),
-            "std_precision": np.std(P_scores),
-            "std_recall": np.std(R_scores),
-            "std_f1": np.std(F1_scores),
-            "min_f1": np.min(F1_scores),
-            "max_f1": np.max(F1_scores),
-            "median_f1": np.median(F1_scores),
+        result_dict: Dict[str, Any] = {
+            "precision": float(np.mean(P_scores)),
+            "recall": float(np.mean(R_scores)),
+            "f1": float(np.mean(F1_scores)),
+            "std_precision": float(np.std(P_scores)),
+            "std_recall": float(np.std(R_scores)),
+            "std_f1": float(np.std(F1_scores)),
+            "min_f1": float(np.min(F1_scores)),
+            "max_f1": float(np.max(F1_scores)),
+            "median_f1": float(np.median(F1_scores)),
             "model_used": model_type,
             "scores": {"precision": P_scores, "recall": R_scores, "f1": F1_scores},
         }
@@ -411,36 +425,39 @@ def bert_score_metric(
         # Add confidence intervals if requested
         if return_confidence and len(F1_scores) > 1:
             try:
-                result["f1_confidence_interval"] = _bootstrap_confidence_interval(
+                result_dict["f1_confidence_interval"] = _bootstrap_confidence_interval(
                     F1_scores
                 )
-                result["precision_confidence_interval"] = (
+                result_dict["precision_confidence_interval"] = (
                     _bootstrap_confidence_interval(P_scores)
                 )
-                result["recall_confidence_interval"] = _bootstrap_confidence_interval(
-                    R_scores
+                result_dict["recall_confidence_interval"] = (
+                    _bootstrap_confidence_interval(R_scores)
                 )
             except Exception as e:
                 warnings.warn(
                     f"Could not calculate BERTScore confidence intervals: {e}"
                 )
 
-        return result
+        return cast(BertScoreResults, result_dict)
 
     except Exception as e:
         warnings.warn(f"BERTScore calculation failed: {e}")
         # Return fallback scores
-        return {
-            "precision": 0.0,
-            "recall": 0.0,
-            "f1": 0.0,
-            "error": str(e),
-            "scores": {
-                "precision": [0.0] * len(predictions),
-                "recall": [0.0] * len(predictions),
-                "f1": [0.0] * len(predictions),
+        return cast(
+            BertScoreResults,
+            {
+                "precision": 0.0,
+                "recall": 0.0,
+                "f1": 0.0,
+                "error": str(e),
+                "scores": {
+                    "precision": [0.0] * len(predictions),
+                    "recall": [0.0] * len(predictions),
+                    "f1": [0.0] * len(predictions),
+                },
             },
-        }
+        )
 
 
 def accuracy(
@@ -451,7 +468,7 @@ def accuracy(
     fuzzy_match: bool = False,
     fuzzy_threshold: float = 0.8,
     return_confidence: bool = True,
-) -> Dict[str, Any]:
+) -> AccuracyResults:
     """
     Calculate enhanced exact match accuracy with multiple matching strategies.
 
@@ -473,7 +490,7 @@ def accuracy(
         )
 
     if not predictions or not references:
-        return {"accuracy": 0.0, "correct": 0, "total": 0}
+        return cast(AccuracyResults, {"accuracy": 0.0, "correct": 0, "total": 0})
 
     correct_exact = 0
     correct_fuzzy = 0
@@ -533,28 +550,28 @@ def accuracy(
     exact_accuracy = correct_exact / total if total > 0 else 0.0
     fuzzy_accuracy = correct_fuzzy / total if total > 0 else 0.0
 
-    result = {
+    result_dict: Dict[str, Any] = {
         "accuracy": exact_accuracy,
         "exact_accuracy": exact_accuracy,
         "fuzzy_accuracy": fuzzy_accuracy if fuzzy_match else exact_accuracy,
         "correct": correct_exact,
         "correct_fuzzy": correct_fuzzy if fuzzy_match else correct_exact,
         "total": total,
-        "mean_score": np.mean(individual_scores),
-        "std_score": np.std(individual_scores),
+        "mean_score": float(np.mean(individual_scores)),
+        "std_score": float(np.std(individual_scores)),
         "individual_scores": individual_scores,
         "match_types": match_types,
     }
 
     if return_confidence and len(individual_scores) > 1:
         try:
-            result["accuracy_confidence_interval"] = _bootstrap_confidence_interval(
-                individual_scores
+            result_dict["accuracy_confidence_interval"] = (
+                _bootstrap_confidence_interval(individual_scores)
             )
         except Exception as e:
             warnings.warn(f"Could not calculate accuracy confidence intervals: {e}")
 
-    return result
+    return cast(AccuracyResults, result_dict)
 
 
 def semantic_similarity(
@@ -564,7 +581,7 @@ def semantic_similarity(
     batch_size: int = 32,
     return_confidence: bool = True,
     similarity_threshold: float = 0.5,
-) -> Dict[str, Any]:
+) -> SemanticSimilarityResults:
     """
     Calculate enhanced semantic similarity using sentence embeddings.
 
@@ -585,7 +602,7 @@ def semantic_similarity(
         )
 
     if not predictions or not references:
-        return {"mean_similarity": 0.0, "scores": []}
+        return cast(SemanticSimilarityResults, {"mean_similarity": 0.0, "scores": []})
 
     try:
         from sentence_transformers import SentenceTransformer, util
@@ -655,37 +672,38 @@ def semantic_similarity(
     # Calculate enhanced statistics
     similarities_array = np.array(similarities)
 
-    result = {
-        "mean_similarity": np.mean(similarities),
-        "median_similarity": np.median(similarities),
-        "std_similarity": np.std(similarities),
-        "min_similarity": np.min(similarities),
-        "max_similarity": np.max(similarities),
-        "similarity_above_threshold": np.sum(similarities_array >= similarity_threshold)
-        / len(similarities),
+    result_dict: Dict[str, Any] = {
+        "mean_similarity": float(np.mean(similarities)),
+        "median_similarity": float(np.median(similarities)),
+        "std_similarity": float(np.std(similarities)),
+        "min_similarity": float(np.min(similarities)),
+        "max_similarity": float(np.max(similarities)),
+        "similarity_above_threshold": float(
+            np.sum(similarities_array >= similarity_threshold) / len(similarities)
+        ),
         "scores": similarities,
         "model_used": model_type,
     }
 
-    result["percentile_25"] = np.percentile(similarities, 25)
-    result["percentile_75"] = np.percentile(similarities, 75)
-    result["percentile_90"] = np.percentile(similarities, 90)
+    result_dict["percentile_25"] = float(np.percentile(similarities, 25))
+    result_dict["percentile_75"] = float(np.percentile(similarities, 75))
+    result_dict["percentile_90"] = float(np.percentile(similarities, 90))
 
     # Add confidence intervals if requested
     if return_confidence and len(similarities) > 1:
         try:
-            result["similarity_confidence_interval"] = _bootstrap_confidence_interval(
-                similarities
+            result_dict["similarity_confidence_interval"] = (
+                _bootstrap_confidence_interval(similarities)
             )
         except Exception as e:
             warnings.warn(
                 f"Could not calculate semantic similarity confidence intervals: {e}"
             )
 
-    return result
+    return cast(SemanticSimilarityResults, result_dict)
 
 
-def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, Any]:
+def perplexity(predictions: List[str], model_name: str = "gpt2") -> PerplexityResults:
     """
     Calculate perplexity of generated text.
 
@@ -720,11 +738,14 @@ def perplexity(predictions: List[str], model_name: str = "gpt2") -> Dict[str, An
             perplexity = torch.exp(loss).item()
             perplexities.append(perplexity)
 
-    return {
-        "mean_perplexity": float(np.mean(perplexities)),
-        "median_perplexity": float(np.median(perplexities)),
-        "scores": perplexities,
-    }
+    return cast(
+        PerplexityResults,
+        {
+            "mean_perplexity": float(np.mean(perplexities)),
+            "median_perplexity": float(np.median(perplexities)),
+            "scores": perplexities,
+        },
+    )
 
 
 def factual_correctness(
@@ -734,7 +755,7 @@ def factual_correctness(
     use_named_entities: bool = True,
     return_confidence: bool = True,
     detailed_analysis: bool = True,
-) -> Dict[str, Any]:
+) -> FactualCorrectnessResults:
     """
     Evaluate factual correctness of predictions using enhanced fact-checking methods.
 
@@ -755,7 +776,7 @@ def factual_correctness(
         )
 
     if not predictions or not references:
-        return {"mean_correctness": 0.0, "scores": []}
+        return cast(FactualCorrectnessResults, {"mean_correctness": 0.0, "scores": []})
 
     correctness_scores = []
     detailed_results = []
@@ -798,12 +819,12 @@ def factual_correctness(
         detailed_results.append(factual_analysis)
 
     # Compile results
-    result = {
-        "mean_correctness": np.mean(correctness_scores),
-        "median_correctness": np.median(correctness_scores),
-        "std_correctness": np.std(correctness_scores),
-        "min_correctness": np.min(correctness_scores),
-        "max_correctness": np.max(correctness_scores),
+    result_dict: Dict[str, Any] = {
+        "mean_correctness": float(np.mean(correctness_scores)),
+        "median_correctness": float(np.median(correctness_scores)),
+        "std_correctness": float(np.std(correctness_scores)),
+        "min_correctness": float(np.min(correctness_scores)),
+        "max_correctness": float(np.max(correctness_scores)),
         "scores": correctness_scores,
     }
 
@@ -811,33 +832,33 @@ def factual_correctness(
     if detailed_analysis:
         # Aggregate component scores
         components = ["entity_overlap", "keyword_overlap", "semantic_overlap"]
-        result["components"] = {}
+        result_dict["components"] = {}
 
         for component in components:
             component_scores = [
                 detail.get(component, 0.0) for detail in detailed_results
             ]
             if component_scores:
-                result["components"][component] = {
-                    "mean": np.mean(component_scores),
-                    "std": np.std(component_scores),
+                result_dict["components"][component] = {
+                    "mean": float(np.mean(component_scores)),
+                    "std": float(np.std(component_scores)),
                     "scores": component_scores,
                 }
 
-        result["detailed_results"] = detailed_results
+        result_dict["detailed_results"] = detailed_results
 
     # Add confidence intervals if requested
     if return_confidence and len(correctness_scores) > 1:
         try:
-            result["correctness_confidence_interval"] = _bootstrap_confidence_interval(
-                correctness_scores
+            result_dict["correctness_confidence_interval"] = (
+                _bootstrap_confidence_interval(correctness_scores)
             )
         except Exception as e:
             warnings.warn(
                 f"Could not calculate factual correctness confidence intervals: {e}"
             )
 
-    return result
+    return cast(FactualCorrectnessResults, result_dict)
 
 
 def _analyze_factual_correctness(
@@ -926,7 +947,6 @@ def _calculate_enhanced_keyword_overlap(prediction: str, reference: str) -> floa
 
     # Extract important words from reference
     important_ref_words: set[str] = set()
-    " ".join(ref_words)
 
     for pattern_type, pattern in important_patterns.items():
         matches = re.findall(pattern, reference, re.IGNORECASE)
@@ -1023,7 +1043,7 @@ def coherence_score(
     predictions: List[str],
     return_confidence: bool = True,
     detailed_analysis: bool = True,
-) -> Dict[str, Any]:
+) -> CoherenceResults:
     """
     Evaluate text coherence using enhanced linguistic and statistical metrics.
 
@@ -1036,7 +1056,7 @@ def coherence_score(
         Dictionary with enhanced coherence scores and analysis
     """
     if not predictions:
-        return {"mean_coherence": 1.0, "scores": []}
+        return cast(CoherenceResults, {"mean_coherence": 1.0, "scores": []})
 
     coherence_scores = []
     component_scores: Dict[str, List[float]] = {
@@ -1066,36 +1086,36 @@ def coherence_score(
                 component_scores[component].append(score)
 
     # Compile results
-    result = {
-        "mean_coherence": np.mean(coherence_scores),
-        "median_coherence": np.median(coherence_scores),
-        "std_coherence": np.std(coherence_scores),
-        "min_coherence": np.min(coherence_scores),
-        "max_coherence": np.max(coherence_scores),
+    result_dict: Dict[str, Any] = {
+        "mean_coherence": float(np.mean(coherence_scores)),
+        "median_coherence": float(np.median(coherence_scores)),
+        "std_coherence": float(np.std(coherence_scores)),
+        "min_coherence": float(np.min(coherence_scores)),
+        "max_coherence": float(np.max(coherence_scores)),
         "scores": coherence_scores,
     }
 
     # Add detailed component analysis if requested
     if detailed_analysis:
-        result["components"] = {}
+        result_dict["components"] = {}
         for component, scores in component_scores.items():
             if scores:  # Only add if we have scores
-                result["components"][component] = {
-                    "mean": np.mean(scores),
-                    "std": np.std(scores),
+                result_dict["components"][component] = {
+                    "mean": float(np.mean(scores)),
+                    "std": float(np.std(scores)),
                     "scores": scores,
                 }
 
     # Add confidence intervals if requested
     if return_confidence and len(coherence_scores) > 1:
         try:
-            result["coherence_confidence_interval"] = _bootstrap_confidence_interval(
-                coherence_scores
+            result_dict["coherence_confidence_interval"] = (
+                _bootstrap_confidence_interval(coherence_scores)
             )
         except Exception as e:
             warnings.warn(f"Could not calculate coherence confidence intervals: {e}")
 
-    return result
+    return cast(CoherenceResults, result_dict)
 
 
 def _analyze_text_coherence(text: str) -> Dict[str, float]:
@@ -1276,7 +1296,7 @@ def safety_score(
     return_confidence: bool = True,
     detailed_categories: bool = True,
     context_aware: bool = True,
-) -> Dict[str, Any]:
+) -> SafetyResults:
     """
     Evaluate safety of generated text with enhanced detection methods.
 
@@ -1293,7 +1313,7 @@ def safety_score(
         Dictionary with enhanced safety scores and category breakdowns
     """
     if not predictions:
-        return {"mean_safety": 1.0, "scores": []}
+        return cast(SafetyResults, {"mean_safety": 1.0, "scores": []})
 
     if safety_categories is None:
         safety_categories = [
@@ -1417,11 +1437,11 @@ def safety_score(
         violation_details.append(violations)
 
     # Compile results
-    result = {
-        "mean_safety": np.mean(safety_scores),
-        "median_safety": np.median(safety_scores),
-        "std_safety": np.std(safety_scores),
-        "min_safety": np.min(safety_scores),
+    result_dict: Dict[str, Any] = {
+        "mean_safety": float(np.mean(safety_scores)),
+        "median_safety": float(np.median(safety_scores)),
+        "std_safety": float(np.std(safety_scores)),
+        "min_safety": float(np.min(safety_scores)),
         "unsafe_count": len(
             [s for s in safety_scores if s < 0.8]
         ),  # Threshold for "unsafe"
@@ -1431,25 +1451,25 @@ def safety_score(
 
     # Add detailed category scores if requested
     if detailed_categories:
-        result["category_scores"] = {}
+        result_dict["category_scores"] = {}
         for cat in safety_categories:
             if category_scores[cat]:  # Only add if we have scores
-                result["category_scores"][cat] = {
-                    "mean": np.mean(category_scores[cat]),
-                    "violation_rate": 1.0 - np.mean(category_scores[cat]),
+                result_dict["category_scores"][cat] = {
+                    "mean": float(np.mean(category_scores[cat])),
+                    "violation_rate": float(1.0 - np.mean(category_scores[cat])),
                     "scores": category_scores[cat],
                 }
 
     # Add confidence intervals if requested
     if return_confidence and len(safety_scores) > 1:
         try:
-            result["safety_confidence_interval"] = _bootstrap_confidence_interval(
+            result_dict["safety_confidence_interval"] = _bootstrap_confidence_interval(
                 safety_scores
             )
         except Exception as e:
             warnings.warn(f"Could not calculate safety confidence intervals: {e}")
 
-    return result
+    return cast(SafetyResults, result_dict)
 
 
 def _check_keyword_in_context(
diff --git a/benchwise/types.py b/benchwise/types.py
index a351b09..38cf447 100644
--- a/benchwise/types.py
+++ b/benchwise/types.py
@@ -75,43 +75,81 @@ class RougeScores(TypedDict, total=False):
 class BleuScores(TypedDict, total=False):
     """Return type for BLEU metric scores."""
 
-    bleu: float
-    bleu1: float
-    bleu2: float
-    bleu3: float
-    bleu4: float
-    brevity_penalty: float
-    length_ratio: float
-    std_bleu: float
+    # Core fields (still optional to type checkers: total=False)
+    corpus_bleu: float
+    sentence_bleu: float
+    std_sentence_bleu: float
+    median_sentence_bleu: float
     scores: List[float]
-    # Optional confidence intervals
-    bleu_confidence_interval: Tuple[float, float]
+
+    # N-gram precision scores (dynamically added based on max_n)
+    bleu_1: float
+    bleu_1_std: float
+    bleu_2: float
+    bleu_2_std: float
+    bleu_3: float
+    bleu_3_std: float
+    bleu_4: float
+    bleu_4_std: float
+
+    # Optional confidence interval
+    sentence_bleu_confidence_interval: Tuple[float, float]
 
 
 class BertScoreResults(TypedDict, total=False):
     """Return type for BERT-Score metric."""
 
+    # Main scores
     precision: float
     recall: float
     f1: float
+
+    # Standard deviations
     std_precision: float
     std_recall: float
     std_f1: float
+
+    # Additional statistics
+    min_f1: float
+    max_f1: float
+    median_f1: float
+
+    # Metadata
+    model_used: str
+
+    # Individual scores per sample
     scores: Dict[str, List[float]]
+
     # Optional confidence intervals
     f1_confidence_interval: Tuple[float, float]
     precision_confidence_interval: Tuple[float, float]
     recall_confidence_interval: Tuple[float, float]
 
+    # Error field (when calculation fails)
+    error: str
+
 
 class AccuracyResults(TypedDict, total=False):
     """Return type for accuracy metric."""
 
+    # Main accuracy metrics
     accuracy: float
+    exact_accuracy: float
+    fuzzy_accuracy: float
+
+    # Counts
     correct: int
+    correct_fuzzy: int
     total: int
-    std_accuracy: float
-    scores: List[float]
+
+    # Statistical measures
+    mean_score: float
+    std_score: float
+
+    # Individual scores and match information
+    individual_scores: List[float]
+    match_types: List[str]
+
     # Optional confidence interval
     accuracy_confidence_interval: Tuple[float, float]
 
@@ -119,42 +157,126 @@ class AccuracyResults(TypedDict, total=False):
 class SemanticSimilarityResults(TypedDict, total=False):
     """Return type for semantic similarity metric."""
 
-    similarity: float
+    # Main similarity metrics
+    mean_similarity: float
+    median_similarity: float
     std_similarity: float
+    min_similarity: float
+    max_similarity: float
+
+    # Threshold-based metrics
+    similarity_above_threshold: float
+
+    # Percentiles
+    percentile_25: float
+    percentile_75: float
+    percentile_90: float
+
+    # Metadata
+    model_used: str
+
+    # Individual scores
     scores: List[float]
+
     # Optional confidence interval
     similarity_confidence_interval: Tuple[float, float]
 
 
+class PerplexityResults(TypedDict, total=False):
+    """Return type for perplexity metric."""
+
+    # Perplexity metrics
+    mean_perplexity: float
+    median_perplexity: float
+
+    # Individual scores
+    scores: List[float]
+
+
+class ComponentAnalysis(TypedDict, total=False):
+    """Aggregated component scores for factual correctness and coherence."""
+
+    mean: float
+    std: float
+    scores: List[float]
+
+
 class CoherenceResults(TypedDict, total=False):
     """Return type for coherence score metric."""
 
-    coherence: float
+    # Main coherence metrics
+    mean_coherence: float
+    median_coherence: float
     std_coherence: float
+    min_coherence: float
+    max_coherence: float
+
+    # Individual scores
     scores: List[float]
+
+    # Optional detailed component analysis
+    components: Dict[str, ComponentAnalysis]
+
     # Optional confidence interval
     coherence_confidence_interval: Tuple[float, float]
 
 
+class SafetyCategoryScore(TypedDict, total=False):
+    """Per-category safety score analysis."""
+
+    mean: float
+    violation_rate: float
+    scores: List[float]
+
+
 class SafetyResults(TypedDict, total=False):
     """Return type for safety score metric."""
 
-    safety: float
-    is_safe: bool
-    flagged_categories: List[str]
+    # Main safety metrics
+    mean_safety: float
+    median_safety: float
     std_safety: float
+    min_safety: float
+    unsafe_count: int
+
+    # Individual scores
     scores: List[float]
+
+    # Violation details per prediction
+    violation_details: List[List[str]]
+
+    # Optional detailed category analysis
+    category_scores: Dict[str, SafetyCategoryScore]
+
     # Optional confidence interval
     safety_confidence_interval: Tuple[float, float]
 
 
+class DetailedFactualAnalysis(TypedDict, total=False):
+    """Detailed factual analysis for a single prediction-reference pair."""
+
+    entity_overlap: float
+    keyword_overlap: float
+    semantic_overlap: float
+
+
 class FactualCorrectnessResults(TypedDict, total=False):
     """Return type for factual correctness metric."""
 
-    correctness: float
-    is_correct: bool
+    # Main correctness metrics
+    mean_correctness: float
+    median_correctness: float
     std_correctness: float
+    min_correctness: float
+    max_correctness: float
+
+    # Individual scores
     scores: List[float]
+
+    # Optional detailed analysis
+    components: Dict[str, ComponentAnalysis]
+    detailed_results: List[DetailedFactualAnalysis]
+
     # Optional confidence interval
     correctness_confidence_interval: Tuple[float, float]
 
@@ -322,3 +444,41 @@ def evaluate(
     ) -> Dict[str, float]:
         """Evaluate predictions against references."""
         ...
+
+
+class ConfigureArgs(Protocol):
+    """Arguments for configuring Benchwise."""
+
+    reset: bool
+    show: bool
+    api_url: str | None
+    api_key: str | None
+    upload: str | None
+
+
+class SyncArgs(Protocol):
+    """Arguments for sync command."""
+
+    dry_run: bool
+
+
+class StatusArgs(Protocol):
+    """Arguments for status command."""
+
+    api: bool
+    auth: bool
+
+
+class ConfigKwargs(TypedDict, total=False):
+    """Kwargs for configure_benchwise function."""
+
+    api_url: str
+    api_key: str
+    upload_enabled: bool
+
+
+class OfflineQueueItem(TypedDict):
+    """Item in offline queue."""
+
+    data: Dict[str, Any]
+    timestamp: str
diff --git a/mypy.ini b/mypy.ini
index 1383e41..f311680 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -1,5 +1,5 @@
 [mypy]
-python_version = 3.11
+python_version = 3.12
 files = benchwise
 
 # Strict type checking

From e59505070ddab34ba60d16c53a264c630ca32b61 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Thu, 11 Dec 2025 16:14:30 +0530
Subject: [PATCH 18/24] chore: Remove GEMINI.md file from repo

---
 GEMINI.md | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 GEMINI.md

diff --git a/GEMINI.md b/GEMINI.md
deleted file mode 100644
index d0892af..0000000
--- a/GEMINI.md
+++ /dev/null
@@ -1 +0,0 @@
-Follow CLAUDE.md

From 420e1691fa512a17e11552a0d091d49e311eceaf Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Thu, 11 Dec 2025 17:51:33 +0530
Subject: [PATCH 19/24] fix(core): Update evaluation function to use wrapper
 for benchmark metadata

---
 benchwise/core.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/benchwise/core.py b/benchwise/core.py
index 67db3bb..5c830ce 100644
--- a/benchwise/core.py
+++ b/benchwise/core.py
@@ -68,7 +68,7 @@ async def wrapper(
             dataset: Dataset, **test_kwargs: Any
         ) -> List[EvaluationResult]:
             return await _run_evaluation(
-                test_func, dataset, models, upload, kwargs, test_kwargs
+                test_func, wrapper, dataset, models, upload, kwargs, test_kwargs
             )
 
         if hasattr(test_func, "_benchmark_metadata"):
@@ -81,6 +81,7 @@ async def wrapper(
 
 async def _run_evaluation(
     test_func: Callable[..., Awaitable[Any]],
+    wrapper_func: Callable[..., Awaitable[Any]],
     dataset: Dataset,
     models: tuple[str, ...],
     upload: Optional[bool],
@@ -102,8 +103,8 @@ async def _run_evaluation(
             end_time = time.time()
 
             combined_metadata = decorator_kwargs.copy()
-            if hasattr(test_func, "_benchmark_metadata"):
-                combined_metadata.update(test_func._benchmark_metadata)
+            if hasattr(wrapper_func, "_benchmark_metadata"):
+                combined_metadata.update(wrapper_func._benchmark_metadata)
 
             eval_result = EvaluationResult(
                 model_name=model_name,
@@ -121,8 +122,8 @@ async def _run_evaluation(
             logger.error(f"✗ {model_name} failed: {e}", exc_info=True)
 
             combined_metadata = decorator_kwargs.copy()
-            if hasattr(test_func, "_benchmark_metadata"):
-                combined_metadata.update(test_func._benchmark_metadata)
+            if hasattr(wrapper_func, "_benchmark_metadata"):
+                combined_metadata.update(wrapper_func._benchmark_metadata)
 
             eval_result = EvaluationResult(
                 model_name=model_name,

From dcf430ea7bf1c1351fd4a19051e7c43a87208d01 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Thu, 11 Dec 2025 18:03:06 +0530
Subject: [PATCH 20/24] chore(ci): Update CI workflow to install metrics
 dependencies and fix test gpt adapter

---
 .github/workflows/ci.yml |  2 +-
 tests/test_models.py     | 16 +++++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e6ce6a1..acb16d8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e ".[dev,test]"
+          pip install -e ".[dev,test,metrics]"
 
       - name: Run ruff linter
         run: ruff check benchwise tests
diff --git a/tests/test_models.py b/tests/test_models.py
index f3f6add..bdaf260 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -59,9 +59,10 @@ def test_mock_cost_estimate(self):
 
 class TestGetModelAdapter:
     def test_get_gpt_adapter(self):
-        adapter = get_model_adapter("gpt-3.5-turbo")
-        assert isinstance(adapter, OpenAIAdapter)
-        assert adapter.model_name == "gpt-3.5-turbo"
+        with patch("openai.AsyncOpenAI"):
+            adapter = get_model_adapter("gpt-3.5-turbo")
+            assert isinstance(adapter, OpenAIAdapter)
+            assert adapter.model_name == "gpt-3.5-turbo"
 
     def test_get_claude_adapter(self):
         adapter = get_model_adapter("claude-3-haiku")
@@ -242,10 +243,11 @@ class TestModelNaming:
     def test_gpt_variants(self):
         models = ["gpt-3.5-turbo", "gpt-4", "gpt-4o"]
 
-        for model in models:
-            adapter = get_model_adapter(model)
-            assert isinstance(adapter, OpenAIAdapter)
-            assert adapter.model_name == model
+        with patch("openai.AsyncOpenAI"):
+            for model in models:
+                adapter = get_model_adapter(model)
+                assert isinstance(adapter, OpenAIAdapter)
+                assert adapter.model_name == model
 
     def test_claude_variants(self):
         models = ["claude-3-opus", "claude-3-sonnet", "claude-3-haiku"]

From a3211ea9520d8255478693996f7c267b8f50f7ec Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Thu, 11 Dec 2025 18:10:38 +0530
Subject: [PATCH 21/24] chore(ci): Update CI workflow to install all
 development dependencies

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index acb16d8..d39e2df 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e ".[dev,test,metrics]"
+          pip install -e ". [dev,all]"
 
       - name: Run ruff linter
         run: ruff check benchwise tests

From 1da265b2480209c45c02a2b289d33b84da9e6db9 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Sat, 13 Dec 2025 17:38:52 +0530
Subject: [PATCH 22/24] refactor: Improve type safety across codebase with
 TypedDicts

---
 .github/workflows/ci.yml  |   2 +-
 .gitignore                |   2 -
 benchwise/cli.py          |  47 +++-
 benchwise/client.py       | 196 +++++++++-----
 benchwise/core.py         |  94 +++++--
 benchwise/datasets.py     | 539 +++++++++++++++++++++++++++-----------
 benchwise/results.py      | 189 +++++++------
 benchwise/types.py        | 266 +++++++++++++++++--
 demo.py                   |  21 +-
 docs/test_load_dataset.py |   2 +
 10 files changed, 1003 insertions(+), 355 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d39e2df..a3ed4c0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e ". [dev,all]"
+          pip install -e ".[dev,all]"
 
       - name: Run ruff linter
         run: ruff check benchwise tests
diff --git a/.gitignore b/.gitignore
index 19573c7..aacac31 100644
--- a/.gitignore
+++ b/.gitignore
@@ -252,6 +252,4 @@ redis-data/
 celery-beat-schedule
 
 # AI files
-CLAUDE.md
-GEMINI.md
 test_single_doc_file.py
diff --git a/benchwise/cli.py b/benchwise/cli.py
index 2dfe56b..d71bab5 100644
--- a/benchwise/cli.py
+++ b/benchwise/cli.py
@@ -8,12 +8,12 @@
 from typing import List, Optional
 
 from . import __version__
-from .datasets import load_dataset
+from .datasets import load_dataset, convert_metadata_to_info
 from .models import get_model_adapter
 from .results import save_results, BenchmarkResult, EvaluationResult
 from .config import get_api_config, configure_benchwise
 from .client import get_client, sync_offline_results
-from .types import ConfigureArgs, ConfigKwargs, SyncArgs, StatusArgs
+from .types import ConfigureArgs, ConfigKwargs, SyncArgs, StatusArgs, DatasetInfo
 
 
 def create_parser() -> argparse.ArgumentParser:
@@ -136,15 +136,21 @@ async def run_evaluation(
         sys.exit(1)
 
     # Create benchmark result
+    from .types import EvaluationMetadata
+    from typing import cast
+
     benchmark_result = BenchmarkResult(
         benchmark_name=f"cli_evaluation_{dataset.name}",
-        metadata={
-            "dataset_path": dataset_path,
-            "models": models,
-            "metrics": metrics,
-            "temperature": temperature,
-            "max_tokens": max_tokens,
-        },
+        metadata=cast(
+            EvaluationMetadata,
+            {
+                "dataset_path": dataset_path,
+                "models": models,
+                "metrics": metrics,
+                "temperature": temperature,
+                "max_tokens": max_tokens,
+            },
+        ),
     )
 
     # Run evaluation for each model
@@ -239,7 +245,9 @@ async def run_evaluation(
                 model_name=model_name,
                 test_name="cli_evaluation",
                 result=results,
-                dataset_info=dataset.metadata,
+                dataset_info=convert_metadata_to_info(dataset.metadata)
+                if dataset.metadata
+                else None,
             )
 
             benchmark_result.add_result(eval_result)
@@ -251,7 +259,9 @@ async def run_evaluation(
                 model_name=model_name,
                 test_name="cli_evaluation",
                 error=str(e),
-                dataset_info=dataset.metadata,
+                dataset_info=convert_metadata_to_info(dataset.metadata)
+                if dataset.metadata
+                else None,
             )
             benchmark_result.add_result(eval_result)
             print(f"✗ {model_name} failed: {e}")
@@ -268,10 +278,23 @@ async def run_evaluation(
             try:
                 from .client import upload_results
 
+                # Extract dataset_info from dataset metadata for upload_results
+                # upload_results expects DatasetInfo
+                dataset_info_for_upload: DatasetInfo = cast(
+                    DatasetInfo,
+                    {
+                        "size": dataset.size,
+                        "task": "general",
+                        "tags": [],
+                    },
+                )
+                if dataset.metadata:
+                    dataset_info_for_upload = convert_metadata_to_info(dataset.metadata)
+
                 success = await upload_results(
                     benchmark_result.results,
                     benchmark_result.benchmark_name,
-                    benchmark_result.metadata,
+                    dataset_info_for_upload,
                 )
                 if success:
                     print("✅ Results uploaded to Benchwise API")
diff --git a/benchwise/client.py b/benchwise/client.py
index 3d5700e..6adc153 100644
--- a/benchwise/client.py
+++ b/benchwise/client.py
@@ -9,7 +9,21 @@
 
 from .config import get_api_config
 from .results import EvaluationResult, BenchmarkResult
-from .types import OfflineQueueItem
+from .types import (
+    OfflineQueueItem,
+    LoginResponse,
+    UserInfo,
+    ModelInfo,
+    BenchmarkInfo,
+    BenchmarkRegistrationData,
+    EvaluationInfo,
+    DatasetInfo,
+    EvaluationMetadata,
+    EvaluationResultDict,
+    UploadBenchmarkResponse,
+    FileUploadResponse,
+    TokenData,
+)
 
 # Set up logger
 logger = logging.getLogger("benchwise.client")
@@ -179,7 +193,7 @@ async def health_check(self) -> bool:
             logger.warning(f"Health check failed: {e}")
             return False
 
-    async def login(self, username: str, password: str) -> Dict[str, Any]:
+    async def login(self, username: str, password: str) -> LoginResponse:
         """
         Login with username/password to get JWT token.
 
@@ -197,14 +211,18 @@ async def login(self, username: str, password: str) -> Dict[str, Any]:
             )
 
             if response.status_code == 200:
-                token_data = response.json()
+                token_data = cast(TokenData, response.json())
                 self.jwt_token = token_data["access_token"]
                 self._set_auth_header()
 
                 # Get user info
                 user_info = await self.get_current_user()
                 logger.info(f"Login successful for user: {username}")
-                return {"token": token_data, "user": user_info}
+                login_response: LoginResponse = {
+                    "token": token_data,
+                    "user": user_info,
+                }
+                return login_response
             elif response.status_code == 401:
                 logger.error("Login failed: Invalid credentials")
                 raise BenchwiseAPIError("Invalid username or password")
@@ -218,7 +236,7 @@ async def login(self, username: str, password: str) -> Dict[str, Any]:
 
     async def register(
         self, username: str, email: str, password: str, full_name: Optional[str] = None
-    ) -> Dict[str, Any]:
+    ) -> UserInfo:
         """
         Register a new user account.
 
@@ -246,7 +264,7 @@ async def register(
 
             if response.status_code == 201:
                 logger.info(f"Registration successful for user: {username}")
-                return cast(Dict[str, Any], response.json())
+                return cast(UserInfo, response.json())
             elif response.status_code == 400:
                 error_detail = response.json().get("detail", "Registration failed")
                 logger.error(f"Registration failed: {error_detail}")
@@ -259,7 +277,7 @@ async def register(
             logger.error(f"Network error during registration: {e}")
             raise BenchwiseAPIError(f"Network error during registration: {e}")
 
-    async def get_current_user(self) -> Dict[str, Any]:
+    async def get_current_user(self) -> UserInfo:
         """
         Get current authenticated user information.
 
@@ -273,7 +291,7 @@ async def get_current_user(self) -> Dict[str, Any]:
             response = await self.client.get("/api/v1/users/me")
 
             if response.status_code == 200:
-                return cast(Dict[str, Any], response.json())
+                return cast(UserInfo, response.json())
             elif response.status_code == 401:
                 logger.warning("Authentication expired")
                 raise BenchwiseAPIError("Authentication expired - please login again")
@@ -288,7 +306,7 @@ async def get_current_user(self) -> Dict[str, Any]:
     # WIP: Simplified upload workflow (to be completed in future release)
     async def upload_benchmark_result_simple(
         self, benchmark_result: BenchmarkResult
-    ) -> Dict[str, Any]:
+    ) -> UploadBenchmarkResponse:
         """
         WIP: Simplified single-call upload for benchmark results.
 
@@ -345,8 +363,8 @@ async def register_model(
             response = await self.client.post("/api/v1/models", json=model_data)
 
             if response.status_code == 201:
-                model_info = cast(Dict[str, Any], response.json())
-                model_db_id = cast(int, model_info["id"])
+                model_info = cast(ModelInfo, response.json())
+                model_db_id = model_info["id"]
                 self.model_cache[cache_key] = model_db_id
                 logger.info(f"Model registered successfully with ID: {model_db_id}")
                 return model_db_id
@@ -372,12 +390,15 @@ async def _get_existing_model(self, provider: str, model_id: str) -> int:
             )
 
             if response.status_code == 200:
-                models = cast(List[Dict[str, Any]], response.json())
+                models = cast(List[ModelInfo], response.json())
                 # Filter in Python since backend doesn't support model_id parameter
                 for model in models:
-                    if model["provider"] == provider and model["model_id"] == model_id:
+                    if (
+                        model.get("provider") == provider
+                        and model.get("model_id") == model_id
+                    ):
                         cache_key = f"{provider}:{model_id}"
-                        model_id_value = cast(int, model["id"])
+                        model_id_value: int = model["id"]
                         self.model_cache[cache_key] = model_id_value
                         logger.debug(f"Found existing model with ID: {model_id_value}")
                         return model_id_value
@@ -392,7 +413,7 @@ async def _get_existing_model(self, provider: str, model_id: str) -> int:
             raise BenchwiseAPIError(f"Network error searching models: {e}")
 
     async def register_benchmark(
-        self, benchmark_name: str, description: str, dataset_info: Dict[str, Any]
+        self, benchmark_name: str, description: str, dataset_info: DatasetInfo
     ) -> int:
         """
         Register a benchmark and return its database ID.
@@ -419,11 +440,11 @@ async def register_benchmark(
 
         logger.info(f"Registering benchmark: {benchmark_name}")
         try:
-            benchmark_data = {
+            benchmark_data: BenchmarkRegistrationData = {
                 "name": benchmark_name,
                 "description": description,
-                "category": dataset_info.get("task", "general"),
-                "tags": dataset_info.get("tags", []),
+                "category": dataset_info.get("task", "general") or "general",
+                "tags": dataset_info.get("tags", []) or [],
                 "difficulty": dataset_info.get("difficulty"),
                 "dataset_url": dataset_info.get("source"),
                 "config": {},
@@ -434,8 +455,8 @@ async def register_benchmark(
             response = await self.client.post("/api/v1/benchmarks", json=benchmark_data)
 
             if response.status_code == 201:
-                benchmark_info = cast(Dict[str, Any], response.json())
-                benchmark_db_id = cast(int, benchmark_info["id"])
+                benchmark_info = cast(BenchmarkInfo, response.json())
+                benchmark_db_id = benchmark_info["id"]
                 self.benchmark_cache[benchmark_name] = benchmark_db_id
                 logger.info(
                     f"Benchmark registered successfully with ID: {benchmark_db_id}"
@@ -463,11 +484,11 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int:
             )
 
             if response.status_code == 200:
-                benchmarks = cast(List[Dict[str, Any]], response.json())
+                benchmarks = cast(List[BenchmarkInfo], response.json())
                 # Look for exact name match first, then partial match
                 for benchmark in benchmarks:
-                    if benchmark["name"] == benchmark_name:
-                        benchmark_id_value = cast(int, benchmark["id"])
+                    if benchmark.get("name") == benchmark_name:
+                        benchmark_id_value: int = benchmark["id"]
                         self.benchmark_cache[benchmark_name] = benchmark_id_value
                         logger.debug(
                             f"Found existing benchmark with ID: {benchmark_id_value}"
@@ -476,8 +497,9 @@ async def _get_existing_benchmark(self, benchmark_name: str) -> int:
 
                 # If no exact match, try partial match
                 for benchmark in benchmarks:
-                    if benchmark_name.lower() in benchmark["name"].lower():
-                        benchmark_id_value = cast(int, benchmark["id"])
+                    benchmark_name_val = benchmark.get("name", "")
+                    if benchmark_name.lower() in benchmark_name_val.lower():
+                        benchmark_id_value = benchmark["id"]
                         self.benchmark_cache[benchmark_name] = benchmark_id_value
                         logger.debug(
                             f"Found similar benchmark with ID: {benchmark_id_value}"
@@ -498,7 +520,7 @@ async def create_evaluation(
         name: str,
         benchmark_id: int,
         model_ids: List[int],
-        metadata: Optional[Dict[str, Any]] = None,
+        metadata: Optional[EvaluationMetadata] = None,
     ) -> int:
         """
         Create evaluation with correct backend format.
@@ -527,8 +549,8 @@ async def create_evaluation(
             )
 
             if response.status_code == 201:
-                evaluation_info = cast(Dict[str, Any], response.json())
-                evaluation_id = cast(int, evaluation_info["id"])
+                evaluation_info = cast(EvaluationInfo, response.json())
+                evaluation_id = evaluation_info["id"]
                 logger.info(f"Evaluation created successfully with ID: {evaluation_id}")
                 return evaluation_id
             elif response.status_code == 401:
@@ -555,7 +577,7 @@ async def create_evaluation(
             raise e
 
     async def upload_evaluation_results(
-        self, evaluation_id: int, results: List[Dict[str, Any]]
+        self, evaluation_id: int, results: List[EvaluationResultDict]
     ) -> bool:
         """
         Upload results to an existing evaluation using the correct endpoint.
@@ -603,7 +625,7 @@ async def upload_evaluation_results(
 
     async def upload_benchmark_result(
         self, benchmark_result: BenchmarkResult
-    ) -> Dict[str, Any]:
+    ) -> UploadBenchmarkResponse:
         """
         Upload a complete benchmark result using correct workflow.
 
@@ -620,12 +642,24 @@ async def upload_benchmark_result(
         try:
             # Step 1: Register benchmark if needed
             benchmark_name = benchmark_result.benchmark_name
+            description_value: Any = benchmark_result.metadata.get(
+                "description", f"Benchmark: {benchmark_name}"
+            )
+            description_str: str = (
+                description_value
+                if isinstance(description_value, str)
+                else f"Benchmark: {benchmark_name}"
+            )
+            dataset_info_value: Any = benchmark_result.metadata.get("dataset", {})
+            dataset_info_typed: DatasetInfo = (
+                cast(DatasetInfo, dataset_info_value)
+                if isinstance(dataset_info_value, dict)
+                else cast(DatasetInfo, {})
+            )
             benchmark_id = await self.register_benchmark(
                 benchmark_name=benchmark_name,
-                description=benchmark_result.metadata.get(
-                    "description", f"Benchmark: {benchmark_name}"
-                ),
-                dataset_info=benchmark_result.metadata.get("dataset", {}),
+                description=description_str,
+                dataset_info=dataset_info_typed,
             )
 
             # Step 2: Register models and collect their IDs
@@ -661,22 +695,11 @@ async def upload_benchmark_result(
             )
 
             # Step 4: Prepare and upload results
-            results_data = []
+            results_data: List[EvaluationResultDict] = []
             for result in benchmark_result.results:
                 if result.success and result.model_name in model_name_to_id:
-                    result_data = {
-                        "model_id": model_name_to_id[result.model_name],
-                        "metrics": result.result
-                        if isinstance(result.result, dict)
-                        else {"score": result.result},
-                        "outputs": {},  # Could include sample outputs if needed
-                        "metadata": {
-                            "duration": result.duration,
-                            "timestamp": result.timestamp.isoformat(),
-                            **result.metadata,
-                        },
-                    }
-                    results_data.append(result_data)
+                    # result.to_dict() already returns EvaluationResultDict
+                    results_data.append(result.to_dict())
 
             # Step 5: Upload results
             await self.upload_evaluation_results(evaluation_id, results_data)
@@ -684,13 +707,21 @@ async def upload_benchmark_result(
             logger.info(
                 f"Benchmark result uploaded successfully. Evaluation ID: {evaluation_id}"
             )
-            return {
+            # Build response with explicit types matching UploadBenchmarkResponse
+            # All values are properly typed:
+            # - evaluation_id: int (from create_evaluation)
+            # - benchmark_id: int (from register_benchmark)
+            # - model_ids: List[int] (from register_model)
+            # - len(results_data): int
+            # - message: str
+            response: UploadBenchmarkResponse = {
                 "id": evaluation_id,
                 "benchmark_id": benchmark_id,
                 "model_ids": model_ids,
                 "results_count": len(results_data),
                 "message": "Evaluation uploaded successfully",
             }
+            return response
 
         except Exception as e:
             # Add to offline queue for later sync
@@ -736,7 +767,7 @@ def _get_model_provider(self, model_name: str) -> str:
 
     async def get_benchmarks(
         self, limit: int = 50, skip: int = 0
-    ) -> List[Dict[str, Any]]:
+    ) -> List[BenchmarkInfo]:
         """Get available benchmarks from the API."""
         try:
             response = await self.client.get(
@@ -744,7 +775,7 @@ async def get_benchmarks(
             )
 
             if response.status_code == 200:
-                return cast(List[Dict[str, Any]], response.json())
+                return cast(List[BenchmarkInfo], response.json())
             else:
                 raise BenchwiseAPIError(
                     f"Failed to retrieve benchmarks: {response.status_code}"
@@ -755,7 +786,7 @@ async def get_benchmarks(
 
     async def get_evaluations(
         self, limit: int = 50, skip: int = 0
-    ) -> List[Dict[str, Any]]:
+    ) -> List[EvaluationInfo]:
         """Get evaluations from the API."""
         try:
             response = await self.client.get(
@@ -763,7 +794,7 @@ async def get_evaluations(
             )
 
             if response.status_code == 200:
-                return cast(List[Dict[str, Any]], response.json())
+                return cast(List[EvaluationInfo], response.json())
             else:
                 raise BenchwiseAPIError(
                     f"Failed to retrieve evaluations: {response.status_code}"
@@ -774,9 +805,11 @@ async def get_evaluations(
 
     async def _add_to_offline_queue(self, data: Dict[str, Any]) -> None:
         """Add data to offline queue for later upload."""
-        self.offline_queue.append(
-            {"data": data, "timestamp": datetime.now().isoformat()}
-        )
+        queue_item: OfflineQueueItem = {
+            "data": data,
+            "timestamp": datetime.now().isoformat(),
+        }
+        self.offline_queue.append(queue_item)
         self.offline_mode = True
         logger.info(f"Added to offline queue (size: {len(self.offline_queue)})")
 
@@ -792,21 +825,33 @@ async def sync_offline_queue(self) -> int:
 
         for item in self.offline_queue:
             try:
-                data = item["data"]
-                data_type = data.get("type")
+                queue_data: Dict[str, Any] = item["data"]
+                data_type: Any = queue_data.get("type")
 
                 if data_type == "full_benchmark_result":
                     # Reconstruct BenchmarkResult and upload
                     from .results import BenchmarkResult
 
-                    benchmark_result = BenchmarkResult(**data["benchmark_result"])
+                    benchmark_result_dict: Dict[str, Any] = queue_data.get(
+                        "benchmark_result", {}
+                    )
+                    benchmark_result = BenchmarkResult(**benchmark_result_dict)
                     await self.upload_benchmark_result(benchmark_result)
                 elif data_type == "create_evaluation":
-                    await self.create_evaluation(**data["data"])
+                    evaluation_data: Dict[str, Any] = queue_data.get("data", {})
+                    await self.create_evaluation(**evaluation_data)
                 elif data_type == "upload_results":
-                    await self.upload_evaluation_results(
-                        data["evaluation_id"], data["results"]
-                    )
+                    evaluation_id_value: Any = queue_data.get("evaluation_id")
+                    results_value: Any = queue_data.get("results")
+                    if isinstance(evaluation_id_value, int) and isinstance(
+                        results_value, list
+                    ):
+                        results_list: List[EvaluationResultDict] = cast(
+                            List[EvaluationResultDict], results_value
+                        )
+                        await self.upload_evaluation_results(
+                            evaluation_id_value, results_list
+                        )
 
                 synced_count += 1
                 logger.info(f"Synced item from {item['timestamp']}")
@@ -853,8 +898,8 @@ async def upload_dataset_for_benchmark(
                 )
 
             if response.status_code == 200:
-                result = cast(Dict[str, Any], response.json())
-                file_url = cast(str, result["file_info"]["url"])
+                result = cast(FileUploadResponse, response.json())
+                file_url = result["file_info"]["url"]
                 logger.info("Dataset uploaded successfully")
                 return file_url
             else:
@@ -895,8 +940,8 @@ async def create_benchmark_with_dataset(
                 f"Failed to create benchmark: {response.status_code}"
             )
 
-        benchmark = cast(Dict[str, Any], response.json())
-        benchmark_id = cast(int, benchmark["id"])
+        benchmark = cast(BenchmarkInfo, response.json())
+        benchmark_id = benchmark["id"]
 
         # 2. Upload dataset
         try:
@@ -948,7 +993,7 @@ async def close_client() -> None:
 
 
 async def upload_results(
-    results: List[EvaluationResult], test_name: str, dataset_info: Dict[str, Any]
+    results: List[EvaluationResult], test_name: str, dataset_info: DatasetInfo
 ) -> bool:
     """
     Convenience function to upload evaluation results.
@@ -974,7 +1019,10 @@ async def upload_results(
             benchmark_result = BenchmarkResult(
                 benchmark_name=test_name,
                 results=results,
-                metadata={"dataset": dataset_info},
+                metadata=cast(
+                    EvaluationMetadata,
+                    {"dataset": dataset_info},
+                ),
             )
             await client._add_to_offline_queue(
                 {
@@ -992,7 +1040,10 @@ async def upload_results(
             benchmark_result = BenchmarkResult(
                 benchmark_name=test_name,
                 results=results,
-                metadata={"dataset": dataset_info},
+                metadata=cast(
+                    EvaluationMetadata,
+                    {"dataset": dataset_info},
+                ),
             )
             await client._add_to_offline_queue(
                 {
@@ -1008,7 +1059,10 @@ async def upload_results(
         benchmark_result = BenchmarkResult(
             benchmark_name=test_name,
             results=results,
-            metadata={"dataset": dataset_info},
+            metadata=cast(
+                EvaluationMetadata,
+                {"dataset": dataset_info},
+            ),
         )
 
         response = await client.upload_benchmark_result(benchmark_result)
diff --git a/benchwise/core.py b/benchwise/core.py
index 5c830ce..df9c724 100644
--- a/benchwise/core.py
+++ b/benchwise/core.py
@@ -8,6 +8,7 @@
     ParamSpec,
     TypeVar,
     Awaitable,
+    cast,
 )
 from functools import wraps
 import asyncio
@@ -15,10 +16,18 @@
 import inspect
 import logging
 from .models import get_model_adapter
-from .datasets import Dataset
+from .datasets import Dataset, convert_metadata_to_info
 from .results import EvaluationResult
 from .config import get_api_config
 from .client import upload_results
+from .types import (
+    RunnerConfig,
+    ModelComparisonResult,
+    EvaluationResultDict,
+    EvaluationMetadata,
+    DatasetInfo,
+    CallableWithBenchmarkMetadata,
+)
 
 # Type variables for decorator typing
 P = ParamSpec("P")
@@ -71,8 +80,15 @@ async def wrapper(
                 test_func, wrapper, dataset, models, upload, kwargs, test_kwargs
             )
 
+        # Copy benchmark metadata if it exists
         if hasattr(test_func, "_benchmark_metadata"):
-            wrapper._benchmark_metadata = test_func._benchmark_metadata  # type: ignore[attr-defined]
+            # Type narrowing: test_func has _benchmark_metadata after hasattr check
+            benchmark_func = cast(CallableWithBenchmarkMetadata, test_func)
+            # Type the wrapper as having the metadata attribute
+            wrapper_with_metadata = cast(CallableWithBenchmarkMetadata, wrapper)
+            wrapper_with_metadata._benchmark_metadata = (
+                benchmark_func._benchmark_metadata
+            )
 
         return wrapper
 
@@ -104,15 +120,19 @@ async def _run_evaluation(
 
             combined_metadata = decorator_kwargs.copy()
             if hasattr(wrapper_func, "_benchmark_metadata"):
-                combined_metadata.update(wrapper_func._benchmark_metadata)
+                # Type narrowing: wrapper_func has _benchmark_metadata after hasattr check
+                benchmark_func = cast(CallableWithBenchmarkMetadata, wrapper_func)
+                combined_metadata.update(benchmark_func._benchmark_metadata)
 
             eval_result = EvaluationResult(
                 model_name=model_name,
                 test_name=test_func.__name__,
                 result=result,
                 duration=end_time - start_time,
-                dataset_info=dataset.metadata,
-                metadata=combined_metadata,
+                dataset_info=convert_metadata_to_info(dataset.metadata)
+                if dataset.metadata
+                else None,
+                metadata=cast(EvaluationMetadata, combined_metadata),
             )
             results.append(eval_result)
 
@@ -123,15 +143,19 @@ async def _run_evaluation(
 
             combined_metadata = decorator_kwargs.copy()
             if hasattr(wrapper_func, "_benchmark_metadata"):
-                combined_metadata.update(wrapper_func._benchmark_metadata)
+                # Type narrowing: wrapper_func has _benchmark_metadata after hasattr check
+                benchmark_func = cast(CallableWithBenchmarkMetadata, wrapper_func)
+                combined_metadata.update(benchmark_func._benchmark_metadata)
 
             eval_result = EvaluationResult(
                 model_name=model_name,
                 test_name=test_func.__name__,
                 error=str(e),
                 duration=0,
-                dataset_info=dataset.metadata,
-                metadata=combined_metadata,
+                dataset_info=convert_metadata_to_info(dataset.metadata)
+                if dataset.metadata
+                else None,
+                metadata=cast(EvaluationMetadata, combined_metadata),
             )
             results.append(eval_result)
 
@@ -141,7 +165,14 @@ async def _run_evaluation(
     if should_upload and results:
         try:
             logger.debug("Uploading results to Benchwise API")
-            await upload_results(results, test_func.__name__, dataset.metadata or {})
+            dataset_info_for_upload: DatasetInfo = (
+                convert_metadata_to_info(dataset.metadata)
+                if dataset.metadata
+                else cast(
+                    DatasetInfo, {"size": dataset.size, "task": "general", "tags": []}
+                )
+            )
+            await upload_results(results, test_func.__name__, dataset_info_for_upload)
             logger.info("Results uploaded successfully")
         except Exception as e:
             logger.warning(f"Upload failed (results saved locally): {e}")
@@ -164,7 +195,9 @@ async def medical_qa_test(model, dataset):
     """
 
     def decorator(test_func: Callable[P, R]) -> Callable[P, R]:
-        test_func._benchmark_metadata = {  # type: ignore[attr-defined]
+        # Add benchmark metadata to the function
+        benchmark_func = cast(CallableWithBenchmarkMetadata, test_func)
+        benchmark_func._benchmark_metadata = {
             "name": name,
             "description": description,
             **kwargs,
@@ -227,9 +260,9 @@ async def wrapper(
 class EvaluationRunner:
     """Main class for running evaluations."""
 
-    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
-        self.config: Dict[str, Any] = config or {}
-        self.results_cache: Dict[str, Any] = {}
+    def __init__(self, config: Optional[RunnerConfig] = None) -> None:
+        self.config: RunnerConfig = config or cast(RunnerConfig, {})
+        self.results_cache: Dict[str, EvaluationResultDict] = {}
         self.logger = logging.getLogger("benchwise.runner")
 
     async def run_evaluation(
@@ -255,13 +288,15 @@ async def run_evaluation(
 
     def compare_models(
         self, results: List[EvaluationResult], metric_name: Optional[str] = None
-    ) -> Dict[str, Any]:
+    ) -> ModelComparisonResult:
         """Compare model performance."""
         successful_results = [r for r in results if r.success]
 
         if not successful_results:
             self.logger.warning("No successful results to compare")
-            return {"error": "No successful results to compare"}
+            return cast(
+                ModelComparisonResult, {"error": "No successful results to compare"}
+            )
 
         model_scores = []
         for r in successful_results:
@@ -286,19 +321,28 @@ def compare_models(
             model_scores.append((r.model_name, score if score is not None else 0))
 
         if not model_scores:
-            return {"error": "No comparable scores found"}
+            return cast(ModelComparisonResult, {"error": "No comparable scores found"})
 
         model_scores.sort(key=lambda x: x[1], reverse=True)
 
-        comparison = {
-            "models": [r.model_name for r in successful_results],
-            "scores": [score for _, score in model_scores],
-            "best_model": model_scores[0][0],
-            "worst_model": model_scores[-1][0],
-            "ranking": [
-                {"model": name, "score": score} for name, score in model_scores
-            ],
-        }
+        comparison = cast(
+            ModelComparisonResult,
+            {
+                "ranking": [
+                    {"model": name, "score": float(score)}
+                    for name, score in model_scores
+                ],
+                "best_model": model_scores[0][0],
+                "best_score": float(model_scores[0][1]),
+                "worst_model": model_scores[-1][0],
+                "worst_score": float(model_scores[-1][1]),
+                "mean_score": float(
+                    sum(score for _, score in model_scores) / len(model_scores)
+                ),
+                "std_score": 0.0,  # Could calculate if needed
+                "total_models": len(model_scores),
+            },
+        )
 
         self.logger.info(
             f"Comparison complete: Best model is {comparison['best_model']}"
diff --git a/benchwise/datasets.py b/benchwise/datasets.py
index a4a60da..3459b97 100644
--- a/benchwise/datasets.py
+++ b/benchwise/datasets.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Any, Optional, Union, Callable
+from typing import List, Dict, Any, Optional, Union, Callable, cast
 import json
 import pandas as pd
 from pathlib import Path
@@ -6,6 +6,101 @@
 from dataclasses import dataclass
 import hashlib
 
+from .types import (
+    DatasetItem,
+    DatasetMetadata,
+    DatasetSchema,
+    DatasetDict,
+    DatasetInfo,
+)
+
+
+def _validate_dataset_item(item: Any) -> DatasetItem:
+    """
+    Validate and convert a dictionary to DatasetItem.
+
+    Args:
+        item: Dictionary or any value to validate
+
+    Returns:
+        Validated DatasetItem
+
+    Raises:
+        ValueError: If item is not a dictionary
+    """
+    if not isinstance(item, dict):
+        raise ValueError(f"Expected dict for DatasetItem, got {type(item).__name__}")
+    return cast(DatasetItem, item)
+
+
+def _validate_dataset_items(items: Any) -> List[DatasetItem]:
+    """
+    Validate and convert a list of dictionaries to List[DatasetItem].
+
+    Args:
+        items: List of dictionaries or any value to validate
+
+    Returns:
+        Validated List[DatasetItem]
+
+    Raises:
+        ValueError: If items is not a list or contains non-dict items
+    """
+    if not isinstance(items, list):
+        raise ValueError(f"Expected list for dataset data, got {type(items).__name__}")
+
+    validated_items: List[DatasetItem] = []
+    for i, item in enumerate(items):
+        if not isinstance(item, dict):
+            raise ValueError(
+                f"Expected dict for dataset item at index {i}, got {type(item).__name__}"
+            )
+        validated_items.append(cast(DatasetItem, item))
+
+    return validated_items
+
+
+def _validate_dataset_metadata(metadata: Any) -> Optional[DatasetMetadata]:
+    """
+    Validate and convert metadata to DatasetMetadata.
+
+    Args:
+        metadata: Dictionary or None to validate
+
+    Returns:
+        Validated DatasetMetadata or None
+    """
+    if metadata is None:
+        return None
+
+    if not isinstance(metadata, dict):
+        raise ValueError(
+            f"Expected dict or None for DatasetMetadata, got {type(metadata).__name__}"
+        )
+
+    return cast(DatasetMetadata, metadata)
+
+
+def _validate_dataset_schema(schema: Any) -> Optional[DatasetSchema]:
+    """
+    Validate and convert schema to DatasetSchema.
+
+    Args:
+        schema: Dictionary or None to validate
+
+    Returns:
+        Validated DatasetSchema or None
+    """
+    if schema is None:
+        return None
+
+    if not isinstance(schema, dict):
+        raise ValueError(
+            f"Expected dict or None for DatasetSchema, got {type(schema).__name__}"
+        )
+
+    return cast(DatasetSchema, schema)
+
 
 @dataclass
 class Dataset:
@@ -20,20 +115,23 @@ class Dataset:
     """
 
     name: str
-    data: List[Dict[str, Any]]
-    metadata: Optional[Dict[str, Any]] = None
-    schema: Optional[Dict[str, Any]] = None
+    data: List[DatasetItem]
+    metadata: Optional[DatasetMetadata] = None
+    schema: Optional[DatasetSchema] = None
 
     def __post_init__(self) -> None:
         if self.metadata is None:
-            self.metadata = {}
+            self.metadata = cast(DatasetMetadata, {})
 
         if not self.metadata:
-            self.metadata = {
-                "size": len(self.data),
-                "created_at": pd.Timestamp.now().isoformat(),
-                "hash": self._compute_hash(),
-            }
+            self.metadata = cast(
+                DatasetMetadata,
+                {
+                    "size": len(self.data),
+                    "created_at": pd.Timestamp.now().isoformat(),
+                    "hash": self._compute_hash(),
+                },
+            )
 
     def _compute_hash(self) -> str:
         """Compute hash of dataset for versioning."""
@@ -78,14 +176,17 @@ def references(self) -> List[str]:
                 references.append(str(ref))
         return references
 
-    def filter(self, condition: Callable[[Dict[str, Any]], bool]) -> "Dataset":
+    def filter(self, condition: Callable[[DatasetItem], bool]) -> "Dataset":
         """Filter dataset items based on condition."""
         filtered_data = [item for item in self.data if condition(item)]
-        metadata = self.metadata or {}
+        metadata = self.metadata or cast(DatasetMetadata, {})
         return Dataset(
             name=f"{self.name}_filtered",
             data=filtered_data,
-            metadata={**metadata, "filtered": True, "original_size": self.size},
+            metadata=cast(
+                DatasetMetadata,
+                {**metadata, "filtered": True, "original_size": self.size},
+            ),
         )
 
     def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset":
@@ -95,12 +196,16 @@ def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset":
         if random_state:
             random.seed(random_state)
 
-        sampled_data = random.sample(self.data, min(n, len(self.data)))
-        metadata = self.metadata or {}
+        sampled_data: List[DatasetItem] = random.sample(
+            self.data, min(n, len(self.data))
+        )
+        metadata = self.metadata or cast(DatasetMetadata, {})
         return Dataset(
             name=f"{self.name}_sample_{n}",
             data=sampled_data,
-            metadata={**metadata, "sampled": True, "sample_size": n},
+            metadata=cast(
+                DatasetMetadata, {**metadata, "sampled": True, "sample_size": n}
+            ),
         )
 
     def split(
@@ -112,43 +217,52 @@ def split(
         if random_state:
             random.seed(random_state)
 
-        shuffled_data = self.data.copy()
+        shuffled_data: List[DatasetItem] = self.data.copy()
         random.shuffle(shuffled_data)
 
         split_idx = int(len(shuffled_data) * train_ratio)
-        train_data = shuffled_data[:split_idx]
-        test_data = shuffled_data[split_idx:]
+        train_data: List[DatasetItem] = shuffled_data[:split_idx]
+        test_data: List[DatasetItem] = shuffled_data[split_idx:]
 
         train_dataset = Dataset(
             name=f"{self.name}_train",
             data=train_data,
-            metadata={
-                **(self.metadata or {}),
-                "split": "train",
-                "train_ratio": train_ratio,
-            },
+            metadata=cast(
+                DatasetMetadata,
+                {
+                    **(self.metadata or cast(DatasetMetadata, {})),
+                    "split": "train",
+                    "train_ratio": train_ratio,
+                },
+            ),
         )
 
         test_dataset = Dataset(
             name=f"{self.name}_test",
             data=test_data,
-            metadata={
-                **(self.metadata or {}),
-                "split": "test",
-                "train_ratio": train_ratio,
-            },
+            metadata=cast(
+                DatasetMetadata,
+                {
+                    **(self.metadata or cast(DatasetMetadata, {})),
+                    "split": "test",
+                    "test_ratio": 1 - train_ratio,
+                },
+            ),
         )
 
         return train_dataset, test_dataset
 
-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> DatasetDict:
         """Convert dataset to dictionary format."""
-        return {
-            "name": self.name,
-            "data": self.data,
-            "metadata": self.metadata,
-            "schema": self.schema,
-        }
+        return cast(
+            DatasetDict,
+            {
+                "name": self.name,
+                "data": self.data,
+                "metadata": self.metadata,
+                "schema": self.schema,
+            },
+        )
 
     def to_json(self, file_path: Optional[str] = None) -> str:
         """Export dataset to JSON format."""
@@ -170,7 +284,14 @@ def validate_schema(self) -> bool:
         if not self.schema:
             return True
 
-        required_fields = self.schema.get("required", [])
+        # Support both "required" and "required_fields" for backward compatibility
+        # Check if "required" key exists first, then fall back to "required_fields"
+        if "required" in self.schema:
+            required_fields = self.schema["required"]
+        elif "required_fields" in self.schema:
+            required_fields = self.schema["required_fields"]
+        else:
+            required_fields = []
 
         for item in self.data:
             for field in required_fields:
@@ -210,7 +331,7 @@ def get_statistics(self) -> Dict[str, Any]:
         return stats
 
 
-def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dataset:
+def load_dataset(source: Union[str, Path, DatasetDict], **kwargs: Any) -> Dataset:
     """
     Load dataset from various sources.
 
@@ -223,11 +344,19 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat
     """
 
     if isinstance(source, dict):
+        # Type narrowing: after isinstance check, treat as DatasetDict
+        # Note: .get() on TypedDict with total=False returns Any for optional keys,
+        # but we know the structure from DatasetDict, so we use proper type annotations
+        dataset_dict: DatasetDict = source
+        data: List[DatasetItem] = dataset_dict.get("data", [])
+        metadata: Optional[DatasetMetadata] = dataset_dict.get("metadata")
+        schema: Optional[DatasetSchema] = dataset_dict.get("schema")
+
         return Dataset(
             name=kwargs.get("name", "custom_dataset"),
-            data=source.get("data", []),
-            metadata=source.get("metadata", {}),
-            schema=source.get("schema"),
+            data=_validate_dataset_items(data),
+            metadata=_validate_dataset_metadata(metadata),
+            schema=_validate_dataset_schema(schema),
         )
 
     elif isinstance(source, (str, Path)):
@@ -235,20 +364,24 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat
 
         if source_path.suffix == ".json":
             with open(source_path, "r") as f:
-                data = json.load(f)
+                json_data = json.load(f)
 
-            if isinstance(data, dict) and "data" in data:
+            if isinstance(json_data, dict) and "data" in json_data:
                 return Dataset(
-                    name=data.get("name", source_path.stem),
-                    data=data["data"],
-                    metadata=data.get("metadata", {}),
-                    schema=data.get("schema"),
+                    name=json_data.get("name", source_path.stem)
+                    if isinstance(json_data.get("name"), str)
+                    else source_path.stem,
+                    data=_validate_dataset_items(json_data["data"]),
+                    metadata=_validate_dataset_metadata(json_data.get("metadata")),
+                    schema=_validate_dataset_schema(json_data.get("schema")),
                 )
-            elif isinstance(data, list):
+            elif isinstance(json_data, list):
                 return Dataset(
-                    name=kwargs.get("name", source_path.stem),
-                    data=data,
-                    metadata=kwargs.get("metadata", {}),
+                    name=kwargs.get("name", source_path.stem)
+                    if isinstance(kwargs.get("name"), str)
+                    else source_path.stem,
+                    data=_validate_dataset_items(json_data),
+                    metadata=_validate_dataset_metadata(kwargs.get("metadata", {})),
                 )
             else:
                 raise ValueError(
@@ -258,12 +391,19 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat
         elif source_path.suffix == ".csv":
             df = pd.read_csv(source_path)
             # Type cast: pandas to_dict returns dict[Hashable, Any] but we need dict[str, Any]
-            data = [dict(record) for record in df.to_dict("records")]
+            records: List[Dict[str, Any]] = [
+                cast(Dict[str, Any], dict(record)) for record in df.to_dict("records")
+            ]
+            csv_data: List[DatasetItem] = [
+                cast(DatasetItem, record) for record in records
+            ]
 
             return Dataset(
-                name=kwargs.get("name", source_path.stem),
-                data=data,
-                metadata=kwargs.get("metadata", {}),
+                name=kwargs.get("name", source_path.stem)
+                if isinstance(kwargs.get("name"), str)
+                else source_path.stem,
+                data=csv_data,
+                metadata=_validate_dataset_metadata(kwargs.get("metadata")),
             )
 
         elif str(source).startswith(("http://", "https://")):
@@ -273,19 +413,23 @@ def load_dataset(source: Union[str, Path, Dict[str, Any]], **kwargs: Any) -> Dat
             response.raise_for_status()
 
             if source_str.endswith(".json"):
-                data = response.json()
-                if isinstance(data, dict) and "data" in data:
+                json_data = response.json()
+                if isinstance(json_data, dict) and "data" in json_data:
                     return Dataset(
-                        name=data.get("name", "remote_dataset"),
-                        data=data["data"],
-                        metadata=data.get("metadata", {}),
-                        schema=data.get("schema"),
+                        name=json_data.get("name", "remote_dataset")
+                        if isinstance(json_data.get("name"), str)
+                        else "remote_dataset",
+                        data=_validate_dataset_items(json_data["data"]),
+                        metadata=_validate_dataset_metadata(json_data.get("metadata")),
+                        schema=_validate_dataset_schema(json_data.get("schema")),
                     )
-                elif isinstance(data, list):
+                elif isinstance(json_data, list):
                     return Dataset(
-                        name=kwargs.get("name", "remote_dataset"),
-                        data=data,
-                        metadata=kwargs.get("metadata", {}),
+                        name=kwargs.get("name", "remote_dataset")
+                        if isinstance(kwargs.get("name"), str)
+                        else "remote_dataset",
+                        data=_validate_dataset_items(json_data),
+                        metadata=_validate_dataset_metadata(kwargs.get("metadata", {})),
                     )
                 else:
                     raise ValueError(
@@ -322,23 +466,32 @@ def create_qa_dataset(
     if len(questions) != len(answers):
         raise ValueError("Questions and answers must have the same length")
 
-    data = [{"question": q, "answer": a} for q, a in zip(questions, answers)]
+    data: List[DatasetItem] = [
+        cast(DatasetItem, {"question": q, "answer": a})
+        for q, a in zip(questions, answers)
+    ]
 
     return Dataset(
-        name=kwargs.get("name", "qa_dataset"),
+        name=kwargs.get("name", "qa_dataset")
+        if isinstance(kwargs.get("name"), str)
+        else "qa_dataset",
         data=data,
-        metadata={
-            "task": "question_answering",
-            "size": len(data),
-            **kwargs.get("metadata", {}),
-        },
-        schema={
-            "required": ["question", "answer"],
-            "properties": {
-                "question": {"type": "string"},
-                "answer": {"type": "string"},
+        metadata=cast(
+            DatasetMetadata,
+            {
+                "task": "question_answering",
+                "size": len(data),
+                **kwargs.get("metadata", {}),
             },
-        },
+        ),
+        schema=cast(
+            DatasetSchema,
+            {
+                "required": ["question", "answer"],
+                "prompt_field": "question",
+                "reference_field": "answer",
+            },
+        ),
     )
 
 
@@ -360,25 +513,32 @@ def create_summarization_dataset(
     if len(documents) != len(summaries):
         raise ValueError("Documents and summaries must have the same length")
 
-    data = [
-        {"document": doc, "summary": summ} for doc, summ in zip(documents, summaries)
+    data: List[DatasetItem] = [
+        cast(DatasetItem, {"document": doc, "summary": summ})
+        for doc, summ in zip(documents, summaries)
     ]
 
     return Dataset(
-        name=kwargs.get("name", "summarization_dataset"),
+        name=kwargs.get("name", "summarization_dataset")
+        if isinstance(kwargs.get("name"), str)
+        else "summarization_dataset",
         data=data,
-        metadata={
-            "task": "summarization",
-            "size": len(data),
-            **kwargs.get("metadata", {}),
-        },
-        schema={
-            "required": ["document", "summary"],
-            "properties": {
-                "document": {"type": "string"},
-                "summary": {"type": "string"},
+        metadata=cast(
+            DatasetMetadata,
+            {
+                "task": "summarization",
+                "size": len(data),
+                **kwargs.get("metadata", {}),
+            },
+        ),
+        schema=cast(
+            DatasetSchema,
+            {
+                "required": ["document", "summary"],
+                "prompt_field": "document",
+                "reference_field": "summary",
             },
-        },
+        ),
     )
 
 
@@ -400,21 +560,33 @@ def create_classification_dataset(
     if len(texts) != len(labels):
         raise ValueError("Texts and labels must have the same length")
 
-    data = [{"text": text, "label": label} for text, label in zip(texts, labels)]
+    data: List[DatasetItem] = [
+        cast(DatasetItem, {"text": text, "label": label})
+        for text, label in zip(texts, labels)
+    ]
 
     return Dataset(
-        name=kwargs.get("name", "classification_dataset"),
+        name=kwargs.get("name", "classification_dataset")
+        if isinstance(kwargs.get("name"), str)
+        else "classification_dataset",
         data=data,
-        metadata={
-            "task": "classification",
-            "size": len(data),
-            "unique_labels": list(set(labels)),
-            **kwargs.get("metadata", {}),
-        },
-        schema={
-            "required": ["text", "label"],
-            "properties": {"text": {"type": "string"}, "label": {"type": "string"}},
-        },
+        metadata=cast(
+            DatasetMetadata,
+            {
+                "task": "classification",
+                "size": len(data),
+                "unique_labels": list(set(labels)),
+                **kwargs.get("metadata", {}),
+            },
+        ),
+        schema=cast(
+            DatasetSchema,
+            {
+                "required": ["text", "label"],
+                "prompt_field": "text",
+                "reference_field": "label",
+            },
+        ),
     )
 
 
@@ -446,73 +618,144 @@ def clear(self) -> None:
 
 
 def load_mmlu_sample() -> Dataset:
-    sample_data = [
-        {
-            "question": "What is the capital of France?",
-            "choices": ["London", "Berlin", "Paris", "Madrid"],
-            "answer": "Paris",
-            "subject": "geography",
-        },
-        {
-            "question": "What is 2 + 2?",
-            "choices": ["3", "4", "5", "6"],
-            "answer": "4",
-            "subject": "mathematics",
-        },
+    sample_data: List[DatasetItem] = [
+        cast(
+            DatasetItem,
+            {
+                "question": "What is the capital of France?",
+                "choices": ["London", "Berlin", "Paris", "Madrid"],
+                "answer": "Paris",
+                "subject": "geography",
+            },
+        ),
+        cast(
+            DatasetItem,
+            {
+                "question": "What is 2 + 2?",
+                "choices": ["3", "4", "5", "6"],
+                "answer": "4",
+                "subject": "mathematics",
+            },
+        ),
     ]
 
     return Dataset(
         name="mmlu_sample",
         data=sample_data,
-        metadata={
-            "task": "multiple_choice_qa",
-            "source": "MMLU",
-            "description": "Sample from Massive Multitask Language Understanding",
-        },
+        metadata=cast(
+            DatasetMetadata,
+            {
+                "task": "multiple_choice_qa",
+                "source": "MMLU",
+                "description": "Sample from Massive Multitask Language Understanding",
+            },
+        ),
     )
 
 
 def load_hellaswag_sample() -> Dataset:
     """Load a sample of HellaSwag dataset."""
-    sample_data = [
-        {
-            "context": "A woman is outside with a bucket and a dog. The dog is running around trying to avoid a bath. She",
-            "endings": [
-                "rinses the bucket off with soap and blow dry the dog.",
-                "uses a hose to keep the dog from getting soapy.",
-                "gets the dog wet, then it runs away again.",
-                "gets into the bath tub with the dog.",
-            ],
-            "label": 2,
-        }
+    sample_data: List[DatasetItem] = [
+        cast(
+            DatasetItem,
+            {
+                "context": "A woman is outside with a bucket and a dog. The dog is running around trying to avoid a bath. She",
+                "endings": [
+                    "rinses the bucket off with soap and blow dry the dog.",
+                    "uses a hose to keep the dog from getting soapy.",
+                    "gets the dog wet, then it runs away again.",
+                    "gets into the bath tub with the dog.",
+                ],
+                "label": 2,
+            },
+        )
     ]
 
     return Dataset(
         name="hellaswag_sample",
         data=sample_data,
-        metadata={
-            "task": "sentence_completion",
-            "source": "HellaSwag",
-            "description": "Commonsense reasoning benchmark",
-        },
+        metadata=cast(
+            DatasetMetadata,
+            {
+                "task": "sentence_completion",
+                "source": "HellaSwag",
+                "description": "Commonsense reasoning benchmark",
+            },
+        ),
     )
 
 
 def load_gsm8k_sample() -> Dataset:
     """Load a sample of GSM8K (Grade School Math 8K) dataset."""
-    sample_data = [
-        {
-            "question": "Janet's ducks lay 16 eggs per day. She eats 3 for breakfast every morning and bakes 4 into muffins for her friends every day. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much money does she make every day at the farmers' market?",
-            "answer": "Janet sells 16 - 3 - 4 = 9 duck eggs every day. She makes 9 * $2 = $18 every day at the farmers' market.",
-        }
+    sample_data: List[DatasetItem] = [
+        cast(
+            DatasetItem,
+            {
+                "question": "Janet's ducks lay 16 eggs per day. She eats 3 for breakfast every morning and bakes 4 into muffins for her friends every day. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much money does she make every day at the farmers' market?",
+                "answer": "Janet sells 16 - 3 - 4 = 9 duck eggs every day. She makes 9 * $2 = $18 every day at the farmers' market.",
+            },
+        )
     ]
 
     return Dataset(
         name="gsm8k_sample",
         data=sample_data,
-        metadata={
-            "task": "math_word_problems",
-            "source": "GSM8K",
-            "description": "Grade school math word problems",
-        },
+        metadata=cast(
+            DatasetMetadata,
+            {
+                "task": "math_word_problems",
+                "source": "GSM8K",
+                "description": "Grade school math word problems",
+            },
+        ),
     )
+
+
+def convert_metadata_to_info(metadata: DatasetMetadata) -> DatasetInfo:
+    """
+    Convert DatasetMetadata to DatasetInfo for evaluation results.
+
+    Copies dataset metadata (stored with the dataset) into the dataset info used in
+    evaluation results. Missing keys fall back to defaults (size 0, empty tags, task
+    "general", None otherwise); "difficulty" is included only when it is non-empty.
+
+    Args:
+        metadata: Dataset metadata to convert
+
+    Returns:
+        DatasetInfo with converted fields
+    """
+    # Extract fields that exist in DatasetMetadata
+    size: int = metadata.get("size", 0)
+    tags: List[str] = metadata.get("tags", [])
+    source: Optional[str] = metadata.get("source")
+    name: Optional[str] = metadata.get("name")
+    description: Optional[str] = metadata.get("description")
+    version: Optional[str] = metadata.get("version")
+    created_at: Optional[str] = metadata.get("created_at")
+
+    # Extract fields that might exist but aren't in DatasetMetadata TypedDict
+    # These could be present at runtime even if not in the type definition
+    metadata_dict: Dict[str, Any] = cast(Dict[str, Any], metadata)
+    hash_value: Optional[str] = metadata_dict.get("hash")
+    task: Optional[str] = metadata_dict.get("task")
+    difficulty: Optional[str] = metadata_dict.get("difficulty")
+
+    # Build DatasetInfo with proper types
+    info: DatasetInfo = {
+        "size": size,
+        "tags": tags,
+        "source": source,
+        "name": name,
+        "description": description,
+        "version": version,
+        "created_at": created_at,
+        "hash": hash_value,
+        "task": task if task else "general",
+    }
+
+    # Add difficulty if available
+    if difficulty:
+        info["difficulty"] = difficulty
+
+    return info
diff --git a/benchwise/results.py b/benchwise/results.py
index 54768ff..e0c7044 100644
--- a/benchwise/results.py
+++ b/benchwise/results.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, List, Optional, Union, cast
 from dataclasses import dataclass, field
 from datetime import datetime
 import json
@@ -7,6 +7,19 @@
 import numpy as np
 import hashlib
 
+from .types import (
+    DatasetInfo,
+    EvaluationMetadata,
+    EvaluationResultDict,
+    BenchmarkSummary,
+    BenchmarkResultDict,
+    ModelComparisonResult,
+    CrossBenchmarkComparison,
+    ModelPerformanceAnalysis,
+    CachedResultInfo,
+    BenchmarkComparisonInfo,
+)
+
 
 @dataclass
 class EvaluationResult:
@@ -28,9 +41,11 @@ class EvaluationResult:
     test_name: str
     result: Any = None
     duration: float = 0.0
-    dataset_info: Optional[Dict[str, Any]] = None
+    dataset_info: Optional[DatasetInfo] = None
     error: Optional[str] = None
-    metadata: Dict[str, Any] = field(default_factory=dict)
+    metadata: EvaluationMetadata = field(
+        default_factory=lambda: cast(EvaluationMetadata, {})
+    )
     timestamp: datetime = field(default_factory=datetime.now)
 
     @property
@@ -43,19 +58,22 @@ def failed(self) -> bool:
         """Whether the evaluation failed."""
         return self.error is not None
 
-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> EvaluationResultDict:
         """Convert result to dictionary format."""
-        return {
-            "model_name": self.model_name,
-            "test_name": self.test_name,
-            "result": self.result,
-            "duration": self.duration,
-            "dataset_info": self.dataset_info,
-            "error": self.error,
-            "metadata": self.metadata,
-            "timestamp": self.timestamp.isoformat(),
-            "success": self.success,
-        }
+        return cast(
+            EvaluationResultDict,
+            {
+                "model_name": self.model_name,
+                "test_name": self.test_name,
+                "result": self.result,
+                "duration": self.duration,
+                "dataset_info": self.dataset_info,
+                "error": self.error,
+                "metadata": self.metadata,
+                "timestamp": self.timestamp.isoformat(),
+                "success": self.success,
+            },
+        )
 
     def get_score(self, metric_name: Optional[str] = None) -> Union[float, Any]:
         """
@@ -90,7 +108,9 @@ class BenchmarkResult:
 
     benchmark_name: str
     results: List[EvaluationResult] = field(default_factory=list)
-    metadata: Dict[str, Any] = field(default_factory=dict)
+    metadata: EvaluationMetadata = field(
+        default_factory=lambda: cast(EvaluationMetadata, {})
+    )
     timestamp: datetime = field(default_factory=datetime.now)
 
     def add_result(self, result: EvaluationResult) -> None:
@@ -157,7 +177,9 @@ def get_worst_model(
             successful_results, key=lambda r: r.get_score(metric_name) or float("inf")
         )
 
-    def compare_models(self, metric_name: Optional[str] = None) -> Dict[str, Any]:
+    def compare_models(
+        self, metric_name: Optional[str] = None
+    ) -> ModelComparisonResult:
         """
         Compare all models in the benchmark.
 
@@ -169,7 +191,9 @@ def compare_models(self, metric_name: Optional[str] = None) -> Dict[str, Any]:
         """
         successful_results = self.successful_results
         if not successful_results:
-            return {"error": "No successful results to compare"}
+            return cast(
+                ModelComparisonResult, {"error": "No successful results to compare"}
+            )
 
         scores = [result.get_score(metric_name) for result in successful_results]
         model_names = [result.model_name for result in successful_results]
@@ -182,22 +206,25 @@ def compare_models(self, metric_name: Optional[str] = None) -> Dict[str, Any]:
         ]
 
         if not valid_scores:
-            return {"error": "No valid scores found"}
+            return cast(ModelComparisonResult, {"error": "No valid scores found"})
 
         sorted_results = sorted(valid_scores, key=lambda x: x[1], reverse=True)
 
-        return {
-            "ranking": [
-                {"model": name, "score": score} for name, score in sorted_results
-            ],
-            "best_model": sorted_results[0][0],
-            "best_score": sorted_results[0][1],
-            "worst_model": sorted_results[-1][0],
-            "worst_score": sorted_results[-1][1],
-            "mean_score": np.mean([score for _, score in valid_scores]),
-            "std_score": np.std([score for _, score in valid_scores]),
-            "total_models": len(valid_scores),
-        }
+        return cast(
+            ModelComparisonResult,
+            {
+                "ranking": [
+                    {"model": name, "score": score} for name, score in sorted_results
+                ],
+                "best_model": sorted_results[0][0],
+                "best_score": sorted_results[0][1],
+                "worst_model": sorted_results[-1][0],
+                "worst_score": sorted_results[-1][1],
+                "mean_score": float(np.mean([score for _, score in valid_scores])),
+                "std_score": float(np.std([score for _, score in valid_scores])),
+                "total_models": len(valid_scores),
+            },
+        )
 
     def get_model_result(self, model_name: str) -> Optional[EvaluationResult]:
         """Get result for a specific model."""
@@ -206,20 +233,24 @@ def get_model_result(self, model_name: str) -> Optional[EvaluationResult]:
                 return result
         return None
 
-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> BenchmarkResultDict:
         """Convert benchmark result to dictionary format."""
-        return {
-            "benchmark_name": self.benchmark_name,
-            "results": [result.to_dict() for result in self.results],
-            "metadata": self.metadata,
-            "timestamp": self.timestamp.isoformat(),
-            "summary": {
-                "total_models": len(self.results),
-                "successful_models": len(self.successful_results),
-                "failed_models": len(self.failed_results),
-                "success_rate": self.success_rate,
-            },
+        summary: BenchmarkSummary = {
+            "total_models": len(self.results),
+            "successful_models": len(self.successful_results),
+            "failed_models": len(self.failed_results),
+            "success_rate": self.success_rate,
         }
+        return cast(
+            BenchmarkResultDict,
+            {
+                "benchmark_name": self.benchmark_name,
+                "results": [result.to_dict() for result in self.results],
+                "metadata": self.metadata,
+                "timestamp": self.timestamp.isoformat(),
+                "summary": summary,
+            },
+        )
 
     def to_dataframe(self) -> pd.DataFrame:
         """Convert results to pandas DataFrame for analysis."""
@@ -262,7 +293,7 @@ class ResultsAnalyzer:
     @staticmethod
     def compare_benchmarks(
         benchmark_results: List[BenchmarkResult], metric_name: Optional[str] = None
-    ) -> Dict[str, Any]:
+    ) -> CrossBenchmarkComparison:
         """
         Compare results across multiple benchmarks.
 
@@ -273,11 +304,12 @@ def compare_benchmarks(
         Returns:
             Dictionary with cross-benchmark comparison
         """
-        comparison: Dict[str, Any] = {
+        comparison: CrossBenchmarkComparison = {
             "benchmarks": [],
-            "models": set(),
+            "models": [],
             "cross_benchmark_scores": {},
         }
+        models_set: set[str] = set()
 
         for benchmark in benchmark_results:
             benchmark_info = {
@@ -287,8 +319,10 @@ def compare_benchmarks(
                 "success_rate": benchmark.success_rate,
             }
 
-            comparison["benchmarks"].append(benchmark_info)
-            comparison["models"].update(benchmark.model_names)
+            comparison["benchmarks"].append(
+                cast(BenchmarkComparisonInfo, benchmark_info)
+            )
+            models_set.update(benchmark.model_names)
 
             # Collect scores for each model
             for result in benchmark.successful_results:
@@ -302,14 +336,14 @@ def compare_benchmarks(
                     benchmark.benchmark_name
                 ] = score
 
-        comparison["models"] = list(comparison["models"])
+        comparison["models"] = list(models_set)
 
         return comparison
 
     @staticmethod
     def analyze_model_performance(
         results: List[EvaluationResult], metric_name: Optional[str] = None
-    ) -> Dict[str, Any]:
+    ) -> ModelPerformanceAnalysis:
         """
         Analyze performance of a single model across multiple evaluations.
 
@@ -321,32 +355,38 @@ def analyze_model_performance(
             Dictionary with performance analysis
         """
         if not results:
-            return {"error": "No results provided"}
+            return cast(ModelPerformanceAnalysis, {"error": "No results provided"})
 
         model_name = results[0].model_name
         successful_results = [r for r in results if r.success]
 
         if not successful_results:
-            return {"error": "No successful results found"}
+            return cast(
+                ModelPerformanceAnalysis, {"error": "No successful results found"}
+            )
 
         scores = [result.get_score(metric_name) for result in successful_results]
         valid_scores = [score for score in scores if score is not None]
 
         if not valid_scores:
-            return {"error": "No valid scores found"}
-
-        return {
-            "model_name": model_name,
-            "total_evaluations": len(results),
-            "successful_evaluations": len(successful_results),
-            "success_rate": len(successful_results) / len(results),
-            "mean_score": np.mean(valid_scores),
-            "median_score": np.median(valid_scores),
-            "std_score": np.std(valid_scores),
-            "min_score": np.min(valid_scores),
-            "max_score": np.max(valid_scores),
-            "score_range": np.max(valid_scores) - np.min(valid_scores),
-        }
+            return cast(ModelPerformanceAnalysis, {"error": "No valid scores found"})
+
+        return cast(
+            ModelPerformanceAnalysis,
+            {
+                "model_name": model_name,
+                "total_evaluations": len(results),
+                "successful_evaluations": len(successful_results),
+                "failed_evaluations": len(results) - len(successful_results),
+                "success_rate": len(successful_results) / len(results),
+                "mean_score": float(np.mean(valid_scores)),
+                "median_score": float(np.median(valid_scores)),
+                "std_score": float(np.std(valid_scores)),
+                "min_score": float(np.min(valid_scores)),
+                "max_score": float(np.max(valid_scores)),
+                "scores": valid_scores,
+            },
+        )
 
     @staticmethod
     def generate_report(
@@ -529,7 +569,7 @@ def clear_cache(self) -> None:
         for cache_file in self.cache_dir.glob("*.json"):
             cache_file.unlink()
 
-    def list_cached_results(self) -> List[Dict[str, Any]]:
+    def list_cached_results(self) -> List[CachedResultInfo]:
         """List all cached results."""
         results = []
         for cache_file in self.cache_dir.glob("*.json"):
@@ -537,12 +577,15 @@ def list_cached_results(self) -> List[Dict[str, Any]]:
                 with open(cache_file, "r") as f:
                     data = json.load(f)
                 results.append(
-                    {
-                        "file": cache_file.name,
-                        "model_name": data.get("model_name"),
-                        "test_name": data.get("test_name"),
-                        "timestamp": data.get("timestamp"),
-                    }
+                    cast(
+                        CachedResultInfo,
+                        {
+                            "file": cache_file.name,
+                            "model_name": data.get("model_name"),
+                            "test_name": data.get("test_name"),
+                            "timestamp": data.get("timestamp"),
+                        },
+                    )
                 )
             except Exception:
                 continue
diff --git a/benchwise/types.py b/benchwise/types.py
index 38cf447..fd24e5e 100644
--- a/benchwise/types.py
+++ b/benchwise/types.py
@@ -298,7 +298,7 @@ class DatasetItem(TypedDict, total=False):
     summary: str
     # Additional fields
     id: str
-    metadata: Dict[str, Any]
+    metadata: "EvaluationMetadata"
 
 
 class DatasetMetadata(TypedDict, total=False):
@@ -318,10 +318,43 @@ class DatasetSchema(TypedDict, total=False):
 
     prompt_field: str
     reference_field: str
-    required_fields: List[str]
+    required: List[str]  # Required fields in dataset items
+    required_fields: List[str]  # Alias for backward compatibility
     optional_fields: List[str]
 
 
+class DatasetInfo(TypedDict, total=False):
+    """Information about a dataset used in evaluation."""
+
+    size: int
+    task: str
+    tags: List[str]
+    difficulty: Optional[str]
+    source: Optional[str]
+    name: Optional[str]
+    description: Optional[str]
+    version: Optional[str]
+    hash: Optional[str]
+    created_at: Optional[str]
+
+
+class DatasetStatistics(TypedDict, total=False):
+    """Statistics about a dataset."""
+
+    size: int
+    fields: List[str]
+    metadata: Optional[DatasetMetadata]
+
+
+class DatasetDict(TypedDict, total=False):
+    """Dictionary representation of a dataset."""
+
+    name: str
+    data: List[DatasetItem]
+    metadata: Optional[DatasetMetadata]
+    schema: Optional[DatasetSchema]
+
+
 # Configuration Types
 class ConfigDict(TypedDict, total=False):
     """Configuration dictionary for BenchWise."""
@@ -342,33 +375,81 @@ class ConfigDict(TypedDict, total=False):
 
 
 # Results Types
+class EvaluationMetadata(TypedDict, total=False):
+    """Metadata for an evaluation result."""
+
+    temperature: float
+    max_tokens: int
+    model_version: str
+    dataset_hash: str
+    evaluation_id: Optional[int]
+    benchmark_id: Optional[int]
+    dataset: DatasetInfo  # Dataset information for the evaluation
+    description: str  # Description of the evaluation/benchmark
+    dataset_path: str  # Path to the dataset file used in evaluation
+    models: List[str]  # List of models evaluated
+    metrics: List[str]  # List of metrics used in evaluation
+    # Every key is optional (total=False). A TypedDict does not accept undeclared
+    # keys under type checking, so user-defined metadata fields need a cast.
+
+
 class EvaluationResultDict(TypedDict, total=False):
     """Serialized evaluation result."""
 
-    model: str
-    prompt: str
-    response: str
-    score: float
-    scores: Dict[str, float]
-    metadata: Dict[str, Any]
+    model_name: str
+    test_name: str
+    result: Any
+    duration: float
+    dataset_info: Optional[DatasetInfo]
+    error: Optional[str]
+    metadata: EvaluationMetadata
     timestamp: str
     success: bool
-    error: Optional[str]
+
+
+class BenchmarkSummary(TypedDict):
+    """Summary statistics for a benchmark."""
+
+    total_models: int
+    successful_models: int
+    failed_models: int
+    success_rate: float
 
 
 class BenchmarkResultDict(TypedDict, total=False):
     """Serialized benchmark result."""
 
     benchmark_name: str
-    benchmark_description: str
     results: List[EvaluationResultDict]
-    summary: Dict[str, Any]
+    metadata: EvaluationMetadata
     timestamp: str
+    summary: BenchmarkSummary
 
 
-class ComparisonResult(TypedDict):
+class ModelRanking(TypedDict):
+    """Ranking entry for a model."""
+
+    model: str
+    score: float
+
+
+class ModelComparisonResult(TypedDict, total=False):
     """Result of model comparison."""
 
+    ranking: List[ModelRanking]
+    best_model: str
+    best_score: float
+    worst_model: str
+    worst_score: float
+    mean_score: float
+    std_score: float
+    total_models: int
+    error: Optional[str]
+
+
+class ComparisonResult(TypedDict):
+    """Result of model comparison (legacy format)."""
+
     best_model: str
     best_score: float
     rankings: List[Tuple[str, float]]
@@ -376,11 +457,76 @@ class ComparisonResult(TypedDict):
 
 
 # API Response Types
+class TokenData(TypedDict, total=False):
+    """JWT token data from login."""
+
+    access_token: str
+    token_type: str
+    expires_in: Optional[int]
+    refresh_token: Optional[str]
+
+
 class LoginResponse(TypedDict):
     """Response from login endpoint."""
 
-    token: Dict[str, str]
-    user: Dict[str, Any]
+    token: TokenData
+    user: "UserInfo"  # Forward reference
+
+
+class ModelInfo(TypedDict, total=False):
+    """Model information from API."""
+
+    id: int
+    name: str
+    provider: str
+    model_id: str  # Provider-specific model identifier
+    description: Optional[str]
+    is_active: bool
+    pricing: Optional[PricingInfo]
+    metadata: Optional[EvaluationMetadata]
+
+
+class BenchmarkRegistrationData(TypedDict, total=False):
+    """Data for registering a benchmark with the API."""
+
+    name: str
+    description: str
+    category: str
+    tags: List[str]
+    difficulty: Optional[str]
+    dataset_url: Optional[str]
+    config: Dict[str, Any]
+    metadata: DatasetInfo
+    is_public: bool
+
+
+class BenchmarkInfo(TypedDict, total=False):
+    """Benchmark information from API."""
+
+    id: int
+    name: str
+    description: Optional[str]
+    category: Optional[str]
+    tags: List[str]
+    difficulty: Optional[str]
+    dataset_url: Optional[str]
+    config: Dict[str, Any]  # API config can be arbitrary
+    metadata: Optional[DatasetInfo]
+    is_public: bool
+    created_at: Optional[str]
+
+
+class EvaluationInfo(TypedDict, total=False):
+    """Evaluation information from API."""
+
+    id: int
+    benchmark_id: int
+    model_id: int
+    test_name: str
+    status: str
+    results: Optional[Dict[str, Any]]  # Results can be arbitrary
+    metadata: Optional[EvaluationMetadata]
+    created_at: Optional[str]
 
 
 class UserInfo(TypedDict, total=False):
@@ -393,8 +539,8 @@ class UserInfo(TypedDict, total=False):
     is_active: bool
 
 
-class UploadResultsResponse(TypedDict):
-    """Response from upload results endpoint."""
+class UploadBenchmarkResponse(TypedDict):
+    """Response from upload benchmark result endpoint."""
 
     id: int
     benchmark_id: int
@@ -403,6 +549,12 @@ class UploadResultsResponse(TypedDict):
     message: str
 
 
+class FileUploadResponse(TypedDict, total=False):
+    """Response from file upload endpoint."""
+
+    file_info: Dict[str, str]  # Contains url and other file metadata
+
+
 # Protocols
 class SupportsGenerate(Protocol):
     """Protocol for objects that support text generation."""
@@ -446,6 +598,23 @@ def evaluate(
         ...
 
 
+class BenchmarkMetadataDict(TypedDict, total=False):
+    """Metadata attached to benchmark functions."""
+
+    name: str
+    description: str
+
+
+class CallableWithBenchmarkMetadata(Protocol):
+    """Protocol for callables that may have benchmark metadata attached."""
+
+    _benchmark_metadata: Dict[str, Any]
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        """Call the function."""
+        ...
+
+
 class ConfigureArgs(Protocol):
     """Arguments for configuring Benchwise."""
 
@@ -480,5 +649,68 @@ class ConfigKwargs(TypedDict, total=False):
 class OfflineQueueItem(TypedDict):
     """Item in offline queue."""
 
-    data: Dict[str, Any]
+    data: Dict[str, Any]  # Can contain different operation types
+    timestamp: str
+
+
+class RunnerConfig(TypedDict, total=False):
+    """Configuration for EvaluationRunner."""
+
+    cache_enabled: bool
+    upload_enabled: bool
+    timeout: float
+    max_retries: int
+    debug: bool
+    verbose: bool
+
+
+class CacheEntry(TypedDict, total=False):
+    """Entry in results cache."""
+
+    result: EvaluationResultDict
+    dataset_hash: str
+    timestamp: str
+
+
+class CachedResultInfo(TypedDict, total=False):
+    """Information about a cached result."""
+
+    file: str
+    model_name: Optional[str]
+    test_name: Optional[str]
+    timestamp: Optional[str]
+    dataset_hash: Optional[str]
+
+
+class BenchmarkComparisonInfo(TypedDict, total=False):
+    """Information about a benchmark in cross-benchmark comparison."""
+
+    name: str
     timestamp: str
+    models: List[str]
+    success_rate: float
+
+
+class CrossBenchmarkComparison(TypedDict, total=False):
+    """Result of comparing multiple benchmarks."""
+
+    benchmarks: List[BenchmarkComparisonInfo]
+    models: List[str]
+    cross_benchmark_scores: Dict[str, Dict[str, Optional[float]]]
+
+
+class ModelPerformanceAnalysis(TypedDict, total=False):
+    """Performance analysis for a single model."""
+
+    model_name: str
+    total_evaluations: int
+    successful_evaluations: int
+    failed_evaluations: int
+    success_rate: float
+    mean_score: float
+    std_score: float
+    min_score: float
+    max_score: float
+    median_score: float
+    scores: List[float]
+    error: Optional[str]
diff --git a/demo.py b/demo.py
index f7072e6..39342c4 100644
--- a/demo.py
+++ b/demo.py
@@ -1,5 +1,11 @@
 import asyncio
-from benchwise import evaluate, benchmark, create_qa_dataset, accuracy, semantic_similarity
+from benchwise import (
+    evaluate,
+    benchmark,
+    create_qa_dataset,
+    accuracy,
+    semantic_similarity,
+)
 
 # Create your dataset
 qa_dataset = create_qa_dataset(
@@ -8,18 +14,19 @@
         "Who wrote '1984'?",
         "What is the speed of light?",
         "Explain photosynthesis in one sentence.",
-        "What causes rainbows?"
+        "What causes rainbows?",
     ],
     answers=[
         "Tokyo",
         "George Orwell",
         "299,792,458 meters per second",
         "Photosynthesis is the process by which plants convert sunlight into energy.",
-        "Rainbows are caused by light refraction and reflection in water droplets."
+        "Rainbows are caused by light refraction and reflection in water droplets.",
     ],
-    name="general_knowledge_qa"
+    name="general_knowledge_qa",
 )
 
+
 @benchmark("General Knowledge QA", "Tests basic factual knowledge")
 @evaluate("gpt-3.5-turbo", "gemini-2.5-flash-lite")
 async def test_general_knowledge(model, dataset):
@@ -31,9 +38,10 @@ async def test_general_knowledge(model, dataset):
     return {
         "accuracy": acc["accuracy"],
         "semantic_similarity": similarity["mean_similarity"],
-        "total_questions": len(responses)
+        "total_questions": len(responses),
     }
 
+
 # Run the evaluation
 async def main():
     results = await test_general_knowledge(qa_dataset)
@@ -47,4 +55,5 @@ async def main():
         else:
             print(f"{result.model_name}: FAILED - {result.error}")
 
-asyncio.run(main())
\ No newline at end of file
+
+asyncio.run(main())
diff --git a/docs/test_load_dataset.py b/docs/test_load_dataset.py
index b0ef5da..6e7b5f5 100644
--- a/docs/test_load_dataset.py
+++ b/docs/test_load_dataset.py
@@ -5,6 +5,7 @@
 # Assuming data.json is in the same directory as this script for testing purposes
 data_file_path = "data.json"
 
+
 def test_load_dataset_from_json():
     # Load the dataset
     dataset = load_dataset(data_file_path)
@@ -17,6 +18,7 @@ def test_load_dataset_from_json():
 
     print("Successfully loaded dataset and assertions passed!")
 
+
 if __name__ == "__main__":
     # Create a dummy data.json file for testing if it doesn't exist
     if not os.path.exists(data_file_path):

From 1f7352bf090b0fd06b03b85b72e4127a201f7197 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Sat, 13 Dec 2025 17:50:22 +0530
Subject: [PATCH 23/24] fix(datasets): Enhance dataset loading logic to
 prioritize name from DatasetDict

---
 benchwise/datasets.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchwise/datasets.py b/benchwise/datasets.py
index 3459b97..2a0681e 100644
--- a/benchwise/datasets.py
+++ b/benchwise/datasets.py
@@ -348,12 +348,19 @@ def load_dataset(source: Union[str, Path, DatasetDict], **kwargs: Any) -> Datase
         # Note: .get() on TypedDict with total=False returns Any for optional keys,
         # but we know the structure from DatasetDict, so we use proper type annotations
         dataset_dict: DatasetDict = source
+        # Prefer name from DatasetDict if present, otherwise fall back to kwargs
+        name_from_dict: Optional[str] = dataset_dict.get("name")
+        name: str = (
+            name_from_dict
+            if isinstance(name_from_dict, str)
+            else kwargs.get("name", "custom_dataset")
+        )
         data: List[DatasetItem] = dataset_dict.get("data", [])
         metadata: Optional[DatasetMetadata] = dataset_dict.get("metadata")
         schema: Optional[DatasetSchema] = dataset_dict.get("schema")
 
         return Dataset(
-            name=kwargs.get("name", "custom_dataset"),
+            name=name,
             data=_validate_dataset_items(data),
             metadata=_validate_dataset_metadata(metadata),
             schema=_validate_dataset_schema(schema),

From 37c697acab3adff8e7b477b86adac48764d61ff5 Mon Sep 17 00:00:00 2001
From: Anurag Yadav <yadavanurag1310@gmail.com>
Date: Sat, 13 Dec 2025 18:44:18 +0530
Subject: [PATCH 24/24] refactor(cli, client, config, datasets, metrics): Hoist
 function-local imports to module level and drop the redundant local import
 statements

---
 benchwise/cli.py      | 35 +++++++++++++++++++----------------
 benchwise/client.py   |  9 +--------
 benchwise/config.py   |  6 ++----
 benchwise/datasets.py |  5 +----
 benchwise/metrics.py  |  4 +---
 5 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/benchwise/cli.py b/benchwise/cli.py
index d71bab5..e136dc8 100644
--- a/benchwise/cli.py
+++ b/benchwise/cli.py
@@ -4,16 +4,30 @@
 
 import argparse
 import asyncio
+import os
 import sys
-from typing import List, Optional
+from typing import List, Optional, cast
 
 from . import __version__
 from .datasets import load_dataset, convert_metadata_to_info
 from .models import get_model_adapter
-from .results import save_results, BenchmarkResult, EvaluationResult
-from .config import get_api_config, configure_benchwise
-from .client import get_client, sync_offline_results
-from .types import ConfigureArgs, ConfigKwargs, SyncArgs, StatusArgs, DatasetInfo
+from .results import (
+    save_results,
+    BenchmarkResult,
+    EvaluationResult,
+    load_results,
+    ResultsAnalyzer,
+)
+from .config import get_api_config, configure_benchwise, reset_config
+from .client import get_client, sync_offline_results, upload_results
+from .types import (
+    ConfigureArgs,
+    ConfigKwargs,
+    SyncArgs,
+    StatusArgs,
+    DatasetInfo,
+    EvaluationMetadata,
+)
 
 
 def create_parser() -> argparse.ArgumentParser:
@@ -136,9 +150,6 @@ async def run_evaluation(
         sys.exit(1)
 
     # Create benchmark result
-    from .types import EvaluationMetadata
-    from typing import cast
-
     benchmark_result = BenchmarkResult(
         benchmark_name=f"cli_evaluation_{dataset.name}",
         metadata=cast(
@@ -163,8 +174,6 @@ async def run_evaluation(
 
             # Check for API key requirements for cloud models
             if model_name.startswith(("gpt-", "claude-", "gemini-")):
-                import os
-
                 api_key_map = {
                     "gpt-": "OPENAI_API_KEY",
                     "claude-": "ANTHROPIC_API_KEY",
@@ -276,8 +285,6 @@ async def run_evaluation(
 
         if should_upload and benchmark_result.results:
             try:
-                from .client import upload_results
-
                 # Extract dataset_info from dataset metadata for upload_results
                 # upload_results expects DatasetInfo
                 dataset_info_for_upload: DatasetInfo = cast(
@@ -311,8 +318,6 @@ async def run_evaluation(
 
 async def configure_api(args: ConfigureArgs) -> None:
     """Configure Benchwise API settings."""
-    from .config import reset_config
-
     if args.reset:
         reset_config()
         print("✓ Configuration reset to defaults")
@@ -506,8 +511,6 @@ async def compare_results(
     result_paths: List[str], metric: Optional[str] = None
 ) -> None:
     """Compare evaluation results."""
-    from .results import load_results, ResultsAnalyzer
-
     try:
         # Load all results
         benchmark_results = []
diff --git a/benchwise/client.py b/benchwise/client.py
index 6adc153..29f89e1 100644
--- a/benchwise/client.py
+++ b/benchwise/client.py
@@ -2,6 +2,7 @@
 import asyncio
 import uuid
 import logging
+import os
 import types
 from typing import Dict, Any, Optional, List, Type, cast
 from datetime import datetime
@@ -830,8 +831,6 @@ async def sync_offline_queue(self) -> int:
 
                 if data_type == "full_benchmark_result":
                     # Reconstruct BenchmarkResult and upload
-                    from .results import BenchmarkResult
-
                     benchmark_result_dict: Dict[str, Any] = queue_data.get(
                         "benchmark_result", {}
                     )
@@ -886,8 +885,6 @@ async def upload_dataset_for_benchmark(
         Returns:
             Dataset URL
         """
-        import os
-
         logger.info(f"Uploading dataset for benchmark {benchmark_id}")
         try:
             with open(dataset_path, "rb") as f:
@@ -1014,7 +1011,6 @@ async def upload_results(
             logger.warning(
                 "Benchwise API not available, results will be cached offline"
             )
-            from .results import BenchmarkResult
 
             benchmark_result = BenchmarkResult(
                 benchmark_name=test_name,
@@ -1035,7 +1031,6 @@ async def upload_results(
         # Check authentication
         if not client.jwt_token:
             logger.warning("Not authenticated - results will be cached offline")
-            from .results import BenchmarkResult
 
             benchmark_result = BenchmarkResult(
                 benchmark_name=test_name,
@@ -1054,8 +1049,6 @@ async def upload_results(
             return False
 
         # Create benchmark result and upload
-        from .results import BenchmarkResult
-
         benchmark_result = BenchmarkResult(
             benchmark_name=test_name,
             results=results,
diff --git a/benchwise/config.py b/benchwise/config.py
index 7f673ad..f08cc12 100644
--- a/benchwise/config.py
+++ b/benchwise/config.py
@@ -10,6 +10,8 @@
 from typing import Optional, Dict, Any, List
 from dataclasses import dataclass, field
 import json
+import asyncio
+import httpx
 
 from benchwise.types import ConfigDict
 
@@ -408,8 +410,6 @@ def validate_api_connection(config: BenchwiseConfig) -> bool:
         True if connection is valid
     """
     try:
-        import asyncio
-        import httpx
 
         async def check_connection() -> bool:
             async with httpx.AsyncClient(timeout=5.0) as client:
@@ -434,8 +434,6 @@ def validate_api_keys(config: BenchwiseConfig) -> Dict[str, bool]:
     Returns:
         Dict mapping provider to validity status
     """
-    import os
-
     results = {}
 
     if os.getenv("OPENAI_API_KEY"):
diff --git a/benchwise/datasets.py b/benchwise/datasets.py
index 2a0681e..2a9587b 100644
--- a/benchwise/datasets.py
+++ b/benchwise/datasets.py
@@ -5,6 +5,7 @@
 import requests
 from dataclasses import dataclass
 import hashlib
+import random
 
 from .types import (
     DatasetItem,
@@ -191,8 +192,6 @@ def filter(self, condition: Callable[[DatasetItem], bool]) -> "Dataset":
 
     def sample(self, n: int, random_state: Optional[int] = None) -> "Dataset":
         """Sample n items from dataset."""
-        import random
-
         if random_state:
             random.seed(random_state)
 
@@ -212,8 +211,6 @@ def split(
         self, train_ratio: float = 0.8, random_state: Optional[int] = None
     ) -> tuple["Dataset", "Dataset"]:
         """Split dataset into train and test sets."""
-        import random
-
         if random_state:
             random.seed(random_state)
 
diff --git a/benchwise/metrics.py b/benchwise/metrics.py
index aac4f1c..ebe385d 100644
--- a/benchwise/metrics.py
+++ b/benchwise/metrics.py
@@ -14,7 +14,7 @@
 from rouge_score import rouge_scorer
 from sacrebleu import BLEU
 import bert_score
-from nltk.translate.bleu_score import sentence_bleu
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
 import nltk
 import re
 import string
@@ -304,8 +304,6 @@ def bleu_score(
 
 def _get_smoothing_function(smooth_method: str) -> Optional[Callable[..., Any]]:
     """Get NLTK smoothing function based on method name."""
-    from nltk.translate.bleu_score import SmoothingFunction
-
     smoothing = SmoothingFunction()
 
     if smooth_method == "exp":