From f3f3eaf9e6e6d4a674434fefa30cf98dfc1ed1ed Mon Sep 17 00:00:00 2001 From: cemde Date: Thu, 25 Dec 2025 13:37:03 +0100 Subject: [PATCH 1/6] implemented model chat --- CHANGELOG.md | 11 + maseval/__init__.py | 6 +- maseval/core/model.py | 327 +++++++++++++-- maseval/interface/inference/__init__.py | 28 +- maseval/interface/inference/anthropic.py | 379 ++++++++++++++++++ maseval/interface/inference/google_genai.py | 316 +++++++++++++-- maseval/interface/inference/huggingface.py | 341 ++++++++++++++-- maseval/interface/inference/litellm.py | 188 ++++++--- maseval/interface/inference/openai.py | 293 +++++++++++--- pyproject.toml | 3 +- tests/conftest.py | 62 ++- tests/test_core/test_model_adapter.py | 85 +++- .../test_model_adapters.py | 146 +++++-- uv.lock | 38 +- 14 files changed, 1934 insertions(+), 289 deletions(-) create mode 100644 maseval/interface/inference/anthropic.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3850a69..7a28d0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +**ModelAdapter Chat Interface** + +- Added `chat()` method to `ModelAdapter` as the primary interface for LLM inference, accepting a list of messages in OpenAI format and returning a `ChatResponse` object and accepting tools +- Added `ChatResponse` dataclass containing `content`, `tool_calls`, `role`, `usage`, `model`, and `stop_reason` fields for structured response handling + +**AnthropicModelAdapter** + +- New `AnthropicModelAdapter` for direct integration with Anthropic Claude models via the official Anthropic SDK +- Handles Anthropic-specific message format conversion (system messages, tool_use/tool_result blocks) internally while accepting OpenAI-compatible input +- Added `anthropic` optional dependency: `pip install maseval[anthropic]` + ### Changed ### Fixed diff --git a/maseval/__init__.py b/maseval/__init__.py index 11ea20d..e74feda 100644 --- a/maseval/__init__.py +++ b/maseval/__init__.py @@ -22,7 +22,7 @@ ToolSimulatorError, UserSimulatorError, ) -from .core.model import ModelAdapter +from .core.model import ModelAdapter, ChatResponse from .core.user import User, TerminationReason from .core.evaluator import Evaluator from .core.history import MessageHistory, ToolInvocationHistory @@ -67,8 +67,10 @@ # History and tracing "MessageHistory", "ToolInvocationHistory", - "ModelAdapter", "TraceableMixin", + # Model adapters + "ModelAdapter", + "ChatResponse", # Exceptions and validation "MASEvalError", "AgentError", diff --git a/maseval/core/model.py b/maseval/core/model.py index afd7c11..69dfa83 100644 --- a/maseval/core/model.py +++ b/maseval/core/model.py @@ -1,78 +1,280 @@ -"""Core model adapter abstractions. +"""Core model adapter abstractions for LLM inference. + +This module provides the base `ModelAdapter` class that all model adapters must +implement. It defines a consistent interface for interacting with LLMs across +different providers (OpenAI, Anthropic, Google, HuggingFace, LiteLLM, etc.). 
Concrete implementations for specific inference providers are in: maseval.interface.inference + +Example: + ```python + from maseval.interface.inference import LiteLLMModelAdapter + + # Create adapter + model = LiteLLMModelAdapter(model_id="gpt-4") + + # Simple text generation + response = model.generate("What is 2+2?") + print(response) # "4" + + # Chat with messages + response = model.chat([ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is 2+2?"} + ]) + print(response.content) # "4" + + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather in Paris?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"] + } + } + }] + ) + if response.tool_calls: + print(response.tool_calls[0]["function"]["name"]) # "get_weather" + ``` """ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Any, Optional, Dict +from dataclasses import dataclass +from typing import Any, Optional, Dict, List, Union from datetime import datetime import time from .tracing import TraceableMixin from .config import ConfigurableMixin +from .history import MessageHistory + + +@dataclass +class ChatResponse: + """Response from a chat completion. + + When the model generates a response, it returns either text content, + tool calls, or both. Use this class to access the response data. + + Attributes: + content: The text content of the response. May be None if the model + only returned tool calls. + tool_calls: List of tool calls the model wants to execute. Each tool + call is a dict with 'id', 'type', and 'function' keys. The + 'function' contains 'name' and 'arguments' (JSON string). + None if no tools were called. + role: The role of the response message. Always "assistant". + usage: Token usage statistics if available. Dict with keys like + 'input_tokens', 'output_tokens', 'total_tokens'. + model: The model ID that generated this response, if available. + stop_reason: Why the model stopped generating. Common values: + 'end_turn', 'tool_use', 'max_tokens', 'stop_sequence'. + + Example: + ```python + response = model.chat([{"role": "user", "content": "Hello"}]) + + # Text response + if response.content: + print(response.content) + + # Tool call response + if response.tool_calls: + for call in response.tool_calls: + name = call["function"]["name"] + args = json.loads(call["function"]["arguments"]) + result = execute_tool(name, args) + ``` + """ + + content: Optional[str] = None + tool_calls: Optional[List[Dict[str, Any]]] = None + role: str = "assistant" + usage: Optional[Dict[str, int]] = None + model: Optional[str] = None + stop_reason: Optional[str] = None + + def to_message(self) -> Dict[str, Any]: + """Convert this response to an OpenAI-compatible message dict. + + Use this to append the assistant's response to your message history + before continuing the conversation. + + Returns: + Dict with 'role', 'content', and optionally 'tool_calls'. 
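+            Tool call entries are passed through unchanged, keeping the OpenAI
+            structure ('id', 'type', 'function').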
+ + Example: + ```python + messages = [{"role": "user", "content": "Hello"}] + response = model.chat(messages) + + # Add assistant response to history + messages.append(response.to_message()) + + # Continue conversation + messages.append({"role": "user", "content": "Tell me more"}) + response = model.chat(messages) + ``` + """ + msg: Dict[str, Any] = {"role": self.role} + if self.content is not None: + msg["content"] = self.content + if self.tool_calls: + msg["tool_calls"] = self.tool_calls + return msg class ModelAdapter(ABC, TraceableMixin, ConfigurableMixin): """Abstract base class for model adapters. - Concrete implementations must provide a `generate` method that accepts a - prompt string and returns the model's text output. They should also expose - a `model_id` property identifying the underlying model. + ModelAdapter provides a consistent interface for LLM inference across + different providers. All adapters implement the same methods, so you + can swap providers without changing your code. - This class automatically tracks all generation calls for tracing and evaluation. + To use a model adapter: + 1. Create an instance with provider-specific configuration + 2. Call `chat()` for message-based conversations + 3. Call `generate()` for simple text-in/text-out + + The adapter automatically tracks all calls for tracing and evaluation. + + Implementing a custom adapter: + Subclass ModelAdapter and implement: + - `model_id` property: Return the model identifier string + - `_chat_impl()`: The actual chat completion logic See maseval.interface.inference for concrete implementations: - - GoogleGenAIModelAdapter - - OpenAIModelAdapter - - HuggingFaceModelAdapter + - AnthropicModelAdapter + - GoogleGenAIModelAdapter + - HuggingFaceModelAdapter + - LiteLLMModelAdapter + - OpenAIModelAdapter """ def __init__(self): """Initialize the model adapter with call tracing.""" super().__init__() - self.logs: list[dict[str, Any]] = [] + self.logs: List[Dict[str, Any]] = [] @property @abstractmethod def model_id(self) -> str: - """A string identifier for the underlying model.""" + """The identifier for the underlying model. + + Returns: + A string identifying the model (e.g., "gpt-4", "claude-sonnet-4-5", + "gemini-pro"). Used for tracing and configuration. + """ - def generate(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - """Generate text from the model with automatic tracing. + def chat( + self, + messages: Union[List[Dict[str, Any]], MessageHistory], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Send messages to the model and get a response. - This method wraps the actual generation logic to track timing, - parameters, and errors for later evaluation. + This is the primary method for interacting with the model. Pass a + conversation history and receive the model's response. Args: - prompt: The input prompt - generation_params: Optional generation parameters - **kwargs: Additional provider-specific arguments + messages: The conversation history. Either a list of message dicts + in OpenAI format, or a MessageHistory object. Each message + has 'role' ('system', 'user', 'assistant', 'tool') and + 'content' keys. + generation_params: Model parameters like temperature, max_tokens, + top_p, etc. Provider-specific parameters are also accepted. + tools: Tool definitions the model can use. 
Each tool is a dict + with 'type' (usually 'function') and 'function' containing + 'name', 'description', and 'parameters' (JSON Schema). + tool_choice: How the model should use tools: + - "auto": Model decides whether to use tools (default) + - "none": Model won't use tools + - "required": Model must use a tool + - {"type": "function", "function": {"name": "..."}}: Use specific tool + **kwargs: Additional provider-specific arguments. Returns: - The model output as a string + ChatResponse containing the model's response (text and/or tool calls). Raises: - Exception: Any exception from the underlying model is logged and re-raised + Exception: Provider-specific errors are logged and re-raised. + + Example: + ```python + # Simple conversation + response = model.chat([ + {"role": "user", "content": "Hello!"} + ]) + print(response.content) + + # With system prompt + response = model.chat([ + {"role": "system", "content": "You are a pirate."}, + {"role": "user", "content": "Hello!"} + ]) + + # With tools + response = model.chat( + messages=[{"role": "user", "content": "What's 2+2?"}], + tools=[{ + "type": "function", + "function": { + "name": "calculator", + "description": "Evaluate math expressions", + "parameters": { + "type": "object", + "properties": {"expression": {"type": "string"}}, + "required": ["expression"] + } + } + }] + ) + ``` """ start_time = time.time() timestamp = datetime.now().isoformat() + # Convert MessageHistory to list if needed + if isinstance(messages, MessageHistory): + messages_list = messages.to_openai_format() + else: + messages_list = messages + try: - result = self._generate_impl(prompt, generation_params, **kwargs) + result = self._chat_impl( + messages_list, + generation_params=generation_params, + tools=tools, + tool_choice=tool_choice, + **kwargs, + ) duration = time.time() - start_time self.logs.append( { "timestamp": timestamp, - "prompt_length": len(prompt), - "response_length": len(result) if result else 0, + "message_count": len(messages_list), + "response_type": "tool_call" if result.tool_calls else "text", + "response_length": len(result.content) if result.content else 0, + "tool_calls_count": len(result.tool_calls) if result.tool_calls else 0, "duration_seconds": duration, "status": "success", "generation_params": generation_params or {}, - "kwargs": {k: str(v) for k, v in kwargs.items()}, # Serialize for JSON + "tools_provided": len(tools) if tools else 0, + "kwargs": {k: str(v) for k, v in kwargs.items()}, } ) @@ -84,12 +286,13 @@ def generate(self, prompt: str, generation_params: Optional[Dict[str, Any]] = No self.logs.append( { "timestamp": timestamp, - "prompt_length": len(prompt), + "message_count": len(messages_list), "duration_seconds": duration, "status": "error", "error": str(e), "error_type": type(e).__name__, "generation_params": generation_params or {}, + "tools_provided": len(tools) if tools else 0, "kwargs": {k: str(v) for k, v in kwargs.items()}, } ) @@ -97,32 +300,79 @@ def generate(self, prompt: str, generation_params: Optional[Dict[str, Any]] = No raise @abstractmethod - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - """Internal generation implementation to be overridden by subclasses. 
+ def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Internal chat implementation to be overridden by subclasses. + + Implement this method to call your provider's API. The base class + handles tracing, timing, and error logging. + + Args: + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions, if any. + tool_choice: Tool choice setting, if any. + **kwargs: Additional provider-specific arguments. + + Returns: + ChatResponse with the model's output. + """ + + def generate( + self, + prompt: str, + generation_params: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> str: + """Generate text from a simple prompt. + + This is a convenience method that wraps the prompt in a user message + and calls `chat()`. Use this for simple text-in/text-out scenarios. + + For conversations or tool use, use `chat()` directly. Args: - prompt: The input prompt - generation_params: Optional generation parameters - **kwargs: Additional provider-specific arguments + prompt: The input prompt. + generation_params: Generation parameters (temperature, max_tokens, etc.). + **kwargs: Additional provider-specific arguments. Returns: - The model output as a string + The model's text response. + + Example: + ```python + response = model.generate("What is the capital of France?") + print(response) # "Paris" + ``` """ + messages = [{"role": "user", "content": prompt}] + response = self.chat(messages, generation_params=generation_params, **kwargs) + return response.content or "" - def gather_traces(self) -> dict[str, Any]: + def gather_traces(self) -> Dict[str, Any]: """Gather execution traces from this model adapter. + Called automatically by Benchmark to collect execution data for + evaluation. Returns comprehensive statistics about all calls made + to this adapter. + Returns: Dictionary containing: - type: Component class name - gathered_at: ISO timestamp - model_id: Model identifier - - total_calls: Number of generation calls + - total_calls: Number of chat/generate calls - successful_calls: Number of successful calls - failed_calls: Number of failed calls - - total_duration_seconds: Total time spent generating + - total_duration_seconds: Total time spent in calls - average_duration_seconds: Average time per call - - logs: List of all individual call records with timestamps, durations, and parameters + - logs: List of individual call records """ total_calls = len(self.logs) successful_calls = sum(1 for call in self.logs if call["status"] == "success") @@ -141,15 +391,18 @@ def gather_traces(self) -> dict[str, Any]: "logs": self.logs, } - def gather_config(self) -> dict[str, Any]: + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this model adapter. + Called automatically by Benchmark to collect configuration for + reproducibility. Returns identifying information about this adapter. 
+ Returns: Dictionary containing: - type: Component class name - gathered_at: ISO timestamp - model_id: Model identifier - - adapter_type: The specific adapter class (e.g., OpenAIModelAdapter) + - adapter_type: The specific adapter class name """ return { **super().gather_config(), diff --git a/maseval/interface/inference/__init__.py b/maseval/interface/inference/__init__.py index 72be8e3..e6765d1 100644 --- a/maseval/interface/inference/__init__.py +++ b/maseval/interface/inference/__init__.py @@ -2,10 +2,35 @@ This package contains concrete implementations of ModelAdapter for different inference providers. Each adapter requires the corresponding optional dependency. + +Available adapters: + - AnthropicModelAdapter: Anthropic Claude models (requires anthropic) + - GoogleGenAIModelAdapter: Google Gemini models (requires google-genai) + - HuggingFaceModelAdapter: HuggingFace transformers (requires transformers) + - LiteLLMModelAdapter: 100+ providers via LiteLLM (requires litellm) + - OpenAIModelAdapter: OpenAI and compatible APIs (requires openai) + +Example: + ```python + from maseval.interface.inference import LiteLLMModelAdapter + + # Use any supported provider + model = LiteLLMModelAdapter(model_id="gpt-4") + response = model.chat([{"role": "user", "content": "Hello!"}]) + print(response.content) + ``` """ __all__ = [] +# Conditionally import Anthropic adapter +try: + from .anthropic import AnthropicModelAdapter # noqa: F401 + + __all__.append("AnthropicModelAdapter") +except ImportError: + pass + # Conditionally import google-genai adapter try: from .google_genai import GoogleGenAIModelAdapter # noqa: F401 @@ -24,9 +49,10 @@ # Conditionally import HuggingFace adapter try: - from .huggingface import HuggingFaceModelAdapter # noqa: F401 + from .huggingface import HuggingFaceModelAdapter, ToolCallingNotSupportedError # noqa: F401 __all__.append("HuggingFaceModelAdapter") + __all__.append("ToolCallingNotSupportedError") except ImportError: pass diff --git a/maseval/interface/inference/anthropic.py b/maseval/interface/inference/anthropic.py new file mode 100644 index 0000000..0363d22 --- /dev/null +++ b/maseval/interface/inference/anthropic.py @@ -0,0 +1,379 @@ +"""Anthropic model adapter. + +This adapter works with the official Anthropic Python SDK for accessing +Claude models directly. + +Requires anthropic to be installed: + pip install maseval[anthropic] + +Example: + ```python + from anthropic import Anthropic + from maseval.interface.inference import AnthropicModelAdapter + + # Create client (uses ANTHROPIC_API_KEY env var) + client = Anthropic() + + # Create adapter + model = AnthropicModelAdapter( + client=client, + model_id="claude-sonnet-4-5-20250514" + ) + + # Simple generation + response = model.generate("Hello!") + + # Chat with messages + response = model.chat([ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"} + ]) + + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather in Paris?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"] + } + } + }] + ) + ``` +""" + +import json +from typing import Any, Optional, Dict, List, Union + +from maseval.core.model import ModelAdapter, ChatResponse + + +class AnthropicModelAdapter(ModelAdapter): + """Adapter for Anthropic Claude models. 
+ + Works with Claude models through the official Anthropic Python SDK. + + Supported models include: + - claude-sonnet-4-5-20250514 (Claude Sonnet 4.5) + - claude-opus-4-5-20251101 (Claude Opus 4.5) + - claude-3-5-sonnet-20241022 + - claude-3-opus-20240229 + - And other Claude model variants + + The adapter accepts OpenAI-style messages and converts them to Anthropic's + format internally. Key differences handled automatically: + - System messages are passed separately (not in messages array) + - Tool definitions are converted to Anthropic format + - Tool responses are converted to tool_result content blocks + + API keys can be set via ANTHROPIC_API_KEY environment variable or + passed to the Anthropic client directly. + """ + + def __init__( + self, + client: Any, + model_id: str, + default_generation_params: Optional[Dict[str, Any]] = None, + max_tokens: int = 4096, + ): + """Initialize Anthropic model adapter. + + Args: + client: An anthropic.Anthropic client instance. + model_id: The model identifier (e.g., "claude-sonnet-4-5-20250514"). + default_generation_params: Default parameters for all calls. + Common parameters: temperature, top_p, top_k. + max_tokens: Maximum tokens to generate. Anthropic requires this + parameter. Default is 4096. + """ + super().__init__() + self._client = client + self._model_id = model_id + self._default_generation_params = default_generation_params or {} + self._max_tokens = max_tokens + + @property + def model_id(self) -> str: + return self._model_id + + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Call Anthropic Messages API. + + Args: + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions for function calling (OpenAI format). + tool_choice: Tool choice setting. + **kwargs: Additional Anthropic parameters. + + Returns: + ChatResponse with the model's output. + """ + # Merge parameters + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + # Extract and set max_tokens + max_tokens = params.pop("max_tokens", self._max_tokens) + + # Convert messages (extract system, convert tool responses) + system_prompt, converted_messages = self._convert_messages(messages) + + # Convert tools to Anthropic format + anthropic_tools = None + if tools: + anthropic_tools = self._convert_tools(tools) + + # Handle tool_choice + anthropic_tool_choice = None + if tool_choice is not None: + anthropic_tool_choice = self._convert_tool_choice(tool_choice) + + # Build request + request_params = { + "model": self._model_id, + "max_tokens": max_tokens, + "messages": converted_messages, + **params, + } + + if system_prompt: + request_params["system"] = system_prompt + + if anthropic_tools: + request_params["tools"] = anthropic_tools + + if anthropic_tool_choice: + request_params["tool_choice"] = anthropic_tool_choice + + # Call API + response = self._client.messages.create(**request_params) + + return self._parse_response(response) + + def _convert_messages( + self, messages: List[Dict[str, Any]] + ) -> tuple[Optional[str], List[Dict[str, Any]]]: + """Convert OpenAI messages to Anthropic format. + + Anthropic separates system messages and uses different format for + tool responses. 
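+
+        For example, an OpenAI-style tool result message
+        {"role": "tool", "tool_call_id": "abc", "content": "20 degrees"} becomes a
+        user message whose content is a single tool_result block with
+        tool_use_id "abc" (the id and content values here are illustrative).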
+ + Args: + messages: OpenAI-format messages. + + Returns: + Tuple of (system_prompt, converted_messages). + """ + system_prompt = None + converted = [] + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + + if role == "system": + # Anthropic takes system as separate parameter + system_prompt = content + + elif role == "tool": + # Convert to Anthropic tool_result format + # Tool results in Anthropic are user messages with tool_result content + tool_call_id = msg.get("tool_call_id", "") + converted.append( + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": tool_call_id, + "content": content, + } + ], + } + ) + + elif role == "assistant": + # Check if this message has tool_calls (from previous response) + if "tool_calls" in msg and msg["tool_calls"]: + # Convert to Anthropic format with tool_use content blocks + content_blocks = [] + + # Add text content if present + if msg.get("content"): + content_blocks.append({"type": "text", "text": msg["content"]}) + + # Add tool use blocks + for tc in msg["tool_calls"]: + func = tc.get("function", {}) + args = func.get("arguments", "{}") + if isinstance(args, str): + try: + args = json.loads(args) + except json.JSONDecodeError: + args = {} + + content_blocks.append( + { + "type": "tool_use", + "id": tc.get("id", ""), + "name": func.get("name", ""), + "input": args, + } + ) + + converted.append({"role": "assistant", "content": content_blocks}) + else: + # Simple text message + converted.append({"role": "assistant", "content": content}) + + else: + # User message + converted.append({"role": "user", "content": content}) + + return system_prompt, converted + + def _convert_tools(self, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Convert OpenAI tool format to Anthropic format. + + Args: + tools: OpenAI-format tool definitions. + + Returns: + Anthropic-format tool definitions. + """ + anthropic_tools = [] + + for tool in tools: + if tool.get("type") == "function": + func = tool.get("function", {}) + anthropic_tools.append( + { + "name": func.get("name", ""), + "description": func.get("description", ""), + "input_schema": func.get("parameters", {"type": "object", "properties": {}}), + } + ) + + return anthropic_tools + + def _convert_tool_choice( + self, tool_choice: Union[str, Dict[str, Any]] + ) -> Dict[str, Any]: + """Convert OpenAI tool_choice to Anthropic format. + + Args: + tool_choice: OpenAI-format tool choice. + + Returns: + Anthropic-format tool choice. + """ + if tool_choice == "auto": + return {"type": "auto"} + elif tool_choice == "none": + # Anthropic doesn't have a direct "none" - we just don't pass tools + return {"type": "auto"} + elif tool_choice == "required": + return {"type": "any"} + elif isinstance(tool_choice, dict) and "function" in tool_choice: + return {"type": "tool", "name": tool_choice["function"]["name"]} + else: + return {"type": "auto"} + + def _parse_response(self, response: Any) -> ChatResponse: + """Parse Anthropic response into ChatResponse. + + Args: + response: The raw response from Anthropic. + + Returns: + ChatResponse with extracted data. 
+ """ + # Extract content (may be text and/or tool_use blocks) + content = None + tool_calls = None + + if hasattr(response, "content") and response.content: + text_parts = [] + tool_use_parts = [] + + for block in response.content: + if hasattr(block, "type"): + if block.type == "text": + text_parts.append(block.text) + elif block.type == "tool_use": + tool_use_parts.append( + { + "id": block.id, + "type": "function", + "function": { + "name": block.name, + "arguments": json.dumps(block.input), + }, + } + ) + + if text_parts: + content = "".join(text_parts) + + if tool_use_parts: + tool_calls = tool_use_parts + + # Extract usage + usage = None + if hasattr(response, "usage") and response.usage: + usage = { + "input_tokens": getattr(response.usage, "input_tokens", 0), + "output_tokens": getattr(response.usage, "output_tokens", 0), + "total_tokens": ( + getattr(response.usage, "input_tokens", 0) + + getattr(response.usage, "output_tokens", 0) + ), + } + + # Extract stop reason + stop_reason = None + if hasattr(response, "stop_reason"): + stop_reason = response.stop_reason + + return ChatResponse( + content=content, + tool_calls=tool_calls, + role="assistant", + usage=usage, + model=getattr(response, "model", self._model_id), + stop_reason=stop_reason, + ) + + def gather_config(self) -> Dict[str, Any]: + """Gather configuration from this Anthropic model adapter. + + Returns: + Dictionary containing model configuration. + """ + base_config = super().gather_config() + base_config.update( + { + "default_generation_params": self._default_generation_params, + "max_tokens": self._max_tokens, + "client_type": type(self._client).__name__, + } + ) + + return base_config diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py index d30989f..fe71ba7 100644 --- a/maseval/interface/inference/google_genai.py +++ b/maseval/interface/inference/google_genai.py @@ -1,23 +1,67 @@ """Google Generative AI model adapter. +This adapter works with Google's Generative AI SDK (google-genai) for accessing +Gemini models. + Requires google-genai to be installed: pip install maseval[google-genai] + +Example: + ```python + from google import genai + from maseval.interface.inference import GoogleGenAIModelAdapter + + # Create client + client = genai.Client(api_key="your-api-key") + # Or set GOOGLE_API_KEY environment variable + + # Create adapter + model = GoogleGenAIModelAdapter( + client=client, + model_id="gemini-2.0-flash" + ) + + # Simple generation + response = model.generate("Hello!") + + # Chat with messages + response = model.chat([ + {"role": "user", "content": "Hello!"} + ]) + + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": {...} + } + }] + ) + ``` """ -from typing import Any, Optional, Dict -import json +from typing import Any, Optional, Dict, List, Union -from maseval.core.model import ModelAdapter +from maseval.core.model import ModelAdapter, ChatResponse class GoogleGenAIModelAdapter(ModelAdapter): - """Adapter for Google Generative AI. + """Adapter for Google Generative AI (Gemini models). + + Works with Google's Gemini models through the google-genai SDK. - The `client` may be a callable that accepts the prompt and returns a dict-like - response, or a client object with a `generate` method. The adapter will try - to normalize the response to a text string. 
+ Supported models include: + - gemini-2.0-flash + - gemini-1.5-pro + - gemini-1.5-flash + - And other Gemini model variants - Requires google-genai to be installed. + The adapter converts OpenAI-style messages to Google's format internally, + so you can use the same message format across all adapters. """ def __init__( @@ -26,6 +70,14 @@ def __init__( model_id: str, default_generation_params: Optional[Dict[str, Any]] = None, ): + """Initialize Google GenAI model adapter. + + Args: + client: A google.genai.Client instance. + model_id: The model identifier (e.g., "gemini-2.0-flash"). + default_generation_params: Default parameters for all calls. + Common parameters: temperature, max_output_tokens, top_p. + """ super().__init__() self._client = client self._model_id = model_id @@ -35,47 +87,231 @@ def __init__( def model_id(self) -> str: return self._model_id - def _extract_text(self, response: Any) -> str: - # Normalize a few common shapes - if isinstance(response, str): - return response - if isinstance(response, dict): - # google generative responses often have `candidates` or `output` fields - if "candidates" in response and response["candidates"]: - return response["candidates"][0].get("content", "") - if "output" in response and isinstance(response["output"], list) and response["output"]: - # some implementations return a list of text chunks - first = response["output"][0] - if isinstance(first, dict): - return first.get("content", "") - return str(first) - # fallback to stringifying - return json.dumps(response) - return str(response) - - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - from google import genai # Lazy import + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Call Google GenAI API. + + Args: + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions for function calling. + tool_choice: Tool choice setting. + **kwargs: Additional parameters. + Returns: + ChatResponse with the model's output. 
+ """ + from google import genai + + # Merge parameters params = dict(self._default_generation_params) if generation_params: params.update(generation_params) - generation_config = genai.types.GenerateContentConfig(**params) if params else None + params.update(kwargs) + + # Convert messages to Google format + system_instruction, contents = self._convert_messages(messages) + + # Build config + config_params = {} + if system_instruction: + config_params["system_instruction"] = system_instruction + + # Map common parameter names + if "max_tokens" in params: + config_params["max_output_tokens"] = params.pop("max_tokens") + if "max_output_tokens" in params: + config_params["max_output_tokens"] = params.pop("max_output_tokens") + if "temperature" in params: + config_params["temperature"] = params.pop("temperature") + if "top_p" in params: + config_params["top_p"] = params.pop("top_p") + if "top_k" in params: + config_params["top_k"] = params.pop("top_k") + if "stop_sequences" in params: + config_params["stop_sequences"] = params.pop("stop_sequences") + + # Convert tools to Google format + if tools: + config_params["tools"] = self._convert_tools(tools) + + # Handle tool_choice + if tool_choice is not None: + if tool_choice == "none": + config_params["tool_config"] = {"function_calling_config": {"mode": "NONE"}} + elif tool_choice == "auto": + config_params["tool_config"] = {"function_calling_config": {"mode": "AUTO"}} + elif tool_choice == "required": + config_params["tool_config"] = {"function_calling_config": {"mode": "ANY"}} + elif isinstance(tool_choice, dict) and "function" in tool_choice: + config_params["tool_config"] = { + "function_calling_config": { + "mode": "ANY", + "allowed_function_names": [tool_choice["function"]["name"]], + } + } + + # Build generation config + generation_config = genai.types.GenerateContentConfig(**config_params) if config_params else None + + # Call API + response = self._client.models.generate_content( + model=self._model_id, contents=contents, config=generation_config + ) + + return self._parse_response(response) + + def _convert_messages( + self, messages: List[Dict[str, Any]] + ) -> tuple[Optional[str], List[Dict[str, Any]]]: + """Convert OpenAI messages to Google format. - # Call client - response = self._client.models.generate_content(model=self.model_id, contents=prompt, config=generation_config) - return response.text + Google uses 'contents' with 'parts', and separates system instructions. + Roles are 'user' and 'model' (not 'assistant'). + + Args: + messages: OpenAI-format messages. + + Returns: + Tuple of (system_instruction, contents). + """ + system_instruction = None + contents = [] + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + + if role == "system": + system_instruction = content + elif role == "assistant": + contents.append({"role": "model", "parts": [{"text": content}]}) + elif role == "tool": + # Tool response in Google format + tool_call_id = msg.get("tool_call_id", "") + contents.append( + { + "role": "function", + "parts": [ + { + "function_response": { + "name": msg.get("name", tool_call_id), + "response": {"result": content}, + } + } + ], + } + ) + else: + # User message + contents.append({"role": "user", "parts": [{"text": content}]}) + + return system_instruction, contents + + def _convert_tools(self, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Convert OpenAI tool format to Google format. + + Args: + tools: OpenAI-format tool definitions. 
+ + Returns: + Google-format tool definitions. + """ + google_tools = [] + + for tool in tools: + if tool.get("type") == "function": + func = tool.get("function", {}) + google_tools.append( + { + "function_declarations": [ + { + "name": func.get("name", ""), + "description": func.get("description", ""), + "parameters": func.get("parameters", {}), + } + ] + } + ) + + return google_tools + + def _parse_response(self, response: Any) -> ChatResponse: + """Parse Google GenAI response into ChatResponse. + + Args: + response: The raw response from Google. + + Returns: + ChatResponse with extracted data. + """ + # Extract text content + content = None + if hasattr(response, "text"): + content = response.text + + # Extract tool calls (function calls in Google terminology) + tool_calls = None + if hasattr(response, "candidates") and response.candidates: + candidate = response.candidates[0] + if hasattr(candidate, "content") and candidate.content: + for part in candidate.content.parts: + if hasattr(part, "function_call") and part.function_call: + if tool_calls is None: + tool_calls = [] + fc = part.function_call + # Convert args to JSON string + import json + + args = dict(fc.args) if fc.args else {} + tool_calls.append( + { + "id": f"call_{fc.name}", + "type": "function", + "function": { + "name": fc.name, + "arguments": json.dumps(args), + }, + } + ) + + # Extract usage + usage = None + if hasattr(response, "usage_metadata") and response.usage_metadata: + um = response.usage_metadata + usage = { + "input_tokens": getattr(um, "prompt_token_count", 0), + "output_tokens": getattr(um, "candidates_token_count", 0), + "total_tokens": getattr(um, "total_token_count", 0), + } + + # Extract stop reason + stop_reason = None + if hasattr(response, "candidates") and response.candidates: + candidate = response.candidates[0] + if hasattr(candidate, "finish_reason"): + stop_reason = str(candidate.finish_reason) + + return ChatResponse( + content=content, + tool_calls=tool_calls, + role="assistant", + usage=usage, + model=self._model_id, + stop_reason=stop_reason, + ) - def gather_config(self) -> dict[str, Any]: + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this Google GenAI model adapter. Returns: - Dictionary containing: - - type: Component class name - - gathered_at: ISO timestamp - - model_id: Model identifier - - adapter_type: GoogleGenAIModelAdapter - - default_generation_params: Default parameters used for generation (temperature, top_p, etc.) - - client_type: Type name of the underlying client + Dictionary containing model configuration. """ base_config = super().gather_config() base_config.update( diff --git a/maseval/interface/inference/huggingface.py b/maseval/interface/inference/huggingface.py index 3ef0751..7f7e541 100644 --- a/maseval/interface/inference/huggingface.py +++ b/maseval/interface/inference/huggingface.py @@ -1,21 +1,62 @@ """HuggingFace model adapter. +This adapter works with HuggingFace transformers pipelines and models. +It supports both simple callable models and full pipeline objects. 
+ Requires transformers to be installed: pip install maseval[transformers] + +Example: + ```python + from transformers import pipeline + from maseval.interface.inference import HuggingFaceModelAdapter + + # Using a pipeline + pipe = pipeline("text-generation", model="meta-llama/Llama-3.1-8B-Instruct") + model = HuggingFaceModelAdapter(model=pipe, model_id="llama-3.1-8b") + + # Simple generation + response = model.generate("Hello!") + + # Chat with messages (uses chat template if available) + response = model.chat([ + {"role": "user", "content": "Hello!"} + ]) + ``` + +Note on tool calling: + HuggingFace models have varying support for tool calling. This adapter + will raise an exception if tools are passed but the model's chat template + does not support them. Use LiteLLMModelAdapter for more reliable tool + calling with a wider range of models. """ -from typing import Any, Callable, Optional, Dict +from typing import Any, Optional, Dict, List, Callable, Union + +from maseval.core.model import ModelAdapter, ChatResponse + + +class ToolCallingNotSupportedError(Exception): + """Raised when tool calling is requested but not supported by the model.""" -from maseval.core.model import ModelAdapter + pass class HuggingFaceModelAdapter(ModelAdapter): - """Adapter for HuggingFace-style generation. + """Adapter for HuggingFace transformers models and pipelines. - This adapter accepts either a `callable` that takes `prompt` and returns - text, or a thin `pipeline`-like object with a `__call__`. + Works with: + - transformers.pipeline() objects + - Any callable that accepts a prompt and returns text - Requires transformers to be installed. + For chat functionality, the adapter uses the tokenizer's chat template + if available. This provides proper formatting for instruction-tuned models. + + Tool calling support: + Tool calling is only supported if the model's chat template explicitly + supports it. If you pass tools and the model doesn't support them, + a ToolCallingNotSupportedError is raised. For reliable tool calling, + consider using LiteLLMModelAdapter instead. """ def __init__( @@ -24,6 +65,17 @@ def __init__( model_id: Optional[str] = None, default_generation_params: Optional[Dict[str, Any]] = None, ): + """Initialize HuggingFace model adapter. + + Args: + model: A callable that generates text. Can be: + - A transformers pipeline (e.g., pipeline("text-generation", ...)) + - Any callable that takes a prompt string and returns text + model_id: Identifier for the model. If not provided, attempts to + extract from the model's name_or_path attribute. + default_generation_params: Default parameters for all calls. + Common parameters: max_new_tokens, temperature, top_p, do_sample. + """ super().__init__() self._model = model self._model_id = model_id or getattr(model, "name_or_path", "huggingface:unknown") @@ -33,34 +85,271 @@ def __init__( def model_id(self) -> str: return self._model_id - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - # Merge default params and call-time params; forward to underlying callable + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Generate response using HuggingFace model. + + Args: + messages: List of message dicts in OpenAI format. 
+ generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions. Raises ToolCallingNotSupportedError if + provided but not supported by the model's chat template. + tool_choice: Tool choice setting (ignored if tools not supported). + **kwargs: Additional parameters passed to the model. + + Returns: + ChatResponse with the model's output. + + Raises: + ToolCallingNotSupportedError: If tools are provided but the model + doesn't support tool calling. + """ + # Merge parameters params = dict(self._default_generation_params) if generation_params: params.update(generation_params) - # allow explicit kwargs to override params.update(kwargs) + + # Try to use chat template if available + tokenizer = self._get_tokenizer() + + if tokenizer is not None and hasattr(tokenizer, "apply_chat_template"): + return self._chat_with_template(messages, params, tools, tool_choice, tokenizer) + else: + # Fallback: convert messages to simple prompt + if tools: + raise ToolCallingNotSupportedError( + f"Model {self._model_id} does not have a chat template that supports tools. " + "Tool calling requires a model with an appropriate chat template. " + "Consider using LiteLLMModelAdapter for reliable tool calling." + ) + return self._chat_without_template(messages, params) + + def _get_tokenizer(self) -> Any: + """Get the tokenizer from the model/pipeline if available. + + Returns: + The tokenizer, or None if not available. + """ + # Pipeline objects have a tokenizer attribute + if hasattr(self._model, "tokenizer"): + return self._model.tokenizer + + # Some models expose the tokenizer directly + if hasattr(self._model, "model") and hasattr(self._model.model, "tokenizer"): + return self._model.model.tokenizer + + return None + + def _chat_with_template( + self, + messages: List[Dict[str, Any]], + params: Dict[str, Any], + tools: Optional[List[Dict[str, Any]]], + tool_choice: Optional[Union[str, Dict[str, Any]]], + tokenizer: Any, + ) -> ChatResponse: + """Generate using the tokenizer's chat template. + + Args: + messages: Messages to send. + params: Generation parameters. + tools: Tool definitions. + tool_choice: Tool choice setting. + tokenizer: The tokenizer with chat template. + + Returns: + ChatResponse with the model's output. + """ + # Check if tools are requested but not supported + if tools: + # Try to apply template with tools to check support + try: + # The template should accept tools parameter if it supports them + prompt = tokenizer.apply_chat_template( + messages, tools=tools, add_generation_prompt=True, tokenize=False + ) + except TypeError: + # Template doesn't accept tools parameter + raise ToolCallingNotSupportedError( + f"Model {self._model_id} chat template does not support tools. " + "The apply_chat_template() method does not accept a 'tools' parameter. " + "Consider using LiteLLMModelAdapter for reliable tool calling." 
+ ) + else: + prompt = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + + # Generate response + response_text = self._call_model(prompt, params) + + # Parse tool calls from response if tools were provided + tool_calls = None + content = response_text + + if tools: + # Attempt to parse tool calls from the response + # Different models format tool calls differently + tool_calls, content = self._parse_tool_calls(response_text) + + return ChatResponse( + content=content if content else None, + tool_calls=tool_calls, + role="assistant", + model=self._model_id, + ) + + def _chat_without_template( + self, messages: List[Dict[str, Any]], params: Dict[str, Any] + ) -> ChatResponse: + """Generate without a chat template (simple prompt concatenation). + + Args: + messages: Messages to convert to prompt. + params: Generation parameters. + + Returns: + ChatResponse with the model's output. + """ + # Simple conversion: concatenate messages + prompt_parts = [] + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + prompt_parts.append(f"{role}: {content}") + + prompt = "\n".join(prompt_parts) + "\nassistant:" + + response_text = self._call_model(prompt, params) + + return ChatResponse( + content=response_text, + role="assistant", + model=self._model_id, + ) + + def _call_model(self, prompt: str, params: Dict[str, Any]) -> str: + """Call the underlying model with a prompt. + + Args: + prompt: The formatted prompt. + params: Generation parameters. + + Returns: + The generated text. + """ try: - return self._model(prompt, **params) + result = self._model(prompt, **params) except TypeError: - # fall back to calling without kwargs - return self._model(prompt) + # Fallback: call without params + result = self._model(prompt) + + # Extract text from various response formats + if isinstance(result, str): + return result + elif isinstance(result, list) and len(result) > 0: + # Pipeline returns list of dicts + item = result[0] + if isinstance(item, dict): + # Text generation pipeline format + if "generated_text" in item: + generated = item["generated_text"] + # Remove the prompt from the response if it's included + if generated.startswith(prompt): + return generated[len(prompt) :].strip() + return generated + return str(item) + return str(item) + elif isinstance(result, dict): + if "generated_text" in result: + return result["generated_text"] + return str(result) + else: + return str(result) + + def _parse_tool_calls( + self, response: str + ) -> tuple[Optional[List[Dict[str, Any]]], Optional[str]]: + """Parse tool calls from model response. + + Different models format tool calls differently. This method attempts + to parse common formats. + + Args: + response: The raw model response. + + Returns: + Tuple of (tool_calls, remaining_content). + """ + import json + import re + + # Try to find JSON tool calls in the response + # Common patterns: ..., ```json...```, etc. 
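+        # e.g. <tool_call>{"name": ..., "arguments": ...}</tool_call> blocks or bare
+        # {"name": ..., "arguments": ...} JSON objects, handled below as Pattern 1
+        # and Pattern 2 respectively.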
+
+        tool_calls = []
+        remaining_content = response
+
+        # Pattern 1: <tool_call> tags (used by some models)
+        tool_call_pattern = r"<tool_call>(.*?)</tool_call>"
+        matches = re.findall(tool_call_pattern, response, re.DOTALL)
+
+        for match in matches:
+            try:
+                call_data = json.loads(match.strip())
+                tool_calls.append(
+                    {
+                        "id": f"call_{len(tool_calls)}",
+                        "type": "function",
+                        "function": {
+                            "name": call_data.get("name", ""),
+                            "arguments": json.dumps(call_data.get("arguments", {})),
+                        },
+                    }
+                )
+                remaining_content = remaining_content.replace(
+                    f"<tool_call>{match}</tool_call>", ""
+                )
+            except json.JSONDecodeError:
+                continue
+
+        # Pattern 2: Function call JSON blocks
+        function_pattern = r'\{"name":\s*"([^"]+)",\s*"arguments":\s*(\{[^}]+\})\}'
+        for match in re.finditer(function_pattern, response):
+            try:
+                name = match.group(1)
+                args = match.group(2)
+                # Validate JSON
+                json.loads(args)
+                tool_calls.append(
+                    {
+                        "id": f"call_{len(tool_calls)}",
+                        "type": "function",
+                        "function": {
+                            "name": name,
+                            "arguments": args,
+                        },
+                    }
+                )
+            except (json.JSONDecodeError, IndexError):
+                continue
+
+        remaining_content = remaining_content.strip()
+
+        return (tool_calls if tool_calls else None, remaining_content if remaining_content else None)
 
-    def gather_config(self) -> dict[str, Any]:
+    def gather_config(self) -> Dict[str, Any]:
         """Gather configuration from this HuggingFace model adapter.
 
         Returns:
-            Dictionary containing:
-            - type: Component class name
-            - gathered_at: ISO timestamp
-            - model_id: Model identifier
-            - adapter_type: HuggingFaceModelAdapter
-            - default_generation_params: Default parameters used for generation (temperature, top_p, max_length, etc.)
-            - callable_type: Type name of the underlying callable
-            - pipeline_config: Pipeline configuration affecting model behavior:
-              - task: Pipeline task type (e.g., text-generation, text-classification)
-              - device: Device (cpu, cuda, etc.)
-              - framework: Framework (pt for PyTorch, tf for TensorFlow)
+            Dictionary containing model configuration.
         """
         base_config = super().gather_config()
         base_config.update(
@@ -70,16 +359,14 @@ def gather_config(self) -> dict[str, Any]:
             }
         )
 
-        # Extract pipeline configuration that affects model behavior
+        # Extract pipeline configuration
         pipeline_config = {}
 
-        # Core pipeline attributes
         if hasattr(self._model, "task"):
             pipeline_config["task"] = self._model.task
 
         if hasattr(self._model, "device"):
             device = self._model.device
-            # Convert device to string representation
             pipeline_config["device"] = str(device) if device is not None else None
 
         if hasattr(self._model, "framework"):
diff --git a/maseval/interface/inference/litellm.py b/maseval/interface/inference/litellm.py
index 90825f7..a6d7259 100644
--- a/maseval/interface/inference/litellm.py
+++ b/maseval/interface/inference/litellm.py
@@ -1,44 +1,72 @@
 """LiteLLM model adapter.
 
-LiteLLM provides a unified interface for 100+ LLM APIs.
+LiteLLM provides a unified interface for 100+ LLM APIs using OpenAI-compatible
+syntax. This adapter wraps LiteLLM to provide consistent behavior within MASEval.
 
 Requires litellm to be installed:
     pip install maseval[litellm]
-"""
 
-from typing import Any, Optional, Dict
+Example:
+    ```python
+    from maseval.interface.inference import LiteLLMModelAdapter
 
-from maseval.core.model import ModelAdapter
+    # OpenAI models
+    model = LiteLLMModelAdapter(model_id="gpt-4")
 
+    # Anthropic models
+    model = LiteLLMModelAdapter(model_id="claude-3-opus-20240229")
 
-class LiteLLMModelAdapter(ModelAdapter):
-    """Adapter for LiteLLM unified interface.
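+    # API keys are read from environment variables by default
+    # (OPENAI_API_KEY, ANTHROPIC_API_KEY, ...); pass api_key to override.
+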
+ # Azure OpenAI + model = LiteLLMModelAdapter( + model_id="azure/gpt-4", + api_base="https://your-resource.openai.azure.com" + ) - LiteLLM provides a consistent API for calling multiple LLM providers - (OpenAI, Anthropic, Cohere, Azure, AWS Bedrock, etc.) using the same - interface. + # AWS Bedrock + model = LiteLLMModelAdapter(model_id="bedrock/anthropic.claude-v2") - Requires litellm to be installed. + # Simple generation + response = model.generate("Hello!") - Example: - ```python - from maseval.interface.inference import LiteLLMModelAdapter + # Chat with messages + response = model.chat([ + {"role": "user", "content": "Hello!"} + ]) - # OpenAI - model = LiteLLMModelAdapter(model_id="gpt-4") + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather?"}], + tools=[{"type": "function", "function": {...}}] + ) + ``` +""" - # Anthropic - model = LiteLLMModelAdapter(model_id="claude-3-opus-20240229") +from typing import Any, Optional, Dict, List, Union - # Azure OpenAI - model = LiteLLMModelAdapter( - model_id="azure/gpt-4", - default_generation_params={"api_base": "..."} - ) +from maseval.core.model import ModelAdapter, ChatResponse - # AWS Bedrock - model = LiteLLMModelAdapter(model_id="bedrock/anthropic.claude-v2") - ``` + +class LiteLLMModelAdapter(ModelAdapter): + """Adapter for LiteLLM unified interface. + + LiteLLM provides a consistent API for calling multiple LLM providers + (OpenAI, Anthropic, Cohere, Azure, AWS Bedrock, Google, etc.) using + OpenAI-compatible syntax. + + Supported providers include: + - OpenAI: "gpt-4", "gpt-3.5-turbo" + - Anthropic: "claude-3-opus-20240229", "claude-3-sonnet-20240229" + - Azure: "azure/gpt-4", "azure/gpt-35-turbo" + - AWS Bedrock: "bedrock/anthropic.claude-v2" + - Google: "gemini/gemini-pro" + - And many more (see https://docs.litellm.ai/docs/providers) + + API keys are read from environment variables by default: + - OPENAI_API_KEY for OpenAI + - ANTHROPIC_API_KEY for Anthropic + - etc. + + Or pass api_key directly to the constructor. """ def __init__( @@ -51,14 +79,17 @@ def __init__( """Initialize LiteLLM model adapter. Args: - model_id: The model identifier in LiteLLM format (e.g., "gpt-4", - "claude-3-opus-20240229", "azure/gpt-4", "bedrock/..."). - See: https://docs.litellm.ai/docs/providers - default_generation_params: Default parameters passed to litellm.completion() - (e.g., temperature, max_tokens, top_p, etc.) - api_key: Optional API key. If not provided, LiteLLM will use environment - variables (OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.) - api_base: Optional API base URL for custom endpoints + model_id: The model identifier in LiteLLM format. Examples: + - "gpt-4" (OpenAI) + - "claude-3-opus-20240229" (Anthropic) + - "azure/gpt-4" (Azure OpenAI) + - "bedrock/anthropic.claude-v2" (AWS Bedrock) + See https://docs.litellm.ai/docs/providers for full list. + default_generation_params: Default parameters for all calls. + Common parameters: temperature, max_tokens, top_p. + api_key: API key for the provider. If not provided, LiteLLM + reads from environment variables. + api_base: Custom API base URL for self-hosted or Azure endpoints. """ super().__init__() self._model_id = model_id @@ -70,21 +101,32 @@ def __init__( def model_id(self) -> str: return self._model_id - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - """Generate text using LiteLLM. 
+ def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Call LiteLLM completion API. Args: - prompt: The input prompt - generation_params: Optional generation parameters (temperature, max_tokens, etc.) - **kwargs: Additional LiteLLM-specific parameters + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions for function calling. + tool_choice: Tool choice setting. + **kwargs: Additional LiteLLM parameters. Returns: - Generated text string + ChatResponse with the model's output. """ try: import litellm except ImportError as e: - raise ImportError("LiteLLM is not installed. Install it with: pip install maseval[litellm] or pip install litellm") from e + raise ImportError( + "LiteLLM is not installed. Install with: pip install maseval[litellm]" + ) from e # Merge parameters params = dict(self._default_generation_params) @@ -98,31 +140,58 @@ def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any] if self._api_base: params["api_base"] = self._api_base - # LiteLLM expects messages format - messages = [{"role": "user", "content": prompt}] + # Add tools if provided + if tools: + params["tools"] = tools + if tool_choice is not None: + params["tool_choice"] = tool_choice # Call LiteLLM response = litellm.completion(model=self._model_id, messages=messages, **params) - # Extract text from response - # LiteLLM returns a ModelResponse object similar to OpenAI's format - content = response.choices[0].message.content - return content if content is not None else "" + # Extract response data + choice = response.choices[0] + message = choice.message + + # Build tool_calls list if present + tool_calls = None + if hasattr(message, "tool_calls") and message.tool_calls: + tool_calls = [] + for tc in message.tool_calls: + tool_calls.append( + { + "id": tc.id, + "type": tc.type, + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + ) + + # Build usage dict if present + usage = None + if hasattr(response, "usage") and response.usage: + usage = { + "input_tokens": getattr(response.usage, "prompt_tokens", 0), + "output_tokens": getattr(response.usage, "completion_tokens", 0), + "total_tokens": getattr(response.usage, "total_tokens", 0), + } + + return ChatResponse( + content=message.content, + tool_calls=tool_calls, + role=message.role if hasattr(message, "role") else "assistant", + usage=usage, + model=getattr(response, "model", self._model_id), + stop_reason=getattr(choice, "finish_reason", None), + ) - def gather_config(self) -> dict[str, Any]: + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this LiteLLM model adapter. Returns: - Dictionary containing: - - type: Component class name - - gathered_at: ISO timestamp - - model_id: Model identifier - - adapter_type: LiteLLMModelAdapter - - default_generation_params: Default parameters used for generation (temperature, top_p, etc.) - - litellm_global_config: LiteLLM global configuration affecting model behavior: - - num_retries: Number of retry attempts (affects reliability) - - drop_params: Whether to drop unsupported params (affects behavior) - - verbose: Debug logging enabled (affects observability) + Dictionary containing model configuration and LiteLLM settings. 
""" base_config = super().gather_config() base_config.update( @@ -131,21 +200,18 @@ def gather_config(self) -> dict[str, Any]: } ) - # Extract LiteLLM global configuration that affects model behavior + # Extract LiteLLM global configuration try: import litellm litellm_config = {} - # Retry configuration (affects reliability and latency) if hasattr(litellm, "num_retries"): litellm_config["num_retries"] = litellm.num_retries - # Drop params (affects model behavior with unsupported parameters) if hasattr(litellm, "drop_params"): litellm_config["drop_params"] = litellm.drop_params - # Verbose mode (affects logging and debugging) if hasattr(litellm, "verbose"): litellm_config["verbose"] = litellm.verbose diff --git a/maseval/interface/inference/openai.py b/maseval/interface/inference/openai.py index 846aa80..26207aa 100644 --- a/maseval/interface/inference/openai.py +++ b/maseval/interface/inference/openai.py @@ -1,96 +1,276 @@ """OpenAI and OpenAI-compatible model adapter. +This adapter works with the official OpenAI Python SDK and any OpenAI-compatible +API (like Azure OpenAI, local models with OpenAI-compatible servers, etc.). + Requires openai to be installed: pip install maseval[openai] + +Example: + ```python + from openai import OpenAI + from maseval.interface.inference import OpenAIModelAdapter + + # Standard OpenAI usage + client = OpenAI() # Uses OPENAI_API_KEY env var + model = OpenAIModelAdapter(client=client, model_id="gpt-4") + + # Simple generation + response = model.generate("Hello!") + + # Chat with messages + response = model.chat([ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello!"} + ]) + + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": {...} + } + }] + ) + + # Azure OpenAI + from openai import AzureOpenAI + client = AzureOpenAI( + azure_endpoint="https://your-resource.openai.azure.com", + api_version="2024-02-15-preview" + ) + model = OpenAIModelAdapter(client=client, model_id="gpt-4") + ``` """ -from typing import Any, Optional, Dict -import json +from typing import Any, Optional, Dict, List, Union -from maseval.core.model import ModelAdapter +from maseval.core.model import ModelAdapter, ChatResponse class OpenAIModelAdapter(ModelAdapter): - """Adapter for OpenAI-compatible models (openai or OpenAI-compatible servers). + """Adapter for OpenAI and OpenAI-compatible APIs. - The `client` can be a callable returning a string, or an object with a - `complete`/`chat`/`create` method. This adapter tries common method names. + Works with: + - OpenAI API (gpt-4, gpt-3.5-turbo, etc.) + - Azure OpenAI + - Any OpenAI-compatible server (vLLM, LocalAI, etc.) - Requires openai to be installed. + The adapter expects an OpenAI client instance. API keys and configuration + should be set on the client before passing it to the adapter. """ def __init__( self, client: Any, - model_id: Optional[str] = None, + model_id: str, default_generation_params: Optional[Dict[str, Any]] = None, ): + """Initialize OpenAI model adapter. + + Args: + client: An OpenAI client instance (openai.OpenAI or openai.AzureOpenAI). + The client should already be configured with API keys. + model_id: The model identifier (e.g., "gpt-4", "gpt-3.5-turbo"). + default_generation_params: Default parameters for all calls. + Common parameters: temperature, max_tokens, top_p. 
+ """ super().__init__() self._client = client - self._model_id = model_id or getattr(client, "model_id", "openai:unknown") + self._model_id = model_id self._default_generation_params = default_generation_params or {} @property def model_id(self) -> str: return self._model_id - def _extract_text(self, resp: Any) -> str: - if isinstance(resp, str): - return resp - if isinstance(resp, dict): - # common OpenAI shapes - if "choices" in resp and resp["choices"]: - choice = resp["choices"][0] - # chat-like - if "message" in choice and isinstance(choice["message"], dict): - return choice["message"].get("content", "") - # completion-like - return choice.get("text", "") - # fallback - return json.dumps(resp) - return str(resp) - - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Call OpenAI chat completions API. + + Args: + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions for function calling. + tool_choice: Tool choice setting. + **kwargs: Additional OpenAI parameters. + + Returns: + ChatResponse with the model's output. + """ + # Merge parameters params = dict(self._default_generation_params) if generation_params: params.update(generation_params) params.update(kwargs) - # try common call patterns - # 1) client(prompt) - try: - resp = self._client(prompt, **params) - except TypeError: - # 2) client.create / client.complete / client.chat - for meth in ("create", "complete", "chat", "generate"): - if hasattr(self._client, meth): - func = getattr(self._client, meth) + # Add tools if provided + if tools: + params["tools"] = tools + if tool_choice is not None: + params["tool_choice"] = tool_choice + + # Call OpenAI API + # Try the modern client interface first + if hasattr(self._client, "chat") and hasattr(self._client.chat, "completions"): + response = self._client.chat.completions.create( + model=self._model_id, messages=messages, **params + ) + else: + # Fallback for older or custom clients + response = self._call_legacy_client(messages, params) + + return self._parse_response(response) + + def _call_legacy_client( + self, messages: List[Dict[str, Any]], params: Dict[str, Any] + ) -> Any: + """Handle older client interfaces or callables. + + Args: + messages: Messages to send. + params: Parameters to pass. + + Returns: + Response from the client. + """ + # Try common method names + for method_name in ("create", "complete", "chat", "generate"): + if hasattr(self._client, method_name): + method = getattr(self._client, method_name) + try: + return method(model=self._model_id, messages=messages, **params) + except TypeError: + # Try without model parameter try: - resp = func(prompt, **params) - break + return method(messages=messages, **params) except TypeError: - resp = func(prompt) - break - else: - # last resort: call without kwargs - resp = self._client(prompt) + continue + + # Last resort: try calling directly + if callable(self._client): + return self._client(model=self._model_id, messages=messages, **params) - return self._extract_text(resp) + raise TypeError( + f"Unable to call client of type {type(self._client).__name__}. 
" + "Expected an OpenAI client with chat.completions.create() method." + ) + + def _parse_response(self, response: Any) -> ChatResponse: + """Parse OpenAI response into ChatResponse. + + Args: + response: The raw response from OpenAI. + + Returns: + ChatResponse with extracted data. + """ + # Handle dict responses (from mocks or legacy clients) + if isinstance(response, dict): + return self._parse_dict_response(response) + + # Handle modern OpenAI response objects + choice = response.choices[0] + message = choice.message + + # Extract tool calls + tool_calls = None + if hasattr(message, "tool_calls") and message.tool_calls: + tool_calls = [] + for tc in message.tool_calls: + tool_calls.append( + { + "id": tc.id, + "type": tc.type, + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + ) + + # Extract usage + usage = None + if hasattr(response, "usage") and response.usage: + usage = { + "input_tokens": getattr(response.usage, "prompt_tokens", 0), + "output_tokens": getattr(response.usage, "completion_tokens", 0), + "total_tokens": getattr(response.usage, "total_tokens", 0), + } + + return ChatResponse( + content=message.content, + tool_calls=tool_calls, + role=getattr(message, "role", "assistant"), + usage=usage, + model=getattr(response, "model", self._model_id), + stop_reason=getattr(choice, "finish_reason", None), + ) + + def _parse_dict_response(self, response: Dict[str, Any]) -> ChatResponse: + """Parse dict response (from mocks or legacy APIs). + + Args: + response: Dict response in OpenAI format. + + Returns: + ChatResponse with extracted data. + """ + if "choices" not in response or not response["choices"]: + # Simple string response wrapped in dict + return ChatResponse(content=str(response)) + + choice = response["choices"][0] + + # Handle chat-style response + if "message" in choice: + message = choice["message"] + content = message.get("content") + tool_calls = message.get("tool_calls") + role = message.get("role", "assistant") + # Handle completion-style response + elif "text" in choice: + content = choice["text"] + tool_calls = None + role = "assistant" + else: + content = str(choice) + tool_calls = None + role = "assistant" + + # Extract usage if present + usage = None + if "usage" in response: + usage = { + "input_tokens": response["usage"].get("prompt_tokens", 0), + "output_tokens": response["usage"].get("completion_tokens", 0), + "total_tokens": response["usage"].get("total_tokens", 0), + } + + return ChatResponse( + content=content, + tool_calls=tool_calls, + role=role, + usage=usage, + model=response.get("model", self._model_id), + stop_reason=choice.get("finish_reason"), + ) - def gather_config(self) -> dict[str, Any]: + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this OpenAI model adapter. Returns: - Dictionary containing: - - type: Component class name - - gathered_at: ISO timestamp - - model_id: Model identifier - - adapter_type: OpenAIModelAdapter - - default_generation_params: Default parameters used for generation (temperature, top_p, etc.) - - client_type: Type name of the underlying client - - client_config: OpenAI client configuration affecting model behavior: - - timeout: Request timeout settings (affects latency) - - max_retries: Maximum number of retry attempts (affects reliability) + Dictionary containing model configuration and client settings. 
""" base_config = super().gather_config() base_config.update( @@ -100,13 +280,11 @@ def gather_config(self) -> dict[str, Any]: } ) - # Extract OpenAI client configuration that affects model behavior + # Extract client configuration client_config = {} - # Timeout configuration (affects latency and reliability) if hasattr(self._client, "timeout"): timeout = self._client.timeout - # Handle both httpx.Timeout objects and simple floats if hasattr(timeout, "connect"): client_config["timeout"] = { "connect": timeout.connect, @@ -117,7 +295,6 @@ def gather_config(self) -> dict[str, Any]: else: client_config["timeout"] = timeout - # Max retries (affects reliability and latency) if hasattr(self._client, "max_retries"): client_config["max_retries"] = self._client.max_retries diff --git a/pyproject.toml b/pyproject.toml index 5c41202..0b49cf1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ langgraph = ["langgraph>=0.6.0"] llamaindex = ["llama-index-core>=0.12.0"] # Inference engines +anthropic = ["anthropic>=0.40.0"] openai = ["openai>=1.107.2"] google-genai = ["google-genai>=1.37.0"] transformers = ["transformers>=4.37.0"] @@ -47,7 +48,7 @@ langfuse = ["langfuse>=3.3.4"] # Dependencies for running examples (only what's actually used) examples = [ - "maseval[smolagents,langgraph,llamaindex,openai,google-genai,litellm,langfuse]", + "maseval[smolagents,langgraph,llamaindex,anthropic,openai,google-genai,litellm,langfuse]", # Additional integrations used in examples "langchain>=0.3.27", "langchain-google-genai>=2.1.12", diff --git a/tests/conftest.py b/tests/conftest.py index 75b147c..fd40e9e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,29 +12,81 @@ Evaluator, MessageHistory, ) -from maseval.core.model import ModelAdapter +from maseval.core.model import ModelAdapter, ChatResponse # ==================== Dummy Components ==================== class DummyModelAdapter(ModelAdapter): - """Minimal model adapter for testing.""" + """Minimal model adapter for testing. - def __init__(self, model_id: str = "test-model", responses: Optional[List[str]] = None): + Simulates model responses without making actual API calls. Useful for + unit tests and integration tests that don't require real LLM inference. + + Supports both chat() and generate() methods, returning responses from + a predefined list in round-robin fashion. + """ + + def __init__( + self, + model_id: str = "test-model", + responses: Optional[List[str]] = None, + tool_calls: Optional[List[List[Dict[str, Any]]]] = None, + ): + """Initialize DummyModelAdapter. + + Args: + model_id: Identifier for this model instance. + responses: List of text responses to return. Cycles through the list. + tool_calls: Optional list of tool call lists. If provided, each call + returns the corresponding tool_calls (cycling through the list). + """ super().__init__() self._model_id = model_id self._responses = responses or ["test response"] + self._tool_calls = tool_calls self._call_count = 0 @property def model_id(self) -> str: return self._model_id - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Any] = None, + **kwargs: Any, + ) -> ChatResponse: + """Return a mock response. + + Args: + messages: Input messages (ignored for mock). + generation_params: Generation parameters (ignored for mock). 
+ tools: Tool definitions (ignored for mock). + tool_choice: Tool choice (ignored for mock). + **kwargs: Additional arguments (ignored for mock). + + Returns: + ChatResponse with mock content and optional tool_calls. + """ response = self._responses[self._call_count % len(self._responses)] + + # Get tool_calls for this response if provided + response_tool_calls = None + if self._tool_calls: + response_tool_calls = self._tool_calls[self._call_count % len(self._tool_calls)] + self._call_count += 1 - return response + + return ChatResponse( + content=response, + tool_calls=response_tool_calls, + role="assistant", + model=self._model_id, + ) class DummyAgent: diff --git a/tests/test_core/test_model_adapter.py b/tests/test_core/test_model_adapter.py index 8718dea..bbcf3c6 100644 --- a/tests/test_core/test_model_adapter.py +++ b/tests/test_core/test_model_adapter.py @@ -15,7 +15,9 @@ import json import time from datetime import datetime +from typing import Any, Dict, List, Optional, Union from conftest import DummyModelAdapter +from maseval.core.model import ChatResponse @pytest.mark.core @@ -23,7 +25,7 @@ class TestModelAdapterBaseContract: """Test fundamental ModelAdapter base class behavior.""" def test_model_adapter_has_abstract_methods(self): - """ModelAdapter requires subclasses to implement model_id and _generate_impl.""" + """ModelAdapter requires subclasses to implement model_id and _chat_impl.""" from maseval.core.model import ModelAdapter # Cannot instantiate abstract class directly @@ -35,14 +37,14 @@ def test_model_adapter_requires_model_id_property(self): from maseval.core.model import ModelAdapter class IncompleteAdapter(ModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): - return "test" + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): + return ChatResponse(content="test") with pytest.raises(TypeError): IncompleteAdapter() # type: ignore - def test_model_adapter_requires_generate_impl(self): - """Subclasses must implement _generate_impl method.""" + def test_model_adapter_requires_chat_impl(self): + """Subclasses must implement _chat_impl method.""" from maseval.core.model import ModelAdapter class IncompleteAdapter(ModelAdapter): @@ -91,7 +93,7 @@ def test_generate_logs_successful_calls(self, dummy_model): # Verify required fields call = dummy_model.logs[0] assert "timestamp" in call - assert "prompt_length" in call + assert "message_count" in call assert "response_length" in call assert "duration_seconds" in call assert "status" in call @@ -146,7 +148,60 @@ def test_generate_with_empty_prompt(self): assert isinstance(result, str) assert len(model.logs) == 1 - assert model.logs[0]["prompt_length"] == 0 + # Empty prompt creates one message + assert model.logs[0]["message_count"] == 1 + + +@pytest.mark.core +class TestModelAdapterChatContract: + """Test chat() method behavior.""" + + def test_chat_returns_chat_response(self): + """chat() returns a ChatResponse object.""" + model = DummyModelAdapter(responses=["Test response"]) + result = model.chat([{"role": "user", "content": "Hello"}]) + + assert isinstance(result, ChatResponse) + assert result.content == "Test response" + assert result.role == "assistant" + + def test_chat_with_multiple_messages(self): + """chat() accepts multiple messages.""" + model = DummyModelAdapter(responses=["Response"]) + messages = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + ] + result = model.chat(messages) + + assert 
isinstance(result, ChatResponse) + assert model.logs[0]["message_count"] == 2 + + def test_chat_response_to_message(self): + """ChatResponse.to_message() returns dict.""" + model = DummyModelAdapter(responses=["Hello!"]) + result = model.chat([{"role": "user", "content": "Hi"}]) + + message = result.to_message() + assert isinstance(message, dict) + assert message["role"] == "assistant" + assert message["content"] == "Hello!" + + def test_chat_with_tool_calls(self): + """chat() returns tool_calls when provided.""" + tool_calls = [ + { + "id": "call_1", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + model = DummyModelAdapter(responses=[""], tool_calls=[tool_calls]) + result = model.chat([{"role": "user", "content": "Weather?"}]) + + assert result.tool_calls is not None + assert len(result.tool_calls) == 1 + assert result.tool_calls[0]["function"]["name"] == "get_weather" @pytest.mark.core @@ -157,7 +212,7 @@ def test_model_adapter_error_handling(self, dummy_model): """Test that errors are logged correctly.""" class FailingModel(DummyModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): raise ValueError("Test error") model = FailingModel() @@ -174,7 +229,7 @@ def test_generate_logs_error_timing(self): """generate() logs duration even when errors occur.""" class FailingModel(DummyModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): time.sleep(0.01) # Small delay raise RuntimeError("Fail") @@ -187,10 +242,10 @@ def _generate_impl(self, prompt, generation_params=None, **kwargs): assert call["duration_seconds"] >= 0.01 def test_generate_logs_error_metadata(self): - """generate() logs prompt length and params even on error.""" + """generate() logs message count and params even on error.""" class FailingModel(DummyModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): raise Exception("Fail") model = FailingModel() @@ -200,7 +255,7 @@ def _generate_impl(self, prompt, generation_params=None, **kwargs): model.generate("Test prompt", generation_params=params, custom="arg") call = model.logs[0] - assert call["prompt_length"] == len("Test prompt") + assert call["message_count"] == 1 assert call["generation_params"] == params assert "custom" in call["kwargs"] @@ -211,7 +266,7 @@ class CustomError(Exception): pass class FailingModel(DummyModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): raise CustomError("Original error") model = FailingModel() @@ -285,11 +340,11 @@ def __init__(self): super().__init__() self.call_count = 0 - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): self.call_count += 1 if self.call_count % 2 == 0: raise ValueError("Fail") - return "Success" + return ChatResponse(content="Success") model = SometimesFailingModel() diff --git a/tests/test_interface/test_model_integration/test_model_adapters.py b/tests/test_interface/test_model_integration/test_model_adapters.py index 0d1a7b3..8e73ac6 100644 --- 
a/tests/test_interface/test_model_integration/test_model_adapters.py +++ b/tests/test_interface/test_model_integration/test_model_adapters.py @@ -25,23 +25,40 @@ def test_openai_adapter_initialization(self): pytest.importorskip("openai") from maseval.interface.inference.openai import OpenAIModelAdapter - # Mock client - def mock_client(prompt, **kwargs): - return {"choices": [{"message": {"content": "Response"}}]} + # Mock client with chat.completions.create interface + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Response"}}]} + + completions = Completions() - adapter = OpenAIModelAdapter(client=mock_client, model_id="gpt-4") + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") assert adapter.model_id == "gpt-4" - def test_openai_adapter_generate_with_callable(self): - """OpenAIModelAdapter works with callable client.""" + def test_openai_adapter_generate_with_modern_client(self): + """OpenAIModelAdapter works with modern client interface.""" pytest.importorskip("openai") from maseval.interface.inference.openai import OpenAIModelAdapter - def mock_client(prompt, **kwargs): - return {"choices": [{"message": {"content": f"Response to: {prompt}"}}]} + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + # Extract user message content + user_msg = next((m for m in messages if m["role"] == "user"), {}) + content = user_msg.get("content", "") + return {"choices": [{"message": {"content": f"Response to: {content}"}}]} + + completions = Completions() + + chat = Chat() - adapter = OpenAIModelAdapter(client=mock_client, model_id="gpt-4") + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") result = adapter.generate("Test prompt") assert isinstance(result, str) @@ -53,24 +70,19 @@ def test_openai_adapter_extract_text_from_dict(self): from maseval.interface.inference.openai import OpenAIModelAdapter # Chat completion format - def chat_client(prompt, **kwargs): - return {"choices": [{"message": {"content": "Chat response"}}]} - - adapter = OpenAIModelAdapter(client=chat_client, model_id="gpt-4") - result = adapter.generate("Test") - assert result == "Chat response" + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Chat response"}}]} - def test_openai_adapter_extract_text_from_string(self): - """OpenAIModelAdapter handles string responses.""" - pytest.importorskip("openai") - from maseval.interface.inference.openai import OpenAIModelAdapter + completions = Completions() - def string_client(prompt, **kwargs): - return "Direct string response" + chat = Chat() - adapter = OpenAIModelAdapter(client=string_client, model_id="gpt-4") + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") result = adapter.generate("Test") - assert result == "Direct string response" + assert result == "Chat response" def test_openai_adapter_default_generation_params(self): """OpenAIModelAdapter uses default generation parameters.""" @@ -79,12 +91,19 @@ def test_openai_adapter_default_generation_params(self): captured_params = {} - def mock_client(prompt, **kwargs): - captured_params.update(kwargs) - return "Response" + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + captured_params.update(kwargs) + return {"choices": [{"message": {"content": "Response"}}]} + + 
completions = Completions() + + chat = Chat() adapter = OpenAIModelAdapter( - client=mock_client, + client=MockClient(), model_id="gpt-4", default_generation_params={"temperature": 0.7, "max_tokens": 100}, ) @@ -100,11 +119,18 @@ def test_openai_adapter_gather_config_includes_params(self): pytest.importorskip("openai") from maseval.interface.inference.openai import OpenAIModelAdapter - def mock_client(prompt, **kwargs): - return "Response" + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Response"}}]} + + completions = Completions() + + chat = Chat() adapter = OpenAIModelAdapter( - client=mock_client, + client=MockClient(), model_id="gpt-4", default_generation_params={"temperature": 0.9}, ) @@ -126,8 +152,14 @@ def __init__(self): self.timeout = 60 self.max_retries = 3 - def __call__(self, prompt, **kwargs): - return "Response" + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Response"}}]} + + completions = Completions() + + chat = Chat() client = MockOpenAIClient() adapter = OpenAIModelAdapter(client=client, model_id="gpt-4") @@ -176,10 +208,22 @@ def test_google_genai_adapter_generate(self): class MockClient: class Models: def generate_content(self, model, contents, config=None): + # Extract text from contents (first user message) + text = "" + if contents: + for content in contents: + if content.get("role") == "user": + parts = content.get("parts", []) + if parts: + text = parts[0].get("text", "") + break + class Response: - text = f"Response to: {contents}" + pass - return Response() + resp = Response() + resp.text = f"Response to: {text}" + return resp def __init__(self): self.models = self.Models() @@ -274,7 +318,7 @@ def mock_model(prompt, **kwargs): assert adapter.model_id == "gpt2" def test_huggingface_adapter_generate(self): - """HuggingFaceModelAdapter generates text.""" + """HuggingFaceModelAdapter generates text with message formatting.""" pytest.importorskip("transformers") from maseval.interface.inference.huggingface import HuggingFaceModelAdapter @@ -285,7 +329,8 @@ def mock_model(prompt, **kwargs): result = adapter.generate("Test prompt") assert isinstance(result, str) - assert result == "Generated: Test prompt" + # Without a tokenizer, the adapter formats messages as "user: content\nassistant:" + assert "Generated:" in result def test_huggingface_adapter_default_generation_params(self): """HuggingFaceModelAdapter uses default generation parameters.""" @@ -322,7 +367,8 @@ def mock_model(prompt): adapter = HuggingFaceModelAdapter(model=mock_model, model_id="gpt2") result = adapter.generate("Test") - assert result == "Response: Test" + # Should still work, just formats the prompt as messages + assert "Response:" in result def test_huggingface_adapter_gather_config(self): """HuggingFaceModelAdapter config includes parameters.""" @@ -440,8 +486,18 @@ def test_all_adapters_expose_model_id(self): from maseval.interface.inference.huggingface import HuggingFaceModelAdapter from maseval.interface.inference.litellm import LiteLLMModelAdapter - # OpenAI - openai_adapter = OpenAIModelAdapter(client=lambda p, **k: "R", model_id="gpt-4") + # OpenAI - mock with modern interface + class MockOpenAIClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "R"}}]} + + completions = Completions() + + chat = Chat() + + openai_adapter = 
OpenAIModelAdapter(client=MockOpenAIClient(), model_id="gpt-4") assert openai_adapter.model_id == "gpt-4" # Google GenAI @@ -482,8 +538,18 @@ def test_all_adapters_include_default_params_in_config(self): params = {"temperature": 0.7} # OpenAI + class MockOpenAIClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "R"}}]} + + completions = Completions() + + chat = Chat() + openai_config = OpenAIModelAdapter( - client=lambda p, **k: "R", + client=MockOpenAIClient(), model_id="gpt-4", default_generation_params=params, ).gather_config() diff --git a/uv.lock b/uv.lock index eea7ebc..e12ce79 100644 --- a/uv.lock +++ b/uv.lock @@ -170,6 +170,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "anthropic" +version = "0.75.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "docstring-parser" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/1f/08e95f4b7e2d35205ae5dcbb4ae97e7d477fc521c275c02609e2931ece2d/anthropic-0.75.0.tar.gz", hash = "sha256:e8607422f4ab616db2ea5baacc215dd5f028da99ce2f022e33c7c535b29f3dfb", size = 439565, upload-time = "2025-11-24T20:41:45.28Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/1c/1cd02b7ae64302a6e06724bf80a96401d5313708651d277b1458504a1730/anthropic-0.75.0-py3-none-any.whl", hash = "sha256:ea8317271b6c15d80225a9f3c670152746e88805a7a61e14d4a374577164965b", size = 388164, upload-time = "2025-11-24T20:41:43.587Z" }, +] + [[package]] name = "anyio" version = "4.12.0" @@ -849,6 +868,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, ] +[[package]] +name = "docstring-parser" +version = "0.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/9d/c3b43da9515bd270df0f80548d9944e389870713cc1fe2b8fb35fe2bcefd/docstring_parser-0.17.0.tar.gz", hash = "sha256:583de4a309722b3315439bb31d64ba3eebada841f2e2cee23b99df001434c912", size = 27442, upload-time = "2025-07-21T07:35:01.868Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896, upload-time = "2025-07-21T07:35:00.684Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.1" @@ -2407,6 +2435,7 @@ dependencies = [ [package.optional-dependencies] all = [ + { name = "anthropic" }, { name = "google-genai" }, { name = "ipykernel" }, { name = "ipywidgets" }, @@ -2424,7 +2453,11 @@ all = [ { name = "typing-extensions" }, { name = "wandb" }, ] +anthropic = [ + { name = "anthropic" }, +] examples = [ + { name = "anthropic" }, { name = "google-genai" }, { name = "ipykernel" }, { name = "ipywidgets" }, @@ -2486,6 
+2519,7 @@ docs = [ [package.metadata] requires-dist = [ + { name = "anthropic", marker = "extra == 'anthropic'", specifier = ">=0.40.0" }, { name = "gitpython", specifier = ">=3.1.0" }, { name = "google-genai", marker = "extra == 'google-genai'", specifier = ">=1.37.0" }, { name = "ipykernel", marker = "extra == 'examples'", specifier = ">=6.0.0" }, @@ -2498,7 +2532,7 @@ requires-dist = [ { name = "litellm", marker = "extra == 'litellm'", specifier = ">=1.0.0" }, { name = "llama-index-core", marker = "extra == 'llamaindex'", specifier = ">=0.12.0" }, { name = "maseval", extras = ["examples", "transformers", "wandb"], marker = "extra == 'all'" }, - { name = "maseval", extras = ["smolagents", "langgraph", "llamaindex", "openai", "google-genai", "litellm", "langfuse"], marker = "extra == 'examples'" }, + { name = "maseval", extras = ["smolagents", "langgraph", "llamaindex", "anthropic", "openai", "google-genai", "litellm", "langfuse"], marker = "extra == 'examples'" }, { name = "mcp", marker = "extra == 'examples'", specifier = ">=1.22.0" }, { name = "openai", marker = "extra == 'openai'", specifier = ">=1.107.2" }, { name = "pydantic", specifier = ">=2.12.5" }, @@ -2509,7 +2543,7 @@ requires-dist = [ { name = "typing-extensions", marker = "extra == 'examples'", specifier = ">=4.0.0" }, { name = "wandb", marker = "extra == 'wandb'", specifier = ">=0.15.0" }, ] -provides-extras = ["smolagents", "langgraph", "llamaindex", "openai", "google-genai", "transformers", "litellm", "wandb", "langfuse", "examples", "all"] +provides-extras = ["smolagents", "langgraph", "llamaindex", "anthropic", "openai", "google-genai", "transformers", "litellm", "wandb", "langfuse", "examples", "all"] [package.metadata.requires-dev] dev = [ From 3d4d19b25fb69d182dd1e463dbe937176588d9b4 Mon Sep 17 00:00:00 2001 From: cemde Date: Thu, 25 Dec 2025 13:45:57 +0100 Subject: [PATCH 2/6] fixed tests --- maseval/interface/inference/anthropic.py | 13 +++-------- maseval/interface/inference/google_genai.py | 8 ++----- maseval/interface/inference/huggingface.py | 20 ++++------------ maseval/interface/inference/litellm.py | 4 +--- maseval/interface/inference/openai.py | 11 +++------ .../test_macs/test_macs_evaluator.py | 19 +++++++-------- .../test_model_adapter_contract.py | 23 +++++++++++++------ tests/test_core/test_model_adapter.py | 1 - 8 files changed, 40 insertions(+), 59 deletions(-) diff --git a/maseval/interface/inference/anthropic.py b/maseval/interface/inference/anthropic.py index 0363d22..2b31f88 100644 --- a/maseval/interface/inference/anthropic.py +++ b/maseval/interface/inference/anthropic.py @@ -167,9 +167,7 @@ def _chat_impl( return self._parse_response(response) - def _convert_messages( - self, messages: List[Dict[str, Any]] - ) -> tuple[Optional[str], List[Dict[str, Any]]]: + def _convert_messages(self, messages: List[Dict[str, Any]]) -> tuple[Optional[str], List[Dict[str, Any]]]: """Convert OpenAI messages to Anthropic format. Anthropic separates system messages and uses different format for @@ -273,9 +271,7 @@ def _convert_tools(self, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]: return anthropic_tools - def _convert_tool_choice( - self, tool_choice: Union[str, Dict[str, Any]] - ) -> Dict[str, Any]: + def _convert_tool_choice(self, tool_choice: Union[str, Dict[str, Any]]) -> Dict[str, Any]: """Convert OpenAI tool_choice to Anthropic format. 
Args: @@ -341,10 +337,7 @@ def _parse_response(self, response: Any) -> ChatResponse: usage = { "input_tokens": getattr(response.usage, "input_tokens", 0), "output_tokens": getattr(response.usage, "output_tokens", 0), - "total_tokens": ( - getattr(response.usage, "input_tokens", 0) - + getattr(response.usage, "output_tokens", 0) - ), + "total_tokens": (getattr(response.usage, "input_tokens", 0) + getattr(response.usage, "output_tokens", 0)), } # Extract stop reason diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py index fe71ba7..80e4c53 100644 --- a/maseval/interface/inference/google_genai.py +++ b/maseval/interface/inference/google_genai.py @@ -161,15 +161,11 @@ def _chat_impl( generation_config = genai.types.GenerateContentConfig(**config_params) if config_params else None # Call API - response = self._client.models.generate_content( - model=self._model_id, contents=contents, config=generation_config - ) + response = self._client.models.generate_content(model=self._model_id, contents=contents, config=generation_config) return self._parse_response(response) - def _convert_messages( - self, messages: List[Dict[str, Any]] - ) -> tuple[Optional[str], List[Dict[str, Any]]]: + def _convert_messages(self, messages: List[Dict[str, Any]]) -> tuple[Optional[str], List[Dict[str, Any]]]: """Convert OpenAI messages to Google format. Google uses 'contents' with 'parts', and separates system instructions. diff --git a/maseval/interface/inference/huggingface.py b/maseval/interface/inference/huggingface.py index 7f7e541..5d20b56 100644 --- a/maseval/interface/inference/huggingface.py +++ b/maseval/interface/inference/huggingface.py @@ -172,9 +172,7 @@ def _chat_with_template( # Try to apply template with tools to check support try: # The template should accept tools parameter if it supports them - prompt = tokenizer.apply_chat_template( - messages, tools=tools, add_generation_prompt=True, tokenize=False - ) + prompt = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, tokenize=False) except TypeError: # Template doesn't accept tools parameter raise ToolCallingNotSupportedError( @@ -183,9 +181,7 @@ def _chat_with_template( "Consider using LiteLLMModelAdapter for reliable tool calling." ) else: - prompt = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) + prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) # Generate response response_text = self._call_model(prompt, params) @@ -206,9 +202,7 @@ def _chat_with_template( model=self._model_id, ) - def _chat_without_template( - self, messages: List[Dict[str, Any]], params: Dict[str, Any] - ) -> ChatResponse: + def _chat_without_template(self, messages: List[Dict[str, Any]], params: Dict[str, Any]) -> ChatResponse: """Generate without a chat template (simple prompt concatenation). Args: @@ -274,9 +268,7 @@ def _call_model(self, prompt: str, params: Dict[str, Any]) -> str: else: return str(result) - def _parse_tool_calls( - self, response: str - ) -> tuple[Optional[List[Dict[str, Any]]], Optional[str]]: + def _parse_tool_calls(self, response: str) -> tuple[Optional[List[Dict[str, Any]]], Optional[str]]: """Parse tool calls from model response. Different models format tool calls differently. 
This method attempts @@ -314,9 +306,7 @@ def _parse_tool_calls( }, } ) - remaining_content = remaining_content.replace( - f"{match}", "" - ) + remaining_content = remaining_content.replace(f"{match}", "") except json.JSONDecodeError: continue diff --git a/maseval/interface/inference/litellm.py b/maseval/interface/inference/litellm.py index a6d7259..f0b9866 100644 --- a/maseval/interface/inference/litellm.py +++ b/maseval/interface/inference/litellm.py @@ -124,9 +124,7 @@ def _chat_impl( try: import litellm except ImportError as e: - raise ImportError( - "LiteLLM is not installed. Install with: pip install maseval[litellm]" - ) from e + raise ImportError("LiteLLM is not installed. Install with: pip install maseval[litellm]") from e # Merge parameters params = dict(self._default_generation_params) diff --git a/maseval/interface/inference/openai.py b/maseval/interface/inference/openai.py index 26207aa..62bae77 100644 --- a/maseval/interface/inference/openai.py +++ b/maseval/interface/inference/openai.py @@ -123,18 +123,14 @@ def _chat_impl( # Call OpenAI API # Try the modern client interface first if hasattr(self._client, "chat") and hasattr(self._client.chat, "completions"): - response = self._client.chat.completions.create( - model=self._model_id, messages=messages, **params - ) + response = self._client.chat.completions.create(model=self._model_id, messages=messages, **params) else: # Fallback for older or custom clients response = self._call_legacy_client(messages, params) return self._parse_response(response) - def _call_legacy_client( - self, messages: List[Dict[str, Any]], params: Dict[str, Any] - ) -> Any: + def _call_legacy_client(self, messages: List[Dict[str, Any]], params: Dict[str, Any]) -> Any: """Handle older client interfaces or callables. Args: @@ -162,8 +158,7 @@ def _call_legacy_client( return self._client(model=self._model_id, messages=messages, **params) raise TypeError( - f"Unable to call client of type {type(self._client).__name__}. " - "Expected an OpenAI client with chat.completions.create() method." + f"Unable to call client of type {type(self._client).__name__}. Expected an OpenAI client with chat.completions.create() method." 
) def _parse_response(self, response: Any) -> ChatResponse: diff --git a/tests/test_benchmarks/test_macs/test_macs_evaluator.py b/tests/test_benchmarks/test_macs/test_macs_evaluator.py index ab1b627..e5f0deb 100644 --- a/tests/test_benchmarks/test_macs/test_macs_evaluator.py +++ b/tests/test_benchmarks/test_macs/test_macs_evaluator.py @@ -436,20 +436,21 @@ def test_call_system_includes_tool_invocations(self, sample_task, sample_trace, traces = {"messages": sample_trace, "tool_traces": sample_tool_traces} - # Capture the prompt sent to the model - captured_prompts = [] - original_generate = model._generate_impl + # Capture the messages sent to the model + captured_messages = [] + original_chat = model._chat_impl - def capture_prompt(prompt, *args, **kwargs): - captured_prompts.append(prompt) - return original_generate(prompt, *args, **kwargs) + def capture_messages(messages, *args, **kwargs): + captured_messages.append(messages) + return original_chat(messages, *args, **kwargs) - with patch.object(model, "_generate_impl", side_effect=capture_prompt): + with patch.object(model, "_chat_impl", side_effect=capture_messages): evaluator(traces) # Check that tool invocations were included in the prompt - assert len(captured_prompts) > 0 - prompt = captured_prompts[0] + assert len(captured_messages) > 0 + # The prompt is in the first user message content + prompt = captured_messages[0][0]["content"] assert "search_flights" in prompt or "book_flight" in prompt diff --git a/tests/test_contract/test_model_adapter_contract.py b/tests/test_contract/test_model_adapter_contract.py index e229d1b..c4eb34f 100644 --- a/tests/test_contract/test_model_adapter_contract.py +++ b/tests/test_contract/test_model_adapter_contract.py @@ -11,6 +11,7 @@ What this contract validates: - generate() returns string consistently +- chat() returns ChatResponse consistently - Call logging happens uniformly (successful and failed calls) - Timing capture works consistently - Trace structure is consistent across implementations (gather_traces) @@ -126,12 +127,19 @@ def create_openai_adapter(model_id: str = "gpt-4", responses: Optional[List[str] response_list: List[str] = responses or ["Test response"] call_count = [0] - def mock_client(prompt, **kwargs): - response = response_list[call_count[0] % len(response_list)] - call_count[0] += 1 - return {"choices": [{"message": {"content": response}}]} + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + response = response_list[call_count[0] % len(response_list)] + call_count[0] += 1 + return {"choices": [{"message": {"content": response}}]} + + completions = Completions() + + chat = Chat() - return OpenAIModelAdapter(client=mock_client, model_id=model_id) + return OpenAIModelAdapter(client=MockClient(), model_id=model_id) def create_google_genai_adapter(model_id: str = "gemini-pro", responses: Optional[List[str]] = None) -> Any: @@ -417,7 +425,8 @@ def test_adapter_handles_empty_prompt(self, implementation): traces = adapter.gather_traces() assert traces["total_calls"] == 1 - assert traces["logs"][0]["prompt_length"] == 0 + # Empty prompt still creates one message + assert traces["logs"][0]["message_count"] == 1 finally: cleanup_adapter(adapter, implementation) @@ -521,7 +530,7 @@ def test_all_adapters_log_same_call_metadata(self): assert "timestamp" in call, f"Missing timestamp in {impl}" assert "status" in call, f"Missing status in {impl}" assert "duration_seconds" in call, f"Missing duration in {impl}" - assert "prompt_length" in 
call, f"Missing prompt_length in {impl}" + assert "message_count" in call, f"Missing message_count in {impl}" finally: for adapter, impl in adapters: cleanup_adapter(adapter, impl) diff --git a/tests/test_core/test_model_adapter.py b/tests/test_core/test_model_adapter.py index bbcf3c6..e66861c 100644 --- a/tests/test_core/test_model_adapter.py +++ b/tests/test_core/test_model_adapter.py @@ -15,7 +15,6 @@ import json import time from datetime import datetime -from typing import Any, Dict, List, Optional, Union from conftest import DummyModelAdapter from maseval.core.model import ChatResponse From 203315ec4c040a2aadb1fa2935ce2289d23c0f66 Mon Sep 17 00:00:00 2001 From: cemde Date: Thu, 25 Dec 2025 16:07:10 +0100 Subject: [PATCH 3/6] improved testing --- .../test_model_adapter_contract.py | 92 +- .../test_model_adapters.py | 912 ++++++++++++++++++ 2 files changed, 1000 insertions(+), 4 deletions(-) diff --git a/tests/test_contract/test_model_adapter_contract.py b/tests/test_contract/test_model_adapter_contract.py index c4eb34f..c8fbd6a 100644 --- a/tests/test_contract/test_model_adapter_contract.py +++ b/tests/test_contract/test_model_adapter_contract.py @@ -29,6 +29,7 @@ from datetime import datetime from typing import Any, Dict, Optional, List from conftest import DummyModelAdapter +from maseval.core.model import ChatResponse # ==================== Helper Functions ==================== @@ -226,6 +227,43 @@ def create_dummy_adapter(model_id: str = "test-model", responses: Optional[List[ return DummyModelAdapter(model_id=model_id, responses=responses) +def create_anthropic_adapter(model_id: str = "claude-3", responses: Optional[List[str]] = None) -> Any: + """Create AnthropicModelAdapter instance.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + response_list: List[str] = responses or ["Test response"] + call_count = [0] + + class MockTextBlock: + type = "text" + + def __init__(self, text: str): + self.text = text + + class MockUsage: + input_tokens = 10 + output_tokens = 5 + + class MockMessages: + def create(self, **kwargs): + response = response_list[call_count[0] % len(response_list)] + call_count[0] += 1 + + class MockResponse: + content = [MockTextBlock(response)] + usage = MockUsage() + model = model_id + stop_reason = "end_turn" + + return MockResponse() + + class MockClient: + messages = MockMessages() + + return AnthropicModelAdapter(client=MockClient(), model_id=model_id) + + def create_adapter_for_implementation(implementation: str, model_id: str, responses: Optional[List[str]] = None) -> Any: """Factory function to create adapter for specified implementation.""" factories = { @@ -234,6 +272,7 @@ def create_adapter_for_implementation(implementation: str, model_id: str, respon "google_genai": create_google_genai_adapter, "huggingface": create_huggingface_adapter, "litellm": create_litellm_adapter, + "anthropic": create_anthropic_adapter, } if implementation not in factories: @@ -255,7 +294,7 @@ def cleanup_adapter(adapter: Any, implementation: str) -> None: @pytest.mark.contract @pytest.mark.interface -@pytest.mark.parametrize("implementation", ["dummy", "openai", "google_genai", "huggingface", "litellm"]) +@pytest.mark.parametrize("implementation", ["dummy", "openai", "google_genai", "huggingface", "litellm", "anthropic"]) class TestModelAdapterContract: """Verify all ModelAdapter implementations honor the same contract.""" @@ -270,6 +309,51 @@ def test_adapter_generate_returns_string(self, implementation): finally: 
cleanup_adapter(adapter, implementation) + def test_adapter_chat_returns_chat_response(self, implementation): + """All adapters return ChatResponse from chat().""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=["Test response"]) + + try: + result = adapter.chat([{"role": "user", "content": "Test prompt"}]) + assert isinstance(result, ChatResponse) + assert result.content is not None or result.tool_calls is not None + assert result.role == "assistant" + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_chat_handles_multi_turn(self, implementation): + """All adapters handle multi-turn conversations.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=["Response"]) + + try: + result = adapter.chat( + [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "How are you?"}, + ] + ) + assert isinstance(result, ChatResponse) + assert result.content is not None + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_chat_handles_system_message(self, implementation): + """All adapters handle system messages.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=["Response"]) + + try: + result = adapter.chat( + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello"}, + ] + ) + assert isinstance(result, ChatResponse) + assert result.content is not None + finally: + cleanup_adapter(adapter, implementation) + def test_adapter_traces_have_base_fields(self, implementation): """All adapters include required trace fields.""" adapter = create_adapter_for_implementation(implementation, model_id="test-model") @@ -453,7 +537,7 @@ class TestCrossAdapterConsistency: def test_all_adapters_have_consistent_trace_structure(self): """All adapter implementations have same base trace structure.""" - implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm"] + implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm", "anthropic"] adapters = [] try: @@ -485,7 +569,7 @@ def test_all_adapters_have_consistent_trace_structure(self): def test_all_adapters_have_consistent_config_structure(self): """All adapter implementations have same base config structure.""" - implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm"] + implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm", "anthropic"] adapters = [] try: @@ -511,7 +595,7 @@ def test_all_adapters_have_consistent_config_structure(self): def test_all_adapters_log_same_call_metadata(self): """All adapters log same metadata for each call.""" - implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm"] + implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm", "anthropic"] adapters = [] try: diff --git a/tests/test_interface/test_model_integration/test_model_adapters.py b/tests/test_interface/test_model_integration/test_model_adapters.py index 8e73ac6..46ed43d 100644 --- a/tests/test_interface/test_model_integration/test_model_adapters.py +++ b/tests/test_interface/test_model_integration/test_model_adapters.py @@ -170,6 +170,228 @@ def create(self, model, messages, **kwargs): assert "client_type" in config assert config["client_type"] == "MockOpenAIClient" + def test_openai_adapter_tool_calls_response(self): + """OpenAIModelAdapter handles tool call 
responses.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockToolCall: + id = "call_123" + type = "function" + + class function: + name = "get_weather" + arguments = '{"city": "Paris"}' + + class MockMessage: + content = None + role = "assistant" + tool_calls = [MockToolCall()] + + class MockChoice: + message = MockMessage() + finish_reason = "tool_calls" + + class MockUsage: + prompt_tokens = 10 + completion_tokens = 5 + total_tokens = 15 + + class MockResponse: + choices = [MockChoice()] + usage = MockUsage() + model = "gpt-4" + + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return MockResponse() + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Weather?"}]) + + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "get_weather" + assert response.usage["input_tokens"] == 10 + assert response.stop_reason == "tool_calls" + + def test_openai_adapter_tools_parameter_passing(self): + """OpenAIModelAdapter passes tools to API.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + captured_kwargs = {} + + class MockMessage: + content = "I'll check the weather" + role = "assistant" + tool_calls = None + + class MockChoice: + message = MockMessage() + finish_reason = "stop" + + class MockResponse: + choices = [MockChoice()] + model = "gpt-4" + + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + tools = [{"type": "function", "function": {"name": "get_weather"}}] + adapter.chat( + [{"role": "user", "content": "Weather?"}], + tools=tools, + tool_choice="auto", + ) + + assert "tools" in captured_kwargs + assert captured_kwargs["tools"] == tools + assert captured_kwargs["tool_choice"] == "auto" + + def test_openai_adapter_legacy_client_fallback(self): + """OpenAIModelAdapter falls back to legacy client interface.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class LegacyClient: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Legacy response"}}]} + + adapter = OpenAIModelAdapter(client=LegacyClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "Legacy response" + + def test_openai_adapter_callable_client(self): + """OpenAIModelAdapter falls back to calling client directly.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + def callable_client(model, messages, **kwargs): + return {"choices": [{"message": {"content": "Callable response"}}]} + + adapter = OpenAIModelAdapter(client=callable_client, model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "Callable response" + + def test_openai_adapter_text_format_response(self): + """OpenAIModelAdapter parses text format responses.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + 
class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"text": "Completion text"}]} + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "Completion text" + + def test_openai_adapter_dict_response_with_tool_calls(self): + """OpenAIModelAdapter parses dict responses with tool calls.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return { + "choices": [ + { + "message": { + "content": None, + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "search", "arguments": "{}"}, + } + ], + }, + } + ], + } + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Search"}]) + + assert response.tool_calls is not None + assert response.tool_calls[0]["function"]["name"] == "search" + + def test_openai_adapter_fallback_without_model_param(self): + """OpenAIModelAdapter falls back to calling without model param.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class LegacyClient: + def create(self, messages, **kwargs): + # Only accepts messages, no model param + return {"choices": [{"message": {"content": "No model param"}}]} + + adapter = OpenAIModelAdapter(client=LegacyClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "No model param" + + def test_openai_adapter_gather_config_with_timeout(self): + """OpenAIModelAdapter includes timeout in config.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockTimeout: + connect = 5.0 + read = 30.0 + write = 30.0 + pool = 10.0 + + class MockClient: + timeout = MockTimeout() + max_retries = 3 + + class Chat: + class Completions: + def create(self, **kwargs): + return {"choices": [{"message": {"content": "R"}}]} + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + config = adapter.gather_config() + + assert "client_config" in config + assert config["client_config"]["max_retries"] == 3 + # ==================== Google GenAI Tests ==================== @@ -297,6 +519,125 @@ def __init__(self): assert config["default_generation_params"]["temperature"] == 0.9 assert "client_type" in config + def test_google_genai_adapter_function_call_response(self): + """GoogleGenAIModelAdapter handles function call responses.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockFunctionCall: + name = "search_web" + args = {"query": "test"} + + class MockPart: + type = "function_call" + function_call = MockFunctionCall() + + class MockContent: + parts = [MockPart()] + + class MockCandidate: + content = MockContent() + finish_reason = "STOP" + + class MockUsage: + prompt_token_count = 20 + candidates_token_count = 10 + total_token_count = 30 + + class MockResponse: + text = None + candidates = [MockCandidate()] + usage_metadata = MockUsage() + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + return 
MockResponse() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + response = adapter.chat([{"role": "user", "content": "Search"}]) + + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "search_web" + assert response.usage["input_tokens"] == 20 + + def test_google_genai_adapter_tools_conversion(self): + """GoogleGenAIModelAdapter converts tools to Google format.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + captured_config = None + + class MockResponse: + text = "Response" + candidates = [] + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + nonlocal captured_config + captured_config = config + return MockResponse() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather", + "parameters": {"type": "object"}, + }, + } + ] + adapter.chat([{"role": "user", "content": "Weather?"}], tools=tools) + + assert captured_config is not None + + def test_google_genai_adapter_tool_choice_options(self): + """GoogleGenAIModelAdapter handles various tool_choice options.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockResponse: + text = "Response" + candidates = [] + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + return MockResponse() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + tools = [{"type": "function", "function": {"name": "test"}}] + + # Test different tool_choice values + for choice in ["none", "auto", "required"]: + response = adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice=choice, + ) + assert response is not None + + # Test specific function choice + response = adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice={"type": "function", "function": {"name": "test"}}, + ) + assert response is not None + # ==================== HuggingFace Tests ==================== @@ -415,6 +756,170 @@ def __call__(self, prompt, **kwargs): assert "cpu" in str(config["pipeline_config"]["device"]) assert config["pipeline_config"]["framework"] == "pt" + def test_huggingface_adapter_tools_raises_error_without_support(self): + """HuggingFaceModelAdapter raises error when tools not supported.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import ( + HuggingFaceModelAdapter, + ToolCallingNotSupportedError, + ) + + def mock_model(prompt, **kwargs): + return "Response" + + adapter = HuggingFaceModelAdapter(model=mock_model, model_id="test-model") + + with pytest.raises(ToolCallingNotSupportedError): + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=[{"type": "function", "function": {"name": "test"}}], + ) + + def test_huggingface_adapter_tools_raises_when_template_doesnt_support(self): + """HuggingFaceModelAdapter raises error when template doesn't support tools.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import ( + HuggingFaceModelAdapter, + 
ToolCallingNotSupportedError, + ) + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, **kwargs): + if "tools" in kwargs: + raise TypeError("Unexpected keyword argument 'tools'") + return "Formatted prompt" + + class MockPipeline: + tokenizer = MockTokenizer() + + def __call__(self, prompt, **kwargs): + return "Response" + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + + with pytest.raises(ToolCallingNotSupportedError): + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=[{"type": "function", "function": {"name": "test"}}], + ) + + def test_huggingface_adapter_chat_template_with_tools(self): + """HuggingFaceModelAdapter works when template supports tools.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, tools=None, **kwargs): + return "Formatted with tools" + + class MockPipeline: + tokenizer = MockTokenizer() + + def __call__(self, prompt, **kwargs): + return "Response" + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + response = adapter.chat( + [{"role": "user", "content": "Test"}], + tools=[{"type": "function", "function": {"name": "test"}}], + ) + + assert response is not None + + def test_huggingface_adapter_parses_tool_calls_from_output(self): + """HuggingFaceModelAdapter parses tool calls from model output.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, tools=None, **kwargs): + return "Prompt" + + class MockPipeline: + tokenizer = MockTokenizer() + + def __call__(self, prompt, **kwargs): + return '{"name": "search", "arguments": {"q": "test"}}' + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + response = adapter.chat( + [{"role": "user", "content": "Search"}], + tools=[{"type": "function", "function": {"name": "search"}}], + ) + + assert response.tool_calls is not None + assert len(response.tool_calls) >= 1 + assert any(tc["function"]["name"] == "search" for tc in response.tool_calls) + + def test_huggingface_adapter_chat_with_tokenizer(self): + """HuggingFaceModelAdapter uses chat template when available.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, **kwargs): + return "Formatted: " + messages[0]["content"] + + class MockPipeline: + tokenizer = MockTokenizer() + + def __call__(self, prompt, **kwargs): + return f"Response to: {prompt}" + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + response = adapter.chat([{"role": "user", "content": "Hello"}]) + + assert response.content is not None + + def test_huggingface_adapter_pipeline_response_format(self): + """HuggingFaceModelAdapter handles pipeline list response format.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + def mock_model(prompt, **kwargs): + return [{"generated_text": prompt + " Generated"}] + + adapter = HuggingFaceModelAdapter(model=mock_model, model_id="test-model") + response = adapter.chat([{"role": 
"user", "content": "Test"}]) + + assert "Generated" in response.content + + def test_huggingface_adapter_dict_response_format(self): + """HuggingFaceModelAdapter handles dict response format.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + def mock_model(prompt, **kwargs): + return {"generated_text": "Dict response"} + + adapter = HuggingFaceModelAdapter(model=mock_model, model_id="test-model") + response = adapter.chat([{"role": "user", "content": "Test"}]) + + assert response.content == "Dict response" + + def test_huggingface_adapter_nested_tokenizer(self): + """HuggingFaceModelAdapter gets tokenizer from model.model.tokenizer.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, **kwargs): + return "From nested tokenizer" + + class MockInnerModel: + tokenizer = MockTokenizer() + + class MockPipeline: + model = MockInnerModel() + + def __call__(self, prompt, **kwargs): + return "Response" + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + response = adapter.chat([{"role": "user", "content": "Test"}]) + + assert response is not None + # ==================== LiteLLM Tests ==================== @@ -466,6 +971,413 @@ def test_litellm_adapter_gather_config(self): assert config["default_generation_params"]["max_tokens"] == 200 assert config["model_id"] == "gpt-4" + def test_litellm_adapter_tool_calls_response(self): + """LiteLLMModelAdapter handles tool call responses.""" + pytest.importorskip("litellm") + import litellm + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + class MockToolCall: + id = "call_456" + type = "function" + + class function: + name = "calculator" + arguments = '{"expression": "2+2"}' + + class MockMessage: + content = None + role = "assistant" + tool_calls = [MockToolCall()] + + class MockChoice: + message = MockMessage() + finish_reason = "tool_calls" + + class MockUsage: + prompt_tokens = 15 + completion_tokens = 8 + total_tokens = 23 + + class MockResponse: + choices = [MockChoice()] + usage = MockUsage() + model = "gpt-4" + + original = litellm.completion + + def mock_completion(model, messages, **kwargs): + return MockResponse() + + litellm.completion = mock_completion + + try: + adapter = LiteLLMModelAdapter(model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Calculate"}]) + + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "calculator" + assert response.usage["input_tokens"] == 15 + assert response.stop_reason == "tool_calls" + finally: + litellm.completion = original + + def test_litellm_adapter_tools_and_credentials_passing(self): + """LiteLLMModelAdapter passes tools and credentials.""" + pytest.importorskip("litellm") + import litellm + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + captured_kwargs = {} + + class MockMessage: + content = "Response" + role = "assistant" + tool_calls = None + + class MockChoice: + message = MockMessage() + finish_reason = "stop" + + class MockResponse: + choices = [MockChoice()] + + original = litellm.completion + + def mock_completion(model, messages, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + litellm.completion = mock_completion + + try: + adapter = 
LiteLLMModelAdapter( + model_id="gpt-4", + api_key="test-key", + api_base="https://test.api.com", + ) + tools = [{"type": "function", "function": {"name": "test"}}] + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice="required", + ) + + assert captured_kwargs["api_key"] == "test-key" + assert captured_kwargs["api_base"] == "https://test.api.com" + assert captured_kwargs["tools"] == tools + assert captured_kwargs["tool_choice"] == "required" + finally: + litellm.completion = original + + +# ==================== Anthropic Tests ==================== + + +@pytest.mark.interface +class TestAnthropicModelAdapterIntegration: + """Test AnthropicModelAdapter specific behavior.""" + + def test_anthropic_adapter_initialization(self): + """AnthropicModelAdapter initializes with client and model_id.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockClient: + pass + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + assert adapter.model_id == "claude-3" + + def test_anthropic_adapter_chat_basic(self): + """AnthropicModelAdapter handles basic chat.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockTextBlock: + type = "text" + text = "Hello! How can I help?" + + class MockUsage: + input_tokens = 10 + output_tokens = 8 + + class MockResponse: + content = [MockTextBlock()] + usage = MockUsage() + model = "claude-3" + stop_reason = "end_turn" + + class MockMessages: + def create(self, **kwargs): + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + response = adapter.chat([{"role": "user", "content": "Hello"}]) + + assert response.content == "Hello! How can I help?" + assert response.usage["input_tokens"] == 10 + assert response.stop_reason == "end_turn" + + def test_anthropic_adapter_tool_use_response(self): + """AnthropicModelAdapter handles tool use responses.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockToolUseBlock: + type = "tool_use" + id = "tool_123" + name = "get_weather" + input = {"city": "Paris"} + + class MockUsage: + input_tokens = 15 + output_tokens = 12 + + class MockResponse: + content = [MockToolUseBlock()] + usage = MockUsage() + model = "claude-3" + stop_reason = "tool_use" + + class MockMessages: + def create(self, **kwargs): + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + response = adapter.chat([{"role": "user", "content": "Weather?"}]) + + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "get_weather" + assert response.stop_reason == "tool_use" + + def test_anthropic_adapter_system_message_extraction(self): + """AnthropicModelAdapter extracts system message.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + captured_kwargs = {} + + class MockTextBlock: + type = "text" + text = "I'm helpful!" 
+ + class MockResponse: + content = [MockTextBlock()] + + class MockMessages: + def create(self, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + adapter.chat( + [ + {"role": "system", "content": "You are very helpful"}, + {"role": "user", "content": "Hi"}, + ] + ) + + assert captured_kwargs["system"] == "You are very helpful" + assert all(m["role"] != "system" for m in captured_kwargs["messages"]) + + def test_anthropic_adapter_tools_conversion(self): + """AnthropicModelAdapter converts tools to Anthropic format.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + captured_kwargs = {} + + class MockTextBlock: + type = "text" + text = "Response" + + class MockResponse: + content = [MockTextBlock()] + + class MockMessages: + def create(self, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + tools = [ + { + "type": "function", + "function": { + "name": "search", + "description": "Search the web", + "parameters": {"type": "object", "properties": {}}, + }, + } + ] + adapter.chat([{"role": "user", "content": "Search"}], tools=tools) + + assert "tools" in captured_kwargs + assert captured_kwargs["tools"][0]["name"] == "search" + assert "input_schema" in captured_kwargs["tools"][0] + + def test_anthropic_adapter_tool_choice_conversion(self): + """AnthropicModelAdapter converts tool_choice options.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + captured_kwargs = {} + + class MockTextBlock: + type = "text" + text = "Response" + + class MockResponse: + content = [MockTextBlock()] + + class MockMessages: + def create(self, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + tools = [{"type": "function", "function": {"name": "test"}}] + + # Test "required" -> "any" + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice="required", + ) + assert captured_kwargs["tool_choice"]["type"] == "any" + + # Test specific function + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice={"type": "function", "function": {"name": "test"}}, + ) + assert captured_kwargs["tool_choice"]["type"] == "tool" + assert captured_kwargs["tool_choice"]["name"] == "test" + + def test_anthropic_adapter_tool_result_conversion(self): + """AnthropicModelAdapter converts tool result messages.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + captured_kwargs = {} + + class MockTextBlock: + type = "text" + text = "Final answer" + + class MockResponse: + content = [MockTextBlock()] + + class MockMessages: + def create(self, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + adapter.chat( + [ + {"role": "user", "content": "What's the weather?"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "tool_1", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, 
+ } + ], + }, + {"role": "tool", "tool_call_id": "tool_1", "content": "Sunny, 22°C"}, + ] + ) + + messages = captured_kwargs["messages"] + tool_result_msg = [m for m in messages if m["role"] == "user" and isinstance(m.get("content"), list)] + assert len(tool_result_msg) > 0 + + def test_anthropic_adapter_mixed_content_response(self): + """AnthropicModelAdapter handles mixed text and tool_use response.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockTextBlock: + type = "text" + text = "Let me check that for you." + + class MockToolUseBlock: + type = "tool_use" + id = "tool_456" + name = "lookup" + input = {"id": "123"} + + class MockUsage: + input_tokens = 20 + output_tokens = 15 + + class MockResponse: + content = [MockTextBlock(), MockToolUseBlock()] + usage = MockUsage() + model = "claude-3" + stop_reason = "tool_use" + + class MockMessages: + def create(self, **kwargs): + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + response = adapter.chat([{"role": "user", "content": "Look up ID 123"}]) + + assert response.content == "Let me check that for you." + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "lookup" + + def test_anthropic_adapter_gather_config(self): + """AnthropicModelAdapter config includes parameters.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockClient: + pass + + adapter = AnthropicModelAdapter( + client=MockClient(), + model_id="claude-3", + max_tokens=2048, + default_generation_params={"temperature": 0.8}, + ) + config = adapter.gather_config() + + assert config["model_id"] == "claude-3" + assert config["max_tokens"] == 2048 + assert config["default_generation_params"]["temperature"] == 0.8 + assert config["client_type"] == "MockClient" + # ==================== Cross-Adapter Tests ==================== From 8f98c7bb93b6cf313d38a0aa2369de9f0153ee0b Mon Sep 17 00:00:00 2001 From: cemde Date: Fri, 26 Dec 2025 12:16:58 +0100 Subject: [PATCH 4/6] improved call logs --- tests/conftest.py | 17 +- .../test_model_adapter_contract.py | 493 +++++++++++++++++- 2 files changed, 489 insertions(+), 21 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index fd40e9e..bd4d5e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,21 +31,30 @@ class DummyModelAdapter(ModelAdapter): def __init__( self, model_id: str = "test-model", - responses: Optional[List[str]] = None, - tool_calls: Optional[List[List[Dict[str, Any]]]] = None, + responses: Optional[List[Optional[str]]] = None, + tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None, + usage: Optional[Dict[str, int]] = None, + stop_reason: Optional[str] = None, ): """Initialize DummyModelAdapter. Args: model_id: Identifier for this model instance. responses: List of text responses to return. Cycles through the list. + Can include None for tool-only responses. tool_calls: Optional list of tool call lists. If provided, each call returns the corresponding tool_calls (cycling through the list). + Can include None for text-only responses. + usage: Optional usage dict to include in all responses. Should have + input_tokens, output_tokens, total_tokens. + stop_reason: Optional stop_reason to include in all responses. 
""" super().__init__() self._model_id = model_id - self._responses = responses or ["test response"] + self._responses: List[Optional[str]] = responses or ["test response"] self._tool_calls = tool_calls + self._usage = usage + self._stop_reason = stop_reason self._call_count = 0 @property @@ -86,6 +95,8 @@ def _chat_impl( tool_calls=response_tool_calls, role="assistant", model=self._model_id, + usage=self._usage, + stop_reason=self._stop_reason, ) diff --git a/tests/test_contract/test_model_adapter_contract.py b/tests/test_contract/test_model_adapter_contract.py index c8fbd6a..6815f39 100644 --- a/tests/test_contract/test_model_adapter_contract.py +++ b/tests/test_contract/test_model_adapter_contract.py @@ -120,21 +120,36 @@ def assert_base_config_fields(config: Dict[str, Any], model_id: Optional[str] = # ==================== Adapter Factory Functions ==================== -def create_openai_adapter(model_id: str = "gpt-4", responses: Optional[List[str]] = None) -> Any: +def create_openai_adapter( + model_id: str = "gpt-4", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create OpenAIModelAdapter instance.""" pytest.importorskip("openai") from maseval.interface.inference.openai import OpenAIModelAdapter response_list: List[str] = responses or ["Test response"] + tool_calls_list = tool_calls call_count = [0] class MockClient: class Chat: class Completions: def create(self, model, messages, **kwargs): - response = response_list[call_count[0] % len(response_list)] + response_text = response_list[call_count[0] % len(response_list)] + response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 - return {"choices": [{"message": {"content": response}}]} + + # Mock response structure + message = {"content": response_text, "role": "assistant"} + + if response_tool_calls: + message["tool_calls"] = response_tool_calls + + return { + "choices": [{"message": message, "finish_reason": "stop"}], + "model": model, + "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}, + } completions = Completions() @@ -143,12 +158,15 @@ def create(self, model, messages, **kwargs): return OpenAIModelAdapter(client=MockClient(), model_id=model_id) -def create_google_genai_adapter(model_id: str = "gemini-pro", responses: Optional[List[str]] = None) -> Any: +def create_google_genai_adapter( + model_id: str = "gemini-pro", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create GoogleGenAIModelAdapter instance.""" pytest.importorskip("google.genai") from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter response_list: List[str] = responses or ["Test response"] + tool_calls_list = tool_calls call_count = [0] class MockClient: @@ -168,7 +186,9 @@ def __init__(self): return GoogleGenAIModelAdapter(client=MockClient(), model_id=model_id) -def create_huggingface_adapter(model_id: str = "gpt2", responses: Optional[List[str]] = None) -> Any: +def create_huggingface_adapter( + model_id: str = "gpt2", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create HuggingFaceModelAdapter instance.""" pytest.importorskip("transformers") from maseval.interface.inference.huggingface import HuggingFaceModelAdapter @@ -184,7 +204,9 @@ def mock_model(prompt, **kwargs): return HuggingFaceModelAdapter(model=mock_model, model_id=model_id) 
-def create_litellm_adapter(model_id: str = "gpt-3.5-turbo", responses: Optional[List[str]] = None) -> Any: +def create_litellm_adapter( + model_id: str = "gpt-3.5-turbo", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create LiteLLMModelAdapter instance.""" pytest.importorskip("litellm") import litellm @@ -192,21 +214,36 @@ def create_litellm_adapter(model_id: str = "gpt-3.5-turbo", responses: Optional[ # Mock litellm.completion response_list: List[str] = responses or ["Test response"] + tool_calls_list = tool_calls call_count = [0] original_completion = litellm.completion def mock_completion(model, messages, **kwargs): response = response_list[call_count[0] % len(response_list)] + response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 class MockMessage: - content = response + def __init__(self): + self.content = response + self.role = "assistant" + self.tool_calls = response_tool_calls class MockChoice: - message = MockMessage() + def __init__(self): + self.message = MockMessage() + self.finish_reason = "stop" + + class MockUsage: + prompt_tokens = 10 + completion_tokens = 20 + total_tokens = 30 class MockResponse: - choices = [MockChoice()] + def __init__(self): + self.choices = [MockChoice()] + self.usage = MockUsage() + self.model = model return MockResponse() @@ -221,18 +258,24 @@ class MockResponse: return adapter -def create_dummy_adapter(model_id: str = "test-model", responses: Optional[List[str]] = None) -> DummyModelAdapter: +def create_dummy_adapter( + model_id: str = "test-model", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> DummyModelAdapter: """Create DummyModelAdapter instance.""" responses = responses or ["Test response"] - return DummyModelAdapter(model_id=model_id, responses=responses) + usage = {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} + return DummyModelAdapter(model_id=model_id, responses=responses, tool_calls=tool_calls, usage=usage, stop_reason="stop") -def create_anthropic_adapter(model_id: str = "claude-3", responses: Optional[List[str]] = None) -> Any: +def create_anthropic_adapter( + model_id: str = "claude-3", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create AnthropicModelAdapter instance.""" pytest.importorskip("anthropic") from maseval.interface.inference.anthropic import AnthropicModelAdapter response_list: List[str] = responses or ["Test response"] + tool_calls_list = tool_calls call_count = [0] class MockTextBlock: @@ -241,6 +284,16 @@ class MockTextBlock: def __init__(self, text: str): self.text = text + class MockToolUseBlock: + type = "tool_use" + + def __init__(self, tool_call: Dict[str, Any]): + self.id = tool_call["id"] + self.name = tool_call["function"]["name"] + import json + + self.input = json.loads(tool_call["function"]["arguments"]) + class MockUsage: input_tokens = 10 output_tokens = 5 @@ -248,13 +301,20 @@ class MockUsage: class MockMessages: def create(self, **kwargs): response = response_list[call_count[0] % len(response_list)] + response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 class MockResponse: - content = [MockTextBlock(response)] - usage = MockUsage() - model = model_id - stop_reason = "end_turn" + def __init__(self): + self.content = [] + if response: + 
self.content.append(MockTextBlock(response)) + if response_tool_calls: + for tc in response_tool_calls: + self.content.append(MockToolUseBlock(tc)) + self.usage = MockUsage() + self.model = model_id + self.stop_reason = "end_turn" return MockResponse() @@ -264,7 +324,12 @@ class MockClient: return AnthropicModelAdapter(client=MockClient(), model_id=model_id) -def create_adapter_for_implementation(implementation: str, model_id: str, responses: Optional[List[str]] = None) -> Any: +def create_adapter_for_implementation( + implementation: str, + model_id: str, + responses: Optional[List[Optional[str]]] = None, + tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None, +) -> Any: """Factory function to create adapter for specified implementation.""" factories = { "dummy": create_dummy_adapter, @@ -278,7 +343,7 @@ def create_adapter_for_implementation(implementation: str, model_id: str, respon if implementation not in factories: raise ValueError(f"Unknown implementation: {implementation}") - return factories[implementation](model_id=model_id, responses=responses) + return factories[implementation](model_id=model_id, responses=responses, tool_calls=tool_calls) def cleanup_adapter(adapter: Any, implementation: str) -> None: @@ -618,3 +683,395 @@ def test_all_adapters_log_same_call_metadata(self): finally: for adapter, impl in adapters: cleanup_adapter(adapter, impl) + + +# ==================== Tool Calling Contract Tests ==================== + + +@pytest.mark.contract +@pytest.mark.interface +@pytest.mark.parametrize("implementation", ["dummy", "openai", "litellm", "anthropic"]) +class TestToolCallingContract: + """Contract tests for tool calling functionality across adapters. + + These tests verify that tool-related features work consistently across + all model adapters that support tools. This is critical for users building + agentic systems that need to swap between providers. + + Note: Only testing adapters that support tools (OpenAI, Anthropic, LiteLLM, Dummy). + HuggingFace and GoogleGenAI don't fully support tool calling in their current implementation. 
+ """ + + def test_adapter_accepts_tools_parameter(self, implementation): + """All adapters accept tools parameter without error.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}, + }, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "What's the weather in Paris?"}], tools=tools) + assert isinstance(result, ChatResponse) + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_accepts_tool_choice_parameter(self, implementation): + """All adapters accept tool_choice parameter without error.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}, + }, + } + ] + + try: + # Test different tool_choice values + for tool_choice in ["auto", "none", "required"]: + result = adapter.chat([{"role": "user", "content": "What's the weather?"}], tools=tools, tool_choice=tool_choice) + assert isinstance(result, ChatResponse) + + # Test specific tool selection + result = adapter.chat( + [{"role": "user", "content": "What's the weather?"}], + tools=tools, + tool_choice={"type": "function", "function": {"name": "get_weather"}}, + ) + assert isinstance(result, ChatResponse) + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_returns_tool_calls_in_response(self, implementation): + """All adapters return tool_calls with consistent structure.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + adapter = create_adapter_for_implementation( + implementation, model_id="test-model", responses=["I'll check the weather"], tool_calls=tool_calls_to_return + ) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "What's the weather in Paris?"}], tools=tools) + + assert result.tool_calls is not None, f"{implementation} did not return tool_calls" + assert isinstance(result.tool_calls, list) + assert len(result.tool_calls) > 0 + + # Verify structure of first tool call + tc = result.tool_calls[0] + assert "id" in tc, f"{implementation} tool_call missing 'id'" + assert "type" in tc, f"{implementation} tool_call missing 'type'" + assert "function" in tc, f"{implementation} tool_call missing 'function'" + assert "name" in tc["function"], f"{implementation} tool_call function missing 'name'" + assert "arguments" in tc["function"], f"{implementation} tool_call function missing 'arguments'" + + # Verify types + assert isinstance(tc["id"], str) + assert isinstance(tc["type"], str) + assert isinstance(tc["function"]["name"], str) + assert isinstance(tc["function"]["arguments"], str) # JSON string + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_handles_tool_result_messages(self, implementation): + """All adapters handle role='tool' messages in conversations.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + 
+ # Simulate a conversation with tool use + messages = [ + {"role": "user", "content": "What's the weather in Paris?"}, + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_123", "content": '{"temperature": 72, "condition": "sunny"}'}, + {"role": "user", "content": "What about London?"}, + ] + + try: + result = adapter.chat(messages) + assert isinstance(result, ChatResponse) + # Should not raise an error + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_handles_assistant_messages_with_tool_calls(self, implementation): + """All adapters handle assistant messages containing tool_calls.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + # Include an assistant message with tool_calls in the history + messages = [ + {"role": "user", "content": "Get weather for Paris"}, + { + "role": "assistant", + "content": "I'll check the weather for you.", + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_123", "content": '{"temperature": 72}'}, + {"role": "user", "content": "Thanks!"}, + ] + + try: + result = adapter.chat(messages) + assert isinstance(result, ChatResponse) + # Should process the conversation history without error + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_tool_calls_logs_correctly(self, implementation): + """All adapters log tool-related calls consistently.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + adapter = create_adapter_for_implementation( + implementation, model_id="test-model", responses=["I'll check"], tool_calls=tool_calls_to_return + ) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + adapter.chat([{"role": "user", "content": "Weather?"}], tools=tools) + + traces = adapter.gather_traces() + assert traces["total_calls"] == 1 + assert len(traces["logs"]) == 1 + + call_log = traces["logs"][0] + assert "response_type" in call_log + assert call_log["response_type"] == "tool_call" + assert "tool_calls_count" in call_log + assert call_log["tool_calls_count"] == 1 + assert "tools_provided" in call_log + assert call_log["tools_provided"] == 1 + finally: + cleanup_adapter(adapter, implementation) + + +# ==================== Usage and Metadata Contract Tests ==================== + + +@pytest.mark.contract +@pytest.mark.interface +@pytest.mark.parametrize("implementation", ["dummy", "openai", "litellm", "anthropic"]) +class TestUsageAndMetadataContract: + """Contract tests for usage tracking and response metadata. + + These tests ensure consistent reporting of token usage, stop reasons, + and other metadata across all adapters. This is important for evaluation + and cost tracking in production systems. + + Note: Only testing adapters with full metadata support (OpenAI, Anthropic, LiteLLM, Dummy). 
+ """ + + def test_adapter_returns_usage_info(self, implementation): + """All adapters return consistent usage information.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + try: + result = adapter.chat([{"role": "user", "content": "Hello"}]) + + # Usage should be present and have required fields + if result.usage is not None: # Some adapters might not support this + assert isinstance(result.usage, dict) + assert "input_tokens" in result.usage + assert "output_tokens" in result.usage + assert "total_tokens" in result.usage + + assert isinstance(result.usage["input_tokens"], int) + assert isinstance(result.usage["output_tokens"], int) + assert isinstance(result.usage["total_tokens"], int) + + assert result.usage["input_tokens"] >= 0 + assert result.usage["output_tokens"] >= 0 + assert result.usage["total_tokens"] >= 0 + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_returns_stop_reason(self, implementation): + """All adapters return stop_reason in responses.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + try: + result = adapter.chat([{"role": "user", "content": "Hello"}]) + + # stop_reason should be present + if result.stop_reason is not None: # Some adapters might not support this + assert isinstance(result.stop_reason, str) + assert len(result.stop_reason) > 0 + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_stop_reason_for_tool_calls(self, implementation): + """All adapters indicate tool use in stop_reason when applicable.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=[None], tool_calls=tool_calls_to_return) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "Weather?"}], tools=tools) + + # When tool_calls are returned, should have a stop_reason + # (The exact value may vary: "tool_calls", "tool_use", "function_call", etc.) 
+ if result.stop_reason is not None: + assert isinstance(result.stop_reason, str) + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_handles_content_none_with_tool_calls(self, implementation): + """All adapters handle responses with content=None and only tool_calls.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + # Response with None content, only tool_calls + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=[None], tool_calls=tool_calls_to_return) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "What's the weather?"}], tools=tools) + + assert isinstance(result, ChatResponse) + # content can be None when model only returns tool calls + assert result.tool_calls is not None, f"{implementation} should return tool_calls when content is None" + assert isinstance(result.tool_calls, list) + assert len(result.tool_calls) > 0 + + # Verify the response is still valid + msg = result.to_message() + assert isinstance(msg, dict) + assert msg["role"] == "assistant" + assert "tool_calls" in msg + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_to_message_includes_tool_calls(self, implementation): + """All adapters include tool_calls in to_message() output.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + adapter = create_adapter_for_implementation( + implementation, model_id="test-model", responses=["I'll check"], tool_calls=tool_calls_to_return + ) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "Weather?"}], tools=tools) + + msg = result.to_message() + assert isinstance(msg, dict) + assert msg["role"] == "assistant" + assert "tool_calls" in msg, f"{implementation} to_message() should include tool_calls" + assert isinstance(msg["tool_calls"], list) + assert len(msg["tool_calls"]) > 0 + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_usage_tracking_across_calls(self, implementation): + """All adapters consistently report usage across multiple calls.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=["R1", "R2"]) + + try: + result1 = adapter.chat([{"role": "user", "content": "First"}]) + result2 = adapter.chat([{"role": "user", "content": "Second"}]) + + # Both should have usage (if supported) + if result1.usage is not None and result2.usage is not None: + assert isinstance(result1.usage, dict) + assert isinstance(result2.usage, dict) + + # Structure should be consistent + assert set(result1.usage.keys()) == set(result2.usage.keys()) + finally: + cleanup_adapter(adapter, implementation) From 71610bb919754f1e768d05c0ebc04dc79a87917a Mon Sep 17 00:00:00 2001 From: cemde Date: Fri, 26 Dec 2025 12:32:01 +0100 Subject: [PATCH 5/6] fixed test file --- .../test_model_adapter_contract.py | 71 +++++++++++++++++-- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/tests/test_contract/test_model_adapter_contract.py 
b/tests/test_contract/test_model_adapter_contract.py index 6815f39..cb7dc33 100644 --- a/tests/test_contract/test_model_adapter_contract.py +++ b/tests/test_contract/test_model_adapter_contract.py @@ -171,14 +171,49 @@ def create_google_genai_adapter( class MockClient: class Models: - def generate_content(self, model, contents, config=None): + def generate_content(self_inner, model, contents, config=None): response = response_list[call_count[0] % len(response_list)] + response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 - class Response: - text = response + # Build mock response with function calls if tool_calls provided + if response_tool_calls: - return Response() + class MockFunctionCall: + def __init__(self, name, args): + self.name = name + self.args = args + + class MockPart: + def __init__(self, tc_dict): + self.type = "function_call" + func = tc_dict.get("function", {}) + args_str = func.get("arguments", "{}") + import json + + self.function_call = MockFunctionCall(func.get("name", ""), json.loads(args_str) if args_str else {}) + + class MockContent: + def __init__(self): + self.parts = [MockPart(tc) for tc in response_tool_calls] + + class MockCandidate: + def __init__(self): + self.content = MockContent() + self.finish_reason = "STOP" + + class MockResponse: + text = None + candidates = [MockCandidate()] + + return MockResponse() + else: + + class Response: + text = response + candidates = [] + + return Response() def __init__(self): self.models = self.Models() @@ -220,19 +255,41 @@ def create_litellm_adapter( def mock_completion(model, messages, **kwargs): response = response_list[call_count[0] % len(response_list)] - response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None + response_tool_calls_dicts = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 + # Convert dict tool_calls to objects with attributes (like real LiteLLM returns) + mock_tool_calls = None + if response_tool_calls_dicts: + mock_tool_calls = [] + for tc_dict in response_tool_calls_dicts: + + class MockFunction: + pass + + class MockToolCall: + pass + + func = MockFunction() + func.name = tc_dict.get("function", {}).get("name", "") + func.arguments = tc_dict.get("function", {}).get("arguments", "{}") + + tc = MockToolCall() + tc.id = tc_dict.get("id", "") + tc.type = tc_dict.get("type", "function") + tc.function = func + mock_tool_calls.append(tc) + class MockMessage: def __init__(self): self.content = response self.role = "assistant" - self.tool_calls = response_tool_calls + self.tool_calls = mock_tool_calls class MockChoice: def __init__(self): self.message = MockMessage() - self.finish_reason = "stop" + self.finish_reason = "tool_calls" if mock_tool_calls else "stop" class MockUsage: prompt_tokens = 10 From 2c0a804ed294810d711e91e606ebe6d2c633f264 Mon Sep 17 00:00:00 2001 From: cemde Date: Fri, 26 Dec 2025 12:43:24 +0100 Subject: [PATCH 6/6] fixed docs --- docs/interface/inference/anthropic.md | 7 +++++++ docs/reference/model.md | 8 ++++++++ maseval/core/model.py | 3 +-- mkdocs.yml | 1 + 4 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 docs/interface/inference/anthropic.md diff --git a/docs/interface/inference/anthropic.md b/docs/interface/inference/anthropic.md new file mode 100644 index 0000000..477608a --- /dev/null +++ b/docs/interface/inference/anthropic.md @@ -0,0 +1,7 @@ +# Anthropic Inference Adapter + +This page 
documents the [Anthropic](https://docs.anthropic.com/) model adapter for MASEval. + +[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/interface/inference/anthropic.py){ .md-source-file } + +::: maseval.interface.inference.anthropic.AnthropicModelAdapter diff --git a/docs/reference/model.md b/docs/reference/model.md index e2b421e..1569d93 100644 --- a/docs/reference/model.md +++ b/docs/reference/model.md @@ -25,3 +25,11 @@ The following adapter classes implement the ModelAdapter interface for specific [:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/interface/inference/google_genai.py){ .md-source-file } ::: maseval.interface.inference.google_genai.GoogleGenAIModelAdapter + +[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/interface/inference/litellm.py){ .md-source-file } + +::: maseval.interface.inference.litellm.LiteLLMModelAdapter + +[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/interface/inference/anthropic.py){ .md-source-file } + +::: maseval.interface.inference.anthropic.AnthropicModelAdapter diff --git a/maseval/core/model.py b/maseval/core/model.py index 69dfa83..d1de156 100644 --- a/maseval/core/model.py +++ b/maseval/core/model.py @@ -4,8 +4,7 @@ implement. It defines a consistent interface for interacting with LLMs across different providers (OpenAI, Anthropic, Google, HuggingFace, LiteLLM, etc.). -Concrete implementations for specific inference providers are in: - maseval.interface.inference +See `maseval.interface.inference` for concrete implementations. Example: ```python diff --git a/mkdocs.yml b/mkdocs.yml index 8161f8a..76695b6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -114,6 +114,7 @@ nav: - LlamaIndex: interface/agents/llamaindex.md - SmolAgents: interface/agents/smolagents.md - Models: + - Anthropic: interface/inference/anthropic.md - Google GenAI: interface/inference/google_genai.md - HuggingFace: interface/inference/huggingface.md - LiteLLM: interface/inference/litellm.md
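
---

Taken together, the new contract tests (`test_adapter_handles_tool_result_messages`, `test_adapter_to_message_includes_tool_calls`) pin down one tool-use round trip: the assistant turn is appended to the history via `ChatResponse.to_message()`, each result comes back as a `role: "tool"` message keyed by `tool_call_id`, and the adapter is called again. The sketch below walks that loop against the new `AnthropicModelAdapter`; the `get_weather` handler, the `claude-3` model id, and the bare `anthropic.Anthropic()` client construction are illustrative assumptions, not part of the patch.

```python
import json

import anthropic

from maseval import ChatResponse
from maseval.interface.inference.anthropic import AnthropicModelAdapter


def get_weather(city: str) -> str:
    # Hypothetical tool handler standing in for whatever the harness executes.
    return json.dumps({"city": city, "temperature": 22, "condition": "sunny"})


adapter = AnthropicModelAdapter(client=anthropic.Anthropic(), model_id="claude-3")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

messages = [{"role": "user", "content": "What's the weather in Paris?"}]
response: ChatResponse = adapter.chat(messages, tools=tools)

while response.tool_calls:
    # Keep the assistant turn (content and tool_calls) in the history.
    messages.append(response.to_message())
    for call in response.tool_calls:
        # "arguments" is a JSON string, per the ChatResponse contract.
        args = json.loads(call["function"]["arguments"])
        messages.append(
            {
                "role": "tool",
                "tool_call_id": call["id"],
                "content": get_weather(**args),
            }
        )
    response = adapter.chat(messages, tools=tools)

print(response.content, response.usage, response.stop_reason)
```

Because every adapter accepts OpenAI-format messages and returns the same `ChatResponse` shape, the loop above should run unchanged against `OpenAIModelAdapter` or `LiteLLMModelAdapter`, which is exactly what the parametrized contract tests assert.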