From f3f3eaf9e6e6d4a674434fefa30cf98dfc1ed1ed Mon Sep 17 00:00:00 2001 From: cemde Date: Thu, 25 Dec 2025 13:37:03 +0100 Subject: [PATCH 1/6] implemented model chat --- CHANGELOG.md | 11 + maseval/__init__.py | 6 +- maseval/core/model.py | 327 +++++++++++++-- maseval/interface/inference/__init__.py | 28 +- maseval/interface/inference/anthropic.py | 379 ++++++++++++++++++ maseval/interface/inference/google_genai.py | 316 +++++++++++++-- maseval/interface/inference/huggingface.py | 341 ++++++++++++++-- maseval/interface/inference/litellm.py | 188 ++++++--- maseval/interface/inference/openai.py | 293 +++++++++++--- pyproject.toml | 3 +- tests/conftest.py | 62 ++- tests/test_core/test_model_adapter.py | 85 +++- .../test_model_adapters.py | 146 +++++-- uv.lock | 38 +- 14 files changed, 1934 insertions(+), 289 deletions(-) create mode 100644 maseval/interface/inference/anthropic.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3850a69..7a28d0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +**ModelAdapter Chat Interface** + +- Added `chat()` method to `ModelAdapter` as the primary interface for LLM inference, accepting a list of messages in OpenAI format and returning a `ChatResponse` object and accepting tools +- Added `ChatResponse` dataclass containing `content`, `tool_calls`, `role`, `usage`, `model`, and `stop_reason` fields for structured response handling + +**AnthropicModelAdapter** + +- New `AnthropicModelAdapter` for direct integration with Anthropic Claude models via the official Anthropic SDK +- Handles Anthropic-specific message format conversion (system messages, tool_use/tool_result blocks) internally while accepting OpenAI-compatible input +- Added `anthropic` optional dependency: `pip install maseval[anthropic]` + ### Changed ### Fixed diff --git a/maseval/__init__.py b/maseval/__init__.py index 11ea20d..e74feda 100644 --- a/maseval/__init__.py +++ b/maseval/__init__.py @@ -22,7 +22,7 @@ ToolSimulatorError, UserSimulatorError, ) -from .core.model import ModelAdapter +from .core.model import ModelAdapter, ChatResponse from .core.user import User, TerminationReason from .core.evaluator import Evaluator from .core.history import MessageHistory, ToolInvocationHistory @@ -67,8 +67,10 @@ # History and tracing "MessageHistory", "ToolInvocationHistory", - "ModelAdapter", "TraceableMixin", + # Model adapters + "ModelAdapter", + "ChatResponse", # Exceptions and validation "MASEvalError", "AgentError", diff --git a/maseval/core/model.py b/maseval/core/model.py index afd7c11..69dfa83 100644 --- a/maseval/core/model.py +++ b/maseval/core/model.py @@ -1,78 +1,280 @@ -"""Core model adapter abstractions. +"""Core model adapter abstractions for LLM inference. + +This module provides the base `ModelAdapter` class that all model adapters must +implement. It defines a consistent interface for interacting with LLMs across +different providers (OpenAI, Anthropic, Google, HuggingFace, LiteLLM, etc.). 
Concrete implementations for specific inference providers are in: maseval.interface.inference + +Example: + ```python + from maseval.interface.inference import LiteLLMModelAdapter + + # Create adapter + model = LiteLLMModelAdapter(model_id="gpt-4") + + # Simple text generation + response = model.generate("What is 2+2?") + print(response) # "4" + + # Chat with messages + response = model.chat([ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is 2+2?"} + ]) + print(response.content) # "4" + + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather in Paris?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"] + } + } + }] + ) + if response.tool_calls: + print(response.tool_calls[0]["function"]["name"]) # "get_weather" + ``` """ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Any, Optional, Dict +from dataclasses import dataclass +from typing import Any, Optional, Dict, List, Union from datetime import datetime import time from .tracing import TraceableMixin from .config import ConfigurableMixin +from .history import MessageHistory + + +@dataclass +class ChatResponse: + """Response from a chat completion. + + When the model generates a response, it returns either text content, + tool calls, or both. Use this class to access the response data. + + Attributes: + content: The text content of the response. May be None if the model + only returned tool calls. + tool_calls: List of tool calls the model wants to execute. Each tool + call is a dict with 'id', 'type', and 'function' keys. The + 'function' contains 'name' and 'arguments' (JSON string). + None if no tools were called. + role: The role of the response message. Always "assistant". + usage: Token usage statistics if available. Dict with keys like + 'input_tokens', 'output_tokens', 'total_tokens'. + model: The model ID that generated this response, if available. + stop_reason: Why the model stopped generating. Common values: + 'end_turn', 'tool_use', 'max_tokens', 'stop_sequence'. + + Example: + ```python + response = model.chat([{"role": "user", "content": "Hello"}]) + + # Text response + if response.content: + print(response.content) + + # Tool call response + if response.tool_calls: + for call in response.tool_calls: + name = call["function"]["name"] + args = json.loads(call["function"]["arguments"]) + result = execute_tool(name, args) + ``` + """ + + content: Optional[str] = None + tool_calls: Optional[List[Dict[str, Any]]] = None + role: str = "assistant" + usage: Optional[Dict[str, int]] = None + model: Optional[str] = None + stop_reason: Optional[str] = None + + def to_message(self) -> Dict[str, Any]: + """Convert this response to an OpenAI-compatible message dict. + + Use this to append the assistant's response to your message history + before continuing the conversation. + + Returns: + Dict with 'role', 'content', and optionally 'tool_calls'. 
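+            Tool call entries are passed through unchanged, keeping the OpenAI
+            structure ('id', 'type', 'function').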
+ + Example: + ```python + messages = [{"role": "user", "content": "Hello"}] + response = model.chat(messages) + + # Add assistant response to history + messages.append(response.to_message()) + + # Continue conversation + messages.append({"role": "user", "content": "Tell me more"}) + response = model.chat(messages) + ``` + """ + msg: Dict[str, Any] = {"role": self.role} + if self.content is not None: + msg["content"] = self.content + if self.tool_calls: + msg["tool_calls"] = self.tool_calls + return msg class ModelAdapter(ABC, TraceableMixin, ConfigurableMixin): """Abstract base class for model adapters. - Concrete implementations must provide a `generate` method that accepts a - prompt string and returns the model's text output. They should also expose - a `model_id` property identifying the underlying model. + ModelAdapter provides a consistent interface for LLM inference across + different providers. All adapters implement the same methods, so you + can swap providers without changing your code. - This class automatically tracks all generation calls for tracing and evaluation. + To use a model adapter: + 1. Create an instance with provider-specific configuration + 2. Call `chat()` for message-based conversations + 3. Call `generate()` for simple text-in/text-out + + The adapter automatically tracks all calls for tracing and evaluation. + + Implementing a custom adapter: + Subclass ModelAdapter and implement: + - `model_id` property: Return the model identifier string + - `_chat_impl()`: The actual chat completion logic See maseval.interface.inference for concrete implementations: - - GoogleGenAIModelAdapter - - OpenAIModelAdapter - - HuggingFaceModelAdapter + - AnthropicModelAdapter + - GoogleGenAIModelAdapter + - HuggingFaceModelAdapter + - LiteLLMModelAdapter + - OpenAIModelAdapter """ def __init__(self): """Initialize the model adapter with call tracing.""" super().__init__() - self.logs: list[dict[str, Any]] = [] + self.logs: List[Dict[str, Any]] = [] @property @abstractmethod def model_id(self) -> str: - """A string identifier for the underlying model.""" + """The identifier for the underlying model. + + Returns: + A string identifying the model (e.g., "gpt-4", "claude-sonnet-4-5", + "gemini-pro"). Used for tracing and configuration. + """ - def generate(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - """Generate text from the model with automatic tracing. + def chat( + self, + messages: Union[List[Dict[str, Any]], MessageHistory], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Send messages to the model and get a response. - This method wraps the actual generation logic to track timing, - parameters, and errors for later evaluation. + This is the primary method for interacting with the model. Pass a + conversation history and receive the model's response. Args: - prompt: The input prompt - generation_params: Optional generation parameters - **kwargs: Additional provider-specific arguments + messages: The conversation history. Either a list of message dicts + in OpenAI format, or a MessageHistory object. Each message + has 'role' ('system', 'user', 'assistant', 'tool') and + 'content' keys. + generation_params: Model parameters like temperature, max_tokens, + top_p, etc. Provider-specific parameters are also accepted. + tools: Tool definitions the model can use. 
Each tool is a dict + with 'type' (usually 'function') and 'function' containing + 'name', 'description', and 'parameters' (JSON Schema). + tool_choice: How the model should use tools: + - "auto": Model decides whether to use tools (default) + - "none": Model won't use tools + - "required": Model must use a tool + - {"type": "function", "function": {"name": "..."}}: Use specific tool + **kwargs: Additional provider-specific arguments. Returns: - The model output as a string + ChatResponse containing the model's response (text and/or tool calls). Raises: - Exception: Any exception from the underlying model is logged and re-raised + Exception: Provider-specific errors are logged and re-raised. + + Example: + ```python + # Simple conversation + response = model.chat([ + {"role": "user", "content": "Hello!"} + ]) + print(response.content) + + # With system prompt + response = model.chat([ + {"role": "system", "content": "You are a pirate."}, + {"role": "user", "content": "Hello!"} + ]) + + # With tools + response = model.chat( + messages=[{"role": "user", "content": "What's 2+2?"}], + tools=[{ + "type": "function", + "function": { + "name": "calculator", + "description": "Evaluate math expressions", + "parameters": { + "type": "object", + "properties": {"expression": {"type": "string"}}, + "required": ["expression"] + } + } + }] + ) + ``` """ start_time = time.time() timestamp = datetime.now().isoformat() + # Convert MessageHistory to list if needed + if isinstance(messages, MessageHistory): + messages_list = messages.to_openai_format() + else: + messages_list = messages + try: - result = self._generate_impl(prompt, generation_params, **kwargs) + result = self._chat_impl( + messages_list, + generation_params=generation_params, + tools=tools, + tool_choice=tool_choice, + **kwargs, + ) duration = time.time() - start_time self.logs.append( { "timestamp": timestamp, - "prompt_length": len(prompt), - "response_length": len(result) if result else 0, + "message_count": len(messages_list), + "response_type": "tool_call" if result.tool_calls else "text", + "response_length": len(result.content) if result.content else 0, + "tool_calls_count": len(result.tool_calls) if result.tool_calls else 0, "duration_seconds": duration, "status": "success", "generation_params": generation_params or {}, - "kwargs": {k: str(v) for k, v in kwargs.items()}, # Serialize for JSON + "tools_provided": len(tools) if tools else 0, + "kwargs": {k: str(v) for k, v in kwargs.items()}, } ) @@ -84,12 +286,13 @@ def generate(self, prompt: str, generation_params: Optional[Dict[str, Any]] = No self.logs.append( { "timestamp": timestamp, - "prompt_length": len(prompt), + "message_count": len(messages_list), "duration_seconds": duration, "status": "error", "error": str(e), "error_type": type(e).__name__, "generation_params": generation_params or {}, + "tools_provided": len(tools) if tools else 0, "kwargs": {k: str(v) for k, v in kwargs.items()}, } ) @@ -97,32 +300,79 @@ def generate(self, prompt: str, generation_params: Optional[Dict[str, Any]] = No raise @abstractmethod - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - """Internal generation implementation to be overridden by subclasses. 
+ def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Internal chat implementation to be overridden by subclasses. + + Implement this method to call your provider's API. The base class + handles tracing, timing, and error logging. + + Args: + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions, if any. + tool_choice: Tool choice setting, if any. + **kwargs: Additional provider-specific arguments. + + Returns: + ChatResponse with the model's output. + """ + + def generate( + self, + prompt: str, + generation_params: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> str: + """Generate text from a simple prompt. + + This is a convenience method that wraps the prompt in a user message + and calls `chat()`. Use this for simple text-in/text-out scenarios. + + For conversations or tool use, use `chat()` directly. Args: - prompt: The input prompt - generation_params: Optional generation parameters - **kwargs: Additional provider-specific arguments + prompt: The input prompt. + generation_params: Generation parameters (temperature, max_tokens, etc.). + **kwargs: Additional provider-specific arguments. Returns: - The model output as a string + The model's text response. + + Example: + ```python + response = model.generate("What is the capital of France?") + print(response) # "Paris" + ``` """ + messages = [{"role": "user", "content": prompt}] + response = self.chat(messages, generation_params=generation_params, **kwargs) + return response.content or "" - def gather_traces(self) -> dict[str, Any]: + def gather_traces(self) -> Dict[str, Any]: """Gather execution traces from this model adapter. + Called automatically by Benchmark to collect execution data for + evaluation. Returns comprehensive statistics about all calls made + to this adapter. + Returns: Dictionary containing: - type: Component class name - gathered_at: ISO timestamp - model_id: Model identifier - - total_calls: Number of generation calls + - total_calls: Number of chat/generate calls - successful_calls: Number of successful calls - failed_calls: Number of failed calls - - total_duration_seconds: Total time spent generating + - total_duration_seconds: Total time spent in calls - average_duration_seconds: Average time per call - - logs: List of all individual call records with timestamps, durations, and parameters + - logs: List of individual call records """ total_calls = len(self.logs) successful_calls = sum(1 for call in self.logs if call["status"] == "success") @@ -141,15 +391,18 @@ def gather_traces(self) -> dict[str, Any]: "logs": self.logs, } - def gather_config(self) -> dict[str, Any]: + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this model adapter. + Called automatically by Benchmark to collect configuration for + reproducibility. Returns identifying information about this adapter. 
+ Returns: Dictionary containing: - type: Component class name - gathered_at: ISO timestamp - model_id: Model identifier - - adapter_type: The specific adapter class (e.g., OpenAIModelAdapter) + - adapter_type: The specific adapter class name """ return { **super().gather_config(), diff --git a/maseval/interface/inference/__init__.py b/maseval/interface/inference/__init__.py index 72be8e3..e6765d1 100644 --- a/maseval/interface/inference/__init__.py +++ b/maseval/interface/inference/__init__.py @@ -2,10 +2,35 @@ This package contains concrete implementations of ModelAdapter for different inference providers. Each adapter requires the corresponding optional dependency. + +Available adapters: + - AnthropicModelAdapter: Anthropic Claude models (requires anthropic) + - GoogleGenAIModelAdapter: Google Gemini models (requires google-genai) + - HuggingFaceModelAdapter: HuggingFace transformers (requires transformers) + - LiteLLMModelAdapter: 100+ providers via LiteLLM (requires litellm) + - OpenAIModelAdapter: OpenAI and compatible APIs (requires openai) + +Example: + ```python + from maseval.interface.inference import LiteLLMModelAdapter + + # Use any supported provider + model = LiteLLMModelAdapter(model_id="gpt-4") + response = model.chat([{"role": "user", "content": "Hello!"}]) + print(response.content) + ``` """ __all__ = [] +# Conditionally import Anthropic adapter +try: + from .anthropic import AnthropicModelAdapter # noqa: F401 + + __all__.append("AnthropicModelAdapter") +except ImportError: + pass + # Conditionally import google-genai adapter try: from .google_genai import GoogleGenAIModelAdapter # noqa: F401 @@ -24,9 +49,10 @@ # Conditionally import HuggingFace adapter try: - from .huggingface import HuggingFaceModelAdapter # noqa: F401 + from .huggingface import HuggingFaceModelAdapter, ToolCallingNotSupportedError # noqa: F401 __all__.append("HuggingFaceModelAdapter") + __all__.append("ToolCallingNotSupportedError") except ImportError: pass diff --git a/maseval/interface/inference/anthropic.py b/maseval/interface/inference/anthropic.py new file mode 100644 index 0000000..0363d22 --- /dev/null +++ b/maseval/interface/inference/anthropic.py @@ -0,0 +1,379 @@ +"""Anthropic model adapter. + +This adapter works with the official Anthropic Python SDK for accessing +Claude models directly. + +Requires anthropic to be installed: + pip install maseval[anthropic] + +Example: + ```python + from anthropic import Anthropic + from maseval.interface.inference import AnthropicModelAdapter + + # Create client (uses ANTHROPIC_API_KEY env var) + client = Anthropic() + + # Create adapter + model = AnthropicModelAdapter( + client=client, + model_id="claude-sonnet-4-5-20250514" + ) + + # Simple generation + response = model.generate("Hello!") + + # Chat with messages + response = model.chat([ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"} + ]) + + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather in Paris?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"] + } + } + }] + ) + ``` +""" + +import json +from typing import Any, Optional, Dict, List, Union + +from maseval.core.model import ModelAdapter, ChatResponse + + +class AnthropicModelAdapter(ModelAdapter): + """Adapter for Anthropic Claude models. 
+ + Works with Claude models through the official Anthropic Python SDK. + + Supported models include: + - claude-sonnet-4-5-20250514 (Claude Sonnet 4.5) + - claude-opus-4-5-20251101 (Claude Opus 4.5) + - claude-3-5-sonnet-20241022 + - claude-3-opus-20240229 + - And other Claude model variants + + The adapter accepts OpenAI-style messages and converts them to Anthropic's + format internally. Key differences handled automatically: + - System messages are passed separately (not in messages array) + - Tool definitions are converted to Anthropic format + - Tool responses are converted to tool_result content blocks + + API keys can be set via ANTHROPIC_API_KEY environment variable or + passed to the Anthropic client directly. + """ + + def __init__( + self, + client: Any, + model_id: str, + default_generation_params: Optional[Dict[str, Any]] = None, + max_tokens: int = 4096, + ): + """Initialize Anthropic model adapter. + + Args: + client: An anthropic.Anthropic client instance. + model_id: The model identifier (e.g., "claude-sonnet-4-5-20250514"). + default_generation_params: Default parameters for all calls. + Common parameters: temperature, top_p, top_k. + max_tokens: Maximum tokens to generate. Anthropic requires this + parameter. Default is 4096. + """ + super().__init__() + self._client = client + self._model_id = model_id + self._default_generation_params = default_generation_params or {} + self._max_tokens = max_tokens + + @property + def model_id(self) -> str: + return self._model_id + + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Call Anthropic Messages API. + + Args: + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions for function calling (OpenAI format). + tool_choice: Tool choice setting. + **kwargs: Additional Anthropic parameters. + + Returns: + ChatResponse with the model's output. + """ + # Merge parameters + params = dict(self._default_generation_params) + if generation_params: + params.update(generation_params) + params.update(kwargs) + + # Extract and set max_tokens + max_tokens = params.pop("max_tokens", self._max_tokens) + + # Convert messages (extract system, convert tool responses) + system_prompt, converted_messages = self._convert_messages(messages) + + # Convert tools to Anthropic format + anthropic_tools = None + if tools: + anthropic_tools = self._convert_tools(tools) + + # Handle tool_choice + anthropic_tool_choice = None + if tool_choice is not None: + anthropic_tool_choice = self._convert_tool_choice(tool_choice) + + # Build request + request_params = { + "model": self._model_id, + "max_tokens": max_tokens, + "messages": converted_messages, + **params, + } + + if system_prompt: + request_params["system"] = system_prompt + + if anthropic_tools: + request_params["tools"] = anthropic_tools + + if anthropic_tool_choice: + request_params["tool_choice"] = anthropic_tool_choice + + # Call API + response = self._client.messages.create(**request_params) + + return self._parse_response(response) + + def _convert_messages( + self, messages: List[Dict[str, Any]] + ) -> tuple[Optional[str], List[Dict[str, Any]]]: + """Convert OpenAI messages to Anthropic format. + + Anthropic separates system messages and uses different format for + tool responses. 
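+
+        For example, an OpenAI-style tool result message
+        {"role": "tool", "tool_call_id": "abc", "content": "20 degrees"} becomes a
+        user message whose content is a single tool_result block with
+        tool_use_id "abc" (the id and content values here are illustrative).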
+ + Args: + messages: OpenAI-format messages. + + Returns: + Tuple of (system_prompt, converted_messages). + """ + system_prompt = None + converted = [] + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + + if role == "system": + # Anthropic takes system as separate parameter + system_prompt = content + + elif role == "tool": + # Convert to Anthropic tool_result format + # Tool results in Anthropic are user messages with tool_result content + tool_call_id = msg.get("tool_call_id", "") + converted.append( + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": tool_call_id, + "content": content, + } + ], + } + ) + + elif role == "assistant": + # Check if this message has tool_calls (from previous response) + if "tool_calls" in msg and msg["tool_calls"]: + # Convert to Anthropic format with tool_use content blocks + content_blocks = [] + + # Add text content if present + if msg.get("content"): + content_blocks.append({"type": "text", "text": msg["content"]}) + + # Add tool use blocks + for tc in msg["tool_calls"]: + func = tc.get("function", {}) + args = func.get("arguments", "{}") + if isinstance(args, str): + try: + args = json.loads(args) + except json.JSONDecodeError: + args = {} + + content_blocks.append( + { + "type": "tool_use", + "id": tc.get("id", ""), + "name": func.get("name", ""), + "input": args, + } + ) + + converted.append({"role": "assistant", "content": content_blocks}) + else: + # Simple text message + converted.append({"role": "assistant", "content": content}) + + else: + # User message + converted.append({"role": "user", "content": content}) + + return system_prompt, converted + + def _convert_tools(self, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Convert OpenAI tool format to Anthropic format. + + Args: + tools: OpenAI-format tool definitions. + + Returns: + Anthropic-format tool definitions. + """ + anthropic_tools = [] + + for tool in tools: + if tool.get("type") == "function": + func = tool.get("function", {}) + anthropic_tools.append( + { + "name": func.get("name", ""), + "description": func.get("description", ""), + "input_schema": func.get("parameters", {"type": "object", "properties": {}}), + } + ) + + return anthropic_tools + + def _convert_tool_choice( + self, tool_choice: Union[str, Dict[str, Any]] + ) -> Dict[str, Any]: + """Convert OpenAI tool_choice to Anthropic format. + + Args: + tool_choice: OpenAI-format tool choice. + + Returns: + Anthropic-format tool choice. + """ + if tool_choice == "auto": + return {"type": "auto"} + elif tool_choice == "none": + # Anthropic doesn't have a direct "none" - we just don't pass tools + return {"type": "auto"} + elif tool_choice == "required": + return {"type": "any"} + elif isinstance(tool_choice, dict) and "function" in tool_choice: + return {"type": "tool", "name": tool_choice["function"]["name"]} + else: + return {"type": "auto"} + + def _parse_response(self, response: Any) -> ChatResponse: + """Parse Anthropic response into ChatResponse. + + Args: + response: The raw response from Anthropic. + + Returns: + ChatResponse with extracted data. 
+ """ + # Extract content (may be text and/or tool_use blocks) + content = None + tool_calls = None + + if hasattr(response, "content") and response.content: + text_parts = [] + tool_use_parts = [] + + for block in response.content: + if hasattr(block, "type"): + if block.type == "text": + text_parts.append(block.text) + elif block.type == "tool_use": + tool_use_parts.append( + { + "id": block.id, + "type": "function", + "function": { + "name": block.name, + "arguments": json.dumps(block.input), + }, + } + ) + + if text_parts: + content = "".join(text_parts) + + if tool_use_parts: + tool_calls = tool_use_parts + + # Extract usage + usage = None + if hasattr(response, "usage") and response.usage: + usage = { + "input_tokens": getattr(response.usage, "input_tokens", 0), + "output_tokens": getattr(response.usage, "output_tokens", 0), + "total_tokens": ( + getattr(response.usage, "input_tokens", 0) + + getattr(response.usage, "output_tokens", 0) + ), + } + + # Extract stop reason + stop_reason = None + if hasattr(response, "stop_reason"): + stop_reason = response.stop_reason + + return ChatResponse( + content=content, + tool_calls=tool_calls, + role="assistant", + usage=usage, + model=getattr(response, "model", self._model_id), + stop_reason=stop_reason, + ) + + def gather_config(self) -> Dict[str, Any]: + """Gather configuration from this Anthropic model adapter. + + Returns: + Dictionary containing model configuration. + """ + base_config = super().gather_config() + base_config.update( + { + "default_generation_params": self._default_generation_params, + "max_tokens": self._max_tokens, + "client_type": type(self._client).__name__, + } + ) + + return base_config diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py index d30989f..fe71ba7 100644 --- a/maseval/interface/inference/google_genai.py +++ b/maseval/interface/inference/google_genai.py @@ -1,23 +1,67 @@ """Google Generative AI model adapter. +This adapter works with Google's Generative AI SDK (google-genai) for accessing +Gemini models. + Requires google-genai to be installed: pip install maseval[google-genai] + +Example: + ```python + from google import genai + from maseval.interface.inference import GoogleGenAIModelAdapter + + # Create client + client = genai.Client(api_key="your-api-key") + # Or set GOOGLE_API_KEY environment variable + + # Create adapter + model = GoogleGenAIModelAdapter( + client=client, + model_id="gemini-2.0-flash" + ) + + # Simple generation + response = model.generate("Hello!") + + # Chat with messages + response = model.chat([ + {"role": "user", "content": "Hello!"} + ]) + + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": {...} + } + }] + ) + ``` """ -from typing import Any, Optional, Dict -import json +from typing import Any, Optional, Dict, List, Union -from maseval.core.model import ModelAdapter +from maseval.core.model import ModelAdapter, ChatResponse class GoogleGenAIModelAdapter(ModelAdapter): - """Adapter for Google Generative AI. + """Adapter for Google Generative AI (Gemini models). + + Works with Google's Gemini models through the google-genai SDK. - The `client` may be a callable that accepts the prompt and returns a dict-like - response, or a client object with a `generate` method. The adapter will try - to normalize the response to a text string. 
+ Supported models include: + - gemini-2.0-flash + - gemini-1.5-pro + - gemini-1.5-flash + - And other Gemini model variants - Requires google-genai to be installed. + The adapter converts OpenAI-style messages to Google's format internally, + so you can use the same message format across all adapters. """ def __init__( @@ -26,6 +70,14 @@ def __init__( model_id: str, default_generation_params: Optional[Dict[str, Any]] = None, ): + """Initialize Google GenAI model adapter. + + Args: + client: A google.genai.Client instance. + model_id: The model identifier (e.g., "gemini-2.0-flash"). + default_generation_params: Default parameters for all calls. + Common parameters: temperature, max_output_tokens, top_p. + """ super().__init__() self._client = client self._model_id = model_id @@ -35,47 +87,231 @@ def __init__( def model_id(self) -> str: return self._model_id - def _extract_text(self, response: Any) -> str: - # Normalize a few common shapes - if isinstance(response, str): - return response - if isinstance(response, dict): - # google generative responses often have `candidates` or `output` fields - if "candidates" in response and response["candidates"]: - return response["candidates"][0].get("content", "") - if "output" in response and isinstance(response["output"], list) and response["output"]: - # some implementations return a list of text chunks - first = response["output"][0] - if isinstance(first, dict): - return first.get("content", "") - return str(first) - # fallback to stringifying - return json.dumps(response) - return str(response) - - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - from google import genai # Lazy import + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Call Google GenAI API. + + Args: + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions for function calling. + tool_choice: Tool choice setting. + **kwargs: Additional parameters. + Returns: + ChatResponse with the model's output. 
+ """ + from google import genai + + # Merge parameters params = dict(self._default_generation_params) if generation_params: params.update(generation_params) - generation_config = genai.types.GenerateContentConfig(**params) if params else None + params.update(kwargs) + + # Convert messages to Google format + system_instruction, contents = self._convert_messages(messages) + + # Build config + config_params = {} + if system_instruction: + config_params["system_instruction"] = system_instruction + + # Map common parameter names + if "max_tokens" in params: + config_params["max_output_tokens"] = params.pop("max_tokens") + if "max_output_tokens" in params: + config_params["max_output_tokens"] = params.pop("max_output_tokens") + if "temperature" in params: + config_params["temperature"] = params.pop("temperature") + if "top_p" in params: + config_params["top_p"] = params.pop("top_p") + if "top_k" in params: + config_params["top_k"] = params.pop("top_k") + if "stop_sequences" in params: + config_params["stop_sequences"] = params.pop("stop_sequences") + + # Convert tools to Google format + if tools: + config_params["tools"] = self._convert_tools(tools) + + # Handle tool_choice + if tool_choice is not None: + if tool_choice == "none": + config_params["tool_config"] = {"function_calling_config": {"mode": "NONE"}} + elif tool_choice == "auto": + config_params["tool_config"] = {"function_calling_config": {"mode": "AUTO"}} + elif tool_choice == "required": + config_params["tool_config"] = {"function_calling_config": {"mode": "ANY"}} + elif isinstance(tool_choice, dict) and "function" in tool_choice: + config_params["tool_config"] = { + "function_calling_config": { + "mode": "ANY", + "allowed_function_names": [tool_choice["function"]["name"]], + } + } + + # Build generation config + generation_config = genai.types.GenerateContentConfig(**config_params) if config_params else None + + # Call API + response = self._client.models.generate_content( + model=self._model_id, contents=contents, config=generation_config + ) + + return self._parse_response(response) + + def _convert_messages( + self, messages: List[Dict[str, Any]] + ) -> tuple[Optional[str], List[Dict[str, Any]]]: + """Convert OpenAI messages to Google format. - # Call client - response = self._client.models.generate_content(model=self.model_id, contents=prompt, config=generation_config) - return response.text + Google uses 'contents' with 'parts', and separates system instructions. + Roles are 'user' and 'model' (not 'assistant'). + + Args: + messages: OpenAI-format messages. + + Returns: + Tuple of (system_instruction, contents). + """ + system_instruction = None + contents = [] + + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + + if role == "system": + system_instruction = content + elif role == "assistant": + contents.append({"role": "model", "parts": [{"text": content}]}) + elif role == "tool": + # Tool response in Google format + tool_call_id = msg.get("tool_call_id", "") + contents.append( + { + "role": "function", + "parts": [ + { + "function_response": { + "name": msg.get("name", tool_call_id), + "response": {"result": content}, + } + } + ], + } + ) + else: + # User message + contents.append({"role": "user", "parts": [{"text": content}]}) + + return system_instruction, contents + + def _convert_tools(self, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Convert OpenAI tool format to Google format. + + Args: + tools: OpenAI-format tool definitions. 
+ + Returns: + Google-format tool definitions. + """ + google_tools = [] + + for tool in tools: + if tool.get("type") == "function": + func = tool.get("function", {}) + google_tools.append( + { + "function_declarations": [ + { + "name": func.get("name", ""), + "description": func.get("description", ""), + "parameters": func.get("parameters", {}), + } + ] + } + ) + + return google_tools + + def _parse_response(self, response: Any) -> ChatResponse: + """Parse Google GenAI response into ChatResponse. + + Args: + response: The raw response from Google. + + Returns: + ChatResponse with extracted data. + """ + # Extract text content + content = None + if hasattr(response, "text"): + content = response.text + + # Extract tool calls (function calls in Google terminology) + tool_calls = None + if hasattr(response, "candidates") and response.candidates: + candidate = response.candidates[0] + if hasattr(candidate, "content") and candidate.content: + for part in candidate.content.parts: + if hasattr(part, "function_call") and part.function_call: + if tool_calls is None: + tool_calls = [] + fc = part.function_call + # Convert args to JSON string + import json + + args = dict(fc.args) if fc.args else {} + tool_calls.append( + { + "id": f"call_{fc.name}", + "type": "function", + "function": { + "name": fc.name, + "arguments": json.dumps(args), + }, + } + ) + + # Extract usage + usage = None + if hasattr(response, "usage_metadata") and response.usage_metadata: + um = response.usage_metadata + usage = { + "input_tokens": getattr(um, "prompt_token_count", 0), + "output_tokens": getattr(um, "candidates_token_count", 0), + "total_tokens": getattr(um, "total_token_count", 0), + } + + # Extract stop reason + stop_reason = None + if hasattr(response, "candidates") and response.candidates: + candidate = response.candidates[0] + if hasattr(candidate, "finish_reason"): + stop_reason = str(candidate.finish_reason) + + return ChatResponse( + content=content, + tool_calls=tool_calls, + role="assistant", + usage=usage, + model=self._model_id, + stop_reason=stop_reason, + ) - def gather_config(self) -> dict[str, Any]: + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this Google GenAI model adapter. Returns: - Dictionary containing: - - type: Component class name - - gathered_at: ISO timestamp - - model_id: Model identifier - - adapter_type: GoogleGenAIModelAdapter - - default_generation_params: Default parameters used for generation (temperature, top_p, etc.) - - client_type: Type name of the underlying client + Dictionary containing model configuration. """ base_config = super().gather_config() base_config.update( diff --git a/maseval/interface/inference/huggingface.py b/maseval/interface/inference/huggingface.py index 3ef0751..7f7e541 100644 --- a/maseval/interface/inference/huggingface.py +++ b/maseval/interface/inference/huggingface.py @@ -1,21 +1,62 @@ """HuggingFace model adapter. +This adapter works with HuggingFace transformers pipelines and models. +It supports both simple callable models and full pipeline objects. 
+ Requires transformers to be installed: pip install maseval[transformers] + +Example: + ```python + from transformers import pipeline + from maseval.interface.inference import HuggingFaceModelAdapter + + # Using a pipeline + pipe = pipeline("text-generation", model="meta-llama/Llama-3.1-8B-Instruct") + model = HuggingFaceModelAdapter(model=pipe, model_id="llama-3.1-8b") + + # Simple generation + response = model.generate("Hello!") + + # Chat with messages (uses chat template if available) + response = model.chat([ + {"role": "user", "content": "Hello!"} + ]) + ``` + +Note on tool calling: + HuggingFace models have varying support for tool calling. This adapter + will raise an exception if tools are passed but the model's chat template + does not support them. Use LiteLLMModelAdapter for more reliable tool + calling with a wider range of models. """ -from typing import Any, Callable, Optional, Dict +from typing import Any, Optional, Dict, List, Callable, Union + +from maseval.core.model import ModelAdapter, ChatResponse + + +class ToolCallingNotSupportedError(Exception): + """Raised when tool calling is requested but not supported by the model.""" -from maseval.core.model import ModelAdapter + pass class HuggingFaceModelAdapter(ModelAdapter): - """Adapter for HuggingFace-style generation. + """Adapter for HuggingFace transformers models and pipelines. - This adapter accepts either a `callable` that takes `prompt` and returns - text, or a thin `pipeline`-like object with a `__call__`. + Works with: + - transformers.pipeline() objects + - Any callable that accepts a prompt and returns text - Requires transformers to be installed. + For chat functionality, the adapter uses the tokenizer's chat template + if available. This provides proper formatting for instruction-tuned models. + + Tool calling support: + Tool calling is only supported if the model's chat template explicitly + supports it. If you pass tools and the model doesn't support them, + a ToolCallingNotSupportedError is raised. For reliable tool calling, + consider using LiteLLMModelAdapter instead. """ def __init__( @@ -24,6 +65,17 @@ def __init__( model_id: Optional[str] = None, default_generation_params: Optional[Dict[str, Any]] = None, ): + """Initialize HuggingFace model adapter. + + Args: + model: A callable that generates text. Can be: + - A transformers pipeline (e.g., pipeline("text-generation", ...)) + - Any callable that takes a prompt string and returns text + model_id: Identifier for the model. If not provided, attempts to + extract from the model's name_or_path attribute. + default_generation_params: Default parameters for all calls. + Common parameters: max_new_tokens, temperature, top_p, do_sample. + """ super().__init__() self._model = model self._model_id = model_id or getattr(model, "name_or_path", "huggingface:unknown") @@ -33,34 +85,271 @@ def __init__( def model_id(self) -> str: return self._model_id - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - # Merge default params and call-time params; forward to underlying callable + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Generate response using HuggingFace model. + + Args: + messages: List of message dicts in OpenAI format. 
+ generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions. Raises ToolCallingNotSupportedError if + provided but not supported by the model's chat template. + tool_choice: Tool choice setting (ignored if tools not supported). + **kwargs: Additional parameters passed to the model. + + Returns: + ChatResponse with the model's output. + + Raises: + ToolCallingNotSupportedError: If tools are provided but the model + doesn't support tool calling. + """ + # Merge parameters params = dict(self._default_generation_params) if generation_params: params.update(generation_params) - # allow explicit kwargs to override params.update(kwargs) + + # Try to use chat template if available + tokenizer = self._get_tokenizer() + + if tokenizer is not None and hasattr(tokenizer, "apply_chat_template"): + return self._chat_with_template(messages, params, tools, tool_choice, tokenizer) + else: + # Fallback: convert messages to simple prompt + if tools: + raise ToolCallingNotSupportedError( + f"Model {self._model_id} does not have a chat template that supports tools. " + "Tool calling requires a model with an appropriate chat template. " + "Consider using LiteLLMModelAdapter for reliable tool calling." + ) + return self._chat_without_template(messages, params) + + def _get_tokenizer(self) -> Any: + """Get the tokenizer from the model/pipeline if available. + + Returns: + The tokenizer, or None if not available. + """ + # Pipeline objects have a tokenizer attribute + if hasattr(self._model, "tokenizer"): + return self._model.tokenizer + + # Some models expose the tokenizer directly + if hasattr(self._model, "model") and hasattr(self._model.model, "tokenizer"): + return self._model.model.tokenizer + + return None + + def _chat_with_template( + self, + messages: List[Dict[str, Any]], + params: Dict[str, Any], + tools: Optional[List[Dict[str, Any]]], + tool_choice: Optional[Union[str, Dict[str, Any]]], + tokenizer: Any, + ) -> ChatResponse: + """Generate using the tokenizer's chat template. + + Args: + messages: Messages to send. + params: Generation parameters. + tools: Tool definitions. + tool_choice: Tool choice setting. + tokenizer: The tokenizer with chat template. + + Returns: + ChatResponse with the model's output. + """ + # Check if tools are requested but not supported + if tools: + # Try to apply template with tools to check support + try: + # The template should accept tools parameter if it supports them + prompt = tokenizer.apply_chat_template( + messages, tools=tools, add_generation_prompt=True, tokenize=False + ) + except TypeError: + # Template doesn't accept tools parameter + raise ToolCallingNotSupportedError( + f"Model {self._model_id} chat template does not support tools. " + "The apply_chat_template() method does not accept a 'tools' parameter. " + "Consider using LiteLLMModelAdapter for reliable tool calling." 
+ ) + else: + prompt = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + + # Generate response + response_text = self._call_model(prompt, params) + + # Parse tool calls from response if tools were provided + tool_calls = None + content = response_text + + if tools: + # Attempt to parse tool calls from the response + # Different models format tool calls differently + tool_calls, content = self._parse_tool_calls(response_text) + + return ChatResponse( + content=content if content else None, + tool_calls=tool_calls, + role="assistant", + model=self._model_id, + ) + + def _chat_without_template( + self, messages: List[Dict[str, Any]], params: Dict[str, Any] + ) -> ChatResponse: + """Generate without a chat template (simple prompt concatenation). + + Args: + messages: Messages to convert to prompt. + params: Generation parameters. + + Returns: + ChatResponse with the model's output. + """ + # Simple conversion: concatenate messages + prompt_parts = [] + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + prompt_parts.append(f"{role}: {content}") + + prompt = "\n".join(prompt_parts) + "\nassistant:" + + response_text = self._call_model(prompt, params) + + return ChatResponse( + content=response_text, + role="assistant", + model=self._model_id, + ) + + def _call_model(self, prompt: str, params: Dict[str, Any]) -> str: + """Call the underlying model with a prompt. + + Args: + prompt: The formatted prompt. + params: Generation parameters. + + Returns: + The generated text. + """ try: - return self._model(prompt, **params) + result = self._model(prompt, **params) except TypeError: - # fall back to calling without kwargs - return self._model(prompt) + # Fallback: call without params + result = self._model(prompt) + + # Extract text from various response formats + if isinstance(result, str): + return result + elif isinstance(result, list) and len(result) > 0: + # Pipeline returns list of dicts + item = result[0] + if isinstance(item, dict): + # Text generation pipeline format + if "generated_text" in item: + generated = item["generated_text"] + # Remove the prompt from the response if it's included + if generated.startswith(prompt): + return generated[len(prompt) :].strip() + return generated + return str(item) + return str(item) + elif isinstance(result, dict): + if "generated_text" in result: + return result["generated_text"] + return str(result) + else: + return str(result) + + def _parse_tool_calls( + self, response: str + ) -> tuple[Optional[List[Dict[str, Any]]], Optional[str]]: + """Parse tool calls from model response. + + Different models format tool calls differently. This method attempts + to parse common formats. + + Args: + response: The raw model response. + + Returns: + Tuple of (tool_calls, remaining_content). + """ + import json + import re + + # Try to find JSON tool calls in the response + # Common patterns: ..., ```json...```, etc. 
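+        # e.g. <tool_call>{"name": ..., "arguments": ...}</tool_call> blocks or bare
+        # {"name": ..., "arguments": ...} JSON objects, handled below as Pattern 1
+        # and Pattern 2 respectively.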
+
+        tool_calls = []
+        remaining_content = response
+
+        # Pattern 1: <tool_call> tags (used by some models)
+        tool_call_pattern = r"<tool_call>(.*?)</tool_call>"
+        matches = re.findall(tool_call_pattern, response, re.DOTALL)
+
+        for match in matches:
+            try:
+                call_data = json.loads(match.strip())
+                tool_calls.append(
+                    {
+                        "id": f"call_{len(tool_calls)}",
+                        "type": "function",
+                        "function": {
+                            "name": call_data.get("name", ""),
+                            "arguments": json.dumps(call_data.get("arguments", {})),
+                        },
+                    }
+                )
+                remaining_content = remaining_content.replace(
+                    f"<tool_call>{match}</tool_call>", ""
+                )
+            except json.JSONDecodeError:
+                continue
+
+        # Pattern 2: Function call JSON blocks
+        function_pattern = r'\{"name":\s*"([^"]+)",\s*"arguments":\s*(\{[^}]+\})\}'
+        for match in re.finditer(function_pattern, response):
+            try:
+                name = match.group(1)
+                args = match.group(2)
+                # Validate JSON
+                json.loads(args)
+                tool_calls.append(
+                    {
+                        "id": f"call_{len(tool_calls)}",
+                        "type": "function",
+                        "function": {
+                            "name": name,
+                            "arguments": args,
+                        },
+                    }
+                )
+            except (json.JSONDecodeError, IndexError):
+                continue
+
+        remaining_content = remaining_content.strip()
+
+        return (tool_calls if tool_calls else None, remaining_content if remaining_content else None)
 
-    def gather_config(self) -> dict[str, Any]:
+    def gather_config(self) -> Dict[str, Any]:
         """Gather configuration from this HuggingFace model adapter.
 
         Returns:
-            Dictionary containing:
-            - type: Component class name
-            - gathered_at: ISO timestamp
-            - model_id: Model identifier
-            - adapter_type: HuggingFaceModelAdapter
-            - default_generation_params: Default parameters used for generation (temperature, top_p, max_length, etc.)
-            - callable_type: Type name of the underlying callable
-            - pipeline_config: Pipeline configuration affecting model behavior:
-              - task: Pipeline task type (e.g., text-generation, text-classification)
-              - device: Device (cpu, cuda, etc.)
-              - framework: Framework (pt for PyTorch, tf for TensorFlow)
+            Dictionary containing model configuration.
         """
         base_config = super().gather_config()
         base_config.update(
@@ -70,16 +359,14 @@ def gather_config(self) -> dict[str, Any]:
             }
         )
 
-        # Extract pipeline configuration that affects model behavior
+        # Extract pipeline configuration
         pipeline_config = {}
 
-        # Core pipeline attributes
         if hasattr(self._model, "task"):
             pipeline_config["task"] = self._model.task
 
         if hasattr(self._model, "device"):
             device = self._model.device
-            # Convert device to string representation
             pipeline_config["device"] = str(device) if device is not None else None
 
         if hasattr(self._model, "framework"):
diff --git a/maseval/interface/inference/litellm.py b/maseval/interface/inference/litellm.py
index 90825f7..a6d7259 100644
--- a/maseval/interface/inference/litellm.py
+++ b/maseval/interface/inference/litellm.py
@@ -1,44 +1,72 @@
 """LiteLLM model adapter.
 
-LiteLLM provides a unified interface for 100+ LLM APIs.
+LiteLLM provides a unified interface for 100+ LLM APIs using OpenAI-compatible
+syntax. This adapter wraps LiteLLM to provide consistent behavior within MASEval.
 
 Requires litellm to be installed:
     pip install maseval[litellm]
-"""
 
-from typing import Any, Optional, Dict
+Example:
+    ```python
+    from maseval.interface.inference import LiteLLMModelAdapter
 
-from maseval.core.model import ModelAdapter
+    # OpenAI models
+    model = LiteLLMModelAdapter(model_id="gpt-4")
 
+    # Anthropic models
+    model = LiteLLMModelAdapter(model_id="claude-3-opus-20240229")
 
-class LiteLLMModelAdapter(ModelAdapter):
-    """Adapter for LiteLLM unified interface.
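+    # API keys are read from environment variables by default
+    # (OPENAI_API_KEY, ANTHROPIC_API_KEY, ...); pass api_key to override.
+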
+ # Azure OpenAI + model = LiteLLMModelAdapter( + model_id="azure/gpt-4", + api_base="https://your-resource.openai.azure.com" + ) - LiteLLM provides a consistent API for calling multiple LLM providers - (OpenAI, Anthropic, Cohere, Azure, AWS Bedrock, etc.) using the same - interface. + # AWS Bedrock + model = LiteLLMModelAdapter(model_id="bedrock/anthropic.claude-v2") - Requires litellm to be installed. + # Simple generation + response = model.generate("Hello!") - Example: - ```python - from maseval.interface.inference import LiteLLMModelAdapter + # Chat with messages + response = model.chat([ + {"role": "user", "content": "Hello!"} + ]) - # OpenAI - model = LiteLLMModelAdapter(model_id="gpt-4") + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather?"}], + tools=[{"type": "function", "function": {...}}] + ) + ``` +""" - # Anthropic - model = LiteLLMModelAdapter(model_id="claude-3-opus-20240229") +from typing import Any, Optional, Dict, List, Union - # Azure OpenAI - model = LiteLLMModelAdapter( - model_id="azure/gpt-4", - default_generation_params={"api_base": "..."} - ) +from maseval.core.model import ModelAdapter, ChatResponse - # AWS Bedrock - model = LiteLLMModelAdapter(model_id="bedrock/anthropic.claude-v2") - ``` + +class LiteLLMModelAdapter(ModelAdapter): + """Adapter for LiteLLM unified interface. + + LiteLLM provides a consistent API for calling multiple LLM providers + (OpenAI, Anthropic, Cohere, Azure, AWS Bedrock, Google, etc.) using + OpenAI-compatible syntax. + + Supported providers include: + - OpenAI: "gpt-4", "gpt-3.5-turbo" + - Anthropic: "claude-3-opus-20240229", "claude-3-sonnet-20240229" + - Azure: "azure/gpt-4", "azure/gpt-35-turbo" + - AWS Bedrock: "bedrock/anthropic.claude-v2" + - Google: "gemini/gemini-pro" + - And many more (see https://docs.litellm.ai/docs/providers) + + API keys are read from environment variables by default: + - OPENAI_API_KEY for OpenAI + - ANTHROPIC_API_KEY for Anthropic + - etc. + + Or pass api_key directly to the constructor. """ def __init__( @@ -51,14 +79,17 @@ def __init__( """Initialize LiteLLM model adapter. Args: - model_id: The model identifier in LiteLLM format (e.g., "gpt-4", - "claude-3-opus-20240229", "azure/gpt-4", "bedrock/..."). - See: https://docs.litellm.ai/docs/providers - default_generation_params: Default parameters passed to litellm.completion() - (e.g., temperature, max_tokens, top_p, etc.) - api_key: Optional API key. If not provided, LiteLLM will use environment - variables (OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.) - api_base: Optional API base URL for custom endpoints + model_id: The model identifier in LiteLLM format. Examples: + - "gpt-4" (OpenAI) + - "claude-3-opus-20240229" (Anthropic) + - "azure/gpt-4" (Azure OpenAI) + - "bedrock/anthropic.claude-v2" (AWS Bedrock) + See https://docs.litellm.ai/docs/providers for full list. + default_generation_params: Default parameters for all calls. + Common parameters: temperature, max_tokens, top_p. + api_key: API key for the provider. If not provided, LiteLLM + reads from environment variables. + api_base: Custom API base URL for self-hosted or Azure endpoints. """ super().__init__() self._model_id = model_id @@ -70,21 +101,32 @@ def __init__( def model_id(self) -> str: return self._model_id - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: - """Generate text using LiteLLM. 
+ def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Call LiteLLM completion API. Args: - prompt: The input prompt - generation_params: Optional generation parameters (temperature, max_tokens, etc.) - **kwargs: Additional LiteLLM-specific parameters + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions for function calling. + tool_choice: Tool choice setting. + **kwargs: Additional LiteLLM parameters. Returns: - Generated text string + ChatResponse with the model's output. """ try: import litellm except ImportError as e: - raise ImportError("LiteLLM is not installed. Install it with: pip install maseval[litellm] or pip install litellm") from e + raise ImportError( + "LiteLLM is not installed. Install with: pip install maseval[litellm]" + ) from e # Merge parameters params = dict(self._default_generation_params) @@ -98,31 +140,58 @@ def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any] if self._api_base: params["api_base"] = self._api_base - # LiteLLM expects messages format - messages = [{"role": "user", "content": prompt}] + # Add tools if provided + if tools: + params["tools"] = tools + if tool_choice is not None: + params["tool_choice"] = tool_choice # Call LiteLLM response = litellm.completion(model=self._model_id, messages=messages, **params) - # Extract text from response - # LiteLLM returns a ModelResponse object similar to OpenAI's format - content = response.choices[0].message.content - return content if content is not None else "" + # Extract response data + choice = response.choices[0] + message = choice.message + + # Build tool_calls list if present + tool_calls = None + if hasattr(message, "tool_calls") and message.tool_calls: + tool_calls = [] + for tc in message.tool_calls: + tool_calls.append( + { + "id": tc.id, + "type": tc.type, + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + ) + + # Build usage dict if present + usage = None + if hasattr(response, "usage") and response.usage: + usage = { + "input_tokens": getattr(response.usage, "prompt_tokens", 0), + "output_tokens": getattr(response.usage, "completion_tokens", 0), + "total_tokens": getattr(response.usage, "total_tokens", 0), + } + + return ChatResponse( + content=message.content, + tool_calls=tool_calls, + role=message.role if hasattr(message, "role") else "assistant", + usage=usage, + model=getattr(response, "model", self._model_id), + stop_reason=getattr(choice, "finish_reason", None), + ) - def gather_config(self) -> dict[str, Any]: + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this LiteLLM model adapter. Returns: - Dictionary containing: - - type: Component class name - - gathered_at: ISO timestamp - - model_id: Model identifier - - adapter_type: LiteLLMModelAdapter - - default_generation_params: Default parameters used for generation (temperature, top_p, etc.) - - litellm_global_config: LiteLLM global configuration affecting model behavior: - - num_retries: Number of retry attempts (affects reliability) - - drop_params: Whether to drop unsupported params (affects behavior) - - verbose: Debug logging enabled (affects observability) + Dictionary containing model configuration and LiteLLM settings. 
""" base_config = super().gather_config() base_config.update( @@ -131,21 +200,18 @@ def gather_config(self) -> dict[str, Any]: } ) - # Extract LiteLLM global configuration that affects model behavior + # Extract LiteLLM global configuration try: import litellm litellm_config = {} - # Retry configuration (affects reliability and latency) if hasattr(litellm, "num_retries"): litellm_config["num_retries"] = litellm.num_retries - # Drop params (affects model behavior with unsupported parameters) if hasattr(litellm, "drop_params"): litellm_config["drop_params"] = litellm.drop_params - # Verbose mode (affects logging and debugging) if hasattr(litellm, "verbose"): litellm_config["verbose"] = litellm.verbose diff --git a/maseval/interface/inference/openai.py b/maseval/interface/inference/openai.py index 846aa80..26207aa 100644 --- a/maseval/interface/inference/openai.py +++ b/maseval/interface/inference/openai.py @@ -1,96 +1,276 @@ """OpenAI and OpenAI-compatible model adapter. +This adapter works with the official OpenAI Python SDK and any OpenAI-compatible +API (like Azure OpenAI, local models with OpenAI-compatible servers, etc.). + Requires openai to be installed: pip install maseval[openai] + +Example: + ```python + from openai import OpenAI + from maseval.interface.inference import OpenAIModelAdapter + + # Standard OpenAI usage + client = OpenAI() # Uses OPENAI_API_KEY env var + model = OpenAIModelAdapter(client=client, model_id="gpt-4") + + # Simple generation + response = model.generate("Hello!") + + # Chat with messages + response = model.chat([ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello!"} + ]) + + # Chat with tools + response = model.chat( + messages=[{"role": "user", "content": "What's the weather?"}], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": {...} + } + }] + ) + + # Azure OpenAI + from openai import AzureOpenAI + client = AzureOpenAI( + azure_endpoint="https://your-resource.openai.azure.com", + api_version="2024-02-15-preview" + ) + model = OpenAIModelAdapter(client=client, model_id="gpt-4") + ``` """ -from typing import Any, Optional, Dict -import json +from typing import Any, Optional, Dict, List, Union -from maseval.core.model import ModelAdapter +from maseval.core.model import ModelAdapter, ChatResponse class OpenAIModelAdapter(ModelAdapter): - """Adapter for OpenAI-compatible models (openai or OpenAI-compatible servers). + """Adapter for OpenAI and OpenAI-compatible APIs. - The `client` can be a callable returning a string, or an object with a - `complete`/`chat`/`create` method. This adapter tries common method names. + Works with: + - OpenAI API (gpt-4, gpt-3.5-turbo, etc.) + - Azure OpenAI + - Any OpenAI-compatible server (vLLM, LocalAI, etc.) - Requires openai to be installed. + The adapter expects an OpenAI client instance. API keys and configuration + should be set on the client before passing it to the adapter. """ def __init__( self, client: Any, - model_id: Optional[str] = None, + model_id: str, default_generation_params: Optional[Dict[str, Any]] = None, ): + """Initialize OpenAI model adapter. + + Args: + client: An OpenAI client instance (openai.OpenAI or openai.AzureOpenAI). + The client should already be configured with API keys. + model_id: The model identifier (e.g., "gpt-4", "gpt-3.5-turbo"). + default_generation_params: Default parameters for all calls. + Common parameters: temperature, max_tokens, top_p. 
+ """ super().__init__() self._client = client - self._model_id = model_id or getattr(client, "model_id", "openai:unknown") + self._model_id = model_id self._default_generation_params = default_generation_params or {} @property def model_id(self) -> str: return self._model_id - def _extract_text(self, resp: Any) -> str: - if isinstance(resp, str): - return resp - if isinstance(resp, dict): - # common OpenAI shapes - if "choices" in resp and resp["choices"]: - choice = resp["choices"][0] - # chat-like - if "message" in choice and isinstance(choice["message"], dict): - return choice["message"].get("content", "") - # completion-like - return choice.get("text", "") - # fallback - return json.dumps(resp) - return str(resp) - - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Union[str, Dict[str, Any]]] = None, + **kwargs: Any, + ) -> ChatResponse: + """Call OpenAI chat completions API. + + Args: + messages: List of message dicts in OpenAI format. + generation_params: Generation parameters (temperature, etc.). + tools: Tool definitions for function calling. + tool_choice: Tool choice setting. + **kwargs: Additional OpenAI parameters. + + Returns: + ChatResponse with the model's output. + """ + # Merge parameters params = dict(self._default_generation_params) if generation_params: params.update(generation_params) params.update(kwargs) - # try common call patterns - # 1) client(prompt) - try: - resp = self._client(prompt, **params) - except TypeError: - # 2) client.create / client.complete / client.chat - for meth in ("create", "complete", "chat", "generate"): - if hasattr(self._client, meth): - func = getattr(self._client, meth) + # Add tools if provided + if tools: + params["tools"] = tools + if tool_choice is not None: + params["tool_choice"] = tool_choice + + # Call OpenAI API + # Try the modern client interface first + if hasattr(self._client, "chat") and hasattr(self._client.chat, "completions"): + response = self._client.chat.completions.create( + model=self._model_id, messages=messages, **params + ) + else: + # Fallback for older or custom clients + response = self._call_legacy_client(messages, params) + + return self._parse_response(response) + + def _call_legacy_client( + self, messages: List[Dict[str, Any]], params: Dict[str, Any] + ) -> Any: + """Handle older client interfaces or callables. + + Args: + messages: Messages to send. + params: Parameters to pass. + + Returns: + Response from the client. + """ + # Try common method names + for method_name in ("create", "complete", "chat", "generate"): + if hasattr(self._client, method_name): + method = getattr(self._client, method_name) + try: + return method(model=self._model_id, messages=messages, **params) + except TypeError: + # Try without model parameter try: - resp = func(prompt, **params) - break + return method(messages=messages, **params) except TypeError: - resp = func(prompt) - break - else: - # last resort: call without kwargs - resp = self._client(prompt) + continue + + # Last resort: try calling directly + if callable(self._client): + return self._client(model=self._model_id, messages=messages, **params) - return self._extract_text(resp) + raise TypeError( + f"Unable to call client of type {type(self._client).__name__}. 
" + "Expected an OpenAI client with chat.completions.create() method." + ) + + def _parse_response(self, response: Any) -> ChatResponse: + """Parse OpenAI response into ChatResponse. + + Args: + response: The raw response from OpenAI. + + Returns: + ChatResponse with extracted data. + """ + # Handle dict responses (from mocks or legacy clients) + if isinstance(response, dict): + return self._parse_dict_response(response) + + # Handle modern OpenAI response objects + choice = response.choices[0] + message = choice.message + + # Extract tool calls + tool_calls = None + if hasattr(message, "tool_calls") and message.tool_calls: + tool_calls = [] + for tc in message.tool_calls: + tool_calls.append( + { + "id": tc.id, + "type": tc.type, + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + ) + + # Extract usage + usage = None + if hasattr(response, "usage") and response.usage: + usage = { + "input_tokens": getattr(response.usage, "prompt_tokens", 0), + "output_tokens": getattr(response.usage, "completion_tokens", 0), + "total_tokens": getattr(response.usage, "total_tokens", 0), + } + + return ChatResponse( + content=message.content, + tool_calls=tool_calls, + role=getattr(message, "role", "assistant"), + usage=usage, + model=getattr(response, "model", self._model_id), + stop_reason=getattr(choice, "finish_reason", None), + ) + + def _parse_dict_response(self, response: Dict[str, Any]) -> ChatResponse: + """Parse dict response (from mocks or legacy APIs). + + Args: + response: Dict response in OpenAI format. + + Returns: + ChatResponse with extracted data. + """ + if "choices" not in response or not response["choices"]: + # Simple string response wrapped in dict + return ChatResponse(content=str(response)) + + choice = response["choices"][0] + + # Handle chat-style response + if "message" in choice: + message = choice["message"] + content = message.get("content") + tool_calls = message.get("tool_calls") + role = message.get("role", "assistant") + # Handle completion-style response + elif "text" in choice: + content = choice["text"] + tool_calls = None + role = "assistant" + else: + content = str(choice) + tool_calls = None + role = "assistant" + + # Extract usage if present + usage = None + if "usage" in response: + usage = { + "input_tokens": response["usage"].get("prompt_tokens", 0), + "output_tokens": response["usage"].get("completion_tokens", 0), + "total_tokens": response["usage"].get("total_tokens", 0), + } + + return ChatResponse( + content=content, + tool_calls=tool_calls, + role=role, + usage=usage, + model=response.get("model", self._model_id), + stop_reason=choice.get("finish_reason"), + ) - def gather_config(self) -> dict[str, Any]: + def gather_config(self) -> Dict[str, Any]: """Gather configuration from this OpenAI model adapter. Returns: - Dictionary containing: - - type: Component class name - - gathered_at: ISO timestamp - - model_id: Model identifier - - adapter_type: OpenAIModelAdapter - - default_generation_params: Default parameters used for generation (temperature, top_p, etc.) - - client_type: Type name of the underlying client - - client_config: OpenAI client configuration affecting model behavior: - - timeout: Request timeout settings (affects latency) - - max_retries: Maximum number of retry attempts (affects reliability) + Dictionary containing model configuration and client settings. 
""" base_config = super().gather_config() base_config.update( @@ -100,13 +280,11 @@ def gather_config(self) -> dict[str, Any]: } ) - # Extract OpenAI client configuration that affects model behavior + # Extract client configuration client_config = {} - # Timeout configuration (affects latency and reliability) if hasattr(self._client, "timeout"): timeout = self._client.timeout - # Handle both httpx.Timeout objects and simple floats if hasattr(timeout, "connect"): client_config["timeout"] = { "connect": timeout.connect, @@ -117,7 +295,6 @@ def gather_config(self) -> dict[str, Any]: else: client_config["timeout"] = timeout - # Max retries (affects reliability and latency) if hasattr(self._client, "max_retries"): client_config["max_retries"] = self._client.max_retries diff --git a/pyproject.toml b/pyproject.toml index 5c41202..0b49cf1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ langgraph = ["langgraph>=0.6.0"] llamaindex = ["llama-index-core>=0.12.0"] # Inference engines +anthropic = ["anthropic>=0.40.0"] openai = ["openai>=1.107.2"] google-genai = ["google-genai>=1.37.0"] transformers = ["transformers>=4.37.0"] @@ -47,7 +48,7 @@ langfuse = ["langfuse>=3.3.4"] # Dependencies for running examples (only what's actually used) examples = [ - "maseval[smolagents,langgraph,llamaindex,openai,google-genai,litellm,langfuse]", + "maseval[smolagents,langgraph,llamaindex,anthropic,openai,google-genai,litellm,langfuse]", # Additional integrations used in examples "langchain>=0.3.27", "langchain-google-genai>=2.1.12", diff --git a/tests/conftest.py b/tests/conftest.py index 75b147c..fd40e9e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,29 +12,81 @@ Evaluator, MessageHistory, ) -from maseval.core.model import ModelAdapter +from maseval.core.model import ModelAdapter, ChatResponse # ==================== Dummy Components ==================== class DummyModelAdapter(ModelAdapter): - """Minimal model adapter for testing.""" + """Minimal model adapter for testing. - def __init__(self, model_id: str = "test-model", responses: Optional[List[str]] = None): + Simulates model responses without making actual API calls. Useful for + unit tests and integration tests that don't require real LLM inference. + + Supports both chat() and generate() methods, returning responses from + a predefined list in round-robin fashion. + """ + + def __init__( + self, + model_id: str = "test-model", + responses: Optional[List[str]] = None, + tool_calls: Optional[List[List[Dict[str, Any]]]] = None, + ): + """Initialize DummyModelAdapter. + + Args: + model_id: Identifier for this model instance. + responses: List of text responses to return. Cycles through the list. + tool_calls: Optional list of tool call lists. If provided, each call + returns the corresponding tool_calls (cycling through the list). + """ super().__init__() self._model_id = model_id self._responses = responses or ["test response"] + self._tool_calls = tool_calls self._call_count = 0 @property def model_id(self) -> str: return self._model_id - def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str: + def _chat_impl( + self, + messages: List[Dict[str, Any]], + generation_params: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, + tool_choice: Optional[Any] = None, + **kwargs: Any, + ) -> ChatResponse: + """Return a mock response. + + Args: + messages: Input messages (ignored for mock). + generation_params: Generation parameters (ignored for mock). 
+ tools: Tool definitions (ignored for mock). + tool_choice: Tool choice (ignored for mock). + **kwargs: Additional arguments (ignored for mock). + + Returns: + ChatResponse with mock content and optional tool_calls. + """ response = self._responses[self._call_count % len(self._responses)] + + # Get tool_calls for this response if provided + response_tool_calls = None + if self._tool_calls: + response_tool_calls = self._tool_calls[self._call_count % len(self._tool_calls)] + self._call_count += 1 - return response + + return ChatResponse( + content=response, + tool_calls=response_tool_calls, + role="assistant", + model=self._model_id, + ) class DummyAgent: diff --git a/tests/test_core/test_model_adapter.py b/tests/test_core/test_model_adapter.py index 8718dea..bbcf3c6 100644 --- a/tests/test_core/test_model_adapter.py +++ b/tests/test_core/test_model_adapter.py @@ -15,7 +15,9 @@ import json import time from datetime import datetime +from typing import Any, Dict, List, Optional, Union from conftest import DummyModelAdapter +from maseval.core.model import ChatResponse @pytest.mark.core @@ -23,7 +25,7 @@ class TestModelAdapterBaseContract: """Test fundamental ModelAdapter base class behavior.""" def test_model_adapter_has_abstract_methods(self): - """ModelAdapter requires subclasses to implement model_id and _generate_impl.""" + """ModelAdapter requires subclasses to implement model_id and _chat_impl.""" from maseval.core.model import ModelAdapter # Cannot instantiate abstract class directly @@ -35,14 +37,14 @@ def test_model_adapter_requires_model_id_property(self): from maseval.core.model import ModelAdapter class IncompleteAdapter(ModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): - return "test" + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): + return ChatResponse(content="test") with pytest.raises(TypeError): IncompleteAdapter() # type: ignore - def test_model_adapter_requires_generate_impl(self): - """Subclasses must implement _generate_impl method.""" + def test_model_adapter_requires_chat_impl(self): + """Subclasses must implement _chat_impl method.""" from maseval.core.model import ModelAdapter class IncompleteAdapter(ModelAdapter): @@ -91,7 +93,7 @@ def test_generate_logs_successful_calls(self, dummy_model): # Verify required fields call = dummy_model.logs[0] assert "timestamp" in call - assert "prompt_length" in call + assert "message_count" in call assert "response_length" in call assert "duration_seconds" in call assert "status" in call @@ -146,7 +148,60 @@ def test_generate_with_empty_prompt(self): assert isinstance(result, str) assert len(model.logs) == 1 - assert model.logs[0]["prompt_length"] == 0 + # Empty prompt creates one message + assert model.logs[0]["message_count"] == 1 + + +@pytest.mark.core +class TestModelAdapterChatContract: + """Test chat() method behavior.""" + + def test_chat_returns_chat_response(self): + """chat() returns a ChatResponse object.""" + model = DummyModelAdapter(responses=["Test response"]) + result = model.chat([{"role": "user", "content": "Hello"}]) + + assert isinstance(result, ChatResponse) + assert result.content == "Test response" + assert result.role == "assistant" + + def test_chat_with_multiple_messages(self): + """chat() accepts multiple messages.""" + model = DummyModelAdapter(responses=["Response"]) + messages = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + ] + result = model.chat(messages) + + assert 
isinstance(result, ChatResponse) + assert model.logs[0]["message_count"] == 2 + + def test_chat_response_to_message(self): + """ChatResponse.to_message() returns dict.""" + model = DummyModelAdapter(responses=["Hello!"]) + result = model.chat([{"role": "user", "content": "Hi"}]) + + message = result.to_message() + assert isinstance(message, dict) + assert message["role"] == "assistant" + assert message["content"] == "Hello!" + + def test_chat_with_tool_calls(self): + """chat() returns tool_calls when provided.""" + tool_calls = [ + { + "id": "call_1", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + model = DummyModelAdapter(responses=[""], tool_calls=[tool_calls]) + result = model.chat([{"role": "user", "content": "Weather?"}]) + + assert result.tool_calls is not None + assert len(result.tool_calls) == 1 + assert result.tool_calls[0]["function"]["name"] == "get_weather" @pytest.mark.core @@ -157,7 +212,7 @@ def test_model_adapter_error_handling(self, dummy_model): """Test that errors are logged correctly.""" class FailingModel(DummyModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): raise ValueError("Test error") model = FailingModel() @@ -174,7 +229,7 @@ def test_generate_logs_error_timing(self): """generate() logs duration even when errors occur.""" class FailingModel(DummyModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): time.sleep(0.01) # Small delay raise RuntimeError("Fail") @@ -187,10 +242,10 @@ def _generate_impl(self, prompt, generation_params=None, **kwargs): assert call["duration_seconds"] >= 0.01 def test_generate_logs_error_metadata(self): - """generate() logs prompt length and params even on error.""" + """generate() logs message count and params even on error.""" class FailingModel(DummyModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): raise Exception("Fail") model = FailingModel() @@ -200,7 +255,7 @@ def _generate_impl(self, prompt, generation_params=None, **kwargs): model.generate("Test prompt", generation_params=params, custom="arg") call = model.logs[0] - assert call["prompt_length"] == len("Test prompt") + assert call["message_count"] == 1 assert call["generation_params"] == params assert "custom" in call["kwargs"] @@ -211,7 +266,7 @@ class CustomError(Exception): pass class FailingModel(DummyModelAdapter): - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): raise CustomError("Original error") model = FailingModel() @@ -285,11 +340,11 @@ def __init__(self): super().__init__() self.call_count = 0 - def _generate_impl(self, prompt, generation_params=None, **kwargs): + def _chat_impl(self, messages, generation_params=None, tools=None, tool_choice=None, **kwargs): self.call_count += 1 if self.call_count % 2 == 0: raise ValueError("Fail") - return "Success" + return ChatResponse(content="Success") model = SometimesFailingModel() diff --git a/tests/test_interface/test_model_integration/test_model_adapters.py b/tests/test_interface/test_model_integration/test_model_adapters.py index 0d1a7b3..8e73ac6 100644 --- 
a/tests/test_interface/test_model_integration/test_model_adapters.py +++ b/tests/test_interface/test_model_integration/test_model_adapters.py @@ -25,23 +25,40 @@ def test_openai_adapter_initialization(self): pytest.importorskip("openai") from maseval.interface.inference.openai import OpenAIModelAdapter - # Mock client - def mock_client(prompt, **kwargs): - return {"choices": [{"message": {"content": "Response"}}]} + # Mock client with chat.completions.create interface + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Response"}}]} + + completions = Completions() - adapter = OpenAIModelAdapter(client=mock_client, model_id="gpt-4") + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") assert adapter.model_id == "gpt-4" - def test_openai_adapter_generate_with_callable(self): - """OpenAIModelAdapter works with callable client.""" + def test_openai_adapter_generate_with_modern_client(self): + """OpenAIModelAdapter works with modern client interface.""" pytest.importorskip("openai") from maseval.interface.inference.openai import OpenAIModelAdapter - def mock_client(prompt, **kwargs): - return {"choices": [{"message": {"content": f"Response to: {prompt}"}}]} + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + # Extract user message content + user_msg = next((m for m in messages if m["role"] == "user"), {}) + content = user_msg.get("content", "") + return {"choices": [{"message": {"content": f"Response to: {content}"}}]} + + completions = Completions() + + chat = Chat() - adapter = OpenAIModelAdapter(client=mock_client, model_id="gpt-4") + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") result = adapter.generate("Test prompt") assert isinstance(result, str) @@ -53,24 +70,19 @@ def test_openai_adapter_extract_text_from_dict(self): from maseval.interface.inference.openai import OpenAIModelAdapter # Chat completion format - def chat_client(prompt, **kwargs): - return {"choices": [{"message": {"content": "Chat response"}}]} - - adapter = OpenAIModelAdapter(client=chat_client, model_id="gpt-4") - result = adapter.generate("Test") - assert result == "Chat response" + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Chat response"}}]} - def test_openai_adapter_extract_text_from_string(self): - """OpenAIModelAdapter handles string responses.""" - pytest.importorskip("openai") - from maseval.interface.inference.openai import OpenAIModelAdapter + completions = Completions() - def string_client(prompt, **kwargs): - return "Direct string response" + chat = Chat() - adapter = OpenAIModelAdapter(client=string_client, model_id="gpt-4") + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") result = adapter.generate("Test") - assert result == "Direct string response" + assert result == "Chat response" def test_openai_adapter_default_generation_params(self): """OpenAIModelAdapter uses default generation parameters.""" @@ -79,12 +91,19 @@ def test_openai_adapter_default_generation_params(self): captured_params = {} - def mock_client(prompt, **kwargs): - captured_params.update(kwargs) - return "Response" + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + captured_params.update(kwargs) + return {"choices": [{"message": {"content": "Response"}}]} + + 
completions = Completions() + + chat = Chat() adapter = OpenAIModelAdapter( - client=mock_client, + client=MockClient(), model_id="gpt-4", default_generation_params={"temperature": 0.7, "max_tokens": 100}, ) @@ -100,11 +119,18 @@ def test_openai_adapter_gather_config_includes_params(self): pytest.importorskip("openai") from maseval.interface.inference.openai import OpenAIModelAdapter - def mock_client(prompt, **kwargs): - return "Response" + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Response"}}]} + + completions = Completions() + + chat = Chat() adapter = OpenAIModelAdapter( - client=mock_client, + client=MockClient(), model_id="gpt-4", default_generation_params={"temperature": 0.9}, ) @@ -126,8 +152,14 @@ def __init__(self): self.timeout = 60 self.max_retries = 3 - def __call__(self, prompt, **kwargs): - return "Response" + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Response"}}]} + + completions = Completions() + + chat = Chat() client = MockOpenAIClient() adapter = OpenAIModelAdapter(client=client, model_id="gpt-4") @@ -176,10 +208,22 @@ def test_google_genai_adapter_generate(self): class MockClient: class Models: def generate_content(self, model, contents, config=None): + # Extract text from contents (first user message) + text = "" + if contents: + for content in contents: + if content.get("role") == "user": + parts = content.get("parts", []) + if parts: + text = parts[0].get("text", "") + break + class Response: - text = f"Response to: {contents}" + pass - return Response() + resp = Response() + resp.text = f"Response to: {text}" + return resp def __init__(self): self.models = self.Models() @@ -274,7 +318,7 @@ def mock_model(prompt, **kwargs): assert adapter.model_id == "gpt2" def test_huggingface_adapter_generate(self): - """HuggingFaceModelAdapter generates text.""" + """HuggingFaceModelAdapter generates text with message formatting.""" pytest.importorskip("transformers") from maseval.interface.inference.huggingface import HuggingFaceModelAdapter @@ -285,7 +329,8 @@ def mock_model(prompt, **kwargs): result = adapter.generate("Test prompt") assert isinstance(result, str) - assert result == "Generated: Test prompt" + # Without a tokenizer, the adapter formats messages as "user: content\nassistant:" + assert "Generated:" in result def test_huggingface_adapter_default_generation_params(self): """HuggingFaceModelAdapter uses default generation parameters.""" @@ -322,7 +367,8 @@ def mock_model(prompt): adapter = HuggingFaceModelAdapter(model=mock_model, model_id="gpt2") result = adapter.generate("Test") - assert result == "Response: Test" + # Should still work, just formats the prompt as messages + assert "Response:" in result def test_huggingface_adapter_gather_config(self): """HuggingFaceModelAdapter config includes parameters.""" @@ -440,8 +486,18 @@ def test_all_adapters_expose_model_id(self): from maseval.interface.inference.huggingface import HuggingFaceModelAdapter from maseval.interface.inference.litellm import LiteLLMModelAdapter - # OpenAI - openai_adapter = OpenAIModelAdapter(client=lambda p, **k: "R", model_id="gpt-4") + # OpenAI - mock with modern interface + class MockOpenAIClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "R"}}]} + + completions = Completions() + + chat = Chat() + + openai_adapter = 
OpenAIModelAdapter(client=MockOpenAIClient(), model_id="gpt-4") assert openai_adapter.model_id == "gpt-4" # Google GenAI @@ -482,8 +538,18 @@ def test_all_adapters_include_default_params_in_config(self): params = {"temperature": 0.7} # OpenAI + class MockOpenAIClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "R"}}]} + + completions = Completions() + + chat = Chat() + openai_config = OpenAIModelAdapter( - client=lambda p, **k: "R", + client=MockOpenAIClient(), model_id="gpt-4", default_generation_params=params, ).gather_config() diff --git a/uv.lock b/uv.lock index eea7ebc..e12ce79 100644 --- a/uv.lock +++ b/uv.lock @@ -170,6 +170,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "anthropic" +version = "0.75.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "docstring-parser" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/1f/08e95f4b7e2d35205ae5dcbb4ae97e7d477fc521c275c02609e2931ece2d/anthropic-0.75.0.tar.gz", hash = "sha256:e8607422f4ab616db2ea5baacc215dd5f028da99ce2f022e33c7c535b29f3dfb", size = 439565, upload-time = "2025-11-24T20:41:45.28Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/1c/1cd02b7ae64302a6e06724bf80a96401d5313708651d277b1458504a1730/anthropic-0.75.0-py3-none-any.whl", hash = "sha256:ea8317271b6c15d80225a9f3c670152746e88805a7a61e14d4a374577164965b", size = 388164, upload-time = "2025-11-24T20:41:43.587Z" }, +] + [[package]] name = "anyio" version = "4.12.0" @@ -849,6 +868,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, ] +[[package]] +name = "docstring-parser" +version = "0.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/9d/c3b43da9515bd270df0f80548d9944e389870713cc1fe2b8fb35fe2bcefd/docstring_parser-0.17.0.tar.gz", hash = "sha256:583de4a309722b3315439bb31d64ba3eebada841f2e2cee23b99df001434c912", size = 27442, upload-time = "2025-07-21T07:35:01.868Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896, upload-time = "2025-07-21T07:35:00.684Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.1" @@ -2407,6 +2435,7 @@ dependencies = [ [package.optional-dependencies] all = [ + { name = "anthropic" }, { name = "google-genai" }, { name = "ipykernel" }, { name = "ipywidgets" }, @@ -2424,7 +2453,11 @@ all = [ { name = "typing-extensions" }, { name = "wandb" }, ] +anthropic = [ + { name = "anthropic" }, +] examples = [ + { name = "anthropic" }, { name = "google-genai" }, { name = "ipykernel" }, { name = "ipywidgets" }, @@ -2486,6 
+2519,7 @@ docs = [ [package.metadata] requires-dist = [ + { name = "anthropic", marker = "extra == 'anthropic'", specifier = ">=0.40.0" }, { name = "gitpython", specifier = ">=3.1.0" }, { name = "google-genai", marker = "extra == 'google-genai'", specifier = ">=1.37.0" }, { name = "ipykernel", marker = "extra == 'examples'", specifier = ">=6.0.0" }, @@ -2498,7 +2532,7 @@ requires-dist = [ { name = "litellm", marker = "extra == 'litellm'", specifier = ">=1.0.0" }, { name = "llama-index-core", marker = "extra == 'llamaindex'", specifier = ">=0.12.0" }, { name = "maseval", extras = ["examples", "transformers", "wandb"], marker = "extra == 'all'" }, - { name = "maseval", extras = ["smolagents", "langgraph", "llamaindex", "openai", "google-genai", "litellm", "langfuse"], marker = "extra == 'examples'" }, + { name = "maseval", extras = ["smolagents", "langgraph", "llamaindex", "anthropic", "openai", "google-genai", "litellm", "langfuse"], marker = "extra == 'examples'" }, { name = "mcp", marker = "extra == 'examples'", specifier = ">=1.22.0" }, { name = "openai", marker = "extra == 'openai'", specifier = ">=1.107.2" }, { name = "pydantic", specifier = ">=2.12.5" }, @@ -2509,7 +2543,7 @@ requires-dist = [ { name = "typing-extensions", marker = "extra == 'examples'", specifier = ">=4.0.0" }, { name = "wandb", marker = "extra == 'wandb'", specifier = ">=0.15.0" }, ] -provides-extras = ["smolagents", "langgraph", "llamaindex", "openai", "google-genai", "transformers", "litellm", "wandb", "langfuse", "examples", "all"] +provides-extras = ["smolagents", "langgraph", "llamaindex", "anthropic", "openai", "google-genai", "transformers", "litellm", "wandb", "langfuse", "examples", "all"] [package.metadata.requires-dev] dev = [ From 3d4d19b25fb69d182dd1e463dbe937176588d9b4 Mon Sep 17 00:00:00 2001 From: cemde Date: Thu, 25 Dec 2025 13:45:57 +0100 Subject: [PATCH 2/6] fixed tests --- maseval/interface/inference/anthropic.py | 13 +++-------- maseval/interface/inference/google_genai.py | 8 ++----- maseval/interface/inference/huggingface.py | 20 ++++------------ maseval/interface/inference/litellm.py | 4 +--- maseval/interface/inference/openai.py | 11 +++------ .../test_macs/test_macs_evaluator.py | 19 +++++++-------- .../test_model_adapter_contract.py | 23 +++++++++++++------ tests/test_core/test_model_adapter.py | 1 - 8 files changed, 40 insertions(+), 59 deletions(-) diff --git a/maseval/interface/inference/anthropic.py b/maseval/interface/inference/anthropic.py index 0363d22..2b31f88 100644 --- a/maseval/interface/inference/anthropic.py +++ b/maseval/interface/inference/anthropic.py @@ -167,9 +167,7 @@ def _chat_impl( return self._parse_response(response) - def _convert_messages( - self, messages: List[Dict[str, Any]] - ) -> tuple[Optional[str], List[Dict[str, Any]]]: + def _convert_messages(self, messages: List[Dict[str, Any]]) -> tuple[Optional[str], List[Dict[str, Any]]]: """Convert OpenAI messages to Anthropic format. Anthropic separates system messages and uses different format for @@ -273,9 +271,7 @@ def _convert_tools(self, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]: return anthropic_tools - def _convert_tool_choice( - self, tool_choice: Union[str, Dict[str, Any]] - ) -> Dict[str, Any]: + def _convert_tool_choice(self, tool_choice: Union[str, Dict[str, Any]]) -> Dict[str, Any]: """Convert OpenAI tool_choice to Anthropic format. 
Args: @@ -341,10 +337,7 @@ def _parse_response(self, response: Any) -> ChatResponse: usage = { "input_tokens": getattr(response.usage, "input_tokens", 0), "output_tokens": getattr(response.usage, "output_tokens", 0), - "total_tokens": ( - getattr(response.usage, "input_tokens", 0) - + getattr(response.usage, "output_tokens", 0) - ), + "total_tokens": (getattr(response.usage, "input_tokens", 0) + getattr(response.usage, "output_tokens", 0)), } # Extract stop reason diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py index fe71ba7..80e4c53 100644 --- a/maseval/interface/inference/google_genai.py +++ b/maseval/interface/inference/google_genai.py @@ -161,15 +161,11 @@ def _chat_impl( generation_config = genai.types.GenerateContentConfig(**config_params) if config_params else None # Call API - response = self._client.models.generate_content( - model=self._model_id, contents=contents, config=generation_config - ) + response = self._client.models.generate_content(model=self._model_id, contents=contents, config=generation_config) return self._parse_response(response) - def _convert_messages( - self, messages: List[Dict[str, Any]] - ) -> tuple[Optional[str], List[Dict[str, Any]]]: + def _convert_messages(self, messages: List[Dict[str, Any]]) -> tuple[Optional[str], List[Dict[str, Any]]]: """Convert OpenAI messages to Google format. Google uses 'contents' with 'parts', and separates system instructions. diff --git a/maseval/interface/inference/huggingface.py b/maseval/interface/inference/huggingface.py index 7f7e541..5d20b56 100644 --- a/maseval/interface/inference/huggingface.py +++ b/maseval/interface/inference/huggingface.py @@ -172,9 +172,7 @@ def _chat_with_template( # Try to apply template with tools to check support try: # The template should accept tools parameter if it supports them - prompt = tokenizer.apply_chat_template( - messages, tools=tools, add_generation_prompt=True, tokenize=False - ) + prompt = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, tokenize=False) except TypeError: # Template doesn't accept tools parameter raise ToolCallingNotSupportedError( @@ -183,9 +181,7 @@ def _chat_with_template( "Consider using LiteLLMModelAdapter for reliable tool calling." ) else: - prompt = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) + prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) # Generate response response_text = self._call_model(prompt, params) @@ -206,9 +202,7 @@ def _chat_with_template( model=self._model_id, ) - def _chat_without_template( - self, messages: List[Dict[str, Any]], params: Dict[str, Any] - ) -> ChatResponse: + def _chat_without_template(self, messages: List[Dict[str, Any]], params: Dict[str, Any]) -> ChatResponse: """Generate without a chat template (simple prompt concatenation). Args: @@ -274,9 +268,7 @@ def _call_model(self, prompt: str, params: Dict[str, Any]) -> str: else: return str(result) - def _parse_tool_calls( - self, response: str - ) -> tuple[Optional[List[Dict[str, Any]]], Optional[str]]: + def _parse_tool_calls(self, response: str) -> tuple[Optional[List[Dict[str, Any]]], Optional[str]]: """Parse tool calls from model response. Different models format tool calls differently. 
This method attempts @@ -314,9 +306,7 @@ def _parse_tool_calls( }, } ) - remaining_content = remaining_content.replace( - f"{match}", "" - ) + remaining_content = remaining_content.replace(f"{match}", "") except json.JSONDecodeError: continue diff --git a/maseval/interface/inference/litellm.py b/maseval/interface/inference/litellm.py index a6d7259..f0b9866 100644 --- a/maseval/interface/inference/litellm.py +++ b/maseval/interface/inference/litellm.py @@ -124,9 +124,7 @@ def _chat_impl( try: import litellm except ImportError as e: - raise ImportError( - "LiteLLM is not installed. Install with: pip install maseval[litellm]" - ) from e + raise ImportError("LiteLLM is not installed. Install with: pip install maseval[litellm]") from e # Merge parameters params = dict(self._default_generation_params) diff --git a/maseval/interface/inference/openai.py b/maseval/interface/inference/openai.py index 26207aa..62bae77 100644 --- a/maseval/interface/inference/openai.py +++ b/maseval/interface/inference/openai.py @@ -123,18 +123,14 @@ def _chat_impl( # Call OpenAI API # Try the modern client interface first if hasattr(self._client, "chat") and hasattr(self._client.chat, "completions"): - response = self._client.chat.completions.create( - model=self._model_id, messages=messages, **params - ) + response = self._client.chat.completions.create(model=self._model_id, messages=messages, **params) else: # Fallback for older or custom clients response = self._call_legacy_client(messages, params) return self._parse_response(response) - def _call_legacy_client( - self, messages: List[Dict[str, Any]], params: Dict[str, Any] - ) -> Any: + def _call_legacy_client(self, messages: List[Dict[str, Any]], params: Dict[str, Any]) -> Any: """Handle older client interfaces or callables. Args: @@ -162,8 +158,7 @@ def _call_legacy_client( return self._client(model=self._model_id, messages=messages, **params) raise TypeError( - f"Unable to call client of type {type(self._client).__name__}. " - "Expected an OpenAI client with chat.completions.create() method." + f"Unable to call client of type {type(self._client).__name__}. Expected an OpenAI client with chat.completions.create() method." 
) def _parse_response(self, response: Any) -> ChatResponse: diff --git a/tests/test_benchmarks/test_macs/test_macs_evaluator.py b/tests/test_benchmarks/test_macs/test_macs_evaluator.py index ab1b627..e5f0deb 100644 --- a/tests/test_benchmarks/test_macs/test_macs_evaluator.py +++ b/tests/test_benchmarks/test_macs/test_macs_evaluator.py @@ -436,20 +436,21 @@ def test_call_system_includes_tool_invocations(self, sample_task, sample_trace, traces = {"messages": sample_trace, "tool_traces": sample_tool_traces} - # Capture the prompt sent to the model - captured_prompts = [] - original_generate = model._generate_impl + # Capture the messages sent to the model + captured_messages = [] + original_chat = model._chat_impl - def capture_prompt(prompt, *args, **kwargs): - captured_prompts.append(prompt) - return original_generate(prompt, *args, **kwargs) + def capture_messages(messages, *args, **kwargs): + captured_messages.append(messages) + return original_chat(messages, *args, **kwargs) - with patch.object(model, "_generate_impl", side_effect=capture_prompt): + with patch.object(model, "_chat_impl", side_effect=capture_messages): evaluator(traces) # Check that tool invocations were included in the prompt - assert len(captured_prompts) > 0 - prompt = captured_prompts[0] + assert len(captured_messages) > 0 + # The prompt is in the first user message content + prompt = captured_messages[0][0]["content"] assert "search_flights" in prompt or "book_flight" in prompt diff --git a/tests/test_contract/test_model_adapter_contract.py b/tests/test_contract/test_model_adapter_contract.py index e229d1b..c4eb34f 100644 --- a/tests/test_contract/test_model_adapter_contract.py +++ b/tests/test_contract/test_model_adapter_contract.py @@ -11,6 +11,7 @@ What this contract validates: - generate() returns string consistently +- chat() returns ChatResponse consistently - Call logging happens uniformly (successful and failed calls) - Timing capture works consistently - Trace structure is consistent across implementations (gather_traces) @@ -126,12 +127,19 @@ def create_openai_adapter(model_id: str = "gpt-4", responses: Optional[List[str] response_list: List[str] = responses or ["Test response"] call_count = [0] - def mock_client(prompt, **kwargs): - response = response_list[call_count[0] % len(response_list)] - call_count[0] += 1 - return {"choices": [{"message": {"content": response}}]} + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + response = response_list[call_count[0] % len(response_list)] + call_count[0] += 1 + return {"choices": [{"message": {"content": response}}]} + + completions = Completions() + + chat = Chat() - return OpenAIModelAdapter(client=mock_client, model_id=model_id) + return OpenAIModelAdapter(client=MockClient(), model_id=model_id) def create_google_genai_adapter(model_id: str = "gemini-pro", responses: Optional[List[str]] = None) -> Any: @@ -417,7 +425,8 @@ def test_adapter_handles_empty_prompt(self, implementation): traces = adapter.gather_traces() assert traces["total_calls"] == 1 - assert traces["logs"][0]["prompt_length"] == 0 + # Empty prompt still creates one message + assert traces["logs"][0]["message_count"] == 1 finally: cleanup_adapter(adapter, implementation) @@ -521,7 +530,7 @@ def test_all_adapters_log_same_call_metadata(self): assert "timestamp" in call, f"Missing timestamp in {impl}" assert "status" in call, f"Missing status in {impl}" assert "duration_seconds" in call, f"Missing duration in {impl}" - assert "prompt_length" in 
call, f"Missing prompt_length in {impl}" + assert "message_count" in call, f"Missing message_count in {impl}" finally: for adapter, impl in adapters: cleanup_adapter(adapter, impl) diff --git a/tests/test_core/test_model_adapter.py b/tests/test_core/test_model_adapter.py index bbcf3c6..e66861c 100644 --- a/tests/test_core/test_model_adapter.py +++ b/tests/test_core/test_model_adapter.py @@ -15,7 +15,6 @@ import json import time from datetime import datetime -from typing import Any, Dict, List, Optional, Union from conftest import DummyModelAdapter from maseval.core.model import ChatResponse From 203315ec4c040a2aadb1fa2935ce2289d23c0f66 Mon Sep 17 00:00:00 2001 From: cemde Date: Thu, 25 Dec 2025 16:07:10 +0100 Subject: [PATCH 3/6] improved testing --- .../test_model_adapter_contract.py | 92 +- .../test_model_adapters.py | 912 ++++++++++++++++++ 2 files changed, 1000 insertions(+), 4 deletions(-) diff --git a/tests/test_contract/test_model_adapter_contract.py b/tests/test_contract/test_model_adapter_contract.py index c4eb34f..c8fbd6a 100644 --- a/tests/test_contract/test_model_adapter_contract.py +++ b/tests/test_contract/test_model_adapter_contract.py @@ -29,6 +29,7 @@ from datetime import datetime from typing import Any, Dict, Optional, List from conftest import DummyModelAdapter +from maseval.core.model import ChatResponse # ==================== Helper Functions ==================== @@ -226,6 +227,43 @@ def create_dummy_adapter(model_id: str = "test-model", responses: Optional[List[ return DummyModelAdapter(model_id=model_id, responses=responses) +def create_anthropic_adapter(model_id: str = "claude-3", responses: Optional[List[str]] = None) -> Any: + """Create AnthropicModelAdapter instance.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + response_list: List[str] = responses or ["Test response"] + call_count = [0] + + class MockTextBlock: + type = "text" + + def __init__(self, text: str): + self.text = text + + class MockUsage: + input_tokens = 10 + output_tokens = 5 + + class MockMessages: + def create(self, **kwargs): + response = response_list[call_count[0] % len(response_list)] + call_count[0] += 1 + + class MockResponse: + content = [MockTextBlock(response)] + usage = MockUsage() + model = model_id + stop_reason = "end_turn" + + return MockResponse() + + class MockClient: + messages = MockMessages() + + return AnthropicModelAdapter(client=MockClient(), model_id=model_id) + + def create_adapter_for_implementation(implementation: str, model_id: str, responses: Optional[List[str]] = None) -> Any: """Factory function to create adapter for specified implementation.""" factories = { @@ -234,6 +272,7 @@ def create_adapter_for_implementation(implementation: str, model_id: str, respon "google_genai": create_google_genai_adapter, "huggingface": create_huggingface_adapter, "litellm": create_litellm_adapter, + "anthropic": create_anthropic_adapter, } if implementation not in factories: @@ -255,7 +294,7 @@ def cleanup_adapter(adapter: Any, implementation: str) -> None: @pytest.mark.contract @pytest.mark.interface -@pytest.mark.parametrize("implementation", ["dummy", "openai", "google_genai", "huggingface", "litellm"]) +@pytest.mark.parametrize("implementation", ["dummy", "openai", "google_genai", "huggingface", "litellm", "anthropic"]) class TestModelAdapterContract: """Verify all ModelAdapter implementations honor the same contract.""" @@ -270,6 +309,51 @@ def test_adapter_generate_returns_string(self, implementation): finally: 
cleanup_adapter(adapter, implementation) + def test_adapter_chat_returns_chat_response(self, implementation): + """All adapters return ChatResponse from chat().""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=["Test response"]) + + try: + result = adapter.chat([{"role": "user", "content": "Test prompt"}]) + assert isinstance(result, ChatResponse) + assert result.content is not None or result.tool_calls is not None + assert result.role == "assistant" + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_chat_handles_multi_turn(self, implementation): + """All adapters handle multi-turn conversations.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=["Response"]) + + try: + result = adapter.chat( + [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "How are you?"}, + ] + ) + assert isinstance(result, ChatResponse) + assert result.content is not None + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_chat_handles_system_message(self, implementation): + """All adapters handle system messages.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=["Response"]) + + try: + result = adapter.chat( + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello"}, + ] + ) + assert isinstance(result, ChatResponse) + assert result.content is not None + finally: + cleanup_adapter(adapter, implementation) + def test_adapter_traces_have_base_fields(self, implementation): """All adapters include required trace fields.""" adapter = create_adapter_for_implementation(implementation, model_id="test-model") @@ -453,7 +537,7 @@ class TestCrossAdapterConsistency: def test_all_adapters_have_consistent_trace_structure(self): """All adapter implementations have same base trace structure.""" - implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm"] + implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm", "anthropic"] adapters = [] try: @@ -485,7 +569,7 @@ def test_all_adapters_have_consistent_trace_structure(self): def test_all_adapters_have_consistent_config_structure(self): """All adapter implementations have same base config structure.""" - implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm"] + implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm", "anthropic"] adapters = [] try: @@ -511,7 +595,7 @@ def test_all_adapters_have_consistent_config_structure(self): def test_all_adapters_log_same_call_metadata(self): """All adapters log same metadata for each call.""" - implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm"] + implementations = ["dummy", "openai", "google_genai", "huggingface", "litellm", "anthropic"] adapters = [] try: diff --git a/tests/test_interface/test_model_integration/test_model_adapters.py b/tests/test_interface/test_model_integration/test_model_adapters.py index 8e73ac6..46ed43d 100644 --- a/tests/test_interface/test_model_integration/test_model_adapters.py +++ b/tests/test_interface/test_model_integration/test_model_adapters.py @@ -170,6 +170,228 @@ def create(self, model, messages, **kwargs): assert "client_type" in config assert config["client_type"] == "MockOpenAIClient" + def test_openai_adapter_tool_calls_response(self): + """OpenAIModelAdapter handles tool call 
responses.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockToolCall: + id = "call_123" + type = "function" + + class function: + name = "get_weather" + arguments = '{"city": "Paris"}' + + class MockMessage: + content = None + role = "assistant" + tool_calls = [MockToolCall()] + + class MockChoice: + message = MockMessage() + finish_reason = "tool_calls" + + class MockUsage: + prompt_tokens = 10 + completion_tokens = 5 + total_tokens = 15 + + class MockResponse: + choices = [MockChoice()] + usage = MockUsage() + model = "gpt-4" + + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return MockResponse() + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Weather?"}]) + + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "get_weather" + assert response.usage["input_tokens"] == 10 + assert response.stop_reason == "tool_calls" + + def test_openai_adapter_tools_parameter_passing(self): + """OpenAIModelAdapter passes tools to API.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + captured_kwargs = {} + + class MockMessage: + content = "I'll check the weather" + role = "assistant" + tool_calls = None + + class MockChoice: + message = MockMessage() + finish_reason = "stop" + + class MockResponse: + choices = [MockChoice()] + model = "gpt-4" + + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + tools = [{"type": "function", "function": {"name": "get_weather"}}] + adapter.chat( + [{"role": "user", "content": "Weather?"}], + tools=tools, + tool_choice="auto", + ) + + assert "tools" in captured_kwargs + assert captured_kwargs["tools"] == tools + assert captured_kwargs["tool_choice"] == "auto" + + def test_openai_adapter_legacy_client_fallback(self): + """OpenAIModelAdapter falls back to legacy client interface.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class LegacyClient: + def create(self, model, messages, **kwargs): + return {"choices": [{"message": {"content": "Legacy response"}}]} + + adapter = OpenAIModelAdapter(client=LegacyClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "Legacy response" + + def test_openai_adapter_callable_client(self): + """OpenAIModelAdapter falls back to calling client directly.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + def callable_client(model, messages, **kwargs): + return {"choices": [{"message": {"content": "Callable response"}}]} + + adapter = OpenAIModelAdapter(client=callable_client, model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "Callable response" + + def test_openai_adapter_text_format_response(self): + """OpenAIModelAdapter parses text format responses.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + 
class Completions: + def create(self, model, messages, **kwargs): + return {"choices": [{"text": "Completion text"}]} + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "Completion text" + + def test_openai_adapter_dict_response_with_tool_calls(self): + """OpenAIModelAdapter parses dict responses with tool calls.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockClient: + class Chat: + class Completions: + def create(self, model, messages, **kwargs): + return { + "choices": [ + { + "message": { + "content": None, + "tool_calls": [ + { + "id": "call_1", + "type": "function", + "function": {"name": "search", "arguments": "{}"}, + } + ], + }, + } + ], + } + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Search"}]) + + assert response.tool_calls is not None + assert response.tool_calls[0]["function"]["name"] == "search" + + def test_openai_adapter_fallback_without_model_param(self): + """OpenAIModelAdapter falls back to calling without model param.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class LegacyClient: + def create(self, messages, **kwargs): + # Only accepts messages, no model param + return {"choices": [{"message": {"content": "No model param"}}]} + + adapter = OpenAIModelAdapter(client=LegacyClient(), model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Hi"}]) + + assert response.content == "No model param" + + def test_openai_adapter_gather_config_with_timeout(self): + """OpenAIModelAdapter includes timeout in config.""" + pytest.importorskip("openai") + from maseval.interface.inference.openai import OpenAIModelAdapter + + class MockTimeout: + connect = 5.0 + read = 30.0 + write = 30.0 + pool = 10.0 + + class MockClient: + timeout = MockTimeout() + max_retries = 3 + + class Chat: + class Completions: + def create(self, **kwargs): + return {"choices": [{"message": {"content": "R"}}]} + + completions = Completions() + + chat = Chat() + + adapter = OpenAIModelAdapter(client=MockClient(), model_id="gpt-4") + config = adapter.gather_config() + + assert "client_config" in config + assert config["client_config"]["max_retries"] == 3 + # ==================== Google GenAI Tests ==================== @@ -297,6 +519,125 @@ def __init__(self): assert config["default_generation_params"]["temperature"] == 0.9 assert "client_type" in config + def test_google_genai_adapter_function_call_response(self): + """GoogleGenAIModelAdapter handles function call responses.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockFunctionCall: + name = "search_web" + args = {"query": "test"} + + class MockPart: + type = "function_call" + function_call = MockFunctionCall() + + class MockContent: + parts = [MockPart()] + + class MockCandidate: + content = MockContent() + finish_reason = "STOP" + + class MockUsage: + prompt_token_count = 20 + candidates_token_count = 10 + total_token_count = 30 + + class MockResponse: + text = None + candidates = [MockCandidate()] + usage_metadata = MockUsage() + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + return 
MockResponse() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + response = adapter.chat([{"role": "user", "content": "Search"}]) + + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "search_web" + assert response.usage["input_tokens"] == 20 + + def test_google_genai_adapter_tools_conversion(self): + """GoogleGenAIModelAdapter converts tools to Google format.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + captured_config = None + + class MockResponse: + text = "Response" + candidates = [] + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + nonlocal captured_config + captured_config = config + return MockResponse() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather", + "parameters": {"type": "object"}, + }, + } + ] + adapter.chat([{"role": "user", "content": "Weather?"}], tools=tools) + + assert captured_config is not None + + def test_google_genai_adapter_tool_choice_options(self): + """GoogleGenAIModelAdapter handles various tool_choice options.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockResponse: + text = "Response" + candidates = [] + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + return MockResponse() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + tools = [{"type": "function", "function": {"name": "test"}}] + + # Test different tool_choice values + for choice in ["none", "auto", "required"]: + response = adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice=choice, + ) + assert response is not None + + # Test specific function choice + response = adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice={"type": "function", "function": {"name": "test"}}, + ) + assert response is not None + # ==================== HuggingFace Tests ==================== @@ -415,6 +756,170 @@ def __call__(self, prompt, **kwargs): assert "cpu" in str(config["pipeline_config"]["device"]) assert config["pipeline_config"]["framework"] == "pt" + def test_huggingface_adapter_tools_raises_error_without_support(self): + """HuggingFaceModelAdapter raises error when tools not supported.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import ( + HuggingFaceModelAdapter, + ToolCallingNotSupportedError, + ) + + def mock_model(prompt, **kwargs): + return "Response" + + adapter = HuggingFaceModelAdapter(model=mock_model, model_id="test-model") + + with pytest.raises(ToolCallingNotSupportedError): + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=[{"type": "function", "function": {"name": "test"}}], + ) + + def test_huggingface_adapter_tools_raises_when_template_doesnt_support(self): + """HuggingFaceModelAdapter raises error when template doesn't support tools.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import ( + HuggingFaceModelAdapter, + 
ToolCallingNotSupportedError, + ) + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, **kwargs): + if "tools" in kwargs: + raise TypeError("Unexpected keyword argument 'tools'") + return "Formatted prompt" + + class MockPipeline: + tokenizer = MockTokenizer() + + def __call__(self, prompt, **kwargs): + return "Response" + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + + with pytest.raises(ToolCallingNotSupportedError): + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=[{"type": "function", "function": {"name": "test"}}], + ) + + def test_huggingface_adapter_chat_template_with_tools(self): + """HuggingFaceModelAdapter works when template supports tools.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, tools=None, **kwargs): + return "Formatted with tools" + + class MockPipeline: + tokenizer = MockTokenizer() + + def __call__(self, prompt, **kwargs): + return "Response" + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + response = adapter.chat( + [{"role": "user", "content": "Test"}], + tools=[{"type": "function", "function": {"name": "test"}}], + ) + + assert response is not None + + def test_huggingface_adapter_parses_tool_calls_from_output(self): + """HuggingFaceModelAdapter parses tool calls from model output.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, tools=None, **kwargs): + return "Prompt" + + class MockPipeline: + tokenizer = MockTokenizer() + + def __call__(self, prompt, **kwargs): + return '{"name": "search", "arguments": {"q": "test"}}' + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + response = adapter.chat( + [{"role": "user", "content": "Search"}], + tools=[{"type": "function", "function": {"name": "search"}}], + ) + + assert response.tool_calls is not None + assert len(response.tool_calls) >= 1 + assert any(tc["function"]["name"] == "search" for tc in response.tool_calls) + + def test_huggingface_adapter_chat_with_tokenizer(self): + """HuggingFaceModelAdapter uses chat template when available.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, **kwargs): + return "Formatted: " + messages[0]["content"] + + class MockPipeline: + tokenizer = MockTokenizer() + + def __call__(self, prompt, **kwargs): + return f"Response to: {prompt}" + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + response = adapter.chat([{"role": "user", "content": "Hello"}]) + + assert response.content is not None + + def test_huggingface_adapter_pipeline_response_format(self): + """HuggingFaceModelAdapter handles pipeline list response format.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + def mock_model(prompt, **kwargs): + return [{"generated_text": prompt + " Generated"}] + + adapter = HuggingFaceModelAdapter(model=mock_model, model_id="test-model") + response = adapter.chat([{"role": 
"user", "content": "Test"}]) + + assert "Generated" in response.content + + def test_huggingface_adapter_dict_response_format(self): + """HuggingFaceModelAdapter handles dict response format.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + def mock_model(prompt, **kwargs): + return {"generated_text": "Dict response"} + + adapter = HuggingFaceModelAdapter(model=mock_model, model_id="test-model") + response = adapter.chat([{"role": "user", "content": "Test"}]) + + assert response.content == "Dict response" + + def test_huggingface_adapter_nested_tokenizer(self): + """HuggingFaceModelAdapter gets tokenizer from model.model.tokenizer.""" + pytest.importorskip("transformers") + from maseval.interface.inference.huggingface import HuggingFaceModelAdapter + + class MockTokenizer: + def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False, **kwargs): + return "From nested tokenizer" + + class MockInnerModel: + tokenizer = MockTokenizer() + + class MockPipeline: + model = MockInnerModel() + + def __call__(self, prompt, **kwargs): + return "Response" + + adapter = HuggingFaceModelAdapter(model=MockPipeline(), model_id="test-model") + response = adapter.chat([{"role": "user", "content": "Test"}]) + + assert response is not None + # ==================== LiteLLM Tests ==================== @@ -466,6 +971,413 @@ def test_litellm_adapter_gather_config(self): assert config["default_generation_params"]["max_tokens"] == 200 assert config["model_id"] == "gpt-4" + def test_litellm_adapter_tool_calls_response(self): + """LiteLLMModelAdapter handles tool call responses.""" + pytest.importorskip("litellm") + import litellm + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + class MockToolCall: + id = "call_456" + type = "function" + + class function: + name = "calculator" + arguments = '{"expression": "2+2"}' + + class MockMessage: + content = None + role = "assistant" + tool_calls = [MockToolCall()] + + class MockChoice: + message = MockMessage() + finish_reason = "tool_calls" + + class MockUsage: + prompt_tokens = 15 + completion_tokens = 8 + total_tokens = 23 + + class MockResponse: + choices = [MockChoice()] + usage = MockUsage() + model = "gpt-4" + + original = litellm.completion + + def mock_completion(model, messages, **kwargs): + return MockResponse() + + litellm.completion = mock_completion + + try: + adapter = LiteLLMModelAdapter(model_id="gpt-4") + response = adapter.chat([{"role": "user", "content": "Calculate"}]) + + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "calculator" + assert response.usage["input_tokens"] == 15 + assert response.stop_reason == "tool_calls" + finally: + litellm.completion = original + + def test_litellm_adapter_tools_and_credentials_passing(self): + """LiteLLMModelAdapter passes tools and credentials.""" + pytest.importorskip("litellm") + import litellm + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + captured_kwargs = {} + + class MockMessage: + content = "Response" + role = "assistant" + tool_calls = None + + class MockChoice: + message = MockMessage() + finish_reason = "stop" + + class MockResponse: + choices = [MockChoice()] + + original = litellm.completion + + def mock_completion(model, messages, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + litellm.completion = mock_completion + + try: + adapter = 
LiteLLMModelAdapter( + model_id="gpt-4", + api_key="test-key", + api_base="https://test.api.com", + ) + tools = [{"type": "function", "function": {"name": "test"}}] + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice="required", + ) + + assert captured_kwargs["api_key"] == "test-key" + assert captured_kwargs["api_base"] == "https://test.api.com" + assert captured_kwargs["tools"] == tools + assert captured_kwargs["tool_choice"] == "required" + finally: + litellm.completion = original + + +# ==================== Anthropic Tests ==================== + + +@pytest.mark.interface +class TestAnthropicModelAdapterIntegration: + """Test AnthropicModelAdapter specific behavior.""" + + def test_anthropic_adapter_initialization(self): + """AnthropicModelAdapter initializes with client and model_id.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockClient: + pass + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + assert adapter.model_id == "claude-3" + + def test_anthropic_adapter_chat_basic(self): + """AnthropicModelAdapter handles basic chat.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockTextBlock: + type = "text" + text = "Hello! How can I help?" + + class MockUsage: + input_tokens = 10 + output_tokens = 8 + + class MockResponse: + content = [MockTextBlock()] + usage = MockUsage() + model = "claude-3" + stop_reason = "end_turn" + + class MockMessages: + def create(self, **kwargs): + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + response = adapter.chat([{"role": "user", "content": "Hello"}]) + + assert response.content == "Hello! How can I help?" + assert response.usage["input_tokens"] == 10 + assert response.stop_reason == "end_turn" + + def test_anthropic_adapter_tool_use_response(self): + """AnthropicModelAdapter handles tool use responses.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockToolUseBlock: + type = "tool_use" + id = "tool_123" + name = "get_weather" + input = {"city": "Paris"} + + class MockUsage: + input_tokens = 15 + output_tokens = 12 + + class MockResponse: + content = [MockToolUseBlock()] + usage = MockUsage() + model = "claude-3" + stop_reason = "tool_use" + + class MockMessages: + def create(self, **kwargs): + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + response = adapter.chat([{"role": "user", "content": "Weather?"}]) + + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "get_weather" + assert response.stop_reason == "tool_use" + + def test_anthropic_adapter_system_message_extraction(self): + """AnthropicModelAdapter extracts system message.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + captured_kwargs = {} + + class MockTextBlock: + type = "text" + text = "I'm helpful!" 
+ + class MockResponse: + content = [MockTextBlock()] + + class MockMessages: + def create(self, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + adapter.chat( + [ + {"role": "system", "content": "You are very helpful"}, + {"role": "user", "content": "Hi"}, + ] + ) + + assert captured_kwargs["system"] == "You are very helpful" + assert all(m["role"] != "system" for m in captured_kwargs["messages"]) + + def test_anthropic_adapter_tools_conversion(self): + """AnthropicModelAdapter converts tools to Anthropic format.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + captured_kwargs = {} + + class MockTextBlock: + type = "text" + text = "Response" + + class MockResponse: + content = [MockTextBlock()] + + class MockMessages: + def create(self, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + tools = [ + { + "type": "function", + "function": { + "name": "search", + "description": "Search the web", + "parameters": {"type": "object", "properties": {}}, + }, + } + ] + adapter.chat([{"role": "user", "content": "Search"}], tools=tools) + + assert "tools" in captured_kwargs + assert captured_kwargs["tools"][0]["name"] == "search" + assert "input_schema" in captured_kwargs["tools"][0] + + def test_anthropic_adapter_tool_choice_conversion(self): + """AnthropicModelAdapter converts tool_choice options.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + captured_kwargs = {} + + class MockTextBlock: + type = "text" + text = "Response" + + class MockResponse: + content = [MockTextBlock()] + + class MockMessages: + def create(self, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + tools = [{"type": "function", "function": {"name": "test"}}] + + # Test "required" -> "any" + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice="required", + ) + assert captured_kwargs["tool_choice"]["type"] == "any" + + # Test specific function + adapter.chat( + [{"role": "user", "content": "Test"}], + tools=tools, + tool_choice={"type": "function", "function": {"name": "test"}}, + ) + assert captured_kwargs["tool_choice"]["type"] == "tool" + assert captured_kwargs["tool_choice"]["name"] == "test" + + def test_anthropic_adapter_tool_result_conversion(self): + """AnthropicModelAdapter converts tool result messages.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + captured_kwargs = {} + + class MockTextBlock: + type = "text" + text = "Final answer" + + class MockResponse: + content = [MockTextBlock()] + + class MockMessages: + def create(self, **kwargs): + captured_kwargs.update(kwargs) + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + adapter.chat( + [ + {"role": "user", "content": "What's the weather?"}, + { + "role": "assistant", + "tool_calls": [ + { + "id": "tool_1", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, 
+ } + ], + }, + {"role": "tool", "tool_call_id": "tool_1", "content": "Sunny, 22°C"}, + ] + ) + + messages = captured_kwargs["messages"] + tool_result_msg = [m for m in messages if m["role"] == "user" and isinstance(m.get("content"), list)] + assert len(tool_result_msg) > 0 + + def test_anthropic_adapter_mixed_content_response(self): + """AnthropicModelAdapter handles mixed text and tool_use response.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockTextBlock: + type = "text" + text = "Let me check that for you." + + class MockToolUseBlock: + type = "tool_use" + id = "tool_456" + name = "lookup" + input = {"id": "123"} + + class MockUsage: + input_tokens = 20 + output_tokens = 15 + + class MockResponse: + content = [MockTextBlock(), MockToolUseBlock()] + usage = MockUsage() + model = "claude-3" + stop_reason = "tool_use" + + class MockMessages: + def create(self, **kwargs): + return MockResponse() + + class MockClient: + messages = MockMessages() + + adapter = AnthropicModelAdapter(client=MockClient(), model_id="claude-3") + response = adapter.chat([{"role": "user", "content": "Look up ID 123"}]) + + assert response.content == "Let me check that for you." + assert response.tool_calls is not None + assert len(response.tool_calls) == 1 + assert response.tool_calls[0]["function"]["name"] == "lookup" + + def test_anthropic_adapter_gather_config(self): + """AnthropicModelAdapter config includes parameters.""" + pytest.importorskip("anthropic") + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + class MockClient: + pass + + adapter = AnthropicModelAdapter( + client=MockClient(), + model_id="claude-3", + max_tokens=2048, + default_generation_params={"temperature": 0.8}, + ) + config = adapter.gather_config() + + assert config["model_id"] == "claude-3" + assert config["max_tokens"] == 2048 + assert config["default_generation_params"]["temperature"] == 0.8 + assert config["client_type"] == "MockClient" + # ==================== Cross-Adapter Tests ==================== From 8f98c7bb93b6cf313d38a0aa2369de9f0153ee0b Mon Sep 17 00:00:00 2001 From: cemde Date: Fri, 26 Dec 2025 12:16:58 +0100 Subject: [PATCH 4/6] improved call logs --- tests/conftest.py | 17 +- .../test_model_adapter_contract.py | 493 +++++++++++++++++- 2 files changed, 489 insertions(+), 21 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index fd40e9e..bd4d5e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,21 +31,30 @@ class DummyModelAdapter(ModelAdapter): def __init__( self, model_id: str = "test-model", - responses: Optional[List[str]] = None, - tool_calls: Optional[List[List[Dict[str, Any]]]] = None, + responses: Optional[List[Optional[str]]] = None, + tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None, + usage: Optional[Dict[str, int]] = None, + stop_reason: Optional[str] = None, ): """Initialize DummyModelAdapter. Args: model_id: Identifier for this model instance. responses: List of text responses to return. Cycles through the list. + Can include None for tool-only responses. tool_calls: Optional list of tool call lists. If provided, each call returns the corresponding tool_calls (cycling through the list). + Can include None for text-only responses. + usage: Optional usage dict to include in all responses. Should have + input_tokens, output_tokens, total_tokens. + stop_reason: Optional stop_reason to include in all responses. 
""" super().__init__() self._model_id = model_id - self._responses = responses or ["test response"] + self._responses: List[Optional[str]] = responses or ["test response"] self._tool_calls = tool_calls + self._usage = usage + self._stop_reason = stop_reason self._call_count = 0 @property @@ -86,6 +95,8 @@ def _chat_impl( tool_calls=response_tool_calls, role="assistant", model=self._model_id, + usage=self._usage, + stop_reason=self._stop_reason, ) diff --git a/tests/test_contract/test_model_adapter_contract.py b/tests/test_contract/test_model_adapter_contract.py index c8fbd6a..6815f39 100644 --- a/tests/test_contract/test_model_adapter_contract.py +++ b/tests/test_contract/test_model_adapter_contract.py @@ -120,21 +120,36 @@ def assert_base_config_fields(config: Dict[str, Any], model_id: Optional[str] = # ==================== Adapter Factory Functions ==================== -def create_openai_adapter(model_id: str = "gpt-4", responses: Optional[List[str]] = None) -> Any: +def create_openai_adapter( + model_id: str = "gpt-4", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create OpenAIModelAdapter instance.""" pytest.importorskip("openai") from maseval.interface.inference.openai import OpenAIModelAdapter response_list: List[str] = responses or ["Test response"] + tool_calls_list = tool_calls call_count = [0] class MockClient: class Chat: class Completions: def create(self, model, messages, **kwargs): - response = response_list[call_count[0] % len(response_list)] + response_text = response_list[call_count[0] % len(response_list)] + response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 - return {"choices": [{"message": {"content": response}}]} + + # Mock response structure + message = {"content": response_text, "role": "assistant"} + + if response_tool_calls: + message["tool_calls"] = response_tool_calls + + return { + "choices": [{"message": message, "finish_reason": "stop"}], + "model": model, + "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}, + } completions = Completions() @@ -143,12 +158,15 @@ def create(self, model, messages, **kwargs): return OpenAIModelAdapter(client=MockClient(), model_id=model_id) -def create_google_genai_adapter(model_id: str = "gemini-pro", responses: Optional[List[str]] = None) -> Any: +def create_google_genai_adapter( + model_id: str = "gemini-pro", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create GoogleGenAIModelAdapter instance.""" pytest.importorskip("google.genai") from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter response_list: List[str] = responses or ["Test response"] + tool_calls_list = tool_calls call_count = [0] class MockClient: @@ -168,7 +186,9 @@ def __init__(self): return GoogleGenAIModelAdapter(client=MockClient(), model_id=model_id) -def create_huggingface_adapter(model_id: str = "gpt2", responses: Optional[List[str]] = None) -> Any: +def create_huggingface_adapter( + model_id: str = "gpt2", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create HuggingFaceModelAdapter instance.""" pytest.importorskip("transformers") from maseval.interface.inference.huggingface import HuggingFaceModelAdapter @@ -184,7 +204,9 @@ def mock_model(prompt, **kwargs): return HuggingFaceModelAdapter(model=mock_model, model_id=model_id) 
-def create_litellm_adapter(model_id: str = "gpt-3.5-turbo", responses: Optional[List[str]] = None) -> Any: +def create_litellm_adapter( + model_id: str = "gpt-3.5-turbo", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create LiteLLMModelAdapter instance.""" pytest.importorskip("litellm") import litellm @@ -192,21 +214,36 @@ def create_litellm_adapter(model_id: str = "gpt-3.5-turbo", responses: Optional[ # Mock litellm.completion response_list: List[str] = responses or ["Test response"] + tool_calls_list = tool_calls call_count = [0] original_completion = litellm.completion def mock_completion(model, messages, **kwargs): response = response_list[call_count[0] % len(response_list)] + response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 class MockMessage: - content = response + def __init__(self): + self.content = response + self.role = "assistant" + self.tool_calls = response_tool_calls class MockChoice: - message = MockMessage() + def __init__(self): + self.message = MockMessage() + self.finish_reason = "stop" + + class MockUsage: + prompt_tokens = 10 + completion_tokens = 20 + total_tokens = 30 class MockResponse: - choices = [MockChoice()] + def __init__(self): + self.choices = [MockChoice()] + self.usage = MockUsage() + self.model = model return MockResponse() @@ -221,18 +258,24 @@ class MockResponse: return adapter -def create_dummy_adapter(model_id: str = "test-model", responses: Optional[List[str]] = None) -> DummyModelAdapter: +def create_dummy_adapter( + model_id: str = "test-model", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> DummyModelAdapter: """Create DummyModelAdapter instance.""" responses = responses or ["Test response"] - return DummyModelAdapter(model_id=model_id, responses=responses) + usage = {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} + return DummyModelAdapter(model_id=model_id, responses=responses, tool_calls=tool_calls, usage=usage, stop_reason="stop") -def create_anthropic_adapter(model_id: str = "claude-3", responses: Optional[List[str]] = None) -> Any: +def create_anthropic_adapter( + model_id: str = "claude-3", responses: Optional[List[str]] = None, tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None +) -> Any: """Create AnthropicModelAdapter instance.""" pytest.importorskip("anthropic") from maseval.interface.inference.anthropic import AnthropicModelAdapter response_list: List[str] = responses or ["Test response"] + tool_calls_list = tool_calls call_count = [0] class MockTextBlock: @@ -241,6 +284,16 @@ class MockTextBlock: def __init__(self, text: str): self.text = text + class MockToolUseBlock: + type = "tool_use" + + def __init__(self, tool_call: Dict[str, Any]): + self.id = tool_call["id"] + self.name = tool_call["function"]["name"] + import json + + self.input = json.loads(tool_call["function"]["arguments"]) + class MockUsage: input_tokens = 10 output_tokens = 5 @@ -248,13 +301,20 @@ class MockUsage: class MockMessages: def create(self, **kwargs): response = response_list[call_count[0] % len(response_list)] + response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 class MockResponse: - content = [MockTextBlock(response)] - usage = MockUsage() - model = model_id - stop_reason = "end_turn" + def __init__(self): + self.content = [] + if response: + 
self.content.append(MockTextBlock(response)) + if response_tool_calls: + for tc in response_tool_calls: + self.content.append(MockToolUseBlock(tc)) + self.usage = MockUsage() + self.model = model_id + self.stop_reason = "end_turn" return MockResponse() @@ -264,7 +324,12 @@ class MockClient: return AnthropicModelAdapter(client=MockClient(), model_id=model_id) -def create_adapter_for_implementation(implementation: str, model_id: str, responses: Optional[List[str]] = None) -> Any: +def create_adapter_for_implementation( + implementation: str, + model_id: str, + responses: Optional[List[Optional[str]]] = None, + tool_calls: Optional[List[Optional[List[Dict[str, Any]]]]] = None, +) -> Any: """Factory function to create adapter for specified implementation.""" factories = { "dummy": create_dummy_adapter, @@ -278,7 +343,7 @@ def create_adapter_for_implementation(implementation: str, model_id: str, respon if implementation not in factories: raise ValueError(f"Unknown implementation: {implementation}") - return factories[implementation](model_id=model_id, responses=responses) + return factories[implementation](model_id=model_id, responses=responses, tool_calls=tool_calls) def cleanup_adapter(adapter: Any, implementation: str) -> None: @@ -618,3 +683,395 @@ def test_all_adapters_log_same_call_metadata(self): finally: for adapter, impl in adapters: cleanup_adapter(adapter, impl) + + +# ==================== Tool Calling Contract Tests ==================== + + +@pytest.mark.contract +@pytest.mark.interface +@pytest.mark.parametrize("implementation", ["dummy", "openai", "litellm", "anthropic"]) +class TestToolCallingContract: + """Contract tests for tool calling functionality across adapters. + + These tests verify that tool-related features work consistently across + all model adapters that support tools. This is critical for users building + agentic systems that need to swap between providers. + + Note: Only testing adapters that support tools (OpenAI, Anthropic, LiteLLM, Dummy). + HuggingFace and GoogleGenAI don't fully support tool calling in their current implementation. 
+ """ + + def test_adapter_accepts_tools_parameter(self, implementation): + """All adapters accept tools parameter without error.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}, + }, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "What's the weather in Paris?"}], tools=tools) + assert isinstance(result, ChatResponse) + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_accepts_tool_choice_parameter(self, implementation): + """All adapters accept tool_choice parameter without error.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}, + }, + } + ] + + try: + # Test different tool_choice values + for tool_choice in ["auto", "none", "required"]: + result = adapter.chat([{"role": "user", "content": "What's the weather?"}], tools=tools, tool_choice=tool_choice) + assert isinstance(result, ChatResponse) + + # Test specific tool selection + result = adapter.chat( + [{"role": "user", "content": "What's the weather?"}], + tools=tools, + tool_choice={"type": "function", "function": {"name": "get_weather"}}, + ) + assert isinstance(result, ChatResponse) + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_returns_tool_calls_in_response(self, implementation): + """All adapters return tool_calls with consistent structure.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + adapter = create_adapter_for_implementation( + implementation, model_id="test-model", responses=["I'll check the weather"], tool_calls=tool_calls_to_return + ) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "What's the weather in Paris?"}], tools=tools) + + assert result.tool_calls is not None, f"{implementation} did not return tool_calls" + assert isinstance(result.tool_calls, list) + assert len(result.tool_calls) > 0 + + # Verify structure of first tool call + tc = result.tool_calls[0] + assert "id" in tc, f"{implementation} tool_call missing 'id'" + assert "type" in tc, f"{implementation} tool_call missing 'type'" + assert "function" in tc, f"{implementation} tool_call missing 'function'" + assert "name" in tc["function"], f"{implementation} tool_call function missing 'name'" + assert "arguments" in tc["function"], f"{implementation} tool_call function missing 'arguments'" + + # Verify types + assert isinstance(tc["id"], str) + assert isinstance(tc["type"], str) + assert isinstance(tc["function"]["name"], str) + assert isinstance(tc["function"]["arguments"], str) # JSON string + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_handles_tool_result_messages(self, implementation): + """All adapters handle role='tool' messages in conversations.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + 
+ # Simulate a conversation with tool use + messages = [ + {"role": "user", "content": "What's the weather in Paris?"}, + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_123", "content": '{"temperature": 72, "condition": "sunny"}'}, + {"role": "user", "content": "What about London?"}, + ] + + try: + result = adapter.chat(messages) + assert isinstance(result, ChatResponse) + # Should not raise an error + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_handles_assistant_messages_with_tool_calls(self, implementation): + """All adapters handle assistant messages containing tool_calls.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + # Include an assistant message with tool_calls in the history + messages = [ + {"role": "user", "content": "Get weather for Paris"}, + { + "role": "assistant", + "content": "I'll check the weather for you.", + "tool_calls": [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ], + }, + {"role": "tool", "tool_call_id": "call_123", "content": '{"temperature": 72}'}, + {"role": "user", "content": "Thanks!"}, + ] + + try: + result = adapter.chat(messages) + assert isinstance(result, ChatResponse) + # Should process the conversation history without error + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_tool_calls_logs_correctly(self, implementation): + """All adapters log tool-related calls consistently.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + adapter = create_adapter_for_implementation( + implementation, model_id="test-model", responses=["I'll check"], tool_calls=tool_calls_to_return + ) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + adapter.chat([{"role": "user", "content": "Weather?"}], tools=tools) + + traces = adapter.gather_traces() + assert traces["total_calls"] == 1 + assert len(traces["logs"]) == 1 + + call_log = traces["logs"][0] + assert "response_type" in call_log + assert call_log["response_type"] == "tool_call" + assert "tool_calls_count" in call_log + assert call_log["tool_calls_count"] == 1 + assert "tools_provided" in call_log + assert call_log["tools_provided"] == 1 + finally: + cleanup_adapter(adapter, implementation) + + +# ==================== Usage and Metadata Contract Tests ==================== + + +@pytest.mark.contract +@pytest.mark.interface +@pytest.mark.parametrize("implementation", ["dummy", "openai", "litellm", "anthropic"]) +class TestUsageAndMetadataContract: + """Contract tests for usage tracking and response metadata. + + These tests ensure consistent reporting of token usage, stop reasons, + and other metadata across all adapters. This is important for evaluation + and cost tracking in production systems. + + Note: Only testing adapters with full metadata support (OpenAI, Anthropic, LiteLLM, Dummy). 
+ """ + + def test_adapter_returns_usage_info(self, implementation): + """All adapters return consistent usage information.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + try: + result = adapter.chat([{"role": "user", "content": "Hello"}]) + + # Usage should be present and have required fields + if result.usage is not None: # Some adapters might not support this + assert isinstance(result.usage, dict) + assert "input_tokens" in result.usage + assert "output_tokens" in result.usage + assert "total_tokens" in result.usage + + assert isinstance(result.usage["input_tokens"], int) + assert isinstance(result.usage["output_tokens"], int) + assert isinstance(result.usage["total_tokens"], int) + + assert result.usage["input_tokens"] >= 0 + assert result.usage["output_tokens"] >= 0 + assert result.usage["total_tokens"] >= 0 + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_returns_stop_reason(self, implementation): + """All adapters return stop_reason in responses.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model") + + try: + result = adapter.chat([{"role": "user", "content": "Hello"}]) + + # stop_reason should be present + if result.stop_reason is not None: # Some adapters might not support this + assert isinstance(result.stop_reason, str) + assert len(result.stop_reason) > 0 + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_stop_reason_for_tool_calls(self, implementation): + """All adapters indicate tool use in stop_reason when applicable.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=[None], tool_calls=tool_calls_to_return) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "Weather?"}], tools=tools) + + # When tool_calls are returned, should have a stop_reason + # (The exact value may vary: "tool_calls", "tool_use", "function_call", etc.) 
+ if result.stop_reason is not None: + assert isinstance(result.stop_reason, str) + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_handles_content_none_with_tool_calls(self, implementation): + """All adapters handle responses with content=None and only tool_calls.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + # Response with None content, only tool_calls + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=[None], tool_calls=tool_calls_to_return) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "What's the weather?"}], tools=tools) + + assert isinstance(result, ChatResponse) + # content can be None when model only returns tool calls + assert result.tool_calls is not None, f"{implementation} should return tool_calls when content is None" + assert isinstance(result.tool_calls, list) + assert len(result.tool_calls) > 0 + + # Verify the response is still valid + msg = result.to_message() + assert isinstance(msg, dict) + assert msg["role"] == "assistant" + assert "tool_calls" in msg + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_to_message_includes_tool_calls(self, implementation): + """All adapters include tool_calls in to_message() output.""" + tool_calls_to_return = [ + [ + { + "id": "call_123", + "type": "function", + "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}, + } + ] + ] + + adapter = create_adapter_for_implementation( + implementation, model_id="test-model", responses=["I'll check"], tool_calls=tool_calls_to_return + ) + + tools = [ + { + "type": "function", + "function": {"name": "get_weather", "description": "Get weather", "parameters": {"type": "object", "properties": {}}}, + } + ] + + try: + result = adapter.chat([{"role": "user", "content": "Weather?"}], tools=tools) + + msg = result.to_message() + assert isinstance(msg, dict) + assert msg["role"] == "assistant" + assert "tool_calls" in msg, f"{implementation} to_message() should include tool_calls" + assert isinstance(msg["tool_calls"], list) + assert len(msg["tool_calls"]) > 0 + finally: + cleanup_adapter(adapter, implementation) + + def test_adapter_usage_tracking_across_calls(self, implementation): + """All adapters consistently report usage across multiple calls.""" + adapter = create_adapter_for_implementation(implementation, model_id="test-model", responses=["R1", "R2"]) + + try: + result1 = adapter.chat([{"role": "user", "content": "First"}]) + result2 = adapter.chat([{"role": "user", "content": "Second"}]) + + # Both should have usage (if supported) + if result1.usage is not None and result2.usage is not None: + assert isinstance(result1.usage, dict) + assert isinstance(result2.usage, dict) + + # Structure should be consistent + assert set(result1.usage.keys()) == set(result2.usage.keys()) + finally: + cleanup_adapter(adapter, implementation) From 71610bb919754f1e768d05c0ebc04dc79a87917a Mon Sep 17 00:00:00 2001 From: cemde Date: Fri, 26 Dec 2025 12:32:01 +0100 Subject: [PATCH 5/6] fixed test file --- .../test_model_adapter_contract.py | 71 +++++++++++++++++-- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/tests/test_contract/test_model_adapter_contract.py 
b/tests/test_contract/test_model_adapter_contract.py index 6815f39..cb7dc33 100644 --- a/tests/test_contract/test_model_adapter_contract.py +++ b/tests/test_contract/test_model_adapter_contract.py @@ -171,14 +171,49 @@ def create_google_genai_adapter( class MockClient: class Models: - def generate_content(self, model, contents, config=None): + def generate_content(self_inner, model, contents, config=None): response = response_list[call_count[0] % len(response_list)] + response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 - class Response: - text = response + # Build mock response with function calls if tool_calls provided + if response_tool_calls: - return Response() + class MockFunctionCall: + def __init__(self, name, args): + self.name = name + self.args = args + + class MockPart: + def __init__(self, tc_dict): + self.type = "function_call" + func = tc_dict.get("function", {}) + args_str = func.get("arguments", "{}") + import json + + self.function_call = MockFunctionCall(func.get("name", ""), json.loads(args_str) if args_str else {}) + + class MockContent: + def __init__(self): + self.parts = [MockPart(tc) for tc in response_tool_calls] + + class MockCandidate: + def __init__(self): + self.content = MockContent() + self.finish_reason = "STOP" + + class MockResponse: + text = None + candidates = [MockCandidate()] + + return MockResponse() + else: + + class Response: + text = response + candidates = [] + + return Response() def __init__(self): self.models = self.Models() @@ -220,19 +255,41 @@ def create_litellm_adapter( def mock_completion(model, messages, **kwargs): response = response_list[call_count[0] % len(response_list)] - response_tool_calls = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None + response_tool_calls_dicts = tool_calls_list[call_count[0] % len(tool_calls_list)] if tool_calls_list else None call_count[0] += 1 + # Convert dict tool_calls to objects with attributes (like real LiteLLM returns) + mock_tool_calls = None + if response_tool_calls_dicts: + mock_tool_calls = [] + for tc_dict in response_tool_calls_dicts: + + class MockFunction: + pass + + class MockToolCall: + pass + + func = MockFunction() + func.name = tc_dict.get("function", {}).get("name", "") + func.arguments = tc_dict.get("function", {}).get("arguments", "{}") + + tc = MockToolCall() + tc.id = tc_dict.get("id", "") + tc.type = tc_dict.get("type", "function") + tc.function = func + mock_tool_calls.append(tc) + class MockMessage: def __init__(self): self.content = response self.role = "assistant" - self.tool_calls = response_tool_calls + self.tool_calls = mock_tool_calls class MockChoice: def __init__(self): self.message = MockMessage() - self.finish_reason = "stop" + self.finish_reason = "tool_calls" if mock_tool_calls else "stop" class MockUsage: prompt_tokens = 10 From 2c0a804ed294810d711e91e606ebe6d2c633f264 Mon Sep 17 00:00:00 2001 From: cemde Date: Fri, 26 Dec 2025 12:43:24 +0100 Subject: [PATCH 6/6] fixed docs --- docs/interface/inference/anthropic.md | 7 +++++++ docs/reference/model.md | 8 ++++++++ maseval/core/model.py | 3 +-- mkdocs.yml | 1 + 4 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 docs/interface/inference/anthropic.md diff --git a/docs/interface/inference/anthropic.md b/docs/interface/inference/anthropic.md new file mode 100644 index 0000000..477608a --- /dev/null +++ b/docs/interface/inference/anthropic.md @@ -0,0 +1,7 @@ +# Anthropic Inference Adapter + +This page 
documents the [Anthropic](https://docs.anthropic.com/) model adapter for MASEval. + +[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/interface/inference/anthropic.py){ .md-source-file } + +::: maseval.interface.inference.anthropic.AnthropicModelAdapter diff --git a/docs/reference/model.md b/docs/reference/model.md index e2b421e..1569d93 100644 --- a/docs/reference/model.md +++ b/docs/reference/model.md @@ -25,3 +25,11 @@ The following adapter classes implement the ModelAdapter interface for specific [:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/interface/inference/google_genai.py){ .md-source-file } ::: maseval.interface.inference.google_genai.GoogleGenAIModelAdapter + +[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/interface/inference/litellm.py){ .md-source-file } + +::: maseval.interface.inference.litellm.LiteLLMModelAdapter + +[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/interface/inference/anthropic.py){ .md-source-file } + +::: maseval.interface.inference.anthropic.AnthropicModelAdapter diff --git a/maseval/core/model.py b/maseval/core/model.py index 69dfa83..d1de156 100644 --- a/maseval/core/model.py +++ b/maseval/core/model.py @@ -4,8 +4,7 @@ implement. It defines a consistent interface for interacting with LLMs across different providers (OpenAI, Anthropic, Google, HuggingFace, LiteLLM, etc.). -Concrete implementations for specific inference providers are in: - maseval.interface.inference +See `maseval.interface.inference` for concrete implementations. Example: ```python diff --git a/mkdocs.yml b/mkdocs.yml index 8161f8a..76695b6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -114,6 +114,7 @@ nav: - LlamaIndex: interface/agents/llamaindex.md - SmolAgents: interface/agents/smolagents.md - Models: + - Anthropic: interface/inference/anthropic.md - Google GenAI: interface/inference/google_genai.md - HuggingFace: interface/inference/huggingface.md - LiteLLM: interface/inference/litellm.md
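
---

Taken together, the new contract tests (`test_adapter_handles_tool_result_messages`, `test_adapter_to_message_includes_tool_calls`) pin down one tool-use round trip: the assistant turn is appended to the history via `ChatResponse.to_message()`, each result comes back as a `role: "tool"` message keyed by `tool_call_id`, and the adapter is called again. The sketch below walks that loop against the new `AnthropicModelAdapter`; the `get_weather` handler, the `claude-3` model id, and the bare `anthropic.Anthropic()` client construction are illustrative assumptions, not part of the patch.

```python
import json

import anthropic

from maseval import ChatResponse
from maseval.interface.inference.anthropic import AnthropicModelAdapter


def get_weather(city: str) -> str:
    # Hypothetical tool handler standing in for whatever the harness executes.
    return json.dumps({"city": city, "temperature": 22, "condition": "sunny"})


adapter = AnthropicModelAdapter(client=anthropic.Anthropic(), model_id="claude-3")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

messages = [{"role": "user", "content": "What's the weather in Paris?"}]
response: ChatResponse = adapter.chat(messages, tools=tools)

while response.tool_calls:
    # Keep the assistant turn (content and tool_calls) in the history.
    messages.append(response.to_message())
    for call in response.tool_calls:
        # "arguments" is a JSON string, per the ChatResponse contract.
        args = json.loads(call["function"]["arguments"])
        messages.append(
            {
                "role": "tool",
                "tool_call_id": call["id"],
                "content": get_weather(**args),
            }
        )
    response = adapter.chat(messages, tools=tools)

print(response.content, response.usage, response.stop_reason)
```

Because every adapter accepts OpenAI-format messages and returns the same `ChatResponse` shape, the loop above should run unchanged against `OpenAIModelAdapter` or `LiteLLMModelAdapter`, which is exactly what the parametrized contract tests assert.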