Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions docs/evaluating.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,9 @@ def set_system_prompt(self, system_prompt: str) -> None:
self.system_prompt = system_prompt
```

#### `get_last_response_metadata()` - Get response metadata (optional but recommended)
```python
def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()
```
#### `last_response_metadata` - Response metadata (required)

The base class `__init__` initializes it to `{}`. Update it in `generate_response()` by assigning a new dict: `self.last_response_metadata = {...}`. For in-place updates (e.g. setting a single `"usage"` key), write through `self._last_response_metadata` instead, so the stored dict itself is modified — mutating `self.last_response_metadata[...]` would only change a copy. The property getter returns a copy, so callers can read `last_response_metadata` without mutating the client's internal state.

### 3. Add the new LLM client to the factory

Expand Down Expand Up @@ -227,6 +224,18 @@ python3 judge.py -f conversations/{YOUR_FOLDER} -j your-model-name
- **LangChain Integration**: The provided implementations use LangChain for robust LLM interactions
- **Error Handling**: Make sure to handle errors gracefully and return appropriate error messages

### Conversation flow and history

ConversationSimulator holds the full conversation and passes `conversation_history` into your client on every call. Your client is not required to store history. You can:

- **Stateless**: Build each request from `conversation_history` (as the built-in clients do), or
- **Server-side state**: Send a `conversation_id` to your API and let the server maintain the conversation; in that case you may use `conversation_history` only when needed (e.g. fallback or logging).

**When your endpoint requires a conversation id** (the built-in clients do not; this is for custom clients):

- `conversation_id` is set in the base class `__init__`, so you always have one to send as request metadata. Use `self.conversation_id` when your API needs a conversation ID.
- For LLM clients that require `conversation_id` handling, `generate_response()` must set `conversation_id` in `_last_response_metadata` (an interface requirement). If your API returns its own `conversation_id` in the response metadata (e.g. it ignores the one you send), call `self._update_conversation_id_from_metadata()` at the end of `generate_response()`, after `_last_response_metadata` has been set; this overwrites `self.conversation_id` with the API's value.

## Structured Output Support

### Native Support (Recommended)
Expand Down
2 changes: 1 addition & 1 deletion generate_conversations/conversation_simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ async def start_conversation(
input_message=input_msg,
response_message=lc_message,
early_termination=False,
logging_metadata=current_speaker.get_last_response_metadata(),
logging_metadata=current_speaker.last_response_metadata,
)
self.conversation_history.append(turn_obj)

Expand Down
82 changes: 46 additions & 36 deletions generate_conversations/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,30 @@ def __init__(
self.max_total_words = max_total_words
self.max_personas = max_personas

self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
)

async def run_single_conversation(
self,
persona_config: dict,
agent,
max_turns: int,
conversation_id: int,
conversation_index: int,
run_number: int,
**kargs: dict,
**kwargs: dict,
) -> Dict[str, Any]:
"""Run a single conversation asynchronously."""
"""Run a single simulated conversation (persona vs provider LLM).

Uses fresh LLM instances per call; safe for concurrent use. Logs turns,
writes transcript to self.folder_name, then cleans up logger and LLMs.

Args:
persona_config (dict): Must have "model", "prompt", "name".
max_turns (int): Max conversation turns for a conversation.
conversation_index (int): Index in the batch of conversations.
run_number (int): Run index for this prompt (e.g. 1 of runs_per_prompt).
**kwargs: Unused; reserved for future use.

Returns:
Dict[str, Any]: index, llm1_model, llm1_prompt, run_number, turns,
filename, log_file, duration, early_termination, conversation.
"""
model_name = persona_config["model"]
system_prompt = persona_config["prompt"] # This is now the full persona prompt
persona_name = persona_config["name"]
Expand All @@ -83,7 +93,7 @@ async def run_single_conversation(
logger = setup_conversation_logger(filename_base, run_id=self.run_id)
start_time = time.time()

# Create LLM1 instance with the persona prompt and configuration
# Create persona instance
persona = LLMFactory.create_llm(
model_name=model_name,
name=f"{model_short} {persona_name}",
Expand All @@ -92,6 +102,23 @@ async def run_single_conversation(
**self.persona_model_config,
)

# Create new agent instance to reset conversation_id and metadata.
# Exclude selected kwargs to avoid duplicate args expected in create_llm.
agent_kwargs = {
k: v
for k, v in self.agent_model_config.items()
if k not in ("model", "name", "system_prompt")
}
agent = LLMFactory.create_llm(
model_name=self.agent_model_config["model"],
name=self.agent_model_config.get("name", "Provider"),
system_prompt=self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would maybe have this as an optional arg with the default, since it seems a potentially consequential decision and it's buried here

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

),
role=Role.PROVIDER,
**agent_kwargs,
)

# Log conversation start
log_conversation_start(
logger=logger,
Expand Down Expand Up @@ -148,7 +175,7 @@ async def run_single_conversation(
simulator.save_conversation(f"{filename_base}.txt", self.folder_name)

result = {
"id": conversation_id,
"index": conversation_index,
"llm1_model": model_name,
"llm1_prompt": persona_name,
"run_number": run_number,
Expand All @@ -164,11 +191,12 @@ async def run_single_conversation(

# Cleanup LLM resources (e.g., close HTTP sessions for Azure)
# Always cleanup, even if conversation failed
try:
await persona.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup persona LLM: {e}")
for llm in (persona, agent):
try:
await llm.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup LLM: {e}")

return result

Expand All @@ -179,37 +207,26 @@ async def run_conversations(
# Load prompts from CSV based on persona names
personas = load_prompts_from_csv(persona_names, max_personas=self.max_personas)

# Load agent configuration (fixed, shared across all conversations)
agent = LLMFactory.create_llm(
model_name=self.agent_model_config["model"],
name=self.agent_model_config.pop("name"),
system_prompt=self.AGENT_SYSTEM_PROMPT,
role=Role.PROVIDER,
**self.agent_model_config,
)

# Create tasks for all conversations (each prompt run multiple times)
tasks = []
conversation_id = 1
conversation_index = 1

for persona in personas:
for run in range(1, self.runs_per_prompt + 1):
tasks.append(
# TODO: should we pass the persona object here?
self.run_single_conversation(
{
"model": self.persona_model_config["model"],
"prompt": persona["prompt"],
"name": persona["Name"],
"run": run,
},
agent,
self.max_turns,
conversation_id,
conversation_index,
run,
)
)
conversation_id += 1
conversation_index += 1

# Run all conversations with concurrency limit
start_time = datetime.now()
Expand Down Expand Up @@ -237,11 +254,4 @@ async def run_with_limit(task):

print(f"\nCompleted {len(results)} conversations in {total_time:.2f} seconds")

# Cleanup agent LLM resources (e.g., close HTTP sessions for Azure)
try:
await agent.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup agent LLM: {e}")

return results
10 changes: 3 additions & 7 deletions llm_clients/azure_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,19 +187,19 @@ async def generate_response(
# Extract token usage
if "token_usage" in metadata:
usage = metadata["token_usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
Copy link
Collaborator

@emily-vanark emily-vanark Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is off-topic, but... do we save this metadata anywhere? It doesn't seem to be in the logging output for chat generation, or the logs output for judging... but if we have total token usage for each conversation and judging evaluation somewhere that we could write out, it would help... everyone with understanding costs. (This is probably a separate ticket... but only if we really aren't storing it anywhere.)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}

# Extract finish reason
self.last_response_metadata["finish_reason"] = metadata.get(
self._last_response_metadata["finish_reason"] = metadata.get(
"finish_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

return response.text
except Exception as e:
Expand Down Expand Up @@ -307,10 +307,6 @@ async def generate_structured_response(
}
raise RuntimeError(f"Error generating structured response: {str(e)}") from e

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
    """Replace the client's stored system prompt with ``system_prompt``."""
    self.system_prompt = system_prompt
Expand Down
12 changes: 5 additions & 7 deletions llm_clients/claude_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,20 @@ async def generate_response(
# Extract token usage
if "usage" in metadata:
usage = metadata["usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("input_tokens", 0)
+ usage.get("output_tokens", 0),
}

# Extract stop reason
self.last_response_metadata["stop_reason"] = metadata.get("stop_reason")
self._last_response_metadata["stop_reason"] = metadata.get(
"stop_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

return response.text
except Exception as e:
Expand Down Expand Up @@ -204,10 +206,6 @@ async def generate_structured_response(
}
raise RuntimeError(f"Error generating structured response: {str(e)}") from e

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
    """Overwrite the system prompt used by this client."""
    self.system_prompt = system_prompt
12 changes: 4 additions & 8 deletions llm_clients/gemini_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ async def generate_response(
# Extract token usage - Gemini may have different structure
if "usage_metadata" in metadata:
usage = metadata["usage_metadata"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"prompt_token_count": usage.get("prompt_token_count", 0),
"candidates_token_count": usage.get(
"candidates_token_count", 0
Expand All @@ -126,19 +126,19 @@ async def generate_response(
elif "token_usage" in metadata:
# Fallback structure
usage = metadata["token_usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}

# Extract finish reason
self.last_response_metadata["finish_reason"] = metadata.get(
self._last_response_metadata["finish_reason"] = metadata.get(
"finish_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

return response.text
except Exception as e:
Expand All @@ -154,10 +154,6 @@ async def generate_response(
}
return f"Error generating response: {str(e)}"

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

async def generate_structured_response(
self, message: Optional[str], response_model: Type[T]
) -> T:
Expand Down
47 changes: 44 additions & 3 deletions llm_clients/llm_interface.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import copy
import uuid
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, Dict, List, Optional, Type, TypeVar
Expand Down Expand Up @@ -31,7 +33,39 @@ def __init__(
self.name = name
self.role = role
self.system_prompt = system_prompt or ""
self.last_response_metadata: Dict[str, Any] = {}
self._last_response_metadata: Dict[str, Any] = {}
self.conversation_id = self.create_conversation_id()

@property
def last_response_metadata(self) -> Dict[str, Any]:
    """Metadata recorded by the most recent ``generate_response`` call.

    A deep copy is returned, so callers may freely mutate the result
    (including nested dicts such as ``"usage"``) without affecting the
    client's internal state.
    """
    return copy.deepcopy(self._last_response_metadata)

@last_response_metadata.setter
def last_response_metadata(self, value: Optional[Dict[str, Any]]) -> None:
    """Replace the stored metadata; a falsy value resets it to ``{}``.

    For in-place updates, mutate ``_last_response_metadata`` directly.
    """
    self._last_response_metadata = value if value else {}

def create_conversation_id(self) -> str:
    """Generate a fresh, globally unique conversation identifier.

    Invoked at construction time, and used again whenever the backing
    API does not return an id of its own in the response metadata.
    Subclasses may override this to produce ids in a different format.
    """
    new_id = uuid.uuid4()
    return str(new_id)

def _update_conversation_id_from_metadata(self) -> None:
"""If the API returned a conversation_id in response metadata, use it.

Call after generate_response once _last_response_metadata is set.
APIs that ignore our request conversation_id but return their own
will overwrite self.conversation_id here.
"""
cid = (self._last_response_metadata or {}).get("conversation_id")
if cid is not None:
self.conversation_id = cid

@abstractmethod
async def generate_response(
Expand All @@ -49,8 +83,15 @@ async def generate_response(
starting the conversation.

Returns:
str: The response text. Metadata available via
get_last_response_metadata()
str: The response text. Metadata in self.last_response_metadata
(getter returns a copy so callers need not copy).

Note:
For API thread/session identification, use self.conversation_id
(set at init; send as request metadata). If your API returns a
conversation_id in response metadata, call
self._update_conversation_id_from_metadata() after setting
_last_response_metadata to overwrite.
"""
pass

Expand Down
4 changes: 0 additions & 4 deletions llm_clients/ollama_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,6 @@ async def generate_response(
}
return f"Error generating response: {str(e)}"

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
    """Set or update the system prompt; any previous value is discarded."""
    self.system_prompt = system_prompt
Loading