diff --git a/docs/evaluating.md b/docs/evaluating.md
index ac14a85b..7e0bfdeb 100644
--- a/docs/evaluating.md
+++ b/docs/evaluating.md
@@ -186,12 +186,9 @@ def set_system_prompt(self, system_prompt: str) -> None:
     self.system_prompt = system_prompt
 ```
 
-#### `get_last_response_metadata()` - Get response metadata (optional but recommended)
-```python
-def get_last_response_metadata(self) -> Dict[str, Any]:
-    """Get metadata from the last response."""
-    return self.last_response_metadata.copy()
-```
+#### `last_response_metadata` - Response metadata (required)
+
+Set in `__init__` (base sets it to `{}`). Update it in `generate_response()`: assign with `self.last_response_metadata = {...}`. If you need in-place updates (e.g. `self.last_response_metadata["usage"] = ...`), use `self._last_response_metadata` so the stored dict is updated. The property getter returns a copy so callers can use `last_response_metadata` without mutating the client's dict.
 
 ### 3. Add the new LLM client to the factory
 
@@ -227,6 +224,18 @@ python3 judge.py -f conversations/{YOUR_FOLDER} -j your-model-name
 - **LangChain Integration**: The provided implementations use LangChain for robust LLM interactions
 - **Error Handling**: Make sure to handle errors gracefully and return appropriate error messages
 
+### Conversation flow and history
+
+ConversationSimulator holds the full conversation and passes `conversation_history` into your client on every call. Your client is not required to store history. You can:
+
+- **Stateless**: Build each request from `conversation_history` (as the built-in clients do), or
+- **Server-side state**: Send a `conversation_id` to your API and let the server maintain the conversation; in that case you may use `conversation_history` only when needed (e.g. fallback or logging).
+
+**When your endpoint requires a conversation id** (the built-in clients do not; this is for custom clients):
+
+- `conversation_id` is set in the base class `__init__`, so you always have one to send as request metadata. Use `self.conversation_id` when your API needs a conversation ID.
+- For LLM clients that require `conversation_id` handling, in `generate_response()`, you must set `conversation_id` in `_last_response_metadata` (interface requirement). If your API returns its own `conversation_id` in the response metadata (e.g. it ignores the one we send), call `self._update_conversation_id_from_metadata()` at the end of `generate_response()` after setting `_last_response_metadata`; that overwrites `self.conversation_id` with the API’s value.
+
 ## Structured Output Support
 
 ### Native Support (Recommended)
diff --git a/generate_conversations/conversation_simulator.py b/generate_conversations/conversation_simulator.py
index 48d0d9c1..df1172b3 100644
--- a/generate_conversations/conversation_simulator.py
+++ b/generate_conversations/conversation_simulator.py
@@ -110,7 +110,7 @@ async def start_conversation(
                 input_message=input_msg,
                 response_message=lc_message,
                 early_termination=False,
-                logging_metadata=current_speaker.get_last_response_metadata(),
+                logging_metadata=current_speaker.last_response_metadata,
             )
             self.conversation_history.append(turn_obj)
 
diff --git a/generate_conversations/runner.py b/generate_conversations/runner.py
index 3ced3f72..2735d880 100644
--- a/generate_conversations/runner.py
+++ b/generate_conversations/runner.py
@@ -49,20 +49,30 @@ def __init__(
         self.max_total_words = max_total_words
         self.max_personas = max_personas
 
-        self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get(
-            "system_prompt", "You are a helpful AI assistant."
-        )
-
     async def run_single_conversation(
         self,
         persona_config: dict,
-        agent,
         max_turns: int,
-        conversation_id: int,
+        conversation_index: int,
         run_number: int,
-        **kargs: dict,
+        **kwargs: dict,
     ) -> Dict[str, Any]:
-        """Run a single conversation asynchronously."""
+        """Run a single simulated conversation (persona vs provider LLM).
+
+        Uses fresh LLM instances per call; safe for concurrent use. Logs turns,
+        writes transcript to self.folder_name, then cleans up logger and LLMs.
+
+        Args:
+            persona_config (dict): Must have "model", "prompt", "name".
+            max_turns (int): Max conversation turns for a conversation.
+            conversation_index (int): Index in the batch of conversations.
+            run_number (int): Run index for this prompt (e.g. 1 of runs_per_prompt).
+            **kwargs: Unused; reserved for future use.
+
+        Returns:
+            Dict[str, Any]: index, llm1_model, llm1_prompt, run_number, turns,
+            filename, log_file, duration, early_termination, conversation.
+        """
         model_name = persona_config["model"]
         system_prompt = persona_config["prompt"]  # This is now the full persona prompt
         persona_name = persona_config["name"]
@@ -83,7 +93,7 @@ async def run_single_conversation(
         logger = setup_conversation_logger(filename_base, run_id=self.run_id)
         start_time = time.time()
 
-        # Create LLM1 instance with the persona prompt and configuration
+        # Create persona instance
         persona = LLMFactory.create_llm(
             model_name=model_name,
             name=f"{model_short} {persona_name}",
@@ -92,6 +102,23 @@ async def run_single_conversation(
             **self.persona_model_config,
         )
 
+        # Create new agent instance to reset conversation_id and metadata.
+        # Exclude selected kwargs to avoid duplicate args expected in create_llm.
+        agent_kwargs = {
+            k: v
+            for k, v in self.agent_model_config.items()
+            if k not in ("model", "name", "system_prompt")
+        }
+        agent = LLMFactory.create_llm(
+            model_name=self.agent_model_config["model"],
+            name=self.agent_model_config.get("name", "Provider"),
+            system_prompt=self.agent_model_config.get(
+                "system_prompt", "You are a helpful AI assistant."
+            ),
+            role=Role.PROVIDER,
+            **agent_kwargs,
+        )
+
         # Log conversation start
         log_conversation_start(
             logger=logger,
@@ -148,7 +175,7 @@ async def run_single_conversation(
             simulator.save_conversation(f"{filename_base}.txt", self.folder_name)
 
             result = {
-                "id": conversation_id,
+                "index": conversation_index,
                 "llm1_model": model_name,
                 "llm1_prompt": persona_name,
                 "run_number": run_number,
@@ -164,11 +191,12 @@ async def run_single_conversation(
 
             # Cleanup LLM resources (e.g., close HTTP sessions for Azure)
             # Always cleanup, even if conversation failed
-            try:
-                await persona.cleanup()
-            except Exception as e:
-                # Log but don't fail if cleanup fails
-                print(f"Warning: Failed to cleanup persona LLM: {e}")
+            for llm in (persona, agent):
+                try:
+                    await llm.cleanup()
+                except Exception as e:
+                    # Log but don't fail if cleanup fails
+                    print(f"Warning: Failed to cleanup LLM: {e}")
 
         return result
 
@@ -179,23 +207,13 @@ async def run_conversations(
         # Load prompts from CSV based on persona names
         personas = load_prompts_from_csv(persona_names, max_personas=self.max_personas)
 
-        # Load agent configuration (fixed, shared across all conversations)
-        agent = LLMFactory.create_llm(
-            model_name=self.agent_model_config["model"],
-            name=self.agent_model_config.pop("name"),
-            system_prompt=self.AGENT_SYSTEM_PROMPT,
-            role=Role.PROVIDER,
-            **self.agent_model_config,
-        )
-
         # Create tasks for all conversations (each prompt run multiple times)
         tasks = []
-        conversation_id = 1
+        conversation_index = 1
 
         for persona in personas:
             for run in range(1, self.runs_per_prompt + 1):
                 tasks.append(
-                    # TODO: should we pass the persona object here?
                     self.run_single_conversation(
                         {
                             "model": self.persona_model_config["model"],
@@ -203,13 +221,12 @@ async def run_conversations(
                             "name": persona["Name"],
                             "run": run,
                         },
-                        agent,
                         self.max_turns,
-                        conversation_id,
+                        conversation_index,
                         run,
                     )
                 )
-                conversation_id += 1
+                conversation_index += 1
 
         # Run all conversations with concurrency limit
         start_time = datetime.now()
@@ -237,11 +254,4 @@ async def run_with_limit(task):
 
         print(f"\nCompleted {len(results)} conversations in {total_time:.2f} seconds")
 
-        # Cleanup agent LLM resources (e.g., close HTTP sessions for Azure)
-        try:
-            await agent.cleanup()
-        except Exception as e:
-            # Log but don't fail if cleanup fails
-            print(f"Warning: Failed to cleanup agent LLM: {e}")
-
         return results
diff --git a/llm_clients/azure_llm.py b/llm_clients/azure_llm.py
index d0205284..835fef5c 100644
--- a/llm_clients/azure_llm.py
+++ b/llm_clients/azure_llm.py
@@ -187,19 +187,19 @@ async def generate_response(
                 # Extract token usage
                 if "token_usage" in metadata:
                     usage = metadata["token_usage"]
-                    self.last_response_metadata["usage"] = {
+                    self._last_response_metadata["usage"] = {
                         "input_tokens": usage.get("input_tokens", 0),
                         "output_tokens": usage.get("output_tokens", 0),
                         "total_tokens": usage.get("total_tokens", 0),
                     }
 
                 # Extract finish reason
-                self.last_response_metadata["finish_reason"] = metadata.get(
+                self._last_response_metadata["finish_reason"] = metadata.get(
                     "finish_reason"
                 )
 
                 # Store raw metadata
-                self.last_response_metadata["raw_metadata"] = dict(metadata)
+                self._last_response_metadata["raw_metadata"] = dict(metadata)
 
             return response.text
         except Exception as e:
@@ -307,10 +307,6 @@ async def generate_structured_response(
             }
             raise RuntimeError(f"Error generating structured response: {str(e)}") from e
 
-    def get_last_response_metadata(self) -> Dict[str, Any]:
-        """Get metadata from the last response."""
-        return self.last_response_metadata.copy()
-
     def set_system_prompt(self, system_prompt: str) -> None:
         """Set or update the system prompt."""
         self.system_prompt = system_prompt
diff --git a/llm_clients/claude_llm.py b/llm_clients/claude_llm.py
index 656e96f7..6daf3064 100644
--- a/llm_clients/claude_llm.py
+++ b/llm_clients/claude_llm.py
@@ -118,7 +118,7 @@ async def generate_response(
                 # Extract token usage
                 if "usage" in metadata:
                     usage = metadata["usage"]
-                    self.last_response_metadata["usage"] = {
+                    self._last_response_metadata["usage"] = {
                         "input_tokens": usage.get("input_tokens", 0),
                         "output_tokens": usage.get("output_tokens", 0),
                         "total_tokens": usage.get("input_tokens", 0)
@@ -126,10 +126,12 @@ async def generate_response(
                     }
 
                 # Extract stop reason
-                self.last_response_metadata["stop_reason"] = metadata.get("stop_reason")
+                self._last_response_metadata["stop_reason"] = metadata.get(
+                    "stop_reason"
+                )
 
                 # Store raw metadata
-                self.last_response_metadata["raw_metadata"] = dict(metadata)
+                self._last_response_metadata["raw_metadata"] = dict(metadata)
 
             return response.text
         except Exception as e:
@@ -204,10 +206,6 @@ async def generate_structured_response(
             }
             raise RuntimeError(f"Error generating structured response: {str(e)}") from e
 
-    def get_last_response_metadata(self) -> Dict[str, Any]:
-        """Get metadata from the last response."""
-        return self.last_response_metadata.copy()
-
     def set_system_prompt(self, system_prompt: str) -> None:
         """Set or update the system prompt."""
         self.system_prompt = system_prompt
diff --git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py
index 23795811..f298ced6 100644
--- a/llm_clients/gemini_llm.py
+++ b/llm_clients/gemini_llm.py
@@ -116,7 +116,7 @@ async def generate_response(
                 # Extract token usage - Gemini may have different structure
                 if "usage_metadata" in metadata:
                     usage = metadata["usage_metadata"]
-                    self.last_response_metadata["usage"] = {
+                    self._last_response_metadata["usage"] = {
                         "prompt_token_count": usage.get("prompt_token_count", 0),
                         "candidates_token_count": usage.get(
                             "candidates_token_count", 0
@@ -126,19 +126,19 @@ async def generate_response(
                 elif "token_usage" in metadata:
                     # Fallback structure
                     usage = metadata["token_usage"]
-                    self.last_response_metadata["usage"] = {
+                    self._last_response_metadata["usage"] = {
                         "prompt_tokens": usage.get("prompt_tokens", 0),
                         "completion_tokens": usage.get("completion_tokens", 0),
                         "total_tokens": usage.get("total_tokens", 0),
                     }
 
                 # Extract finish reason
-                self.last_response_metadata["finish_reason"] = metadata.get(
+                self._last_response_metadata["finish_reason"] = metadata.get(
                     "finish_reason"
                 )
 
                 # Store raw metadata
-                self.last_response_metadata["raw_metadata"] = dict(metadata)
+                self._last_response_metadata["raw_metadata"] = dict(metadata)
 
             return response.text
         except Exception as e:
@@ -154,10 +154,6 @@ async def generate_response(
             }
             return f"Error generating response: {str(e)}"
 
-    def get_last_response_metadata(self) -> Dict[str, Any]:
-        """Get metadata from the last response."""
-        return self.last_response_metadata.copy()
-
     async def generate_structured_response(
         self, message: Optional[str], response_model: Type[T]
     ) -> T:
diff --git a/llm_clients/llm_interface.py b/llm_clients/llm_interface.py
index f19a35e7..683b1904 100644
--- a/llm_clients/llm_interface.py
+++ b/llm_clients/llm_interface.py
@@ -1,3 +1,5 @@
+import copy
+import uuid
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Any, Dict, List, Optional, Type, TypeVar
@@ -31,7 +33,39 @@ def __init__(
         self.name = name
         self.role = role
         self.system_prompt = system_prompt or ""
-        self.last_response_metadata: Dict[str, Any] = {}
+        self._last_response_metadata: Dict[str, Any] = {}
+        self.conversation_id = self.create_conversation_id()
+
+    @property
+    def last_response_metadata(self) -> Dict[str, Any]:
+        """Metadata from the last generate_response call. Returns a deep copy so
+        callers cannot mutate internal state (including nested dicts like usage).
+        """
+        return copy.deepcopy(self._last_response_metadata)
+
+    @last_response_metadata.setter
+    def last_response_metadata(self, value: Optional[Dict[str, Any]]) -> None:
+        """Set metadata; use _last_response_metadata for in-place updates."""
+        self._last_response_metadata = value or {}
+
+    def create_conversation_id(self) -> str:
+        """Create a new unique conversation id.
+
+        Used at init and when the API does not return one in response metadata.
+        Subclasses may override to use a different id format.
+        """
+        return str(uuid.uuid4())
+
+    def _update_conversation_id_from_metadata(self) -> None:
+        """If the API returned a conversation_id in response metadata, use it.
+
+        Call after generate_response once _last_response_metadata is set.
+        APIs that ignore our request conversation_id but return their own
+        will overwrite self.conversation_id here.
+        """
+        cid = (self._last_response_metadata or {}).get("conversation_id")
+        if cid is not None:
+            self.conversation_id = cid
 
     @abstractmethod
     async def generate_response(
@@ -49,8 +83,15 @@ async def generate_response(
                 starting the conversation.
 
         Returns:
-            str: The response text. Metadata available via
-                get_last_response_metadata()
+            str: The response text. Metadata in self.last_response_metadata
+                (getter returns a copy so callers need not copy).
+
+        Note:
+            For API thread/session identification, use self.conversation_id
+            (set at init; send as request metadata). If your API returns a
+            conversation_id in response metadata, call
+            self._update_conversation_id_from_metadata() after setting
+            _last_response_metadata to overwrite.
         """
         pass
 
diff --git a/llm_clients/ollama_llm.py b/llm_clients/ollama_llm.py
index cf203e84..92e20f48 100644
--- a/llm_clients/ollama_llm.py
+++ b/llm_clients/ollama_llm.py
@@ -134,10 +134,6 @@ async def generate_response(
             }
             return f"Error generating response: {str(e)}"
 
-    def get_last_response_metadata(self) -> Dict[str, Any]:
-        """Get metadata from the last response."""
-        return self.last_response_metadata.copy()
-
     def set_system_prompt(self, system_prompt: str) -> None:
         """Set or update the system prompt."""
         self.system_prompt = system_prompt
diff --git a/llm_clients/openai_llm.py b/llm_clients/openai_llm.py
index e6d69534..1dae9c06 100644
--- a/llm_clients/openai_llm.py
+++ b/llm_clients/openai_llm.py
@@ -110,7 +110,7 @@ async def generate_response(
 
             # Extract additional_kwargs if available
             if hasattr(response, "additional_kwargs") and response.additional_kwargs:
-                self.last_response_metadata["additional_kwargs"] = dict(
+                self._last_response_metadata["additional_kwargs"] = dict(
                     response.additional_kwargs
                 )
 
@@ -120,34 +120,34 @@ async def generate_response(
 
                 # Update model name from response metadata
                 if "model_name" in metadata:
-                    self.last_response_metadata["model"] = metadata["model_name"]
+                    self._last_response_metadata["model"] = metadata["model_name"]
 
                 # Extract token usage from response_metadata
                 if "token_usage" in metadata:
                     token_usage = metadata["token_usage"]
-                    self.last_response_metadata["usage"] = {
+                    self._last_response_metadata["usage"] = {
                         "prompt_tokens": token_usage.get("prompt_tokens", 0),
                         "completion_tokens": token_usage.get("completion_tokens", 0),
                         "total_tokens": token_usage.get("total_tokens", 0),
                     }
 
                 # Extract other metadata fields
-                self.last_response_metadata["finish_reason"] = metadata.get(
+                self._last_response_metadata["finish_reason"] = metadata.get(
                     "finish_reason"
                 )
-                self.last_response_metadata["system_fingerprint"] = metadata.get(
+                self._last_response_metadata["system_fingerprint"] = metadata.get(
                     "system_fingerprint"
                 )
-                self.last_response_metadata["logprobs"] = metadata.get("logprobs")
+                self._last_response_metadata["logprobs"] = metadata.get("logprobs")
 
                 # Store raw response_metadata
-                self.last_response_metadata["raw_response_metadata"] = dict(metadata)
+                self._last_response_metadata["raw_response_metadata"] = dict(metadata)
 
             # Extract usage_metadata if available (separate from response_metadata)
             if hasattr(response, "usage_metadata") and response.usage_metadata:
                 usage_meta = response.usage_metadata
                 # Merge with existing usage info, preferring usage_metadata values
-                self.last_response_metadata["usage"].update(
+                self._last_response_metadata["usage"].update(
                     {
                         "input_tokens": usage_meta.get("input_tokens", 0),
                         "output_tokens": usage_meta.get("output_tokens", 0),
@@ -155,7 +155,7 @@ async def generate_response(
                     }
                 )
                 # Store raw usage_metadata
-                self.last_response_metadata["raw_usage_metadata"] = dict(usage_meta)
+                self._last_response_metadata["raw_usage_metadata"] = dict(usage_meta)
 
             return response.text
         except Exception as e:
@@ -233,10 +233,6 @@ async def generate_structured_response(
             }
             raise RuntimeError(f"Error generating structured response: {str(e)}") from e
 
-    def get_last_response_metadata(self) -> Dict[str, Any]:
-        """Get metadata from the last response."""
-        return self.last_response_metadata.copy()
-
     def set_system_prompt(self, system_prompt: str) -> None:
         """Set or update the system prompt."""
         self.system_prompt = system_prompt
diff --git a/tests/integration/test_conversation_runner.py b/tests/integration/test_conversation_runner.py
index e58faa49..d62095a5 100644
--- a/tests/integration/test_conversation_runner.py
+++ b/tests/integration/test_conversation_runner.py
@@ -16,6 +16,7 @@
 import pytest
 
 from generate_conversations.runner import ConversationRunner
+from llm_clients.llm_interface import Role
 from tests.mocks.mock_llm import MockLLM
 
 
@@ -25,20 +26,13 @@ def create_test_runner(
     run_id: str,
     **kwargs: Dict[str, Any],
 ) -> ConversationRunner:
-    """Helper to create runner and clean up agent_model_config.
-
-    This removes system_prompt from agent_model_config to avoid duplicate
-    keyword argument errors when run_conversations calls create_llm.
-    """
-    agent_cfg = copy.deepcopy(agent_config)
+    """Helper to create runner with copied configs (no shared mutation)."""
     runner = ConversationRunner(
         persona_model_config=copy.deepcopy(persona_config),
-        agent_model_config=agent_cfg,
+        agent_model_config=copy.deepcopy(agent_config),
         run_id=run_id,
         **kwargs,
     )
-    # Remove system_prompt to avoid duplicate kwarg in run_conversations
-    runner.agent_model_config.pop("system_prompt", None)
     return runner
 
 
@@ -54,16 +48,7 @@ def basic_persona_config() -> Dict[str, Any]:
 
 @pytest.fixture
 def basic_agent_config() -> Dict[str, Any]:
-    """Basic agent model configuration.
-
-    Note: This includes system_prompt which will be extracted in __init__
-    and used separately in run_conversations. The config will be mutated
-    (name popped, system_prompt passed separately).
-    """
-    # Don't include system_prompt in the config dict since it causes
-    # duplicate keyword argument errors when run_conversations calls
-    # create_llm with both system_prompt as arg and **config
-    # However, we do include it initially so __init__ can extract it
+    """Basic agent model configuration."""
     return {
         "model": "mock-agent-model",
         "name": "mock-agent",
@@ -164,7 +149,6 @@ def test_init_with_basic_config(
 
         # Assert
         assert runner.persona_model_config == basic_persona_config
-        # Note: agent_model_config will still have system_prompt
         assert runner.agent_model_config["model"] == basic_agent_config["model"]
         assert runner.run_id == run_id
         assert runner.max_turns == 6
@@ -195,46 +179,105 @@ def test_init_with_custom_parameters(
         assert runner.folder_name == "test_conversations"
         assert runner.max_concurrent == 3
 
-    def test_agent_system_prompt_from_config(
+    @pytest.mark.asyncio
+    async def test_agent_system_prompt_from_config(
         self,
+        tmp_path: Path,
         basic_persona_config: Dict[str, Any],
         basic_agent_config: Dict[str, Any],
+        mock_llm_factory,
     ) -> None:
-        """Test that agent system prompt is extracted from config."""
-        # Arrange
+        """Agent create_llm call receives system_prompt from config."""
         custom_prompt = "You are a mental health support chatbot."
-        basic_agent_config["system_prompt"] = custom_prompt
+        agent_config = copy.deepcopy(basic_agent_config)
+        agent_config["system_prompt"] = custom_prompt
+
+        create_llm_calls = []
+        real_create = mock_llm_factory.side_effect
+
+        def recording_create_llm(*args: Any, **kwargs: Any) -> MockLLM:
+            create_llm_calls.append(copy.deepcopy(kwargs))
+            return real_create(*args, **kwargs)
+
+        mock_llm_factory.side_effect = recording_create_llm
 
-        # Act
         runner = ConversationRunner(
             persona_model_config=basic_persona_config,
-            agent_model_config=basic_agent_config,
+            agent_model_config=agent_config,
             run_id="test_run",
+            folder_name=str(tmp_path / "conversations"),
         )
+        persona_config = {
+            "model": "mock-persona-model",
+            "prompt": "Test persona prompt",
+            "name": "TestPersona",
+            "run": 1,
+        }
 
-        # Assert
-        assert runner.AGENT_SYSTEM_PROMPT == custom_prompt
+        with patch(
+            "generate_conversations.runner.setup_conversation_logger"
+        ) as mock_logger:
+            mock_logger.return_value = MagicMock()
+            await runner.run_single_conversation(
+                persona_config=persona_config,
+                max_turns=2,
+                conversation_index=1,
+                run_number=1,
+            )
 
-    def test_agent_system_prompt_default(
+        agent_calls = [c for c in create_llm_calls if c.get("role") == Role.PROVIDER]
+        assert len(agent_calls) == 1
+        assert agent_calls[0]["system_prompt"] == custom_prompt
+
+    @pytest.mark.asyncio
+    async def test_agent_system_prompt_default(
         self,
+        tmp_path: Path,
         basic_persona_config: Dict[str, Any],
+        mock_llm_factory,
     ) -> None:
-        """Test default agent system prompt when not in config."""
-        # Arrange
+        """When agent config has no system_prompt, create_llm gets the fallback."""
+        default_prompt = "You are a helpful AI assistant."
         agent_config = {
             "model": "mock-agent",
             "name": "test-agent",
         }
+        create_llm_calls = []
+        real_create = mock_llm_factory.side_effect
+
+        def recording_create_llm(*args: Any, **kwargs: Any) -> MockLLM:
+            create_llm_calls.append(copy.deepcopy(kwargs))
+            return real_create(*args, **kwargs)
+
+        mock_llm_factory.side_effect = recording_create_llm
 
-        # Act
         runner = ConversationRunner(
             persona_model_config=basic_persona_config,
             agent_model_config=agent_config,
             run_id="test_run",
+            folder_name=str(tmp_path / "conversations"),
         )
+        persona_config = {
+            "model": "mock-persona-model",
+            "prompt": "Test persona prompt",
+            "name": "TestPersona",
+            "run": 1,
+        }
 
-        # Assert
-        assert runner.AGENT_SYSTEM_PROMPT == "You are a helpful AI assistant."
+        with patch(
+            "generate_conversations.runner.setup_conversation_logger"
+        ) as mock_logger:
+            mock_logger.return_value = MagicMock()
+            await runner.run_single_conversation(
+                persona_config=persona_config,
+                max_turns=2,
+                conversation_index=1,
+                run_number=1,
+            )
+
+        agent_calls = [c for c in create_llm_calls if c.get("role") == Role.PROVIDER]
+        assert len(agent_calls) == 1
+        assert agent_calls[0]["system_prompt"] == default_prompt
 
 
 @pytest.mark.integration
@@ -269,14 +312,6 @@ async def test_run_single_conversation_creates_files(
             "run": 1,
         }
 
-        agent = MockLLM(
-            name="test-agent",
-            model_name="mock-agent-model",
-            responses=["I'm here to help", "Tell me more"],
-            temperature=0.5,
-            max_tokens=500,
-        )
-
         # Act - patch setup_conversation_logger to use tmp_path
         with patch(
             "generate_conversations.runner.setup_conversation_logger"
@@ -290,14 +325,13 @@ async def test_run_single_conversation_creates_files(
 
             result = await runner.run_single_conversation(
                 persona_config=persona_config,
-                agent=agent,
                 max_turns=4,
-                conversation_id=1,
+                conversation_index=1,
                 run_number=1,
             )
 
         # Assert - verify result structure
-        assert result["id"] == 1
+        assert result["index"] == 1
         assert result["llm1_model"] == "mock-persona-model"
         assert result["llm1_prompt"] == "TestPersona"
         assert result["run_number"] == 1
@@ -338,14 +372,6 @@ async def test_run_single_conversation_logging(
             "run": 1,
         }
 
-        agent = MockLLM(
-            name="test-agent",
-            model_name="mock-agent-model",
-            responses=["Response 1"],
-            temperature=0.5,
-            max_tokens=500,
-        )
-
         # Act - use real logging with tmp_path
         os.makedirs(log_folder / run_id, exist_ok=True)
         log_file = log_folder / run_id / "test_conversation.log"
@@ -364,9 +390,8 @@ async def test_run_single_conversation_logging(
 
             await runner.run_single_conversation(
                 persona_config=persona_config,
-                agent=agent,
                 max_turns=2,
-                conversation_id=1,
+                conversation_index=1,
                 run_number=1,
             )
 
@@ -406,14 +431,6 @@ async def test_filename_generation_format(
             "run": 2,
         }
 
-        agent = MockLLM(
-            name="test-agent",
-            model_name="mock-agent-model",
-            responses=["Response"],
-            temperature=0.5,
-            max_tokens=500,
-        )
-
         # Act
         with patch(
             "generate_conversations.runner.setup_conversation_logger"
@@ -423,9 +440,8 @@ async def test_filename_generation_format(
 
             result = await runner.run_single_conversation(
                 persona_config=persona_config,
-                agent=agent,
                 max_turns=2,
-                conversation_id=1,
+                conversation_index=1,
                 run_number=2,
             )
 
@@ -494,9 +510,8 @@ def side_effect(*args, **kwargs):
                 # Act
                 result = await runner.run_single_conversation(
                     persona_config=persona_config,
-                    agent=agent_mock,
                     max_turns=10,
-                    conversation_id=1,
+                    conversation_index=1,
                     run_number=1,
                 )
 
@@ -528,14 +543,6 @@ async def test_conversation_metadata_completeness(
             "run": 1,
         }
 
-        agent = MockLLM(
-            name="test-agent",
-            model_name="mock-agent-model",
-            responses=["Response"],
-            temperature=0.5,
-            max_tokens=500,
-        )
-
         # Act
         with patch(
             "generate_conversations.runner.setup_conversation_logger"
@@ -545,15 +552,14 @@ async def test_conversation_metadata_completeness(
 
             result = await runner.run_single_conversation(
                 persona_config=persona_config,
-                agent=agent,
                 max_turns=2,
-                conversation_id=5,
+                conversation_index=5,
                 run_number=3,
             )
 
         # Assert - verify all required metadata fields
         required_fields = [
-            "id",
+            "index",
             "llm1_model",
             "llm1_prompt",
             "run_number",
@@ -569,7 +575,7 @@ async def test_conversation_metadata_completeness(
             assert field in result, f"Missing field: {field}"
 
         # Verify types
-        assert isinstance(result["id"], int)
+        assert isinstance(result["index"], int)
         assert isinstance(result["turns"], int)
         assert isinstance(result["duration"], float)
         assert isinstance(result["early_termination"], bool)
@@ -721,14 +727,14 @@ async def test_no_concurrent_limit(
         # Assert
         assert len(results) == 2  # 1 persona * 2 runs
 
-    async def test_conversation_id_increments(
+    async def test_conversation_index_increments(
         self,
         tmp_path: Path,
         basic_persona_config: Dict[str, Any],
         basic_agent_config: Dict[str, Any],
         mock_llm_factory,
     ) -> None:
-        """Test that conversation IDs increment correctly."""
+        """Test that conversation indices increment correctly."""
         # Arrange
         conv_folder = tmp_path / "conversations"
         runner = create_test_runner(
@@ -757,9 +763,75 @@ async def test_conversation_id_increments(
                 # Act
                 results = await runner.run_conversations(persona_names=None)
 
-        # Assert - IDs should be 1, 2, 3, 4
-        ids = sorted([r["id"] for r in results])
-        assert ids == [1, 2, 3, 4]
+        # Assert - indices should be 1, 2, 3, 4
+        indices = sorted([r["index"] for r in results])
+        assert indices == [1, 2, 3, 4]
+
+    async def test_agent_config_not_mutated_across_concurrent_conversations(
+        self,
+        tmp_path: Path,
+        basic_persona_config: Dict[str, Any],
+        basic_agent_config: Dict[str, Any],
+        mock_llm_factory,
+    ) -> None:
+        """Ensure agent_model_config is not mutated when running multiple conversations.
+
+        Each run_single_conversation creates an agent from the shared config; the config
+        must remain intact so every conversation gets the same name and system_prompt.
+        """
+        expected_name = "shared-agent-name"
+        expected_prompt = "Shared system prompt for all runs."
+
+        create_llm_calls = []
+        real_create = mock_llm_factory.side_effect
+
+        def recording_create_llm(*args: Any, **kwargs: Any) -> MockLLM:
+            create_llm_calls.append(copy.deepcopy(kwargs))
+            return real_create(*args, **kwargs)
+
+        mock_llm_factory.side_effect = recording_create_llm
+
+        agent_config = copy.deepcopy(basic_agent_config)
+        agent_config["name"] = expected_name
+        agent_config["system_prompt"] = expected_prompt
+
+        conv_folder = tmp_path / "conversations"
+        runner = create_test_runner(
+            basic_persona_config,
+            agent_config,
+            "test_run",
+            max_turns=2,
+            runs_per_prompt=2,
+            folder_name=str(conv_folder),
+        )
+        mock_personas = [
+            {"Name": "Persona1", "prompt": "Prompt 1"},
+            {"Name": "Persona2", "prompt": "Prompt 2"},
+        ]
+
+        with (
+            patch("generate_conversations.runner.load_prompts_from_csv") as mock_load,
+            patch(
+                "generate_conversations.runner.setup_conversation_logger"
+            ) as mock_logger,
+        ):
+            mock_load.return_value = mock_personas
+            mock_logger.return_value = MagicMock()
+            await runner.run_conversations(persona_names=None)
+
+        assert runner.agent_model_config["name"] == expected_name
+        assert runner.agent_model_config["system_prompt"] == expected_prompt
+
+        agent_calls = [
+            c
+            for c in create_llm_calls
+            if c.get("model_name") == basic_agent_config["model"]
+        ]
+        assert (
+            len(agent_calls) == 4
+        ), "Expected 4 conversations => 4 agent create_llm calls"
+        assert all(c.get("name") == expected_name for c in agent_calls)
+        assert all(c.get("system_prompt") == expected_prompt for c in agent_calls)
 
 
 @pytest.mark.integration
@@ -793,14 +865,6 @@ async def test_conversation_folder_creation(
             "run": 1,
         }
 
-        agent = MockLLM(
-            name="test-agent",
-            model_name="mock-agent-model",
-            responses=["Response"],
-            temperature=0.5,
-            max_tokens=500,
-        )
-
         # Act
         with patch(
             "generate_conversations.runner.setup_conversation_logger"
@@ -810,9 +874,8 @@ async def test_conversation_folder_creation(
 
             await runner.run_single_conversation(
                 persona_config=persona_config,
-                agent=agent,
                 max_turns=2,
-                conversation_id=1,
+                conversation_index=1,
                 run_number=1,
             )
 
@@ -901,13 +964,13 @@ async def test_filename_uniqueness(
 class TestConversationRunnerErrorHandling:
     """Test error handling and edge cases."""
 
-    async def test_handles_llm_errors_gracefully(
+    async def test_llm_errors_propagate_from_run_single_conversation(
         self,
         tmp_path: Path,
         basic_persona_config: Dict[str, Any],
         basic_agent_config: Dict[str, Any],
     ) -> None:
-        """Test that LLM errors are handled gracefully."""
+        """When the agent LLM errors during conversation, the exception propagates."""
         # Arrange
         conv_folder = tmp_path / "conversations"
         runner = ConversationRunner(
@@ -924,7 +987,7 @@ async def test_handles_llm_errors_gracefully(
             "run": 1,
         }
 
-        # Create agent that will error
+        # Create agent that will error; persona mock for the other create_llm call
         error_agent = MockLLM(
             name="error-agent",
             model_name="mock-error-model",
@@ -933,6 +996,13 @@ async def test_handles_llm_errors_gracefully(
             temperature=0.5,
             max_tokens=500,
         )
+        persona_mock = MockLLM(
+            name="persona",
+            model_name="mock-persona-model",
+            responses=["Hello"],
+            temperature=0.7,
+            max_tokens=1000,
+        )
 
         with patch(
             "generate_conversations.runner.setup_conversation_logger"
@@ -943,21 +1013,15 @@ async def test_handles_llm_errors_gracefully(
             with patch(
                 "generate_conversations.runner.LLMFactory.create_llm"
             ) as mock_factory:
-                mock_factory.return_value = MockLLM(
-                    name="persona",
-                    model_name="mock-persona-model",
-                    responses=["Hello"],
-                    temperature=0.7,
-                    max_tokens=1000,
-                )
+                # run_single_conversation creates persona first, then agent
+                mock_factory.side_effect = [persona_mock, error_agent]
 
                 # Act & Assert - should raise the error
                 with pytest.raises(Exception) as exc_info:
                     await runner.run_single_conversation(
                         persona_config=persona_config,
-                        agent=error_agent,
                         max_turns=2,
-                        conversation_id=1,
+                        conversation_index=1,
                         run_number=1,
                     )
 
@@ -1019,14 +1083,6 @@ async def test_logger_cleanup_called(
             "run": 1,
         }
 
-        agent = MockLLM(
-            name="test-agent",
-            model_name="mock-agent-model",
-            responses=["Response"],
-            temperature=0.5,
-            max_tokens=500,
-        )
-
         # Act
         with patch(
             "generate_conversations.runner.setup_conversation_logger"
@@ -1037,9 +1093,8 @@ async def test_logger_cleanup_called(
             with patch("generate_conversations.runner.cleanup_logger") as mock_cleanup:
                 await runner.run_single_conversation(
                     persona_config=persona_config,
-                    agent=agent,
                     max_turns=2,
-                    conversation_id=1,
+                    conversation_index=1,
                     run_number=1,
                 )
 
@@ -1076,14 +1131,6 @@ async def test_duration_tracking(
             "run": 1,
         }
 
-        agent = MockLLM(
-            name="test-agent",
-            model_name="mock-agent-model",
-            responses=["Response"],
-            temperature=0.5,
-            max_tokens=500,
-        )
-
         # Act
         with patch(
             "generate_conversations.runner.setup_conversation_logger"
@@ -1094,9 +1141,8 @@ async def test_duration_tracking(
             start = time.time()
             result = await runner.run_single_conversation(
                 persona_config=persona_config,
-                agent=agent,
                 max_turns=2,
-                conversation_id=1,
+                conversation_index=1,
                 run_number=1,
             )
             end = time.time()
diff --git a/tests/mocks/mock_llm.py b/tests/mocks/mock_llm.py
index d89a0377..5e60f9b8 100644
--- a/tests/mocks/mock_llm.py
+++ b/tests/mocks/mock_llm.py
@@ -74,10 +74,6 @@ async def generate_response(
 
         return response
 
-    def get_last_response_metadata(self) -> Dict[str, Any]:
-        """Get metadata from the last response."""
-        return self.last_response_metadata.copy()
-
     def set_system_prompt(self, system_prompt: str) -> None:
         """Set or update the system prompt."""
         self.system_prompt = system_prompt
diff --git a/tests/unit/llm_clients/test_azure_llm.py b/tests/unit/llm_clients/test_azure_llm.py
index 16614013..5e9e6aa7 100644
--- a/tests/unit/llm_clients/test_azure_llm.py
+++ b/tests/unit/llm_clients/test_azure_llm.py
@@ -348,7 +348,7 @@ async def test_generate_response_without_usage_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["usage"] == {}
 
     @pytest.mark.asyncio
@@ -372,7 +372,7 @@ async def test_generate_response_without_response_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["model"] == "gpt-5.2"
         assert metadata["usage"] == {}
         assert metadata["finish_reason"] is None
@@ -445,11 +445,11 @@ async def test_generate_response_tracks_timing(
         llm = AzureLLM(name="TestAzure", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_response_timing(metadata)
 
-    def test_get_last_response_metadata_returns_copy(self):
-        """Test that get_last_response_metadata returns a copy."""
+    def test_last_response_metadata_copy_returns_copy(self):
+        """Test that last_response_metadata.copy() returns a copy, not the original."""
         llm = AzureLLM(name="TestAzure", role=Role.PERSONA)
         assert_metadata_copy_behavior(llm)
 
@@ -535,7 +535,7 @@ class TestResponse(BaseModel):
             assert "Structured output failed" in str(exc_info.value)
 
             # Verify error metadata was stored
-            metadata = llm.get_last_response_metadata()
+            metadata = llm.last_response_metadata
             assert "error" in metadata
             assert "Structured output failed" in metadata["error"]
 
@@ -605,7 +605,7 @@ async def test_timestamp_format(
         llm = AzureLLM(name="TestAzure", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_iso_timestamp(metadata["timestamp"])
 
     @pytest.mark.asyncio
@@ -664,7 +664,7 @@ async def test_generate_response_with_partial_usage_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Partial usage response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["usage"]["input_tokens"] == 15
         assert metadata["usage"]["output_tokens"] == 0
         assert metadata["usage"]["total_tokens"] == 0
@@ -685,7 +685,7 @@ async def test_metadata_includes_response_object(
         llm = AzureLLM(name="TestAzure", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert "response" in metadata
         assert metadata["response"] == mock_response
 
@@ -709,7 +709,7 @@ async def test_metadata_with_finish_reason(
         llm = AzureLLM(name="TestAzure", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["finish_reason"] == "length"
 
     @pytest.mark.asyncio
@@ -738,7 +738,7 @@ async def test_raw_metadata_stored(
         llm = AzureLLM(name="TestAzure", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert "raw_metadata" in metadata
         assert metadata["raw_metadata"]["custom_field"] == "custom_value"
         assert metadata["raw_metadata"]["nested"]["key"] == "value"
diff --git a/tests/unit/llm_clients/test_base_llm.py b/tests/unit/llm_clients/test_base_llm.py
index f401126c..31a77829 100644
--- a/tests/unit/llm_clients/test_base_llm.py
+++ b/tests/unit/llm_clients/test_base_llm.py
@@ -198,8 +198,8 @@ async def test_generate_response_updates_metadata(
             assert_iso_timestamp(metadata["timestamp"])
             assert_response_timing(metadata)
 
-    def test_get_last_response_metadata_returns_copy(self):
-        """Test that get_last_response_metadata returns a copy, not original."""
+    def test_last_response_metadata_copy_returns_copy(self):
+        """Test that last_response_metadata.copy() returns a copy, not the original."""
         with self.get_mock_patches():  # pyright: ignore[reportGeneralTypeIssues]
             llm = self.create_llm(role=Role.PROVIDER, name="TestLLM")
 
@@ -382,7 +382,7 @@ class TestResponse(BaseModel):
             assert "Structured output failed" in str(exc_info.value)
 
             # Verify error metadata was stored
-            metadata = llm.get_last_response_metadata()
+            metadata = llm.last_response_metadata
             assert "error" in metadata
             assert "Structured output failed" in metadata["error"]
             assert metadata["provider"] == self.get_provider_name()
diff --git a/tests/unit/llm_clients/test_claude_llm.py b/tests/unit/llm_clients/test_claude_llm.py
index c3b1c2bb..3b2cfa29 100644
--- a/tests/unit/llm_clients/test_claude_llm.py
+++ b/tests/unit/llm_clients/test_claude_llm.py
@@ -203,7 +203,7 @@ async def test_generate_response_without_usage_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["usage"] == {}
 
     @pytest.mark.asyncio
@@ -229,7 +229,7 @@ async def test_generate_response_without_response_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["model"] == "claude-sonnet-4-5-20250929"
         assert metadata["usage"] == {}
         assert metadata["stop_reason"] is None
@@ -275,11 +275,11 @@ async def test_generate_response_tracks_timing(
         llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_response_timing(metadata)
 
-    def test_get_last_response_metadata_returns_copy(self):
-        """Test that get_last_response_metadata returns a copy."""
+    def test_last_response_metadata_copy_returns_copy(self):
+        """Test that last_response_metadata.copy() returns a copy, not the original."""
         llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA)
         assert_metadata_copy_behavior(llm)
 
@@ -319,7 +319,7 @@ async def test_generate_response_with_partial_usage_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Partial usage response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["usage"]["input_tokens"] == 15
         assert metadata["usage"]["output_tokens"] == 0  # Default value
         assert metadata["usage"]["total_tokens"] == 15
@@ -343,7 +343,7 @@ async def test_metadata_includes_response_object(
         llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert "response" in metadata
         assert metadata["response"] == mock_response
 
@@ -366,7 +366,7 @@ async def test_timestamp_format(
         llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_iso_timestamp(metadata["timestamp"])
 
     @pytest.mark.asyncio
@@ -394,7 +394,7 @@ async def test_metadata_with_stop_reason(
         llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["stop_reason"] == "max_tokens"
 
     @pytest.mark.asyncio
@@ -423,7 +423,7 @@ async def test_raw_metadata_stored(
         llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert "raw_metadata" in metadata
         assert metadata["raw_metadata"]["custom_field"] == "custom_value"
         assert metadata["raw_metadata"]["nested"]["key"] == "value"
@@ -680,7 +680,7 @@ class TestResponse(BaseModel):
             assert "Structured output failed" in str(exc_info.value)
 
             # Verify error metadata was stored
-            metadata = llm.get_last_response_metadata()
+            metadata = llm.last_response_metadata
             assert "error" in metadata
             assert "Structured output failed" in metadata["error"]
 
@@ -709,7 +709,7 @@ class SimpleResponse(BaseModel):
             llm = ClaudeLLM(name="TestClaude", role=Role.JUDGE)
             await llm.generate_structured_response("Test", SimpleResponse)
 
-            metadata = llm.get_last_response_metadata()
+            metadata = llm.last_response_metadata
 
             # Verify required fields
             assert metadata["provider"] == "claude"
diff --git a/tests/unit/llm_clients/test_gemini_llm.py b/tests/unit/llm_clients/test_gemini_llm.py
index 90138655..a8a876b0 100644
--- a/tests/unit/llm_clients/test_gemini_llm.py
+++ b/tests/unit/llm_clients/test_gemini_llm.py
@@ -206,7 +206,7 @@ async def test_generate_response_with_fallback_token_usage(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response with fallback"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         # Should use fallback structure
         assert metadata["usage"]["prompt_tokens"] == 10
         assert metadata["usage"]["completion_tokens"] == 20
@@ -234,7 +234,7 @@ async def test_generate_response_without_usage_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["usage"] == {}
 
     @pytest.mark.asyncio
@@ -258,7 +258,7 @@ async def test_generate_response_without_response_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["model"] == "gemini-1.5-pro"
         assert metadata["usage"] == {}
         assert metadata["finish_reason"] is None
@@ -300,11 +300,11 @@ async def test_generate_response_tracks_timing(
         llm = GeminiLLM(name="TestGemini", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_response_timing(metadata)
 
-    def test_get_last_response_metadata_returns_copy(self):
-        """Test that get_last_response_metadata returns a copy."""
+    def test_last_response_metadata_copy_returns_copy(self):
+        """Test that last_response_metadata.copy() returns a copy, not the original."""
         llm = GeminiLLM(name="TestGemini", role=Role.PERSONA)
         assert_metadata_copy_behavior(llm)
 
@@ -336,7 +336,7 @@ async def test_metadata_includes_response_object(
         llm = GeminiLLM(name="TestGemini", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert "response" in metadata
         assert metadata["response"] == mock_response
 
@@ -358,7 +358,7 @@ async def test_timestamp_format(
         llm = GeminiLLM(name="TestGemini", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_iso_timestamp(metadata["timestamp"])
 
     @pytest.mark.asyncio
@@ -385,7 +385,7 @@ async def test_finish_reason_extraction(
         llm = GeminiLLM(name="TestGemini", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["finish_reason"] == "MAX_TOKENS"
 
     @pytest.mark.asyncio
@@ -410,7 +410,7 @@ async def test_raw_metadata_stored(self, mock_chat_gemini, mock_system_message):
         llm = GeminiLLM(name="TestGemini", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert "raw_metadata" in metadata
         assert metadata["raw_metadata"]["custom_field"] == "custom_value"
         assert metadata["raw_metadata"]["nested"]["key"] == "value"
@@ -578,7 +578,7 @@ async def test_generate_response_with_partial_usage_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Partial usage response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["usage"]["prompt_token_count"] == 15
         assert metadata["usage"]["candidates_token_count"] == 0  # Default value
         assert (
@@ -709,7 +709,7 @@ class TestResponse(BaseModel):
             assert "Structured output failed" in str(exc_info.value)
 
             # Verify error metadata was stored
-            metadata = llm.get_last_response_metadata()
+            metadata = llm.last_response_metadata
             assert "error" in metadata
             assert "Structured output failed" in metadata["error"]
 
@@ -737,7 +737,7 @@ class SimpleResponse(BaseModel):
             llm = GeminiLLM(name="TestGemini", role=Role.JUDGE)
             await llm.generate_structured_response("Test", SimpleResponse)
 
-            metadata = llm.get_last_response_metadata()
+            metadata = llm.last_response_metadata
 
             # Verify required fields
             assert metadata["provider"] == "gemini"
diff --git a/tests/unit/llm_clients/test_helpers.py b/tests/unit/llm_clients/test_helpers.py
index 76196a32..5f80cd6c 100644
--- a/tests/unit/llm_clients/test_helpers.py
+++ b/tests/unit/llm_clients/test_helpers.py
@@ -53,7 +53,7 @@ def assert_metadata_structure(
     Raises:
         AssertionError: If metadata structure is invalid
     """
-    metadata = llm.get_last_response_metadata()
+    metadata = llm.last_response_metadata
 
     # Check required fields exist
     assert "model" in metadata, "Metadata missing 'model' field"
@@ -102,11 +102,12 @@ def assert_iso_timestamp(timestamp: str) -> None:
 
 
 def assert_metadata_copy_behavior(llm: LLMInterface) -> None:
-    """Assert that get_last_response_metadata returns a copy.
+    """Assert that last_response_metadata (property) returns a deep copy.
 
     Verifies that:
     1. Multiple calls return equal but different objects
-    2. Modifying returned dict doesn't affect internal state
+    2. Modifying returned dict (top-level) doesn't affect internal state
+    3. Modifying nested dicts (e.g. usage) doesn't affect internal state
 
     Args:
         llm: LLM instance to test
@@ -117,8 +118,8 @@ def assert_metadata_copy_behavior(llm: LLMInterface) -> None:
     # Set some test metadata
     llm.last_response_metadata = {"test": "value"}
 
-    metadata1 = llm.get_last_response_metadata()
-    metadata2 = llm.get_last_response_metadata()
+    metadata1 = llm.last_response_metadata
+    metadata2 = llm.last_response_metadata
 
     # Should be equal but not the same object
     assert metadata1 == metadata2, "Multiple calls should return equal dicts"
@@ -130,6 +131,15 @@ def assert_metadata_copy_behavior(llm: LLMInterface) -> None:
         "modified" not in llm.last_response_metadata
     ), "Modification leaked to internal state"
 
+    # Nested mutation must not affect internal state (deep copy)
+    llm.last_response_metadata = {"usage": {"input_tokens": 10, "output_tokens": 5}}
+    meta = llm.last_response_metadata
+    meta["usage"]["input_tokens"] = 999
+    fresh = llm.last_response_metadata
+    assert (
+        fresh["usage"]["input_tokens"] == 10
+    ), "Nested mutation leaked to internal state (expected deep copy)"
+
 
 def assert_response_timing(metadata: Dict[str, Any]) -> None:
     """Assert that metadata contains valid response timing information.
@@ -165,7 +175,7 @@ def assert_error_metadata(
     Raises:
         AssertionError: If error metadata is invalid
     """
-    metadata = llm.get_last_response_metadata()
+    metadata = llm.last_response_metadata
 
     # Check error field exists and contains expected substring
     assert "error" in metadata, "Metadata missing 'error' field"
diff --git a/tests/unit/llm_clients/test_llm_interface.py b/tests/unit/llm_clients/test_llm_interface.py
index dd60f7a2..13b479b5 100644
--- a/tests/unit/llm_clients/test_llm_interface.py
+++ b/tests/unit/llm_clients/test_llm_interface.py
@@ -1,3 +1,4 @@
+import uuid
 from typing import Optional
 from unittest.mock import MagicMock
 
@@ -153,6 +154,7 @@ def test_multiple_instances_have_independent_state(self):
         assert llm2.name == "LLM2"
         assert llm1.system_prompt == "Prompt 1"
         assert llm2.system_prompt == "Prompt 2"
+        assert llm1.conversation_id != llm2.conversation_id
 
         # Modify one shouldn't affect the other
         llm1.set_system_prompt("Modified Prompt 1")
@@ -204,3 +206,53 @@ def set_system_prompt(self, system_prompt: str) -> None:
         assert isinstance(llm.float_attr, float)
         assert isinstance(llm.bool_attr, bool)
         assert isinstance(llm.list_attr, list)
+
+    def test_init_sets_conversation_id(self):
+        """Test that conversation_id is set at init (e.g. UUID)."""
+        llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER)
+        assert llm.conversation_id is not None
+        assert isinstance(llm.conversation_id, str)
+        assert len(llm.conversation_id) > 0
+
+    def test_create_conversation_id_returns_string(self):
+        """Test that create_conversation_id returns a non-empty string (e.g. UUID)."""
+        llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER)
+        cid = llm.create_conversation_id()
+        assert isinstance(cid, str)
+        assert len(cid) > 0
+
+    def test_create_conversation_id_returns_distinct_valid_uuid(self):
+        """Test that repeated calls return distinct values and each is a valid UUID."""
+        llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER)
+        ids = [llm.create_conversation_id() for _ in range(50)]
+        assert len(ids) == len(set(ids)), "ids must be distinct"
+        for cid in ids:
+            uuid.UUID(cid)  # valid UUID string
+
+    def test_update_conversation_id_from_metadata_leaves_unchanged_when_absent(self):
+        """
+        Test _update_conversation_id_from_metadata preserves conversation_id
+        when key is absent from response metadata.
+        """
+        llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER)
+        original = llm.conversation_id
+        llm._last_response_metadata = {}
+        llm._update_conversation_id_from_metadata()
+        assert llm.conversation_id == original
+
+    def test_update_conversation_id_from_metadata_overwrites_when_present(self):
+        """
+        Test _update_conversation_id_from_metadata overwrites self.conversation_id
+        with API-returned conversation_id.
+        """
+        llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER)
+        llm._last_response_metadata = {"conversation_id": "api-provided-id"}
+        llm._update_conversation_id_from_metadata()
+        assert llm.conversation_id == "api-provided-id"
+
+    @pytest.mark.asyncio
+    async def test_conversation_id_available_after_generate_response(self):
+        """Test that conversation_id remains set after generate_response."""
+        llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER)
+        await llm.generate_response(conversation_history=[])
+        assert llm.conversation_id is not None
diff --git a/tests/unit/llm_clients/test_ollama_llm.py b/tests/unit/llm_clients/test_ollama_llm.py
index aad5a61d..e6626c33 100644
--- a/tests/unit/llm_clients/test_ollama_llm.py
+++ b/tests/unit/llm_clients/test_ollama_llm.py
@@ -585,8 +585,8 @@ async def test_generate_response_with_persona_role_flips_types(self, mock_ollama
         assert "Assistant: How are you?" in call_args
         assert "Assistant:" in call_args
 
-    def test_get_last_response_metadata_returns_copy(self):
-        """Test that get_last_response_metadata returns a copy."""
+    def test_last_response_metadata_copy_returns_copy(self):
+        """Test that last_response_metadata.copy() returns a copy, not the original."""
         from llm_clients.ollama_llm import OllamaLLM
 
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER)
@@ -653,7 +653,7 @@ async def test_metadata_tracks_timing(self, mock_ollama, mock_system_message):
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_response_timing(metadata)
 
     @pytest.mark.asyncio
@@ -669,7 +669,7 @@ async def test_timestamp_format(self, mock_ollama, mock_system_message):
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_iso_timestamp(metadata["timestamp"])
 
     @pytest.mark.asyncio
@@ -685,7 +685,7 @@ async def test_metadata_structure_complete(self, mock_ollama, mock_system_messag
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER, model_name="mistral:7b")
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
 
         # Verify all expected fields are present using helper
         assert_metadata_structure(
@@ -710,7 +710,7 @@ async def test_metadata_initialized_empty(self, mock_ollama):
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER)
 
         # Before any response, metadata should be empty
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata == {}
 
     @pytest.mark.asyncio
@@ -730,7 +730,7 @@ async def test_usage_metadata_always_empty(self, mock_ollama, mock_system_messag
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["usage"] == {}
         # Ollama doesn't have these fields
         assert "prompt_tokens" not in metadata["usage"]
@@ -755,7 +755,7 @@ async def test_no_response_object_in_metadata(
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         # Ollama doesn't store the response object
         assert "response" not in metadata
 
@@ -775,7 +775,7 @@ async def test_no_finish_reason_in_metadata(self, mock_ollama, mock_system_messa
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         # Ollama doesn't have finish_reason
         assert "finish_reason" not in metadata
         assert "stop_reason" not in metadata
@@ -796,7 +796,7 @@ async def test_no_raw_metadata_stored(self, mock_ollama, mock_system_message):
         llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         # Ollama doesn't store raw_metadata or raw_response_metadata
         assert "raw_metadata" not in metadata
         assert "raw_response_metadata" not in metadata
diff --git a/tests/unit/llm_clients/test_openai_llm.py b/tests/unit/llm_clients/test_openai_llm.py
index d4eb3ce9..a69ab3cb 100644
--- a/tests/unit/llm_clients/test_openai_llm.py
+++ b/tests/unit/llm_clients/test_openai_llm.py
@@ -209,7 +209,7 @@ async def test_generate_response_without_additional_kwargs(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["additional_kwargs"] == {}
 
     @pytest.mark.asyncio
@@ -232,7 +232,7 @@ async def test_generate_response_without_response_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["model"] == "gpt-5.2"
         assert metadata["usage"] == {}
         assert metadata["finish_reason"] is None
@@ -266,7 +266,7 @@ async def test_generate_response_without_usage_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         # Should still have usage from token_usage
         assert metadata["usage"]["prompt_tokens"] == 10
         assert metadata["usage"]["completion_tokens"] == 20
@@ -311,11 +311,11 @@ async def test_generate_response_tracks_timing(
         llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_response_timing(metadata)
 
-    def test_get_last_response_metadata_returns_copy(self):
-        """Test that get_last_response_metadata returns a copy."""
+    def test_last_response_metadata_copy_returns_copy(self):
+        """Test that last_response_metadata.copy() returns a copy, not the original."""
         llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA)
         assert_metadata_copy_behavior(llm)
 
@@ -347,7 +347,7 @@ async def test_metadata_includes_response_object(
         llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert "response" in metadata
         assert metadata["response"] == mock_response
 
@@ -369,7 +369,7 @@ async def test_timestamp_format(
         llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert_iso_timestamp(metadata["timestamp"])
 
     @pytest.mark.asyncio
@@ -393,7 +393,7 @@ async def test_model_name_update_from_metadata(
         llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA, model_name="gpt-4o")
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["model"] == "gpt-4o-0613-updated"
 
     @pytest.mark.asyncio
@@ -559,7 +559,7 @@ async def test_generate_response_with_partial_usage_metadata(
         response = await llm.generate_response(conversation_history=mock_system_message)
 
         assert response == "Partial usage response"
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["usage"]["prompt_tokens"] == 15
         assert metadata["usage"]["completion_tokens"] == 0  # Default value
         assert (
@@ -587,7 +587,7 @@ async def test_metadata_with_finish_reason(
         llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         assert metadata["finish_reason"] == "length"
 
     @pytest.mark.asyncio
@@ -615,7 +615,7 @@ async def test_raw_metadata_stored(
         llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA)
         await llm.generate_response(conversation_history=mock_system_message)
 
-        metadata = llm.get_last_response_metadata()
+        metadata = llm.last_response_metadata
         # OpenAI uses 'raw_response_metadata' instead of 'raw_metadata'
         assert "raw_response_metadata" in metadata
         assert metadata["raw_response_metadata"]["custom_field"] == "custom_value"
@@ -745,7 +745,7 @@ class TestResponse(BaseModel):
             assert "Structured output failed" in str(exc_info.value)
 
             # Verify error metadata was stored
-            metadata = llm.get_last_response_metadata()
+            metadata = llm.last_response_metadata
             assert "error" in metadata
             assert "Structured output failed" in metadata["error"]
 
@@ -773,7 +773,7 @@ class SimpleResponse(BaseModel):
             llm = OpenAILLM(name="TestOpenAI", role=Role.JUDGE)
             await llm.generate_structured_response("Test", SimpleResponse)
 
-            metadata = llm.get_last_response_metadata()
+            metadata = llm.last_response_metadata
 
             # Verify required fields
             assert metadata["provider"] == "openai"