diff --git a/docs/evaluating.md b/docs/evaluating.md index ac14a85b..7e0bfdeb 100644 --- a/docs/evaluating.md +++ b/docs/evaluating.md @@ -186,12 +186,9 @@ def set_system_prompt(self, system_prompt: str) -> None: self.system_prompt = system_prompt ``` -#### `get_last_response_metadata()` - Get response metadata (optional but recommended) -```python -def get_last_response_metadata(self) -> Dict[str, Any]: - """Get metadata from the last response.""" - return self.last_response_metadata.copy() -``` +#### `last_response_metadata` - Response metadata (required) + +Set in `__init__` (base sets it to `{}`). Update it in `generate_response()`: assign with `self.last_response_metadata = {...}`. If you need in-place updates (e.g. `self.last_response_metadata["usage"] = ...`), use `self._last_response_metadata` so the stored dict is updated. The property getter returns a copy so callers can use `last_response_metadata` without mutating the client's dict. ### 3. Add the new LLM client to the factory @@ -227,6 +224,18 @@ python3 judge.py -f conversations/{YOUR_FOLDER} -j your-model-name - **LangChain Integration**: The provided implementations use LangChain for robust LLM interactions - **Error Handling**: Make sure to handle errors gracefully and return appropriate error messages +### Conversation flow and history + +ConversationSimulator holds the full conversation and passes `conversation_history` into your client on every call. Your client is not required to store history. You can: + +- **Stateless**: Build each request from `conversation_history` (as the built-in clients do), or +- **Server-side state**: Send a `conversation_id` to your API and let the server maintain the conversation; in that case you may use `conversation_history` only when needed (e.g. fallback or logging). + +**When your endpoint requires a conversation id** (the built-in clients do not; this is for custom clients): + +- `conversation_id` is set in the base class `__init__`, so you always have one to send as request metadata. Use `self.conversation_id` when your API needs a conversation ID. +- For LLM clients that require `conversation_id` handling, in `generate_response()`, you must set `conversation_id` in `_last_response_metadata` (interface requirement). If your API returns its own `conversation_id` in the response metadata (e.g. it ignores the one we send), call `self._update_conversation_id_from_metadata()` at the end of `generate_response()` after setting `_last_response_metadata`; that overwrites `self.conversation_id` with the API’s value. + ## Structured Output Support ### Native Support (Recommended) diff --git a/generate_conversations/conversation_simulator.py b/generate_conversations/conversation_simulator.py index 48d0d9c1..df1172b3 100644 --- a/generate_conversations/conversation_simulator.py +++ b/generate_conversations/conversation_simulator.py @@ -110,7 +110,7 @@ async def start_conversation( input_message=input_msg, response_message=lc_message, early_termination=False, - logging_metadata=current_speaker.get_last_response_metadata(), + logging_metadata=current_speaker.last_response_metadata, ) self.conversation_history.append(turn_obj) diff --git a/generate_conversations/runner.py b/generate_conversations/runner.py index 3ced3f72..2735d880 100644 --- a/generate_conversations/runner.py +++ b/generate_conversations/runner.py @@ -49,20 +49,30 @@ def __init__( self.max_total_words = max_total_words self.max_personas = max_personas - self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get( - "system_prompt", "You are a helpful AI assistant." - ) - async def run_single_conversation( self, persona_config: dict, - agent, max_turns: int, - conversation_id: int, + conversation_index: int, run_number: int, - **kargs: dict, + **kwargs: dict, ) -> Dict[str, Any]: - """Run a single conversation asynchronously.""" + """Run a single simulated conversation (persona vs provider LLM). + + Uses fresh LLM instances per call; safe for concurrent use. Logs turns, + writes transcript to self.folder_name, then cleans up logger and LLMs. + + Args: + persona_config (dict): Must have "model", "prompt", "name". + max_turns (int): Max conversation turns for a conversation. + conversation_index (int): Index in the batch of conversations. + run_number (int): Run index for this prompt (e.g. 1 of runs_per_prompt). + **kwargs: Unused; reserved for future use. + + Returns: + Dict[str, Any]: index, llm1_model, llm1_prompt, run_number, turns, + filename, log_file, duration, early_termination, conversation. + """ model_name = persona_config["model"] system_prompt = persona_config["prompt"] # This is now the full persona prompt persona_name = persona_config["name"] @@ -83,7 +93,7 @@ async def run_single_conversation( logger = setup_conversation_logger(filename_base, run_id=self.run_id) start_time = time.time() - # Create LLM1 instance with the persona prompt and configuration + # Create persona instance persona = LLMFactory.create_llm( model_name=model_name, name=f"{model_short} {persona_name}", @@ -92,6 +102,23 @@ async def run_single_conversation( **self.persona_model_config, ) + # Create new agent instance to reset conversation_id and metadata. + # Exclude selected kwargs to avoid duplicate args expected in create_llm. + agent_kwargs = { + k: v + for k, v in self.agent_model_config.items() + if k not in ("model", "name", "system_prompt") + } + agent = LLMFactory.create_llm( + model_name=self.agent_model_config["model"], + name=self.agent_model_config.get("name", "Provider"), + system_prompt=self.agent_model_config.get( + "system_prompt", "You are a helpful AI assistant." + ), + role=Role.PROVIDER, + **agent_kwargs, + ) + # Log conversation start log_conversation_start( logger=logger, @@ -148,7 +175,7 @@ async def run_single_conversation( simulator.save_conversation(f"{filename_base}.txt", self.folder_name) result = { - "id": conversation_id, + "index": conversation_index, "llm1_model": model_name, "llm1_prompt": persona_name, "run_number": run_number, @@ -164,11 +191,12 @@ async def run_single_conversation( # Cleanup LLM resources (e.g., close HTTP sessions for Azure) # Always cleanup, even if conversation failed - try: - await persona.cleanup() - except Exception as e: - # Log but don't fail if cleanup fails - print(f"Warning: Failed to cleanup persona LLM: {e}") + for llm in (persona, agent): + try: + await llm.cleanup() + except Exception as e: + # Log but don't fail if cleanup fails + print(f"Warning: Failed to cleanup LLM: {e}") return result @@ -179,23 +207,13 @@ async def run_conversations( # Load prompts from CSV based on persona names personas = load_prompts_from_csv(persona_names, max_personas=self.max_personas) - # Load agent configuration (fixed, shared across all conversations) - agent = LLMFactory.create_llm( - model_name=self.agent_model_config["model"], - name=self.agent_model_config.pop("name"), - system_prompt=self.AGENT_SYSTEM_PROMPT, - role=Role.PROVIDER, - **self.agent_model_config, - ) - # Create tasks for all conversations (each prompt run multiple times) tasks = [] - conversation_id = 1 + conversation_index = 1 for persona in personas: for run in range(1, self.runs_per_prompt + 1): tasks.append( - # TODO: should we pass the persona object here? self.run_single_conversation( { "model": self.persona_model_config["model"], @@ -203,13 +221,12 @@ async def run_conversations( "name": persona["Name"], "run": run, }, - agent, self.max_turns, - conversation_id, + conversation_index, run, ) ) - conversation_id += 1 + conversation_index += 1 # Run all conversations with concurrency limit start_time = datetime.now() @@ -237,11 +254,4 @@ async def run_with_limit(task): print(f"\nCompleted {len(results)} conversations in {total_time:.2f} seconds") - # Cleanup agent LLM resources (e.g., close HTTP sessions for Azure) - try: - await agent.cleanup() - except Exception as e: - # Log but don't fail if cleanup fails - print(f"Warning: Failed to cleanup agent LLM: {e}") - return results diff --git a/llm_clients/azure_llm.py b/llm_clients/azure_llm.py index d0205284..835fef5c 100644 --- a/llm_clients/azure_llm.py +++ b/llm_clients/azure_llm.py @@ -187,19 +187,19 @@ async def generate_response( # Extract token usage if "token_usage" in metadata: usage = metadata["token_usage"] - self.last_response_metadata["usage"] = { + self._last_response_metadata["usage"] = { "input_tokens": usage.get("input_tokens", 0), "output_tokens": usage.get("output_tokens", 0), "total_tokens": usage.get("total_tokens", 0), } # Extract finish reason - self.last_response_metadata["finish_reason"] = metadata.get( + self._last_response_metadata["finish_reason"] = metadata.get( "finish_reason" ) # Store raw metadata - self.last_response_metadata["raw_metadata"] = dict(metadata) + self._last_response_metadata["raw_metadata"] = dict(metadata) return response.text except Exception as e: @@ -307,10 +307,6 @@ async def generate_structured_response( } raise RuntimeError(f"Error generating structured response: {str(e)}") from e - def get_last_response_metadata(self) -> Dict[str, Any]: - """Get metadata from the last response.""" - return self.last_response_metadata.copy() - def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" self.system_prompt = system_prompt diff --git a/llm_clients/claude_llm.py b/llm_clients/claude_llm.py index 656e96f7..6daf3064 100644 --- a/llm_clients/claude_llm.py +++ b/llm_clients/claude_llm.py @@ -118,7 +118,7 @@ async def generate_response( # Extract token usage if "usage" in metadata: usage = metadata["usage"] - self.last_response_metadata["usage"] = { + self._last_response_metadata["usage"] = { "input_tokens": usage.get("input_tokens", 0), "output_tokens": usage.get("output_tokens", 0), "total_tokens": usage.get("input_tokens", 0) @@ -126,10 +126,12 @@ async def generate_response( } # Extract stop reason - self.last_response_metadata["stop_reason"] = metadata.get("stop_reason") + self._last_response_metadata["stop_reason"] = metadata.get( + "stop_reason" + ) # Store raw metadata - self.last_response_metadata["raw_metadata"] = dict(metadata) + self._last_response_metadata["raw_metadata"] = dict(metadata) return response.text except Exception as e: @@ -204,10 +206,6 @@ async def generate_structured_response( } raise RuntimeError(f"Error generating structured response: {str(e)}") from e - def get_last_response_metadata(self) -> Dict[str, Any]: - """Get metadata from the last response.""" - return self.last_response_metadata.copy() - def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" self.system_prompt = system_prompt diff --git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py index 23795811..f298ced6 100644 --- a/llm_clients/gemini_llm.py +++ b/llm_clients/gemini_llm.py @@ -116,7 +116,7 @@ async def generate_response( # Extract token usage - Gemini may have different structure if "usage_metadata" in metadata: usage = metadata["usage_metadata"] - self.last_response_metadata["usage"] = { + self._last_response_metadata["usage"] = { "prompt_token_count": usage.get("prompt_token_count", 0), "candidates_token_count": usage.get( "candidates_token_count", 0 @@ -126,19 +126,19 @@ async def generate_response( elif "token_usage" in metadata: # Fallback structure usage = metadata["token_usage"] - self.last_response_metadata["usage"] = { + self._last_response_metadata["usage"] = { "prompt_tokens": usage.get("prompt_tokens", 0), "completion_tokens": usage.get("completion_tokens", 0), "total_tokens": usage.get("total_tokens", 0), } # Extract finish reason - self.last_response_metadata["finish_reason"] = metadata.get( + self._last_response_metadata["finish_reason"] = metadata.get( "finish_reason" ) # Store raw metadata - self.last_response_metadata["raw_metadata"] = dict(metadata) + self._last_response_metadata["raw_metadata"] = dict(metadata) return response.text except Exception as e: @@ -154,10 +154,6 @@ async def generate_response( } return f"Error generating response: {str(e)}" - def get_last_response_metadata(self) -> Dict[str, Any]: - """Get metadata from the last response.""" - return self.last_response_metadata.copy() - async def generate_structured_response( self, message: Optional[str], response_model: Type[T] ) -> T: diff --git a/llm_clients/llm_interface.py b/llm_clients/llm_interface.py index f19a35e7..683b1904 100644 --- a/llm_clients/llm_interface.py +++ b/llm_clients/llm_interface.py @@ -1,3 +1,5 @@ +import copy +import uuid from abc import ABC, abstractmethod from enum import Enum from typing import Any, Dict, List, Optional, Type, TypeVar @@ -31,7 +33,39 @@ def __init__( self.name = name self.role = role self.system_prompt = system_prompt or "" - self.last_response_metadata: Dict[str, Any] = {} + self._last_response_metadata: Dict[str, Any] = {} + self.conversation_id = self.create_conversation_id() + + @property + def last_response_metadata(self) -> Dict[str, Any]: + """Metadata from the last generate_response call. Returns a deep copy so + callers cannot mutate internal state (including nested dicts like usage). + """ + return copy.deepcopy(self._last_response_metadata) + + @last_response_metadata.setter + def last_response_metadata(self, value: Optional[Dict[str, Any]]) -> None: + """Set metadata; use _last_response_metadata for in-place updates.""" + self._last_response_metadata = value or {} + + def create_conversation_id(self) -> str: + """Create a new unique conversation id. + + Used at init and when the API does not return one in response metadata. + Subclasses may override to use a different id format. + """ + return str(uuid.uuid4()) + + def _update_conversation_id_from_metadata(self) -> None: + """If the API returned a conversation_id in response metadata, use it. + + Call after generate_response once _last_response_metadata is set. + APIs that ignore our request conversation_id but return their own + will overwrite self.conversation_id here. + """ + cid = (self._last_response_metadata or {}).get("conversation_id") + if cid is not None: + self.conversation_id = cid @abstractmethod async def generate_response( @@ -49,8 +83,15 @@ async def generate_response( starting the conversation. Returns: - str: The response text. Metadata available via - get_last_response_metadata() + str: The response text. Metadata in self.last_response_metadata + (getter returns a copy so callers need not copy). + + Note: + For API thread/session identification, use self.conversation_id + (set at init; send as request metadata). If your API returns a + conversation_id in response metadata, call + self._update_conversation_id_from_metadata() after setting + _last_response_metadata to overwrite. """ pass diff --git a/llm_clients/ollama_llm.py b/llm_clients/ollama_llm.py index cf203e84..92e20f48 100644 --- a/llm_clients/ollama_llm.py +++ b/llm_clients/ollama_llm.py @@ -134,10 +134,6 @@ async def generate_response( } return f"Error generating response: {str(e)}" - def get_last_response_metadata(self) -> Dict[str, Any]: - """Get metadata from the last response.""" - return self.last_response_metadata.copy() - def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" self.system_prompt = system_prompt diff --git a/llm_clients/openai_llm.py b/llm_clients/openai_llm.py index e6d69534..1dae9c06 100644 --- a/llm_clients/openai_llm.py +++ b/llm_clients/openai_llm.py @@ -110,7 +110,7 @@ async def generate_response( # Extract additional_kwargs if available if hasattr(response, "additional_kwargs") and response.additional_kwargs: - self.last_response_metadata["additional_kwargs"] = dict( + self._last_response_metadata["additional_kwargs"] = dict( response.additional_kwargs ) @@ -120,34 +120,34 @@ async def generate_response( # Update model name from response metadata if "model_name" in metadata: - self.last_response_metadata["model"] = metadata["model_name"] + self._last_response_metadata["model"] = metadata["model_name"] # Extract token usage from response_metadata if "token_usage" in metadata: token_usage = metadata["token_usage"] - self.last_response_metadata["usage"] = { + self._last_response_metadata["usage"] = { "prompt_tokens": token_usage.get("prompt_tokens", 0), "completion_tokens": token_usage.get("completion_tokens", 0), "total_tokens": token_usage.get("total_tokens", 0), } # Extract other metadata fields - self.last_response_metadata["finish_reason"] = metadata.get( + self._last_response_metadata["finish_reason"] = metadata.get( "finish_reason" ) - self.last_response_metadata["system_fingerprint"] = metadata.get( + self._last_response_metadata["system_fingerprint"] = metadata.get( "system_fingerprint" ) - self.last_response_metadata["logprobs"] = metadata.get("logprobs") + self._last_response_metadata["logprobs"] = metadata.get("logprobs") # Store raw response_metadata - self.last_response_metadata["raw_response_metadata"] = dict(metadata) + self._last_response_metadata["raw_response_metadata"] = dict(metadata) # Extract usage_metadata if available (separate from response_metadata) if hasattr(response, "usage_metadata") and response.usage_metadata: usage_meta = response.usage_metadata # Merge with existing usage info, preferring usage_metadata values - self.last_response_metadata["usage"].update( + self._last_response_metadata["usage"].update( { "input_tokens": usage_meta.get("input_tokens", 0), "output_tokens": usage_meta.get("output_tokens", 0), @@ -155,7 +155,7 @@ async def generate_response( } ) # Store raw usage_metadata - self.last_response_metadata["raw_usage_metadata"] = dict(usage_meta) + self._last_response_metadata["raw_usage_metadata"] = dict(usage_meta) return response.text except Exception as e: @@ -233,10 +233,6 @@ async def generate_structured_response( } raise RuntimeError(f"Error generating structured response: {str(e)}") from e - def get_last_response_metadata(self) -> Dict[str, Any]: - """Get metadata from the last response.""" - return self.last_response_metadata.copy() - def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" self.system_prompt = system_prompt diff --git a/tests/integration/test_conversation_runner.py b/tests/integration/test_conversation_runner.py index e58faa49..d62095a5 100644 --- a/tests/integration/test_conversation_runner.py +++ b/tests/integration/test_conversation_runner.py @@ -16,6 +16,7 @@ import pytest from generate_conversations.runner import ConversationRunner +from llm_clients.llm_interface import Role from tests.mocks.mock_llm import MockLLM @@ -25,20 +26,13 @@ def create_test_runner( run_id: str, **kwargs: Dict[str, Any], ) -> ConversationRunner: - """Helper to create runner and clean up agent_model_config. - - This removes system_prompt from agent_model_config to avoid duplicate - keyword argument errors when run_conversations calls create_llm. - """ - agent_cfg = copy.deepcopy(agent_config) + """Helper to create runner with copied configs (no shared mutation).""" runner = ConversationRunner( persona_model_config=copy.deepcopy(persona_config), - agent_model_config=agent_cfg, + agent_model_config=copy.deepcopy(agent_config), run_id=run_id, **kwargs, ) - # Remove system_prompt to avoid duplicate kwarg in run_conversations - runner.agent_model_config.pop("system_prompt", None) return runner @@ -54,16 +48,7 @@ def basic_persona_config() -> Dict[str, Any]: @pytest.fixture def basic_agent_config() -> Dict[str, Any]: - """Basic agent model configuration. - - Note: This includes system_prompt which will be extracted in __init__ - and used separately in run_conversations. The config will be mutated - (name popped, system_prompt passed separately). - """ - # Don't include system_prompt in the config dict since it causes - # duplicate keyword argument errors when run_conversations calls - # create_llm with both system_prompt as arg and **config - # However, we do include it initially so __init__ can extract it + """Basic agent model configuration.""" return { "model": "mock-agent-model", "name": "mock-agent", @@ -164,7 +149,6 @@ def test_init_with_basic_config( # Assert assert runner.persona_model_config == basic_persona_config - # Note: agent_model_config will still have system_prompt assert runner.agent_model_config["model"] == basic_agent_config["model"] assert runner.run_id == run_id assert runner.max_turns == 6 @@ -195,46 +179,105 @@ def test_init_with_custom_parameters( assert runner.folder_name == "test_conversations" assert runner.max_concurrent == 3 - def test_agent_system_prompt_from_config( + @pytest.mark.asyncio + async def test_agent_system_prompt_from_config( self, + tmp_path: Path, basic_persona_config: Dict[str, Any], basic_agent_config: Dict[str, Any], + mock_llm_factory, ) -> None: - """Test that agent system prompt is extracted from config.""" - # Arrange + """Agent create_llm call receives system_prompt from config.""" custom_prompt = "You are a mental health support chatbot." - basic_agent_config["system_prompt"] = custom_prompt + agent_config = copy.deepcopy(basic_agent_config) + agent_config["system_prompt"] = custom_prompt + + create_llm_calls = [] + real_create = mock_llm_factory.side_effect + + def recording_create_llm(*args: Any, **kwargs: Any) -> MockLLM: + create_llm_calls.append(copy.deepcopy(kwargs)) + return real_create(*args, **kwargs) + + mock_llm_factory.side_effect = recording_create_llm - # Act runner = ConversationRunner( persona_model_config=basic_persona_config, - agent_model_config=basic_agent_config, + agent_model_config=agent_config, run_id="test_run", + folder_name=str(tmp_path / "conversations"), ) + persona_config = { + "model": "mock-persona-model", + "prompt": "Test persona prompt", + "name": "TestPersona", + "run": 1, + } - # Assert - assert runner.AGENT_SYSTEM_PROMPT == custom_prompt + with patch( + "generate_conversations.runner.setup_conversation_logger" + ) as mock_logger: + mock_logger.return_value = MagicMock() + await runner.run_single_conversation( + persona_config=persona_config, + max_turns=2, + conversation_index=1, + run_number=1, + ) - def test_agent_system_prompt_default( + agent_calls = [c for c in create_llm_calls if c.get("role") == Role.PROVIDER] + assert len(agent_calls) == 1 + assert agent_calls[0]["system_prompt"] == custom_prompt + + @pytest.mark.asyncio + async def test_agent_system_prompt_default( self, + tmp_path: Path, basic_persona_config: Dict[str, Any], + mock_llm_factory, ) -> None: - """Test default agent system prompt when not in config.""" - # Arrange + """When agent config has no system_prompt, create_llm gets the fallback.""" + default_prompt = "You are a helpful AI assistant." agent_config = { "model": "mock-agent", "name": "test-agent", } + create_llm_calls = [] + real_create = mock_llm_factory.side_effect + + def recording_create_llm(*args: Any, **kwargs: Any) -> MockLLM: + create_llm_calls.append(copy.deepcopy(kwargs)) + return real_create(*args, **kwargs) + + mock_llm_factory.side_effect = recording_create_llm - # Act runner = ConversationRunner( persona_model_config=basic_persona_config, agent_model_config=agent_config, run_id="test_run", + folder_name=str(tmp_path / "conversations"), ) + persona_config = { + "model": "mock-persona-model", + "prompt": "Test persona prompt", + "name": "TestPersona", + "run": 1, + } - # Assert - assert runner.AGENT_SYSTEM_PROMPT == "You are a helpful AI assistant." + with patch( + "generate_conversations.runner.setup_conversation_logger" + ) as mock_logger: + mock_logger.return_value = MagicMock() + await runner.run_single_conversation( + persona_config=persona_config, + max_turns=2, + conversation_index=1, + run_number=1, + ) + + agent_calls = [c for c in create_llm_calls if c.get("role") == Role.PROVIDER] + assert len(agent_calls) == 1 + assert agent_calls[0]["system_prompt"] == default_prompt @pytest.mark.integration @@ -269,14 +312,6 @@ async def test_run_single_conversation_creates_files( "run": 1, } - agent = MockLLM( - name="test-agent", - model_name="mock-agent-model", - responses=["I'm here to help", "Tell me more"], - temperature=0.5, - max_tokens=500, - ) - # Act - patch setup_conversation_logger to use tmp_path with patch( "generate_conversations.runner.setup_conversation_logger" @@ -290,14 +325,13 @@ async def test_run_single_conversation_creates_files( result = await runner.run_single_conversation( persona_config=persona_config, - agent=agent, max_turns=4, - conversation_id=1, + conversation_index=1, run_number=1, ) # Assert - verify result structure - assert result["id"] == 1 + assert result["index"] == 1 assert result["llm1_model"] == "mock-persona-model" assert result["llm1_prompt"] == "TestPersona" assert result["run_number"] == 1 @@ -338,14 +372,6 @@ async def test_run_single_conversation_logging( "run": 1, } - agent = MockLLM( - name="test-agent", - model_name="mock-agent-model", - responses=["Response 1"], - temperature=0.5, - max_tokens=500, - ) - # Act - use real logging with tmp_path os.makedirs(log_folder / run_id, exist_ok=True) log_file = log_folder / run_id / "test_conversation.log" @@ -364,9 +390,8 @@ async def test_run_single_conversation_logging( await runner.run_single_conversation( persona_config=persona_config, - agent=agent, max_turns=2, - conversation_id=1, + conversation_index=1, run_number=1, ) @@ -406,14 +431,6 @@ async def test_filename_generation_format( "run": 2, } - agent = MockLLM( - name="test-agent", - model_name="mock-agent-model", - responses=["Response"], - temperature=0.5, - max_tokens=500, - ) - # Act with patch( "generate_conversations.runner.setup_conversation_logger" @@ -423,9 +440,8 @@ async def test_filename_generation_format( result = await runner.run_single_conversation( persona_config=persona_config, - agent=agent, max_turns=2, - conversation_id=1, + conversation_index=1, run_number=2, ) @@ -494,9 +510,8 @@ def side_effect(*args, **kwargs): # Act result = await runner.run_single_conversation( persona_config=persona_config, - agent=agent_mock, max_turns=10, - conversation_id=1, + conversation_index=1, run_number=1, ) @@ -528,14 +543,6 @@ async def test_conversation_metadata_completeness( "run": 1, } - agent = MockLLM( - name="test-agent", - model_name="mock-agent-model", - responses=["Response"], - temperature=0.5, - max_tokens=500, - ) - # Act with patch( "generate_conversations.runner.setup_conversation_logger" @@ -545,15 +552,14 @@ async def test_conversation_metadata_completeness( result = await runner.run_single_conversation( persona_config=persona_config, - agent=agent, max_turns=2, - conversation_id=5, + conversation_index=5, run_number=3, ) # Assert - verify all required metadata fields required_fields = [ - "id", + "index", "llm1_model", "llm1_prompt", "run_number", @@ -569,7 +575,7 @@ async def test_conversation_metadata_completeness( assert field in result, f"Missing field: {field}" # Verify types - assert isinstance(result["id"], int) + assert isinstance(result["index"], int) assert isinstance(result["turns"], int) assert isinstance(result["duration"], float) assert isinstance(result["early_termination"], bool) @@ -721,14 +727,14 @@ async def test_no_concurrent_limit( # Assert assert len(results) == 2 # 1 persona * 2 runs - async def test_conversation_id_increments( + async def test_conversation_index_increments( self, tmp_path: Path, basic_persona_config: Dict[str, Any], basic_agent_config: Dict[str, Any], mock_llm_factory, ) -> None: - """Test that conversation IDs increment correctly.""" + """Test that conversation indices increment correctly.""" # Arrange conv_folder = tmp_path / "conversations" runner = create_test_runner( @@ -757,9 +763,75 @@ async def test_conversation_id_increments( # Act results = await runner.run_conversations(persona_names=None) - # Assert - IDs should be 1, 2, 3, 4 - ids = sorted([r["id"] for r in results]) - assert ids == [1, 2, 3, 4] + # Assert - indices should be 1, 2, 3, 4 + indices = sorted([r["index"] for r in results]) + assert indices == [1, 2, 3, 4] + + async def test_agent_config_not_mutated_across_concurrent_conversations( + self, + tmp_path: Path, + basic_persona_config: Dict[str, Any], + basic_agent_config: Dict[str, Any], + mock_llm_factory, + ) -> None: + """Ensure agent_model_config is not mutated when running multiple conversations. + + Each run_single_conversation creates an agent from the shared config; the config + must remain intact so every conversation gets the same name and system_prompt. + """ + expected_name = "shared-agent-name" + expected_prompt = "Shared system prompt for all runs." + + create_llm_calls = [] + real_create = mock_llm_factory.side_effect + + def recording_create_llm(*args: Any, **kwargs: Any) -> MockLLM: + create_llm_calls.append(copy.deepcopy(kwargs)) + return real_create(*args, **kwargs) + + mock_llm_factory.side_effect = recording_create_llm + + agent_config = copy.deepcopy(basic_agent_config) + agent_config["name"] = expected_name + agent_config["system_prompt"] = expected_prompt + + conv_folder = tmp_path / "conversations" + runner = create_test_runner( + basic_persona_config, + agent_config, + "test_run", + max_turns=2, + runs_per_prompt=2, + folder_name=str(conv_folder), + ) + mock_personas = [ + {"Name": "Persona1", "prompt": "Prompt 1"}, + {"Name": "Persona2", "prompt": "Prompt 2"}, + ] + + with ( + patch("generate_conversations.runner.load_prompts_from_csv") as mock_load, + patch( + "generate_conversations.runner.setup_conversation_logger" + ) as mock_logger, + ): + mock_load.return_value = mock_personas + mock_logger.return_value = MagicMock() + await runner.run_conversations(persona_names=None) + + assert runner.agent_model_config["name"] == expected_name + assert runner.agent_model_config["system_prompt"] == expected_prompt + + agent_calls = [ + c + for c in create_llm_calls + if c.get("model_name") == basic_agent_config["model"] + ] + assert ( + len(agent_calls) == 4 + ), "Expected 4 conversations => 4 agent create_llm calls" + assert all(c.get("name") == expected_name for c in agent_calls) + assert all(c.get("system_prompt") == expected_prompt for c in agent_calls) @pytest.mark.integration @@ -793,14 +865,6 @@ async def test_conversation_folder_creation( "run": 1, } - agent = MockLLM( - name="test-agent", - model_name="mock-agent-model", - responses=["Response"], - temperature=0.5, - max_tokens=500, - ) - # Act with patch( "generate_conversations.runner.setup_conversation_logger" @@ -810,9 +874,8 @@ async def test_conversation_folder_creation( await runner.run_single_conversation( persona_config=persona_config, - agent=agent, max_turns=2, - conversation_id=1, + conversation_index=1, run_number=1, ) @@ -901,13 +964,13 @@ async def test_filename_uniqueness( class TestConversationRunnerErrorHandling: """Test error handling and edge cases.""" - async def test_handles_llm_errors_gracefully( + async def test_llm_errors_propagate_from_run_single_conversation( self, tmp_path: Path, basic_persona_config: Dict[str, Any], basic_agent_config: Dict[str, Any], ) -> None: - """Test that LLM errors are handled gracefully.""" + """When the agent LLM errors during conversation, the exception propagates.""" # Arrange conv_folder = tmp_path / "conversations" runner = ConversationRunner( @@ -924,7 +987,7 @@ async def test_handles_llm_errors_gracefully( "run": 1, } - # Create agent that will error + # Create agent that will error; persona mock for the other create_llm call error_agent = MockLLM( name="error-agent", model_name="mock-error-model", @@ -933,6 +996,13 @@ async def test_handles_llm_errors_gracefully( temperature=0.5, max_tokens=500, ) + persona_mock = MockLLM( + name="persona", + model_name="mock-persona-model", + responses=["Hello"], + temperature=0.7, + max_tokens=1000, + ) with patch( "generate_conversations.runner.setup_conversation_logger" @@ -943,21 +1013,15 @@ async def test_handles_llm_errors_gracefully( with patch( "generate_conversations.runner.LLMFactory.create_llm" ) as mock_factory: - mock_factory.return_value = MockLLM( - name="persona", - model_name="mock-persona-model", - responses=["Hello"], - temperature=0.7, - max_tokens=1000, - ) + # run_single_conversation creates persona first, then agent + mock_factory.side_effect = [persona_mock, error_agent] # Act & Assert - should raise the error with pytest.raises(Exception) as exc_info: await runner.run_single_conversation( persona_config=persona_config, - agent=error_agent, max_turns=2, - conversation_id=1, + conversation_index=1, run_number=1, ) @@ -1019,14 +1083,6 @@ async def test_logger_cleanup_called( "run": 1, } - agent = MockLLM( - name="test-agent", - model_name="mock-agent-model", - responses=["Response"], - temperature=0.5, - max_tokens=500, - ) - # Act with patch( "generate_conversations.runner.setup_conversation_logger" @@ -1037,9 +1093,8 @@ async def test_logger_cleanup_called( with patch("generate_conversations.runner.cleanup_logger") as mock_cleanup: await runner.run_single_conversation( persona_config=persona_config, - agent=agent, max_turns=2, - conversation_id=1, + conversation_index=1, run_number=1, ) @@ -1076,14 +1131,6 @@ async def test_duration_tracking( "run": 1, } - agent = MockLLM( - name="test-agent", - model_name="mock-agent-model", - responses=["Response"], - temperature=0.5, - max_tokens=500, - ) - # Act with patch( "generate_conversations.runner.setup_conversation_logger" @@ -1094,9 +1141,8 @@ async def test_duration_tracking( start = time.time() result = await runner.run_single_conversation( persona_config=persona_config, - agent=agent, max_turns=2, - conversation_id=1, + conversation_index=1, run_number=1, ) end = time.time() diff --git a/tests/mocks/mock_llm.py b/tests/mocks/mock_llm.py index d89a0377..5e60f9b8 100644 --- a/tests/mocks/mock_llm.py +++ b/tests/mocks/mock_llm.py @@ -74,10 +74,6 @@ async def generate_response( return response - def get_last_response_metadata(self) -> Dict[str, Any]: - """Get metadata from the last response.""" - return self.last_response_metadata.copy() - def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" self.system_prompt = system_prompt diff --git a/tests/unit/llm_clients/test_azure_llm.py b/tests/unit/llm_clients/test_azure_llm.py index 16614013..5e9e6aa7 100644 --- a/tests/unit/llm_clients/test_azure_llm.py +++ b/tests/unit/llm_clients/test_azure_llm.py @@ -348,7 +348,7 @@ async def test_generate_response_without_usage_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["usage"] == {} @pytest.mark.asyncio @@ -372,7 +372,7 @@ async def test_generate_response_without_response_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["model"] == "gpt-5.2" assert metadata["usage"] == {} assert metadata["finish_reason"] is None @@ -445,11 +445,11 @@ async def test_generate_response_tracks_timing( llm = AzureLLM(name="TestAzure", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_response_timing(metadata) - def test_get_last_response_metadata_returns_copy(self): - """Test that get_last_response_metadata returns a copy.""" + def test_last_response_metadata_copy_returns_copy(self): + """Test that last_response_metadata.copy() returns a copy, not the original.""" llm = AzureLLM(name="TestAzure", role=Role.PERSONA) assert_metadata_copy_behavior(llm) @@ -535,7 +535,7 @@ class TestResponse(BaseModel): assert "Structured output failed" in str(exc_info.value) # Verify error metadata was stored - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "error" in metadata assert "Structured output failed" in metadata["error"] @@ -605,7 +605,7 @@ async def test_timestamp_format( llm = AzureLLM(name="TestAzure", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_iso_timestamp(metadata["timestamp"]) @pytest.mark.asyncio @@ -664,7 +664,7 @@ async def test_generate_response_with_partial_usage_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Partial usage response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["usage"]["input_tokens"] == 15 assert metadata["usage"]["output_tokens"] == 0 assert metadata["usage"]["total_tokens"] == 0 @@ -685,7 +685,7 @@ async def test_metadata_includes_response_object( llm = AzureLLM(name="TestAzure", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "response" in metadata assert metadata["response"] == mock_response @@ -709,7 +709,7 @@ async def test_metadata_with_finish_reason( llm = AzureLLM(name="TestAzure", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["finish_reason"] == "length" @pytest.mark.asyncio @@ -738,7 +738,7 @@ async def test_raw_metadata_stored( llm = AzureLLM(name="TestAzure", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "raw_metadata" in metadata assert metadata["raw_metadata"]["custom_field"] == "custom_value" assert metadata["raw_metadata"]["nested"]["key"] == "value" diff --git a/tests/unit/llm_clients/test_base_llm.py b/tests/unit/llm_clients/test_base_llm.py index f401126c..31a77829 100644 --- a/tests/unit/llm_clients/test_base_llm.py +++ b/tests/unit/llm_clients/test_base_llm.py @@ -198,8 +198,8 @@ async def test_generate_response_updates_metadata( assert_iso_timestamp(metadata["timestamp"]) assert_response_timing(metadata) - def test_get_last_response_metadata_returns_copy(self): - """Test that get_last_response_metadata returns a copy, not original.""" + def test_last_response_metadata_copy_returns_copy(self): + """Test that last_response_metadata.copy() returns a copy, not the original.""" with self.get_mock_patches(): # pyright: ignore[reportGeneralTypeIssues] llm = self.create_llm(role=Role.PROVIDER, name="TestLLM") @@ -382,7 +382,7 @@ class TestResponse(BaseModel): assert "Structured output failed" in str(exc_info.value) # Verify error metadata was stored - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "error" in metadata assert "Structured output failed" in metadata["error"] assert metadata["provider"] == self.get_provider_name() diff --git a/tests/unit/llm_clients/test_claude_llm.py b/tests/unit/llm_clients/test_claude_llm.py index c3b1c2bb..3b2cfa29 100644 --- a/tests/unit/llm_clients/test_claude_llm.py +++ b/tests/unit/llm_clients/test_claude_llm.py @@ -203,7 +203,7 @@ async def test_generate_response_without_usage_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["usage"] == {} @pytest.mark.asyncio @@ -229,7 +229,7 @@ async def test_generate_response_without_response_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["model"] == "claude-sonnet-4-5-20250929" assert metadata["usage"] == {} assert metadata["stop_reason"] is None @@ -275,11 +275,11 @@ async def test_generate_response_tracks_timing( llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_response_timing(metadata) - def test_get_last_response_metadata_returns_copy(self): - """Test that get_last_response_metadata returns a copy.""" + def test_last_response_metadata_copy_returns_copy(self): + """Test that last_response_metadata.copy() returns a copy, not the original.""" llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA) assert_metadata_copy_behavior(llm) @@ -319,7 +319,7 @@ async def test_generate_response_with_partial_usage_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Partial usage response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["usage"]["input_tokens"] == 15 assert metadata["usage"]["output_tokens"] == 0 # Default value assert metadata["usage"]["total_tokens"] == 15 @@ -343,7 +343,7 @@ async def test_metadata_includes_response_object( llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "response" in metadata assert metadata["response"] == mock_response @@ -366,7 +366,7 @@ async def test_timestamp_format( llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_iso_timestamp(metadata["timestamp"]) @pytest.mark.asyncio @@ -394,7 +394,7 @@ async def test_metadata_with_stop_reason( llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["stop_reason"] == "max_tokens" @pytest.mark.asyncio @@ -423,7 +423,7 @@ async def test_raw_metadata_stored( llm = ClaudeLLM(name="TestClaude", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "raw_metadata" in metadata assert metadata["raw_metadata"]["custom_field"] == "custom_value" assert metadata["raw_metadata"]["nested"]["key"] == "value" @@ -680,7 +680,7 @@ class TestResponse(BaseModel): assert "Structured output failed" in str(exc_info.value) # Verify error metadata was stored - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "error" in metadata assert "Structured output failed" in metadata["error"] @@ -709,7 +709,7 @@ class SimpleResponse(BaseModel): llm = ClaudeLLM(name="TestClaude", role=Role.JUDGE) await llm.generate_structured_response("Test", SimpleResponse) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Verify required fields assert metadata["provider"] == "claude" diff --git a/tests/unit/llm_clients/test_gemini_llm.py b/tests/unit/llm_clients/test_gemini_llm.py index 90138655..a8a876b0 100644 --- a/tests/unit/llm_clients/test_gemini_llm.py +++ b/tests/unit/llm_clients/test_gemini_llm.py @@ -206,7 +206,7 @@ async def test_generate_response_with_fallback_token_usage( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response with fallback" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Should use fallback structure assert metadata["usage"]["prompt_tokens"] == 10 assert metadata["usage"]["completion_tokens"] == 20 @@ -234,7 +234,7 @@ async def test_generate_response_without_usage_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["usage"] == {} @pytest.mark.asyncio @@ -258,7 +258,7 @@ async def test_generate_response_without_response_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["model"] == "gemini-1.5-pro" assert metadata["usage"] == {} assert metadata["finish_reason"] is None @@ -300,11 +300,11 @@ async def test_generate_response_tracks_timing( llm = GeminiLLM(name="TestGemini", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_response_timing(metadata) - def test_get_last_response_metadata_returns_copy(self): - """Test that get_last_response_metadata returns a copy.""" + def test_last_response_metadata_copy_returns_copy(self): + """Test that last_response_metadata.copy() returns a copy, not the original.""" llm = GeminiLLM(name="TestGemini", role=Role.PERSONA) assert_metadata_copy_behavior(llm) @@ -336,7 +336,7 @@ async def test_metadata_includes_response_object( llm = GeminiLLM(name="TestGemini", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "response" in metadata assert metadata["response"] == mock_response @@ -358,7 +358,7 @@ async def test_timestamp_format( llm = GeminiLLM(name="TestGemini", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_iso_timestamp(metadata["timestamp"]) @pytest.mark.asyncio @@ -385,7 +385,7 @@ async def test_finish_reason_extraction( llm = GeminiLLM(name="TestGemini", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["finish_reason"] == "MAX_TOKENS" @pytest.mark.asyncio @@ -410,7 +410,7 @@ async def test_raw_metadata_stored(self, mock_chat_gemini, mock_system_message): llm = GeminiLLM(name="TestGemini", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "raw_metadata" in metadata assert metadata["raw_metadata"]["custom_field"] == "custom_value" assert metadata["raw_metadata"]["nested"]["key"] == "value" @@ -578,7 +578,7 @@ async def test_generate_response_with_partial_usage_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Partial usage response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["usage"]["prompt_token_count"] == 15 assert metadata["usage"]["candidates_token_count"] == 0 # Default value assert ( @@ -709,7 +709,7 @@ class TestResponse(BaseModel): assert "Structured output failed" in str(exc_info.value) # Verify error metadata was stored - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "error" in metadata assert "Structured output failed" in metadata["error"] @@ -737,7 +737,7 @@ class SimpleResponse(BaseModel): llm = GeminiLLM(name="TestGemini", role=Role.JUDGE) await llm.generate_structured_response("Test", SimpleResponse) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Verify required fields assert metadata["provider"] == "gemini" diff --git a/tests/unit/llm_clients/test_helpers.py b/tests/unit/llm_clients/test_helpers.py index 76196a32..5f80cd6c 100644 --- a/tests/unit/llm_clients/test_helpers.py +++ b/tests/unit/llm_clients/test_helpers.py @@ -53,7 +53,7 @@ def assert_metadata_structure( Raises: AssertionError: If metadata structure is invalid """ - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Check required fields exist assert "model" in metadata, "Metadata missing 'model' field" @@ -102,11 +102,12 @@ def assert_iso_timestamp(timestamp: str) -> None: def assert_metadata_copy_behavior(llm: LLMInterface) -> None: - """Assert that get_last_response_metadata returns a copy. + """Assert that last_response_metadata (property) returns a deep copy. Verifies that: 1. Multiple calls return equal but different objects - 2. Modifying returned dict doesn't affect internal state + 2. Modifying returned dict (top-level) doesn't affect internal state + 3. Modifying nested dicts (e.g. usage) doesn't affect internal state Args: llm: LLM instance to test @@ -117,8 +118,8 @@ def assert_metadata_copy_behavior(llm: LLMInterface) -> None: # Set some test metadata llm.last_response_metadata = {"test": "value"} - metadata1 = llm.get_last_response_metadata() - metadata2 = llm.get_last_response_metadata() + metadata1 = llm.last_response_metadata + metadata2 = llm.last_response_metadata # Should be equal but not the same object assert metadata1 == metadata2, "Multiple calls should return equal dicts" @@ -130,6 +131,15 @@ def assert_metadata_copy_behavior(llm: LLMInterface) -> None: "modified" not in llm.last_response_metadata ), "Modification leaked to internal state" + # Nested mutation must not affect internal state (deep copy) + llm.last_response_metadata = {"usage": {"input_tokens": 10, "output_tokens": 5}} + meta = llm.last_response_metadata + meta["usage"]["input_tokens"] = 999 + fresh = llm.last_response_metadata + assert ( + fresh["usage"]["input_tokens"] == 10 + ), "Nested mutation leaked to internal state (expected deep copy)" + def assert_response_timing(metadata: Dict[str, Any]) -> None: """Assert that metadata contains valid response timing information. @@ -165,7 +175,7 @@ def assert_error_metadata( Raises: AssertionError: If error metadata is invalid """ - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Check error field exists and contains expected substring assert "error" in metadata, "Metadata missing 'error' field" diff --git a/tests/unit/llm_clients/test_llm_interface.py b/tests/unit/llm_clients/test_llm_interface.py index dd60f7a2..13b479b5 100644 --- a/tests/unit/llm_clients/test_llm_interface.py +++ b/tests/unit/llm_clients/test_llm_interface.py @@ -1,3 +1,4 @@ +import uuid from typing import Optional from unittest.mock import MagicMock @@ -153,6 +154,7 @@ def test_multiple_instances_have_independent_state(self): assert llm2.name == "LLM2" assert llm1.system_prompt == "Prompt 1" assert llm2.system_prompt == "Prompt 2" + assert llm1.conversation_id != llm2.conversation_id # Modify one shouldn't affect the other llm1.set_system_prompt("Modified Prompt 1") @@ -204,3 +206,53 @@ def set_system_prompt(self, system_prompt: str) -> None: assert isinstance(llm.float_attr, float) assert isinstance(llm.bool_attr, bool) assert isinstance(llm.list_attr, list) + + def test_init_sets_conversation_id(self): + """Test that conversation_id is set at init (e.g. UUID).""" + llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER) + assert llm.conversation_id is not None + assert isinstance(llm.conversation_id, str) + assert len(llm.conversation_id) > 0 + + def test_create_conversation_id_returns_string(self): + """Test that create_conversation_id returns a non-empty string (e.g. UUID).""" + llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER) + cid = llm.create_conversation_id() + assert isinstance(cid, str) + assert len(cid) > 0 + + def test_create_conversation_id_returns_distinct_valid_uuid(self): + """Test that repeated calls return distinct values and each is a valid UUID.""" + llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER) + ids = [llm.create_conversation_id() for _ in range(50)] + assert len(ids) == len(set(ids)), "ids must be distinct" + for cid in ids: + uuid.UUID(cid) # valid UUID string + + def test_update_conversation_id_from_metadata_leaves_unchanged_when_absent(self): + """ + Test _update_conversation_id_from_metadata preserves conversation_id + when key is absent from response metadata. + """ + llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER) + original = llm.conversation_id + llm._last_response_metadata = {} + llm._update_conversation_id_from_metadata() + assert llm.conversation_id == original + + def test_update_conversation_id_from_metadata_overwrites_when_present(self): + """ + Test _update_conversation_id_from_metadata overwrites self.conversation_id + with API-returned conversation_id. + """ + llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER) + llm._last_response_metadata = {"conversation_id": "api-provided-id"} + llm._update_conversation_id_from_metadata() + assert llm.conversation_id == "api-provided-id" + + @pytest.mark.asyncio + async def test_conversation_id_available_after_generate_response(self): + """Test that conversation_id remains set after generate_response.""" + llm = ConcreteLLM(name="TestLLM", role=Role.PROVIDER) + await llm.generate_response(conversation_history=[]) + assert llm.conversation_id is not None diff --git a/tests/unit/llm_clients/test_ollama_llm.py b/tests/unit/llm_clients/test_ollama_llm.py index aad5a61d..e6626c33 100644 --- a/tests/unit/llm_clients/test_ollama_llm.py +++ b/tests/unit/llm_clients/test_ollama_llm.py @@ -585,8 +585,8 @@ async def test_generate_response_with_persona_role_flips_types(self, mock_ollama assert "Assistant: How are you?" in call_args assert "Assistant:" in call_args - def test_get_last_response_metadata_returns_copy(self): - """Test that get_last_response_metadata returns a copy.""" + def test_last_response_metadata_copy_returns_copy(self): + """Test that last_response_metadata.copy() returns a copy, not the original.""" from llm_clients.ollama_llm import OllamaLLM llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER) @@ -653,7 +653,7 @@ async def test_metadata_tracks_timing(self, mock_ollama, mock_system_message): llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_response_timing(metadata) @pytest.mark.asyncio @@ -669,7 +669,7 @@ async def test_timestamp_format(self, mock_ollama, mock_system_message): llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_iso_timestamp(metadata["timestamp"]) @pytest.mark.asyncio @@ -685,7 +685,7 @@ async def test_metadata_structure_complete(self, mock_ollama, mock_system_messag llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER, model_name="mistral:7b") await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Verify all expected fields are present using helper assert_metadata_structure( @@ -710,7 +710,7 @@ async def test_metadata_initialized_empty(self, mock_ollama): llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER) # Before any response, metadata should be empty - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata == {} @pytest.mark.asyncio @@ -730,7 +730,7 @@ async def test_usage_metadata_always_empty(self, mock_ollama, mock_system_messag llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["usage"] == {} # Ollama doesn't have these fields assert "prompt_tokens" not in metadata["usage"] @@ -755,7 +755,7 @@ async def test_no_response_object_in_metadata( llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Ollama doesn't store the response object assert "response" not in metadata @@ -775,7 +775,7 @@ async def test_no_finish_reason_in_metadata(self, mock_ollama, mock_system_messa llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Ollama doesn't have finish_reason assert "finish_reason" not in metadata assert "stop_reason" not in metadata @@ -796,7 +796,7 @@ async def test_no_raw_metadata_stored(self, mock_ollama, mock_system_message): llm = OllamaLLM(name="test-ollama", role=Role.PROVIDER) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Ollama doesn't store raw_metadata or raw_response_metadata assert "raw_metadata" not in metadata assert "raw_response_metadata" not in metadata diff --git a/tests/unit/llm_clients/test_openai_llm.py b/tests/unit/llm_clients/test_openai_llm.py index d4eb3ce9..a69ab3cb 100644 --- a/tests/unit/llm_clients/test_openai_llm.py +++ b/tests/unit/llm_clients/test_openai_llm.py @@ -209,7 +209,7 @@ async def test_generate_response_without_additional_kwargs( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["additional_kwargs"] == {} @pytest.mark.asyncio @@ -232,7 +232,7 @@ async def test_generate_response_without_response_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["model"] == "gpt-5.2" assert metadata["usage"] == {} assert metadata["finish_reason"] is None @@ -266,7 +266,7 @@ async def test_generate_response_without_usage_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Should still have usage from token_usage assert metadata["usage"]["prompt_tokens"] == 10 assert metadata["usage"]["completion_tokens"] == 20 @@ -311,11 +311,11 @@ async def test_generate_response_tracks_timing( llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_response_timing(metadata) - def test_get_last_response_metadata_returns_copy(self): - """Test that get_last_response_metadata returns a copy.""" + def test_last_response_metadata_copy_returns_copy(self): + """Test that last_response_metadata.copy() returns a copy, not the original.""" llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA) assert_metadata_copy_behavior(llm) @@ -347,7 +347,7 @@ async def test_metadata_includes_response_object( llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "response" in metadata assert metadata["response"] == mock_response @@ -369,7 +369,7 @@ async def test_timestamp_format( llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert_iso_timestamp(metadata["timestamp"]) @pytest.mark.asyncio @@ -393,7 +393,7 @@ async def test_model_name_update_from_metadata( llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA, model_name="gpt-4o") await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["model"] == "gpt-4o-0613-updated" @pytest.mark.asyncio @@ -559,7 +559,7 @@ async def test_generate_response_with_partial_usage_metadata( response = await llm.generate_response(conversation_history=mock_system_message) assert response == "Partial usage response" - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["usage"]["prompt_tokens"] == 15 assert metadata["usage"]["completion_tokens"] == 0 # Default value assert ( @@ -587,7 +587,7 @@ async def test_metadata_with_finish_reason( llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert metadata["finish_reason"] == "length" @pytest.mark.asyncio @@ -615,7 +615,7 @@ async def test_raw_metadata_stored( llm = OpenAILLM(name="TestOpenAI", role=Role.PERSONA) await llm.generate_response(conversation_history=mock_system_message) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # OpenAI uses 'raw_response_metadata' instead of 'raw_metadata' assert "raw_response_metadata" in metadata assert metadata["raw_response_metadata"]["custom_field"] == "custom_value" @@ -745,7 +745,7 @@ class TestResponse(BaseModel): assert "Structured output failed" in str(exc_info.value) # Verify error metadata was stored - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata assert "error" in metadata assert "Structured output failed" in metadata["error"] @@ -773,7 +773,7 @@ class SimpleResponse(BaseModel): llm = OpenAILLM(name="TestOpenAI", role=Role.JUDGE) await llm.generate_structured_response("Test", SimpleResponse) - metadata = llm.get_last_response_metadata() + metadata = llm.last_response_metadata # Verify required fields assert metadata["provider"] == "openai"