Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions docs/evaluating.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,9 @@ def set_system_prompt(self, system_prompt: str) -> None:
self.system_prompt = system_prompt
```

#### `get_last_response_metadata()` - Get response metadata (optional but recommended)
```python
def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()
```
#### `last_response_metadata` - Response metadata (required)

The base class `__init__` initializes it to `{}`. Update it in `generate_response()` by assigning a new dict: `self.last_response_metadata = {...}`. For in-place updates (e.g. setting a single `"usage"` key), write through `self._last_response_metadata` instead, so the stored dict itself is modified — mutating `self.last_response_metadata[...]` would only change a copy. The property getter returns a copy, so callers can read `last_response_metadata` without mutating the client's internal state.

### 3. Add the new LLM client to the factory

Expand Down Expand Up @@ -227,6 +224,18 @@ python3 judge.py -f conversations/{YOUR_FOLDER} -j your-model-name
- **LangChain Integration**: The provided implementations use LangChain for robust LLM interactions
- **Error Handling**: Make sure to handle errors gracefully and return appropriate error messages

### Conversation flow and history

ConversationSimulator holds the full conversation and passes `conversation_history` into your client on every call. Your client is not required to store history. You can:

- **Stateless**: Build each request from `conversation_history` (as the built-in clients do), or
- **Server-side state**: Send a `conversation_id` to your API and let the server maintain the conversation; in that case you may use `conversation_history` only when needed (e.g. fallback or logging).

**When your endpoint requires a conversation id** (the built-in clients do not; this is for custom clients):

- `conversation_id` is set in the base class `__init__`, so you always have one to send as request metadata. Use `self.conversation_id` when your API needs a conversation ID.
- For LLM clients that require `conversation_id` handling, `generate_response()` must set `conversation_id` in `_last_response_metadata` (an interface requirement). If your API returns its own `conversation_id` in the response metadata (e.g. it ignores the one you send), call `self._update_conversation_id_from_metadata()` at the end of `generate_response()`, after `_last_response_metadata` has been set; this overwrites `self.conversation_id` with the API's value.

## Structured Output Support

### Native Support (Recommended)
Expand Down
2 changes: 1 addition & 1 deletion generate_conversations/conversation_simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ async def start_conversation(
input_message=input_msg,
response_message=lc_message,
early_termination=False,
logging_metadata=current_speaker.get_last_response_metadata(),
logging_metadata=current_speaker.last_response_metadata,
)
self.conversation_history.append(turn_obj)

Expand Down
82 changes: 46 additions & 36 deletions generate_conversations/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,20 +49,30 @@ def __init__(
self.max_total_words = max_total_words
self.max_personas = max_personas

self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
)

async def run_single_conversation(
self,
persona_config: dict,
agent,
max_turns: int,
conversation_id: int,
conversation_index: int,
run_number: int,
**kargs: dict,
**kwargs: dict,
) -> Dict[str, Any]:
"""Run a single conversation asynchronously."""
"""Run a single simulated conversation (persona vs provider LLM).

Uses fresh LLM instances per call; safe for concurrent use. Logs turns,
writes transcript to self.folder_name, then cleans up logger and LLMs.

Args:
persona_config (dict): Must have "model", "prompt", "name".
max_turns (int): Max conversation turns for a conversation.
conversation_index (int): Index in the batch of conversations.
run_number (int): Run index for this prompt (e.g. 1 of runs_per_prompt).
**kwargs: Unused; reserved for future use.

Returns:
Dict[str, Any]: index, llm1_model, llm1_prompt, run_number, turns,
filename, log_file, duration, early_termination, conversation.
"""
model_name = persona_config["model"]
system_prompt = persona_config["prompt"] # This is now the full persona prompt
persona_name = persona_config["name"]
Expand All @@ -83,7 +93,7 @@ async def run_single_conversation(
logger = setup_conversation_logger(filename_base, run_id=self.run_id)
start_time = time.time()

# Create LLM1 instance with the persona prompt and configuration
# Create persona instance
persona = LLMFactory.create_llm(
model_name=model_name,
name=f"{model_short} {persona_name}",
Expand All @@ -92,6 +102,23 @@ async def run_single_conversation(
**self.persona_model_config,
)

# Create new agent instance to reset conversation_id and metadata.
# Exclude selected kwargs to avoid duplicate args expected in create_llm.
agent_kwargs = {
k: v
for k, v in self.agent_model_config.items()
if k not in ("model", "name", "system_prompt")
}
agent = LLMFactory.create_llm(
model_name=self.agent_model_config["model"],
name=self.agent_model_config.get("name", "Provider"),
system_prompt=self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would maybe have this as an optional arg with the default, since it seems a potentially consequential decision and it's buried here

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

),
role=Role.PROVIDER,
**agent_kwargs,
)

# Log conversation start
log_conversation_start(
logger=logger,
Expand Down Expand Up @@ -148,7 +175,7 @@ async def run_single_conversation(
simulator.save_conversation(f"{filename_base}.txt", self.folder_name)

result = {
"id": conversation_id,
"index": conversation_index,
"llm1_model": model_name,
"llm1_prompt": persona_name,
"run_number": run_number,
Expand All @@ -164,11 +191,12 @@ async def run_single_conversation(

# Cleanup LLM resources (e.g., close HTTP sessions for Azure)
# Always cleanup, even if conversation failed
try:
await persona.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup persona LLM: {e}")
for llm in (persona, agent):
try:
await llm.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup LLM: {e}")

return result

Expand All @@ -179,37 +207,26 @@ async def run_conversations(
# Load prompts from CSV based on persona names
personas = load_prompts_from_csv(persona_names, max_personas=self.max_personas)

# Load agent configuration (fixed, shared across all conversations)
agent = LLMFactory.create_llm(
model_name=self.agent_model_config["model"],
name=self.agent_model_config.pop("name"),
system_prompt=self.AGENT_SYSTEM_PROMPT,
role=Role.PROVIDER,
**self.agent_model_config,
)

# Create tasks for all conversations (each prompt run multiple times)
tasks = []
conversation_id = 1
conversation_index = 1

for persona in personas:
for run in range(1, self.runs_per_prompt + 1):
tasks.append(
# TODO: should we pass the persona object here?
self.run_single_conversation(
{
"model": self.persona_model_config["model"],
"prompt": persona["prompt"],
"name": persona["Name"],
"run": run,
},
agent,
self.max_turns,
conversation_id,
conversation_index,
run,
)
)
conversation_id += 1
conversation_index += 1

# Run all conversations with concurrency limit
start_time = datetime.now()
Expand Down Expand Up @@ -237,11 +254,4 @@ async def run_with_limit(task):

print(f"\nCompleted {len(results)} conversations in {total_time:.2f} seconds")

# Cleanup agent LLM resources (e.g., close HTTP sessions for Azure)
try:
await agent.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup agent LLM: {e}")

return results
10 changes: 3 additions & 7 deletions llm_clients/azure_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,19 +187,19 @@ async def generate_response(
# Extract token usage
if "token_usage" in metadata:
usage = metadata["token_usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
Copy link
Collaborator

@emily-vanark emily-vanark Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is off-topic, but... do we save this metadata anywhere? It doesn't seem to be in the logging output for chat generation, or the logs output for judging... but if we have total token usage for each conversation and judging evaluation somewhere that we could write out, it would help... everyone with understanding costs. (This is probably a separate ticket... but only if we really aren't storing it anywhere.)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}

# Extract finish reason
self.last_response_metadata["finish_reason"] = metadata.get(
self._last_response_metadata["finish_reason"] = metadata.get(
"finish_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

return response.text
except Exception as e:
Expand Down Expand Up @@ -307,10 +307,6 @@ async def generate_structured_response(
}
raise RuntimeError(f"Error generating structured response: {str(e)}") from e

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
    """Replace the client's stored system prompt with ``system_prompt``."""
    self.system_prompt = system_prompt
Expand Down
12 changes: 5 additions & 7 deletions llm_clients/claude_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,18 +118,20 @@ async def generate_response(
# Extract token usage
if "usage" in metadata:
usage = metadata["usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("input_tokens", 0)
+ usage.get("output_tokens", 0),
}

# Extract stop reason
self.last_response_metadata["stop_reason"] = metadata.get("stop_reason")
self._last_response_metadata["stop_reason"] = metadata.get(
"stop_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

return response.text
except Exception as e:
Expand Down Expand Up @@ -204,10 +206,6 @@ async def generate_structured_response(
}
raise RuntimeError(f"Error generating structured response: {str(e)}") from e

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
    """Overwrite the system prompt used by this client."""
    self.system_prompt = system_prompt
12 changes: 4 additions & 8 deletions llm_clients/gemini_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ async def generate_response(
# Extract token usage - Gemini may have different structure
if "usage_metadata" in metadata:
usage = metadata["usage_metadata"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"prompt_token_count": usage.get("prompt_token_count", 0),
"candidates_token_count": usage.get(
"candidates_token_count", 0
Expand All @@ -126,19 +126,19 @@ async def generate_response(
elif "token_usage" in metadata:
# Fallback structure
usage = metadata["token_usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}

# Extract finish reason
self.last_response_metadata["finish_reason"] = metadata.get(
self._last_response_metadata["finish_reason"] = metadata.get(
"finish_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

return response.text
except Exception as e:
Expand All @@ -154,10 +154,6 @@ async def generate_response(
}
return f"Error generating response: {str(e)}"

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

async def generate_structured_response(
self, message: Optional[str], response_model: Type[T]
) -> T:
Expand Down
47 changes: 44 additions & 3 deletions llm_clients/llm_interface.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import copy
import uuid
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, Dict, List, Optional, Type, TypeVar
Expand Down Expand Up @@ -31,7 +33,39 @@ def __init__(
self.name = name
self.role = role
self.system_prompt = system_prompt or ""
self.last_response_metadata: Dict[str, Any] = {}
self._last_response_metadata: Dict[str, Any] = {}
self.conversation_id = self.create_conversation_id()

@property
def last_response_metadata(self) -> Dict[str, Any]:
    """Metadata recorded by the most recent ``generate_response`` call.

    A deep copy is returned, so callers may freely mutate the result
    (including nested dicts such as ``"usage"``) without affecting the
    client's internal state.
    """
    return copy.deepcopy(self._last_response_metadata)

@last_response_metadata.setter
def last_response_metadata(self, value: Optional[Dict[str, Any]]) -> None:
    """Replace the stored metadata; a falsy value resets it to ``{}``.

    For in-place updates, mutate ``_last_response_metadata`` directly.
    """
    self._last_response_metadata = value if value else {}

def create_conversation_id(self) -> str:
    """Generate a fresh, globally unique conversation identifier.

    Invoked at construction time, and used again whenever the backing
    API does not return an id of its own in the response metadata.
    Subclasses may override this to produce ids in a different format.
    """
    new_id = uuid.uuid4()
    return str(new_id)

def _update_conversation_id_from_metadata(self) -> None:
"""If the API returned a conversation_id in response metadata, use it.

Call after generate_response once _last_response_metadata is set.
APIs that ignore our request conversation_id but return their own
will overwrite self.conversation_id here.
"""
cid = (self._last_response_metadata or {}).get("conversation_id")
if cid is not None:
self.conversation_id = cid

@abstractmethod
async def generate_response(
Expand All @@ -49,8 +83,15 @@ async def generate_response(
starting the conversation.

Returns:
str: The response text. Metadata available via
get_last_response_metadata()
str: The response text. Metadata in self.last_response_metadata
(getter returns a copy so callers need not copy).

Note:
For API thread/session identification, use self.conversation_id
(set at init; send as request metadata). If your API returns a
conversation_id in response metadata, call
self._update_conversation_id_from_metadata() after setting
_last_response_metadata to overwrite.
"""
pass

Expand Down
4 changes: 0 additions & 4 deletions llm_clients/ollama_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,6 @@ async def generate_response(
}
return f"Error generating response: {str(e)}"

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
    """Set or update the system prompt; any previous value is discarded."""
    self.system_prompt = system_prompt
Loading