Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
c4c9b71
clean up convo sim tests
jgieringer Feb 2, 2026
478f9bf
update generation util tests
jgieringer Feb 2, 2026
315b152
parse_judge_models helper
jgieringer Feb 3, 2026
d8919ce
improve judge parse tests
jgieringer Feb 3, 2026
6ed3d37
update judge extra param tests
jgieringer Feb 3, 2026
509b21a
update runner extra param tests
jgieringer Feb 3, 2026
5258199
updated tests for llm judge
jgieringer Feb 3, 2026
65b27c0
Merge branch 'main' into jgieringer/unit-testing
jgieringer Feb 3, 2026
01eee65
add rubric assign end asset
jgieringer Feb 3, 2026
a2fe619
repurpose judge cli tests into utils
jgieringer Feb 3, 2026
df0ef96
test overall judge script
jgieringer Feb 3, 2026
cdf27d4
add last_response_metadata to LLMInterface init
jgieringer Feb 4, 2026
2114a6f
add role to azure_llm metadata
jgieringer Feb 4, 2026
166f0e7
ensure llm clients are tested + add base tests for llm and judgellm s…
jgieringer Feb 4, 2026
cd2dfbb
cleaning errors
jgieringer Feb 4, 2026
90fe5ac
ignore abstract test class warnings
jgieringer Feb 4, 2026
9dc2da2
reduce # mock azure configs
jgieringer Feb 5, 2026
99f23a8
ensure rubric structure
jgieringer Feb 5, 2026
f5316e9
use conftest fixtures over patches
jgieringer Feb 5, 2026
7c43956
apply usefixtures at class level
jgieringer Feb 5, 2026
d40c38e
match base method signatures for the override
jgieringer Feb 5, 2026
4baaf9b
updated warnings + remove useless tests
jgieringer Feb 6, 2026
7b2c9d9
ensure judge model count validity
jgieringer Feb 6, 2026
e8f7dfd
upgrade from gpt-4 defaults
jgieringer Feb 6, 2026
2597e7d
ensure mock also uses LLM client code
jgieringer Feb 6, 2026
eb52b0d
added note about conftest
jgieringer Feb 6, 2026
f51300f
case-insensitive convo termination
jgieringer Feb 7, 2026
168c678
add helpful commentary
jgieringer Feb 7, 2026
f2db44f
clearer comment
jgieringer Feb 7, 2026
64badd8
explicitly set rubric columns to ignore
jgieringer Feb 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions generate_conversations/conversation_simulator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from typing import Any, Dict, List, Optional

from langchain_core.messages import AIMessage, HumanMessage
Expand Down Expand Up @@ -29,8 +30,8 @@ def _should_terminate_conversation(
if speaker != self.persona:
return False

# Check for exact phrase matches
if self.termination_signal in response:
# Check for exact phrase matches (case insensitive)
if re.search(re.escape(self.termination_signal), response, re.IGNORECASE):
return True

return False
Expand Down
3 changes: 3 additions & 0 deletions generate_conversations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ def load_prompts_from_csv(
if not template_path.exists():
raise FileNotFoundError(f"Template file not found: {template_path}")

if max_personas is not None and max_personas <= 0:
raise ValueError("max_personas must be > 0")

# Read template once outside the loop for efficiency
with open(template_path, "r", encoding="utf-8") as template_file:
template = template_file.read()
Expand Down
129 changes: 63 additions & 66 deletions judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,74 +11,12 @@
from judge import judge_conversations, judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig, load_conversations
from judge.utils import parse_judge_models
from utils.utils import parse_key_value_list


async def main(args) -> Optional[str]:
    """Main async entrypoint for judging conversations.

    Args:
        args: Parsed CLI namespace; reads ``judge_model``, ``conversation``,
            ``folder``, ``limit``, ``output``, ``max_concurrent``,
            ``judge_model_extra_params``, ``per_judge`` and ``verbose_workers``.

    Returns:
        The batch-evaluation output folder path, or ``None`` in
        single-conversation mode (no folder is produced for the pipeline).
    """
    # Parse judge models from args (supports "model" or "model:count" format)
    judge_models = {}
    for model_spec in args.judge_model:
        if ":" in model_spec:
            # Format: "model:count" — rsplit keeps any colons inside the model
            # name intact and only splits off the trailing count.
            model, count = model_spec.rsplit(":", 1)
            judge_models[model] = int(count)
        else:
            # Format: "model" (defaults to 1 instance)
            judge_models[model_spec] = 1

    models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items())
    print(f"🎯 LLM Judge | Models: {models_str}")

    # Load rubric configuration once at startup — it is shared by every judge
    print("📚 Loading rubric configuration...")
    rubric_config = await RubricConfig.load(rubric_folder="data")

    if args.conversation:
        # Single conversation with first judge model (single instance)
        first_model = next(iter(judge_models.keys()))

        # Load single conversation
        conversation = await ConversationData.load(args.conversation)

        # Create judge with rubric config
        judge = LLMJudge(
            judge_model=first_model,
            rubric_config=rubric_config,
            judge_model_extra_params=args.judge_model_extra_params,
        )
        await judge_single_conversation(judge, conversation, args.output)
        # Single conversation mode doesn't need output folder for pipeline
        print("ℹ️ Single conversation mode: output folder not needed for pipeline")
        return None
    else:
        # Load all conversations at startup
        print(f"📂 Loading conversations from {args.folder}...")
        conversations = await load_conversations(args.folder, limit=args.limit)
        print(f"✅ Loaded {len(conversations)} conversations")

        # Batch evaluation with multiple judges
        from pathlib import Path

        # The source folder's basename labels the output run
        folder_name = Path(args.folder).name

        _, output_folder = await judge_conversations(
            judge_models=judge_models,
            conversations=conversations,
            rubric_config=rubric_config,
            max_concurrent=args.max_concurrent,
            output_root=args.output,
            conversation_folder_name=folder_name,
            verbose=True,
            judge_model_extra_params=args.judge_model_extra_params,
            per_judge=args.per_judge,
            verbose_workers=args.verbose_workers,
        )

        return output_folder


if __name__ == "__main__":
def get_parser() -> argparse.ArgumentParser:
"""Build and return the argument parser (for CLI and testing)."""
parser = argparse.ArgumentParser(
description="Judge existing LLM conversations using rubrics"
)
Expand Down Expand Up @@ -178,7 +116,66 @@ async def main(args) -> Optional[str]:
help="Enable verbose worker logging to show concurrency behavior",
)

args = parser.parse_args()
return parser


async def main(args) -> Optional[str]:
    """Main async entrypoint for judging conversations.

    Returns the batch-evaluation output folder, or ``None`` when judging a
    single conversation (that mode produces no folder for the pipeline).
    """
    # Expand "model" / "model:count" CLI specs into a name -> instance-count map.
    judge_models = parse_judge_models(args.judge_model)

    models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items())
    print(f"🎯 LLM Judge | Models: {models_str}")

    # The rubric is shared by every judge, so load it a single time up front.
    print("📚 Loading rubric configuration...")
    rubric_config = await RubricConfig.load(rubric_folder="data")

    if args.conversation:
        # Single-conversation mode: judge one file with a single instance of
        # the first configured model, then return early.
        first_model = next(iter(judge_models.keys()))
        conversation = await ConversationData.load(args.conversation)
        judge = LLMJudge(
            judge_model=first_model,
            rubric_config=rubric_config,
            judge_model_extra_params=args.judge_model_extra_params,
        )
        await judge_single_conversation(judge, conversation, args.output)
        # Single conversation mode doesn't need output folder for pipeline
        print("ℹ️ Single conversation mode: output folder not needed for pipeline")
        return None

    # Batch mode: pre-load every conversation before fanning out to judges.
    print(f"📂 Loading conversations from {args.folder}...")
    conversations = await load_conversations(args.folder, limit=args.limit)
    print(f"✅ Loaded {len(conversations)} conversations")

    from pathlib import Path

    # The source folder's basename labels the output run.
    _, output_folder = await judge_conversations(
        judge_models=judge_models,
        conversations=conversations,
        rubric_config=rubric_config,
        max_concurrent=args.max_concurrent,
        output_root=args.output,
        conversation_folder_name=Path(args.folder).name,
        verbose=True,
        judge_model_extra_params=args.judge_model_extra_params,
        per_judge=args.per_judge,
        verbose_workers=args.verbose_workers,
    )

    return output_folder


if __name__ == "__main__":
    # CLI entrypoint: parse arguments, echo the judging target (folder in
    # batch mode, single file otherwise), and drive the async pipeline.
    args = get_parser().parse_args()
    print(f"Running judge on: {args.folder or args.conversation}")
    asyncio.run(main(args))
58 changes: 43 additions & 15 deletions judge/rubric_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,28 @@
import aiofiles
import pandas as pd

# Rubric TSV column names - single source of truth for rubric structure.
# The parsing code below indexes rows via these constants instead of string
# literals, so a header rename only needs to be reflected here.
COL_QUESTION_ID = "Question ID"
COL_DIMENSION = "Dimension"
COL_SEVERITY = "Severity"
COL_RISK_TYPE = "Risk Type"
COL_QUESTION = "Question"
COL_EXAMPLES = "Examples"
COL_ANSWER = "Answer"
COL_GOTO = "GOTO"

# Rubric TSV columns to ignore (free-form annotation columns that are not
# part of the machine-readable rubric structure)
IGNORE_COLUMNS = {"Human notes"}

# Short dimension names for display and column headers
# (maps full rubric dimension name -> compact label)
DIMENSION_SHORT_NAMES = {
    "Detects Risk": "Detects potential risk",
    "Clarifies Risk": "Confirms risk",
    "Guides to Human Support": "Guides to human care",
    "Collaborates and Validates Appropriately": "Supportive conversation",
    "Maintains Safe Boundaries": "Follows AI boundaries",
}


@dataclass
class RubricConfig:
Expand Down Expand Up @@ -120,7 +142,7 @@ def _extract_dimensions(rubric_df: pd.DataFrame) -> List[str]:
"""
dimensions = [
d.strip()
for d in rubric_df["Dimension"].dropna().unique()
for d in rubric_df[COL_DIMENSION].dropna().unique()
if d and str(d).strip() != "nan"
]
return dimensions
Expand Down Expand Up @@ -150,7 +172,7 @@ def _parse_rubric(

for idx, row in rubric_df.iterrows():
question_id_raw = (
row["Question ID"] if pd.notna(row["Question ID"]) else None
row[COL_QUESTION_ID] if pd.notna(row[COL_QUESTION_ID]) else None
)
# Convert to string and clean up (remove .0 from floats)
if question_id_raw is not None:
Expand All @@ -170,7 +192,9 @@ def _parse_rubric(

# Read severity from the question row
severity = (
str(row["Severity"]).strip() if pd.notna(row["Severity"]) else ""
str(row[COL_SEVERITY]).strip()
if pd.notna(row[COL_SEVERITY])
else ""
)
severity = (
severity if severity and severity not in ["nan", ""] else None
Expand All @@ -180,26 +204,28 @@ def _parse_rubric(
current_question_id = question_id
question_order.append(question_id)
current_question_data = {
"dimension": str(row["Dimension"]).strip()
if pd.notna(row["Dimension"])
"dimension": str(row[COL_DIMENSION]).strip()
if pd.notna(row[COL_DIMENSION])
else "",
"risk_type": str(row["Risk Type"]).strip()
if pd.notna(row["Risk Type"])
"risk_type": str(row[COL_RISK_TYPE]).strip()
if pd.notna(row[COL_RISK_TYPE])
else "",
"question": str(row["Question"]).strip()
if pd.notna(row["Question"])
"question": str(row[COL_QUESTION]).strip()
if pd.notna(row[COL_QUESTION])
else "",
"examples": str(row["Examples"]).strip()
if pd.notna(row["Examples"])
"examples": str(row[COL_EXAMPLES]).strip()
if pd.notna(row[COL_EXAMPLES])
else "",
"severity": severity,
"answers": [],
}

# Check if this row also has an answer (single-row question)
answer = str(row["Answer"]).strip() if pd.notna(row["Answer"]) else ""
answer = (
str(row[COL_ANSWER]).strip() if pd.notna(row[COL_ANSWER]) else ""
)
if answer and answer != "nan":
goto_raw = row["GOTO"] if pd.notna(row["GOTO"]) else None
goto_raw = row[COL_GOTO] if pd.notna(row[COL_GOTO]) else None
goto = (
str(int(goto_raw))
if goto_raw and isinstance(goto_raw, (int, float))
Expand All @@ -214,9 +240,11 @@ def _parse_rubric(

# This is a continuation row with an answer option
elif current_question_data is not None:
answer = str(row["Answer"]).strip() if pd.notna(row["Answer"]) else ""
answer = (
str(row[COL_ANSWER]).strip() if pd.notna(row[COL_ANSWER]) else ""
)
if answer and answer != "nan":
goto_raw = row["GOTO"] if pd.notna(row["GOTO"]) else None
goto_raw = row[COL_GOTO] if pd.notna(row[COL_GOTO]) else None
goto = (
str(int(goto_raw))
if goto_raw and isinstance(goto_raw, (int, float))
Expand Down
4 changes: 2 additions & 2 deletions judge/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ async def batch_evaluate_with_individual_judges(
Args:
conversations: List of ConversationData objects
judge_models: Dict mapping model names to number of instances
Example: {"claude-3-7-sonnet": 3, "gpt-4": 2}
Example: {"claude-3-7-sonnet": 3, "gpt-4o": 2}
output_folder: Folder to save evaluation results
rubric_config: Pre-loaded rubric configuration
max_concurrent: Maximum number of concurrent workers
Expand Down Expand Up @@ -440,7 +440,7 @@ async def judge_conversations(

Args:
judge_models: Dict mapping model names to number of instances
Example: {"claude-3-7-sonnet": 3, "gpt-4": 2}
Example: {"claude-3-7-sonnet": 3, "gpt-4o": 2}
conversations: List of pre-loaded ConversationData objects
rubric_config: Pre-loaded rubric configuration
output_root: Root folder for evaluation outputs
Expand Down
23 changes: 23 additions & 0 deletions judge/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,29 @@
import pandas as pd


def parse_judge_models(model_arg):
    """Turn CLI judge-model specs into a model-name -> instance-count dict.

    Each spec is either ``"model"`` (one instance) or ``"model:count"``.
    Raises ``ValueError`` when the count is not an integer or is < 1.
    Later specs for the same model overwrite earlier ones.
    """

    def _split_spec(spec):
        # A bare model name runs as a single instance.
        if ":" not in spec:
            return spec, 1
        # Split at the LAST colon so colons inside the model name survive.
        name, _, raw_count = spec.rpartition(":")
        try:
            parsed = int(raw_count)
        except ValueError:
            raise ValueError(
                f"Judge model count must be an integer, got {raw_count!r}"
            ) from None
        if parsed < 1:
            raise ValueError(f"Judge model count must be positive, got {parsed}")
        return name, parsed

    return {name: count for name, count in map(_split_spec, model_arg)}


def load_rubric_structure(
rubric_path: str, sep: str = "\t"
) -> Tuple[List[str], List[str]]:
Expand Down
7 changes: 4 additions & 3 deletions llm_clients/azure_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,6 @@ def __init__(
self.max_tokens = getattr(self.llm, "max_tokens", None)
self.top_p = getattr(self.llm, "top_p", None)

# Store metadata from last response
self.last_response_metadata: Dict[str, Any] = {}

async def generate_response(
self,
conversation_history: Optional[List[Dict[str, Any]]] = None,
Expand Down Expand Up @@ -175,6 +172,7 @@ async def generate_response(
else self.model_name
),
"provider": "azure",
"role": self.role.value,
"timestamp": datetime.now().isoformat(),
"response_time_seconds": round(end_time - start_time, 3),
"usage": {},
Expand Down Expand Up @@ -211,6 +209,7 @@ async def generate_response(
"response_id": None,
"model": self.model_name,
"provider": "azure",
"role": self.role.value,
"timestamp": datetime.now().isoformat(),
"error": error_msg,
"usage": {},
Expand Down Expand Up @@ -281,6 +280,7 @@ async def generate_structured_response(
"response_id": None,
"model": self.model_name,
"provider": "azure",
"role": self.role.value,
"timestamp": datetime.now().isoformat(),
"response_time_seconds": round(end_time - start_time, 3),
"usage": {},
Expand All @@ -300,6 +300,7 @@ async def generate_structured_response(
"response_id": None,
"model": self.model_name,
"provider": "azure",
"role": self.role.value,
"timestamp": datetime.now().isoformat(),
"error": str(e),
"usage": {},
Expand Down
3 changes: 0 additions & 3 deletions llm_clients/claude_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,6 @@ def __init__(
self.temperature = getattr(self.llm, "temperature", None)
self.max_tokens = getattr(self.llm, "max_tokens", None)

# Store metadata from last response
self.last_response_metadata: Dict[str, Any] = {}

async def generate_response(
self,
conversation_history: Optional[List[Dict[str, Any]]] = None,
Expand Down
Loading