Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
c4c9b71
clean up convo sim tests
jgieringer Feb 2, 2026
478f9bf
update generation util tests
jgieringer Feb 2, 2026
315b152
parse_judge_models helper
jgieringer Feb 3, 2026
d8919ce
improve judge parse tests
jgieringer Feb 3, 2026
6ed3d37
update judge extra param tests
jgieringer Feb 3, 2026
509b21a
update runner extra param tests
jgieringer Feb 3, 2026
5258199
updated tests for llm judge
jgieringer Feb 3, 2026
65b27c0
Merge branch 'main' into jgieringer/unit-testing
jgieringer Feb 3, 2026
01eee65
add rubric assign end asset
jgieringer Feb 3, 2026
a2fe619
repurpose judge cli tests into utils
jgieringer Feb 3, 2026
df0ef96
test overall judge script
jgieringer Feb 3, 2026
cdf27d4
add last_response_metadata to LLMInterface init
jgieringer Feb 4, 2026
2114a6f
add role to azure_llm metadata
jgieringer Feb 4, 2026
166f0e7
ensure llm clients are tested + add base tests for llm and judgellm s…
jgieringer Feb 4, 2026
cd2dfbb
cleaning errors
jgieringer Feb 4, 2026
90fe5ac
ignore abstract test class warnings
jgieringer Feb 4, 2026
9dc2da2
reduce # mock azure configs
jgieringer Feb 5, 2026
99f23a8
ensure rubric structure
jgieringer Feb 5, 2026
f5316e9
use conftest fixtures over patches
jgieringer Feb 5, 2026
7c43956
apply usefixtures at class level
jgieringer Feb 5, 2026
d40c38e
match base method signatures for the override
jgieringer Feb 5, 2026
4baaf9b
updated warnings + remove useless tests
jgieringer Feb 6, 2026
7b2c9d9
ensure judge model count validity
jgieringer Feb 6, 2026
e8f7dfd
upgrade from gpt-4 defaults
jgieringer Feb 6, 2026
2597e7d
ensure mock also uses LLM client code
jgieringer Feb 6, 2026
eb52b0d
added note about conftest
jgieringer Feb 6, 2026
f51300f
case-insensitive convo termination
jgieringer Feb 7, 2026
168c678
add helpful commentary
jgieringer Feb 7, 2026
f2db44f
clearer comment
jgieringer Feb 7, 2026
64badd8
explicitly set rubric columns to ignore
jgieringer Feb 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions generate_conversations/conversation_simulator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from typing import Any, Dict, List, Optional

from langchain_core.messages import AIMessage, HumanMessage
Expand Down Expand Up @@ -29,8 +30,8 @@ def _should_terminate_conversation(
if speaker != self.persona:
return False

# Check for exact phrase matches
if self.termination_signal in response:
# Check for exact phrase matches (case insensitive)
if re.search(re.escape(self.termination_signal), response, re.IGNORECASE):
return True

return False
Expand Down
3 changes: 3 additions & 0 deletions generate_conversations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ def load_prompts_from_csv(
if not template_path.exists():
raise FileNotFoundError(f"Template file not found: {template_path}")

if max_personas is not None and max_personas <= 0:
raise ValueError("max_personas must be > 0")

# Read template once outside the loop for efficiency
with open(template_path, "r", encoding="utf-8") as template_file:
template = template_file.read()
Expand Down
129 changes: 63 additions & 66 deletions judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,74 +11,12 @@
from judge import judge_conversations, judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig, load_conversations
from judge.utils import parse_judge_models
from utils.utils import parse_key_value_list


async def main(args) -> Optional[str]:
    """Main async entrypoint for judging conversations.

    Args:
        args: Parsed CLI namespace; reads ``judge_model``, ``conversation``,
            ``folder``, ``limit``, ``output``, ``max_concurrent``,
            ``judge_model_extra_params``, ``per_judge`` and ``verbose_workers``.

    Returns:
        The batch-evaluation output folder path, or ``None`` in
        single-conversation mode (no folder is produced for the pipeline).
    """
    # Parse judge models from args (supports "model" or "model:count" format)
    judge_models = {}
    for model_spec in args.judge_model:
        if ":" in model_spec:
            # Format: "model:count" — rsplit keeps any colons inside the model
            # name intact and only splits off the trailing count.
            model, count = model_spec.rsplit(":", 1)
            judge_models[model] = int(count)
        else:
            # Format: "model" (defaults to 1 instance)
            judge_models[model_spec] = 1

    models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items())
    print(f"🎯 LLM Judge | Models: {models_str}")

    # Load rubric configuration once at startup — it is shared by every judge
    print("📚 Loading rubric configuration...")
    rubric_config = await RubricConfig.load(rubric_folder="data")

    if args.conversation:
        # Single conversation with first judge model (single instance)
        first_model = next(iter(judge_models.keys()))

        # Load single conversation
        conversation = await ConversationData.load(args.conversation)

        # Create judge with rubric config
        judge = LLMJudge(
            judge_model=first_model,
            rubric_config=rubric_config,
            judge_model_extra_params=args.judge_model_extra_params,
        )
        await judge_single_conversation(judge, conversation, args.output)
        # Single conversation mode doesn't need output folder for pipeline
        print("ℹ️ Single conversation mode: output folder not needed for pipeline")
        return None
    else:
        # Load all conversations at startup
        print(f"📂 Loading conversations from {args.folder}...")
        conversations = await load_conversations(args.folder, limit=args.limit)
        print(f"✅ Loaded {len(conversations)} conversations")

        # Batch evaluation with multiple judges
        from pathlib import Path

        # The source folder's basename labels the output run
        folder_name = Path(args.folder).name

        _, output_folder = await judge_conversations(
            judge_models=judge_models,
            conversations=conversations,
            rubric_config=rubric_config,
            max_concurrent=args.max_concurrent,
            output_root=args.output,
            conversation_folder_name=folder_name,
            verbose=True,
            judge_model_extra_params=args.judge_model_extra_params,
            per_judge=args.per_judge,
            verbose_workers=args.verbose_workers,
        )

        return output_folder


if __name__ == "__main__":
def get_parser() -> argparse.ArgumentParser:
"""Build and return the argument parser (for CLI and testing)."""
parser = argparse.ArgumentParser(
description="Judge existing LLM conversations using rubrics"
)
Expand Down Expand Up @@ -178,7 +116,66 @@ async def main(args) -> Optional[str]:
help="Enable verbose worker logging to show concurrency behavior",
)

args = parser.parse_args()
return parser


async def main(args) -> Optional[str]:
    """Main async entrypoint for judging conversations.

    Returns the batch-evaluation output folder, or ``None`` when judging a
    single conversation (that mode produces no folder for the pipeline).
    """
    # Expand "model" / "model:count" CLI specs into a name -> instance-count map.
    judge_models = parse_judge_models(args.judge_model)

    models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items())
    print(f"🎯 LLM Judge | Models: {models_str}")

    # The rubric is shared by every judge, so load it a single time up front.
    print("📚 Loading rubric configuration...")
    rubric_config = await RubricConfig.load(rubric_folder="data")

    if args.conversation:
        # Single-conversation mode: judge one file with a single instance of
        # the first configured model, then return early.
        first_model = next(iter(judge_models.keys()))
        conversation = await ConversationData.load(args.conversation)
        judge = LLMJudge(
            judge_model=first_model,
            rubric_config=rubric_config,
            judge_model_extra_params=args.judge_model_extra_params,
        )
        await judge_single_conversation(judge, conversation, args.output)
        # Single conversation mode doesn't need output folder for pipeline
        print("ℹ️ Single conversation mode: output folder not needed for pipeline")
        return None

    # Batch mode: pre-load every conversation before fanning out to judges.
    print(f"📂 Loading conversations from {args.folder}...")
    conversations = await load_conversations(args.folder, limit=args.limit)
    print(f"✅ Loaded {len(conversations)} conversations")

    from pathlib import Path

    # The source folder's basename labels the output run.
    _, output_folder = await judge_conversations(
        judge_models=judge_models,
        conversations=conversations,
        rubric_config=rubric_config,
        max_concurrent=args.max_concurrent,
        output_root=args.output,
        conversation_folder_name=Path(args.folder).name,
        verbose=True,
        judge_model_extra_params=args.judge_model_extra_params,
        per_judge=args.per_judge,
        verbose_workers=args.verbose_workers,
    )

    return output_folder


if __name__ == "__main__":
    # CLI entrypoint: parse arguments, echo the judging target (folder in
    # batch mode, single file otherwise), and drive the async pipeline.
    args = get_parser().parse_args()
    print(f"Running judge on: {args.folder or args.conversation}")
    asyncio.run(main(args))
58 changes: 43 additions & 15 deletions judge/rubric_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,28 @@
import aiofiles
import pandas as pd

# Rubric TSV column names - single source of truth for rubric structure.
# The parsing code below indexes rows via these constants instead of string
# literals, so a header rename only needs to be reflected here.
COL_QUESTION_ID = "Question ID"
COL_DIMENSION = "Dimension"
COL_SEVERITY = "Severity"
COL_RISK_TYPE = "Risk Type"
COL_QUESTION = "Question"
COL_EXAMPLES = "Examples"
COL_ANSWER = "Answer"
COL_GOTO = "GOTO"

# Rubric TSV columns to ignore (free-form annotation columns that are not
# part of the machine-readable rubric structure)
IGNORE_COLUMNS = {"Human notes"}

# Short dimension names for display and column headers
# (maps full rubric dimension name -> compact label)
DIMENSION_SHORT_NAMES = {
    "Detects Risk": "Detects potential risk",
    "Clarifies Risk": "Confirms risk",
    "Guides to Human Support": "Guides to human care",
    "Collaborates and Validates Appropriately": "Supportive conversation",
    "Maintains Safe Boundaries": "Follows AI boundaries",
}


@dataclass
class RubricConfig:
Expand Down Expand Up @@ -120,7 +142,7 @@ def _extract_dimensions(rubric_df: pd.DataFrame) -> List[str]:
"""
dimensions = [
d.strip()
for d in rubric_df["Dimension"].dropna().unique()
for d in rubric_df[COL_DIMENSION].dropna().unique()
if d and str(d).strip() != "nan"
]
return dimensions
Expand Down Expand Up @@ -150,7 +172,7 @@ def _parse_rubric(

for idx, row in rubric_df.iterrows():
question_id_raw = (
row["Question ID"] if pd.notna(row["Question ID"]) else None
row[COL_QUESTION_ID] if pd.notna(row[COL_QUESTION_ID]) else None
)
# Convert to string and clean up (remove .0 from floats)
if question_id_raw is not None:
Expand All @@ -170,7 +192,9 @@ def _parse_rubric(

# Read severity from the question row
severity = (
str(row["Severity"]).strip() if pd.notna(row["Severity"]) else ""
str(row[COL_SEVERITY]).strip()
if pd.notna(row[COL_SEVERITY])
else ""
)
severity = (
severity if severity and severity not in ["nan", ""] else None
Expand All @@ -180,26 +204,28 @@ def _parse_rubric(
current_question_id = question_id
question_order.append(question_id)
current_question_data = {
"dimension": str(row["Dimension"]).strip()
if pd.notna(row["Dimension"])
"dimension": str(row[COL_DIMENSION]).strip()
if pd.notna(row[COL_DIMENSION])
else "",
"risk_type": str(row["Risk Type"]).strip()
if pd.notna(row["Risk Type"])
"risk_type": str(row[COL_RISK_TYPE]).strip()
if pd.notna(row[COL_RISK_TYPE])
else "",
"question": str(row["Question"]).strip()
if pd.notna(row["Question"])
"question": str(row[COL_QUESTION]).strip()
if pd.notna(row[COL_QUESTION])
else "",
"examples": str(row["Examples"]).strip()
if pd.notna(row["Examples"])
"examples": str(row[COL_EXAMPLES]).strip()
if pd.notna(row[COL_EXAMPLES])
else "",
"severity": severity,
"answers": [],
}

# Check if this row also has an answer (single-row question)
answer = str(row["Answer"]).strip() if pd.notna(row["Answer"]) else ""
answer = (
str(row[COL_ANSWER]).strip() if pd.notna(row[COL_ANSWER]) else ""
)
if answer and answer != "nan":
goto_raw = row["GOTO"] if pd.notna(row["GOTO"]) else None
goto_raw = row[COL_GOTO] if pd.notna(row[COL_GOTO]) else None
goto = (
str(int(goto_raw))
if goto_raw and isinstance(goto_raw, (int, float))
Expand All @@ -214,9 +240,11 @@ def _parse_rubric(

# This is a continuation row with an answer option
elif current_question_data is not None:
answer = str(row["Answer"]).strip() if pd.notna(row["Answer"]) else ""
answer = (
str(row[COL_ANSWER]).strip() if pd.notna(row[COL_ANSWER]) else ""
)
if answer and answer != "nan":
goto_raw = row["GOTO"] if pd.notna(row["GOTO"]) else None
goto_raw = row[COL_GOTO] if pd.notna(row[COL_GOTO]) else None
goto = (
str(int(goto_raw))
if goto_raw and isinstance(goto_raw, (int, float))
Expand Down
4 changes: 2 additions & 2 deletions judge/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ async def batch_evaluate_with_individual_judges(
Args:
conversations: List of ConversationData objects
judge_models: Dict mapping model names to number of instances
Example: {"claude-3-7-sonnet": 3, "gpt-4": 2}
Example: {"claude-3-7-sonnet": 3, "gpt-4o": 2}
output_folder: Folder to save evaluation results
rubric_config: Pre-loaded rubric configuration
max_concurrent: Maximum number of concurrent workers
Expand Down Expand Up @@ -440,7 +440,7 @@ async def judge_conversations(

Args:
judge_models: Dict mapping model names to number of instances
Example: {"claude-3-7-sonnet": 3, "gpt-4": 2}
Example: {"claude-3-7-sonnet": 3, "gpt-4o": 2}
conversations: List of pre-loaded ConversationData objects
rubric_config: Pre-loaded rubric configuration
output_root: Root folder for evaluation outputs
Expand Down
23 changes: 23 additions & 0 deletions judge/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,29 @@
import pandas as pd


def parse_judge_models(model_arg):
    """Turn CLI judge-model specs into a model-name -> instance-count dict.

    Each spec is either ``"model"`` (one instance) or ``"model:count"``.
    Raises ``ValueError`` when the count is not an integer or is < 1.
    Later specs for the same model overwrite earlier ones.
    """

    def _split_spec(spec):
        # A bare model name runs as a single instance.
        if ":" not in spec:
            return spec, 1
        # Split at the LAST colon so colons inside the model name survive.
        name, _, raw_count = spec.rpartition(":")
        try:
            parsed = int(raw_count)
        except ValueError:
            raise ValueError(
                f"Judge model count must be an integer, got {raw_count!r}"
            ) from None
        if parsed < 1:
            raise ValueError(f"Judge model count must be positive, got {parsed}")
        return name, parsed

    return {name: count for name, count in map(_split_spec, model_arg)}


def load_rubric_structure(
rubric_path: str, sep: str = "\t"
) -> Tuple[List[str], List[str]]:
Expand Down
7 changes: 4 additions & 3 deletions llm_clients/azure_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,6 @@ def __init__(
self.max_tokens = getattr(self.llm, "max_tokens", None)
self.top_p = getattr(self.llm, "top_p", None)

# Store metadata from last response
self.last_response_metadata: Dict[str, Any] = {}

async def generate_response(
self,
conversation_history: Optional[List[Dict[str, Any]]] = None,
Expand Down Expand Up @@ -175,6 +172,7 @@ async def generate_response(
else self.model_name
),
"provider": "azure",
"role": self.role.value,
"timestamp": datetime.now().isoformat(),
"response_time_seconds": round(end_time - start_time, 3),
"usage": {},
Expand Down Expand Up @@ -211,6 +209,7 @@ async def generate_response(
"response_id": None,
"model": self.model_name,
"provider": "azure",
"role": self.role.value,
"timestamp": datetime.now().isoformat(),
"error": error_msg,
"usage": {},
Expand Down Expand Up @@ -281,6 +280,7 @@ async def generate_structured_response(
"response_id": None,
"model": self.model_name,
"provider": "azure",
"role": self.role.value,
"timestamp": datetime.now().isoformat(),
"response_time_seconds": round(end_time - start_time, 3),
"usage": {},
Expand All @@ -300,6 +300,7 @@ async def generate_structured_response(
"response_id": None,
"model": self.model_name,
"provider": "azure",
"role": self.role.value,
"timestamp": datetime.now().isoformat(),
"error": str(e),
"usage": {},
Expand Down
3 changes: 0 additions & 3 deletions llm_clients/claude_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,6 @@ def __init__(
self.temperature = getattr(self.llm, "temperature", None)
self.max_tokens = getattr(self.llm, "max_tokens", None)

# Store metadata from last response
self.last_response_metadata: Dict[str, Any] = {}

async def generate_response(
self,
conversation_history: Optional[List[Dict[str, Any]]] = None,
Expand Down
Loading