From b489be7efa9d35597070b0f89834cce1147ee383 Mon Sep 17 00:00:00 2001
From: Josh Gieringer <josh.gieringer@springhealth.com>
Date: Fri, 6 Feb 2026 18:24:54 -0700
Subject: [PATCH] refactor judge to reduce hacky imports

---
 judge.py                           | 169 +----------------------------
 judge/cli.py                       | 169 +++++++++++++++++++++++++++++
 run_pipeline.py                    |  10 +-
 tests/integration/test_pipeline.py |  91 ++++++----------
 tests/unit/judge/test_judge_cli.py |  31 ++----
 5 files changed, 213 insertions(+), 257 deletions(-)
 create mode 100644 judge/cli.py

diff --git a/judge.py b/judge.py
index 44ce8771..da78ae0e 100644
--- a/judge.py
+++ b/judge.py
@@ -4,176 +4,9 @@
 This script is separate from conversation generation.
 """
 
-import argparse
 import asyncio
-from typing import Optional
-
-from judge import judge_conversations, judge_single_conversation
-from judge.llm_judge import LLMJudge
-from judge.rubric_config import ConversationData, RubricConfig, load_conversations
-from judge.utils import parse_judge_models
-from utils.utils import parse_key_value_list
-
-
-def get_parser() -> argparse.ArgumentParser:
-    """Build and return the argument parser (for CLI and testing)."""
-    parser = argparse.ArgumentParser(
-        description="Judge existing LLM conversations using rubrics"
-    )
-
-    # required source
-    source_group = parser.add_mutually_exclusive_group(required=True)
-    source_group.add_argument(
-        "--conversation", "-c", help="Path to a single conversation file to judge"
-    )
-    source_group.add_argument(
-        "--folder",
-        "-f",
-        help="Path to a conversation run folder "
-        "(e.g. conversations/p_model__a_model__t6__r1__timestamp/)",
-    )
-
-    # rubrics
-    parser.add_argument(
-        "--rubrics",
-        "-r",
-        nargs="+",
-        default=["data/rubric.tsv"],
-        help="Rubric file(s) to use (default: data/rubric.tsv)",
-    )
-
-    # model
-    parser.add_argument(
-        "--judge-model",
-        "-j",
-        nargs="+",
-        required=True,
-        help=(
-            "Model(s) to use for judging. "
-            "Format: 'model' or 'model:count' for multiple instances. "
-            "Can specify multiple models: --judge-model model1 model2:3. "
-            "Examples: claude-sonnet-4-5-20250929, "
-            "claude-sonnet-4-5-20250929:3, "
-            "claude-sonnet-4-5-20250929:2 gpt-4o:1"
-        ),
-    )
-
-    parser.add_argument(
-        "--judge-model-extra-params",
-        "-jep",
-        help=(
-            "Extra parameters for the judge model. "
-            "Examples: temperature=0.7, max_tokens=1000. "
-            "Default: temperature=0 (unless overridden)"
-        ),
-        type=parse_key_value_list,
-        default={},
-    )
-
-    # optional limit
-    parser.add_argument(
-        "--limit",
-        "-l",
-        type=int,
-        default=None,
-        help="Limit number of conversations to judge (for debugging)",
-    )
-
-    # output folder
-    parser.add_argument(
-        "--output",
-        "-o",
-        default="evaluations",
-        help="Output folder for evaluation results (default: evaluations)",
-    )
-
-    # concurrency control
-    parser.add_argument(
-        "--max-concurrent",
-        "-m",
-        type=int,
-        default=None,
-        help=(
-            "Maximum number of concurrent workers (default: None). "
-            "Set to a high number or omit for unlimited concurrency."
-        ),
-    )
-
-    parser.add_argument(
-        "--per-judge",
-        "-pj",
-        action="store_true",
-        help=(
-            "If set, --max-concurrent applies per judge model. "
-            "Otherwise, it applies to total workers across all judges."
-        ),
-    )
-
-    parser.add_argument(
-        "--verbose-workers",
-        "-vw",
-        action="store_true",
-        help="Enable verbose worker logging to show concurrency behavior",
-    )
-
-    return parser
-
-
-async def main(args) -> Optional[str]:
-    """Main async entrypoint for judging conversations."""
-    # Parse judge models from args (supports "model" or "model:count" format)
-    judge_models = parse_judge_models(args.judge_model)
-
-    models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items())
-    print(f"🎯 LLM Judge | Models: {models_str}")
-
-    # Load rubric configuration once at startup
-    print("📚 Loading rubric configuration...")
-    rubric_config = await RubricConfig.load(rubric_folder="data")
-
-    if args.conversation:
-        # Single conversation with first judge model (single instance)
-        first_model = next(iter(judge_models.keys()))
-
-        # Load single conversation
-        conversation = await ConversationData.load(args.conversation)
-
-        # Create judge with rubric config
-        judge = LLMJudge(
-            judge_model=first_model,
-            rubric_config=rubric_config,
-            judge_model_extra_params=args.judge_model_extra_params,
-        )
-        await judge_single_conversation(judge, conversation, args.output)
-        # Single conversation mode doesn't need output folder for pipeline
-        print("ℹ️  Single conversation mode: output folder not needed for pipeline")
-        return None
-    else:
-        # Load all conversations at startup
-        print(f"📂 Loading conversations from {args.folder}...")
-        conversations = await load_conversations(args.folder, limit=args.limit)
-        print(f"✅ Loaded {len(conversations)} conversations")
-
-        # Batch evaluation with multiple judges
-        from pathlib import Path
-
-        folder_name = Path(args.folder).name
-
-        _, output_folder = await judge_conversations(
-            judge_models=judge_models,
-            conversations=conversations,
-            rubric_config=rubric_config,
-            max_concurrent=args.max_concurrent,
-            output_root=args.output,
-            conversation_folder_name=folder_name,
-            verbose=True,
-            judge_model_extra_params=args.judge_model_extra_params,
-            per_judge=args.per_judge,
-            verbose_workers=args.verbose_workers,
-        )
-
-        return output_folder
 
+from judge.cli import get_parser, main
 
 if __name__ == "__main__":
     args = get_parser().parse_args()
diff --git a/judge/cli.py b/judge/cli.py
new file mode 100644
index 00000000..74770340
--- /dev/null
+++ b/judge/cli.py
@@ -0,0 +1,169 @@
+"""CLI entrypoint for the judge script: argument parser and main async entrypoint."""
+
+import argparse
+from pathlib import Path
+from typing import Optional
+
+from judge import judge_conversations, judge_single_conversation
+from judge.llm_judge import LLMJudge
+from judge.rubric_config import ConversationData, RubricConfig, load_conversations
+from judge.utils import parse_judge_models
+from utils.utils import parse_key_value_list
+
+
+def get_parser() -> argparse.ArgumentParser:
+    """Build and return the argument parser (for CLI and testing)."""
+    parser = argparse.ArgumentParser(
+        description="Judge existing LLM conversations using rubrics"
+    )
+
+    # required source
+    source_group = parser.add_mutually_exclusive_group(required=True)
+    source_group.add_argument(
+        "--conversation", "-c", help="Path to a single conversation file to judge"
+    )
+    source_group.add_argument(
+        "--folder",
+        "-f",
+        help="Path to a conversation run folder "
+        "(e.g. conversations/p_model__a_model__t6__r1__timestamp/)",
+    )
+
+    # rubrics
+    parser.add_argument(
+        "--rubrics",
+        "-r",
+        nargs="+",
+        default=["data/rubric.tsv"],
+        help="Rubric file(s) to use (default: data/rubric.tsv)",
+    )
+
+    # model
+    parser.add_argument(
+        "--judge-model",
+        "-j",
+        nargs="+",
+        required=True,
+        help=(
+            "Model(s) to use for judging. "
+            "Format: 'model' or 'model:count' for multiple instances. "
+            "Can specify multiple models: --judge-model model1 model2:3. "
+            "Examples: claude-sonnet-4-5-20250929, "
+            "claude-sonnet-4-5-20250929:3, "
+            "claude-sonnet-4-5-20250929:2 gpt-4o:1"
+        ),
+    )
+
+    parser.add_argument(
+        "--judge-model-extra-params",
+        "-jep",
+        help=(
+            "Extra parameters for the judge model. "
+            "Examples: temperature=0.7, max_tokens=1000. "
+            "Default: temperature=0 (unless overridden)"
+        ),
+        type=parse_key_value_list,
+        default={},
+    )
+
+    # optional limit
+    parser.add_argument(
+        "--limit",
+        "-l",
+        type=int,
+        default=None,
+        help="Limit number of conversations to judge (for debugging)",
+    )
+
+    # output folder
+    parser.add_argument(
+        "--output",
+        "-o",
+        default="evaluations",
+        help="Output folder for evaluation results (default: evaluations)",
+    )
+
+    # concurrency control
+    parser.add_argument(
+        "--max-concurrent",
+        "-m",
+        type=int,
+        default=None,
+        help=(
+            "Maximum number of concurrent workers (default: None). "
+            "Set to a high number or omit for unlimited concurrency."
+        ),
+    )
+
+    parser.add_argument(
+        "--per-judge",
+        "-pj",
+        action="store_true",
+        help=(
+            "If set, --max-concurrent applies per judge model. "
+            "Otherwise, it applies to total workers across all judges."
+        ),
+    )
+
+    parser.add_argument(
+        "--verbose-workers",
+        "-vw",
+        action="store_true",
+        help="Enable verbose worker logging to show concurrency behavior",
+    )
+
+    return parser
+
+
+async def main(args: argparse.Namespace) -> Optional[str]:
+    """Main async entrypoint for judging conversations."""
+    # Parse judge models from args (supports "model" or "model:count" format)
+    judge_models = parse_judge_models(args.judge_model)
+
+    models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items())
+    print(f"🎯 LLM Judge | Models: {models_str}")
+
+    # Load rubric configuration once at startup
+    print("📚 Loading rubric configuration...")
+    rubric_config = await RubricConfig.load(rubric_folder="data")
+
+    if args.conversation:
+        # Single conversation with first judge model (single instance)
+        first_model = next(iter(judge_models.keys()))
+
+        # Load single conversation
+        conversation = await ConversationData.load(args.conversation)
+
+        # Create judge with rubric config
+        judge = LLMJudge(
+            judge_model=first_model,
+            rubric_config=rubric_config,
+            judge_model_extra_params=args.judge_model_extra_params,
+        )
+        await judge_single_conversation(judge, conversation, args.output)
+        # Single conversation mode doesn't need output folder for pipeline
+        print("ℹ️  Single conversation mode: output folder not needed for pipeline")
+        return None
+    else:
+        # Load all conversations at startup
+        print(f"📂 Loading conversations from {args.folder}...")
+        conversations = await load_conversations(args.folder, limit=args.limit)
+        print(f"✅ Loaded {len(conversations)} conversations")
+
+        # Batch evaluation with multiple judges
+        folder_name = Path(args.folder).name
+
+        _, output_folder = await judge_conversations(
+            judge_models=judge_models,
+            conversations=conversations,
+            rubric_config=rubric_config,
+            max_concurrent=args.max_concurrent,
+            output_root=args.output,
+            conversation_folder_name=folder_name,
+            verbose=True,
+            judge_model_extra_params=args.judge_model_extra_params,
+            per_judge=args.per_judge,
+            verbose_workers=args.verbose_workers,
+        )
+
+        return output_folder
diff --git a/run_pipeline.py b/run_pipeline.py
index ef702ee9..21e50433 100644
--- a/run_pipeline.py
+++ b/run_pipeline.py
@@ -186,16 +186,8 @@ async def main():
 
     # Import generate and judge main functions
     # We import here to avoid circular dependencies and to allow --debug flag to be set
-    # Import judge.py main function
-    # (note: judge.py is a module file, judge/ is a package)
-    import importlib.util
-
     from generate import main as generate_main
-
-    spec = importlib.util.spec_from_file_location("judge_script", "judge.py")
-    judge_script = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(judge_script)
-    judge_main = judge_script.main
+    from judge.cli import main as judge_main
 
     # Set debug mode if flag is provided
     if args.debug:
diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py
index fdf7797b..a39b691f 100644
--- a/tests/integration/test_pipeline.py
+++ b/tests/integration/test_pipeline.py
@@ -647,15 +647,13 @@ async def mock_generate(*args, **kwargs):
         with patch("generate.main", side_effect=mock_generate):
             # Mock sys.exit to raise SystemExit instead of actually exiting
             with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit:
-                # Mock importlib to avoid judge loading (not needed for step 1 test)
-                with patch("importlib.util.spec_from_file_location"):
-                    with patch("sys.argv", valid_pipeline_args):
-                        # Pipeline should raise SystemExit when folder doesn't exist
-                        with pytest.raises(SystemExit):
-                            await pipeline_main()
+                with patch("sys.argv", valid_pipeline_args):
+                    # Pipeline should raise SystemExit when folder doesn't exist
+                    with pytest.raises(SystemExit):
+                        await pipeline_main()
 
-                        # Verify sys.exit(1) was called
-                        mock_exit.assert_called_once_with(1)
+                # Verify sys.exit(1) was called
+                mock_exit.assert_called_once_with(1)
 
     @pytest.mark.asyncio
     async def test_step1_validation_no_conversation_files(
@@ -678,14 +676,13 @@ async def mock_generate(*args, **kwargs):
         with patch("generate.main", side_effect=mock_generate):
             # Mock sys.exit to raise SystemExit instead of actually exiting
             with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit:
-                with patch("importlib.util.spec_from_file_location"):
-                    with patch("sys.argv", valid_pipeline_args):
-                        # Pipeline should raise SystemExit
-                        with pytest.raises(SystemExit):
-                            await pipeline_main()
+                with patch("sys.argv", valid_pipeline_args):
+                    # Pipeline should raise SystemExit
+                    with pytest.raises(SystemExit):
+                        await pipeline_main()
 
-                        # Verify sys.exit(1) was called
-                        mock_exit.assert_called_once_with(1)
+                    # Verify sys.exit(1) was called
+                    mock_exit.assert_called_once_with(1)
 
     @pytest.mark.asyncio
     async def test_step1_validation_only_log_files(self, tmp_path, valid_pipeline_args):
@@ -708,14 +705,13 @@ async def mock_generate(*args, **kwargs):
         with patch("generate.main", side_effect=mock_generate):
             # Mock sys.exit to raise SystemExit instead of actually exiting
             with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit:
-                with patch("importlib.util.spec_from_file_location"):
-                    with patch("sys.argv", valid_pipeline_args):
-                        # Pipeline should raise SystemExit
-                        with pytest.raises(SystemExit):
-                            await pipeline_main()
+                with patch("sys.argv", valid_pipeline_args):
+                    # Pipeline should raise SystemExit
+                    with pytest.raises(SystemExit):
+                        await pipeline_main()
 
-                        # Verify sys.exit(1) was called
-                        mock_exit.assert_called_once_with(1)
+                    # Verify sys.exit(1) was called
+                    mock_exit.assert_called_once_with(1)
 
     @pytest.mark.asyncio
     async def test_step2_validation_no_evaluation_folder(
@@ -723,7 +719,7 @@ async def test_step2_validation_no_evaluation_folder(
     ):
         """Test that pipeline exits if Step 2 returns None."""
         import sys
-        from unittest.mock import MagicMock, patch
+        from unittest.mock import patch
 
         from run_pipeline import main as pipeline_main
 
@@ -736,18 +732,13 @@ async def test_step2_validation_no_evaluation_folder(
         async def mock_generate(*args, **kwargs):
             return None, str(conv_folder)
 
-        # Mock judge_main to return None
+        # Mock judge CLI main to return None
         async def mock_judge(args):
             return None
 
-        # Create a mock module with the mock judge main function
-        mock_judge_module = MagicMock()
-        mock_judge_module.main = mock_judge
-
         with (
             patch("generate.main", side_effect=mock_generate),
-            patch("importlib.util.module_from_spec", return_value=mock_judge_module),
-            patch("importlib.util.spec_from_file_location"),
+            patch("judge.cli.main", side_effect=mock_judge),
         ):
             # Mock sys.exit to raise SystemExit instead of actually exiting
             with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit:
@@ -765,7 +756,7 @@ async def test_step2_validation_folder_not_exists(
     ):
         """Test that pipeline exits if Step 2 folder doesn't exist."""
         import sys
-        from unittest.mock import MagicMock, patch
+        from unittest.mock import patch
 
         from run_pipeline import main as pipeline_main
 
@@ -778,18 +769,13 @@ async def test_step2_validation_folder_not_exists(
         async def mock_generate(*args, **kwargs):
             return None, str(conv_folder)
 
-        # Mock judge_main to return non-existent folder
+        # Mock judge CLI main to return non-existent folder
         async def mock_judge(args):
             return str(tmp_path / "nonexistent_eval")
 
-        # Create a mock module with the mock judge main function
-        mock_judge_module = MagicMock()
-        mock_judge_module.main = mock_judge
-
         with (
             patch("generate.main", side_effect=mock_generate),
-            patch("importlib.util.module_from_spec", return_value=mock_judge_module),
-            patch("importlib.util.spec_from_file_location"),
+            patch("judge.cli.main", side_effect=mock_judge),
         ):
             # Mock sys.exit to raise SystemExit instead of actually exiting
             with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit:
@@ -805,7 +791,7 @@ async def mock_judge(args):
     async def test_step2_validation_no_results_csv(self, tmp_path, valid_pipeline_args):
         """Test that pipeline exits if Step 2 produces no results.csv."""
         import sys
-        from unittest.mock import MagicMock, patch
+        from unittest.mock import patch
 
         from run_pipeline import main as pipeline_main
 
@@ -823,18 +809,13 @@ async def test_step2_validation_no_results_csv(self, tmp_path, valid_pipeline_ar
         async def mock_generate(*args, **kwargs):
             return None, str(conv_folder)
 
-        # Mock judge_main to return folder without results.csv
+        # Mock judge CLI main to return folder without results.csv
         async def mock_judge(args):
             return str(eval_folder)
 
-        # Create a mock module with the mock judge main function
-        mock_judge_module = MagicMock()
-        mock_judge_module.main = mock_judge
-
         with (
             patch("generate.main", side_effect=mock_generate),
-            patch("importlib.util.module_from_spec", return_value=mock_judge_module),
-            patch("importlib.util.spec_from_file_location"),
+            patch("judge.cli.main", side_effect=mock_judge),
         ):
             # Mock sys.exit to raise SystemExit instead of actually exiting
             with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit:
@@ -852,7 +833,7 @@ async def test_step2_validation_empty_folder_error_message(
     ):
         """Test that error message lists files when folder is not empty."""
         import sys
-        from unittest.mock import MagicMock, patch
+        from unittest.mock import patch
 
         from run_pipeline import main as pipeline_main
 
@@ -875,14 +856,9 @@ async def mock_generate(*args, **kwargs):
         async def mock_judge(args):
             return str(eval_folder)
 
-        # Create a mock module with the mock judge main function
-        mock_judge_module = MagicMock()
-        mock_judge_module.main = mock_judge
-
         with (
             patch("generate.main", side_effect=mock_generate),
-            patch("importlib.util.module_from_spec", return_value=mock_judge_module),
-            patch("importlib.util.spec_from_file_location"),
+            patch("judge.cli.main", side_effect=mock_judge),
         ):
             # Mock sys.exit to raise SystemExit instead of actually exiting
             with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit:
@@ -906,7 +882,7 @@ async def test_validation_success_messages(
         self, tmp_path, valid_pipeline_args, capsys
     ):
         """Test that validation success messages are displayed."""
-        from unittest.mock import MagicMock, patch
+        from unittest.mock import patch
 
         from run_pipeline import main as pipeline_main
 
@@ -933,14 +909,9 @@ async def mock_judge(args):
         def mock_score(*args, **kwargs):
             return {}
 
-        # Create a mock module with the mock judge main function
-        mock_judge_module = MagicMock()
-        mock_judge_module.main = mock_judge
-
         with (
             patch("generate.main", side_effect=mock_generate),
-            patch("importlib.util.module_from_spec", return_value=mock_judge_module),
-            patch("importlib.util.spec_from_file_location"),
+            patch("judge.cli.main", side_effect=mock_judge),
             patch("run_pipeline.score_results", new=mock_score),
             patch("run_pipeline.print_scores"),
             patch("run_pipeline.create_visualizations"),
diff --git a/tests/unit/judge/test_judge_cli.py b/tests/unit/judge/test_judge_cli.py
index cc3723ff..7c7c9887 100644
--- a/tests/unit/judge/test_judge_cli.py
+++ b/tests/unit/judge/test_judge_cli.py
@@ -1,25 +1,16 @@
-"""Unit tests for judge.py CLI and main entrypoint."""
+"""Unit tests for judge CLI and main entrypoint (judge.cli)."""
 
-import importlib.util
-from pathlib import Path
 from unittest.mock import AsyncMock, patch
 
 import pytest
 
-# Load judge.py script (project root) so we can test get_parser and main
-_PROJECT_ROOT = Path(__file__).resolve().parents[3]
-_JUDGE_SCRIPT = _PROJECT_ROOT / "judge.py"
-_spec = importlib.util.spec_from_file_location("judge_script", _JUDGE_SCRIPT)
-assert _spec is not None and _spec.loader is not None
-_judge_script = importlib.util.module_from_spec(_spec)
-_spec.loader.exec_module(_judge_script)
-get_parser = _judge_script.get_parser
-main = _judge_script.main
+from judge import cli as judge_cli
+from judge.cli import get_parser, main
 
 
 @pytest.mark.unit
 class TestJudgeParser:
-    """Test judge.py argument parser (get_parser())."""
+    """Test judge CLI argument parser (get_parser())."""
 
     def test_requires_conversation_or_folder(self):
         """Parser requires exactly one of --conversation or --folder."""
@@ -132,11 +123,11 @@ async def test_main_single_conversation_calls_judge_single(self):
             ]
         )
         with (
-            patch.object(_judge_script, "RubricConfig") as RubricConfig,
-            patch.object(_judge_script, "ConversationData") as ConversationData,
-            patch.object(_judge_script, "LLMJudge") as LLMJudge,
+            patch.object(judge_cli, "RubricConfig") as RubricConfig,
+            patch.object(judge_cli, "ConversationData") as ConversationData,
+            patch.object(judge_cli, "LLMJudge") as LLMJudge,
             patch.object(
-                _judge_script,
+                judge_cli,
                 "judge_single_conversation",
                 new_callable=AsyncMock,
             ) as judge_single,
@@ -180,14 +171,14 @@ async def test_main_folder_calls_judge_conversations(self):
             ]
         )
         with (
-            patch.object(_judge_script, "RubricConfig") as RubricConfig,
+            patch.object(judge_cli, "RubricConfig") as RubricConfig,
             patch.object(
-                _judge_script,
+                judge_cli,
                 "load_conversations",
                 new_callable=AsyncMock,
             ) as load_convos,
             patch.object(
-                _judge_script,
+                judge_cli,
                 "judge_conversations",
                 new_callable=AsyncMock,
             ) as judge_convos,