From b489be7efa9d35597070b0f89834cce1147ee383 Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Fri, 6 Feb 2026 18:24:54 -0700 Subject: [PATCH] refactor judge to reduce hacky imports --- judge.py | 169 +---------------------------- judge/cli.py | 169 +++++++++++++++++++++++++++++ run_pipeline.py | 10 +- tests/integration/test_pipeline.py | 91 ++++++---------- tests/unit/judge/test_judge_cli.py | 31 ++---- 5 files changed, 213 insertions(+), 257 deletions(-) create mode 100644 judge/cli.py diff --git a/judge.py b/judge.py index 44ce8771..da78ae0e 100644 --- a/judge.py +++ b/judge.py @@ -4,176 +4,9 @@ This script is separate from conversation generation. """ -import argparse import asyncio -from typing import Optional - -from judge import judge_conversations, judge_single_conversation -from judge.llm_judge import LLMJudge -from judge.rubric_config import ConversationData, RubricConfig, load_conversations -from judge.utils import parse_judge_models -from utils.utils import parse_key_value_list - - -def get_parser() -> argparse.ArgumentParser: - """Build and return the argument parser (for CLI and testing).""" - parser = argparse.ArgumentParser( - description="Judge existing LLM conversations using rubrics" - ) - - # required source - source_group = parser.add_mutually_exclusive_group(required=True) - source_group.add_argument( - "--conversation", "-c", help="Path to a single conversation file to judge" - ) - source_group.add_argument( - "--folder", - "-f", - help="Path to a conversation run folder " - "(e.g. conversations/p_model__a_model__t6__r1__timestamp/)", - ) - - # rubrics - parser.add_argument( - "--rubrics", - "-r", - nargs="+", - default=["data/rubric.tsv"], - help="Rubric file(s) to use (default: data/rubric.tsv)", - ) - - # model - parser.add_argument( - "--judge-model", - "-j", - nargs="+", - required=True, - help=( - "Model(s) to use for judging. " - "Format: 'model' or 'model:count' for multiple instances. " - "Can specify multiple models: --judge-model model1 model2:3. " - "Examples: claude-sonnet-4-5-20250929, " - "claude-sonnet-4-5-20250929:3, " - "claude-sonnet-4-5-20250929:2 gpt-4o:1" - ), - ) - - parser.add_argument( - "--judge-model-extra-params", - "-jep", - help=( - "Extra parameters for the judge model. " - "Examples: temperature=0.7, max_tokens=1000. " - "Default: temperature=0 (unless overridden)" - ), - type=parse_key_value_list, - default={}, - ) - - # optional limit - parser.add_argument( - "--limit", - "-l", - type=int, - default=None, - help="Limit number of conversations to judge (for debugging)", - ) - - # output folder - parser.add_argument( - "--output", - "-o", - default="evaluations", - help="Output folder for evaluation results (default: evaluations)", - ) - - # concurrency control - parser.add_argument( - "--max-concurrent", - "-m", - type=int, - default=None, - help=( - "Maximum number of concurrent workers (default: None). " - "Set to a high number or omit for unlimited concurrency." - ), - ) - - parser.add_argument( - "--per-judge", - "-pj", - action="store_true", - help=( - "If set, --max-concurrent applies per judge model. " - "Otherwise, it applies to total workers across all judges." - ), - ) - - parser.add_argument( - "--verbose-workers", - "-vw", - action="store_true", - help="Enable verbose worker logging to show concurrency behavior", - ) - - return parser - - -async def main(args) -> Optional[str]: - """Main async entrypoint for judging conversations.""" - # Parse judge models from args (supports "model" or "model:count" format) - judge_models = parse_judge_models(args.judge_model) - - models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items()) - print(f"đŸŽ¯ LLM Judge | Models: {models_str}") - - # Load rubric configuration once at startup - print("📚 Loading rubric configuration...") - rubric_config = await RubricConfig.load(rubric_folder="data") - - if args.conversation: - # Single conversation with first judge model (single instance) - first_model = next(iter(judge_models.keys())) - - # Load single conversation - conversation = await ConversationData.load(args.conversation) - - # Create judge with rubric config - judge = LLMJudge( - judge_model=first_model, - rubric_config=rubric_config, - judge_model_extra_params=args.judge_model_extra_params, - ) - await judge_single_conversation(judge, conversation, args.output) - # Single conversation mode doesn't need output folder for pipeline - print("â„šī¸ Single conversation mode: output folder not needed for pipeline") - return None - else: - # Load all conversations at startup - print(f"📂 Loading conversations from {args.folder}...") - conversations = await load_conversations(args.folder, limit=args.limit) - print(f"✅ Loaded {len(conversations)} conversations") - - # Batch evaluation with multiple judges - from pathlib import Path - - folder_name = Path(args.folder).name - - _, output_folder = await judge_conversations( - judge_models=judge_models, - conversations=conversations, - rubric_config=rubric_config, - max_concurrent=args.max_concurrent, - output_root=args.output, - conversation_folder_name=folder_name, - verbose=True, - judge_model_extra_params=args.judge_model_extra_params, - per_judge=args.per_judge, - verbose_workers=args.verbose_workers, - ) - - return output_folder +from judge.cli import get_parser, main if __name__ == "__main__": args = get_parser().parse_args() diff --git a/judge/cli.py b/judge/cli.py new file mode 100644 index 00000000..74770340 --- /dev/null +++ b/judge/cli.py @@ -0,0 +1,169 @@ +"""CLI entrypoint for the judge script: argument parser and main async entrypoint.""" + +import argparse +from pathlib import Path +from typing import Optional + +from judge import judge_conversations, judge_single_conversation +from judge.llm_judge import LLMJudge +from judge.rubric_config import ConversationData, RubricConfig, load_conversations +from judge.utils import parse_judge_models +from utils.utils import parse_key_value_list + + +def get_parser() -> argparse.ArgumentParser: + """Build and return the argument parser (for CLI and testing).""" + parser = argparse.ArgumentParser( + description="Judge existing LLM conversations using rubrics" + ) + + # required source + source_group = parser.add_mutually_exclusive_group(required=True) + source_group.add_argument( + "--conversation", "-c", help="Path to a single conversation file to judge" + ) + source_group.add_argument( + "--folder", + "-f", + help="Path to a conversation run folder " + "(e.g. conversations/p_model__a_model__t6__r1__timestamp/)", + ) + + # rubrics + parser.add_argument( + "--rubrics", + "-r", + nargs="+", + default=["data/rubric.tsv"], + help="Rubric file(s) to use (default: data/rubric.tsv)", + ) + + # model + parser.add_argument( + "--judge-model", + "-j", + nargs="+", + required=True, + help=( + "Model(s) to use for judging. " + "Format: 'model' or 'model:count' for multiple instances. " + "Can specify multiple models: --judge-model model1 model2:3. " + "Examples: claude-sonnet-4-5-20250929, " + "claude-sonnet-4-5-20250929:3, " + "claude-sonnet-4-5-20250929:2 gpt-4o:1" + ), + ) + + parser.add_argument( + "--judge-model-extra-params", + "-jep", + help=( + "Extra parameters for the judge model. " + "Examples: temperature=0.7, max_tokens=1000. " + "Default: temperature=0 (unless overridden)" + ), + type=parse_key_value_list, + default={}, + ) + + # optional limit + parser.add_argument( + "--limit", + "-l", + type=int, + default=None, + help="Limit number of conversations to judge (for debugging)", + ) + + # output folder + parser.add_argument( + "--output", + "-o", + default="evaluations", + help="Output folder for evaluation results (default: evaluations)", + ) + + # concurrency control + parser.add_argument( + "--max-concurrent", + "-m", + type=int, + default=None, + help=( + "Maximum number of concurrent workers (default: None). " + "Set to a high number or omit for unlimited concurrency." + ), + ) + + parser.add_argument( + "--per-judge", + "-pj", + action="store_true", + help=( + "If set, --max-concurrent applies per judge model. " + "Otherwise, it applies to total workers across all judges." + ), + ) + + parser.add_argument( + "--verbose-workers", + "-vw", + action="store_true", + help="Enable verbose worker logging to show concurrency behavior", + ) + + return parser + + +async def main(args: argparse.Namespace) -> Optional[str]: + """Main async entrypoint for judging conversations.""" + # Parse judge models from args (supports "model" or "model:count" format) + judge_models = parse_judge_models(args.judge_model) + + models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items()) + print(f"đŸŽ¯ LLM Judge | Models: {models_str}") + + # Load rubric configuration once at startup + print("📚 Loading rubric configuration...") + rubric_config = await RubricConfig.load(rubric_folder="data") + + if args.conversation: + # Single conversation with first judge model (single instance) + first_model = next(iter(judge_models.keys())) + + # Load single conversation + conversation = await ConversationData.load(args.conversation) + + # Create judge with rubric config + judge = LLMJudge( + judge_model=first_model, + rubric_config=rubric_config, + judge_model_extra_params=args.judge_model_extra_params, + ) + await judge_single_conversation(judge, conversation, args.output) + # Single conversation mode doesn't need output folder for pipeline + print("â„šī¸ Single conversation mode: output folder not needed for pipeline") + return None + else: + # Load all conversations at startup + print(f"📂 Loading conversations from {args.folder}...") + conversations = await load_conversations(args.folder, limit=args.limit) + print(f"✅ Loaded {len(conversations)} conversations") + + # Batch evaluation with multiple judges + folder_name = Path(args.folder).name + + _, output_folder = await judge_conversations( + judge_models=judge_models, + conversations=conversations, + rubric_config=rubric_config, + max_concurrent=args.max_concurrent, + output_root=args.output, + conversation_folder_name=folder_name, + verbose=True, + judge_model_extra_params=args.judge_model_extra_params, + per_judge=args.per_judge, + verbose_workers=args.verbose_workers, + ) + + return output_folder diff --git a/run_pipeline.py b/run_pipeline.py index ef702ee9..21e50433 100644 --- a/run_pipeline.py +++ b/run_pipeline.py @@ -186,16 +186,8 @@ async def main(): # Import generate and judge main functions # We import here to avoid circular dependencies and to allow --debug flag to be set - # Import judge.py main function - # (note: judge.py is a module file, judge/ is a package) - import importlib.util - from generate import main as generate_main - - spec = importlib.util.spec_from_file_location("judge_script", "judge.py") - judge_script = importlib.util.module_from_spec(spec) - spec.loader.exec_module(judge_script) - judge_main = judge_script.main + from judge.cli import main as judge_main # Set debug mode if flag is provided if args.debug: diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py index fdf7797b..a39b691f 100644 --- a/tests/integration/test_pipeline.py +++ b/tests/integration/test_pipeline.py @@ -647,15 +647,13 @@ async def mock_generate(*args, **kwargs): with patch("generate.main", side_effect=mock_generate): # Mock sys.exit to raise SystemExit instead of actually exiting with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: - # Mock importlib to avoid judge loading (not needed for step 1 test) - with patch("importlib.util.spec_from_file_location"): - with patch("sys.argv", valid_pipeline_args): - # Pipeline should raise SystemExit when folder doesn't exist - with pytest.raises(SystemExit): - await pipeline_main() + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit when folder doesn't exist + with pytest.raises(SystemExit): + await pipeline_main() - # Verify sys.exit(1) was called - mock_exit.assert_called_once_with(1) + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) @pytest.mark.asyncio async def test_step1_validation_no_conversation_files( @@ -678,14 +676,13 @@ async def mock_generate(*args, **kwargs): with patch("generate.main", side_effect=mock_generate): # Mock sys.exit to raise SystemExit instead of actually exiting with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: - with patch("importlib.util.spec_from_file_location"): - with patch("sys.argv", valid_pipeline_args): - # Pipeline should raise SystemExit - with pytest.raises(SystemExit): - await pipeline_main() + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() - # Verify sys.exit(1) was called - mock_exit.assert_called_once_with(1) + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) @pytest.mark.asyncio async def test_step1_validation_only_log_files(self, tmp_path, valid_pipeline_args): @@ -708,14 +705,13 @@ async def mock_generate(*args, **kwargs): with patch("generate.main", side_effect=mock_generate): # Mock sys.exit to raise SystemExit instead of actually exiting with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: - with patch("importlib.util.spec_from_file_location"): - with patch("sys.argv", valid_pipeline_args): - # Pipeline should raise SystemExit - with pytest.raises(SystemExit): - await pipeline_main() + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() - # Verify sys.exit(1) was called - mock_exit.assert_called_once_with(1) + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) @pytest.mark.asyncio async def test_step2_validation_no_evaluation_folder( @@ -723,7 +719,7 @@ async def test_step2_validation_no_evaluation_folder( ): """Test that pipeline exits if Step 2 returns None.""" import sys - from unittest.mock import MagicMock, patch + from unittest.mock import patch from run_pipeline import main as pipeline_main @@ -736,18 +732,13 @@ async def test_step2_validation_no_evaluation_folder( async def mock_generate(*args, **kwargs): return None, str(conv_folder) - # Mock judge_main to return None + # Mock judge CLI main to return None async def mock_judge(args): return None - # Create a mock module with the mock judge main function - mock_judge_module = MagicMock() - mock_judge_module.main = mock_judge - with ( patch("generate.main", side_effect=mock_generate), - patch("importlib.util.module_from_spec", return_value=mock_judge_module), - patch("importlib.util.spec_from_file_location"), + patch("judge.cli.main", side_effect=mock_judge), ): # Mock sys.exit to raise SystemExit instead of actually exiting with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: @@ -765,7 +756,7 @@ async def test_step2_validation_folder_not_exists( ): """Test that pipeline exits if Step 2 folder doesn't exist.""" import sys - from unittest.mock import MagicMock, patch + from unittest.mock import patch from run_pipeline import main as pipeline_main @@ -778,18 +769,13 @@ async def test_step2_validation_folder_not_exists( async def mock_generate(*args, **kwargs): return None, str(conv_folder) - # Mock judge_main to return non-existent folder + # Mock judge CLI main to return non-existent folder async def mock_judge(args): return str(tmp_path / "nonexistent_eval") - # Create a mock module with the mock judge main function - mock_judge_module = MagicMock() - mock_judge_module.main = mock_judge - with ( patch("generate.main", side_effect=mock_generate), - patch("importlib.util.module_from_spec", return_value=mock_judge_module), - patch("importlib.util.spec_from_file_location"), + patch("judge.cli.main", side_effect=mock_judge), ): # Mock sys.exit to raise SystemExit instead of actually exiting with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: @@ -805,7 +791,7 @@ async def mock_judge(args): async def test_step2_validation_no_results_csv(self, tmp_path, valid_pipeline_args): """Test that pipeline exits if Step 2 produces no results.csv.""" import sys - from unittest.mock import MagicMock, patch + from unittest.mock import patch from run_pipeline import main as pipeline_main @@ -823,18 +809,13 @@ async def test_step2_validation_no_results_csv(self, tmp_path, valid_pipeline_ar async def mock_generate(*args, **kwargs): return None, str(conv_folder) - # Mock judge_main to return folder without results.csv + # Mock judge CLI main to return folder without results.csv async def mock_judge(args): return str(eval_folder) - # Create a mock module with the mock judge main function - mock_judge_module = MagicMock() - mock_judge_module.main = mock_judge - with ( patch("generate.main", side_effect=mock_generate), - patch("importlib.util.module_from_spec", return_value=mock_judge_module), - patch("importlib.util.spec_from_file_location"), + patch("judge.cli.main", side_effect=mock_judge), ): # Mock sys.exit to raise SystemExit instead of actually exiting with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: @@ -852,7 +833,7 @@ async def test_step2_validation_empty_folder_error_message( ): """Test that error message lists files when folder is not empty.""" import sys - from unittest.mock import MagicMock, patch + from unittest.mock import patch from run_pipeline import main as pipeline_main @@ -875,14 +856,9 @@ async def mock_generate(*args, **kwargs): async def mock_judge(args): return str(eval_folder) - # Create a mock module with the mock judge main function - mock_judge_module = MagicMock() - mock_judge_module.main = mock_judge - with ( patch("generate.main", side_effect=mock_generate), - patch("importlib.util.module_from_spec", return_value=mock_judge_module), - patch("importlib.util.spec_from_file_location"), + patch("judge.cli.main", side_effect=mock_judge), ): # Mock sys.exit to raise SystemExit instead of actually exiting with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: @@ -906,7 +882,7 @@ async def test_validation_success_messages( self, tmp_path, valid_pipeline_args, capsys ): """Test that validation success messages are displayed.""" - from unittest.mock import MagicMock, patch + from unittest.mock import patch from run_pipeline import main as pipeline_main @@ -933,14 +909,9 @@ async def mock_judge(args): def mock_score(*args, **kwargs): return {} - # Create a mock module with the mock judge main function - mock_judge_module = MagicMock() - mock_judge_module.main = mock_judge - with ( patch("generate.main", side_effect=mock_generate), - patch("importlib.util.module_from_spec", return_value=mock_judge_module), - patch("importlib.util.spec_from_file_location"), + patch("judge.cli.main", side_effect=mock_judge), patch("run_pipeline.score_results", new=mock_score), patch("run_pipeline.print_scores"), patch("run_pipeline.create_visualizations"), diff --git a/tests/unit/judge/test_judge_cli.py b/tests/unit/judge/test_judge_cli.py index cc3723ff..7c7c9887 100644 --- a/tests/unit/judge/test_judge_cli.py +++ b/tests/unit/judge/test_judge_cli.py @@ -1,25 +1,16 @@ -"""Unit tests for judge.py CLI and main entrypoint.""" +"""Unit tests for judge CLI and main entrypoint (judge.cli).""" -import importlib.util -from pathlib import Path from unittest.mock import AsyncMock, patch import pytest -# Load judge.py script (project root) so we can test get_parser and main -_PROJECT_ROOT = Path(__file__).resolve().parents[3] -_JUDGE_SCRIPT = _PROJECT_ROOT / "judge.py" -_spec = importlib.util.spec_from_file_location("judge_script", _JUDGE_SCRIPT) -assert _spec is not None and _spec.loader is not None -_judge_script = importlib.util.module_from_spec(_spec) -_spec.loader.exec_module(_judge_script) -get_parser = _judge_script.get_parser -main = _judge_script.main +from judge import cli as judge_cli +from judge.cli import get_parser, main @pytest.mark.unit class TestJudgeParser: - """Test judge.py argument parser (get_parser()).""" + """Test judge CLI argument parser (get_parser()).""" def test_requires_conversation_or_folder(self): """Parser requires exactly one of --conversation or --folder.""" @@ -132,11 +123,11 @@ async def test_main_single_conversation_calls_judge_single(self): ] ) with ( - patch.object(_judge_script, "RubricConfig") as RubricConfig, - patch.object(_judge_script, "ConversationData") as ConversationData, - patch.object(_judge_script, "LLMJudge") as LLMJudge, + patch.object(judge_cli, "RubricConfig") as RubricConfig, + patch.object(judge_cli, "ConversationData") as ConversationData, + patch.object(judge_cli, "LLMJudge") as LLMJudge, patch.object( - _judge_script, + judge_cli, "judge_single_conversation", new_callable=AsyncMock, ) as judge_single, @@ -180,14 +171,14 @@ async def test_main_folder_calls_judge_conversations(self): ] ) with ( - patch.object(_judge_script, "RubricConfig") as RubricConfig, + patch.object(judge_cli, "RubricConfig") as RubricConfig, patch.object( - _judge_script, + judge_cli, "load_conversations", new_callable=AsyncMock, ) as load_convos, patch.object( - _judge_script, + judge_cli, "judge_conversations", new_callable=AsyncMock, ) as judge_convos,