From 2a41bbac4d11ed0d1ec7c1ed0eccef6bea054415 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg
Date: Wed, 28 Jan 2026 15:20:30 +0100
Subject: [PATCH 1/3] Require explicit model_name for SWE-Bench outputs

Co-authored-by: openhands
---
 benchmarks/swebench/eval_infer.py           | 19 +++++++++++++++----
 benchmarks/swebenchmultimodal/eval_infer.py | 19 +++++++++++++++----
 tests/test_swebenchmultimodal.py            | 24 +++++++++++++++++++++++-
 3 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
index f252a56a..96b61e08 100644
--- a/benchmarks/swebench/eval_infer.py
+++ b/benchmarks/swebench/eval_infer.py
@@ -26,7 +26,7 @@
 
 
 def convert_to_swebench_format(
-    input_file: str, output_file: str, model_name: str = "OpenHands"
+    input_file: str, output_file: str, model_name: str
 ) -> None:
     """
     Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -46,11 +46,19 @@ def convert_to_swebench_format(
         {
             "instance_id": "django__django-11333",
             "model_patch": "diff --git a/file.py b/file.py\n...",
-            "model_name_or_path": "OpenHands"
+            "model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
         }
+
+    The model identifier is required for attribution in SWE-Bench reports and
+    filenames. Typical values mirror the LLM config's `model` field, e.g.,
+    "litellm_proxy/claude-sonnet-4-5-20250929" or
+    "litellm_proxy/claude-haiku-4-5-20251001".
     """
     logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")
 
+    if not model_name:
+        raise ValueError("model_name is required and cannot be empty")
+
     converted_count = 0
     error_count = 0
 
@@ -215,8 +223,11 @@ def main() -> None:
 
     parser.add_argument(
         "--model-name",
-        default="openhands",
-        help="Model name to use in the model_name_or_path field (default: openhands)",
+        required=True,
+        help=(
+            "Model identifier to record in model_name_or_path "
+            "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
+        ),
     )
 
     parser.add_argument(
diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py
index 0984b3e5..ebef00bd 100644
--- a/benchmarks/swebenchmultimodal/eval_infer.py
+++ b/benchmarks/swebenchmultimodal/eval_infer.py
@@ -157,7 +157,7 @@ def update_report_with_component_scores(report_json_path: Path) -> dict[str, flo
 
 
 def convert_to_swebench_format(
-    input_file: str, output_file: str, model_name: str = "OpenHands"
+    input_file: str, output_file: str, model_name: str
 ) -> None:
     """
     Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -177,11 +177,19 @@ def convert_to_swebench_format(
         {
             "instance_id": "django__django-11333",
             "model_patch": "diff --git a/file.py b/file.py\n...",
-            "model_name_or_path": "OpenHands"
+            "model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
         }
+
+    The model identifier is required for attribution in SWE-Bench reports and
+    filenames. Typical values mirror the LLM config's `model` field, e.g.,
+    "litellm_proxy/claude-sonnet-4-5-20250929" or
+    "litellm_proxy/claude-haiku-4-5-20251001".
""" logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}") + if not model_name: + raise ValueError("model_name is required and cannot be empty") + converted_count = 0 error_count = 0 @@ -400,8 +408,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + required=True, + help=( + "Model identifier to record in model_name_or_path " + "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)" + ), ) parser.add_argument( diff --git a/tests/test_swebenchmultimodal.py b/tests/test_swebenchmultimodal.py index 3238ee90..f13b9f6f 100644 --- a/tests/test_swebenchmultimodal.py +++ b/tests/test_swebenchmultimodal.py @@ -22,9 +22,31 @@ def test_empty_input_file_does_not_raise(self): output_path = outfile.name # Should not raise - let the harness handle empty results - convert_to_swebench_format(input_path, output_path) + convert_to_swebench_format( + input_path, output_path, "litellm_proxy/claude-sonnet-4-5-20250929" + ) # Verify output file is empty with open(output_path, "r") as f: lines = f.readlines() assert len(lines) == 0 + + def test_raises_when_model_name_missing(self): + """Ensure a missing model identifier is rejected.""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".jsonl", delete=False + ) as infile: + infile.write("") + input_path = infile.name + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".swebench.jsonl", delete=False + ) as outfile: + output_path = outfile.name + + try: + convert_to_swebench_format(input_path, output_path, None) # type: ignore[arg-type] + except ValueError: + return + + assert False, "Expected ValueError when model_name_or_path is None" From 16bbbac3ea370565d0f2fda035bbb658272f1dc6 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 28 Jan 2026 15:35:41 +0100 Subject: [PATCH 2/3] Require explicit model_name across eval converters Co-authored-by: openhands --- benchmarks/commit0/eval_infer.py | 19 ++++++++++++++----- benchmarks/gaia/eval_infer.py | 19 +++++++++++++++---- benchmarks/swtbench/eval_infer.py | 19 +++++++++++++++---- 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index f03e73f6..f3219b69 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -26,9 +26,7 @@ logger = logging.getLogger(__name__) -def process_commit0_results( - input_file: str, output_file: str, model_name: str = "openhands" -) -> None: +def process_commit0_results(input_file: str, output_file: str, model_name: str) -> None: """ Process Commit0 output.jsonl and generate evaluation report. @@ -63,9 +61,17 @@ def process_commit0_results( "resolved_ids": [...], "unresolved_ids": [...] } + + The model identifier is required for attribution in downstream reports and + filenames. Typical values mirror the LLM config's `model` field, e.g., + "litellm_proxy/claude-sonnet-4-5-20250929" or + "litellm_proxy/claude-haiku-4-5-20251001". 
""" logger.info(f"Processing {input_file} to generate report: {output_file}") + if not model_name: + raise ValueError("model_name is required and cannot be empty") + completed_ids = [] resolved_ids = [] unresolved_ids = [] @@ -174,8 +180,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + required=True, + help=( + "Model identifier to record in model_name_or_path " + "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)" + ), ) args = parser.parse_args() diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 889d132d..af0707c3 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -29,7 +29,7 @@ def process_gaia_results( input_file: str, output_file: str, - model_name: str = "openhands", + model_name: str, ) -> None: """ Process GAIA output.jsonl and generate evaluation report. @@ -49,7 +49,7 @@ def process_gaia_results( Report format (similar to SWE-Bench): { - "model_name_or_path": "openhands", + "model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929", "total_instances": 165, "submitted_instances": 165, "completed_instances": 165, @@ -64,9 +64,17 @@ def process_gaia_results( "resolved_ids": [...], "unresolved_ids": [...] } + + The model identifier is required for attribution in GAIA reports and + filenames. Typical values mirror the LLM config's `model` field, e.g., + "litellm_proxy/claude-sonnet-4-5-20250929" or + "litellm_proxy/claude-haiku-4-5-20251001". """ logger.info(f"Processing {input_file} to generate report: {output_file}") + if not model_name: + raise ValueError("model_name is required and cannot be empty") + completed_ids = [] resolved_ids = [] unresolved_ids = [] @@ -197,8 +205,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + required=True, + help=( + "Model identifier to record in model_name_or_path " + "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)" + ), ) args = parser.parse_args() diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4f5f0632..24a76929 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -147,7 +147,7 @@ def update_report_with_submitted_instances( def convert_to_swtbench_format( - input_file: str, output_file: str, model_name: str = "OpenHands" + input_file: str, output_file: str, model_name: str ) -> None: """ Convert OpenHands output.jsonl to SWT-Bench prediction format. @@ -167,11 +167,19 @@ def convert_to_swtbench_format( { "instance_id": "sympy__sympy-20590", "model_patch": "diff --git a/file.py b/file.py\n...", - "model_name_or_path": "OpenHands" + "model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929" } + + The model identifier is required for attribution in SWT-Bench reports and + filenames. Typical values mirror the LLM config's `model` field, e.g., + "litellm_proxy/claude-sonnet-4-5-20250929" or + "litellm_proxy/claude-haiku-4-5-20251001". 
""" logger.info(f"Converting {input_file} to SWT-Bench format: {output_file}") + if not model_name: + raise ValueError("model_name is required and cannot be empty") + converted_count = 0 error_count = 0 @@ -378,8 +386,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="OpenHands", - help="Model name to use in the model_name_or_path field (default: OpenHands)", + required=True, + help=( + "Model identifier to record in model_name_or_path " + "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)" + ), ) parser.add_argument( From c5c482085d3d25b417a0c0fe423318da19cdf4a5 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 28 Jan 2026 16:07:49 +0100 Subject: [PATCH 3/3] Fix Commit0 tests after model_name requirement Co-authored-by: openhands --- benchmarks/commit0/tests/test_eval_infer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/commit0/tests/test_eval_infer.py b/benchmarks/commit0/tests/test_eval_infer.py index 17c907e5..9c777b78 100644 --- a/benchmarks/commit0/tests/test_eval_infer.py +++ b/benchmarks/commit0/tests/test_eval_infer.py @@ -33,7 +33,11 @@ def test_output_file_naming(): expected_output_file = Path(tmpdir) / "output.report.json" # Process the results - process_commit0_results(str(input_file), str(expected_output_file)) + process_commit0_results( + str(input_file), + str(expected_output_file), + "litellm_proxy/claude-sonnet-4-5-20250929", + ) # Verify the output file was created assert expected_output_file.exists(), ( @@ -75,7 +79,11 @@ def test_output_file_naming_with_different_input_name(): expected_output_file = Path(tmpdir) / "results.report.json" # Process the results - process_commit0_results(str(input_file), str(expected_output_file)) + process_commit0_results( + str(input_file), + str(expected_output_file), + "litellm_proxy/claude-sonnet-4-5-20250929", + ) # Verify the output file was created assert expected_output_file.exists(), (