From 2a41bbac4d11ed0d1ec7c1ed0eccef6bea054415 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg
Date: Wed, 28 Jan 2026 15:20:30 +0100
Subject: [PATCH 1/3] Require explicit model_name for SWE-Bench outputs

Co-authored-by: openhands
---
 benchmarks/swebench/eval_infer.py           | 19 +++++++++++++++----
 benchmarks/swebenchmultimodal/eval_infer.py | 19 +++++++++++++++----
 tests/test_swebenchmultimodal.py            | 24 +++++++++++++++++++++++-
 3 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py
index f252a56a..96b61e08 100644
--- a/benchmarks/swebench/eval_infer.py
+++ b/benchmarks/swebench/eval_infer.py
@@ -26,7 +26,7 @@
 
 
 def convert_to_swebench_format(
-    input_file: str, output_file: str, model_name: str = "OpenHands"
+    input_file: str, output_file: str, model_name: str
 ) -> None:
     """
     Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -46,11 +46,19 @@ def convert_to_swebench_format(
         {
             "instance_id": "django__django-11333",
             "model_patch": "diff --git a/file.py b/file.py\n...",
-            "model_name_or_path": "OpenHands"
+            "model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
         }
+
+    The model identifier is required for attribution in SWE-Bench reports and
+    filenames. Typical values mirror the LLM config's `model` field, e.g.,
+    "litellm_proxy/claude-sonnet-4-5-20250929" or
+    "litellm_proxy/claude-haiku-4-5-20251001".
     """
     logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")
 
+    if not model_name:
+        raise ValueError("model_name is required and cannot be empty")
+
     converted_count = 0
     error_count = 0
 
@@ -215,8 +223,11 @@ def main() -> None:
 
     parser.add_argument(
         "--model-name",
-        default="openhands",
-        help="Model name to use in the model_name_or_path field (default: openhands)",
+        required=True,
+        help=(
+            "Model identifier to record in model_name_or_path "
+            "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
+        ),
     )
 
     parser.add_argument(
diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py
index 0984b3e5..ebef00bd 100644
--- a/benchmarks/swebenchmultimodal/eval_infer.py
+++ b/benchmarks/swebenchmultimodal/eval_infer.py
@@ -157,7 +157,7 @@ def update_report_with_component_scores(report_json_path: Path) -> dict[str, flo
 
 
 def convert_to_swebench_format(
-    input_file: str, output_file: str, model_name: str = "OpenHands"
+    input_file: str, output_file: str, model_name: str
 ) -> None:
     """
     Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -177,11 +177,19 @@ def convert_to_swebench_format(
         {
             "instance_id": "django__django-11333",
             "model_patch": "diff --git a/file.py b/file.py\n...",
-            "model_name_or_path": "OpenHands"
+            "model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
         }
+
+    The model identifier is required for attribution in SWE-Bench reports and
+    filenames. Typical values mirror the LLM config's `model` field, e.g.,
+    "litellm_proxy/claude-sonnet-4-5-20250929" or
+    "litellm_proxy/claude-haiku-4-5-20251001".
""" logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}") + if not model_name: + raise ValueError("model_name is required and cannot be empty") + converted_count = 0 error_count = 0 @@ -400,8 +408,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + required=True, + help=( + "Model identifier to record in model_name_or_path " + "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)" + ), ) parser.add_argument( diff --git a/tests/test_swebenchmultimodal.py b/tests/test_swebenchmultimodal.py index 3238ee90..f13b9f6f 100644 --- a/tests/test_swebenchmultimodal.py +++ b/tests/test_swebenchmultimodal.py @@ -22,9 +22,31 @@ def test_empty_input_file_does_not_raise(self): output_path = outfile.name # Should not raise - let the harness handle empty results - convert_to_swebench_format(input_path, output_path) + convert_to_swebench_format( + input_path, output_path, "litellm_proxy/claude-sonnet-4-5-20250929" + ) # Verify output file is empty with open(output_path, "r") as f: lines = f.readlines() assert len(lines) == 0 + + def test_raises_when_model_name_missing(self): + """Ensure a missing model identifier is rejected.""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".jsonl", delete=False + ) as infile: + infile.write("") + input_path = infile.name + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".swebench.jsonl", delete=False + ) as outfile: + output_path = outfile.name + + try: + convert_to_swebench_format(input_path, output_path, None) # type: ignore[arg-type] + except ValueError: + return + + assert False, "Expected ValueError when model_name_or_path is None" From 16bbbac3ea370565d0f2fda035bbb658272f1dc6 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 28 Jan 2026 15:35:41 +0100 Subject: [PATCH 2/3] Require explicit model_name across eval converters Co-authored-by: openhands --- benchmarks/commit0/eval_infer.py | 19 ++++++++++++++----- benchmarks/gaia/eval_infer.py | 19 +++++++++++++++---- benchmarks/swtbench/eval_infer.py | 19 +++++++++++++++---- 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index f03e73f6..f3219b69 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -26,9 +26,7 @@ logger = logging.getLogger(__name__) -def process_commit0_results( - input_file: str, output_file: str, model_name: str = "openhands" -) -> None: +def process_commit0_results(input_file: str, output_file: str, model_name: str) -> None: """ Process Commit0 output.jsonl and generate evaluation report. @@ -63,9 +61,17 @@ def process_commit0_results( "resolved_ids": [...], "unresolved_ids": [...] } + + The model identifier is required for attribution in downstream reports and + filenames. Typical values mirror the LLM config's `model` field, e.g., + "litellm_proxy/claude-sonnet-4-5-20250929" or + "litellm_proxy/claude-haiku-4-5-20251001". 
""" logger.info(f"Processing {input_file} to generate report: {output_file}") + if not model_name: + raise ValueError("model_name is required and cannot be empty") + completed_ids = [] resolved_ids = [] unresolved_ids = [] @@ -174,8 +180,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + required=True, + help=( + "Model identifier to record in model_name_or_path " + "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)" + ), ) args = parser.parse_args() diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 889d132d..af0707c3 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -29,7 +29,7 @@ def process_gaia_results( input_file: str, output_file: str, - model_name: str = "openhands", + model_name: str, ) -> None: """ Process GAIA output.jsonl and generate evaluation report. @@ -49,7 +49,7 @@ def process_gaia_results( Report format (similar to SWE-Bench): { - "model_name_or_path": "openhands", + "model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929", "total_instances": 165, "submitted_instances": 165, "completed_instances": 165, @@ -64,9 +64,17 @@ def process_gaia_results( "resolved_ids": [...], "unresolved_ids": [...] } + + The model identifier is required for attribution in GAIA reports and + filenames. Typical values mirror the LLM config's `model` field, e.g., + "litellm_proxy/claude-sonnet-4-5-20250929" or + "litellm_proxy/claude-haiku-4-5-20251001". """ logger.info(f"Processing {input_file} to generate report: {output_file}") + if not model_name: + raise ValueError("model_name is required and cannot be empty") + completed_ids = [] resolved_ids = [] unresolved_ids = [] @@ -197,8 +205,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="openhands", - help="Model name to use in the model_name_or_path field (default: openhands)", + required=True, + help=( + "Model identifier to record in model_name_or_path " + "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)" + ), ) args = parser.parse_args() diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4f5f0632..24a76929 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -147,7 +147,7 @@ def update_report_with_submitted_instances( def convert_to_swtbench_format( - input_file: str, output_file: str, model_name: str = "OpenHands" + input_file: str, output_file: str, model_name: str ) -> None: """ Convert OpenHands output.jsonl to SWT-Bench prediction format. @@ -167,11 +167,19 @@ def convert_to_swtbench_format( { "instance_id": "sympy__sympy-20590", "model_patch": "diff --git a/file.py b/file.py\n...", - "model_name_or_path": "OpenHands" + "model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929" } + + The model identifier is required for attribution in SWT-Bench reports and + filenames. Typical values mirror the LLM config's `model` field, e.g., + "litellm_proxy/claude-sonnet-4-5-20250929" or + "litellm_proxy/claude-haiku-4-5-20251001". 
""" logger.info(f"Converting {input_file} to SWT-Bench format: {output_file}") + if not model_name: + raise ValueError("model_name is required and cannot be empty") + converted_count = 0 error_count = 0 @@ -378,8 +386,11 @@ def main() -> None: parser.add_argument( "--model-name", - default="OpenHands", - help="Model name to use in the model_name_or_path field (default: OpenHands)", + required=True, + help=( + "Model identifier to record in model_name_or_path " + "(e.g., litellm_proxy/claude-sonnet-4-5-20250929)" + ), ) parser.add_argument( From c5c482085d3d25b417a0c0fe423318da19cdf4a5 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 28 Jan 2026 16:07:49 +0100 Subject: [PATCH 3/3] Fix Commit0 tests after model_name requirement Co-authored-by: openhands --- benchmarks/commit0/tests/test_eval_infer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/benchmarks/commit0/tests/test_eval_infer.py b/benchmarks/commit0/tests/test_eval_infer.py index 17c907e5..9c777b78 100644 --- a/benchmarks/commit0/tests/test_eval_infer.py +++ b/benchmarks/commit0/tests/test_eval_infer.py @@ -33,7 +33,11 @@ def test_output_file_naming(): expected_output_file = Path(tmpdir) / "output.report.json" # Process the results - process_commit0_results(str(input_file), str(expected_output_file)) + process_commit0_results( + str(input_file), + str(expected_output_file), + "litellm_proxy/claude-sonnet-4-5-20250929", + ) # Verify the output file was created assert expected_output_file.exists(), ( @@ -75,7 +79,11 @@ def test_output_file_naming_with_different_input_name(): expected_output_file = Path(tmpdir) / "results.report.json" # Process the results - process_commit0_results(str(input_file), str(expected_output_file)) + process_commit0_results( + str(input_file), + str(expected_output_file), + "litellm_proxy/claude-sonnet-4-5-20250929", + ) # Verify the output file was created assert expected_output_file.exists(), (