19 changes: 14 additions & 5 deletions benchmarks/commit0/eval_infer.py
@@ -26,9 +26,7 @@
logger = logging.getLogger(__name__)


-def process_commit0_results(
-    input_file: str, output_file: str, model_name: str = "openhands"
-) -> None:
+def process_commit0_results(input_file: str, output_file: str, model_name: str) -> None:
"""
Process Commit0 output.jsonl and generate evaluation report.

@@ -63,9 +61,17 @@ def process_commit0_results(
"resolved_ids": [...],
"unresolved_ids": [...]
}

The model identifier is required for attribution in downstream reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Processing {input_file} to generate report: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

completed_ids = []
resolved_ids = []
unresolved_ids = []
@@ -174,8 +180,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

args = parser.parse_args()
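With the default removed, callers must pass the model identifier explicitly. A minimal sketch of a direct call, mirroring how the updated tests invoke the function; the import path is an assumption based on the file location, not something shown in this diff:

from benchmarks.commit0.eval_infer import process_commit0_results

# model_name is now a required third argument; an empty value raises ValueError
# before any results are processed.
process_commit0_results(
    "output.jsonl",                              # inference results to evaluate
    "output.report.json",                        # evaluation report to write
    "litellm_proxy/claude-sonnet-4-5-20250929",  # recorded as model_name_or_path
)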
12 changes: 10 additions & 2 deletions benchmarks/commit0/tests/test_eval_infer.py
@@ -33,7 +33,11 @@ def test_output_file_naming():
expected_output_file = Path(tmpdir) / "output.report.json"

# Process the results
-process_commit0_results(str(input_file), str(expected_output_file))
+process_commit0_results(
+    str(input_file),
+    str(expected_output_file),
+    "litellm_proxy/claude-sonnet-4-5-20250929",
+)

# Verify the output file was created
assert expected_output_file.exists(), (
@@ -75,7 +79,11 @@ def test_output_file_naming_with_different_input_name():
expected_output_file = Path(tmpdir) / "results.report.json"

# Process the results
-process_commit0_results(str(input_file), str(expected_output_file))
+process_commit0_results(
+    str(input_file),
+    str(expected_output_file),
+    "litellm_proxy/claude-sonnet-4-5-20250929",
+)

# Verify the output file was created
assert expected_output_file.exists(), (
19 changes: 15 additions & 4 deletions benchmarks/gaia/eval_infer.py
@@ -29,7 +29,7 @@
def process_gaia_results(
input_file: str,
output_file: str,
model_name: str = "openhands",
model_name: str,
) -> None:
"""
Process GAIA output.jsonl and generate evaluation report.
@@ -49,7 +49,7 @@ def process_gaia_results(

Report format (similar to SWE-Bench):
{
"model_name_or_path": "openhands",
"model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929",
"total_instances": 165,
"submitted_instances": 165,
"completed_instances": 165,
@@ -64,9 +64,17 @@
"resolved_ids": [...],
"unresolved_ids": [...]
}

The model identifier is required for attribution in GAIA reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Processing {input_file} to generate report: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

completed_ids = []
resolved_ids = []
unresolved_ids = []
@@ -197,8 +205,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

args = parser.parse_args()
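Because `--model-name` is now required, running the GAIA script without it aborts during argument parsing with argparse's standard "the following arguments are required: --model-name" error, instead of silently attributing results to a default. The same requirement applies when calling the function directly; a minimal sketch of the new guard, with the import path assumed from the file location:

from benchmarks.gaia.eval_infer import process_gaia_results

try:
    # An empty identifier is rejected before any records are read.
    process_gaia_results("output.jsonl", "output.report.json", "")
except ValueError as exc:
    print(exc)  # model_name is required and cannot be empty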
19 changes: 15 additions & 4 deletions benchmarks/swebench/eval_infer.py
@@ -26,7 +26,7 @@


def convert_to_swebench_format(
-input_file: str, output_file: str, model_name: str = "OpenHands"
+input_file: str, output_file: str, model_name: str
) -> None:
"""
Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -46,11 +46,19 @@ def convert_to_swebench_format(
{
"instance_id": "django__django-11333",
"model_patch": "diff --git a/file.py b/file.py\n...",
"model_name_or_path": "OpenHands"
"model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
Review thread on this line:

Collaborator: Model name should include OpenHands; it can also include the LLM model.

Collaborator (Author): OK. But I asked an agent "what are the actual values that are given to this function given our CI framework" and those are the values. So we need to fix something, I guess.

Collaborator (Author): I guess this function is not used the way it's intended to be used? There should be no argument model_name or model_name_or_path, and "OpenHands" should just be what's written there?

}
The model identifier is required for attribution in SWE-Bench reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

converted_count = 0
error_count = 0

@@ -215,8 +223,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

parser.add_argument(
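Downstream, each line the converter writes is one JSON object in the SWE-Bench prediction format shown in the docstring, with the supplied identifier in the attribution field. A minimal sketch, assuming the import path from the file location and an existing output.jsonl with at least one converted instance:

import json

from benchmarks.swebench.eval_infer import convert_to_swebench_format

convert_to_swebench_format(
    "output.jsonl",
    "predictions.swebench.jsonl",
    "litellm_proxy/claude-sonnet-4-5-20250929",
)

# Inspect the first prediction record; model_name_or_path carries the value passed above.
with open("predictions.swebench.jsonl") as f:
    first = json.loads(f.readline())
print(first["instance_id"], first["model_name_or_path"])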
19 changes: 15 additions & 4 deletions benchmarks/swebenchmultimodal/eval_infer.py
@@ -157,7 +157,7 @@ def update_report_with_component_scores(report_json_path: Path) -> dict[str, flo


def convert_to_swebench_format(
-input_file: str, output_file: str, model_name: str = "OpenHands"
+input_file: str, output_file: str, model_name: str
) -> None:
"""
Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -177,11 +177,19 @@ def convert_to_swebench_format(
{
"instance_id": "django__django-11333",
"model_patch": "diff --git a/file.py b/file.py\n...",
"model_name_or_path": "OpenHands"
"model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
}

The model identifier is required for attribution in SWE-Bench reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

converted_count = 0
error_count = 0

@@ -400,8 +408,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

parser.add_argument(
19 changes: 15 additions & 4 deletions benchmarks/swtbench/eval_infer.py
@@ -147,7 +147,7 @@ def update_report_with_submitted_instances(


def convert_to_swtbench_format(
-input_file: str, output_file: str, model_name: str = "OpenHands"
+input_file: str, output_file: str, model_name: str
) -> None:
"""
Convert OpenHands output.jsonl to SWT-Bench prediction format.
@@ -167,11 +167,19 @@ def convert_to_swtbench_format(
{
"instance_id": "sympy__sympy-20590",
"model_patch": "diff --git a/file.py b/file.py\n...",
"model_name_or_path": "OpenHands"
"model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
}

The model identifier is required for attribution in SWT-Bench reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Converting {input_file} to SWT-Bench format: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

converted_count = 0
error_count = 0

@@ -378,8 +386,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="OpenHands",
help="Model name to use in the model_name_or_path field (default: OpenHands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

parser.add_argument(
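The argparse change is the same across all five scripts: required=True with no default. A self-contained illustration of what that means for the command line, using only standard-library argparse rather than the benchmark script itself:

import argparse

# With required=True and no default, omitting the flag makes argparse exit with
# "error: the following arguments are required: --model-name" instead of falling
# back to a hard-coded attribution value.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model-name",
    required=True,
    help="Model identifier to record in model_name_or_path",
)
args = parser.parse_args(["--model-name", "litellm_proxy/claude-sonnet-4-5-20250929"])
print(args.model_name)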
24 changes: 23 additions & 1 deletion tests/test_swebenchmultimodal.py
@@ -22,9 +22,31 @@ def test_empty_input_file_does_not_raise(self):
output_path = outfile.name

# Should not raise - let the harness handle empty results
-convert_to_swebench_format(input_path, output_path)
+convert_to_swebench_format(
+    input_path, output_path, "litellm_proxy/claude-sonnet-4-5-20250929"
+)

# Verify output file is empty
with open(output_path, "r") as f:
lines = f.readlines()
assert len(lines) == 0

def test_raises_when_model_name_missing(self):
"""Ensure a missing model identifier is rejected."""
with tempfile.NamedTemporaryFile(
mode="w", suffix=".jsonl", delete=False
) as infile:
infile.write("")
input_path = infile.name

with tempfile.NamedTemporaryFile(
mode="w", suffix=".swebench.jsonl", delete=False
) as outfile:
output_path = outfile.name

try:
convert_to_swebench_format(input_path, output_path, None) # type: ignore[arg-type]
except ValueError:
return

assert False, "Expected ValueError when model_name_or_path is None"