19 changes: 14 additions & 5 deletions benchmarks/commit0/eval_infer.py
@@ -26,9 +26,7 @@
logger = logging.getLogger(__name__)


-def process_commit0_results(
-    input_file: str, output_file: str, model_name: str = "openhands"
-) -> None:
+def process_commit0_results(input_file: str, output_file: str, model_name: str) -> None:
"""
Process Commit0 output.jsonl and generate evaluation report.

@@ -63,9 +61,17 @@ def process_commit0_results(
"resolved_ids": [...],
"unresolved_ids": [...]
}

The model identifier is required for attribution in downstream reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Processing {input_file} to generate report: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

completed_ids = []
resolved_ids = []
unresolved_ids = []
@@ -174,8 +180,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

args = parser.parse_args()
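With the default removed, callers must pass the model identifier explicitly. A minimal sketch of a direct call, mirroring how the updated tests invoke the function; the import path is an assumption based on the file location, not something shown in this diff:

from benchmarks.commit0.eval_infer import process_commit0_results

# model_name is now a required third argument; an empty value raises ValueError
# before any results are processed.
process_commit0_results(
    "output.jsonl",                              # inference results to evaluate
    "output.report.json",                        # evaluation report to write
    "litellm_proxy/claude-sonnet-4-5-20250929",  # recorded as model_name_or_path
)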
12 changes: 10 additions & 2 deletions benchmarks/commit0/tests/test_eval_infer.py
@@ -33,7 +33,11 @@ def test_output_file_naming():
expected_output_file = Path(tmpdir) / "output.report.json"

# Process the results
-process_commit0_results(str(input_file), str(expected_output_file))
+process_commit0_results(
+    str(input_file),
+    str(expected_output_file),
+    "litellm_proxy/claude-sonnet-4-5-20250929",
+)

# Verify the output file was created
assert expected_output_file.exists(), (
@@ -75,7 +79,11 @@ def test_output_file_naming_with_different_input_name():
expected_output_file = Path(tmpdir) / "results.report.json"

# Process the results
-process_commit0_results(str(input_file), str(expected_output_file))
+process_commit0_results(
+    str(input_file),
+    str(expected_output_file),
+    "litellm_proxy/claude-sonnet-4-5-20250929",
+)

# Verify the output file was created
assert expected_output_file.exists(), (
19 changes: 15 additions & 4 deletions benchmarks/gaia/eval_infer.py
@@ -29,7 +29,7 @@
def process_gaia_results(
input_file: str,
output_file: str,
model_name: str = "openhands",
model_name: str,
) -> None:
"""
Process GAIA output.jsonl and generate evaluation report.
@@ -49,7 +49,7 @@ def process_gaia_results(

Report format (similar to SWE-Bench):
{
"model_name_or_path": "openhands",
"model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929",
"total_instances": 165,
"submitted_instances": 165,
"completed_instances": 165,
@@ -64,9 +64,17 @@
"resolved_ids": [...],
"unresolved_ids": [...]
}

The model identifier is required for attribution in GAIA reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Processing {input_file} to generate report: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

completed_ids = []
resolved_ids = []
unresolved_ids = []
@@ -197,8 +205,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

args = parser.parse_args()
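Because `--model-name` is now required, running the GAIA script without it aborts during argument parsing with argparse's standard "the following arguments are required: --model-name" error, instead of silently attributing results to a default. The same requirement applies when calling the function directly; a minimal sketch of the new guard, with the import path assumed from the file location:

from benchmarks.gaia.eval_infer import process_gaia_results

try:
    # An empty identifier is rejected before any records are read.
    process_gaia_results("output.jsonl", "output.report.json", "")
except ValueError as exc:
    print(exc)  # model_name is required and cannot be empty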
19 changes: 15 additions & 4 deletions benchmarks/swebench/eval_infer.py
@@ -26,7 +26,7 @@


def convert_to_swebench_format(
-input_file: str, output_file: str, model_name: str = "OpenHands"
+input_file: str, output_file: str, model_name: str
) -> None:
"""
Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -46,11 +46,19 @@ def convert_to_swebench_format(
{
"instance_id": "django__django-11333",
"model_patch": "diff --git a/file.py b/file.py\n...",
"model_name_or_path": "OpenHands"
"model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
Review thread on this line:

Collaborator: Model name should include OpenHands; it can also include the LLM model.

Collaborator (Author): OK. But I asked an agent "what are the actual values that are given to this function given our CI framework" and those are the values. So we need to fix something, I guess.

Collaborator (Author): I guess this function is not used the way it's intended to be used? There should be no argument model_name or model_name_or_path, and "OpenHands" should just be what's written there?

}
The model identifier is required for attribution in SWE-Bench reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

converted_count = 0
error_count = 0

@@ -215,8 +223,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

parser.add_argument(
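Downstream, each line the converter writes is one JSON object in the SWE-Bench prediction format shown in the docstring, with the supplied identifier in the attribution field. A minimal sketch, assuming the import path from the file location and an existing output.jsonl with at least one converted instance:

import json

from benchmarks.swebench.eval_infer import convert_to_swebench_format

convert_to_swebench_format(
    "output.jsonl",
    "predictions.swebench.jsonl",
    "litellm_proxy/claude-sonnet-4-5-20250929",
)

# Inspect the first prediction record; model_name_or_path carries the value passed above.
with open("predictions.swebench.jsonl") as f:
    first = json.loads(f.readline())
print(first["instance_id"], first["model_name_or_path"])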
19 changes: 15 additions & 4 deletions benchmarks/swebenchmultimodal/eval_infer.py
@@ -157,7 +157,7 @@ def update_report_with_component_scores(report_json_path: Path) -> dict[str, flo


def convert_to_swebench_format(
-input_file: str, output_file: str, model_name: str = "OpenHands"
+input_file: str, output_file: str, model_name: str
) -> None:
"""
Convert OpenHands output.jsonl to SWE-Bench prediction format.
@@ -177,11 +177,19 @@ def convert_to_swebench_format(
{
"instance_id": "django__django-11333",
"model_patch": "diff --git a/file.py b/file.py\n...",
"model_name_or_path": "OpenHands"
"model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
}

The model identifier is required for attribution in SWE-Bench reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Converting {input_file} to SWE-Bench format: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

converted_count = 0
error_count = 0

@@ -400,8 +408,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="openhands",
help="Model name to use in the model_name_or_path field (default: openhands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

parser.add_argument(
19 changes: 15 additions & 4 deletions benchmarks/swtbench/eval_infer.py
@@ -147,7 +147,7 @@ def update_report_with_submitted_instances(


def convert_to_swtbench_format(
-input_file: str, output_file: str, model_name: str = "OpenHands"
+input_file: str, output_file: str, model_name: str
) -> None:
"""
Convert OpenHands output.jsonl to SWT-Bench prediction format.
@@ -167,11 +167,19 @@ def convert_to_swtbench_format(
{
"instance_id": "sympy__sympy-20590",
"model_patch": "diff --git a/file.py b/file.py\n...",
"model_name_or_path": "OpenHands"
"model_name_or_path": "litellm_proxy/claude-sonnet-4-5-20250929"
}

The model identifier is required for attribution in SWT-Bench reports and
filenames. Typical values mirror the LLM config's `model` field, e.g.,
"litellm_proxy/claude-sonnet-4-5-20250929" or
"litellm_proxy/claude-haiku-4-5-20251001".
"""
logger.info(f"Converting {input_file} to SWT-Bench format: {output_file}")

if not model_name:
raise ValueError("model_name is required and cannot be empty")

converted_count = 0
error_count = 0

@@ -378,8 +386,11 @@ def main() -> None:

parser.add_argument(
"--model-name",
default="OpenHands",
help="Model name to use in the model_name_or_path field (default: OpenHands)",
required=True,
help=(
"Model identifier to record in model_name_or_path "
"(e.g., litellm_proxy/claude-sonnet-4-5-20250929)"
),
)

parser.add_argument(
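The argparse change is the same across all five scripts: required=True with no default. A self-contained illustration of what that means for the command line, using only standard-library argparse rather than the benchmark script itself:

import argparse

# With required=True and no default, omitting the flag makes argparse exit with
# "error: the following arguments are required: --model-name" instead of falling
# back to a hard-coded attribution value.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model-name",
    required=True,
    help="Model identifier to record in model_name_or_path",
)
args = parser.parse_args(["--model-name", "litellm_proxy/claude-sonnet-4-5-20250929"])
print(args.model_name)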
24 changes: 23 additions & 1 deletion tests/test_swebenchmultimodal.py
@@ -22,9 +22,31 @@ def test_empty_input_file_does_not_raise(self):
output_path = outfile.name

# Should not raise - let the harness handle empty results
-convert_to_swebench_format(input_path, output_path)
+convert_to_swebench_format(
+    input_path, output_path, "litellm_proxy/claude-sonnet-4-5-20250929"
+)

# Verify output file is empty
with open(output_path, "r") as f:
lines = f.readlines()
assert len(lines) == 0

def test_raises_when_model_name_missing(self):
"""Ensure a missing model identifier is rejected."""
with tempfile.NamedTemporaryFile(
mode="w", suffix=".jsonl", delete=False
) as infile:
infile.write("")
input_path = infile.name

with tempfile.NamedTemporaryFile(
mode="w", suffix=".swebench.jsonl", delete=False
) as outfile:
output_path = outfile.name

try:
convert_to_swebench_format(input_path, output_path, None) # type: ignore[arg-type]
except ValueError:
return

assert False, "Expected ValueError when model_name_or_path is None"