From 0317f7d1a4c2f2afa5125e6bf0c0e909bb4da8cb Mon Sep 17 00:00:00 2001 From: "zhanggaohua@huawei.com" Date: Sat, 14 Mar 2026 15:29:39 +0800 Subject: [PATCH 1/2] convert calss to str before dump cfg --- ais_bench/benchmark/cli/config_manager.py | 6 +- ais_bench/benchmark/cli/utils.py | 20 ++ ais_bench/benchmark/cli/workers.py | 72 ++--- ais_bench/benchmark/datasets/__init__.py | 1 + ais_bench/benchmark/datasets/swebench.py | 68 +++++ ais_bench/benchmark/tasks/__init__.py | 3 + ais_bench/benchmark/tasks/swebench_eval.py | 159 +++++++++++ ais_bench/benchmark/tasks/swebench_infer.py | 247 ++++++++++++++++++ ais_bench/benchmark/utils/config/run.py | 21 +- .../swe_bench_examples/swe_bench_lite.py | 53 ++++ .../swe_bench_examples/swe_bench_verified.py | 53 ++++ 11 files changed, 667 insertions(+), 36 deletions(-) create mode 100644 ais_bench/benchmark/datasets/swebench.py create mode 100644 ais_bench/benchmark/tasks/swebench_eval.py create mode 100644 ais_bench/benchmark/tasks/swebench_infer.py create mode 100644 ais_bench/configs/swe_bench_examples/swe_bench_lite.py create mode 100644 ais_bench/configs/swe_bench_examples/swe_bench_verified.py diff --git a/ais_bench/benchmark/cli/config_manager.py b/ais_bench/benchmark/cli/config_manager.py index dde76527..cb05db73 100644 --- a/ais_bench/benchmark/cli/config_manager.py +++ b/ais_bench/benchmark/cli/config_manager.py @@ -9,11 +9,11 @@ from ais_bench.benchmark.utils.file import match_cfg_file from ais_bench.benchmark.utils.config.run import try_fill_in_custom_cfgs from ais_bench.benchmark.utils.logging.exceptions import CommandError, AISBenchConfigError -from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need, fill_test_range_use_num_prompts +from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need, fill_test_range_use_num_prompts, recur_convert_config_type class CustomConfigChecker: MODEL_REQUIRED_FIELDS = ['type', 'abbr', 'attr'] - DATASET_REQUIRED_FIELDS = ['type', 'abbr', 
'reader_cfg', 'infer_cfg', 'eval_cfg'] + DATASET_REQUIRED_FIELDS = ['type', 'abbr'] SUMMARIZER_REQUIRED_FIELDS = ['attr'] def __init__(self, config, file_path): @@ -327,6 +327,8 @@ def _dump_and_reload_config(self): # dump config output_config_path = osp.join(self.cfg.work_dir, 'configs', f'{self.cfg_time_str}_{os.getpid()}.py') + + recur_convert_config_type(self.cfg) self.cfg.dump(output_config_path) # eval nums set if (self.args.num_prompts and self.args.num_prompts < 0) or self.args.num_prompts == 0: diff --git a/ais_bench/benchmark/cli/utils.py b/ais_bench/benchmark/cli/utils.py index 01ff50e5..81e92a84 100644 --- a/ais_bench/benchmark/cli/utils.py +++ b/ais_bench/benchmark/cli/utils.py @@ -2,6 +2,7 @@ import os from datetime import datetime +from mmengine.config import ConfigDict, Config from ais_bench.benchmark.utils.logging.exceptions import AISBenchConfigError from ais_bench.benchmark.utils.logging.logger import AISLogger from ais_bench.benchmark.utils.logging.error_codes import UTILS_CODES @@ -20,6 +21,25 @@ def get_config_type(obj) -> str: return obj return f"{obj.__module__}.{obj.__name__}" +def recur_convert_config_type(cfg): + """Recursively convert the type of the config to the string type. + + Args: + cfg: The config to convert. 
+ """ + if isinstance(cfg, (dict, ConfigDict, Config)): + for key, value in cfg.items(): + if key == "type": + cfg[key] = get_config_type(value) + else: + cfg[key] = recur_convert_config_type(value) + elif isinstance(cfg, list): + for i, item in enumerate(cfg): + cfg[i] = recur_convert_config_type(item) if isinstance(item, (dict, ConfigDict, Config, list)) else item + else: + return cfg + return cfg + def get_current_time_str(): return datetime.now().strftime("%Y%m%d_%H%M%S") diff --git a/ais_bench/benchmark/cli/workers.py b/ais_bench/benchmark/cli/workers.py index ce1dd8bb..eb477361 100644 --- a/ais_bench/benchmark/cli/workers.py +++ b/ais_bench/benchmark/cli/workers.py @@ -42,26 +42,30 @@ class Infer(BaseWorker): def update_cfg(self, cfg: ConfigDict) -> None: def get_task_type() -> str: if cfg["models"][0]["attr"] == "service": - return get_config_type(OpenICLApiInferTask) + return OpenICLApiInferTask else: - return get_config_type(OpenICLInferTask) + return OpenICLInferTask - new_cfg = dict( - infer=dict( - partitioner=dict(type=get_config_type(NaivePartitioner)), - runner=dict( - max_num_workers=self.args.max_num_workers, - max_workers_per_gpu=self.args.max_workers_per_gpu, - debug=self.args.debug, - task=dict(type=get_task_type()), - type=get_config_type(LocalRunner), - ), - ), - ) + def update_new_infer_cfg(new_cfg: ConfigDict) -> None: + runner_cfg = new_cfg['infer']['runner'] + runner_cfg['max_num_workers'] = self.args.max_num_workers + runner_cfg['max_workers_per_gpu'] = self.args.max_workers_per_gpu + runner_cfg['debug'] = self.args.debug or cfg.cli_args.debug + if cfg.get('infer'): + new_cfg = dict(infer=cfg.infer) + else: + new_cfg = dict( + infer=dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + task=dict(type=get_task_type()), + type=LocalRunner, + ), + ), + ) + update_new_infer_cfg(new_cfg) cfg.merge_from_dict(new_cfg) - if cfg.cli_args.debug: - cfg.infer.runner.debug = True cfg.infer.partitioner["out_dir"] = osp.join(cfg["work_dir"], 
"predictions/") return cfg @@ -259,26 +263,28 @@ def _result_post_process(self, tasks, cfg: ConfigDict): class Eval(BaseWorker): def update_cfg(self, cfg: ConfigDict) -> None: - new_cfg = dict( - eval=dict( - partitioner=dict(type=get_config_type(NaivePartitioner)), - runner=dict( - max_num_workers=self.args.max_num_workers, - debug=self.args.debug, - task=dict(type=get_config_type(OpenICLEvalTask)), + def update_eval_cfg(new_cfg: ConfigDict) -> None: + runner_cfg = new_cfg['eval']['runner'] + runner_cfg['max_num_workers'] = self.args.max_num_workers + runner_cfg['max_workers_per_gpu'] = self.args.max_workers_per_gpu + runner_cfg['debug'] = self.args.debug + runner_cfg['dump_details'] = cfg.cli_args.dump_eval_details + runner_cfg['cal_extract_rate'] = cfg.cli_args.dump_extract_rate + + if cfg.get('eval'): + new_cfg = dict(eval=cfg.eval) + else: + new_cfg = dict( + eval=dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=OpenICLEvalTask), ), - ), - ) + )) - new_cfg["eval"]["runner"]["type"] = get_config_type(LocalRunner) - new_cfg["eval"]["runner"]["max_workers_per_gpu"] = self.args.max_workers_per_gpu + update_eval_cfg(new_cfg) cfg.merge_from_dict(new_cfg) - if cfg.cli_args.dump_eval_details: - cfg.eval.runner.task.dump_details = True - if cfg.cli_args.dump_extract_rate: - cfg.eval.runner.task.cal_extract_rate = True - if cfg.cli_args.debug: - cfg.eval.runner.debug = True cfg.eval.partitioner["out_dir"] = osp.join(cfg["work_dir"], "results/") return cfg diff --git a/ais_bench/benchmark/datasets/__init__.py b/ais_bench/benchmark/datasets/__init__.py index 1581a2af..634c4a34 100644 --- a/ais_bench/benchmark/datasets/__init__.py +++ b/ais_bench/benchmark/datasets/__init__.py @@ -53,3 +53,4 @@ from ais_bench.benchmark.datasets.mmstar import * # noqa: F401, F403 from ais_bench.benchmark.datasets.dapo_math import * # noqa: F401, F403 from ais_bench.benchmark.datasets.mooncake_trace import * # noqa: F401, F403 +from 
ais_bench.benchmark.datasets.swebench import * # noqa: F401, F403 \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/swebench.py b/ais_bench/benchmark/datasets/swebench.py new file mode 100644 index 00000000..a466e612 --- /dev/null +++ b/ais_bench/benchmark/datasets/swebench.py @@ -0,0 +1,68 @@ +import re +import random +from datasets import load_dataset, Dataset, DatasetDict + +from ais_bench.benchmark.registry import LOAD_DATASET +from ais_bench.benchmark.utils.logging.exceptions import ParameterValueError +from ais_bench.benchmark.utils.logging.error_codes import DSET_CODES +from ais_bench.benchmark.datasets.base import BaseDataset + +DATASET_MAPPING = { + "full": "princeton-nlp/SWE-Bench", + "verified": "princeton-nlp/SWE-Bench_Verified", + "lite": "princeton-nlp/SWE-Bench_Lite", + "multimodal": "princeton-nlp/SWE-Bench_Multimodal", + "multilingual": "swe-bench/SWE-Bench_Multilingual", +} + + +@LOAD_DATASET.register_module() +class SWEBenchDataset(BaseDataset): + def filter_instances( + self, instances: list[dict], *, filter_spec: str, shuffle: bool = False + ) -> list[dict]: + """Filter and slice a list of SWEBench instances.""" + if shuffle: + instances = sorted(instances.copy(), key=lambda x: x["instance_id"]) + random.seed(42) + random.shuffle(instances) + before_filter = len(instances) + instances = [ + instance + for instance in instances + if re.match(filter_spec, instance["instance_id"]) + ] + if (after_filter := len(instances)) != before_filter: + self.logger.info( + f"Instance filter: {before_filter} -> {after_filter} instances" + ) + return instances + + def load( + self, + path: str, + name: str, + split: str = "test", + filter_spec: str = "", + shuffle: bool = False, + ): + if name not in DATASET_MAPPING: + raise ParameterValueError( + DSET_CODES.INVALID_PARAM_VALUE, + f"Invalid swebench dataset name, expected one of {list(DATASET_MAPPING.keys())} but got {name}", + ) + try: + dataset = load_dataset("parquet", data_files={split: 
path}) + except Exception as e: + self.logger.warning( + f"Failed to load swebench dataset {name} from {path} with error: {e}, trying to load from Hugging Face" + ) + try: + dataset = load_dataset(DATASET_MAPPING[name], split=split) + except Exception as e: + raise ParameterValueError( + DSET_CODES.DATA_PREPROCESSING_ERROR, + f"Failed to load swebench dataset {name} from Hugging Face with error: {e}.", + ) + dataset = self.filter_instances(list(dataset), filter_spec=filter_spec, shuffle=shuffle) + return DatasetDict({"test": Dataset.from_list(dataset)}) diff --git a/ais_bench/benchmark/tasks/__init__.py b/ais_bench/benchmark/tasks/__init__.py index 7ba624f8..c094003a 100644 --- a/ais_bench/benchmark/tasks/__init__.py +++ b/ais_bench/benchmark/tasks/__init__.py @@ -1,3 +1,6 @@ from ais_bench.benchmark.tasks.openicl_eval import * # noqa: F401, F403 from ais_bench.benchmark.tasks.openicl_infer import * # noqa: F401, F403 from ais_bench.benchmark.tasks.openicl_api_infer import OpenICLApiInferTask +from ais_bench.benchmark.tasks.swebench_infer import SWEBenchInferTask +from ais_bench.benchmark.tasks.swebench_eval import SWEBenchEvalTask + diff --git a/ais_bench/benchmark/tasks/swebench_eval.py b/ais_bench/benchmark/tasks/swebench_eval.py new file mode 100644 index 00000000..413edc7a --- /dev/null +++ b/ais_bench/benchmark/tasks/swebench_eval.py @@ -0,0 +1,159 @@ +import argparse +import json +import os +import os.path as osp +import sys +import threading +import time + +from mmengine.config import Config, ConfigDict +from mmengine.utils import mkdir_or_exist + +from ais_bench.benchmark.registry import TASKS +from ais_bench.benchmark.tasks.base import BaseTask, TaskStateManager +from ais_bench.benchmark.utils.core.abbr import ( + get_infer_output_path, + task_abbr_from_cfg, +) +from ais_bench.benchmark.utils.logging import AISLogger + + +@TASKS.register_module() +class SWEBenchEvalTask(BaseTask): + """SWEBench Evaluation Task. 
+ + Evaluates SWE-bench predictions using the official harness and writes + results to work_dir/results. + """ + + name_prefix = "SWEBenchEval" + log_subdir = "logs/eval" + output_subdir = "results" + + def __init__(self, cfg: ConfigDict): + super().__init__(cfg) + + def get_command(self, cfg_path: str, template: str) -> str: + sys.path.append(os.getcwd()) + script_path = __file__ + python = sys.executable + command = f"{python} {script_path} {cfg_path}" + return template.format(task_cmd=command) + + def run(self, task_state_manager: TaskStateManager): + self.task_state_manager = task_state_manager + self.logger.info("SWEBenchEvalTask %s", task_abbr_from_cfg(self.cfg)) + + dataset_cfg = self.dataset_cfgs[0] + dataset_name = dataset_cfg.get("name", "lite") + + pred_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, "predictions"), + file_extension="jsonl", + ) + if not osp.isfile(pred_path): + raise FileNotFoundError( + f"Predictions file not found: {pred_path}. Run infer first." + ) + + out_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, self.output_subdir), + file_extension="json", + ) + mkdir_or_exist(osp.dirname(out_path)) + + task_state_manager.update_task_state( + {"status": "eval", "progress_description": "SWE-bench harness"} + ) + + try: + import swebench.harness.run_evaluation as run_eval + except ImportError as e: + raise ImportError( + "SWEBenchEvalTask requires the SWE-bench harness. 
" + "Install from: https://github.com/princeton-nlp/SWE-bench" + ) from e + + run_id = task_abbr_from_cfg(self.cfg).replace("/", "_") + eval_runner = self.cfg.get("eval", {}).get("runner", {}) + max_workers = eval_runner.get("max_num_workers", 4) + report_dir = osp.dirname(out_path) + + try: + run_eval.main( + dataset_name=dataset_name, + split="test", + instance_ids=[], + predictions_path=pred_path, + max_workers=max_workers, + force_rebuild=False, + cache_level="env", + clean=False, + open_file_limit=4096, + run_id=run_id, + timeout=1800, + namespace=None, + rewrite_reports=False, + modal=False, + report_dir=report_dir, + ) + harness_exit = 0 + except SystemExit as e: + harness_exit = e.code if e.code is not None else 1 + except Exception as e: + self.logger.exception("Harness failed: %s", e) + harness_exit = 1 + + results = { + "harness_exit_code": harness_exit, + "dataset_name": dataset_name, + "predictions_path": pred_path, + "run_id": run_id, + } + with open(out_path, "w") as f: + json.dump(results, f, indent=2) + + if harness_exit != 0: + self.logger.warning("Harness exited with code %s", harness_exit) + + +def parse_args(): + parser = argparse.ArgumentParser(description="SWEBench Eval") + parser.add_argument("config", help="Config file path") + return parser.parse_args() + + +if __name__ == "__main__": + logger = AISLogger() + args = parse_args() + cfg = Config.fromfile(args.config) + task_state_manager = TaskStateManager( + tmp_path=os.path.join(cfg["work_dir"], "status_tmp"), + task_name=task_abbr_from_cfg(cfg), + is_debug=cfg["cli_args"]["debug"], + ) + manager_t = threading.Thread(target=task_state_manager.launch, args=()) + manager_t.start() + task_state_manager.update_task_state( + { + "status": "start", + "task_log_path": os.path.join( + "logs/eval/", f"{task_abbr_from_cfg(cfg)}.out" + ), + } + ) + start_time = time.perf_counter() + try: + task = SWEBenchEvalTask(cfg) + task.run(task_state_manager) + except Exception as e: + 
task_state_manager.update_task_state({"status": "error"}) + raise + end_time = time.perf_counter() + logger.info("SWEBench eval time: %.2fs", end_time - start_time) + task_state_manager.update_task_state({"status": "finish"}) + manager_t.join() diff --git a/ais_bench/benchmark/tasks/swebench_infer.py b/ais_bench/benchmark/tasks/swebench_infer.py new file mode 100644 index 00000000..453d0167 --- /dev/null +++ b/ais_bench/benchmark/tasks/swebench_infer.py @@ -0,0 +1,247 @@ +import argparse +import json +import os +import os.path as osp +import sys +import threading +import time +from pathlib import Path +from typing import List + +from mmengine.config import Config, ConfigDict +from mmengine.utils import mkdir_or_exist + +from ais_bench.benchmark.registry import TASKS +from ais_bench.benchmark.tasks.base import BaseTask, TaskStateManager +from ais_bench.benchmark.utils.config import build_dataset_from_cfg +from ais_bench.benchmark.utils.core.abbr import ( + get_infer_output_path, + model_abbr_from_cfg, + task_abbr_from_cfg, +) +from ais_bench.benchmark.utils.logging import AISLogger + + +def _get_minisweagent_config(model_cfg: ConfigDict) -> dict: + """Build mini-swe-agent model config from ais_bench model_cfg (e.g. 
LiteLLMChat).""" + model_name = model_cfg.get("model") or model_cfg.get("model_name") or "" + model_type = ( + getattr(model_cfg.get("type"), "__name__", None) + or (model_cfg.get("type", "") if isinstance(model_cfg.get("type"), str) else "") + ) + if isinstance(model_type, str): + model_type = model_type.split(".")[-1] + model_kwargs = dict(model_cfg.get("generation_kwargs", {})) + if model_cfg.get("api_key"): + model_kwargs["api_key"] = model_cfg["api_key"] + if model_cfg.get("url"): + model_kwargs["api_base"] = model_cfg["url"] + model_class = "litellm" + if "openrouter" in (model_type or "").lower() or "openrouter" in (str(model_cfg.get("type", ""))).lower(): + model_class = "openrouter" + return { + "model": { + "model_name": model_name, + "model_class": model_class, + "model_kwargs": model_kwargs, + } + } + + +class _AISBenchProgressManager: + """Minimal progress manager that forwards to TaskStateManager for process_instance.""" + + def __init__(self, task_state_manager: TaskStateManager, total: int): + self._tsm = task_state_manager + self._total = total + self._finish_count = 0 + + def on_instance_start(self, instance_id: str) -> None: + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + "other_kwargs": {"current": instance_id}, + } + ) + + def update_instance_status(self, instance_id: str, message: str) -> None: + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + "other_kwargs": {"current": instance_id, "message": message}, + } + ) + + def on_instance_end(self, instance_id: str, exit_status: str = None) -> None: + self._finish_count += 1 + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + } + 
) + + +@TASKS.register_module() +class SWEBenchInferTask(BaseTask): + """SWEBench Inference Task. + + Runs mini-swe-agent on SWE-bench instances and writes predictions as JSONL. + """ + + name_prefix = "SWEBenchInfer" + log_subdir = "logs/infer" + output_subdir = "predictions" + + def __init__(self, cfg: ConfigDict): + super().__init__(cfg) + + def get_command(self, cfg_path: str, template: str) -> str: + sys.path.append(os.getcwd()) + script_path = __file__ + python = sys.executable + command = f"{python} {script_path} {cfg_path}" + return template.format(task_cmd=command) + + def get_output_paths(self, file_extension: str = "jsonl") -> List[str]: + paths = [] + for dataset_cfg in self.dataset_cfgs: + paths.append( + get_infer_output_path( + self.model_cfg, + dataset_cfg, + os.path.join(self.work_dir, self.output_subdir), + file_extension=file_extension, + ) + ) + return paths + + def run(self, task_state_manager: TaskStateManager): + self.task_state_manager = task_state_manager + self.logger.info("SWEBenchInferTask %s", task_abbr_from_cfg(self.cfg)) + + try: + from minisweagent.run.benchmarks.swebench import process_instance + except ImportError as e: + raise ImportError( + "SWEBenchInferTask requires mini-swe-agent. 
" + "Install with: pip install mini-swe-agent" + ) from e + + dataset_cfg = self.dataset_cfgs[0] + dataset = build_dataset_from_cfg( + dataset_cfg, task_state_manager=task_state_manager + ) + test_data = dataset.test + if hasattr(test_data, "__iter__") and not isinstance(test_data, (list, dict)): + instances = list(test_data) + else: + instances = [test_data[i] for i in range(len(test_data))] + + model_abbr = model_abbr_from_cfg(self.model_cfg) + pred_root = osp.join(self.work_dir, self.output_subdir, model_abbr) + mkdir_or_exist(pred_root) + out_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, self.output_subdir), + file_extension="jsonl", + ) + out_dir = Path(osp.splitext(out_path)[0] + "_tmp") + out_dir.mkdir(parents=True, exist_ok=True) + + base_config = _get_minisweagent_config(self.model_cfg) + base_config.setdefault("environment", {})["environment_class"] = "docker" + base_config.setdefault("agent", {}) + + progress_manager = _AISBenchProgressManager( + task_state_manager, len(instances) + ) + task_state_manager.update_task_state( + { + "status": "inferencing", + "total_count": len(instances), + "finish_count": 0, + "progress_description": "SWEBench infer", + } + ) + + for instance in instances: + process_instance( + instance, + out_dir, + base_config, + progress_manager, + ) + + preds_path = out_dir / "preds.json" + preds = {} + if preds_path.exists(): + with open(preds_path) as f: + preds = json.load(f) + + mkdir_or_exist(osp.dirname(out_path)) + with open(out_path, "w") as f: + for instance_id, rec in preds.items(): + line = json.dumps( + { + "instance_id": instance_id, + "model_name_or_path": rec.get("model_name_or_path", model_abbr), + "model_patch": rec.get("model_patch", ""), + }, + ensure_ascii=False, + ) + f.write(line + "\n") + + if out_dir.exists(): + import shutil + try: + shutil.rmtree(out_dir) + except OSError: + pass + + +def parse_args(): + parser = argparse.ArgumentParser(description="SWEBench Infer") 
+ parser.add_argument("config", help="Config file path") + return parser.parse_args() + + +if __name__ == "__main__": + logger = AISLogger() + args = parse_args() + cfg = Config.fromfile(args.config) + task_state_manager = TaskStateManager( + tmp_path=os.path.join(cfg["work_dir"], "status_tmp"), + task_name=task_abbr_from_cfg(cfg), + is_debug=cfg["cli_args"]["debug"], + ) + manager_t = threading.Thread(target=task_state_manager.launch, args=()) + manager_t.start() + task_state_manager.update_task_state( + { + "status": "start", + "task_log_path": os.path.join( + "logs/infer/", f"{task_abbr_from_cfg(cfg)}.out" + ), + } + ) + start_time = time.perf_counter() + try: + task = SWEBenchInferTask(cfg) + task.run(task_state_manager) + except Exception as e: + task_state_manager.update_task_state({"status": "error"}) + raise + end_time = time.perf_counter() + logger.info("SWEBench infer time: %.2fs", end_time - start_time) + task_state_manager.update_task_state({"status": "finish"}) + manager_t.join() diff --git a/ais_bench/benchmark/utils/config/run.py b/ais_bench/benchmark/utils/config/run.py index a56a88bd..9ef6c712 100644 --- a/ais_bench/benchmark/utils/config/run.py +++ b/ais_bench/benchmark/utils/config/run.py @@ -6,10 +6,29 @@ from ais_bench.benchmark.utils.logging import AISLogger from ais_bench.benchmark.utils.logging.exceptions import AISBenchConfigError from ais_bench.benchmark.utils.logging.error_codes import UTILS_CODES +from ais_bench.benchmark.openicl.icl_prompt_template import PromptTemplate +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_evaluator import AccEvaluator logger = AISLogger() def try_fill_in_custom_cfgs(config): + for dataset_cfg in config["datasets"]: + if "infer_cfg" not in dataset_cfg: + logger.debug(f"Filling in infer config for dataset {dataset_cfg['abbr']}") + dataset_cfg["infer_cfg"] = dict( + 
reader_cfg=dict(input_columns=["dummy"], output_column="dummy"), + prompt_template=dict(type=get_config_type(PromptTemplate), template="{dummy}"), + retriever=dict(type=get_config_type(ZeroRetriever)), + inferencer=dict(type=get_config_type(GenInferencer)), + ) + if "eval_cfg" not in dataset_cfg: + logger.debug(f"Filling in eval config for dataset {dataset_cfg['abbr']}") + dataset_cfg["eval_cfg"] = dict( + evaluator=dict(type=get_config_type(AccEvaluator)), + ) + return config @@ -70,4 +89,4 @@ def fill_eval_cfg(cfg, args): new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner) new_cfg['eval']['runner']['max_workers_per_gpu'] = args.max_workers_per_gpu cfg.merge_from_dict(new_cfg) - logger.debug("Evaluation config filled successfully") \ No newline at end of file + logger.debug("Evaluation config filled successfully") diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_lite.py b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py new file mode 100644 index 00000000..b6789dcf --- /dev/null +++ b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py @@ -0,0 +1,53 @@ +from ais_bench.benchmark.datasets import SWEBenchDataset +from ais_bench.benchmark.partitioners import NaivePartitioner +from ais_bench.benchmark.runners import LocalRunner +from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask +from ais_bench.benchmark.summarizers import DefaultSummarizer + +models = [ + dict( + attr="local", + abbr="swebench", + type="LiteLLMChat", + model="", + api_key="", + url="", + batch_size=1, + generation_kwargs=dict(), + ) +] + +datasets = [ + dict( + type=SWEBenchDataset, + abbr="swebench_lite", + path="ais_bench/datasets/SWE-bench_Lite", + name="lite", + split="test", + filter_spec="", + shuffle=False, + prediction_file_extension="jsonl", + ), +] + +summarizer = dict( + attr="accuracy", + type=DefaultSummarizer, +) + + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + 
task=dict(type=SWEBenchInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchEvalTask), + ), +) diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_verified.py b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py new file mode 100644 index 00000000..fb8f6f7f --- /dev/null +++ b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py @@ -0,0 +1,53 @@ +from ais_bench.benchmark.datasets import SWEBenchDataset +from ais_bench.benchmark.partitioners import NaivePartitioner +from ais_bench.benchmark.runners import LocalRunner +from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask +from ais_bench.benchmark.summarizers import DefaultSummarizer + +models = [ + dict( + attr="local", + abbr="swebench", + type="LiteLLMChat", + model="", + api_key="", + url="", + batch_size=1, + generation_kwargs=dict(), + ) +] + +datasets = [ + dict( + type=SWEBenchDataset, + abbr="swebench_verified", + path="ais_bench/datasets/SWE-bench_Verified", + name="verified", + split="test", + filter_spec="", + shuffle=False, + prediction_file_extension="jsonl", + ), +] + +summarizer = dict( + attr="accuracy", + type=DefaultSummarizer, +) + + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchEvalTask), + ), +) From 9b774027dc14e52172db6189f3e41b27902d6057 Mon Sep 17 00:00:00 2001 From: "zhanggaohua@huawei.com" Date: Sat, 14 Mar 2026 16:40:00 +0800 Subject: [PATCH 2/2] adapter swe mini agent --- ais_bench/benchmark/datasets/swebench.py | 3 +- ais_bench/benchmark/tasks/swebench_infer.py | 194 +++++++++++++----- ais_bench/benchmark/utils/config/run.py | 4 +- .../swe_bench_examples/swe_bench_lite.py | 16 +- .../swe_bench_examples/swe_bench_verified.py | 6 +- 5 files changed, 156 
insertions(+), 67 deletions(-) diff --git a/ais_bench/benchmark/datasets/swebench.py b/ais_bench/benchmark/datasets/swebench.py index a466e612..594fbfc7 100644 --- a/ais_bench/benchmark/datasets/swebench.py +++ b/ais_bench/benchmark/datasets/swebench.py @@ -45,6 +45,7 @@ def load( split: str = "test", filter_spec: str = "", shuffle: bool = False, + **kwargs, ): if name not in DATASET_MAPPING: raise ParameterValueError( @@ -65,4 +66,4 @@ def load( f"Failed to load swebench dataset {name} from Hugging Face with error: {e}.", ) dataset = self.filter_instances(list(dataset), filter_spec=filter_spec, shuffle=shuffle) - return DatasetDict({"test": Dataset.from_list(dataset)}) + return Dataset.from_list(dataset) diff --git a/ais_bench/benchmark/tasks/swebench_infer.py b/ais_bench/benchmark/tasks/swebench_infer.py index 453d0167..9b14227c 100644 --- a/ais_bench/benchmark/tasks/swebench_infer.py +++ b/ais_bench/benchmark/tasks/swebench_infer.py @@ -1,12 +1,14 @@ import argparse +import concurrent.futures import json import os import os.path as osp import sys import threading import time +import shutil from pathlib import Path -from typing import List +from typing import Any, List, Optional, Tuple from mmengine.config import Config, ConfigDict from mmengine.utils import mkdir_or_exist @@ -25,6 +27,9 @@ def _get_minisweagent_config(model_cfg: ConfigDict) -> dict: """Build mini-swe-agent model config from ais_bench model_cfg (e.g. LiteLLMChat).""" model_name = model_cfg.get("model") or model_cfg.get("model_name") or "" + # LiteLLM requires provider prefix (e.g. 
hosted_vllm/qwen3) for custom API; add it when url is set and name has no / + if model_cfg.get("url") and model_name: + model_name = f"hosted_vllm/{model_name}" model_type = ( getattr(model_cfg.get("type"), "__name__", None) or (model_cfg.get("type", "") if isinstance(model_cfg.get("type"), str) else "") @@ -39,13 +44,14 @@ def _get_minisweagent_config(model_cfg: ConfigDict) -> dict: model_class = "litellm" if "openrouter" in (model_type or "").lower() or "openrouter" in (str(model_cfg.get("type", ""))).lower(): model_class = "openrouter" - return { - "model": { - "model_name": model_name, - "model_class": model_class, - "model_kwargs": model_kwargs, - } + # Avoid cost-calculation errors for local/custom models (e.g. hosted_vllm) not in litellm price map + model_dict = { + "model_name": model_name, + "model_class": model_class, + "model_kwargs": model_kwargs, + "cost_tracking": "ignore_errors", } + return {"model": model_dict} class _AISBenchProgressManager: @@ -89,12 +95,63 @@ def on_instance_end(self, instance_id: str, exit_status: str = None) -> None: } ) + def on_uncaught_exception(self, instance_id: str, exception: Exception) -> None: + self.on_instance_end(instance_id, f"Uncaught {type(exception).__name__}") + + +class _CompositeProgressManager: + """Forwards progress calls to multiple delegates (e.g. 
TaskStateManager + Rich dashboard).""" + + def __init__(self, *delegates: Any): + self._delegates = [d for d in delegates if d is not None] + + def on_instance_start(self, instance_id: str) -> None: + for d in self._delegates: + d.on_instance_start(instance_id) + + def update_instance_status(self, instance_id: str, message: str) -> None: + for d in self._delegates: + d.update_instance_status(instance_id, message) + + def on_instance_end(self, instance_id: str, exit_status: str = None) -> None: + for d in self._delegates: + d.on_instance_end(instance_id, exit_status) + + def on_uncaught_exception(self, instance_id: str, exception: Exception) -> None: + for d in self._delegates: + d.on_uncaught_exception(instance_id, exception) + + +def _make_swebench_progress_manager( + task_state_manager: TaskStateManager, + num_instances: int, +) -> Tuple[Any, Optional[Any]]: + """Build progress manager and optional Rich live display. + + Returns: + (progress_manager, live_render_group or None). + When live_render_group is not None, caller should wrap execution in + Live(live_render_group, refresh_per_second=4). + """ + tsm_manager = _AISBenchProgressManager(task_state_manager, num_instances) + try: + from minisweagent.run.benchmarks.utils.batch_progress import ( + RunBatchProgressManager, + ) + from rich.live import Live + + run_batch_manager = RunBatchProgressManager(num_instances, yaml_report_path=None) + composite = _CompositeProgressManager(tsm_manager, run_batch_manager) + return composite, run_batch_manager.render_group + except ImportError: + return tsm_manager, None + @TASKS.register_module() class SWEBenchInferTask(BaseTask): """SWEBench Inference Task. - Runs mini-swe-agent on SWE-bench instances and writes predictions as JSONL. + Runs mini-swe-agent on SWE-bench instances and writes predictions as JSON. 
""" name_prefix = "SWEBenchInfer" @@ -111,25 +168,14 @@ def get_command(self, cfg_path: str, template: str) -> str: command = f"{python} {script_path} {cfg_path}" return template.format(task_cmd=command) - def get_output_paths(self, file_extension: str = "jsonl") -> List[str]: - paths = [] - for dataset_cfg in self.dataset_cfgs: - paths.append( - get_infer_output_path( - self.model_cfg, - dataset_cfg, - os.path.join(self.work_dir, self.output_subdir), - file_extension=file_extension, - ) - ) - return paths - def run(self, task_state_manager: TaskStateManager): self.task_state_manager = task_state_manager self.logger.info("SWEBenchInferTask %s", task_abbr_from_cfg(self.cfg)) try: from minisweagent.run.benchmarks.swebench import process_instance + from minisweagent.config import get_config_from_spec + from minisweagent.utils.serialize import recursive_merge except ImportError as e: raise ImportError( "SWEBenchInferTask requires mini-swe-agent. " @@ -153,16 +199,31 @@ def run(self, task_state_manager: TaskStateManager): self.model_cfg, dataset_cfg, osp.join(self.work_dir, self.output_subdir), - file_extension="jsonl", + file_extension="json", ) - out_dir = Path(osp.splitext(out_path)[0] + "_tmp") + + + out_dir = Path(osp.splitext(out_path)[0]) out_dir.mkdir(parents=True, exist_ok=True) - base_config = _get_minisweagent_config(self.model_cfg) - base_config.setdefault("environment", {})["environment_class"] = "docker" - base_config.setdefault("agent", {}) + # Load default swebench config (agent.system_template, agent.instance_template, etc.) + # then override with our model so mini-swe-agent gets required AgentConfig fields. + default_swebench_config = get_config_from_spec("swebench.yaml") + our_config = _get_minisweagent_config(self.model_cfg) + model_name = (our_config.get("model") or {}).get("model_name") or "" + if not (model_name or "").strip(): + raise ValueError( + "No model set for SWEBench infer. In your config (e.g. 
swe_bench_lite.py), set " + "models[0]['model'], models[0]['url'], and models[0]['api_key']. " + "Example for local vLLM: model='hosted_vllm/qwen3', url='http://127.0.0.1:2998/v1', api_key='EMPTY'. " + "Or run: mini-extra config setup (to use mini-swe-agent defaults)." + ) + our_config.setdefault("environment", {})["environment_class"] = "docker" + base_config = recursive_merge(default_swebench_config, our_config) + if dataset_cfg.get("step_limit") is not None: + base_config.setdefault("agent", {})["step_limit"] = dataset_cfg["step_limit"] - progress_manager = _AISBenchProgressManager( + progress_manager, live_render_group = _make_swebench_progress_manager( task_state_manager, len(instances) ) task_state_manager.update_task_state( @@ -174,39 +235,58 @@ def run(self, task_state_manager: TaskStateManager): } ) - for instance in instances: - process_instance( - instance, - out_dir, - base_config, - progress_manager, - ) + workers = self.model_cfg.get("batch_size", 1) + + def process_futures(futures): + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except concurrent.futures.CancelledError: + pass + except Exception as e: + instance_id = futures[future] + self.logger.error( + "Error in future for instance %s: %s", + instance_id, + e, + exc_info=True, + ) + progress_manager.on_uncaught_exception(instance_id, e) + + def run_executor(): + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit( + process_instance, + instance, + out_dir, + base_config, + progress_manager, + ): instance["instance_id"] + for instance in instances + } + try: + process_futures(futures) + except KeyboardInterrupt: + self.logger.info( + "Cancelling all pending jobs. Press ^C again to exit immediately." 
+ ) + for future in futures: + if not future.running() and not future.done(): + future.cancel() + process_futures(futures) + + if live_render_group is not None: + from rich.live import Live + + with Live(live_render_group, refresh_per_second=4): + run_executor() + else: + run_executor() preds_path = out_dir / "preds.json" - preds = {} if preds_path.exists(): - with open(preds_path) as f: - preds = json.load(f) - - mkdir_or_exist(osp.dirname(out_path)) - with open(out_path, "w") as f: - for instance_id, rec in preds.items(): - line = json.dumps( - { - "instance_id": instance_id, - "model_name_or_path": rec.get("model_name_or_path", model_abbr), - "model_patch": rec.get("model_patch", ""), - }, - ensure_ascii=False, - ) - f.write(line + "\n") - - if out_dir.exists(): - import shutil - try: - shutil.rmtree(out_dir) - except OSError: - pass + shutil.move(preds_path, out_path) def parse_args(): diff --git a/ais_bench/benchmark/utils/config/run.py b/ais_bench/benchmark/utils/config/run.py index 9ef6c712..e7385aff 100644 --- a/ais_bench/benchmark/utils/config/run.py +++ b/ais_bench/benchmark/utils/config/run.py @@ -18,11 +18,13 @@ def try_fill_in_custom_cfgs(config): if "infer_cfg" not in dataset_cfg: logger.debug(f"Filling in infer config for dataset {dataset_cfg['abbr']}") dataset_cfg["infer_cfg"] = dict( - reader_cfg=dict(input_columns=["dummy"], output_column="dummy"), prompt_template=dict(type=get_config_type(PromptTemplate), template="{dummy}"), retriever=dict(type=get_config_type(ZeroRetriever)), inferencer=dict(type=get_config_type(GenInferencer)), ) + if "reader_cfg" not in dataset_cfg: + logger.debug(f"Filling in reader config for dataset {dataset_cfg['abbr']}") + dataset_cfg["reader_cfg"] = dict(input_columns=["dummy"], output_column="dummy") if "eval_cfg" not in dataset_cfg: logger.debug(f"Filling in eval config for dataset {dataset_cfg['abbr']}") dataset_cfg["eval_cfg"] = dict( diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_lite.py 
b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py index b6789dcf..f69b6933 100644 --- a/ais_bench/configs/swe_bench_examples/swe_bench_lite.py +++ b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py @@ -4,15 +4,19 @@ from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask from ais_bench.benchmark.summarizers import DefaultSummarizer +STEP_LIMIT = 100 + +# For local vLLM: set model (e.g. hosted_vllm/qwen3), url (vLLM API base), api_key (e.g. "EMPTY"). +# Example matching: mini-extra swebench -m hosted_vllm/qwen3 -c model.model_kwargs.api_base='"http://127.0.0.1:2998/v1"' ... models = [ dict( attr="local", abbr="swebench", type="LiteLLMChat", - model="", - api_key="", - url="", - batch_size=1, + model="",  # e.g. hosted_vllm/qwen3 for local vLLM + api_key="",  # e.g. "EMPTY" for local vLLM + url="",  # e.g. http://127.0.0.1:2998/v1 (vLLM API base) + batch_size=2, generation_kwargs=dict(), ) ] @@ -21,12 +25,12 @@ dict( type=SWEBenchDataset, abbr="swebench_lite", - path="ais_bench/datasets/SWE-bench_Lite", + path="ais_bench/datasets/SWE-bench_Lite", name="lite", split="test", filter_spec="", shuffle=False, - prediction_file_extension="jsonl", + step_limit=STEP_LIMIT, ), ] diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_verified.py b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py index fb8f6f7f..b9de7821 100644 --- a/ais_bench/configs/swe_bench_examples/swe_bench_verified.py +++ b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py @@ -4,6 +4,8 @@ from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask from ais_bench.benchmark.summarizers import DefaultSummarizer +STEP_LIMIT = 100 + models = [ dict( attr="local", @@ -12,7 +14,7 @@ model="", api_key="", url="", - batch_size=1, + batch_size=2, generation_kwargs=dict(), ) ] @@ -24,9 +26,9 @@ path="ais_bench/datasets/SWE-bench_Verified", name="verified", split="test", + step_limit=STEP_LIMIT, filter_spec="", shuffle=False, - prediction_file_extension="jsonl", ), ]