diff --git a/ais_bench/benchmark/cli/config_manager.py b/ais_bench/benchmark/cli/config_manager.py index dde76527..cb05db73 100644 --- a/ais_bench/benchmark/cli/config_manager.py +++ b/ais_bench/benchmark/cli/config_manager.py @@ -9,11 +9,11 @@ from ais_bench.benchmark.utils.file import match_cfg_file from ais_bench.benchmark.utils.config.run import try_fill_in_custom_cfgs from ais_bench.benchmark.utils.logging.exceptions import CommandError, AISBenchConfigError -from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need, fill_test_range_use_num_prompts +from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need, fill_test_range_use_num_prompts, recur_convert_config_type class CustomConfigChecker: MODEL_REQUIRED_FIELDS = ['type', 'abbr', 'attr'] - DATASET_REQUIRED_FIELDS = ['type', 'abbr', 'reader_cfg', 'infer_cfg', 'eval_cfg'] + DATASET_REQUIRED_FIELDS = ['type', 'abbr'] SUMMARIZER_REQUIRED_FIELDS = ['attr'] def __init__(self, config, file_path): @@ -327,6 +327,8 @@ def _dump_and_reload_config(self): # dump config output_config_path = osp.join(self.cfg.work_dir, 'configs', f'{self.cfg_time_str}_{os.getpid()}.py') + + recur_convert_config_type(self.cfg) self.cfg.dump(output_config_path) # eval nums set if (self.args.num_prompts and self.args.num_prompts < 0) or self.args.num_prompts == 0: diff --git a/ais_bench/benchmark/cli/utils.py b/ais_bench/benchmark/cli/utils.py index 01ff50e5..81e92a84 100644 --- a/ais_bench/benchmark/cli/utils.py +++ b/ais_bench/benchmark/cli/utils.py @@ -2,6 +2,7 @@ import os from datetime import datetime +from mmengine.config import ConfigDict, Config from ais_bench.benchmark.utils.logging.exceptions import AISBenchConfigError from ais_bench.benchmark.utils.logging.logger import AISLogger from ais_bench.benchmark.utils.logging.error_codes import UTILS_CODES @@ -20,6 +21,25 @@ def get_config_type(obj) -> str: return obj return f"{obj.__module__}.{obj.__name__}" +def recur_convert_config_type(cfg): + 
"""Recursively convert the type of the config to the string type. + + Args: + cfg: The config to convert. + """ + if isinstance(cfg, (dict, ConfigDict, Config)): + for key, value in cfg.items(): + if key == "type": + cfg[key] = get_config_type(value) + else: + cfg[key] = recur_convert_config_type(value) + elif isinstance(cfg, list): + for i, item in enumerate(cfg): + cfg[i] = recur_convert_config_type(item) if isinstance(item, (dict, ConfigDict, Config, list)) else item + else: + return cfg + return cfg + def get_current_time_str(): return datetime.now().strftime("%Y%m%d_%H%M%S") diff --git a/ais_bench/benchmark/cli/workers.py b/ais_bench/benchmark/cli/workers.py index ce1dd8bb..eb477361 100644 --- a/ais_bench/benchmark/cli/workers.py +++ b/ais_bench/benchmark/cli/workers.py @@ -42,26 +42,30 @@ class Infer(BaseWorker): def update_cfg(self, cfg: ConfigDict) -> None: def get_task_type() -> str: if cfg["models"][0]["attr"] == "service": - return get_config_type(OpenICLApiInferTask) + return OpenICLApiInferTask else: - return get_config_type(OpenICLInferTask) + return OpenICLInferTask - new_cfg = dict( - infer=dict( - partitioner=dict(type=get_config_type(NaivePartitioner)), - runner=dict( - max_num_workers=self.args.max_num_workers, - max_workers_per_gpu=self.args.max_workers_per_gpu, - debug=self.args.debug, - task=dict(type=get_task_type()), - type=get_config_type(LocalRunner), - ), - ), - ) + def update_new_infer_cfg(new_cfg: ConfigDict) -> None: + runner_cfg = new_cfg['infer']['runner'] + runner_cfg['max_num_workers'] = self.args.max_num_workers + runner_cfg['max_workers_per_gpu'] = self.args.max_workers_per_gpu + runner_cfg['debug'] = self.args.debug or cfg.cli_args.debug + if cfg.get('infer'): + new_cfg = dict(infer=cfg.infer) + else: + new_cfg = dict( + infer=dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + task=dict(type=get_task_type()), + type=LocalRunner, + ), + ), + ) + update_new_infer_cfg(new_cfg) cfg.merge_from_dict(new_cfg) - if 
cfg.cli_args.debug: - cfg.infer.runner.debug = True cfg.infer.partitioner["out_dir"] = osp.join(cfg["work_dir"], "predictions/") return cfg @@ -259,26 +263,28 @@ def _result_post_process(self, tasks, cfg: ConfigDict): class Eval(BaseWorker): def update_cfg(self, cfg: ConfigDict) -> None: - new_cfg = dict( - eval=dict( - partitioner=dict(type=get_config_type(NaivePartitioner)), - runner=dict( - max_num_workers=self.args.max_num_workers, - debug=self.args.debug, - task=dict(type=get_config_type(OpenICLEvalTask)), + def update_eval_cfg(new_cfg: ConfigDict) -> None: + runner_cfg = new_cfg['eval']['runner'] + runner_cfg['max_num_workers'] = self.args.max_num_workers + runner_cfg['max_workers_per_gpu'] = self.args.max_workers_per_gpu + runner_cfg['debug'] = self.args.debug or cfg.cli_args.debug + runner_cfg['task']['dump_details'] = cfg.cli_args.dump_eval_details + runner_cfg['task']['cal_extract_rate'] = cfg.cli_args.dump_extract_rate + + if cfg.get('eval'): + new_cfg = dict(eval=cfg.eval) + else: + new_cfg = dict( + eval=dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=OpenICLEvalTask), ), - ), - ) + )) - new_cfg["eval"]["runner"]["type"] = get_config_type(LocalRunner) - new_cfg["eval"]["runner"]["max_workers_per_gpu"] = self.args.max_workers_per_gpu + update_eval_cfg(new_cfg) cfg.merge_from_dict(new_cfg) - if cfg.cli_args.dump_eval_details: - cfg.eval.runner.task.dump_details = True - if cfg.cli_args.dump_extract_rate: - cfg.eval.runner.task.cal_extract_rate = True - if cfg.cli_args.debug: - cfg.eval.runner.debug = True cfg.eval.partitioner["out_dir"] = osp.join(cfg["work_dir"], "results/") return cfg diff --git a/ais_bench/benchmark/datasets/__init__.py b/ais_bench/benchmark/datasets/__init__.py index 1581a2af..634c4a34 100644 --- a/ais_bench/benchmark/datasets/__init__.py +++ b/ais_bench/benchmark/datasets/__init__.py @@ -53,3 +53,4 @@ from ais_bench.benchmark.datasets.mmstar import * # noqa: F401, F403 from ais_bench.benchmark.datasets.dapo_math 
import * # noqa: F401, F403 from ais_bench.benchmark.datasets.mooncake_trace import * # noqa: F401, F403 +from ais_bench.benchmark.datasets.swebench import * # noqa: F401, F403 \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/swebench.py b/ais_bench/benchmark/datasets/swebench.py new file mode 100644 index 00000000..594fbfc7 --- /dev/null +++ b/ais_bench/benchmark/datasets/swebench.py @@ -0,0 +1,69 @@ +import re +import random +from datasets import load_dataset, Dataset, DatasetDict + +from ais_bench.benchmark.registry import LOAD_DATASET +from ais_bench.benchmark.utils.logging.exceptions import ParameterValueError +from ais_bench.benchmark.utils.logging.error_codes import DSET_CODES +from ais_bench.benchmark.datasets.base import BaseDataset + +DATASET_MAPPING = { + "full": "princeton-nlp/SWE-Bench", + "verified": "princeton-nlp/SWE-Bench_Verified", + "lite": "princeton-nlp/SWE-Bench_Lite", + "multimodal": "princeton-nlp/SWE-Bench_Multimodal", + "multilingual": "swe-bench/SWE-Bench_Multilingual", +} + + +@LOAD_DATASET.register_module() +class SWEBenchDataset(BaseDataset): + def filter_instances( + self, instances: list[dict], *, filter_spec: str, shuffle: bool = False + ) -> list[dict]: + """Filter and slice a list of SWEBench instances.""" + if shuffle: + instances = sorted(instances.copy(), key=lambda x: x["instance_id"]) + random.seed(42) + random.shuffle(instances) + before_filter = len(instances) + instances = [ + instance + for instance in instances + if re.match(filter_spec, instance["instance_id"]) + ] + if (after_filter := len(instances)) != before_filter: + self.logger.info( + f"Instance filter: {before_filter} -> {after_filter} instances" + ) + return instances + + def load( + self, + path: str, + name: str, + split: str = "test", + filter_spec: str = "", + shuffle: bool = False, + **kwargs, + ): + if name not in DATASET_MAPPING: + raise ParameterValueError( + DSET_CODES.INVALID_PARAM_VALUE, + f"Invalid swebench dataset name, 
expected one of {list(DATASET_MAPPING.keys())} but got {name}", + ) + try: + dataset = load_dataset("parquet", data_files={split: path}) + except Exception as e: + self.logger.warning( + f"Failed to load swebench dataset {name} from {path} with error: {e}, trying to load from Hugging Face" + ) + try: + dataset = load_dataset(DATASET_MAPPING[name], split=split) + except Exception as e: + raise ParameterValueError( + DSET_CODES.DATA_PREPROCESSING_ERROR, + f"Failed to load swebench dataset {name} from Hugging Face with error: {e}.", + ) + dataset = self.filter_instances(list(dataset), filter_spec=filter_spec, shuffle=shuffle) + return Dataset.from_list(dataset) diff --git a/ais_bench/benchmark/tasks/__init__.py b/ais_bench/benchmark/tasks/__init__.py index 7ba624f8..c094003a 100644 --- a/ais_bench/benchmark/tasks/__init__.py +++ b/ais_bench/benchmark/tasks/__init__.py @@ -1,3 +1,6 @@ from ais_bench.benchmark.tasks.openicl_eval import * # noqa: F401, F403 from ais_bench.benchmark.tasks.openicl_infer import * # noqa: F401, F403 from ais_bench.benchmark.tasks.openicl_api_infer import OpenICLApiInferTask +from ais_bench.benchmark.tasks.swebench_infer import SWEBenchInferTask +from ais_bench.benchmark.tasks.swebench_eval import SWEBenchEvalTask + diff --git a/ais_bench/benchmark/tasks/swebench_eval.py b/ais_bench/benchmark/tasks/swebench_eval.py new file mode 100644 index 00000000..413edc7a --- /dev/null +++ b/ais_bench/benchmark/tasks/swebench_eval.py @@ -0,0 +1,159 @@ +import argparse +import json +import os +import os.path as osp +import sys +import threading +import time + +from mmengine.config import Config, ConfigDict +from mmengine.utils import mkdir_or_exist + +from ais_bench.benchmark.registry import TASKS +from ais_bench.benchmark.tasks.base import BaseTask, TaskStateManager +from ais_bench.benchmark.utils.core.abbr import ( + get_infer_output_path, + task_abbr_from_cfg, +) +from ais_bench.benchmark.utils.logging import AISLogger + + +@TASKS.register_module() 
+class SWEBenchEvalTask(BaseTask): + """SWEBench Evaluation Task. + + Evaluates SWE-bench predictions using the official harness and writes + results to work_dir/results. + """ + + name_prefix = "SWEBenchEval" + log_subdir = "logs/eval" + output_subdir = "results" + + def __init__(self, cfg: ConfigDict): + super().__init__(cfg) + + def get_command(self, cfg_path: str, template: str) -> str: + sys.path.append(os.getcwd()) + script_path = __file__ + python = sys.executable + command = f"{python} {script_path} {cfg_path}" + return template.format(task_cmd=command) + + def run(self, task_state_manager: TaskStateManager): + self.task_state_manager = task_state_manager + self.logger.info("SWEBenchEvalTask %s", task_abbr_from_cfg(self.cfg)) + + dataset_cfg = self.dataset_cfgs[0] + dataset_name = dataset_cfg.get("name", "lite") + + pred_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, "predictions"), + file_extension="jsonl", + ) + if not osp.isfile(pred_path): + raise FileNotFoundError( + f"Predictions file not found: {pred_path}. Run infer first." + ) + + out_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, self.output_subdir), + file_extension="json", + ) + mkdir_or_exist(osp.dirname(out_path)) + + task_state_manager.update_task_state( + {"status": "eval", "progress_description": "SWE-bench harness"} + ) + + try: + import swebench.harness.run_evaluation as run_eval + except ImportError as e: + raise ImportError( + "SWEBenchEvalTask requires the SWE-bench harness. 
" + "Install from: https://github.com/princeton-nlp/SWE-bench" + ) from e + + run_id = task_abbr_from_cfg(self.cfg).replace("/", "_") + eval_runner = self.cfg.get("eval", {}).get("runner", {}) + max_workers = eval_runner.get("max_num_workers", 4) + report_dir = osp.dirname(out_path) + + try: + run_eval.main( + dataset_name=dataset_name, + split="test", + instance_ids=[], + predictions_path=pred_path, + max_workers=max_workers, + force_rebuild=False, + cache_level="env", + clean=False, + open_file_limit=4096, + run_id=run_id, + timeout=1800, + namespace=None, + rewrite_reports=False, + modal=False, + report_dir=report_dir, + ) + harness_exit = 0 + except SystemExit as e: + harness_exit = e.code if e.code is not None else 1 + except Exception as e: + self.logger.exception("Harness failed: %s", e) + harness_exit = 1 + + results = { + "harness_exit_code": harness_exit, + "dataset_name": dataset_name, + "predictions_path": pred_path, + "run_id": run_id, + } + with open(out_path, "w") as f: + json.dump(results, f, indent=2) + + if harness_exit != 0: + self.logger.warning("Harness exited with code %s", harness_exit) + + +def parse_args(): + parser = argparse.ArgumentParser(description="SWEBench Eval") + parser.add_argument("config", help="Config file path") + return parser.parse_args() + + +if __name__ == "__main__": + logger = AISLogger() + args = parse_args() + cfg = Config.fromfile(args.config) + task_state_manager = TaskStateManager( + tmp_path=os.path.join(cfg["work_dir"], "status_tmp"), + task_name=task_abbr_from_cfg(cfg), + is_debug=cfg["cli_args"]["debug"], + ) + manager_t = threading.Thread(target=task_state_manager.launch, args=()) + manager_t.start() + task_state_manager.update_task_state( + { + "status": "start", + "task_log_path": os.path.join( + "logs/eval/", f"{task_abbr_from_cfg(cfg)}.out" + ), + } + ) + start_time = time.perf_counter() + try: + task = SWEBenchEvalTask(cfg) + task.run(task_state_manager) + except Exception as e: + 
task_state_manager.update_task_state({"status": "error"}) + raise + end_time = time.perf_counter() + logger.info("SWEBench eval time: %.2fs", end_time - start_time) + task_state_manager.update_task_state({"status": "finish"}) + manager_t.join() diff --git a/ais_bench/benchmark/tasks/swebench_infer.py b/ais_bench/benchmark/tasks/swebench_infer.py new file mode 100644 index 00000000..9b14227c --- /dev/null +++ b/ais_bench/benchmark/tasks/swebench_infer.py @@ -0,0 +1,327 @@ +import argparse +import concurrent.futures +import json +import os +import os.path as osp +import sys +import threading +import time +import shutil +from pathlib import Path +from typing import Any, List, Optional, Tuple + +from mmengine.config import Config, ConfigDict +from mmengine.utils import mkdir_or_exist + +from ais_bench.benchmark.registry import TASKS +from ais_bench.benchmark.tasks.base import BaseTask, TaskStateManager +from ais_bench.benchmark.utils.config import build_dataset_from_cfg +from ais_bench.benchmark.utils.core.abbr import ( + get_infer_output_path, + model_abbr_from_cfg, + task_abbr_from_cfg, +) +from ais_bench.benchmark.utils.logging import AISLogger + + +def _get_minisweagent_config(model_cfg: ConfigDict) -> dict: + """Build mini-swe-agent model config from ais_bench model_cfg (e.g. LiteLLMChat).""" + model_name = model_cfg.get("model") or model_cfg.get("model_name") or "" + # LiteLLM requires provider prefix (e.g. 
hosted_vllm/qwen3) for custom API; add it when url is set and name has no / + if model_cfg.get("url") and model_name: + model_name = f"hosted_vllm/{model_name}" + model_type = ( + getattr(model_cfg.get("type"), "__name__", None) + or (model_cfg.get("type", "") if isinstance(model_cfg.get("type"), str) else "") + ) + if isinstance(model_type, str): + model_type = model_type.split(".")[-1] + model_kwargs = dict(model_cfg.get("generation_kwargs", {})) + if model_cfg.get("api_key"): + model_kwargs["api_key"] = model_cfg["api_key"] + if model_cfg.get("url"): + model_kwargs["api_base"] = model_cfg["url"] + model_class = "litellm" + if "openrouter" in (model_type or "").lower() or "openrouter" in (str(model_cfg.get("type", ""))).lower(): + model_class = "openrouter" + # Avoid cost-calculation errors for local/custom models (e.g. hosted_vllm) not in litellm price map + model_dict = { + "model_name": model_name, + "model_class": model_class, + "model_kwargs": model_kwargs, + "cost_tracking": "ignore_errors", + } + return {"model": model_dict} + + +class _AISBenchProgressManager: + """Minimal progress manager that forwards to TaskStateManager for process_instance.""" + + def __init__(self, task_state_manager: TaskStateManager, total: int): + self._tsm = task_state_manager + self._total = total + self._finish_count = 0 + + def on_instance_start(self, instance_id: str) -> None: + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + "other_kwargs": {"current": instance_id}, + } + ) + + def update_instance_status(self, instance_id: str, message: str) -> None: + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + "other_kwargs": {"current": instance_id, "message": message}, + } + ) + + def on_instance_end(self, instance_id: str, exit_status: 
str = None) -> None: + self._finish_count += 1 + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + } + ) + + def on_uncaught_exception(self, instance_id: str, exception: Exception) -> None: + self.on_instance_end(instance_id, f"Uncaught {type(exception).__name__}") + + +class _CompositeProgressManager: + """Forwards progress calls to multiple delegates (e.g. TaskStateManager + Rich dashboard).""" + + def __init__(self, *delegates: Any): + self._delegates = [d for d in delegates if d is not None] + + def on_instance_start(self, instance_id: str) -> None: + for d in self._delegates: + d.on_instance_start(instance_id) + + def update_instance_status(self, instance_id: str, message: str) -> None: + for d in self._delegates: + d.update_instance_status(instance_id, message) + + def on_instance_end(self, instance_id: str, exit_status: str = None) -> None: + for d in self._delegates: + d.on_instance_end(instance_id, exit_status) + + def on_uncaught_exception(self, instance_id: str, exception: Exception) -> None: + for d in self._delegates: + d.on_uncaught_exception(instance_id, exception) + + +def _make_swebench_progress_manager( + task_state_manager: TaskStateManager, + num_instances: int, +) -> Tuple[Any, Optional[Any]]: + """Build progress manager and optional Rich live display. + + Returns: + (progress_manager, live_render_group or None). + When live_render_group is not None, caller should wrap execution in + Live(live_render_group, refresh_per_second=4). 
+ """ + tsm_manager = _AISBenchProgressManager(task_state_manager, num_instances) + try: + from minisweagent.run.benchmarks.utils.batch_progress import ( + RunBatchProgressManager, + ) + from rich.live import Live + + run_batch_manager = RunBatchProgressManager(num_instances, yaml_report_path=None) + composite = _CompositeProgressManager(tsm_manager, run_batch_manager) + return composite, run_batch_manager.render_group + except ImportError: + return tsm_manager, None + + +@TASKS.register_module() +class SWEBenchInferTask(BaseTask): + """SWEBench Inference Task. + + Runs mini-swe-agent on SWE-bench instances and writes predictions as JSON. + """ + + name_prefix = "SWEBenchInfer" + log_subdir = "logs/infer" + output_subdir = "predictions" + + def __init__(self, cfg: ConfigDict): + super().__init__(cfg) + + def get_command(self, cfg_path: str, template: str) -> str: + sys.path.append(os.getcwd()) + script_path = __file__ + python = sys.executable + command = f"{python} {script_path} {cfg_path}" + return template.format(task_cmd=command) + + def run(self, task_state_manager: TaskStateManager): + self.task_state_manager = task_state_manager + self.logger.info("SWEBenchInferTask %s", task_abbr_from_cfg(self.cfg)) + + try: + from minisweagent.run.benchmarks.swebench import process_instance + from minisweagent.config import get_config_from_spec + from minisweagent.utils.serialize import recursive_merge + except ImportError as e: + raise ImportError( + "SWEBenchInferTask requires mini-swe-agent. 
" + "Install with: pip install mini-swe-agent" + ) from e + + dataset_cfg = self.dataset_cfgs[0] + dataset = build_dataset_from_cfg( + dataset_cfg, task_state_manager=task_state_manager + ) + test_data = dataset.test + if hasattr(test_data, "__iter__") and not isinstance(test_data, (list, dict)): + instances = list(test_data) + else: + instances = [test_data[i] for i in range(len(test_data))] + + model_abbr = model_abbr_from_cfg(self.model_cfg) + pred_root = osp.join(self.work_dir, self.output_subdir, model_abbr) + mkdir_or_exist(pred_root) + out_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, self.output_subdir), + file_extension="json", + ) + + + out_dir = Path(osp.splitext(out_path)[0]) + out_dir.mkdir(parents=True, exist_ok=True) + + # Load default swebench config (agent.system_template, agent.instance_template, etc.) + # then override with our model so mini-swe-agent gets required AgentConfig fields. + default_swebench_config = get_config_from_spec("swebench.yaml") + our_config = _get_minisweagent_config(self.model_cfg) + model_name = (our_config.get("model") or {}).get("model_name") or "" + if not (model_name or "").strip(): + raise ValueError( + "No model set for SWEBench infer. In your config (e.g. swe_bench_lite.py), set " + "models[0]['model'], models[0]['url'], and models[0]['api_key']. " + "Example for local vLLM: model='hosted_vllm/qwen3', url='http://127.0.0.1:2998/v1', api_key='EMPTY'. " + "Or run: mini-extra config setup (to use mini-swe-agent defaults)." 
+ ) + our_config.setdefault("environment", {})["environment_class"] = "docker" + base_config = recursive_merge(default_swebench_config, our_config) + if dataset_cfg.get("step_limit") is not None: + base_config.setdefault("agent", {})["step_limit"] = dataset_cfg["step_limit"] + + progress_manager, live_render_group = _make_swebench_progress_manager( + task_state_manager, len(instances) + ) + task_state_manager.update_task_state( + { + "status": "inferencing", + "total_count": len(instances), + "finish_count": 0, + "progress_description": "SWEBench infer", + } + ) + + workers = self.model_cfg.get("batch_size", 1) + + def process_futures(futures): + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except concurrent.futures.CancelledError: + pass + except Exception as e: + instance_id = futures[future] + self.logger.error( + "Error in future for instance %s: %s", + instance_id, + e, + exc_info=True, + ) + progress_manager.on_uncaught_exception(instance_id, e) + + def run_executor(): + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit( + process_instance, + instance, + out_dir, + base_config, + progress_manager, + ): instance["instance_id"] + for instance in instances + } + try: + process_futures(futures) + except KeyboardInterrupt: + self.logger.info( + "Cancelling all pending jobs. Press ^C again to exit immediately." 
+ ) + for future in futures: + if not future.running() and not future.done(): + future.cancel() + process_futures(futures) + + if live_render_group is not None: + from rich.live import Live + + with Live(live_render_group, refresh_per_second=4): + run_executor() + else: + run_executor() + + preds_path = out_dir / "preds.json" + if preds_path.exists(): + shutil.move(preds_path, out_path) + + +def parse_args(): + parser = argparse.ArgumentParser(description="SWEBench Infer") + parser.add_argument("config", help="Config file path") + return parser.parse_args() + + +if __name__ == "__main__": + logger = AISLogger() + args = parse_args() + cfg = Config.fromfile(args.config) + task_state_manager = TaskStateManager( + tmp_path=os.path.join(cfg["work_dir"], "status_tmp"), + task_name=task_abbr_from_cfg(cfg), + is_debug=cfg["cli_args"]["debug"], + ) + manager_t = threading.Thread(target=task_state_manager.launch, args=()) + manager_t.start() + task_state_manager.update_task_state( + { + "status": "start", + "task_log_path": os.path.join( + "logs/infer/", f"{task_abbr_from_cfg(cfg)}.out" + ), + } + ) + start_time = time.perf_counter() + try: + task = SWEBenchInferTask(cfg) + task.run(task_state_manager) + except Exception as e: + task_state_manager.update_task_state({"status": "error"}) + raise + end_time = time.perf_counter() + logger.info("SWEBench infer time: %.2fs", end_time - start_time) + task_state_manager.update_task_state({"status": "finish"}) + manager_t.join() diff --git a/ais_bench/benchmark/utils/config/run.py b/ais_bench/benchmark/utils/config/run.py index a56a88bd..e7385aff 100644 --- a/ais_bench/benchmark/utils/config/run.py +++ b/ais_bench/benchmark/utils/config/run.py @@ -6,10 +6,31 @@ from ais_bench.benchmark.utils.logging import AISLogger from ais_bench.benchmark.utils.logging.exceptions import AISBenchConfigError from ais_bench.benchmark.utils.logging.error_codes import UTILS_CODES +from ais_bench.benchmark.openicl.icl_prompt_template import 
PromptTemplate +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_evaluator import AccEvaluator logger = AISLogger() def try_fill_in_custom_cfgs(config): + for dataset_cfg in config["datasets"]: + if "infer_cfg" not in dataset_cfg: + logger.debug(f"Filling in infer config for dataset {dataset_cfg['abbr']}") + dataset_cfg["infer_cfg"] = dict( + prompt_template=dict(type=get_config_type(PromptTemplate), template="{dummy}"), + retriever=dict(type=get_config_type(ZeroRetriever)), + inferencer=dict(type=get_config_type(GenInferencer)), + ) + if "reader_cfg" not in dataset_cfg: + logger.debug(f"Filling in reader config for dataset {dataset_cfg['abbr']}") + dataset_cfg["reader_cfg"] = dict(input_columns=["dummy"], output_column="dummy") + if "eval_cfg" not in dataset_cfg: + logger.debug(f"Filling in eval config for dataset {dataset_cfg['abbr']}") + dataset_cfg["eval_cfg"] = dict( + evaluator=dict(type=get_config_type(AccEvaluator)), + ) + return config @@ -70,4 +91,4 @@ def fill_eval_cfg(cfg, args): new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner) new_cfg['eval']['runner']['max_workers_per_gpu'] = args.max_workers_per_gpu cfg.merge_from_dict(new_cfg) - logger.debug("Evaluation config filled successfully") \ No newline at end of file + logger.debug("Evaluation config filled successfully") diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_lite.py b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py new file mode 100644 index 00000000..f69b6933 --- /dev/null +++ b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py @@ -0,0 +1,57 @@ +from ais_bench.benchmark.datasets import SWEBenchDataset +from ais_bench.benchmark.partitioners import NaivePartitioner +from ais_bench.benchmark.runners import LocalRunner +from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask +from ais_bench.benchmark.summarizers 
import DefaultSummarizer + +STEP_LIMIT = 2 + +# For local vLLM: set model (e.g. hosted_vllm/qwen3), url (vLLM API base), api_key (e.g. "EMPTY"). +# Example matching: mini-extra swebench -m hosted_vllm/qwen3 -c model.model_kwargs.api_base='"http://127.0.0.1:2998/v1"' ... +models = [ + dict( + attr="local", + abbr="swebench", + type="LiteLLMChat", + model="qwen3", # e.g. hosted_vllm/qwen3 for local vLLM + api_key="EMPTY", + url="http://127.0.0.1:2998/v1", # vLLM API base + batch_size=2, + generation_kwargs=dict(), + ) +] + +datasets = [ + dict( + type=SWEBenchDataset, + abbr="swebench_lite", + path="ais_bench/datasets/SWE-bench_Lite", + name="lite", + split="test", + filter_spec="", + shuffle=False, + step_limit=STEP_LIMIT, + ), +] + +summarizer = dict( + attr="accuracy", + type=DefaultSummarizer, +) + + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchEvalTask), + ), +) diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_verified.py b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py new file mode 100644 index 00000000..b9de7821 --- /dev/null +++ b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py @@ -0,0 +1,55 @@ +from ais_bench.benchmark.datasets import SWEBenchDataset +from ais_bench.benchmark.partitioners import NaivePartitioner +from ais_bench.benchmark.runners import LocalRunner +from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask +from ais_bench.benchmark.summarizers import DefaultSummarizer + +STEP_LIMIT = 100 + +models = [ + dict( + attr="local", + abbr="swebench", + type="LiteLLMChat", + model="", + api_key="", + url="", + batch_size=2, + generation_kwargs=dict(), + ) +] + +datasets = [ + dict( + type=SWEBenchDataset, + abbr="swebench_verified", + path="ais_bench/datasets/SWE-bench_Verified", + 
name="verified", + split="test", + step_limit=STEP_LIMIT, + filter_spec="", + shuffle=False, + ), +] + +summarizer = dict( + attr="accuracy", + type=DefaultSummarizer, +) + + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchEvalTask), + ), +)