From 0317f7d1a4c2f2afa5125e6bf0c0e909bb4da8cb Mon Sep 17 00:00:00 2001 From: "zhanggaohua@huawei.com" Date: Sat, 14 Mar 2026 15:29:39 +0800 Subject: [PATCH 1/2] convert calss to str before dump cfg --- ais_bench/benchmark/cli/config_manager.py | 6 +- ais_bench/benchmark/cli/utils.py | 20 ++ ais_bench/benchmark/cli/workers.py | 72 ++--- ais_bench/benchmark/datasets/__init__.py | 1 + ais_bench/benchmark/datasets/swebench.py | 68 +++++ ais_bench/benchmark/tasks/__init__.py | 3 + ais_bench/benchmark/tasks/swebench_eval.py | 159 +++++++++++ ais_bench/benchmark/tasks/swebench_infer.py | 247 ++++++++++++++++++ ais_bench/benchmark/utils/config/run.py | 21 +- .../swe_bench_examples/swe_bench_lite.py | 53 ++++ .../swe_bench_examples/swe_bench_verified.py | 53 ++++ 11 files changed, 667 insertions(+), 36 deletions(-) create mode 100644 ais_bench/benchmark/datasets/swebench.py create mode 100644 ais_bench/benchmark/tasks/swebench_eval.py create mode 100644 ais_bench/benchmark/tasks/swebench_infer.py create mode 100644 ais_bench/configs/swe_bench_examples/swe_bench_lite.py create mode 100644 ais_bench/configs/swe_bench_examples/swe_bench_verified.py diff --git a/ais_bench/benchmark/cli/config_manager.py b/ais_bench/benchmark/cli/config_manager.py index dde76527..cb05db73 100644 --- a/ais_bench/benchmark/cli/config_manager.py +++ b/ais_bench/benchmark/cli/config_manager.py @@ -9,11 +9,11 @@ from ais_bench.benchmark.utils.file import match_cfg_file from ais_bench.benchmark.utils.config.run import try_fill_in_custom_cfgs from ais_bench.benchmark.utils.logging.exceptions import CommandError, AISBenchConfigError -from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need, fill_test_range_use_num_prompts +from ais_bench.benchmark.cli.utils import fill_model_path_if_datasets_need, fill_test_range_use_num_prompts, recur_convert_config_type class CustomConfigChecker: MODEL_REQUIRED_FIELDS = ['type', 'abbr', 'attr'] - DATASET_REQUIRED_FIELDS = ['type', 'abbr', 
'reader_cfg', 'infer_cfg', 'eval_cfg'] + DATASET_REQUIRED_FIELDS = ['type', 'abbr'] SUMMARIZER_REQUIRED_FIELDS = ['attr'] def __init__(self, config, file_path): @@ -327,6 +327,8 @@ def _dump_and_reload_config(self): # dump config output_config_path = osp.join(self.cfg.work_dir, 'configs', f'{self.cfg_time_str}_{os.getpid()}.py') + + recur_convert_config_type(self.cfg) self.cfg.dump(output_config_path) # eval nums set if (self.args.num_prompts and self.args.num_prompts < 0) or self.args.num_prompts == 0: diff --git a/ais_bench/benchmark/cli/utils.py b/ais_bench/benchmark/cli/utils.py index 01ff50e5..81e92a84 100644 --- a/ais_bench/benchmark/cli/utils.py +++ b/ais_bench/benchmark/cli/utils.py @@ -2,6 +2,7 @@ import os from datetime import datetime +from mmengine.config import ConfigDict, Config from ais_bench.benchmark.utils.logging.exceptions import AISBenchConfigError from ais_bench.benchmark.utils.logging.logger import AISLogger from ais_bench.benchmark.utils.logging.error_codes import UTILS_CODES @@ -20,6 +21,25 @@ def get_config_type(obj) -> str: return obj return f"{obj.__module__}.{obj.__name__}" +def recur_convert_config_type(cfg): + """Recursively convert the type of the config to the string type. + + Args: + cfg: The config to convert. 
+ """ + if isinstance(cfg, (dict, ConfigDict, Config)): + for key, value in cfg.items(): + if key == "type": + cfg[key] = get_config_type(value) + else: + cfg[key] = recur_convert_config_type(value) + elif isinstance(cfg, list): + for i, item in enumerate(cfg): + cfg[i] = recur_convert_config_type(item) if isinstance(item, (dict, ConfigDict, Config, list)) else item + else: + return cfg + return cfg + def get_current_time_str(): return datetime.now().strftime("%Y%m%d_%H%M%S") diff --git a/ais_bench/benchmark/cli/workers.py b/ais_bench/benchmark/cli/workers.py index ce1dd8bb..eb477361 100644 --- a/ais_bench/benchmark/cli/workers.py +++ b/ais_bench/benchmark/cli/workers.py @@ -42,26 +42,30 @@ class Infer(BaseWorker): def update_cfg(self, cfg: ConfigDict) -> None: def get_task_type() -> str: if cfg["models"][0]["attr"] == "service": - return get_config_type(OpenICLApiInferTask) + return OpenICLApiInferTask else: - return get_config_type(OpenICLInferTask) + return OpenICLInferTask - new_cfg = dict( - infer=dict( - partitioner=dict(type=get_config_type(NaivePartitioner)), - runner=dict( - max_num_workers=self.args.max_num_workers, - max_workers_per_gpu=self.args.max_workers_per_gpu, - debug=self.args.debug, - task=dict(type=get_task_type()), - type=get_config_type(LocalRunner), - ), - ), - ) + def update_new_infer_cfg(new_cfg: ConfigDict) -> None: + runner_cfg = new_cfg['infer']['runner'] + runner_cfg['max_num_workers'] = self.args.max_num_workers + runner_cfg['max_workers_per_gpu'] = self.args.max_workers_per_gpu + runner_cfg['debug'] = self.args.debug or cfg.cli_args.debug + if cfg.get('infer'): + new_cfg = dict(infer=cfg.infer) + else: + new_cfg = dict( + infer=dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + task=dict(type=get_task_type()), + type=LocalRunner, + ), + ), + ) + update_new_infer_cfg(new_cfg) cfg.merge_from_dict(new_cfg) - if cfg.cli_args.debug: - cfg.infer.runner.debug = True cfg.infer.partitioner["out_dir"] = osp.join(cfg["work_dir"], 
"predictions/") return cfg @@ -259,26 +263,28 @@ def _result_post_process(self, tasks, cfg: ConfigDict): class Eval(BaseWorker): def update_cfg(self, cfg: ConfigDict) -> None: - new_cfg = dict( - eval=dict( - partitioner=dict(type=get_config_type(NaivePartitioner)), - runner=dict( - max_num_workers=self.args.max_num_workers, - debug=self.args.debug, - task=dict(type=get_config_type(OpenICLEvalTask)), + def update_eval_cfg(new_cfg: ConfigDict) -> None: + runner_cfg = new_cfg['eval']['runner'] + runner_cfg['max_num_workers'] = self.args.max_num_workers + runner_cfg['max_workers_per_gpu'] = self.args.max_workers_per_gpu + runner_cfg['debug'] = self.args.debug + runner_cfg['dump_details'] = cfg.cli_args.dump_eval_details + runner_cfg['cal_extract_rate'] = cfg.cli_args.dump_extract_rate + + if cfg.get('eval'): + new_cfg = dict(eval=cfg.eval) + else: + new_cfg = dict( + eval=dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=OpenICLEvalTask), ), - ), - ) + )) - new_cfg["eval"]["runner"]["type"] = get_config_type(LocalRunner) - new_cfg["eval"]["runner"]["max_workers_per_gpu"] = self.args.max_workers_per_gpu + update_eval_cfg(new_cfg) cfg.merge_from_dict(new_cfg) - if cfg.cli_args.dump_eval_details: - cfg.eval.runner.task.dump_details = True - if cfg.cli_args.dump_extract_rate: - cfg.eval.runner.task.cal_extract_rate = True - if cfg.cli_args.debug: - cfg.eval.runner.debug = True cfg.eval.partitioner["out_dir"] = osp.join(cfg["work_dir"], "results/") return cfg diff --git a/ais_bench/benchmark/datasets/__init__.py b/ais_bench/benchmark/datasets/__init__.py index 1581a2af..634c4a34 100644 --- a/ais_bench/benchmark/datasets/__init__.py +++ b/ais_bench/benchmark/datasets/__init__.py @@ -53,3 +53,4 @@ from ais_bench.benchmark.datasets.mmstar import * # noqa: F401, F403 from ais_bench.benchmark.datasets.dapo_math import * # noqa: F401, F403 from ais_bench.benchmark.datasets.mooncake_trace import * # noqa: F401, F403 +from 
ais_bench.benchmark.datasets.swebench import * # noqa: F401, F403 \ No newline at end of file diff --git a/ais_bench/benchmark/datasets/swebench.py b/ais_bench/benchmark/datasets/swebench.py new file mode 100644 index 00000000..a466e612 --- /dev/null +++ b/ais_bench/benchmark/datasets/swebench.py @@ -0,0 +1,68 @@ +import re +import random +from datasets import load_dataset, Dataset, DatasetDict + +from ais_bench.benchmark.registry import LOAD_DATASET +from ais_bench.benchmark.utils.logging.exceptions import ParameterValueError +from ais_bench.benchmark.utils.logging.error_codes import DSET_CODES +from ais_bench.benchmark.datasets.base import BaseDataset + +DATASET_MAPPING = { + "full": "princeton-nlp/SWE-Bench", + "verified": "princeton-nlp/SWE-Bench_Verified", + "lite": "princeton-nlp/SWE-Bench_Lite", + "multimodal": "princeton-nlp/SWE-Bench_Multimodal", + "multilingual": "swe-bench/SWE-Bench_Multilingual", +} + + +@LOAD_DATASET.register_module() +class SWEBenchDataset(BaseDataset): + def filter_instances( + self, instances: list[dict], *, filter_spec: str, shuffle: bool = False + ) -> list[dict]: + """Filter and slice a list of SWEBench instances.""" + if shuffle: + instances = sorted(instances.copy(), key=lambda x: x["instance_id"]) + random.seed(42) + random.shuffle(instances) + before_filter = len(instances) + instances = [ + instance + for instance in instances + if re.match(filter_spec, instance["instance_id"]) + ] + if (after_filter := len(instances)) != before_filter: + self.logger.info( + f"Instance filter: {before_filter} -> {after_filter} instances" + ) + return instances + + def load( + self, + path: str, + name: str, + split: str = "test", + filter_spec: str = "", + shuffle: bool = False, + ): + if name not in DATASET_MAPPING: + raise ParameterValueError( + DSET_CODES.INVALID_PARAM_VALUE, + f"Invalid swebench dataset name, expected one of {list(DATASET_MAPPING.keys())} but got {name}", + ) + try: + dataset = load_dataset("parquet", data_files={split: 
path}) + except Exception as e: + self.logger.warning( + f"Failed to load swebench dataset {name} from {path} with error: {e}, trying to load from Hugging Face" + ) + try: + dataset = load_dataset(DATASET_MAPPING[name], split=split) + except Exception as e: + raise ParameterValueError( + DSET_CODES.DATA_PREPROCESSING_ERROR, + f"Failed to load swebench dataset {name} from Hugging Face with error: {e}.", + ) + dataset = self.filter_instances(list(dataset), filter_spec=filter_spec, shuffle=shuffle) + return DatasetDict({"test": Dataset.from_list(dataset)}) diff --git a/ais_bench/benchmark/tasks/__init__.py b/ais_bench/benchmark/tasks/__init__.py index 7ba624f8..c094003a 100644 --- a/ais_bench/benchmark/tasks/__init__.py +++ b/ais_bench/benchmark/tasks/__init__.py @@ -1,3 +1,6 @@ from ais_bench.benchmark.tasks.openicl_eval import * # noqa: F401, F403 from ais_bench.benchmark.tasks.openicl_infer import * # noqa: F401, F403 from ais_bench.benchmark.tasks.openicl_api_infer import OpenICLApiInferTask +from ais_bench.benchmark.tasks.swebench_infer import SWEBenchInferTask +from ais_bench.benchmark.tasks.swebench_eval import SWEBenchEvalTask + diff --git a/ais_bench/benchmark/tasks/swebench_eval.py b/ais_bench/benchmark/tasks/swebench_eval.py new file mode 100644 index 00000000..413edc7a --- /dev/null +++ b/ais_bench/benchmark/tasks/swebench_eval.py @@ -0,0 +1,159 @@ +import argparse +import json +import os +import os.path as osp +import sys +import threading +import time + +from mmengine.config import Config, ConfigDict +from mmengine.utils import mkdir_or_exist + +from ais_bench.benchmark.registry import TASKS +from ais_bench.benchmark.tasks.base import BaseTask, TaskStateManager +from ais_bench.benchmark.utils.core.abbr import ( + get_infer_output_path, + task_abbr_from_cfg, +) +from ais_bench.benchmark.utils.logging import AISLogger + + +@TASKS.register_module() +class SWEBenchEvalTask(BaseTask): + """SWEBench Evaluation Task. 
+ + Evaluates SWE-bench predictions using the official harness and writes + results to work_dir/results. + """ + + name_prefix = "SWEBenchEval" + log_subdir = "logs/eval" + output_subdir = "results" + + def __init__(self, cfg: ConfigDict): + super().__init__(cfg) + + def get_command(self, cfg_path: str, template: str) -> str: + sys.path.append(os.getcwd()) + script_path = __file__ + python = sys.executable + command = f"{python} {script_path} {cfg_path}" + return template.format(task_cmd=command) + + def run(self, task_state_manager: TaskStateManager): + self.task_state_manager = task_state_manager + self.logger.info("SWEBenchEvalTask %s", task_abbr_from_cfg(self.cfg)) + + dataset_cfg = self.dataset_cfgs[0] + dataset_name = dataset_cfg.get("name", "lite") + + pred_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, "predictions"), + file_extension="jsonl", + ) + if not osp.isfile(pred_path): + raise FileNotFoundError( + f"Predictions file not found: {pred_path}. Run infer first." + ) + + out_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, self.output_subdir), + file_extension="json", + ) + mkdir_or_exist(osp.dirname(out_path)) + + task_state_manager.update_task_state( + {"status": "eval", "progress_description": "SWE-bench harness"} + ) + + try: + import swebench.harness.run_evaluation as run_eval + except ImportError as e: + raise ImportError( + "SWEBenchEvalTask requires the SWE-bench harness. 
" + "Install from: https://github.com/princeton-nlp/SWE-bench" + ) from e + + run_id = task_abbr_from_cfg(self.cfg).replace("/", "_") + eval_runner = self.cfg.get("eval", {}).get("runner", {}) + max_workers = eval_runner.get("max_num_workers", 4) + report_dir = osp.dirname(out_path) + + try: + run_eval.main( + dataset_name=dataset_name, + split="test", + instance_ids=[], + predictions_path=pred_path, + max_workers=max_workers, + force_rebuild=False, + cache_level="env", + clean=False, + open_file_limit=4096, + run_id=run_id, + timeout=1800, + namespace=None, + rewrite_reports=False, + modal=False, + report_dir=report_dir, + ) + harness_exit = 0 + except SystemExit as e: + harness_exit = e.code if e.code is not None else 1 + except Exception as e: + self.logger.exception("Harness failed: %s", e) + harness_exit = 1 + + results = { + "harness_exit_code": harness_exit, + "dataset_name": dataset_name, + "predictions_path": pred_path, + "run_id": run_id, + } + with open(out_path, "w") as f: + json.dump(results, f, indent=2) + + if harness_exit != 0: + self.logger.warning("Harness exited with code %s", harness_exit) + + +def parse_args(): + parser = argparse.ArgumentParser(description="SWEBench Eval") + parser.add_argument("config", help="Config file path") + return parser.parse_args() + + +if __name__ == "__main__": + logger = AISLogger() + args = parse_args() + cfg = Config.fromfile(args.config) + task_state_manager = TaskStateManager( + tmp_path=os.path.join(cfg["work_dir"], "status_tmp"), + task_name=task_abbr_from_cfg(cfg), + is_debug=cfg["cli_args"]["debug"], + ) + manager_t = threading.Thread(target=task_state_manager.launch, args=()) + manager_t.start() + task_state_manager.update_task_state( + { + "status": "start", + "task_log_path": os.path.join( + "logs/eval/", f"{task_abbr_from_cfg(cfg)}.out" + ), + } + ) + start_time = time.perf_counter() + try: + task = SWEBenchEvalTask(cfg) + task.run(task_state_manager) + except Exception as e: + 
task_state_manager.update_task_state({"status": "error"}) + raise + end_time = time.perf_counter() + logger.info("SWEBench eval time: %.2fs", end_time - start_time) + task_state_manager.update_task_state({"status": "finish"}) + manager_t.join() diff --git a/ais_bench/benchmark/tasks/swebench_infer.py b/ais_bench/benchmark/tasks/swebench_infer.py new file mode 100644 index 00000000..453d0167 --- /dev/null +++ b/ais_bench/benchmark/tasks/swebench_infer.py @@ -0,0 +1,247 @@ +import argparse +import json +import os +import os.path as osp +import sys +import threading +import time +from pathlib import Path +from typing import List + +from mmengine.config import Config, ConfigDict +from mmengine.utils import mkdir_or_exist + +from ais_bench.benchmark.registry import TASKS +from ais_bench.benchmark.tasks.base import BaseTask, TaskStateManager +from ais_bench.benchmark.utils.config import build_dataset_from_cfg +from ais_bench.benchmark.utils.core.abbr import ( + get_infer_output_path, + model_abbr_from_cfg, + task_abbr_from_cfg, +) +from ais_bench.benchmark.utils.logging import AISLogger + + +def _get_minisweagent_config(model_cfg: ConfigDict) -> dict: + """Build mini-swe-agent model config from ais_bench model_cfg (e.g. 
LiteLLMChat).""" + model_name = model_cfg.get("model") or model_cfg.get("model_name") or "" + model_type = ( + getattr(model_cfg.get("type"), "__name__", None) + or (model_cfg.get("type", "") if isinstance(model_cfg.get("type"), str) else "") + ) + if isinstance(model_type, str): + model_type = model_type.split(".")[-1] + model_kwargs = dict(model_cfg.get("generation_kwargs", {})) + if model_cfg.get("api_key"): + model_kwargs["api_key"] = model_cfg["api_key"] + if model_cfg.get("url"): + model_kwargs["api_base"] = model_cfg["url"] + model_class = "litellm" + if "openrouter" in (model_type or "").lower() or "openrouter" in (str(model_cfg.get("type", ""))).lower(): + model_class = "openrouter" + return { + "model": { + "model_name": model_name, + "model_class": model_class, + "model_kwargs": model_kwargs, + } + } + + +class _AISBenchProgressManager: + """Minimal progress manager that forwards to TaskStateManager for process_instance.""" + + def __init__(self, task_state_manager: TaskStateManager, total: int): + self._tsm = task_state_manager + self._total = total + self._finish_count = 0 + + def on_instance_start(self, instance_id: str) -> None: + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + "other_kwargs": {"current": instance_id}, + } + ) + + def update_instance_status(self, instance_id: str, message: str) -> None: + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + "other_kwargs": {"current": instance_id, "message": message}, + } + ) + + def on_instance_end(self, instance_id: str, exit_status: str = None) -> None: + self._finish_count += 1 + self._tsm.update_task_state( + { + "status": "inferencing", + "finish_count": self._finish_count, + "total_count": self._total, + "progress_description": "SWEBench infer", + } + 
) + + +@TASKS.register_module() +class SWEBenchInferTask(BaseTask): + """SWEBench Inference Task. + + Runs mini-swe-agent on SWE-bench instances and writes predictions as JSONL. + """ + + name_prefix = "SWEBenchInfer" + log_subdir = "logs/infer" + output_subdir = "predictions" + + def __init__(self, cfg: ConfigDict): + super().__init__(cfg) + + def get_command(self, cfg_path: str, template: str) -> str: + sys.path.append(os.getcwd()) + script_path = __file__ + python = sys.executable + command = f"{python} {script_path} {cfg_path}" + return template.format(task_cmd=command) + + def get_output_paths(self, file_extension: str = "jsonl") -> List[str]: + paths = [] + for dataset_cfg in self.dataset_cfgs: + paths.append( + get_infer_output_path( + self.model_cfg, + dataset_cfg, + os.path.join(self.work_dir, self.output_subdir), + file_extension=file_extension, + ) + ) + return paths + + def run(self, task_state_manager: TaskStateManager): + self.task_state_manager = task_state_manager + self.logger.info("SWEBenchInferTask %s", task_abbr_from_cfg(self.cfg)) + + try: + from minisweagent.run.benchmarks.swebench import process_instance + except ImportError as e: + raise ImportError( + "SWEBenchInferTask requires mini-swe-agent. 
" + "Install with: pip install mini-swe-agent" + ) from e + + dataset_cfg = self.dataset_cfgs[0] + dataset = build_dataset_from_cfg( + dataset_cfg, task_state_manager=task_state_manager + ) + test_data = dataset.test + if hasattr(test_data, "__iter__") and not isinstance(test_data, (list, dict)): + instances = list(test_data) + else: + instances = [test_data[i] for i in range(len(test_data))] + + model_abbr = model_abbr_from_cfg(self.model_cfg) + pred_root = osp.join(self.work_dir, self.output_subdir, model_abbr) + mkdir_or_exist(pred_root) + out_path = get_infer_output_path( + self.model_cfg, + dataset_cfg, + osp.join(self.work_dir, self.output_subdir), + file_extension="jsonl", + ) + out_dir = Path(osp.splitext(out_path)[0] + "_tmp") + out_dir.mkdir(parents=True, exist_ok=True) + + base_config = _get_minisweagent_config(self.model_cfg) + base_config.setdefault("environment", {})["environment_class"] = "docker" + base_config.setdefault("agent", {}) + + progress_manager = _AISBenchProgressManager( + task_state_manager, len(instances) + ) + task_state_manager.update_task_state( + { + "status": "inferencing", + "total_count": len(instances), + "finish_count": 0, + "progress_description": "SWEBench infer", + } + ) + + for instance in instances: + process_instance( + instance, + out_dir, + base_config, + progress_manager, + ) + + preds_path = out_dir / "preds.json" + preds = {} + if preds_path.exists(): + with open(preds_path) as f: + preds = json.load(f) + + mkdir_or_exist(osp.dirname(out_path)) + with open(out_path, "w") as f: + for instance_id, rec in preds.items(): + line = json.dumps( + { + "instance_id": instance_id, + "model_name_or_path": rec.get("model_name_or_path", model_abbr), + "model_patch": rec.get("model_patch", ""), + }, + ensure_ascii=False, + ) + f.write(line + "\n") + + if out_dir.exists(): + import shutil + try: + shutil.rmtree(out_dir) + except OSError: + pass + + +def parse_args(): + parser = argparse.ArgumentParser(description="SWEBench Infer") 
+ parser.add_argument("config", help="Config file path") + return parser.parse_args() + + +if __name__ == "__main__": + logger = AISLogger() + args = parse_args() + cfg = Config.fromfile(args.config) + task_state_manager = TaskStateManager( + tmp_path=os.path.join(cfg["work_dir"], "status_tmp"), + task_name=task_abbr_from_cfg(cfg), + is_debug=cfg["cli_args"]["debug"], + ) + manager_t = threading.Thread(target=task_state_manager.launch, args=()) + manager_t.start() + task_state_manager.update_task_state( + { + "status": "start", + "task_log_path": os.path.join( + "logs/infer/", f"{task_abbr_from_cfg(cfg)}.out" + ), + } + ) + start_time = time.perf_counter() + try: + task = SWEBenchInferTask(cfg) + task.run(task_state_manager) + except Exception as e: + task_state_manager.update_task_state({"status": "error"}) + raise + end_time = time.perf_counter() + logger.info("SWEBench infer time: %.2fs", end_time - start_time) + task_state_manager.update_task_state({"status": "finish"}) + manager_t.join() diff --git a/ais_bench/benchmark/utils/config/run.py b/ais_bench/benchmark/utils/config/run.py index a56a88bd..9ef6c712 100644 --- a/ais_bench/benchmark/utils/config/run.py +++ b/ais_bench/benchmark/utils/config/run.py @@ -6,10 +6,29 @@ from ais_bench.benchmark.utils.logging import AISLogger from ais_bench.benchmark.utils.logging.exceptions import AISBenchConfigError from ais_bench.benchmark.utils.logging.error_codes import UTILS_CODES +from ais_bench.benchmark.openicl.icl_prompt_template import PromptTemplate +from ais_bench.benchmark.openicl.icl_retriever import ZeroRetriever +from ais_bench.benchmark.openicl.icl_inferencer import GenInferencer +from ais_bench.benchmark.openicl.icl_evaluator import AccEvaluator logger = AISLogger() def try_fill_in_custom_cfgs(config): + for dataset_cfg in config["datasets"]: + if "infer_cfg" not in dataset_cfg: + logger.debug(f"Filling in infer config for dataset {dataset_cfg['abbr']}") + dataset_cfg["infer_cfg"] = dict( + 
reader_cfg=dict(input_columns=["dummy"], output_column="dummy"), + prompt_template=dict(type=get_config_type(PromptTemplate), template="{dummy}"), + retriever=dict(type=get_config_type(ZeroRetriever)), + inferencer=dict(type=get_config_type(GenInferencer)), + ) + if "eval_cfg" not in dataset_cfg: + logger.debug(f"Filling in eval config for dataset {dataset_cfg['abbr']}") + dataset_cfg["eval_cfg"] = dict( + evaluator=dict(type=get_config_type(AccEvaluator)), + ) + return config @@ -70,4 +89,4 @@ def fill_eval_cfg(cfg, args): new_cfg['eval']['runner']['type'] = get_config_type(LocalRunner) new_cfg['eval']['runner']['max_workers_per_gpu'] = args.max_workers_per_gpu cfg.merge_from_dict(new_cfg) - logger.debug("Evaluation config filled successfully") \ No newline at end of file + logger.debug("Evaluation config filled successfully") diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_lite.py b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py new file mode 100644 index 00000000..b6789dcf --- /dev/null +++ b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py @@ -0,0 +1,53 @@ +from ais_bench.benchmark.datasets import SWEBenchDataset +from ais_bench.benchmark.partitioners import NaivePartitioner +from ais_bench.benchmark.runners import LocalRunner +from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask +from ais_bench.benchmark.summarizers import DefaultSummarizer + +models = [ + dict( + attr="local", + abbr="swebench", + type="LiteLLMChat", + model="", + api_key="", + url="", + batch_size=1, + generation_kwargs=dict(), + ) +] + +datasets = [ + dict( + type=SWEBenchDataset, + abbr="swebench_lite", + path="ais_bench/datasets/SWE-bench_Lite", + name="lite", + split="test", + filter_spec="", + shuffle=False, + prediction_file_extension="jsonl", + ), +] + +summarizer = dict( + attr="accuracy", + type=DefaultSummarizer, +) + + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + 
task=dict(type=SWEBenchInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchEvalTask), + ), +) diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_verified.py b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py new file mode 100644 index 00000000..fb8f6f7f --- /dev/null +++ b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py @@ -0,0 +1,53 @@ +from ais_bench.benchmark.datasets import SWEBenchDataset +from ais_bench.benchmark.partitioners import NaivePartitioner +from ais_bench.benchmark.runners import LocalRunner +from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask +from ais_bench.benchmark.summarizers import DefaultSummarizer + +models = [ + dict( + attr="local", + abbr="swebench", + type="LiteLLMChat", + model="", + api_key="", + url="", + batch_size=1, + generation_kwargs=dict(), + ) +] + +datasets = [ + dict( + type=SWEBenchDataset, + abbr="swebench_verified", + path="ais_bench/datasets/SWE-bench_Verified", + name="verified", + split="test", + filter_spec="", + shuffle=False, + prediction_file_extension="jsonl", + ), +] + +summarizer = dict( + attr="accuracy", + type=DefaultSummarizer, +) + + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchInferTask), + ), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict( + type=LocalRunner, + task=dict(type=SWEBenchEvalTask), + ), +) From 9b774027dc14e52172db6189f3e41b27902d6057 Mon Sep 17 00:00:00 2001 From: "zhanggaohua@huawei.com" Date: Sat, 14 Mar 2026 16:40:00 +0800 Subject: [PATCH 2/2] adapter swe mini agent --- ais_bench/benchmark/datasets/swebench.py | 3 +- ais_bench/benchmark/tasks/swebench_infer.py | 194 +++++++++++++----- ais_bench/benchmark/utils/config/run.py | 4 +- .../swe_bench_examples/swe_bench_lite.py | 16 +- .../swe_bench_examples/swe_bench_verified.py | 6 +- 5 files changed, 156 
insertions(+), 67 deletions(-) diff --git a/ais_bench/benchmark/datasets/swebench.py b/ais_bench/benchmark/datasets/swebench.py index a466e612..594fbfc7 100644 --- a/ais_bench/benchmark/datasets/swebench.py +++ b/ais_bench/benchmark/datasets/swebench.py @@ -45,6 +45,7 @@ def load( split: str = "test", filter_spec: str = "", shuffle: bool = False, + **kwargs, ): if name not in DATASET_MAPPING: raise ParameterValueError( @@ -65,4 +66,4 @@ def load( f"Failed to load swebench dataset {name} from Hugging Face with error: {e}.", ) dataset = self.filter_instances(list(dataset), filter_spec=filter_spec, shuffle=shuffle) - return DatasetDict({"test": Dataset.from_list(dataset)}) + return Dataset.from_list(dataset) diff --git a/ais_bench/benchmark/tasks/swebench_infer.py b/ais_bench/benchmark/tasks/swebench_infer.py index 453d0167..9b14227c 100644 --- a/ais_bench/benchmark/tasks/swebench_infer.py +++ b/ais_bench/benchmark/tasks/swebench_infer.py @@ -1,12 +1,14 @@ import argparse +import concurrent.futures import json import os import os.path as osp import sys import threading import time +import shutil from pathlib import Path -from typing import List +from typing import Any, List, Optional, Tuple from mmengine.config import Config, ConfigDict from mmengine.utils import mkdir_or_exist @@ -25,6 +27,9 @@ def _get_minisweagent_config(model_cfg: ConfigDict) -> dict: """Build mini-swe-agent model config from ais_bench model_cfg (e.g. LiteLLMChat).""" model_name = model_cfg.get("model") or model_cfg.get("model_name") or "" + # LiteLLM requires provider prefix (e.g. 
hosted_vllm/qwen3) for custom API; add it when url is set and name has no / + if model_cfg.get("url") and model_name: + model_name = f"hosted_vllm/{model_name}" model_type = ( getattr(model_cfg.get("type"), "__name__", None) or (model_cfg.get("type", "") if isinstance(model_cfg.get("type"), str) else "") @@ -39,13 +44,14 @@ def _get_minisweagent_config(model_cfg: ConfigDict) -> dict: model_class = "litellm" if "openrouter" in (model_type or "").lower() or "openrouter" in (str(model_cfg.get("type", ""))).lower(): model_class = "openrouter" - return { - "model": { - "model_name": model_name, - "model_class": model_class, - "model_kwargs": model_kwargs, - } + # Avoid cost-calculation errors for local/custom models (e.g. hosted_vllm) not in litellm price map + model_dict = { + "model_name": model_name, + "model_class": model_class, + "model_kwargs": model_kwargs, + "cost_tracking": "ignore_errors", } + return {"model": model_dict} class _AISBenchProgressManager: @@ -89,12 +95,63 @@ def on_instance_end(self, instance_id: str, exit_status: str = None) -> None: } ) + def on_uncaught_exception(self, instance_id: str, exception: Exception) -> None: + self.on_instance_end(instance_id, f"Uncaught {type(exception).__name__}") + + +class _CompositeProgressManager: + """Forwards progress calls to multiple delegates (e.g. 
TaskStateManager + Rich dashboard).""" + + def __init__(self, *delegates: Any): + self._delegates = [d for d in delegates if d is not None] + + def on_instance_start(self, instance_id: str) -> None: + for d in self._delegates: + d.on_instance_start(instance_id) + + def update_instance_status(self, instance_id: str, message: str) -> None: + for d in self._delegates: + d.update_instance_status(instance_id, message) + + def on_instance_end(self, instance_id: str, exit_status: str = None) -> None: + for d in self._delegates: + d.on_instance_end(instance_id, exit_status) + + def on_uncaught_exception(self, instance_id: str, exception: Exception) -> None: + for d in self._delegates: + d.on_uncaught_exception(instance_id, exception) + + +def _make_swebench_progress_manager( + task_state_manager: TaskStateManager, + num_instances: int, +) -> Tuple[Any, Optional[Any]]: + """Build progress manager and optional Rich live display. + + Returns: + (progress_manager, live_render_group or None). + When live_render_group is not None, caller should wrap execution in + Live(live_render_group, refresh_per_second=4). + """ + tsm_manager = _AISBenchProgressManager(task_state_manager, num_instances) + try: + from minisweagent.run.benchmarks.utils.batch_progress import ( + RunBatchProgressManager, + ) + from rich.live import Live + + run_batch_manager = RunBatchProgressManager(num_instances, yaml_report_path=None) + composite = _CompositeProgressManager(tsm_manager, run_batch_manager) + return composite, run_batch_manager.render_group + except ImportError: + return tsm_manager, None + @TASKS.register_module() class SWEBenchInferTask(BaseTask): """SWEBench Inference Task. - Runs mini-swe-agent on SWE-bench instances and writes predictions as JSONL. + Runs mini-swe-agent on SWE-bench instances and writes predictions as JSON. 
""" name_prefix = "SWEBenchInfer" @@ -111,25 +168,14 @@ def get_command(self, cfg_path: str, template: str) -> str: command = f"{python} {script_path} {cfg_path}" return template.format(task_cmd=command) - def get_output_paths(self, file_extension: str = "jsonl") -> List[str]: - paths = [] - for dataset_cfg in self.dataset_cfgs: - paths.append( - get_infer_output_path( - self.model_cfg, - dataset_cfg, - os.path.join(self.work_dir, self.output_subdir), - file_extension=file_extension, - ) - ) - return paths - def run(self, task_state_manager: TaskStateManager): self.task_state_manager = task_state_manager self.logger.info("SWEBenchInferTask %s", task_abbr_from_cfg(self.cfg)) try: from minisweagent.run.benchmarks.swebench import process_instance + from minisweagent.config import get_config_from_spec + from minisweagent.utils.serialize import recursive_merge except ImportError as e: raise ImportError( "SWEBenchInferTask requires mini-swe-agent. " @@ -153,16 +199,31 @@ def run(self, task_state_manager: TaskStateManager): self.model_cfg, dataset_cfg, osp.join(self.work_dir, self.output_subdir), - file_extension="jsonl", + file_extension="json", ) - out_dir = Path(osp.splitext(out_path)[0] + "_tmp") + + + out_dir = Path(osp.splitext(out_path)[0]) out_dir.mkdir(parents=True, exist_ok=True) - base_config = _get_minisweagent_config(self.model_cfg) - base_config.setdefault("environment", {})["environment_class"] = "docker" - base_config.setdefault("agent", {}) + # Load default swebench config (agent.system_template, agent.instance_template, etc.) + # then override with our model so mini-swe-agent gets required AgentConfig fields. + default_swebench_config = get_config_from_spec("swebench.yaml") + our_config = _get_minisweagent_config(self.model_cfg) + model_name = (our_config.get("model") or {}).get("model_name") or "" + if not (model_name or "").strip(): + raise ValueError( + "No model set for SWEBench infer. In your config (e.g. 
swe_bench_lite.py), set " + "models[0]['model'], models[0]['url'], and models[0]['api_key']. " + "Example for local vLLM: model='hosted_vllm/qwen3', url='http://127.0.0.1:2998/v1', api_key='EMPTY'. " + "Or run: mini-extra config setup (to use mini-swe-agent defaults)." + ) + our_config.setdefault("environment", {})["environment_class"] = "docker" + base_config = recursive_merge(default_swebench_config, our_config) + if dataset_cfg.get("step_limit") is not None: + base_config.setdefault("agent", {})["step_limit"] = dataset_cfg["step_limit"] - progress_manager = _AISBenchProgressManager( + progress_manager, live_render_group = _make_swebench_progress_manager( task_state_manager, len(instances) ) task_state_manager.update_task_state( @@ -174,39 +235,58 @@ def run(self, task_state_manager: TaskStateManager): } ) - for instance in instances: - process_instance( - instance, - out_dir, - base_config, - progress_manager, - ) + workers = self.model_cfg.get("batch_size", 1) + + def process_futures(futures): + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except concurrent.futures.CancelledError: + pass + except Exception as e: + instance_id = futures[future] + self.logger.error( + "Error in future for instance %s: %s", + instance_id, + e, + exc_info=True, + ) + progress_manager.on_uncaught_exception(instance_id, e) + + def run_executor(): + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: + futures = { + executor.submit( + process_instance, + instance, + out_dir, + base_config, + progress_manager, + ): instance["instance_id"] + for instance in instances + } + try: + process_futures(futures) + except KeyboardInterrupt: + self.logger.info( + "Cancelling all pending jobs. Press ^C again to exit immediately." 
+ ) + for future in futures: + if not future.running() and not future.done(): + future.cancel() + process_futures(futures) + + if live_render_group is not None: + from rich.live import Live + + with Live(live_render_group, refresh_per_second=4): + run_executor() + else: + run_executor() preds_path = out_dir / "preds.json" - preds = {} if preds_path.exists(): - with open(preds_path) as f: - preds = json.load(f) - - mkdir_or_exist(osp.dirname(out_path)) - with open(out_path, "w") as f: - for instance_id, rec in preds.items(): - line = json.dumps( - { - "instance_id": instance_id, - "model_name_or_path": rec.get("model_name_or_path", model_abbr), - "model_patch": rec.get("model_patch", ""), - }, - ensure_ascii=False, - ) - f.write(line + "\n") - - if out_dir.exists(): - import shutil - try: - shutil.rmtree(out_dir) - except OSError: - pass + shutil.move(preds_path, out_path) def parse_args(): diff --git a/ais_bench/benchmark/utils/config/run.py b/ais_bench/benchmark/utils/config/run.py index 9ef6c712..e7385aff 100644 --- a/ais_bench/benchmark/utils/config/run.py +++ b/ais_bench/benchmark/utils/config/run.py @@ -18,11 +18,13 @@ def try_fill_in_custom_cfgs(config): if "infer_cfg" not in dataset_cfg: logger.debug(f"Filling in infer config for dataset {dataset_cfg['abbr']}") dataset_cfg["infer_cfg"] = dict( - reader_cfg=dict(input_columns=["dummy"], output_column="dummy"), prompt_template=dict(type=get_config_type(PromptTemplate), template="{dummy}"), retriever=dict(type=get_config_type(ZeroRetriever)), inferencer=dict(type=get_config_type(GenInferencer)), ) + if "reader_cfg" not in dataset_cfg: + logger.debug(f"Filling in reader config for dataset {dataset_cfg['abbr']}") + dataset_cfg["reader_cfg"] = dict(input_columns=["dummy"], output_column="dummy") if "eval_cfg" not in dataset_cfg: logger.debug(f"Filling in eval config for dataset {dataset_cfg['abbr']}") dataset_cfg["eval_cfg"] = dict( diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_lite.py 
b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py index b6789dcf..f69b6933 100644 --- a/ais_bench/configs/swe_bench_examples/swe_bench_lite.py +++ b/ais_bench/configs/swe_bench_examples/swe_bench_lite.py @@ -4,15 +4,19 @@ from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask from ais_bench.benchmark.summarizers import DefaultSummarizer +STEP_LIMIT = 100 + +# For local vLLM: set model (e.g. hosted_vllm/qwen3), url (vLLM API base), api_key (e.g. "EMPTY"). +# Example matching: mini-extra swebench -m hosted_vllm/qwen3 -c model.model_kwargs.api_base='"http://127.0.0.1:2998/v1"' ... models = [ dict( attr="local", abbr="swebench", type="LiteLLMChat", - model="", - api_key="", - url="", - batch_size=1, + model="",  # e.g. hosted_vllm/qwen3 for local vLLM + api_key="",  # e.g. "EMPTY" for local vLLM + url="",  # e.g. http://127.0.0.1:2998/v1 (vLLM API base) + batch_size=2, generation_kwargs=dict(), ) ] @@ -21,12 +25,12 @@ dict( type=SWEBenchDataset, abbr="swebench_lite", - path="ais_bench/datasets/SWE-bench_Lite", + path="ais_bench/datasets/SWE-bench_Lite", name="lite", split="test", filter_spec="", shuffle=False, - prediction_file_extension="jsonl", + step_limit=STEP_LIMIT, ), ] diff --git a/ais_bench/configs/swe_bench_examples/swe_bench_verified.py b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py index fb8f6f7f..b9de7821 100644 --- a/ais_bench/configs/swe_bench_examples/swe_bench_verified.py +++ b/ais_bench/configs/swe_bench_examples/swe_bench_verified.py @@ -4,6 +4,8 @@ from ais_bench.benchmark.tasks import SWEBenchInferTask, SWEBenchEvalTask from ais_bench.benchmark.summarizers import DefaultSummarizer +STEP_LIMIT = 100 + models = [ dict( attr="local", @@ -12,7 +14,7 @@ model="", api_key="", url="", - batch_size=1, + batch_size=2, generation_kwargs=dict(), ) ] @@ -24,9 +26,9 @@ path="ais_bench/datasets/SWE-bench_Verified", name="verified", split="test", + step_limit=STEP_LIMIT, filter_spec="", shuffle=False, - prediction_file_extension="jsonl", ), ]