From 6a096a77977bd2ef0cb4db7813a0d8056caa6bd1 Mon Sep 17 00:00:00 2001 From: Zhiyao Jiang Date: Wed, 14 Jan 2026 22:20:33 -0500 Subject: [PATCH 1/7] Integrate Terminal Bench into Miles Co-authored-by: Zhiyao Jiang Co-authored-by: Xinyu Jiang --- examples/eval/eval_delegate.py | 10 + examples/eval/{ => nemo_skills}/README.md | 0 examples/eval/scripts/eval_tb_example.yaml | 29 ++ examples/eval/scripts/run-eval-tb-qwen.sh | 159 +++++++ examples/eval/terminal_bench/README-cn.md | 122 +++++ examples/eval/terminal_bench/README.md | 129 ++++++ examples/eval/terminal_bench/__init__.py | 1 + examples/eval/terminal_bench/requirements.txt | 3 + examples/eval/terminal_bench/tb_client.py | 104 +++++ examples/eval/terminal_bench/tb_config.py | 53 +++ examples/eval/terminal_bench/tb_server.py | 432 ++++++++++++++++++ 11 files changed, 1042 insertions(+) rename examples/eval/{ => nemo_skills}/README.md (100%) create mode 100644 examples/eval/scripts/eval_tb_example.yaml create mode 100644 examples/eval/scripts/run-eval-tb-qwen.sh create mode 100644 examples/eval/terminal_bench/README-cn.md create mode 100644 examples/eval/terminal_bench/README.md create mode 100644 examples/eval/terminal_bench/__init__.py create mode 100644 examples/eval/terminal_bench/requirements.txt create mode 100644 examples/eval/terminal_bench/tb_client.py create mode 100644 examples/eval/terminal_bench/tb_config.py create mode 100644 examples/eval/terminal_bench/tb_server.py diff --git a/examples/eval/eval_delegate.py b/examples/eval/eval_delegate.py index fd6b9878d..1ecabe659 100644 --- a/examples/eval/eval_delegate.py +++ b/examples/eval/eval_delegate.py @@ -91,6 +91,12 @@ def _rebuild_delegate_config( env_cfg = build_skills_eval_env_config(args, env, defaults) if env_cfg is not None: envs.append(env_cfg) + elif env_name == "terminal_bench": + from examples.eval.terminal_bench.tb_config import build_terminal_bench_config + + env_cfg = build_terminal_bench_config(args, env, defaults) + if env_cfg is not None: + envs.append(env_cfg) else: raise ValueError(f"Unknown delegate environment: {env_name}") return envs @@ -151,6 +157,10 @@ def _create_delegate(env_cfg: EvalEnvConfig, router_addr: str): from examples.eval.nemo_skills.skills_client import SkillsEvalClient return SkillsEvalClient.from_config(env_cfg, router_addr) + elif env_name == "terminal_bench": + from examples.eval.terminal_bench.tb_client import TerminalBenchClient + + return TerminalBenchClient.from_config(env_cfg, router_addr) logger.warning("No delegate client registered for environment: %s", env_name) return None diff --git a/examples/eval/README.md b/examples/eval/nemo_skills/README.md similarity index 100% rename from examples/eval/README.md rename to examples/eval/nemo_skills/README.md diff --git a/examples/eval/scripts/eval_tb_example.yaml b/examples/eval/scripts/eval_tb_example.yaml new file mode 100644 index 000000000..2e2308981 --- /dev/null +++ b/examples/eval/scripts/eval_tb_example.yaml @@ -0,0 +1,29 @@ +eval: + defaults: + n_samples_per_eval_prompt: 1 + temperature: 0.6 + top_p: 0.95 + top_k: -1 + max_response_len: 24576 + datasets: # these eval tasks go through miles dataset config and default rollout function (miles.rollout.sglang_rollout.generate_rollout) + - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa + path: /root/gpqa/gpqa_eval.jsonl + rm_type: gpqa + n_samples_per_eval_prompt: 2 + - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir 
/root/ifbench + path: /root/ifbench/IFBench_eval.jsonl + rm_type: ifbench + n_samples_per_eval_prompt: 1 + delegate: + - name: terminal_bench + url: http://172.17.0.1:9051 # Port must match the TB server running on the host machine + timeout_secs: 86400 # 24 hours + max_retries: 1 # HTTP request retries from Miles to the TB server + model_name: qwen3-8b + api_base: http://127.0.0.1:30005/v1 # Port must match the sglang router port set in run-eval-tb-qwen.sh + dataset_path: /mnt/data/xinyu/program/miles-tb/terminal-bench/tasks # Dataset path on the host machine + # task_ids: + # - hello-world + # n_tasks: 10 + n_attempts: 1 # TB task-level retries (per task within tb run) + n_concurrent: 8 \ No newline at end of file diff --git a/examples/eval/scripts/run-eval-tb-qwen.sh b/examples/eval/scripts/run-eval-tb-qwen.sh new file mode 100644 index 000000000..471a59d56 --- /dev/null +++ b/examples/eval/scripts/run-eval-tb-qwen.sh @@ -0,0 +1,159 @@ +#!/bin/bash + +# Example launcher that reuses the Qwen3-8B recipe but delegates evaluation to an +# external Terminal Bench server via the eval_delegate_rollout wrapper. + +# Clean up any stale processes from a previous run. +pkill -9 sglang +sleep 3 +ray stop --force +pkill -9 ray +pkill -9 python +sleep 3 +pkill -9 ray +pkill -9 python + +set -ex + +export PYTHONBUFFERED=16 +export MILES_HOST_IP=${MILES_HOST_IP:-"127.0.0.1"} + +MODEL_DIR="${MODEL_DIR:-/root/.cache}" +export MODEL_DIR + +NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) +if [ "$NVLINK_COUNT" -gt 0 ]; then + HAS_NVLINK=1 +else + HAS_NVLINK=0 +fi +echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." &>/dev/null && pwd)" +source "${REPO_ROOT}/scripts/models/qwen3-8B.sh" + +# Store eval/delegate settings in a YAML config similar to examples/eval_multi_task. 
+EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/eval_tb_example.yaml"} + +CKPT_ARGS=( + --hf-checkpoint ${MODEL_DIR}/OpenThinker-Agent-v1 # huggingface-cli download open-thoughts/OpenThinker-Agent-v1 + --ref-load ${MODEL_DIR}/OpenThinker-Agent-v1_torch_dist + # --load ${MODEL_DIR}/OpenThinker-Agent-v1_miles/ + --save ${MODEL_DIR}/OpenThinker-Agent-v1_miles/ + --save-interval 20 +) + +ROLLOUT_ARGS=( + --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl + --input-key prompt + --label-key label + --apply-chat-template + --rollout-shuffle + --rm-type deepscaler + --num-rollout 3000 + --rollout-batch-size 32 + --n-samples-per-prompt 8 + --rollout-max-response-len 8192 + --rollout-temperature 0.8 + --global-batch-size 256 + --balance-data +) + +EVAL_ARGS=( + --eval-interval 5 + --eval-config "${EVAL_CONFIG_PATH}" + --eval-function-path examples.eval.eval_delegate_rollout.generate_rollout +) + +PERF_ARGS=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + + --use-dynamic-batch-size + --max-tokens-per-gpu 9216 +) + +GRPO_ARGS=( + --advantage-estimator grpo + --use-kl-loss + --kl-loss-coef 0.00 + --kl-loss-type low_var_kl + --entropy-coef 0.00 + --eps-clip 0.2 + --eps-clip-high 0.28 +) + +OPTIMIZER_ARGS=( + --optimizer adam + --lr 1e-6 + --lr-decay-style constant + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.98 +) + +WANDB_ARGS=( + --use-wandb + --wandb-project miles-eval + --wandb-group qwen3-8b-eval + --wandb-key ${WANDB_KEY} # export WANDB_KEY="your_key" +) + +SGLANG_ARGS=( + --rollout-num-gpus-per-engine 1 + --sglang-mem-fraction-static 0.7 + --sglang-router-port 30005 +) + +MISC_ARGS=( + --attention-dropout 0.0 + --hidden-dropout 0.0 + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + --attention-backend flash +) + +export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +export CUDA_VISIBLE_DEVICES=0,1 + +ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 2 \ + --disable-usage-stats \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8266 \ + --dashboard-agent-listen-port 52366 \ + --dashboard-agent-grpc-port 52367 \ + --runtime-env-agent-port 52368 + + +RUNTIME_ENV_JSON="{ + \"env_vars\": { + \"PYTHONPATH\": \"/root/Megatron-LM/\", + \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\" + } +}" + +ray job submit --address="http://${MASTER_ADDR}:8266" \ + --working-dir "${REPO_ROOT}" \ + --runtime-env-json="${RUNTIME_ENV_JSON}" \ + -- python3 train.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node 2 \ + --colocate \ + ${MODEL_ARGS[@]} \ + ${CKPT_ARGS[@]} \ + ${ROLLOUT_ARGS[@]} \ + ${OPTIMIZER_ARGS[@]} \ + ${GRPO_ARGS[@]} \ + ${WANDB_ARGS[@]} \ + ${PERF_ARGS[@]} \ + ${EVAL_ARGS[@]} \ + ${SGLANG_ARGS[@]} \ + ${MISC_ARGS[@]} diff --git a/examples/eval/terminal_bench/README-cn.md b/examples/eval/terminal_bench/README-cn.md new file mode 100644 index 000000000..e6b6b3048 --- /dev/null +++ b/examples/eval/terminal_bench/README-cn.md @@ -0,0 +1,122 @@ +# Terminal Bench 评估集成 + +本目录将 Terminal Bench (TB) 封装为 Miles 的评估委托(Eval Delegate)。评估过程在宿主机(Host)上通过 `tb` CLI 执行,Miles 负责读取并汇总各项指标,包括 `accuracy`、`n_resolved`、`n_unresolved`、`pass_at_k/*` 以及 Token 统计数据(如 `total_input_tokens_mean/median` 和 `total_output_tokens_mean/median`)。 + +## 运行架构 + +* **Miles 内部**:运行训练/评估主循环;调用 TB delegate client。 +* **宿主机(Host)**:运行 TB delegate server 
(`tb_server.py`),由其执行 `tb run ...`。 +* **Server逻辑**:读取最新的 TB JSON 结果并将各项指标返回给 Miles。 + +## 1) 获取代码 (宿主机) + +```bash +mkdir miles-tb +cd miles-tb +git clone https://github.com/radixark/miles.git +git clone https://github.com/laude-institute/terminal-bench +``` + +## 2) 启动 Miles 容器 + +```bash +docker run \ + -itd \ + --gpus all \ + --shm-size 32g \ + --network host \ + --ipc=host \ + --privileged \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + -v /mnt/data/.cache:/root/.cache \ + -v $(pwd):/shared/miles-tb \ + --name \ + radixark/miles:latest \ + /bin/bash +``` + +## 3) 进入 Miles 容器 + +```bash +docker exec -it /bin/bash +``` + +## 4) 配置 Terminal Bench 环境 (宿主机) + +在运行 `tb_server.py` 的宿主机上执行: + +```bash +# 在宿主机终端执行(非 Docker 内部) +uv venv --python 3.13 .venv +source .venv/bin/activate +uv pip install terminal-bench/. +uv pip install -r miles/examples/eval/terminal_bench/requirements.txt +``` + +*如果仓库路径不是 `./miles` 和 `./terminal-bench`,请根据实际路径调整。* + +## 5) 启动 Terminal Bench server + +在宿主机上启动(即 `tb` 命令可用的环境): + +```bash +python miles/examples/eval/terminal_bench/tb_server.py \ + --host 0.0.0.0 --port 9051 \ + --output-root tb_eval_output +``` + +**该脚本的功能:** + +* 默认设置 `OPENAI_API_KEY=EMPTY`。 +* 执行 `tb run -a terminus-2 -m openai/ ... --n-concurrent 8`。 +* 等待运行完成后,返回 `accuracy`、`pass_at_k` 以及 Token 消耗等统计数据。 + +## 6) 运行评估脚本 (示例) + +如果使用提供的 Qwen 评估启动脚本 (`run-eval-tb-qwen.sh`),请按以下步骤操作: + +**更新路径**:将 `eval_tb_example.yaml` 中的 `dataset_path` 修改为宿主机上 `terminal-bench/tasks` 的**绝对路径**(注意不是 Docker 内部路径)。 + +**下载模型**:在 Miles 容器内下载 HuggingFace 权重: +```bash +huggingface-cli download open-thoughts/OpenThinker-Agent-v1 \ +--local-dir /root/.cache/OpenThinker-Agent-v1 +``` + +**格式转换**:将 HuggingFace 权重转换为 Miles 的 torch distributed 格式。在 Miles 根目录下执行: +```bash +cd /shared/miles-tb/miles +source scripts/models/qwen3-8B.sh + +export PYTHONPATH=/root/Megatron-LM:/shared/miles-tb/miles + +python tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS[@]} \ + --hf-checkpoint /root/.cache/OpenThinker-Agent-v1 \ + --save /root/.cache/OpenThinker-Agent-v1_torch_dist +``` + +**开始评估**:在 Miles 容器内运行: +```bash +bash miles/examples/eval/scripts/run-eval-tb-qwen.sh 2>&1 | tee run.log +``` + +*为了快速测试,可以在 `eval_tb_example.yaml` 中通过 `task_ids` 指定特定任务,或通过 `n_tasks` 限制评估任务的数量。* + +## 7) 常见问题 + +当在 Docker 容器中使用 `--network host` 运行 Miles 时,Ray 可能由于与宿主机共享网络而出现端口冲突。 + +这会导致 Ray 启动失败,或报 Redis/会话相关错误。通常可以在启动 Ray head 时显式指定未占用端口来解决,比如设置非默认的 `--port` 和 `--dashboard-port`。 + +有时甚至会导致 Ray job 提交失败,提示没有可用 agent 接受任务。这通常是 dashboard agent 或 runtime env agent 的端口也发生冲突。此时可在启动 Ray 时指定这些端口(如 `--dashboard-agent-listen-port`、`--dashboard-agent-grpc-port`、`--runtime-env-agent-port`)来解决。 + +如果 TB server无法通过 sglang router 连接到 Miles(`InternalServerError`),请检查 router 端口(例如 30005)实际监听的地址,并更新 `eval_tb_example.yaml` 中的 `api_base`: + +```bash +ss -lntp | grep 30005 +``` + +TB server开始接受请求后,可能会在输出中看到 `Parser warnings`、`Context length exceeded`、`Command 1 should end with newline`、`Harness execution failed`等。这些是Terminal Bench 的警告,如果正常运行可以忽略。 \ No newline at end of file diff --git a/examples/eval/terminal_bench/README.md b/examples/eval/terminal_bench/README.md new file mode 100644 index 000000000..341e543fc --- /dev/null +++ b/examples/eval/terminal_bench/README.md @@ -0,0 +1,129 @@ +# Terminal Bench Eval + +This folder wires Terminal Bench (TB) into Miles as an eval delegate. 
The TB run happens on the host via the `tb` CLI, and Miles reads back aggregated metrics such as `accuracy`, `n_resolved`, `n_unresolved`, `pass_at_k/*`, and token stats like `total_input_tokens_mean/median` and `total_output_tokens_mean/median`. + +## What runs where + +- Miles runs your training/eval loop inside the Docker container. +- Miles calls the TB delegate client. +- The TB delegate server (`tb_server.py`) runs `tb run ...` on the host. +- The server reads the latest TB JSON results and returns metrics to Miles. + +## 1) Get the code (host) + +```bash +mkdir miles-tb +cd miles-tb +git clone https://github.com/radixark/miles.git +git clone https://github.com/laude-institute/terminal-bench +``` + +## 2) Launch the Miles container + +```bash +docker run \ + -itd \ + --gpus all \ + --shm-size 32g \ + --network host \ + --ipc=host \ + --privileged \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + -v /mnt/data/.cache:/root/.cache \ + -v $(pwd):/shared/miles-tb \ + --name \ + radixark/miles:latest \ + /bin/bash +``` + +## 3) Inside the Miles container + +```bash +docker exec -it /bin/bash +``` + +## 4) Terminal Bench environment (host) + +Run on the machine that will host `tb_server.py` (where you cloned both repos): + +```bash +# Host machine terminal (outside Docker) +uv venv --python 3.13 .venv +source .venv/bin/activate + +uv pip install terminal-bench/. +uv pip install -r miles/examples/eval/terminal_bench/requirements.txt +``` + +Notes: +- Use your local repo paths if they are not `./miles` and `./terminal-bench`. + +## 5) Start the Terminal Bench server + +Run on the host (same machine where `tb` works): + +```bash +python miles/examples/eval/terminal_bench/tb_server.py \ + --host 0.0.0.0 --port 9051 \ + --output-root tb_eval_output +``` + +What it does: +- Uses `OPENAI_API_KEY=EMPTY` +- Runs `tb run -a terminus-2 -m openai/ ... --n-concurrent 8` +- Waits for completion, then returns `accuracy`, `n_resolved`, + `n_unresolved`, `pass_at_k/*`, and token stats such as + `total_input_tokens_mean/median` and `total_output_tokens_mean/median` + +## 6) Run the eval script (example) + +If you use the provided Qwen eval launcher (`run-eval-tb-qwen.sh`), follow the steps below to run Terminal-Bench evaluation. + +First, update the `dataset_path` in `eval_tb_example.yaml` to the local path of `terminal-bench/tasks` on your host (not an internal Docker-only path). + +Then download the HuggingFace model checkpoint inside the Miles container: + +```bash +huggingface-cli download open-thoughts/OpenThinker-Agent-v1 \ +--local-dir /root/.cache/OpenThinker-Agent-v1 +``` + +After downloading, convert the HuggingFace checkpoint to Miles's torch distributed format. From the Miles root directory, run: + +```bash +cd /shared/miles-tb/miles +source scripts/models/qwen3-8B.sh + +export PYTHONPATH=/root/Megatron-LM:/shared/miles-tb/miles + +python tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS[@]} \ + --hf-checkpoint /root/.cache/OpenThinker-Agent-v1 \ + --save /root/.cache/OpenThinker-Agent-v1_torch_dist +``` + +Finally, run the following command inside the Miles container: + +```bash +bash miles/examples/eval/scripts/run-eval-tb-qwen.sh 2>&1 | tee run.log +``` + +For convenience, you can restrict the evaluation scope in `eval_tb_example.yaml`, either by specifying a single task or multiple tasks (`task_ids`), or by limiting the number of tasks via `n_tasks`. 
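
Before launching the full Miles run, it can help to smoke-test the delegate server directly over HTTP. The sketch below is a minimal example, assuming the server from this folder is already running on the host at port 9051, that an OpenAI-compatible endpoint (e.g. the sglang router) is reachable at `api_base`, and that the dataset path and task id are placeholders for your own setup. It exercises the same `/evaluate` and `/status/<job_id>` endpoints that `tb_client.py` uses.

```python
# Minimal smoke test for the TB delegate server in this folder.
# Assumptions: tb_server.py is listening on 127.0.0.1:9051, and the
# dataset path / task id / model endpoint below are placeholders.
import time

import requests

SERVER = "http://127.0.0.1:9051"  # match --port passed to tb_server.py

payload = {
    "model_name": "qwen3-8b",                         # server prefixes this with openai/
    "api_base": "http://127.0.0.1:30005/v1",          # sglang router (or any OpenAI-compatible endpoint)
    "dataset_path": "/path/to/terminal-bench/tasks",  # placeholder host path
    "task_ids": ["hello-world"],                      # keep the smoke test small
    "n_concurrent": 1,
}

# Submit the run; the server queues it and returns a job id immediately.
resp = requests.post(f"{SERVER}/evaluate", json=payload, timeout=30)
resp.raise_for_status()
job = resp.json()
print("queued:", job["job_id"], "->", job["command"])

# Poll /status/<job_id> until the run completes or fails.
while True:
    status = requests.get(f"{SERVER}/status/{job['job_id']}", timeout=30).json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(10)

print(status.get("raw_metrics") or status.get("error"))
```

A `GET /health` request against the same server should return `{"status": "ok"}` if it is up and reachable from the Miles container.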
+ +## 7) Common Issues + +When running Miles inside a Docker container with `--network host`, Ray may encounter port conflicts due to shared networking with the host. + +In some cases, this manifests as Ray failing to start or reporting Redis- or session-related errors. This can usually be resolved by explicitly assigning unused ports when starting the Ray head node, for example by setting a non-default `--port` and `--dashboard-port`. + +In more severe cases, Ray job submission may fail with errors indicating that no available agent can accept jobs. This typically happens when the dashboard agent or runtime environment agent ports are also in conflict. In such situations, explicitly specifying the agent-related ports (e.g. `--dashboard-agent-listen-port`, `--dashboard-agent-grpc-port`, and `--runtime-env-agent-port`) when starting Ray can resolve the issue. + +If the TB server cannot connect to the Miles server through the sglang router (`InternalServerError`), check which address is actually listening on the router port (e.g. 30005 in this example) and update the `api_base` in `eval_tb_example.yaml` accordingly: + +```bash +ss -lntp | grep 30005 +``` + +You may see `Parser warnings`, `Context length exceeded`, `Command 1 should end with newline`, `Harness execution failed` in `tb_server.py` logs. They are warnings from Terminal Bench and can be ignored if runs proceed normally. \ No newline at end of file diff --git a/examples/eval/terminal_bench/__init__.py b/examples/eval/terminal_bench/__init__.py new file mode 100644 index 000000000..6d2704250 --- /dev/null +++ b/examples/eval/terminal_bench/__init__.py @@ -0,0 +1 @@ +"""Terminal Bench evaluation helpers.""" diff --git a/examples/eval/terminal_bench/requirements.txt b/examples/eval/terminal_bench/requirements.txt new file mode 100644 index 000000000..1a0006c93 --- /dev/null +++ b/examples/eval/terminal_bench/requirements.txt @@ -0,0 +1,3 @@ +flask +omegaconf +requests diff --git a/examples/eval/terminal_bench/tb_client.py b/examples/eval/terminal_bench/tb_client.py new file mode 100644 index 000000000..2a93b7161 --- /dev/null +++ b/examples/eval/terminal_bench/tb_client.py @@ -0,0 +1,104 @@ +import logging +import time +from typing import Any + +import requests +from examples.eval.eval_delegate import EvalClient, EvalDelegateError +from examples.eval.terminal_bench.tb_config import TerminalBenchConfig + +logger = logging.getLogger(__name__) + + +class TerminalBenchClient(EvalClient): + """HTTP client that proxies evaluation requests to the Terminal Bench server.""" + + def __init__(self, config: TerminalBenchConfig, router_url: str): + super().__init__(config.name or "terminal_bench") + self._config = config + endpoint = (config.url or "").rstrip("/") + if endpoint.endswith("/evaluate"): + base_endpoint = endpoint[: -len("/evaluate")] + else: + base_endpoint = endpoint + self._endpoint = f"{base_endpoint}/evaluate" if base_endpoint else "" + self._status_endpoint = f"{base_endpoint}/status" if base_endpoint else "" + self._timeout_secs = float(config.timeout_secs) + self._max_retries = max(1, int(config.max_retries)) + self._headers = dict(config.headers or {}) + self._session = requests.Session() + + @classmethod + def from_config(cls, config: TerminalBenchConfig, router_url: str): + if not config.url: + return None + return cls(config, router_url) + + def evaluate(self, args, rollout_id: int) -> tuple[dict[str, Any], dict[str, Any]]: + payload = self._build_payload(args, rollout_id) + response = self._request(payload) + metrics = 
response.get("raw_metrics", {}) + return metrics, response + + def _build_payload(self, args, rollout_id: int) -> dict[str, Any]: + payload = { + "model_name": self._config.model_name, + "api_base": self._config.api_base, + "n_tasks": self._config.n_tasks, + "n_concurrent": self._config.n_concurrent, + "metric_prefix": self._config.name, + } + if self._config.dataset_path: + payload["dataset_path"] = self._config.dataset_path + if self._config.task_ids: + payload["task_ids"] = list(self._config.task_ids) + if self._config.n_attempts is not None: + payload["n_attempts"] = self._config.n_attempts + return payload + + def _request(self, payload: dict[str, Any]) -> dict[str, Any]: + last_error: Exception | None = None + for attempt in range(1, self._max_retries + 1): + try: + response = self._session.post( + self._endpoint, + json=payload, + timeout=self._timeout_secs, + headers=self._headers, + ) + response.raise_for_status() + if not response.content: + return {} + body = response.json() + if body.get("status") == "completed": + return body + job_id = body.get("job_id") + if not job_id: + return body + return self._poll_status(job_id) + except requests.RequestException as exc: + last_error = exc + logger.warning( + "Terminal Bench delegate request failed (attempt %s/%s): %s", attempt, self._max_retries, exc + ) + if attempt < self._max_retries: + time.sleep(min(2**attempt, 30)) + raise EvalDelegateError("Terminal Bench evaluation request failed") from last_error + + def _poll_status(self, job_id: str) -> dict[str, Any]: + status_url = f"{self._status_endpoint}/{job_id}" + deadline = time.time() + self._timeout_secs + while time.time() < deadline: + response = self._session.get(status_url, timeout=min(self._timeout_secs, 30), headers=self._headers) + response.raise_for_status() + if not response.content: + time.sleep(2) + continue + body = response.json() + status = body.get("status") + if status == "completed": + return body + if status == "failed": + error = body.get("error") or "Terminal Bench job failed" + raise EvalDelegateError(error) + time.sleep(2) + raise EvalDelegateError("Terminal Bench evaluation timed out") diff --git a/examples/eval/terminal_bench/tb_config.py b/examples/eval/terminal_bench/tb_config.py new file mode 100644 index 000000000..adb4f2c30 --- /dev/null +++ b/examples/eval/terminal_bench/tb_config.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Any + +from examples.eval.eval_delegate import EvalEnvConfig + + +@dataclass +class TerminalBenchConfig(EvalEnvConfig): + """Environment configuration shared by the Terminal Bench client/server.""" + + model_name: str = "qwen3-8b" + api_base: str = "http://127.0.1.1:30001/v1" + dataset_path: str | None = None + n_tasks: int | None = None + task_ids: list[str] = field(default_factory=list) + n_attempts: int | None = None + n_concurrent: int = 8 + + @classmethod + def parse(cls, args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, Any]) -> TerminalBenchConfig: + clean_raw = dict(raw_env_config or {}) + clean_raw.pop("type", None) + base_cfg: TerminalBenchConfig = super().parse(clean_raw, defaults) + + field_casts = { + "model_name": str, + "api_base": str, + "n_attempts": int, + "n_tasks": int, + "n_concurrent": int, + "dataset_path": str, + } + + for key, caster in field_casts.items(): + value = clean_raw.get(key) + if value is not None: + setattr(base_cfg, key, caster(value)) + + task_ids = 
clean_raw.get("task_ids") + if isinstance(task_ids, (list, tuple)): + base_cfg.task_ids = [str(item) for item in task_ids if item] + elif task_ids is not None: + raise ValueError("task_ids must be a list") + + return base_cfg + + + +def build_terminal_bench_config(args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, Any]): + return TerminalBenchConfig.parse(args, raw_env_config, defaults) diff --git a/examples/eval/terminal_bench/tb_server.py b/examples/eval/terminal_bench/tb_server.py new file mode 100644 index 000000000..8085c54d2 --- /dev/null +++ b/examples/eval/terminal_bench/tb_server.py @@ -0,0 +1,432 @@ +#!/usr/bin/env python3 +""" +Simple HTTP server that proxies Miles evaluation requests to the `tb run` +command shipped with Terminal Bench. + +Usage: + python examples/eval/terminal_bench/tb_server.py \ + --host 0.0.0.0 --port 9050 \ + --output-root /opt/tb-eval + +Miles (or Miles-compatible runners) should POST the payload described in +`EvalRequestPayload` to http://:/evaluate. The server blocks until +`tb run` finishes, then returns aggregated metrics along with paths to the +generated artifacts (logs + raw metrics). +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import shlex +import subprocess +import sys +import threading +import time +import uuid +import statistics +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[3] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from flask import Flask, jsonify, request +from omegaconf import OmegaConf +from omegaconf.errors import OmegaConfBaseException + +logger = logging.getLogger("terminal_bench_server") +logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") + + +# --------------------------------------------------------------------------- +# Request payload helpers +# --------------------------------------------------------------------------- + + +@dataclass +class EvalRequestPayload: + model_name: str = "" + api_base: str = "" + n_tasks: int | None = None + n_concurrent: int | None = None + dataset_path: str | None = None + task_ids: list[str] | None = None + n_attempts: int | None = None + metric_prefix: str | None = None + + +@dataclass +class JobRecord: + job_id: str + status: str + run_id: str + command: str + output_dir: str + log_path: str + raw_metrics: dict[str, Any] | None = None + error: str | None = None + created_at: float = field(default_factory=time.time) + started_at: float | None = None + finished_at: float | None = None + + def to_dict(self) -> dict[str, Any]: + payload: dict[str, Any] = { + "job_id": self.job_id, + "status": self.status, + "run_id": self.run_id, + "command": self.command, + "output_dir": self.output_dir, + "log_path": self.log_path, + "created_at": self.created_at, + "started_at": self.started_at, + "finished_at": self.finished_at, + } + if self.raw_metrics is not None: + payload["raw_metrics"] = self.raw_metrics + if self.error: + payload["error"] = self.error + return payload + + +# --------------------------------------------------------------------------- +# Configuration + command helpers +# --------------------------------------------------------------------------- + + +def _normalize_model_name(model_name: str) -> str: + name = (model_name or "").strip() + if not name: + return "" + if "/" in name: + return name + return f"openai/{name}" + + +@dataclass +class ServerConfig: + 
output_root: Path + + @classmethod + def from_args(cls, args: argparse.Namespace) -> "ServerConfig": + return cls(output_root=Path(args.output_root).expanduser().resolve()) + + +class TerminalBenchEvaluator: + def __init__(self, config: ServerConfig): + self._config = config + self._lock = threading.Lock() + self._jobs_lock = threading.Lock() + self._jobs: dict[str, JobRecord] = {} + self._config.output_root.mkdir(parents=True, exist_ok=True) + self._log_root = REPO_ROOT.parent / "tb_eval_logs" + self._log_root.mkdir(parents=True, exist_ok=True) + + def evaluate(self, payload: EvalRequestPayload) -> dict[str, Any]: + if not payload.model_name: + raise ValueError("Missing `model_name` in request payload.") + if not payload.api_base: + raise ValueError("Missing `api_base` in request payload.") + + job_id = uuid.uuid4().hex + run_id = f"{int(time.time())}-{job_id[:8]}" + run_dir = self._config.output_root / run_id + + command = self._build_command(payload, run_id) + command_str = " ".join(shlex.quote(part) for part in command) + log_path = self._log_root / f"{run_id}.log" + + record = JobRecord( + job_id=job_id, + status="queued", + run_id=run_id, + command=command_str, + output_dir=str(run_dir), + log_path=str(log_path), + ) + with self._jobs_lock: + self._jobs[job_id] = record + + thread = threading.Thread( + target=self._run_job, + args=(job_id, payload, run_dir, command, log_path), + daemon=True, + ) + thread.start() + + return { + "job_id": job_id, + "status": "queued", + "status_url": f"/status/{job_id}", + "run_id": run_id, + "command": command_str, + "output_dir": str(run_dir), + "log_path": str(log_path), + } + + def _run_job( + self, + job_id: str, + payload: EvalRequestPayload, + run_dir: Path, + command: list[str], + log_path: Path, + ) -> None: + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return + record.status = "running" + record.started_at = time.time() + + env = self._build_env() + logger.info("Starting Terminal Bench run: %s", " ".join(shlex.quote(part) for part in command)) + try: + with self._lock: + self._run_command(command, env=env, log_path=log_path) + metrics = self._collect_metrics(run_dir) + if payload.metric_prefix: + metrics = {payload.metric_prefix: metrics} + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return + record.status = "completed" + record.raw_metrics = metrics + record.finished_at = time.time() + except Exception as exc: # noqa: BLE001 + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return + record.status = "failed" + record.error = str(exc) + record.finished_at = time.time() + + def get_job_status(self, job_id: str) -> dict[str, Any] | None: + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return None + return record.to_dict() + + def _build_command(self, payload: EvalRequestPayload, run_id: str) -> list[str]: + # 1. Normalize model name (add openai/ prefix) + model_name = _normalize_model_name(payload.model_name) + + cmd = [ + "tb", + "run", + "-a", + "terminus-2", # Added Agent flag + "--output-path", + str(self._config.output_root), + "--run-id", + run_id, + ] + + # 2. Add model + if model_name: + cmd.extend(["--model", model_name]) + + # 3. 
Add Agent kwargs (Use api_base exactly like the CLI command) + if payload.api_base: + cmd.extend(["--agent-kwarg", f"api_base={payload.api_base}"]) + + if payload.dataset_path: + cmd.extend(["--dataset-path", payload.dataset_path]) + + if payload.n_attempts is not None: + cmd.extend(["--n-attempts", str(payload.n_attempts)]) + + # 4. Add n_tasks if present + task_ids = [] + if payload.task_ids: + task_ids.extend([str(item) for item in payload.task_ids if item]) + if task_ids: + for task_id in task_ids: + cmd.extend(["--task-id", task_id]) + elif payload.n_tasks is not None: + cmd.extend(["--n-tasks", str(payload.n_tasks)]) + + # 5. Add concurrency + n_concurrent = payload.n_concurrent + if n_concurrent is None: + n_concurrent = 1 + cmd.extend(["--n-concurrent", str(n_concurrent)]) + + return cmd + + def _build_env(self) -> dict[str, str]: + env = os.environ.copy() + # Inject env var to simulate "OPENAI_API_KEY=EMPTY" + env["OPENAI_API_KEY"] = "EMPTY" + return env + + @staticmethod + def _run_command(cmd: list[str], *, env: dict[str, str], log_path: Path): + with open(log_path, "w", encoding="utf-8") as log_file: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=env, + text=True, + bufsize=1, + ) + assert process.stdout is not None + for line in process.stdout: + log_file.write(line) + log_file.flush() + sys.stdout.write(line) + sys.stdout.flush() + retcode = process.wait() + if retcode != 0: + with open(log_path, encoding="utf-8", errors="ignore") as log_file: + tail = "".join(log_file.readlines()[-200:]) + raise RuntimeError(f"`tb run` failed with exit code {retcode}. See {log_path}\n{tail}") + + @staticmethod + def _collect_metrics(run_dir: Path) -> dict[str, Any]: + metrics_path = run_dir / "results.json" + if not metrics_path.exists(): + logger.warning("Results file missing at %s", metrics_path) + return {} + + metrics = TerminalBenchEvaluator._extract_metrics(metrics_path) + if not metrics: + logger.warning("No accuracy/n_resolved metrics found in %s", metrics_path) + return metrics + + @staticmethod + def _extract_metrics(metrics_path: Path) -> dict[str, Any]: + try: + with open(metrics_path, encoding="utf-8") as fp: + metrics_data = json.load(fp) + except json.JSONDecodeError as exc: + logger.warning("Failed to parse %s: %s", metrics_path, exc) + return {} + + metrics: dict[str, Any] = {} + + # core metrics + accuracy = metrics_data.get("accuracy") + if isinstance(accuracy, (int, float)): + metrics["accuracy"] = float(accuracy) + + n_resolved = metrics_data.get("n_resolved") + if isinstance(n_resolved, (int, float)): + metrics["n_resolved"] = int(n_resolved) + + n_unresolved = metrics_data.get("n_unresolved") + if isinstance(n_unresolved, (int, float)): + metrics["n_unresolved"] = int(n_unresolved) + + # pass@k flatten + pass_at_k = metrics_data.get("pass_at_k") + if isinstance(pass_at_k, dict): + for k, v in pass_at_k.items(): + if isinstance(v, (int, float)): + metrics[f"pass_at_k/{k}"] = float(v) + + # token stats from per-task results + results = metrics_data.get("results") + if isinstance(results, list): + input_tokens = [ + r.get("total_input_tokens") + for r in results + if isinstance(r, dict) and isinstance(r.get("total_input_tokens"), (int, float)) + ] + output_tokens = [ + r.get("total_output_tokens") + for r in results + if isinstance(r, dict) and isinstance(r.get("total_output_tokens"), (int, float)) + ] + + if input_tokens: + metrics["total_input_tokens_mean"] = float(statistics.mean(input_tokens)) + 
metrics["total_input_tokens_median"] = float(statistics.median(input_tokens)) + if output_tokens: + metrics["total_output_tokens_mean"] = float(statistics.mean(output_tokens)) + metrics["total_output_tokens_median"] = float(statistics.median(output_tokens)) + + return metrics + + +# --------------------------------------------------------------------------- +# HTTP server +# --------------------------------------------------------------------------- + + +def build_app(evaluator: TerminalBenchEvaluator) -> Flask: + app = Flask(__name__) + + @app.get("/health") + def health_check(): + return jsonify({"status": "ok"}) + + @app.post("/evaluate") + def evaluate_endpoint(): + try: + raw_payload = request.get_json(force=True, silent=False) + cfg = OmegaConf.merge( + OmegaConf.structured(EvalRequestPayload), + OmegaConf.create(raw_payload or {}), + ) + payload = OmegaConf.to_object(cfg) + result = evaluator.evaluate(payload) + return jsonify(result) + except OmegaConfBaseException as exc: + logger.exception("Invalid request payload") + return jsonify({"error": str(exc)}), 400 + except Exception as exc: # noqa: BLE001 + logger.exception("Evaluation failed") + return jsonify({"error": str(exc)}), 500 + + @app.get("/status/") + def status_endpoint(job_id: str): + status = evaluator.get_job_status(job_id) + if status is None: + return jsonify({"error": "job not found"}), 404 + return jsonify(status) + + return app + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run the Terminal Bench evaluation HTTP server.") + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default=9050) + parser.add_argument( + "--output-root", + type=str, + default="./terminal-bench-output", + help="Directory to store `tb run` outputs.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + config = ServerConfig.from_args(args) + evaluator = TerminalBenchEvaluator(config) + app = build_app(evaluator) + logger.info( + "Starting Terminal Bench evaluation server on %s:%s (output root=%s)", + args.host, + args.port, + config.output_root, + ) + app.run(host=args.host, port=args.port) + + +if __name__ == "__main__": + main() From 592fd6aeb761f0d34049b00040507d1a61cf0584 Mon Sep 17 00:00:00 2001 From: Zhiyao Jiang Date: Sat, 17 Jan 2026 04:24:34 +0000 Subject: [PATCH 2/7] update readme in terminal-bench eval --- examples/eval/terminal_bench/README-cn.md | 122 ---------------------- 1 file changed, 122 deletions(-) delete mode 100644 examples/eval/terminal_bench/README-cn.md diff --git a/examples/eval/terminal_bench/README-cn.md b/examples/eval/terminal_bench/README-cn.md deleted file mode 100644 index e6b6b3048..000000000 --- a/examples/eval/terminal_bench/README-cn.md +++ /dev/null @@ -1,122 +0,0 @@ -# Terminal Bench 评估集成 - -本目录将 Terminal Bench (TB) 封装为 Miles 的评估委托(Eval Delegate)。评估过程在宿主机(Host)上通过 `tb` CLI 执行,Miles 负责读取并汇总各项指标,包括 `accuracy`、`n_resolved`、`n_unresolved`、`pass_at_k/*` 以及 Token 统计数据(如 `total_input_tokens_mean/median` 和 `total_output_tokens_mean/median`)。 - -## 运行架构 - -* **Miles 内部**:运行训练/评估主循环;调用 TB delegate client。 -* **宿主机(Host)**:运行 TB delegate server (`tb_server.py`),由其执行 `tb run ...`。 -* **Server逻辑**:读取最新的 TB JSON 结果并将各项指标返回给 Miles。 - -## 1) 获取代码 (宿主机) - -```bash -mkdir miles-tb -cd miles-tb -git clone 
https://github.com/radixark/miles.git -git clone https://github.com/laude-institute/terminal-bench -``` - -## 2) 启动 Miles 容器 - -```bash -docker run \ - -itd \ - --gpus all \ - --shm-size 32g \ - --network host \ - --ipc=host \ - --privileged \ - --ulimit memlock=-1 \ - --ulimit stack=67108864 \ - --ulimit nofile=65536:65536 \ - -v /mnt/data/.cache:/root/.cache \ - -v $(pwd):/shared/miles-tb \ - --name \ - radixark/miles:latest \ - /bin/bash -``` - -## 3) 进入 Miles 容器 - -```bash -docker exec -it /bin/bash -``` - -## 4) 配置 Terminal Bench 环境 (宿主机) - -在运行 `tb_server.py` 的宿主机上执行: - -```bash -# 在宿主机终端执行(非 Docker 内部) -uv venv --python 3.13 .venv -source .venv/bin/activate -uv pip install terminal-bench/. -uv pip install -r miles/examples/eval/terminal_bench/requirements.txt -``` - -*如果仓库路径不是 `./miles` 和 `./terminal-bench`,请根据实际路径调整。* - -## 5) 启动 Terminal Bench server - -在宿主机上启动(即 `tb` 命令可用的环境): - -```bash -python miles/examples/eval/terminal_bench/tb_server.py \ - --host 0.0.0.0 --port 9051 \ - --output-root tb_eval_output -``` - -**该脚本的功能:** - -* 默认设置 `OPENAI_API_KEY=EMPTY`。 -* 执行 `tb run -a terminus-2 -m openai/ ... --n-concurrent 8`。 -* 等待运行完成后,返回 `accuracy`、`pass_at_k` 以及 Token 消耗等统计数据。 - -## 6) 运行评估脚本 (示例) - -如果使用提供的 Qwen 评估启动脚本 (`run-eval-tb-qwen.sh`),请按以下步骤操作: - -**更新路径**:将 `eval_tb_example.yaml` 中的 `dataset_path` 修改为宿主机上 `terminal-bench/tasks` 的**绝对路径**(注意不是 Docker 内部路径)。 - -**下载模型**:在 Miles 容器内下载 HuggingFace 权重: -```bash -huggingface-cli download open-thoughts/OpenThinker-Agent-v1 \ ---local-dir /root/.cache/OpenThinker-Agent-v1 -``` - -**格式转换**:将 HuggingFace 权重转换为 Miles 的 torch distributed 格式。在 Miles 根目录下执行: -```bash -cd /shared/miles-tb/miles -source scripts/models/qwen3-8B.sh - -export PYTHONPATH=/root/Megatron-LM:/shared/miles-tb/miles - -python tools/convert_hf_to_torch_dist.py \ - ${MODEL_ARGS[@]} \ - --hf-checkpoint /root/.cache/OpenThinker-Agent-v1 \ - --save /root/.cache/OpenThinker-Agent-v1_torch_dist -``` - -**开始评估**:在 Miles 容器内运行: -```bash -bash miles/examples/eval/scripts/run-eval-tb-qwen.sh 2>&1 | tee run.log -``` - -*为了快速测试,可以在 `eval_tb_example.yaml` 中通过 `task_ids` 指定特定任务,或通过 `n_tasks` 限制评估任务的数量。* - -## 7) 常见问题 - -当在 Docker 容器中使用 `--network host` 运行 Miles 时,Ray 可能由于与宿主机共享网络而出现端口冲突。 - -这会导致 Ray 启动失败,或报 Redis/会话相关错误。通常可以在启动 Ray head 时显式指定未占用端口来解决,比如设置非默认的 `--port` 和 `--dashboard-port`。 - -有时甚至会导致 Ray job 提交失败,提示没有可用 agent 接受任务。这通常是 dashboard agent 或 runtime env agent 的端口也发生冲突。此时可在启动 Ray 时指定这些端口(如 `--dashboard-agent-listen-port`、`--dashboard-agent-grpc-port`、`--runtime-env-agent-port`)来解决。 - -如果 TB server无法通过 sglang router 连接到 Miles(`InternalServerError`),请检查 router 端口(例如 30005)实际监听的地址,并更新 `eval_tb_example.yaml` 中的 `api_base`: - -```bash -ss -lntp | grep 30005 -``` - -TB server开始接受请求后,可能会在输出中看到 `Parser warnings`、`Context length exceeded`、`Command 1 should end with newline`、`Harness execution failed`等。这些是Terminal Bench 的警告,如果正常运行可以忽略。 \ No newline at end of file From 81f85d8f004db5e1081cc0b1f68a061cb99ecb65 Mon Sep 17 00:00:00 2001 From: Jiajun Li Date: Fri, 16 Jan 2026 23:25:50 -0800 Subject: [PATCH 3/7] fix formatting --- examples/eval/terminal_bench/tb_config.py | 1 - examples/eval/terminal_bench/tb_server.py | 11 ++++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/eval/terminal_bench/tb_config.py b/examples/eval/terminal_bench/tb_config.py index adb4f2c30..f57b445dd 100644 --- a/examples/eval/terminal_bench/tb_config.py +++ b/examples/eval/terminal_bench/tb_config.py @@ -48,6 +48,5 @@ def parse(cls, args, raw_env_config: Mapping[str, Any], 
defaults: Mapping[str, A return base_cfg - def build_terminal_bench_config(args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, Any]): return TerminalBenchConfig.parse(args, raw_env_config, defaults) diff --git a/examples/eval/terminal_bench/tb_server.py b/examples/eval/terminal_bench/tb_server.py index 8085c54d2..58c9d54ad 100644 --- a/examples/eval/terminal_bench/tb_server.py +++ b/examples/eval/terminal_bench/tb_server.py @@ -21,12 +21,12 @@ import logging import os import shlex +import statistics import subprocess import sys import threading import time import uuid -import statistics from dataclasses import dataclass, field from pathlib import Path from typing import Any @@ -112,7 +112,7 @@ class ServerConfig: output_root: Path @classmethod - def from_args(cls, args: argparse.Namespace) -> "ServerConfig": + def from_args(cls, args: argparse.Namespace) -> ServerConfig: return cls(output_root=Path(args.output_root).expanduser().resolve()) @@ -236,10 +236,10 @@ def _build_command(self, payload: EvalRequestPayload, run_id: str) -> list[str]: # 3. Add Agent kwargs (Use api_base exactly like the CLI command) if payload.api_base: cmd.extend(["--agent-kwarg", f"api_base={payload.api_base}"]) - + if payload.dataset_path: cmd.extend(["--dataset-path", payload.dataset_path]) - + if payload.n_attempts is not None: cmd.extend(["--n-attempts", str(payload.n_attempts)]) @@ -312,7 +312,7 @@ def _extract_metrics(metrics_path: Path) -> dict[str, Any]: return {} metrics: dict[str, Any] = {} - + # core metrics accuracy = metrics_data.get("accuracy") if isinstance(accuracy, (int, float)): @@ -401,6 +401,7 @@ def status_endpoint(job_id: str): # Entry point # --------------------------------------------------------------------------- + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run the Terminal Bench evaluation HTTP server.") parser.add_argument("--host", type=str, default="0.0.0.0") From 62f2e0f74a9649b3223a854cd53b8e86df989aa2 Mon Sep 17 00:00:00 2001 From: Zhiyao Jiang Date: Mon, 19 Jan 2026 15:08:07 -0500 Subject: [PATCH 4/7] refactor to support harbor runner and tb runner Co-authored-by: Zhiyao Jiang Co-authored-by: Xinyu Jiang --- .../{ => nemo_skills}/multi_tasks.yaml | 0 .../{ => nemo_skills}/run-qwen3-32B.sh | 4 +- .../scripts/{ => nemo_skills}/run-qwen3-4B.sh | 4 +- .../scripts/terminal_bench/harbor_runner.yaml | 31 ++ .../{ => terminal_bench}/run-eval-tb-qwen.sh | 43 ++- .../tb_runner.yaml} | 9 +- examples/eval/terminal_bench/README.md | 33 +- examples/eval/terminal_bench/tb_client.py | 27 +- examples/eval/terminal_bench/tb_config.py | 34 ++- examples/eval/terminal_bench/tb_server.py | 288 ++++++++++++++---- 10 files changed, 379 insertions(+), 94 deletions(-) rename examples/eval/scripts/{ => nemo_skills}/multi_tasks.yaml (100%) rename examples/eval/scripts/{ => nemo_skills}/run-qwen3-32B.sh (96%) rename examples/eval/scripts/{ => nemo_skills}/run-qwen3-4B.sh (96%) create mode 100644 examples/eval/scripts/terminal_bench/harbor_runner.yaml rename examples/eval/scripts/{ => terminal_bench}/run-eval-tb-qwen.sh (74%) rename examples/eval/scripts/{eval_tb_example.yaml => terminal_bench/tb_runner.yaml} (85%) diff --git a/examples/eval/scripts/multi_tasks.yaml b/examples/eval/scripts/nemo_skills/multi_tasks.yaml similarity index 100% rename from examples/eval/scripts/multi_tasks.yaml rename to examples/eval/scripts/nemo_skills/multi_tasks.yaml diff --git a/examples/eval/scripts/run-qwen3-32B.sh b/examples/eval/scripts/nemo_skills/run-qwen3-32B.sh 
similarity index 96% rename from examples/eval/scripts/run-qwen3-32B.sh rename to examples/eval/scripts/nemo_skills/run-qwen3-32B.sh index eb6702deb..4d3da4f18 100644 --- a/examples/eval/scripts/run-qwen3-32B.sh +++ b/examples/eval/scripts/nemo_skills/run-qwen3-32B.sh @@ -29,11 +29,11 @@ fi echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." &>/dev/null && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../../.." &>/dev/null && pwd)" source "${REPO_ROOT}/scripts/models/qwen3-32B.sh" # Store eval/delegate settings in a YAML config similar to examples/eval_multi_task. -EVAL_CONFIG_PATH=${SKILLS_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/multi_tasks.yaml"} +EVAL_CONFIG_PATH=${SKILLS_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/nemo_skills/multi_tasks.yaml"} CKPT_ARGS=( --hf-checkpoint /root/shared/Qwen3-32B diff --git a/examples/eval/scripts/run-qwen3-4B.sh b/examples/eval/scripts/nemo_skills/run-qwen3-4B.sh similarity index 96% rename from examples/eval/scripts/run-qwen3-4B.sh rename to examples/eval/scripts/nemo_skills/run-qwen3-4B.sh index 34891126d..679a7a7bc 100644 --- a/examples/eval/scripts/run-qwen3-4B.sh +++ b/examples/eval/scripts/nemo_skills/run-qwen3-4B.sh @@ -29,11 +29,11 @@ fi echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." &>/dev/null && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../../.." &>/dev/null && pwd)" source "${REPO_ROOT}/scripts/models/qwen3-4B.sh" # Store eval/delegate settings in a YAML config similar to examples/eval_multi_task. 
-EVAL_CONFIG_PATH=${SKILLS_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/multi_tasks.yaml"} +EVAL_CONFIG_PATH=${SKILLS_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/nemo_skills/multi_tasks.yaml"} CKPT_ARGS=( --hf-checkpoint /root/Qwen3-4B diff --git a/examples/eval/scripts/terminal_bench/harbor_runner.yaml b/examples/eval/scripts/terminal_bench/harbor_runner.yaml new file mode 100644 index 000000000..1ad3115f3 --- /dev/null +++ b/examples/eval/scripts/terminal_bench/harbor_runner.yaml @@ -0,0 +1,31 @@ +eval: + defaults: + n_samples_per_eval_prompt: 1 + temperature: 0.6 + top_p: 0.95 + top_k: -1 + max_response_len: 24576 + datasets: # these eval tasks go through miles dataset config and default rollout function (miles.rollout.sglang_rollout.generate_rollout) + - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa + path: /root/gpqa/gpqa_eval.jsonl + rm_type: gpqa + n_samples_per_eval_prompt: 2 + - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/ifbench + path: /root/ifbench/IFBench_eval.jsonl + rm_type: ifbench + n_samples_per_eval_prompt: 1 + delegate: + - name: terminal_bench + url: http://172.17.0.1:9051 # Port must match the TB server running on the host machine + timeout_secs: 86400 # 24 hours + max_retries: 1 # HTTP request retries from Miles to the TB server + model_name: qwen3-8b + agent_name: terminus-2 + api_base: http://127.0.0.1:30005/v1 # Port must match the sglang router port set in run-eval-tb-qwen.sh + runner: harbor + dataset_name: terminal-bench + dataset_version: "2.0" + output_path: harbor_runner_jobs + # task_ids: + # - fix-git + n_concurrent: 8 diff --git a/examples/eval/scripts/run-eval-tb-qwen.sh b/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh similarity index 74% rename from examples/eval/scripts/run-eval-tb-qwen.sh rename to examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh index 471a59d56..1d0b88af2 100644 --- a/examples/eval/scripts/run-eval-tb-qwen.sh +++ b/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh @@ -18,7 +18,7 @@ set -ex export PYTHONBUFFERED=16 export MILES_HOST_IP=${MILES_HOST_IP:-"127.0.0.1"} -MODEL_DIR="${MODEL_DIR:-/root/.cache}" +MODEL_DIR="${MODEL_DIR:-/root/.cache/huggingface}" export MODEL_DIR NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l) @@ -30,11 +30,11 @@ fi echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" -REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." &>/dev/null && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../../.." &>/dev/null && pwd)" source "${REPO_ROOT}/scripts/models/qwen3-8B.sh" -# Store eval/delegate settings in a YAML config similar to examples/eval_multi_task. 
-EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/eval_tb_example.yaml"} +# EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${SCRIPT_DIR}/harbor_runner.yaml"} +EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${SCRIPT_DIR}/tb_runner.yaml"} CKPT_ARGS=( --hf-checkpoint ${MODEL_DIR}/OpenThinker-Agent-v1 # huggingface-cli download open-thoughts/OpenThinker-Agent-v1 @@ -51,17 +51,18 @@ ROLLOUT_ARGS=( --apply-chat-template --rollout-shuffle --rm-type deepscaler - --num-rollout 3000 + # --num-rollout 3000 + --num-rollout 1 --rollout-batch-size 32 --n-samples-per-prompt 8 --rollout-max-response-len 8192 - --rollout-temperature 0.8 + --rollout-temperature 1 --global-batch-size 256 --balance-data ) EVAL_ARGS=( - --eval-interval 5 + --eval-interval 1 --eval-config "${EVAL_CONFIG_PATH}" --eval-function-path examples.eval.eval_delegate_rollout.generate_rollout ) @@ -102,7 +103,7 @@ OPTIMIZER_ARGS=( WANDB_ARGS=( --use-wandb - --wandb-project miles-eval + --wandb-project miles-tb --wandb-group qwen3-8b-eval --wandb-key ${WANDB_KEY} # export WANDB_KEY="your_key" ) @@ -122,15 +123,25 @@ MISC_ARGS=( ) export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -export CUDA_VISIBLE_DEVICES=0,1 - -ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 2 \ +export CUDA_VISIBLE_DEVICES=6,7 +# export CUDA_VISIBLE_DEVICES=4,5,6,7 +# export CUDA_VISIBLE_DEVICES=0,1,2,3 + +# ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 2 \ +# --disable-usage-stats \ +# --dashboard-host=0.0.0.0 \ +# --dashboard-port=8266 \ +# --dashboard-agent-listen-port 52366 \ +# --dashboard-agent-grpc-port 52367 \ +# --runtime-env-agent-port 52368 + +ray start --head --node-ip-address ${MASTER_ADDR} --port 6381 --num-gpus 2 \ --disable-usage-stats \ --dashboard-host=0.0.0.0 \ - --dashboard-port=8266 \ - --dashboard-agent-listen-port 52366 \ - --dashboard-agent-grpc-port 52367 \ - --runtime-env-agent-port 52368 + --dashboard-port=8267 \ + --dashboard-agent-listen-port 52266 \ + --dashboard-agent-grpc-port 52267 \ + --runtime-env-agent-port 52268 RUNTIME_ENV_JSON="{ @@ -140,7 +151,7 @@ RUNTIME_ENV_JSON="{ } }" -ray job submit --address="http://${MASTER_ADDR}:8266" \ +ray job submit --address="http://${MASTER_ADDR}:8267" \ --working-dir "${REPO_ROOT}" \ --runtime-env-json="${RUNTIME_ENV_JSON}" \ -- python3 train.py \ diff --git a/examples/eval/scripts/eval_tb_example.yaml b/examples/eval/scripts/terminal_bench/tb_runner.yaml similarity index 85% rename from examples/eval/scripts/eval_tb_example.yaml rename to examples/eval/scripts/terminal_bench/tb_runner.yaml index 2e2308981..53cab623c 100644 --- a/examples/eval/scripts/eval_tb_example.yaml +++ b/examples/eval/scripts/terminal_bench/tb_runner.yaml @@ -20,10 +20,13 @@ eval: timeout_secs: 86400 # 24 hours max_retries: 1 # HTTP request retries from Miles to the TB server model_name: qwen3-8b + agent_name: terminus-2 api_base: http://127.0.0.1:30005/v1 # Port must match the sglang router port set in run-eval-tb-qwen.sh - dataset_path: /mnt/data/xinyu/program/miles-tb/terminal-bench/tasks # Dataset path on the host machine + runner: tb + dataset_name: terminal-bench-core + dataset_version: "0.1.1" + output_path: tb_runner_jobs # task_ids: # - hello-world # n_tasks: 10 - n_attempts: 1 # TB task-level retries (per task within tb run) - n_concurrent: 8 \ No newline at end of file + n_concurrent: 8 diff --git a/examples/eval/terminal_bench/README.md b/examples/eval/terminal_bench/README.md index 341e543fc..cc683e541 100644 --- 
a/examples/eval/terminal_bench/README.md +++ b/examples/eval/terminal_bench/README.md @@ -1,12 +1,12 @@ # Terminal Bench Eval -This folder wires Terminal Bench (TB) into Miles as an eval delegate. The TB run happens on the host via the `tb` CLI, and Miles reads back aggregated metrics such as `accuracy`, `n_resolved`, `n_unresolved`, `pass_at_k/*`, and token stats like `total_input_tokens_mean/median` and `total_output_tokens_mean/median`. +This folder wires Terminal Bench (TB) into Miles as an eval delegate. The run happens on the host via `harbor run` (Terminal Bench 2.0, default) or `tb run` (Terminal Bench 1.0, legacy). Miles reads back aggregated metrics such as `accuracy`, `n_resolved`, `n_unresolved`, `pass_at_k/*`, and token stats like `total_input_tokens_mean/median` and `total_output_tokens_mean/median`. ## What runs where - Miles runs your training/eval loop inside the Docker container. - Miles calls the TB delegate client. -- The TB delegate server (`tb_server.py`) runs `tb run ...` on the host. +- The TB delegate server (`tb_server.py`) runs `harbor run ...` or `tb run ...` on the host. - The server reads the latest TB JSON results and returns metrics to Miles. ## 1) Get the code (host) @@ -46,15 +46,26 @@ docker exec -it /bin/bash ## 4) Terminal Bench environment (host) -Run on the machine that will host `tb_server.py` (where you cloned both repos): +Run on the machine that will host `tb_server.py`: ```bash # Host machine terminal (outside Docker) uv venv --python 3.13 .venv source .venv/bin/activate +uv pip install -r miles/examples/eval/terminal_bench/requirements.txt +``` + +Terminal Bench 2.0 (default, via harbor): + +```bash +uv tool install harbor +``` +Terminal Bench 1.0 (legacy, via tb CLI): + +```bash +git clone https://github.com/laude-institute/terminal-bench uv pip install terminal-bench/. -uv pip install -r miles/examples/eval/terminal_bench/requirements.txt ``` Notes: @@ -66,13 +77,13 @@ Run on the host (same machine where `tb` works): ```bash python miles/examples/eval/terminal_bench/tb_server.py \ - --host 0.0.0.0 --port 9051 \ - --output-root tb_eval_output + --host 0.0.0.0 --port 9051 ``` What it does: - Uses `OPENAI_API_KEY=EMPTY` -- Runs `tb run -a terminus-2 -m openai/ ... --n-concurrent 8` +- Runs `harbor run -d terminal-bench@2.0 -a terminus-2 -m openai/ ... -n 8` by default +- Supports `tb run ... --n-concurrent 8` when `runner: tb` is used - Waits for completion, then returns `accuracy`, `n_resolved`, `n_unresolved`, `pass_at_k/*`, and token stats such as `total_input_tokens_mean/median` and `total_output_tokens_mean/median` @@ -81,7 +92,9 @@ What it does: If you use the provided Qwen eval launcher (`run-eval-tb-qwen.sh`), follow the steps below to run Terminal-Bench evaluation. -First, update the `dataset_path` in `eval_tb_example.yaml` to the local path of `terminal-bench/tasks` on your host (not an internal Docker-only path). +For Terminal Bench 2.0, set `runner: harbor` and specify `dataset_name`, `dataset_version`, and `jobs_dir` in `eval_tb_example.yaml`. No local `terminal-bench/tasks` path is needed. + +For Terminal Bench 1.0, set `runner: tb` and update the `dataset_path` to the local path of `terminal-bench/tasks` on your host (not an internal Docker-only path). 
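
For reference, here is a sketch of the JSON payload that the delegate client in this revision posts to the server for each runner mode. The concrete values are illustrative placeholders borrowed from the example YAML configs (`harbor_runner.yaml` / `tb_runner.yaml`), not required settings.

```python
# Illustrative payloads mirroring the client's base payload plus the
# tb-runner extras in this revision; all values are placeholders.
common = {
    "model_name": "qwen3-8b",
    "agent_name": "terminus-2",
    "api_base": "http://127.0.0.1:30005/v1",  # sglang router endpoint
    "n_concurrent": 8,
    "metric_prefix": "terminal_bench",
}

# Terminal Bench 2.0 via harbor (default runner)
harbor_payload = {
    **common,
    "runner": "harbor",
    "dataset_name": "terminal-bench",
    "dataset_version": "2.0",
    "output_path": "harbor_runner_jobs",
}

# Terminal Bench 1.0 via the legacy tb CLI
tb_payload = {
    **common,
    "runner": "tb",
    "dataset_name": "terminal-bench-core",
    "dataset_version": "0.1.1",
    "output_path": "tb_runner_jobs",
    "n_tasks": 10,  # tb runner additionally accepts n_tasks to cap the task count
}
```

Either payload can be posted to the server's `/evaluate` endpoint; the server dispatches to `harbor run` or `tb run` based on the `runner` field.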
Then download the HuggingFace model checkpoint inside the Miles container: @@ -107,7 +120,7 @@ python tools/convert_hf_to_torch_dist.py \ Finally, run the following command inside the Miles container: ```bash -bash miles/examples/eval/scripts/run-eval-tb-qwen.sh 2>&1 | tee run.log +bash miles/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh 2>&1 | tee run.log ``` For convenience, you can restrict the evaluation scope in `eval_tb_example.yaml`, either by specifying a single task or multiple tasks (`task_ids`), or by limiting the number of tasks via `n_tasks`. @@ -126,4 +139,4 @@ If the TB server cannot connect to the Miles server through the sglang router (` ss -lntp | grep 30005 ``` -You may see `Parser warnings`, `Context length exceeded`, `Command 1 should end with newline`, `Harness execution failed` in `tb_server.py` logs. They are warnings from Terminal Bench and can be ignored if runs proceed normally. \ No newline at end of file +You may see `Parser warnings`, `Context length exceeded`, `Command 1 should end with newline`, `Harness execution failed` in `tb_server.py` logs. They are warnings from Terminal Bench and can be ignored if runs proceed normally. diff --git a/examples/eval/terminal_bench/tb_client.py b/examples/eval/terminal_bench/tb_client.py index 2a93b7161..889af7bad 100644 --- a/examples/eval/terminal_bench/tb_client.py +++ b/examples/eval/terminal_bench/tb_client.py @@ -40,19 +40,36 @@ def evaluate(self, args, rollout_id: int) -> tuple[dict[str, Any], dict[str, Any return metrics, response def _build_payload(self, args, rollout_id: int) -> dict[str, Any]: + payload = self._base_payload() + runner = self._config.runner + if runner not in {"tb", "harbor"}: + raise ValueError( + f"Invalid runner: {runner}. Supported values are: tb (Terminal Bench 1.0), harbor (Terminal Bench 2.0)." 
+ ) + if runner == "tb": + payload.update(self._payload_tb()) + return payload + + def _base_payload(self) -> dict[str, Any]: payload = { "model_name": self._config.model_name, + "agent_name": self._config.agent_name, + "dataset_name": self._config.dataset_name, + "dataset_version": self._config.dataset_version, "api_base": self._config.api_base, - "n_tasks": self._config.n_tasks, "n_concurrent": self._config.n_concurrent, "metric_prefix": self._config.name, + "runner": self._config.runner, + "output_path": self._config.output_path, } - if self._config.dataset_path: - payload["dataset_path"] = self._config.dataset_path if self._config.task_ids: payload["task_ids"] = list(self._config.task_ids) - if self._config.n_attempts is not None: - payload["n_attempts"] = self._config.n_attempts + return payload + + def _payload_tb(self) -> dict[str, Any]: + payload: dict[str, Any] = {} + if self._config.n_tasks is not None: + payload["n_tasks"] = self._config.n_tasks return payload def _request(self, payload: dict[str, Any]) -> dict[str, Any]: diff --git a/examples/eval/terminal_bench/tb_config.py b/examples/eval/terminal_bench/tb_config.py index f57b445dd..6f274694d 100644 --- a/examples/eval/terminal_bench/tb_config.py +++ b/examples/eval/terminal_bench/tb_config.py @@ -12,11 +12,14 @@ class TerminalBenchConfig(EvalEnvConfig): """Environment configuration shared by the Terminal Bench client/server.""" model_name: str = "qwen3-8b" + agent_name: str = "terminus-2" api_base: str = "http://127.0.1.1:30001/v1" - dataset_path: str | None = None + runner: str = "harbor" + dataset_name: str = "" + dataset_version: str = "" + output_path: str | None = None n_tasks: int | None = None task_ids: list[str] = field(default_factory=list) - n_attempts: int | None = None n_concurrent: int = 8 @classmethod @@ -27,11 +30,14 @@ def parse(cls, args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, A field_casts = { "model_name": str, + "agent_name": str, "api_base": str, - "n_attempts": int, + "runner": str, + "dataset_name": lambda v: str(v).strip(), + "dataset_version": lambda v: str(v).strip(), + "output_path": lambda v: str(v).strip(), "n_tasks": int, "n_concurrent": int, - "dataset_path": str, } for key, caster in field_casts.items(): @@ -39,6 +45,26 @@ def parse(cls, args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, A if value is not None: setattr(base_cfg, key, caster(value)) + runner = (base_cfg.runner or "").strip().lower() + if not runner: + runner = "harbor" + elif runner not in {"tb", "harbor"}: + raise ValueError( + f"Invalid runner: {runner}. Supported values are: tb (Terminal Bench 1.0), harbor (Terminal Bench 2.0)." 
+ ) + base_cfg.runner = runner + # runner-specific defaults + if runner == "tb": + if not base_cfg.dataset_name: + base_cfg.dataset_name = "terminal-bench-core" + if not base_cfg.dataset_version: + base_cfg.dataset_version = "0.1.1" + else: + if not base_cfg.dataset_name: + base_cfg.dataset_name = "terminal-bench" + if not base_cfg.dataset_version: + base_cfg.dataset_version = "2.0" + task_ids = clean_raw.get("task_ids") if isinstance(task_ids, (list, tuple)): base_cfg.task_ids = [str(item) for item in task_ids if item] diff --git a/examples/eval/terminal_bench/tb_server.py b/examples/eval/terminal_bench/tb_server.py index 58c9d54ad..2b3a0ef04 100644 --- a/examples/eval/terminal_bench/tb_server.py +++ b/examples/eval/terminal_bench/tb_server.py @@ -1,22 +1,22 @@ #!/usr/bin/env python3 """ -Simple HTTP server that proxies Miles evaluation requests to the `tb run` -command shipped with Terminal Bench. +Simple HTTP server that proxies Miles evaluation requests to `tb run` (1.0) +or `harbor run` (2.0), depending on the request payload. Usage: python examples/eval/terminal_bench/tb_server.py \ - --host 0.0.0.0 --port 9050 \ - --output-root /opt/tb-eval + --host 0.0.0.0 --port 9050 Miles (or Miles-compatible runners) should POST the payload described in `EvalRequestPayload` to http://:/evaluate. The server blocks until -`tb run` finishes, then returns aggregated metrics along with paths to the +the run finishes, then returns aggregated metrics along with paths to the generated artifacts (logs + raw metrics). """ from __future__ import annotations import argparse +import cmd import json import logging import os @@ -51,14 +51,16 @@ @dataclass class EvalRequestPayload: model_name: str = "" + agent_name: str | None = None api_base: str = "" + runner: str | None = None + dataset_name: str | None = None + dataset_version: str | None = None n_tasks: int | None = None n_concurrent: int | None = None - dataset_path: str | None = None task_ids: list[str] | None = None - n_attempts: int | None = None metric_prefix: str | None = None - + output_path: str | None = None @dataclass class JobRecord: @@ -106,6 +108,15 @@ def _normalize_model_name(model_name: str) -> str: return name return f"openai/{name}" +def _normalize_runner(runner: str | None) -> str: + value = (runner or "").strip().lower() + if not value: + return "harbor" + if value in {"tb", "harbor"}: + return value + raise ValueError( + f"Invalid runner: {runner}. Supported values are: tb (Terminal Bench 1.0), harbor (Terminal Bench 2.0)." 
+ ) @dataclass class ServerConfig: @@ -134,9 +145,19 @@ def evaluate(self, payload: EvalRequestPayload) -> dict[str, Any]: job_id = uuid.uuid4().hex run_id = f"{int(time.time())}-{job_id[:8]}" - run_dir = self._config.output_root / run_id - - command = self._build_command(payload, run_id) + runner = _normalize_runner(payload.runner) + job_name = None + if runner == "harbor": + jobs_dir = Path(payload.output_path or "jobs").expanduser() + jobs_dir.mkdir(parents=True, exist_ok=True) + job_name = run_id + run_dir = jobs_dir / job_name + else: + tb_root = Path(payload.output_path or self._config.output_root).expanduser() + tb_root.mkdir(parents=True, exist_ok=True) + run_dir = tb_root / run_id + + command = self._build_command(payload, run_id, runner, job_name) command_str = " ".join(shlex.quote(part) for part in command) log_path = self._log_root / f"{run_id}.log" @@ -153,7 +174,7 @@ def evaluate(self, payload: EvalRequestPayload) -> dict[str, Any]: thread = threading.Thread( target=self._run_job, - args=(job_id, payload, run_dir, command, log_path), + args=(job_id, payload, run_dir, command, log_path, runner), daemon=True, ) thread.start() @@ -175,6 +196,7 @@ def _run_job( run_dir: Path, command: list[str], log_path: Path, + runner: str, ) -> None: with self._jobs_lock: record = self._jobs.get(job_id) @@ -188,7 +210,7 @@ def _run_job( try: with self._lock: self._run_command(command, env=env, log_path=log_path) - metrics = self._collect_metrics(run_dir) + metrics = self._collect_metrics(run_dir, runner, payload) if payload.metric_prefix: metrics = {payload.metric_prefix: metrics} with self._jobs_lock: @@ -214,51 +236,81 @@ def get_job_status(self, job_id: str) -> dict[str, Any] | None: return None return record.to_dict() - def _build_command(self, payload: EvalRequestPayload, run_id: str) -> list[str]: - # 1. Normalize model name (add openai/ prefix) - model_name = _normalize_model_name(payload.model_name) - - cmd = [ - "tb", - "run", - "-a", - "terminus-2", # Added Agent flag - "--output-path", - str(self._config.output_root), - "--run-id", - run_id, - ] + def _build_command( + self, + payload: EvalRequestPayload, + run_id: str, + runner: str, + job_name: str | None, + ) -> list[str]: + if runner == "harbor": + cmd = self._build_harbor_command(payload, job_name) + elif runner == "tb": + cmd = self._build_tb_command(payload, run_id) + else: + raise ValueError( + f"Invalid runner: {runner}. Supported values are: tb (Terminal Bench 1.0), harbor (Terminal Bench 2.0)." + ) - # 2. Add model + model_name = _normalize_model_name(payload.model_name) if model_name: cmd.extend(["--model", model_name]) - # 3. 
Add Agent kwargs (Use api_base exactly like the CLI command) + agent_name = (payload.agent_name or "terminus-2").strip() + if agent_name: + cmd.extend(["--agent", agent_name]) + if payload.api_base: cmd.extend(["--agent-kwarg", f"api_base={payload.api_base}"]) - if payload.dataset_path: - cmd.extend(["--dataset-path", payload.dataset_path]) + n_concurrent = payload.n_concurrent if payload.n_concurrent is not None else 1 + cmd.extend(["--n-concurrent", str(n_concurrent)]) + + return cmd + + def _build_harbor_command(self, payload: EvalRequestPayload, job_name: str | None) -> list[str]: + dataset_name = (payload.dataset_name or "terminal-bench").strip() or "terminal-bench" + dataset_version = (payload.dataset_version or "2.0").strip() or "2.0" + cmd = [ + "harbor", + "run", + "-d", + f"{dataset_name}@{dataset_version}", + ] + jobs_dir = payload.output_path + if jobs_dir: + cmd.extend(["--jobs-dir", jobs_dir]) + if job_name: + cmd.extend(["--job-name", job_name]) - if payload.n_attempts is not None: - cmd.extend(["--n-attempts", str(payload.n_attempts)]) + task_ids = [str(item) for item in (payload.task_ids or []) if item] + if task_ids: + for task_name in task_ids: + cmd.extend(["--task-name", task_name]) + elif payload.n_tasks is not None: + raise ValueError("n_tasks is only supported for runner=tb.") + + return cmd - # 4. Add n_tasks if present - task_ids = [] - if payload.task_ids: - task_ids.extend([str(item) for item in payload.task_ids if item]) + def _build_tb_command(self, payload: EvalRequestPayload, run_id: str) -> list[str]: + dataset_name = (payload.dataset_name or "terminal-bench-core").strip() or "terminal-bench-core" + dataset_version = (payload.dataset_version or "0.1.1").strip() or "0.1.1" + cmd = [ + "tb", + "run", + "-d", + f"{dataset_name}=={dataset_version}", + ] + output_root = str(Path(payload.output_path or self._config.output_root).expanduser()) + cmd.extend(["--output-path", output_root, "--run-id", run_id]) + + task_ids = [str(item) for item in (payload.task_ids or []) if item] if task_ids: for task_id in task_ids: cmd.extend(["--task-id", task_id]) elif payload.n_tasks is not None: cmd.extend(["--n-tasks", str(payload.n_tasks)]) - # 5. Add concurrency - n_concurrent = payload.n_concurrent - if n_concurrent is None: - n_concurrent = 1 - cmd.extend(["--n-concurrent", str(n_concurrent)]) - return cmd def _build_env(self) -> dict[str, str]: @@ -288,16 +340,33 @@ def _run_command(cmd: list[str], *, env: dict[str, str], log_path: Path): if retcode != 0: with open(log_path, encoding="utf-8", errors="ignore") as log_file: tail = "".join(log_file.readlines()[-200:]) - raise RuntimeError(f"`tb run` failed with exit code {retcode}. See {log_path}\n{tail}") + raise RuntimeError(f"Command failed with exit code {retcode}. 
See {log_path}\n{tail}") @staticmethod - def _collect_metrics(run_dir: Path) -> dict[str, Any]: - metrics_path = run_dir / "results.json" - if not metrics_path.exists(): - logger.warning("Results file missing at %s", metrics_path) - return {} - - metrics = TerminalBenchEvaluator._extract_metrics(metrics_path) + def _collect_metrics(run_dir: Path, runner: str, payload: EvalRequestPayload) -> dict[str, Any]: + if runner == "harbor": + metrics_path = run_dir / "result.json" + if not metrics_path.exists(): + fallback = TerminalBenchEvaluator._find_latest_result( + Path(payload.output_path or "jobs").expanduser() + ) + if fallback is not None: + metrics_path = fallback + if not metrics_path.exists(): + logger.warning("Results file missing at %s", metrics_path) + return {} + metrics = TerminalBenchEvaluator._extract_harbor_metrics( + metrics_path, + model_name=_normalize_model_name(payload.model_name), + dataset_name=(payload.dataset_name or "terminal-bench"), + agent_name=(payload.agent_name or "terminus-2"), + ) + else: + metrics_path = run_dir / "results.json" + if not metrics_path.exists(): + logger.warning("Results file missing at %s", metrics_path) + return {} + metrics = TerminalBenchEvaluator._extract_metrics(metrics_path) if not metrics: logger.warning("No accuracy/n_resolved metrics found in %s", metrics_path) return metrics @@ -356,6 +425,121 @@ def _extract_metrics(metrics_path: Path) -> dict[str, Any]: return metrics + @staticmethod + def _find_latest_result(jobs_dir: Path) -> Path | None: + if not jobs_dir.exists(): + return None + candidates = list(jobs_dir.glob("**/result.json")) + if not candidates: + return None + return max(candidates, key=lambda path: path.stat().st_mtime) + + @staticmethod + def _extract_harbor_metrics( + metrics_path: Path, + *, + model_name: str, + dataset_name: str, + agent_name: str, + ) -> dict[str, Any]: + try: + with open(metrics_path, encoding="utf-8") as fp: + metrics_data = json.load(fp) + except json.JSONDecodeError as exc: + logger.warning("Failed to parse %s: %s", metrics_path, exc) + return {} + + metrics: dict[str, Any] = {} + stats = metrics_data.get("stats") + if isinstance(stats, dict): + evals = stats.get("evals") + else: + evals = None + + accuracy = None + if isinstance(evals, dict): + candidates = [ + f"{agent_name}__{model_name}__{dataset_name}", + f"{agent_name}__{model_name}__terminal-bench", + ] + for key in candidates: + entry = evals.get(key) + accuracy = TerminalBenchEvaluator._extract_harbor_accuracy(entry) + if accuracy is not None: + break + if accuracy is None: + for entry in evals.values(): + accuracy = TerminalBenchEvaluator._extract_harbor_accuracy(entry) + if accuracy is not None: + break + + if isinstance(accuracy, (int, float)): + metrics["accuracy"] = float(accuracy) + metrics["pass_at_k/1"] = float(accuracy) + + reward_stats = metrics_data.get("reward_stats") + if isinstance(reward_stats, dict): + reward_counts = reward_stats.get("reward") + else: + reward_counts = None + + if isinstance(reward_counts, dict): + resolved = TerminalBenchEvaluator._extract_reward_count(reward_counts, 1.0) + unresolved = TerminalBenchEvaluator._extract_reward_count(reward_counts, 0.0) + if resolved is not None: + metrics["n_resolved"] = resolved + if unresolved is not None: + metrics["n_unresolved"] = unresolved + + results = metrics_data.get("results") + if isinstance(results, list): + input_tokens = [ + r.get("total_input_tokens") + for r in results + if isinstance(r, dict) and isinstance(r.get("total_input_tokens"), (int, float)) + ] + 
output_tokens = [ + r.get("total_output_tokens") + for r in results + if isinstance(r, dict) and isinstance(r.get("total_output_tokens"), (int, float)) + ] + + if input_tokens: + metrics["total_input_tokens_mean"] = float(statistics.mean(input_tokens)) + metrics["total_input_tokens_median"] = float(statistics.median(input_tokens)) + if output_tokens: + metrics["total_output_tokens_mean"] = float(statistics.mean(output_tokens)) + metrics["total_output_tokens_median"] = float(statistics.median(output_tokens)) + + return metrics + + @staticmethod + def _extract_harbor_accuracy(entry: Any) -> float | None: + if not isinstance(entry, dict): + return None + metrics_block = entry.get("metrics") + if isinstance(metrics_block, list) and metrics_block: + first_metric = metrics_block[0] + if isinstance(first_metric, dict): + mean = first_metric.get("mean") + if isinstance(mean, (int, float)): + return float(mean) + mean = entry.get("mean") + if isinstance(mean, (int, float)): + return float(mean) + return None + + @staticmethod + def _extract_reward_count(reward_counts: dict[Any, Any], reward_value: float) -> int | None: + for key, value in reward_counts.items(): + try: + key_value = float(key) + except (TypeError, ValueError): + continue + if key_value == reward_value and isinstance(value, (int, float)): + return int(value) + return None + # --------------------------------------------------------------------------- # HTTP server @@ -410,7 +594,7 @@ def parse_args() -> argparse.Namespace: "--output-root", type=str, default="./terminal-bench-output", - help="Directory to store `tb run` outputs.", + help="Directory to store `tb run` outputs (Terminal Bench 1.0).", ) return parser.parse_args() From 9fbeee2e7b4785e46cc43707657ad81b4c8c4cdb Mon Sep 17 00:00:00 2001 From: Zhiyao Jiang Date: Wed, 21 Jan 2026 23:19:03 -0500 Subject: [PATCH 5/7] eval: support generic param passthrough via runner_kwargs (#3) Co-authored-by: Zhiyao Jiang Co-authored-by: Xinyu Jiang --- .../scripts/terminal_bench/harbor_runner.yaml | 17 +- .../terminal_bench/run-eval-tb-qwen.sh | 38 +- .../scripts/terminal_bench/tb_runner.yaml | 10 +- examples/eval/terminal_bench/README.md | 34 +- examples/eval/terminal_bench/tb_client.py | 22 +- examples/eval/terminal_bench/tb_config.py | 36 +- examples/eval/terminal_bench/tb_server.py | 420 +++++------------- .../eval/terminal_bench/utils/__init__.py | 1 + examples/eval/terminal_bench/utils/metrics.py | 163 +++++++ examples/eval/terminal_bench/utils/runner.py | 107 +++++ 10 files changed, 433 insertions(+), 415 deletions(-) create mode 100644 examples/eval/terminal_bench/utils/__init__.py create mode 100644 examples/eval/terminal_bench/utils/metrics.py create mode 100644 examples/eval/terminal_bench/utils/runner.py diff --git a/examples/eval/scripts/terminal_bench/harbor_runner.yaml b/examples/eval/scripts/terminal_bench/harbor_runner.yaml index 1ad3115f3..6ab080339 100644 --- a/examples/eval/scripts/terminal_bench/harbor_runner.yaml +++ b/examples/eval/scripts/terminal_bench/harbor_runner.yaml @@ -26,6 +26,17 @@ eval: dataset_name: terminal-bench dataset_version: "2.0" output_path: harbor_runner_jobs - # task_ids: - # - fix-git - n_concurrent: 8 + n_concurrent: 32 + runner_kwargs: + # task_name: + # - fix-git + # debug: true + # timeout_multiplier: 2.0 + # retry_exclude: + # - AgentTimeoutError + # - VerifierTimeoutError + n_attempts: 2 + agent_kwarg: + model_info: + max_input_tokens: 40960 + max_output_tokens: 8192 diff --git a/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh 
b/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh index 1d0b88af2..a2a5d733c 100644 --- a/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh +++ b/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh @@ -33,8 +33,8 @@ SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../../.." &>/dev/null && pwd)" source "${REPO_ROOT}/scripts/models/qwen3-8B.sh" -# EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${SCRIPT_DIR}/harbor_runner.yaml"} -EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${SCRIPT_DIR}/tb_runner.yaml"} +EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${SCRIPT_DIR}/harbor_runner.yaml"} +# EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${SCRIPT_DIR}/tb_runner.yaml"} CKPT_ARGS=( --hf-checkpoint ${MODEL_DIR}/OpenThinker-Agent-v1 # huggingface-cli download open-thoughts/OpenThinker-Agent-v1 @@ -123,25 +123,23 @@ MISC_ARGS=( ) export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} -export CUDA_VISIBLE_DEVICES=6,7 -# export CUDA_VISIBLE_DEVICES=4,5,6,7 -# export CUDA_VISIBLE_DEVICES=0,1,2,3 +export CUDA_VISIBLE_DEVICES=4,5,6,7 -# ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 2 \ -# --disable-usage-stats \ -# --dashboard-host=0.0.0.0 \ -# --dashboard-port=8266 \ -# --dashboard-agent-listen-port 52366 \ -# --dashboard-agent-grpc-port 52367 \ -# --runtime-env-agent-port 52368 - -ray start --head --node-ip-address ${MASTER_ADDR} --port 6381 --num-gpus 2 \ +ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 4 \ --disable-usage-stats \ --dashboard-host=0.0.0.0 \ - --dashboard-port=8267 \ - --dashboard-agent-listen-port 52266 \ - --dashboard-agent-grpc-port 52267 \ - --runtime-env-agent-port 52268 + --dashboard-port=8266 \ + --dashboard-agent-listen-port 52366 \ + --dashboard-agent-grpc-port 52367 \ + --runtime-env-agent-port 52368 + +# ray start --head --node-ip-address ${MASTER_ADDR} --port 6381 --num-gpus 2 \ +# --disable-usage-stats \ +# --dashboard-host=0.0.0.0 \ +# --dashboard-port=8267 \ +# --dashboard-agent-listen-port 52266 \ +# --dashboard-agent-grpc-port 52267 \ +# --runtime-env-agent-port 52268 RUNTIME_ENV_JSON="{ @@ -151,12 +149,12 @@ RUNTIME_ENV_JSON="{ } }" -ray job submit --address="http://${MASTER_ADDR}:8267" \ +ray job submit --address="http://${MASTER_ADDR}:8266" \ --working-dir "${REPO_ROOT}" \ --runtime-env-json="${RUNTIME_ENV_JSON}" \ -- python3 train.py \ --actor-num-nodes 1 \ - --actor-num-gpus-per-node 2 \ + --actor-num-gpus-per-node 4 \ --colocate \ ${MODEL_ARGS[@]} \ ${CKPT_ARGS[@]} \ diff --git a/examples/eval/scripts/terminal_bench/tb_runner.yaml b/examples/eval/scripts/terminal_bench/tb_runner.yaml index 53cab623c..bdd46e137 100644 --- a/examples/eval/scripts/terminal_bench/tb_runner.yaml +++ b/examples/eval/scripts/terminal_bench/tb_runner.yaml @@ -26,7 +26,9 @@ eval: dataset_name: terminal-bench-core dataset_version: "0.1.1" output_path: tb_runner_jobs - # task_ids: - # - hello-world - # n_tasks: 10 - n_concurrent: 8 + n_concurrent: 16 + # runner_kwargs: + # task_id: + # - hello-world + # n_tasks: 10 + # example_flag: value diff --git a/examples/eval/terminal_bench/README.md b/examples/eval/terminal_bench/README.md index cc683e541..9e8116579 100644 --- a/examples/eval/terminal_bench/README.md +++ b/examples/eval/terminal_bench/README.md @@ -15,7 +15,6 @@ This folder wires Terminal Bench (TB) into Miles as an eval delegate. 
The run ha mkdir miles-tb cd miles-tb git clone https://github.com/radixark/miles.git -git clone https://github.com/laude-institute/terminal-bench ``` ## 2) Launch the Miles container @@ -31,7 +30,7 @@ docker run \ --ulimit memlock=-1 \ --ulimit stack=67108864 \ --ulimit nofile=65536:65536 \ - -v /mnt/data/.cache:/root/.cache \ + -v /data/cache:/root/.cache \ -v $(pwd):/shared/miles-tb \ --name \ radixark/miles:latest \ @@ -58,7 +57,7 @@ uv pip install -r miles/examples/eval/terminal_bench/requirements.txt Terminal Bench 2.0 (default, via harbor): ```bash -uv tool install harbor +uv pip install harbor ``` Terminal Bench 1.0 (legacy, via tb CLI): @@ -73,34 +72,33 @@ Notes: ## 5) Start the Terminal Bench server -Run on the host (same machine where `tb` works): +Run on the host (same machine where `tb`/`harbor` works). Match the port in your +eval config (examples use `9051`): ```bash -python miles/examples/eval/terminal_bench/tb_server.py \ - --host 0.0.0.0 --port 9051 +python miles/examples/eval/terminal_bench/tb_server.py --host 0.0.0.0 --port 9051 ``` What it does: - Uses `OPENAI_API_KEY=EMPTY` -- Runs `harbor run -d terminal-bench@2.0 -a terminus-2 -m openai/ ... -n 8` by default -- Supports `tb run ... --n-concurrent 8` when `runner: tb` is used +- For `runner: harbor`, builds a command like: + `harbor run -d terminal-bench@2.0 --jobs-dir --job-name --model openai/ --agent --agent-kwarg api_base=... --n-concurrent ...` +- For `runner: tb`, builds a command like: + `tb run -d terminal-bench-core==0.1.1 --output-path --run-id --model openai/ --agent --agent-kwarg api_base=... --n-concurrent ...` - Waits for completion, then returns `accuracy`, `n_resolved`, `n_unresolved`, `pass_at_k/*`, and token stats such as `total_input_tokens_mean/median` and `total_output_tokens_mean/median` ## 6) Run the eval script (example) -If you use the provided Qwen eval launcher (`run-eval-tb-qwen.sh`), follow the steps below to run Terminal-Bench evaluation. - -For Terminal Bench 2.0, set `runner: harbor` and specify `dataset_name`, `dataset_version`, and `jobs_dir` in `eval_tb_example.yaml`. No local `terminal-bench/tasks` path is needed. +If you use the provided Qwen eval launcher (`run-eval-tb-qwen.sh`), follow the steps below to run Terminal-Bench evaluation. Configure the runner via `harbor_runner.yaml` or `tb_runner.yaml`. runner_kwargs is used to pass through extra CLI arguments, new parameters can be added directly via runner_kwargs. -For Terminal Bench 1.0, set `runner: tb` and update the `dataset_path` to the local path of `terminal-bench/tasks` on your host (not an internal Docker-only path). Then download the HuggingFace model checkpoint inside the Miles container: ```bash huggingface-cli download open-thoughts/OpenThinker-Agent-v1 \ ---local-dir /root/.cache/OpenThinker-Agent-v1 +--local-dir /root/.cache/huggingface/OpenThinker-Agent-v1 ``` After downloading, convert the HuggingFace checkpoint to Miles's torch distributed format. 
From the Miles root directory, run: @@ -113,8 +111,8 @@ export PYTHONPATH=/root/Megatron-LM:/shared/miles-tb/miles python tools/convert_hf_to_torch_dist.py \ ${MODEL_ARGS[@]} \ - --hf-checkpoint /root/.cache/OpenThinker-Agent-v1 \ - --save /root/.cache/OpenThinker-Agent-v1_torch_dist + --hf-checkpoint /root/.cache/huggingface/OpenThinker-Agent-v1 \ + --save /root/.cache/huggingface/OpenThinker-Agent-v1_torch_dist ``` Finally, run the following command inside the Miles container: @@ -123,8 +121,6 @@ Finally, run the following command inside the Miles container: bash miles/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh 2>&1 | tee run.log ``` -For convenience, you can restrict the evaluation scope in `eval_tb_example.yaml`, either by specifying a single task or multiple tasks (`task_ids`), or by limiting the number of tasks via `n_tasks`. - ## 7) Common Issues When running Miles inside a Docker container with `--network host`, Ray may encounter port conflicts due to shared networking with the host. @@ -133,10 +129,10 @@ In some cases, this manifests as Ray failing to start or reporting Redis- or ses In more severe cases, Ray job submission may fail with errors indicating that no available agent can accept jobs. This typically happens when the dashboard agent or runtime environment agent ports are also in conflict. In such situations, explicitly specifying the agent-related ports (e.g. `--dashboard-agent-listen-port`, `--dashboard-agent-grpc-port`, and `--runtime-env-agent-port`) when starting Ray can resolve the issue. -If the TB server cannot connect to the Miles server through the sglang router (`InternalServerError`), check which address is actually listening on the router port (e.g. 30005 in this example) and update the `api_base` in `eval_tb_example.yaml` accordingly: +If the TB server cannot connect to the Miles server through the sglang router (`InternalServerError`), check which address is actually listening on the router port (e.g. 30005 in this example) and update the `api_base` in `harbor_runner.yaml` or `tb_runner.yaml` accordingly: ```bash ss -lntp | grep 30005 ``` -You may see `Parser warnings`, `Context length exceeded`, `Command 1 should end with newline`, `Harness execution failed` in `tb_server.py` logs. They are warnings from Terminal Bench and can be ignored if runs proceed normally. +You may see `Parser warnings`, `Context length exceeded`, `Command 1 should end with newline`, `Harness execution failed`, `Provider List` in `tb_server.py` logs. They are warnings from Terminal Bench and can be ignored if runs proceed normally. diff --git a/examples/eval/terminal_bench/tb_client.py b/examples/eval/terminal_bench/tb_client.py index 889af7bad..43104fc14 100644 --- a/examples/eval/terminal_bench/tb_client.py +++ b/examples/eval/terminal_bench/tb_client.py @@ -40,16 +40,8 @@ def evaluate(self, args, rollout_id: int) -> tuple[dict[str, Any], dict[str, Any return metrics, response def _build_payload(self, args, rollout_id: int) -> dict[str, Any]: - payload = self._base_payload() - runner = self._config.runner - if runner not in {"tb", "harbor"}: - raise ValueError( - f"Invalid runner: {runner}. Supported values are: tb (Terminal Bench 1.0), harbor (Terminal Bench 2.0)." 
- ) - if runner == "tb": - payload.update(self._payload_tb()) - return payload - + return self._base_payload() + def _base_payload(self) -> dict[str, Any]: payload = { "model_name": self._config.model_name, @@ -62,14 +54,8 @@ def _base_payload(self) -> dict[str, Any]: "runner": self._config.runner, "output_path": self._config.output_path, } - if self._config.task_ids: - payload["task_ids"] = list(self._config.task_ids) - return payload - - def _payload_tb(self) -> dict[str, Any]: - payload: dict[str, Any] = {} - if self._config.n_tasks is not None: - payload["n_tasks"] = self._config.n_tasks + if self._config.runner_kwargs: + payload["runner_kwargs"] = dict(self._config.runner_kwargs) return payload def _request(self, payload: dict[str, Any]) -> dict[str, Any]: diff --git a/examples/eval/terminal_bench/tb_config.py b/examples/eval/terminal_bench/tb_config.py index 6f274694d..ac0eb3bb2 100644 --- a/examples/eval/terminal_bench/tb_config.py +++ b/examples/eval/terminal_bench/tb_config.py @@ -15,12 +15,11 @@ class TerminalBenchConfig(EvalEnvConfig): agent_name: str = "terminus-2" api_base: str = "http://127.0.1.1:30001/v1" runner: str = "harbor" - dataset_name: str = "" - dataset_version: str = "" + dataset_name: str = "terminal-bench" + dataset_version: str = "2.0" output_path: str | None = None - n_tasks: int | None = None - task_ids: list[str] = field(default_factory=list) n_concurrent: int = 8 + runner_kwargs: dict[str, Any] = field(default_factory=dict) @classmethod def parse(cls, args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, Any]) -> TerminalBenchConfig: @@ -36,7 +35,6 @@ def parse(cls, args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, A "dataset_name": lambda v: str(v).strip(), "dataset_version": lambda v: str(v).strip(), "output_path": lambda v: str(v).strip(), - "n_tasks": int, "n_concurrent": int, } @@ -45,31 +43,9 @@ def parse(cls, args, raw_env_config: Mapping[str, Any], defaults: Mapping[str, A if value is not None: setattr(base_cfg, key, caster(value)) - runner = (base_cfg.runner or "").strip().lower() - if not runner: - runner = "harbor" - elif runner not in {"tb", "harbor"}: - raise ValueError( - f"Invalid runner: {runner}. Supported values are: tb (Terminal Bench 1.0), harbor (Terminal Bench 2.0)." - ) - base_cfg.runner = runner - # runner-specific defaults - if runner == "tb": - if not base_cfg.dataset_name: - base_cfg.dataset_name = "terminal-bench-core" - if not base_cfg.dataset_version: - base_cfg.dataset_version = "0.1.1" - else: - if not base_cfg.dataset_name: - base_cfg.dataset_name = "terminal-bench" - if not base_cfg.dataset_version: - base_cfg.dataset_version = "2.0" - - task_ids = clean_raw.get("task_ids") - if isinstance(task_ids, (list, tuple)): - base_cfg.task_ids = [str(item) for item in task_ids if item] - elif task_ids is not None: - raise ValueError("task_ids must be a list") + runner_kwargs = clean_raw.get("runner_kwargs") + if runner_kwargs is not None: + base_cfg.runner_kwargs = dict(runner_kwargs) return base_cfg diff --git a/examples/eval/terminal_bench/tb_server.py b/examples/eval/terminal_bench/tb_server.py index 2b3a0ef04..a278ba493 100644 --- a/examples/eval/terminal_bench/tb_server.py +++ b/examples/eval/terminal_bench/tb_server.py @@ -9,19 +9,16 @@ Miles (or Miles-compatible runners) should POST the payload described in `EvalRequestPayload` to http://:/evaluate. The server blocks until -the run finishes, then returns aggregated metrics along with paths to the -generated artifacts (logs + raw metrics). 
+the run finishes, then returns aggregated metrics. """ from __future__ import annotations import argparse -import cmd -import json import logging import os +import pty import shlex -import statistics import subprocess import sys import threading @@ -31,14 +28,19 @@ from pathlib import Path from typing import Any -REPO_ROOT = Path(__file__).resolve().parents[3] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - from flask import Flask, jsonify, request from omegaconf import OmegaConf from omegaconf.errors import OmegaConfBaseException +from utils.metrics import extract_harbor_metrics, extract_tb_metrics +from utils.runner import ( + Runner, + ServerConfig, + _build_harbor_command, + _build_tb_command, + _normalize_model_name, +) + logger = logging.getLogger("terminal_bench_server") logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") @@ -56,11 +58,11 @@ class EvalRequestPayload: runner: str | None = None dataset_name: str | None = None dataset_version: str | None = None - n_tasks: int | None = None n_concurrent: int | None = None - task_ids: list[str] | None = None metric_prefix: str | None = None output_path: str | None = None + runner_kwargs: dict[str, Any] | None = None + @dataclass class JobRecord: @@ -69,7 +71,6 @@ class JobRecord: run_id: str command: str output_dir: str - log_path: str raw_metrics: dict[str, Any] | None = None error: str | None = None created_at: float = field(default_factory=time.time) @@ -83,7 +84,6 @@ def to_dict(self) -> dict[str, Any]: "run_id": self.run_id, "command": self.command, "output_dir": self.output_dir, - "log_path": self.log_path, "created_at": self.created_at, "started_at": self.started_at, "finished_at": self.finished_at, @@ -95,47 +95,12 @@ def to_dict(self) -> dict[str, Any]: return payload -# --------------------------------------------------------------------------- -# Configuration + command helpers -# --------------------------------------------------------------------------- - - -def _normalize_model_name(model_name: str) -> str: - name = (model_name or "").strip() - if not name: - return "" - if "/" in name: - return name - return f"openai/{name}" - -def _normalize_runner(runner: str | None) -> str: - value = (runner or "").strip().lower() - if not value: - return "harbor" - if value in {"tb", "harbor"}: - return value - raise ValueError( - f"Invalid runner: {runner}. Supported values are: tb (Terminal Bench 1.0), harbor (Terminal Bench 2.0)." 
- ) - -@dataclass -class ServerConfig: - output_root: Path - - @classmethod - def from_args(cls, args: argparse.Namespace) -> ServerConfig: - return cls(output_root=Path(args.output_root).expanduser().resolve()) - - class TerminalBenchEvaluator: def __init__(self, config: ServerConfig): self._config = config self._lock = threading.Lock() self._jobs_lock = threading.Lock() self._jobs: dict[str, JobRecord] = {} - self._config.output_root.mkdir(parents=True, exist_ok=True) - self._log_root = REPO_ROOT.parent / "tb_eval_logs" - self._log_root.mkdir(parents=True, exist_ok=True) def evaluate(self, payload: EvalRequestPayload) -> dict[str, Any]: if not payload.model_name: @@ -145,21 +110,11 @@ def evaluate(self, payload: EvalRequestPayload) -> dict[str, Any]: job_id = uuid.uuid4().hex run_id = f"{int(time.time())}-{job_id[:8]}" - runner = _normalize_runner(payload.runner) - job_name = None - if runner == "harbor": - jobs_dir = Path(payload.output_path or "jobs").expanduser() - jobs_dir.mkdir(parents=True, exist_ok=True) - job_name = run_id - run_dir = jobs_dir / job_name - else: - tb_root = Path(payload.output_path or self._config.output_root).expanduser() - tb_root.mkdir(parents=True, exist_ok=True) - run_dir = tb_root / run_id + runner = Runner(payload.runner) + run_dir, job_name = self._prepare_run_dir(payload, runner, run_id) command = self._build_command(payload, run_id, runner, job_name) - command_str = " ".join(shlex.quote(part) for part in command) - log_path = self._log_root / f"{run_id}.log" + command_str = self._format_command(command) record = JobRecord( job_id=job_id, @@ -167,14 +122,13 @@ def evaluate(self, payload: EvalRequestPayload) -> dict[str, Any]: run_id=run_id, command=command_str, output_dir=str(run_dir), - log_path=str(log_path), ) with self._jobs_lock: self._jobs[job_id] = record thread = threading.Thread( target=self._run_job, - args=(job_id, payload, run_dir, command, log_path, runner), + args=(job_id, payload, run_dir, command, runner), daemon=True, ) thread.start() @@ -186,7 +140,6 @@ def evaluate(self, payload: EvalRequestPayload) -> dict[str, Any]: "run_id": run_id, "command": command_str, "output_dir": str(run_dir), - "log_path": str(log_path), } def _run_job( @@ -195,39 +148,34 @@ def _run_job( payload: EvalRequestPayload, run_dir: Path, command: list[str], - log_path: Path, - runner: str, + runner: Runner, ) -> None: - with self._jobs_lock: - record = self._jobs.get(job_id) - if record is None: - return - record.status = "running" - record.started_at = time.time() + self._update_job(job_id, status="running", started_at=time.time()) env = self._build_env() logger.info("Starting Terminal Bench run: %s", " ".join(shlex.quote(part) for part in command)) try: with self._lock: - self._run_command(command, env=env, log_path=log_path) + self._run_command( + command, + env=env, + ) metrics = self._collect_metrics(run_dir, runner, payload) if payload.metric_prefix: metrics = {payload.metric_prefix: metrics} - with self._jobs_lock: - record = self._jobs.get(job_id) - if record is None: - return - record.status = "completed" - record.raw_metrics = metrics - record.finished_at = time.time() + self._update_job( + job_id, + status="completed", + raw_metrics=metrics, + finished_at=time.time(), + ) except Exception as exc: # noqa: BLE001 - with self._jobs_lock: - record = self._jobs.get(job_id) - if record is None: - return - record.status = "failed" - record.error = str(exc) - record.finished_at = time.time() + self._update_job( + job_id, + status="failed", + error=str(exc), + 
finished_at=time.time(), + ) def get_job_status(self, job_id: str) -> dict[str, Any] | None: with self._jobs_lock: @@ -240,17 +188,13 @@ def _build_command( self, payload: EvalRequestPayload, run_id: str, - runner: str, + runner: Runner, job_name: str | None, ) -> list[str]: - if runner == "harbor": - cmd = self._build_harbor_command(payload, job_name) - elif runner == "tb": - cmd = self._build_tb_command(payload, run_id) + if runner is Runner.HARBOR: + cmd = _build_harbor_command(payload, job_name) else: - raise ValueError( - f"Invalid runner: {runner}. Supported values are: tb (Terminal Bench 1.0), harbor (Terminal Bench 2.0)." - ) + cmd = _build_tb_command(payload, run_id, self._config.output_root) model_name = _normalize_model_name(payload.model_name) if model_name: @@ -268,50 +212,32 @@ def _build_command( return cmd - def _build_harbor_command(self, payload: EvalRequestPayload, job_name: str | None) -> list[str]: - dataset_name = (payload.dataset_name or "terminal-bench").strip() or "terminal-bench" - dataset_version = (payload.dataset_version or "2.0").strip() or "2.0" - cmd = [ - "harbor", - "run", - "-d", - f"{dataset_name}@{dataset_version}", - ] - jobs_dir = payload.output_path - if jobs_dir: - cmd.extend(["--jobs-dir", jobs_dir]) - if job_name: - cmd.extend(["--job-name", job_name]) - - task_ids = [str(item) for item in (payload.task_ids or []) if item] - if task_ids: - for task_name in task_ids: - cmd.extend(["--task-name", task_name]) - elif payload.n_tasks is not None: - raise ValueError("n_tasks is only supported for runner=tb.") - - return cmd + def _prepare_run_dir( + self, + payload: EvalRequestPayload, + runner: Runner, + run_id: str, + ) -> tuple[Path, str | None]: + if runner is Runner.HARBOR: + jobs_dir = Path(payload.output_path or "jobs").expanduser() + jobs_dir.mkdir(parents=True, exist_ok=True) + return jobs_dir / run_id, run_id - def _build_tb_command(self, payload: EvalRequestPayload, run_id: str) -> list[str]: - dataset_name = (payload.dataset_name or "terminal-bench-core").strip() or "terminal-bench-core" - dataset_version = (payload.dataset_version or "0.1.1").strip() or "0.1.1" - cmd = [ - "tb", - "run", - "-d", - f"{dataset_name}=={dataset_version}", - ] - output_root = str(Path(payload.output_path or self._config.output_root).expanduser()) - cmd.extend(["--output-path", output_root, "--run-id", run_id]) - - task_ids = [str(item) for item in (payload.task_ids or []) if item] - if task_ids: - for task_id in task_ids: - cmd.extend(["--task-id", task_id]) - elif payload.n_tasks is not None: - cmd.extend(["--n-tasks", str(payload.n_tasks)]) + tb_root = Path(payload.output_path or self._config.output_root).expanduser() + tb_root.mkdir(parents=True, exist_ok=True) + return tb_root / run_id, None - return cmd + def _update_job(self, job_id: str, **updates: Any) -> None: + with self._jobs_lock: + record = self._jobs.get(job_id) + if record is None: + return + for key, value in updates.items(): + setattr(record, key, value) + + @staticmethod + def _format_command(command: list[str]) -> str: + return " ".join(shlex.quote(part) for part in command) def _build_env(self) -> dict[str, str]: env = os.environ.copy() @@ -320,31 +246,41 @@ def _build_env(self) -> dict[str, str]: return env @staticmethod - def _run_command(cmd: list[str], *, env: dict[str, str], log_path: Path): - with open(log_path, "w", encoding="utf-8") as log_file: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - env=env, - text=True, - bufsize=1, - ) - assert 
process.stdout is not None - for line in process.stdout: - log_file.write(line) - log_file.flush() - sys.stdout.write(line) - sys.stdout.flush() - retcode = process.wait() + def _run_command( + cmd: list[str], + *, + env: dict[str, str], + ): + env = env.copy() + env.setdefault("TERM", "xterm-256color") + env.setdefault("RICH_FORCE_TERMINAL", "1") + master_fd, slave_fd = pty.openpty() + process = subprocess.Popen( + cmd, + stdout=slave_fd, + stderr=slave_fd, + env=env, + ) + os.close(slave_fd) + try: + while True: + try: + data = os.read(master_fd, 1024) + except OSError: + break + if not data: + break + sys.stdout.buffer.write(data) + sys.stdout.buffer.flush() + finally: + os.close(master_fd) + retcode = process.wait() if retcode != 0: - with open(log_path, encoding="utf-8", errors="ignore") as log_file: - tail = "".join(log_file.readlines()[-200:]) - raise RuntimeError(f"Command failed with exit code {retcode}. See {log_path}\n{tail}") + raise RuntimeError(f"Command failed with exit code {retcode}.") @staticmethod - def _collect_metrics(run_dir: Path, runner: str, payload: EvalRequestPayload) -> dict[str, Any]: - if runner == "harbor": + def _collect_metrics(run_dir: Path, runner: Runner, payload: EvalRequestPayload) -> dict[str, Any]: + if runner is Runner.HARBOR: metrics_path = run_dir / "result.json" if not metrics_path.exists(): fallback = TerminalBenchEvaluator._find_latest_result( @@ -355,8 +291,9 @@ def _collect_metrics(run_dir: Path, runner: str, payload: EvalRequestPayload) -> if not metrics_path.exists(): logger.warning("Results file missing at %s", metrics_path) return {} - metrics = TerminalBenchEvaluator._extract_harbor_metrics( + metrics = extract_harbor_metrics( metrics_path, + run_dir, model_name=_normalize_model_name(payload.model_name), dataset_name=(payload.dataset_name or "terminal-bench"), agent_name=(payload.agent_name or "terminus-2"), @@ -366,65 +303,11 @@ def _collect_metrics(run_dir: Path, runner: str, payload: EvalRequestPayload) -> if not metrics_path.exists(): logger.warning("Results file missing at %s", metrics_path) return {} - metrics = TerminalBenchEvaluator._extract_metrics(metrics_path) + metrics = extract_tb_metrics(metrics_path) if not metrics: logger.warning("No accuracy/n_resolved metrics found in %s", metrics_path) return metrics - @staticmethod - def _extract_metrics(metrics_path: Path) -> dict[str, Any]: - try: - with open(metrics_path, encoding="utf-8") as fp: - metrics_data = json.load(fp) - except json.JSONDecodeError as exc: - logger.warning("Failed to parse %s: %s", metrics_path, exc) - return {} - - metrics: dict[str, Any] = {} - - # core metrics - accuracy = metrics_data.get("accuracy") - if isinstance(accuracy, (int, float)): - metrics["accuracy"] = float(accuracy) - - n_resolved = metrics_data.get("n_resolved") - if isinstance(n_resolved, (int, float)): - metrics["n_resolved"] = int(n_resolved) - - n_unresolved = metrics_data.get("n_unresolved") - if isinstance(n_unresolved, (int, float)): - metrics["n_unresolved"] = int(n_unresolved) - - # pass@k flatten - pass_at_k = metrics_data.get("pass_at_k") - if isinstance(pass_at_k, dict): - for k, v in pass_at_k.items(): - if isinstance(v, (int, float)): - metrics[f"pass_at_k/{k}"] = float(v) - - # token stats from per-task results - results = metrics_data.get("results") - if isinstance(results, list): - input_tokens = [ - r.get("total_input_tokens") - for r in results - if isinstance(r, dict) and isinstance(r.get("total_input_tokens"), (int, float)) - ] - output_tokens = [ - 
r.get("total_output_tokens") - for r in results - if isinstance(r, dict) and isinstance(r.get("total_output_tokens"), (int, float)) - ] - - if input_tokens: - metrics["total_input_tokens_mean"] = float(statistics.mean(input_tokens)) - metrics["total_input_tokens_median"] = float(statistics.median(input_tokens)) - if output_tokens: - metrics["total_output_tokens_mean"] = float(statistics.mean(output_tokens)) - metrics["total_output_tokens_median"] = float(statistics.median(output_tokens)) - - return metrics - @staticmethod def _find_latest_result(jobs_dir: Path) -> Path | None: if not jobs_dir.exists(): @@ -434,112 +317,6 @@ def _find_latest_result(jobs_dir: Path) -> Path | None: return None return max(candidates, key=lambda path: path.stat().st_mtime) - @staticmethod - def _extract_harbor_metrics( - metrics_path: Path, - *, - model_name: str, - dataset_name: str, - agent_name: str, - ) -> dict[str, Any]: - try: - with open(metrics_path, encoding="utf-8") as fp: - metrics_data = json.load(fp) - except json.JSONDecodeError as exc: - logger.warning("Failed to parse %s: %s", metrics_path, exc) - return {} - - metrics: dict[str, Any] = {} - stats = metrics_data.get("stats") - if isinstance(stats, dict): - evals = stats.get("evals") - else: - evals = None - - accuracy = None - if isinstance(evals, dict): - candidates = [ - f"{agent_name}__{model_name}__{dataset_name}", - f"{agent_name}__{model_name}__terminal-bench", - ] - for key in candidates: - entry = evals.get(key) - accuracy = TerminalBenchEvaluator._extract_harbor_accuracy(entry) - if accuracy is not None: - break - if accuracy is None: - for entry in evals.values(): - accuracy = TerminalBenchEvaluator._extract_harbor_accuracy(entry) - if accuracy is not None: - break - - if isinstance(accuracy, (int, float)): - metrics["accuracy"] = float(accuracy) - metrics["pass_at_k/1"] = float(accuracy) - - reward_stats = metrics_data.get("reward_stats") - if isinstance(reward_stats, dict): - reward_counts = reward_stats.get("reward") - else: - reward_counts = None - - if isinstance(reward_counts, dict): - resolved = TerminalBenchEvaluator._extract_reward_count(reward_counts, 1.0) - unresolved = TerminalBenchEvaluator._extract_reward_count(reward_counts, 0.0) - if resolved is not None: - metrics["n_resolved"] = resolved - if unresolved is not None: - metrics["n_unresolved"] = unresolved - - results = metrics_data.get("results") - if isinstance(results, list): - input_tokens = [ - r.get("total_input_tokens") - for r in results - if isinstance(r, dict) and isinstance(r.get("total_input_tokens"), (int, float)) - ] - output_tokens = [ - r.get("total_output_tokens") - for r in results - if isinstance(r, dict) and isinstance(r.get("total_output_tokens"), (int, float)) - ] - - if input_tokens: - metrics["total_input_tokens_mean"] = float(statistics.mean(input_tokens)) - metrics["total_input_tokens_median"] = float(statistics.median(input_tokens)) - if output_tokens: - metrics["total_output_tokens_mean"] = float(statistics.mean(output_tokens)) - metrics["total_output_tokens_median"] = float(statistics.median(output_tokens)) - - return metrics - - @staticmethod - def _extract_harbor_accuracy(entry: Any) -> float | None: - if not isinstance(entry, dict): - return None - metrics_block = entry.get("metrics") - if isinstance(metrics_block, list) and metrics_block: - first_metric = metrics_block[0] - if isinstance(first_metric, dict): - mean = first_metric.get("mean") - if isinstance(mean, (int, float)): - return float(mean) - mean = entry.get("mean") - if 
isinstance(mean, (int, float)): - return float(mean) - return None - - @staticmethod - def _extract_reward_count(reward_counts: dict[Any, Any], reward_value: float) -> int | None: - for key, value in reward_counts.items(): - try: - key_value = float(key) - except (TypeError, ValueError): - continue - if key_value == reward_value and isinstance(value, (int, float)): - return int(value) - return None - # --------------------------------------------------------------------------- # HTTP server @@ -604,6 +381,7 @@ def main(): config = ServerConfig.from_args(args) evaluator = TerminalBenchEvaluator(config) app = build_app(evaluator) + logging.getLogger("werkzeug").setLevel(logging.WARNING) logger.info( "Starting Terminal Bench evaluation server on %s:%s (output root=%s)", args.host, diff --git a/examples/eval/terminal_bench/utils/__init__.py b/examples/eval/terminal_bench/utils/__init__.py new file mode 100644 index 000000000..63f2cf887 --- /dev/null +++ b/examples/eval/terminal_bench/utils/__init__.py @@ -0,0 +1 @@ +"""Helper modules for terminal_bench server.""" diff --git a/examples/eval/terminal_bench/utils/metrics.py b/examples/eval/terminal_bench/utils/metrics.py new file mode 100644 index 000000000..10c43824c --- /dev/null +++ b/examples/eval/terminal_bench/utils/metrics.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +import json +import logging +import statistics +from pathlib import Path +from typing import Any + +logger = logging.getLogger("terminal_bench_server") + + +def extract_tb_metrics(metrics_path: Path) -> dict[str, Any]: + try: + with open(metrics_path, encoding="utf-8") as fp: + metrics_data = json.load(fp) + except json.JSONDecodeError as exc: + logger.warning("Failed to parse %s: %s", metrics_path, exc) + return {} + + metrics: dict[str, Any] = {} + + # core metrics + accuracy = metrics_data.get("accuracy") + if isinstance(accuracy, (int, float)): + metrics["accuracy"] = float(accuracy) + + n_resolved = metrics_data.get("n_resolved") + if isinstance(n_resolved, (int, float)): + metrics["n_resolved"] = int(n_resolved) + + n_unresolved = metrics_data.get("n_unresolved") + if isinstance(n_unresolved, (int, float)): + metrics["n_unresolved"] = int(n_unresolved) + + # pass@k flatten + pass_at_k = metrics_data.get("pass_at_k") + if isinstance(pass_at_k, dict): + for k, v in pass_at_k.items(): + if isinstance(v, (int, float)): + metrics[f"pass_at_k/{k}"] = float(v) + + # token stats from per-task results + results = metrics_data.get("results") + if isinstance(results, list): + input_tokens = [ + r.get("total_input_tokens") + for r in results + if isinstance(r, dict) + and isinstance(r.get("total_input_tokens"), (int, float)) + ] + output_tokens = [ + r.get("total_output_tokens") + for r in results + if isinstance(r, dict) + and isinstance(r.get("total_output_tokens"), (int, float)) + ] + + if input_tokens: + metrics["total_input_tokens_mean"] = float(statistics.mean(input_tokens)) + metrics["total_input_tokens_median"] = float( + statistics.median(input_tokens) + ) + metrics["total_input_tokens_min"] = float(min(input_tokens)) + metrics["total_input_tokens_max"] = float(max(input_tokens)) + if output_tokens: + metrics["total_output_tokens_mean"] = float( + statistics.mean(output_tokens) + ) + metrics["total_output_tokens_median"] = float( + statistics.median(output_tokens) + ) + + return metrics + + +def extract_harbor_metrics( + metrics_path: Path, + run_dir: Path, + *, + model_name: str, + dataset_name: str, + agent_name: str, +) -> dict[str, Any]: + try: + with 
open(metrics_path, encoding="utf-8") as fp: + metrics_data = json.load(fp) + except json.JSONDecodeError as exc: + logger.warning("Failed to parse %s: %s", metrics_path, exc) + return {} + + evals = metrics_data.get("stats", {}).get("evals", {}) + if not isinstance(evals, dict) or not evals: + return {} + + candidates = ( + f"{agent_name}__{model_name}__{dataset_name}", + f"{agent_name}__{model_name}__terminal-bench", + ) + entry = next((evals.get(key) for key in candidates if key in evals), None) + if entry is None: + entry = next(iter(evals.values())) + if not isinstance(entry, dict): + return {} + + metrics: dict[str, Any] = {} + for key in ("n_trials", "n_errors"): + value = entry.get(key) + if isinstance(value, (int, float)): + metrics[key] = int(value) + + metrics_block = entry.get("metrics") + if isinstance(metrics_block, list): + for metric in metrics_block: + if isinstance(metric, dict): + for name, value in metric.items(): + if isinstance(value, (int, float)): + metrics[name] = float(value) + + reward_stats = entry.get("reward_stats") + if isinstance(reward_stats, dict): + for reward_name, reward_values in reward_stats.items(): + if isinstance(reward_values, dict): + for reward_value, trials in reward_values.items(): + if isinstance(trials, list): + metrics[f"reward_stats/{reward_name}/{reward_value}"] = len( + trials + ) + + exception_stats = entry.get("exception_stats") + if isinstance(exception_stats, dict): + for exception_name, trials in exception_stats.items(): + if isinstance(trials, list): + metrics[f"exception_stats/{exception_name}"] = len(trials) + + input_tokens = [] + output_tokens = [] + for result_path in run_dir.glob("*/result.json"): + try: + with open(result_path, encoding="utf-8") as fp: + task_data = json.load(fp) + except json.JSONDecodeError: + logger.warning("Failed to parse %s", result_path) + continue + agent_result = task_data.get("agent_result") or {} + n_input = agent_result.get("n_input_tokens") + if isinstance(n_input, (int, float)): + input_tokens.append(float(n_input)) + n_output = agent_result.get("n_output_tokens") + if isinstance(n_output, (int, float)): + output_tokens.append(float(n_output)) + + def add_token_stats(name: str, values: list[float]) -> None: + if not values: + return + metrics[f"{name}/min"] = float(min(values)) + metrics[f"{name}/max"] = float(max(values)) + metrics[f"{name}/mean"] = float(statistics.mean(values)) + metrics[f"{name}/median"] = float(statistics.median(values)) + + add_token_stats("n_input_tokens", input_tokens) + add_token_stats("n_output_tokens", output_tokens) + + return metrics diff --git a/examples/eval/terminal_bench/utils/runner.py b/examples/eval/terminal_bench/utils/runner.py new file mode 100644 index 000000000..af70d435d --- /dev/null +++ b/examples/eval/terminal_bench/utils/runner.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +import argparse +import json +from collections.abc import Mapping +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any + + +class Runner(str, Enum): + TB = "tb" + HARBOR = "harbor" + + +def _normalize_model_name(model_name: str) -> str: + name = (model_name or "").strip() + if not name: + return "" + if "/" in name: + return name + return f"openai/{name}" + + +def _snake_to_kebab(value: str) -> str: + return value.replace("_", "-") + + +def _json_value(value: Any) -> str: + return json.dumps(value, separators=(",", ":")) + + +def _append_runner_kwargs(cmd: list[str], runner_kwargs: Mapping[str, Any]) -> None: + 
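+    # Generic passthrough from the eval YAML's runner_kwargs to CLI flags:
+    #   - snake_case keys become kebab-case flags (n_attempts: 2 -> --n-attempts 2)
+    #   - a true boolean appends the bare flag; a false one is dropped (debug: true -> --debug)
+    #   - lists repeat the flag once per item (retry_exclude -> --retry-exclude AgentTimeoutError --retry-exclude VerifierTimeoutError)
+    #   - the agent_kwarg mapping expands to --agent-kwarg key=value, serializing nested
+    #     values as compact JSON; any other mapping is passed as a single JSON argument
+    #   - everything else is emitted as the flag followed by str(value)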
for key, value in runner_kwargs.items(): + flag = f"--{_snake_to_kebab(str(key))}" + if isinstance(value, bool): + if value: + cmd.append(flag) + continue + if isinstance(value, list): + for item in value: + if isinstance(item, (dict, list)): + cmd.extend([flag, _json_value(item)]) + else: + cmd.extend([flag, str(item)]) + continue + if isinstance(value, dict): + if key == "agent_kwarg": + for agent_key, agent_value in value.items(): + if isinstance(agent_value, (dict, list)): + agent_value_str = _json_value(agent_value) + else: + agent_value_str = str(agent_value) + cmd.extend([flag, f"{agent_key}={agent_value_str}"]) + else: + cmd.extend([flag, _json_value(value)]) + continue + cmd.extend([flag, str(value)]) + + +@dataclass +class ServerConfig: + output_root: Path + + @classmethod + def from_args(cls, args: argparse.Namespace) -> ServerConfig: + return cls(output_root=Path(args.output_root).expanduser().resolve()) + + +def _build_harbor_command(payload: Any, job_name: str | None) -> list[str]: + dataset_name = (payload.dataset_name or "terminal-bench").strip() or "terminal-bench" + dataset_version = (payload.dataset_version or "2.0").strip() or "2.0" + cmd = [ + "harbor", + "run", + "-d", + f"{dataset_name}@{dataset_version}", + ] + jobs_dir = payload.output_path + if jobs_dir: + cmd.extend(["--jobs-dir", jobs_dir]) + if job_name: + cmd.extend(["--job-name", job_name]) + + if payload.runner_kwargs: + _append_runner_kwargs(cmd, payload.runner_kwargs) + + return cmd + + +def _build_tb_command(payload: Any, run_id: str, output_root: Path) -> list[str]: + dataset_name = (payload.dataset_name or "terminal-bench-core").strip() or "terminal-bench-core" + dataset_version = (payload.dataset_version or "0.1.1").strip() or "0.1.1" + cmd = [ + "tb", + "run", + "-d", + f"{dataset_name}=={dataset_version}", + ] + output_root = str(Path(payload.output_path or output_root).expanduser()) + Path(output_root).mkdir(parents=True, exist_ok=True) + cmd.extend(["--output-path", output_root, "--run-id", run_id]) + if payload.runner_kwargs: + _append_runner_kwargs(cmd, payload.runner_kwargs) + + return cmd From f1275e66e0f330b2d0496ce033971da107488ba6 Mon Sep 17 00:00:00 2001 From: Xinyu Jiang Date: Thu, 22 Jan 2026 04:34:51 +0000 Subject: [PATCH 6/7] update readme --- examples/eval/terminal_bench/README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/eval/terminal_bench/README.md b/examples/eval/terminal_bench/README.md index 9e8116579..904e63169 100644 --- a/examples/eval/terminal_bench/README.md +++ b/examples/eval/terminal_bench/README.md @@ -1,6 +1,6 @@ # Terminal Bench Eval -This folder wires Terminal Bench (TB) into Miles as an eval delegate. The run happens on the host via `harbor run` (Terminal Bench 2.0, default) or `tb run` (Terminal Bench 1.0, legacy). Miles reads back aggregated metrics such as `accuracy`, `n_resolved`, `n_unresolved`, `pass_at_k/*`, and token stats like `total_input_tokens_mean/median` and `total_output_tokens_mean/median`. +This folder wires Terminal Bench (TB) into Miles as an eval delegate. The run happens on the host via `harbor run` (Terminal Bench 2.0, default) or `tb run` (Terminal Bench 1.0, legacy). Metrics extraction lives in `utils/metrics.py` and command construction lives in `utils/runner.py`. ## What runs where @@ -85,9 +85,11 @@ What it does: `harbor run -d terminal-bench@2.0 --jobs-dir --job-name --model openai/ --agent --agent-kwarg api_base=... 
--n-concurrent ...` - For `runner: tb`, builds a command like: `tb run -d terminal-bench-core==0.1.1 --output-path --run-id --model openai/ --agent --agent-kwarg api_base=... --n-concurrent ...` -- Waits for completion, then returns `accuracy`, `n_resolved`, - `n_unresolved`, `pass_at_k/*`, and token stats such as - `total_input_tokens_mean/median` and `total_output_tokens_mean/median` +- Waits for completion, then returns TB metrics (`accuracy`, `n_resolved`, + `n_unresolved`, `pass_at_k/*`, `total_input_tokens_mean/median/min/max`, + `total_output_tokens_mean/median`) or Harbor metrics (`n_trials`, `n_errors`, + `metrics` entries like `mean`, `reward_stats/*`, `exception_stats/*`, + `n_input_tokens/*`, `n_output_tokens/*`). ## 6) Run the eval script (example) From 91ce722be20e0be92aebc65afeed32eec69977cf Mon Sep 17 00:00:00 2001 From: Xinyu Jiang Date: Thu, 22 Jan 2026 04:36:54 +0000 Subject: [PATCH 7/7] clean --- examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh | 9 --------- 1 file changed, 9 deletions(-) diff --git a/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh b/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh index a2a5d733c..fc827129f 100644 --- a/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh +++ b/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh @@ -133,15 +133,6 @@ ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 4 \ --dashboard-agent-grpc-port 52367 \ --runtime-env-agent-port 52368 -# ray start --head --node-ip-address ${MASTER_ADDR} --port 6381 --num-gpus 2 \ -# --disable-usage-stats \ -# --dashboard-host=0.0.0.0 \ -# --dashboard-port=8267 \ -# --dashboard-agent-listen-port 52266 \ -# --dashboard-agent-grpc-port 52267 \ -# --runtime-env-agent-port 52268 - - RUNTIME_ENV_JSON="{ \"env_vars\": { \"PYTHONPATH\": \"/root/Megatron-LM/\",