radixark · XinyuJiangCMU · Jan 15, 2026 · Jan 17, 2026 · Jan 17, 2026 · Jan 17, 2026
diff --git a/examples/eval/scripts/multi_tasks.yaml → ...eval/scripts/nemo_skills/multi_tasks.yaml b/examples/eval/scripts/multi_tasks.yaml → ...eval/scripts/nemo_skills/multi_tasks.yaml
diff --git a/examples/eval/scripts/run-qwen3-32B.sh → ...eval/scripts/nemo_skills/run-qwen3-32B.sh b/examples/eval/scripts/run-qwen3-32B.sh → ...eval/scripts/nemo_skills/run-qwen3-32B.sh
@@ -29,11 +29,11 @@ fi
 echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
 
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." &>/dev/null && pwd)"
+REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../../.." &>/dev/null && pwd)"
 source "${REPO_ROOT}/scripts/models/qwen3-32B.sh"
 
 # Store eval/delegate settings in a YAML config similar to examples/eval_multi_task.
-EVAL_CONFIG_PATH=${SKILLS_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/multi_tasks.yaml"}
+EVAL_CONFIG_PATH=${SKILLS_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/nemo_skills/multi_tasks.yaml"}
 
 CKPT_ARGS=(
    --hf-checkpoint /root/shared/Qwen3-32B

diff --git a/examples/eval/scripts/run-qwen3-4B.sh → .../eval/scripts/nemo_skills/run-qwen3-4B.sh b/examples/eval/scripts/run-qwen3-4B.sh → .../eval/scripts/nemo_skills/run-qwen3-4B.sh
@@ -29,11 +29,11 @@ fi
 echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
 
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." &>/dev/null && pwd)"
+REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../../.." &>/dev/null && pwd)"
 source "${REPO_ROOT}/scripts/models/qwen3-4B.sh"
 
 # Store eval/delegate settings in a YAML config similar to examples/eval_multi_task.
-EVAL_CONFIG_PATH=${SKILLS_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/multi_tasks.yaml"}
+EVAL_CONFIG_PATH=${SKILLS_EVAL_CONFIG_PATH:-"${REPO_ROOT}/examples/eval/scripts/nemo_skills/multi_tasks.yaml"}
 
 CKPT_ARGS=(
    --hf-checkpoint /root/Qwen3-4B

diff --git a/examples/eval/scripts/terminal_bench/harbor_runner.yaml b/examples/eval/scripts/terminal_bench/harbor_runner.yaml
@@ -0,0 +1,42 @@
+eval:
+  defaults:
+    n_samples_per_eval_prompt: 1
+    temperature: 0.6
+    top_p: 0.95
+    top_k: -1
+    max_response_len: 24576
+  datasets: # these eval tasks go through miles dataset config and default rollout function (miles.rollout.sglang_rollout.generate_rollout)
+    - name: gpqa    # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa
+      path: /root/gpqa/gpqa_eval.jsonl
+      rm_type: gpqa
+      n_samples_per_eval_prompt: 2
+    - name: ifbench   # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/ifbench
+      path: /root/ifbench/IFBench_eval.jsonl
+      rm_type: ifbench
+      n_samples_per_eval_prompt: 1
+  delegate:
+    - name: terminal_bench
+      url: http://172.17.0.1:9051 # Port must match the TB server running on the host machine
+      timeout_secs: 86400  # 24 hours
+      max_retries: 1 # HTTP request retries from Miles to the TB server
+      model_name: qwen3-8b
+      agent_name: terminus-2
+      api_base: http://127.0.0.1:30005/v1 # Port must match the sglang router port set in run-eval-tb-qwen.sh
+      runner: harbor
+      dataset_name: terminal-bench
+      dataset_version: "2.0"
+      output_path: harbor_runner_jobs
+      n_concurrent: 32
+      runner_kwargs:
+        # task_name:
+        #   - fix-git
+        # debug: true
+        # timeout_multiplier: 2.0
+        # retry_exclude:
+        #   - AgentTimeoutError
+        #   - VerifierTimeoutError
+        n_attempts: 2
+        agent_kwarg:
+          model_info:
+            max_input_tokens: 40960
+            max_output_tokens: 8192
diff --git a/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh b/examples/eval/scripts/terminal_bench/run-eval-tb-qwen.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+# Example launcher that reuses the Qwen3-8B recipe but delegates evaluation to an
+# external Terminal Bench server via the eval_delegate_rollout wrapper.
+
+# Clean up any stale processes from a previous run.
+pkill -9 sglang
+sleep 3
+ray stop --force
+pkill -9 ray
+pkill -9 python
+sleep 3
+pkill -9 ray
+pkill -9 python
+
+set -ex
+
+export PYTHONBUFFERED=16
+export MILES_HOST_IP=${MILES_HOST_IP:-"127.0.0.1"}
+
+MODEL_DIR="${MODEL_DIR:-/root/.cache/huggingface}"
+export MODEL_DIR
+
+NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
+if [ "$NVLINK_COUNT" -gt 0 ]; then
+    HAS_NVLINK=1
+else
+    HAS_NVLINK=0
+fi
+echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../../.." &>/dev/null && pwd)"
+source "${REPO_ROOT}/scripts/models/qwen3-8B.sh"
+
+EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${SCRIPT_DIR}/harbor_runner.yaml"}
+# EVAL_CONFIG_PATH=${TB_EVAL_CONFIG_PATH:-"${SCRIPT_DIR}/tb_runner.yaml"}
+
+CKPT_ARGS=(
+   --hf-checkpoint ${MODEL_DIR}/OpenThinker-Agent-v1 # huggingface-cli download open-thoughts/OpenThinker-Agent-v1
+   --ref-load ${MODEL_DIR}/OpenThinker-Agent-v1_torch_dist
+   # --load ${MODEL_DIR}/OpenThinker-Agent-v1_miles/
+   --save ${MODEL_DIR}/OpenThinker-Agent-v1_miles/
+   --save-interval 20
+)
+
+ROLLOUT_ARGS=(
+   --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
+   --input-key prompt
+   --label-key label
+   --apply-chat-template
+   --rollout-shuffle
+   --rm-type deepscaler
+   # --num-rollout 3000
+   --num-rollout 1
+   --rollout-batch-size 32
+   --n-samples-per-prompt 8
+   --rollout-max-response-len 8192
+   --rollout-temperature 1
+   --global-batch-size 256
+   --balance-data
+)
+
+EVAL_ARGS=(
+   --eval-interval 1
+   --eval-config "${EVAL_CONFIG_PATH}"
+   --eval-function-path examples.eval.eval_delegate_rollout.generate_rollout
+)
+
+PERF_ARGS=(
+   --tensor-model-parallel-size 1
+   --pipeline-model-parallel-size 1
+   --context-parallel-size 1
+   --expert-model-parallel-size 1
+   --expert-tensor-parallel-size 1
+
+   --recompute-granularity full
+   --recompute-method uniform
+   --recompute-num-layers 1
+
+   --use-dynamic-batch-size
+   --max-tokens-per-gpu 9216
+)
+
+GRPO_ARGS=(
+   --advantage-estimator grpo
+   --use-kl-loss
+   --kl-loss-coef 0.00
+   --kl-loss-type low_var_kl
+   --entropy-coef 0.00
+   --eps-clip 0.2
+   --eps-clip-high 0.28
+)
+
+OPTIMIZER_ARGS=(
+   --optimizer adam
+   --lr 1e-6
+   --lr-decay-style constant
+   --weight-decay 0.1
+   --adam-beta1 0.9
+   --adam-beta2 0.98
+)
+
+WANDB_ARGS=(
+   --use-wandb
+   --wandb-project miles-tb
+   --wandb-group qwen3-8b-eval
+   --wandb-key ${WANDB_KEY}   # export WANDB_KEY="your_key"
+)
+
+SGLANG_ARGS=(
+   --rollout-num-gpus-per-engine 1
+   --sglang-mem-fraction-static 0.7
+   --sglang-router-port 30005
+)
+
+MISC_ARGS=(
+   --attention-dropout 0.0
+   --hidden-dropout 0.0
+   --accumulate-allreduce-grads-in-fp32
+   --attention-softmax-in-fp32
+   --attention-backend flash
+)
+
+export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+export CUDA_VISIBLE_DEVICES=4,5,6,7
+
+ray start --head --node-ip-address ${MASTER_ADDR} --port 6380 --num-gpus 4 \
+            --disable-usage-stats \
+            --dashboard-host=0.0.0.0 \
+            --dashboard-port=8266 \
+            --dashboard-agent-listen-port 52366 \
+            --dashboard-agent-grpc-port 52367 \
+            --runtime-env-agent-port 52368
+
+RUNTIME_ENV_JSON="{
+  \"env_vars\": {
+    \"PYTHONPATH\": \"/root/Megatron-LM/\",
+    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\"
+  }
+}"
+
+ray job submit --address="http://${MASTER_ADDR}:8266" \
+   --working-dir "${REPO_ROOT}" \
+   --runtime-env-json="${RUNTIME_ENV_JSON}" \
+   -- python3 train.py \
+   --actor-num-nodes 1 \
+   --actor-num-gpus-per-node 4 \
+   --colocate \
+   ${MODEL_ARGS[@]} \
+   ${CKPT_ARGS[@]} \
+   ${ROLLOUT_ARGS[@]} \
+   ${OPTIMIZER_ARGS[@]} \
+   ${GRPO_ARGS[@]} \
+   ${WANDB_ARGS[@]} \
+   ${PERF_ARGS[@]} \
+   ${EVAL_ARGS[@]} \
+   ${SGLANG_ARGS[@]} \
+   ${MISC_ARGS[@]}
diff --git a/examples/eval/scripts/terminal_bench/tb_runner.yaml b/examples/eval/scripts/terminal_bench/tb_runner.yaml
@@ -0,0 +1,34 @@
+eval:
+  defaults:
+    n_samples_per_eval_prompt: 1
+    temperature: 0.6
+    top_p: 0.95
+    top_k: -1
+    max_response_len: 24576
+  datasets: # these eval tasks go through miles dataset config and default rollout function (miles.rollout.sglang_rollout.generate_rollout)
+    - name: gpqa    # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa
+      path: /root/gpqa/gpqa_eval.jsonl
+      rm_type: gpqa
+      n_samples_per_eval_prompt: 2
+    - name: ifbench   # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/ifbench
+      path: /root/ifbench/IFBench_eval.jsonl
+      rm_type: ifbench
+      n_samples_per_eval_prompt: 1
+  delegate:
+    - name: terminal_bench
+      url: http://172.17.0.1:9051 # Port must match the TB server running on the host machine
+      timeout_secs: 86400  # 24 hours
+      max_retries: 1 # HTTP request retries from Miles to the TB server
+      model_name: qwen3-8b
+      agent_name: terminus-2
+      api_base: http://127.0.0.1:30005/v1 # Port must match the sglang router port set in run-eval-tb-qwen.sh
+      runner: tb
+      dataset_name: terminal-bench-core
+      dataset_version: "0.1.1"
+      output_path: tb_runner_jobs
+      n_concurrent: 16
+      # runner_kwargs:
+        # task_id:
+        #   - hello-world
+        # n_tasks: 10
+        # example_flag: value