MacAgentBench/eval.sh at main · JetAstra/MacAgentBench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/bash

# ================================
# Required Environment Variables
# ================================

# Paths to the system image files used by the container.
# A value already present in the caller's environment takes precedence; the
# literals below are only fallbacks, so the script can be pointed at other
# images without editing it, e.g.:
#   MACOS_ARENA_MAC_HDD_IMG_PATH=/path/to/mac_hdd_ng.img bash eval.sh
export MACOS_ARENA_MAC_HDD_IMG_PATH="${MACOS_ARENA_MAC_HDD_IMG_PATH:-/home/fuyikun/Documents/ckpt/all4/mac_hdd_ng.img}"
export MACOS_ARENA_BASESYSTEM_IMG_PATH="${MACOS_ARENA_BASESYSTEM_IMG_PATH:-/home/fuyikun/Documents/BaseSystem.img}"

# Optional: export keys for GPT-based models if used
# export OPENAI_API_KEY="your-api-key-here"
# export ANTHROPIC_API_KEY="your-api-key-here"

# ================================
# Configurable constants
# ================================

# WORK_DIR: directory the script changes into before launching batch_run.
# TASK_ROOT: task directory inside WORK_DIR, passed as --task_root.
WORK_DIR='/home/fuyikun/Documents/OS-Mac/evaluation/MacOSArena'
TASK_ROOT="${WORK_DIR}/task"

# DOMAINS: task domains to evaluate, passed as --domains.
#          This run lists individual domains explicitly. Per the upstream
#          note, "single_app" selects all single-app tasks and "multi_app"
#          selects all cross-app tasks.
DOMAINS=(
  'new_apple_notes' 'new_blogwatcher' 'new_clawhub'
  'new_gifgrep' 'new_github' 'new_himalaya'
  'new_obsidian' 'new_peekaboo' 'new_reminders'
  'new_sherpa_onnx_tts' 'new_songsee' 'new_tmux'
  'new_video_frames' 'new_weather' 'new_whisper'
  'keynote' 'numbers' 'pages'
)

# MODELS: agent model names to evaluate.
#         Set to ("none") to use a planner + grounder agent instead; that mode
#         requires PLANNER_EXECUTOR_MODEL to be filled in.
MODELS=('openclaw')

# MODEL_TYPE_LIST: explicit model type for each entry of MODELS.
MODEL_TYPE_LIST=('openclaw')

# URL_LIST: model API URL for each entry of MODELS, in the same order.
#           A single empty string here, i.e. one (blank) URL for the one model.
URL_LIST=('')

# PLANNER_EXECUTOR_MODEL: (planner, executor) pairs; used when MODELS=("none").
# EXEC_MODEL_URL_LIST:    URL for each executor.
# Both are left empty for this run.
PLANNER_EXECUTOR_MODEL=()
EXEC_MODEL_URL_LIST=()

# MODEL_SUB_DIR: subdirectory under RESULT_ROOT/{model_name}/ that receives
#                the logs of this evaluation run.
MODEL_SUB_DIR='claude-opus-4-6-thinking'

# DEBUG_OPENCLAW: "true" disables recording (useful while debugging
#                 startup/gateway issues); any other value leaves it enabled.
DEBUG_OPENCLAW='false'

# CONFIG_FILE: YAML configuration file, relative to WORK_DIR.
CONFIG_FILE='config/default_config_linux.yaml'  # For Linux
# CONFIG_FILE='config/default_config.yaml'      # For WSL

# RESULT_ROOT: root directory for all agent evaluation outputs.
# LOG_DIR:     directory for the console log written by this script.
RESULT_ROOT="${WORK_DIR}/results"
LOG_DIR="${WORK_DIR}/logs"

# ================================
# Preparation
# ================================

# Enter the work directory and make sure the output directories exist.
# Every step is checked: without LOG_DIR the run would otherwise carry on with
# no log file, and without RESULT_ROOT there is nowhere to store results.
if ! cd "${WORK_DIR}"; then
  printf 'ERROR: cannot change directory to WORK_DIR=%s\n' "${WORK_DIR}" >&2
  exit 1
fi
if ! mkdir -p "${RESULT_ROOT}" "${LOG_DIR}"; then
  printf 'ERROR: cannot create RESULT_ROOT=%s or LOG_DIR=%s\n' \
    "${RESULT_ROOT}" "${LOG_DIR}" >&2
  exit 1
fi

# One log file per invocation, named after the start time.
RUN_TS="$(date +%Y%m%d_%H%M%S)"
LOG_FILE="${LOG_DIR}/eval_${RUN_TS}.log"

# From here on stdout and stderr both go through tee, which appends them to
# LOG_FILE while still writing them to the script's original stdout.
exec > >(tee -a "${LOG_FILE}") 2>&1

# Record the effective configuration at the top of the log so a run can be
# identified later, including the settings that decide where results land.
printf '%s\n' \
  "WORK_DIR=${WORK_DIR}" \
  "TASK_ROOT=${TASK_ROOT}" \
  "DOMAINS=${DOMAINS[*]}" \
  "MODELS=${MODELS[*]}" \
  "MODEL_TYPE_LIST=${MODEL_TYPE_LIST[*]}" \
  "RESULT_ROOT=${RESULT_ROOT}" \
  "MODEL_SUB_DIR=${MODEL_SUB_DIR}" \
  "CONFIG_FILE=${CONFIG_FILE}" \
  "DEBUG_OPENCLAW=${DEBUG_OPENCLAW}" \
  "LOG_FILE=${LOG_FILE}"

# ================================
# Run evaluation
# ================================

# ⚠️ NOTE:
# This command requires `sudo` because the evaluation will remove and start Docker containers before each task.
# Make sure that the `python` used under sudo is still the one from your conda environment.
# You can check it via `sudo which python`, and replace `python` with the absolute path if needed.
# (i.e., the result of `which python` in your activated conda environment).

# Interpreter to run under sudo. sudo does not necessarily resolve `python` to
# the conda environment's interpreter, so an absolute path is used. Set
# MACOS_ARENA_PYTHON to point at a different interpreter without editing this
# file; the default is the path this script has always used.
PYTHON_BIN="${MACOS_ARENA_PYTHON:-/home/fuyikun/miniconda3/envs/eval/bin/python}"

# Optional flags are collected in an array so that each one reaches batch_run
# as exactly one argument, and nothing at all is added when none apply.
EXTRA_ARGS=()
if [[ "${DEBUG_OPENCLAW}" == "true" ]]; then
  EXTRA_ARGS+=(--disable_recording)
fi

# The image paths are passed as VAR=value arguments to sudo so that they are
# set in the environment of the command it runs. List options are expanded
# from their arrays; an empty array leaves the option with no values.
sudo \
  MACOS_ARENA_MAC_HDD_IMG_PATH="${MACOS_ARENA_MAC_HDD_IMG_PATH}" \
  MACOS_ARENA_BASESYSTEM_IMG_PATH="${MACOS_ARENA_BASESYSTEM_IMG_PATH}" \
  "${PYTHON_BIN}" -m batch_run \
  --task_root "${TASK_ROOT}" \
  --domains "${DOMAINS[@]}" \
  --models "${MODELS[@]}" \
  --url_list "${URL_LIST[@]}" \
  --model_type_list "${MODEL_TYPE_LIST[@]}" \
  --planner_executor_model "${PLANNER_EXECUTOR_MODEL[@]}" \
  --exec_model_url_list "${EXEC_MODEL_URL_LIST[@]}" \
  --model_sub_dir "${MODEL_SUB_DIR}" \
  --config_file "${CONFIG_FILE}" \
  --result_root "${RESULT_ROOT}" \
  "${EXTRA_ARGS[@]}"