-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathbench_runner.sh
More file actions
executable file
·345 lines (304 loc) · 11.8 KB
/
bench_runner.sh
File metadata and controls
executable file
·345 lines (304 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#!/bin/bash
# Autonomous benchmark runner — runs on each Vultr instance at boot.
#
# Reads the model list from /root/benchmark_models.txt (written at first boot
# by a per-instance Vultr startup script created by the orchestrator — see the
# "Read model list" section below), runs registration and benchmarks for each
# model, and self-destructs the instance on full success.
#
# This script is invoked by bench-runner.service (systemd) on first boot.
# It should be placed at /root/run_benchmarks.sh on the snapshot image.
# Use setup_snapshot.sh to install it.
# pipefail early so the exec/tee pipeline below reports failures; -u is
# deferred until after the profiles are sourced (see note below).
set -o pipefail
# Everything below is tee'd into this log (see the exec redirect).
LOG="/var/log/bench-runner.log"
# Git checkout containing benchmark.py; refreshed via 'git pull' later.
REMOTE_DIR="/root/skill/scripts"
# Vultr instance metadata service (link-local address).
METADATA="http://169.254.169.254"
# Tee all output to log file
exec > >(tee -a "$LOG") 2>&1
# ── Load environment first, before strict mode ──
# .bashrc references $PS1 which is unbound in non-interactive shells.
# Source with -u disabled to avoid a fatal error, then enable strict mode.
source /root/.profile 2>/dev/null || true
source /root/.bashrc 2>/dev/null || true
# allexport so every VAR=val line in /etc/environment is exported (that file
# is not a shell script and does not use 'export' itself).
set -o allexport
source /etc/environment 2>/dev/null || true
set +o allexport
# Ensure tool paths are present regardless of how systemd invoked this script
export PATH="/root/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:$PATH"
# Ensure HOME is set — systemd may not set it, but the Vultr CLI needs it to
# locate its config directory ($XDG_CONFIG_HOME falls back to $HOME/.config).
export HOME="${HOME:-/root}"
# Now enable strict mode (unbound variables are safe after sourcing profiles)
set -uo pipefail
# ── Slack notification helper ──
# Reads SLACK_WEBHOOK_URL from /etc/environment (set during snapshot setup).
# Silently no-ops if the variable is unset so the script still works without Slack.
#
# Arguments:
#   $1 - message text (may be multi-line; JSON-encoded via jq before sending)
# Returns 0 in every case; a failed webhook POST only prints a warning.
slack_notify() {
local text="$1"
if [ -z "${SLACK_WEBHOOK_URL:-}" ]; then
return 0
fi
# printf '%s' instead of echo: echo mangles messages that begin with '-n'/'-e'
# or contain backslashes, and appends a stray trailing newline to the payload.
# jq -Rs reads the raw text (newlines included) and emits one JSON string.
curl -sf -X POST "$SLACK_WEBHOOK_URL" \
  -H "Content-Type: application/json" \
  -d "{\"text\": $(printf '%s' "$text" | jq -Rs .)}" \
  >/dev/null 2>&1 || echo "WARNING: Slack notification failed"
}
echo "=== Benchmark runner started at $(date -u) ==="
echo "Hostname: $(hostname)"
# ── Verify required tools ──
# Fail early with an explicit message rather than dying mid-run on a
# missing binary.
for tool in curl jq uv vultr; do
  command -v "$tool" >/dev/null 2>&1 && continue
  echo "ERROR: '$tool' not found in PATH (PATH=$PATH)"
  exit 1
done
# ── Get instance ID and IP from Vultr metadata ──
# The metadata service may not be reachable immediately at boot, so poll for
# up to ~2 minutes (12 attempts x 10s, plus curl's own retries).
echo "Fetching instance metadata..."
INSTANCE_ID=""
INSTANCE_IP=""
for attempt in $(seq 1 12); do
  INSTANCE_ID=$(curl -sf --retry 3 --retry-delay 5 "$METADATA/v1/instance-v2-id" 2>/dev/null || true)
  INSTANCE_IP=$(curl -sf --retry 3 --retry-delay 5 "$METADATA/v1/interfaces/0/ipv4/address" 2>/dev/null || true)
  if [ -n "$INSTANCE_ID" ] && [ -n "$INSTANCE_IP" ]; then
    break
  fi
  echo " Metadata not ready (attempt $attempt/12), retrying in 10s..."
  sleep 10
done
# Validate BOTH values — the loop only breaks when both are present, so an
# ID-only result means the IP lookups kept failing; the Slack messages and
# debugging hints below rely on the IP being set.
if [ -z "$INSTANCE_ID" ] || [ -z "$INSTANCE_IP" ]; then
  echo "ERROR: Could not retrieve instance metadata (id='$INSTANCE_ID', ip='$INSTANCE_IP') after 2 minutes"
  exit 1
fi
echo "Instance ID: $INSTANCE_ID"
echo "Instance IP: $INSTANCE_IP"
# ── Register a safety-net self-destruct in 5 hours ──
# This fires even if the benchmarks hang or the runner crashes after registration.
# Requires 'at' to be available (installed by setup_snapshot.sh).
if command -v at &>/dev/null && [ -n "$INSTANCE_ID" ]; then
  if echo "vultr instance delete $INSTANCE_ID" | at now + 5 hours 2>/dev/null; then
    echo "Safety-net self-destruct scheduled in 5 hours"
  else
    echo "WARNING: Could not schedule safety-net self-destruct (at daemon not running?)"
  fi
fi
# ── Read model list from file ──
# The orchestrator creates a Vultr startup script per instance that writes
# /root/benchmark_models.txt (one model per line) at first boot.
# Startup scripts run regardless of snapshot vs OS image, unlike --userdata
# which is silently ignored on snapshot-based instances.
MODEL_FILE="/root/benchmark_models.txt"
# Wait for the startup script to run and write the file (it runs early in boot
# but may not be done by the time this service starts)
echo "Waiting for model assignment file..."
for attempt in $(seq 1 12); do
  if [ -s "$MODEL_FILE" ]; then
    break
  fi
  echo " $MODEL_FILE not ready yet (attempt $attempt/12), waiting 10s..."
  sleep 10
done
if [ ! -s "$MODEL_FILE" ]; then
  echo "ERROR: $MODEL_FILE not found or empty after 2 minutes"
  echo "Was this instance launched by the orchestrator?"
  exit 1
fi
echo "Model assignment file:"
cat "$MODEL_FILE"
# Skip blank/whitespace-only lines so a trailing newline in the file does not
# become an empty model name passed to 'benchmark.py --model'.
mapfile -t MODELS < <(grep -v '^[[:space:]]*$' "$MODEL_FILE")
if [ ${#MODELS[@]} -eq 0 ]; then
  echo "ERROR: No models found in $MODEL_FILE"
  exit 1
fi
echo "Models assigned to this instance:"
for m in "${MODELS[@]}"; do
  echo " - $m"
done
# ── Read optional official key ──
# Precedence: key file dropped by the orchestrator, then the
# PINCHBENCH_OFFICIAL_KEY environment variable, else run unofficial.
# OFFICIAL_KEY_ARG is an array so it expands to zero words when no key exists.
OFFICIAL_KEY_FILE="/root/benchmark_official_key.txt"
OFFICIAL_KEY_ARG=()
OFFICIAL_KEY_SET="no"
if [ -s "$OFFICIAL_KEY_FILE" ]; then
  OFFICIAL_KEY=$(cat "$OFFICIAL_KEY_FILE")
  OFFICIAL_KEY_SET="yes"
  OFFICIAL_KEY_ARG=(--official-key "$OFFICIAL_KEY")
  echo "Official key loaded from $OFFICIAL_KEY_FILE"
elif [ -n "${PINCHBENCH_OFFICIAL_KEY:-}" ]; then
  OFFICIAL_KEY_SET="yes"
  OFFICIAL_KEY_ARG=(--official-key "$PINCHBENCH_OFFICIAL_KEY")
  echo "Official key loaded from PINCHBENCH_OFFICIAL_KEY env var"
else
  echo "No official key found — submissions will be unofficial"
fi
# ── Pull latest benchmark code ──
echo ""
echo "=== Updating benchmark code ==="
# Abort if the checkout is missing: without this guard (there is no 'set -e'
# active for cd), git pull and the later 'uv run benchmark.py' calls would
# silently run in whatever directory the service started in.
cd "$REMOTE_DIR" || { echo "ERROR: benchmark directory $REMOTE_DIR not found"; exit 1; }
git pull || echo "WARNING: git pull failed, continuing with existing code"
# Recorded for the Slack "started" message so runs are traceable to a commit.
SKILL_GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
# ── Registration ──
echo ""
echo "=== Running registration ==="
# Use a temp file so output is both written to the log (via the exec redirect)
# and available for URL extraction. Command substitution $(...) runs in a subshell
# that doesn't inherit the exec redirect, so tee /dev/stderr wouldn't reach the log.
REG_TMPFILE=$(mktemp)
uv run benchmark.py --register "${OFFICIAL_KEY_ARG[@]}" 2>&1 | tee "$REG_TMPFILE"
# PIPESTATUS[0] is benchmark.py's exit code; plain $? would report tee's.
REGISTER_EXIT=${PIPESTATUS[0]}
REGISTRATION_OUTPUT=$(cat "$REG_TMPFILE")
rm -f "$REG_TMPFILE"
if [ "$REGISTER_EXIT" -ne 0 ]; then
echo "ERROR: Registration failed"
# Exit without self-destructing so the instance can be inspected; the 'at'
# safety net scheduled earlier still deletes it in 5 hours.
slack_notify "❌ *bench-runner failed* on \`$(hostname)\` ($INSTANCE_ID)
IP: \`$INSTANCE_IP\`
Registration failed after assigning models: ${MODELS[*]}
Instance retained for debugging.
Check: \`ssh root@$INSTANCE_IP tail -f $LOG\`"
exit 1
fi
echo "✓ Registration complete"
# Unset any pre-baked PINCHBENCH_TOKEN from the snapshot environment so that
# benchmark runs use the freshly registered token saved to the config file.
# lib_upload._resolve_token checks the env var before the config file, so
# a stale snapshot token would otherwise shadow the new one and cause 401s.
unset PINCHBENCH_TOKEN
# Extract claim URL ("Claim URL: https://...") from registration output
CLAIM_URL=$(echo "$REGISTRATION_OUTPUT" | grep -i "Claim URL" | grep -oE 'https?://[^ ]+' | head -1 || true)
# One bullet line per model for the Slack "started" message.
MODEL_LIST=$(printf ' • %s\n' "${MODELS[@]}")
slack_notify "🚀 *bench-runner started* on \`$(hostname)\` ($INSTANCE_ID)
IP: \`$INSTANCE_IP\`
Skill git hash: \`$SKILL_GIT_HASH\`
Official key set: \`$OFFICIAL_KEY_SET\`
Models (${#MODELS[@]}):
$MODEL_LIST
${CLAIM_URL:+Claim URL: $CLAIM_URL}"
# ── Run benchmarks ──
# Accumulators consumed by the summary section below. Exit code 3 from
# benchmark.py is the fail-fast signal: remaining models are skipped.
FAILED_MODELS=()
RESULT_URLS=()
SKIPPED_MODELS=()
FAIL_FAST_TRIGGERED=0
FAIL_FAST_MODEL=""
for i in "${!MODELS[@]}"; do
model="${MODELS[$i]}"
echo ""
echo "=== Benchmarking: $model ==="
echo "Started at: $(date -u)"
# Same temp-file pattern as registration: tee to the log while keeping a
# copy for score/URL extraction.
MODEL_TMPFILE=$(mktemp)
uv run benchmark.py --model "$model" "${OFFICIAL_KEY_ARG[@]}" 2>&1 | tee "$MODEL_TMPFILE"
# Exit code of benchmark.py, not tee.
MODEL_EXIT=${PIPESTATUS[0]}
MODEL_OUTPUT=$(cat "$MODEL_TMPFILE")
rm -f "$MODEL_TMPFILE"
# Extract score, submission ID, and leaderboard URL from model run output
MODEL_SCORE=$(echo "$MODEL_OUTPUT" | grep -oP "Overall Score: \K[\d.]+% \([\d.]+ / [\d.]+\)" | head -1 || true)
MODEL_SUBMISSION=$(echo "$MODEL_OUTPUT" | grep -i "Submission ID" | grep -oE '[a-f0-9-]{36}' | head -1 || true)
MODEL_URL=$(echo "$MODEL_OUTPUT" | grep -i "View at" | grep -oE 'https?://[^ ]+' | head -1 || true)
# Build result entry with score and URL
RESULT_ENTRY="$model"
if [ -n "$MODEL_SCORE" ]; then
RESULT_ENTRY="$RESULT_ENTRY: $MODEL_SCORE"
fi
if [ -n "$MODEL_URL" ]; then
RESULT_ENTRY="$RESULT_ENTRY — $MODEL_URL"
fi
if [ -n "$MODEL_SUBMISSION" ]; then
RESULT_ENTRY="$RESULT_ENTRY (submission: $MODEL_SUBMISSION)"
fi
RESULT_URLS+=("$RESULT_ENTRY")
if [ "$MODEL_EXIT" -eq 0 ]; then
echo "✓ $model complete at $(date -u)"
elif [ "$MODEL_EXIT" -eq 3 ]; then
# Fail fast: record the failure, remember which models never ran,
# notify Slack, and stop the loop.
echo "🚨 FAIL FAST triggered by $model (exit code 3)"
FAILED_MODELS+=("$model")
# Everything after index i was never attempted.
if [ "$i" -lt "$(( ${#MODELS[@]} - 1 ))" ]; then
SKIPPED_MODELS=("${MODELS[@]:$((i + 1))}")
fi
FAIL_FAST_TRIGGERED=1
FAIL_FAST_MODEL="$model"
FAIL_FAST_SKIPPED_LIST=""
if [ ${#SKIPPED_MODELS[@]} -gt 0 ]; then
FAIL_FAST_SKIPPED_LIST=$(printf ' • %s\n' "${SKIPPED_MODELS[@]}")
fi
slack_notify "🚨 *bench-runner fail-fast* on \`$(hostname)\` ($INSTANCE_ID)
IP: \`$INSTANCE_IP\`
Model: \`$model\`
Reason: sanity check scored 0% (exit code 3)
Skipped (${#SKIPPED_MODELS[@]}):
$FAIL_FAST_SKIPPED_LIST"
break
else
# Any other non-zero exit: record the failure and move on to the next model.
echo "✗ $model failed at $(date -u)"
FAILED_MODELS+=("$model")
fi
done
# ── Summary ──
echo ""
echo "=== Run complete at $(date -u) ==="
echo "Total models: ${#MODELS[@]}"
echo "Failed: ${#FAILED_MODELS[@]}"
if [ "$FAIL_FAST_TRIGGERED" -eq 1 ]; then
echo "Fail-fast: yes (triggered by $FAIL_FAST_MODEL)"
echo "Skipped: ${#SKIPPED_MODELS[@]}"
fi
if [ ${#FAILED_MODELS[@]} -gt 0 ]; then
echo "Failed models:"
for m in "${FAILED_MODELS[@]}"; do
echo " - $m"
done
fi
if [ ${#SKIPPED_MODELS[@]} -gt 0 ]; then
echo "Skipped models:"
for m in "${SKIPPED_MODELS[@]}"; do
echo " - $m"
done
fi
# PROCESSED = models actually attempted; SUCCEEDED = attempted minus failed.
PROCESSED=$(( ${#MODELS[@]} - ${#SKIPPED_MODELS[@]} ))
SUCCEEDED=$(( PROCESSED - ${#FAILED_MODELS[@]} ))
# Choose the Slack summary flavor: fail-fast beats plain failures beats all-green.
if [ "$FAIL_FAST_TRIGGERED" -eq 1 ]; then
SUMMARY_EMOJI="🚨"
SKIPPED_LIST=""
if [ ${#SKIPPED_MODELS[@]} -gt 0 ]; then
SKIPPED_LIST=$(printf ' • %s\n' "${SKIPPED_MODELS[@]}")
fi
SUMMARY_STATUS="fail-fast triggered by $FAIL_FAST_MODEL after $SUCCEEDED/${#MODELS[@]} succeeded.
Skipped (${#SKIPPED_MODELS[@]}):
$SKIPPED_LIST"
elif [ ${#FAILED_MODELS[@]} -eq 0 ]; then
SUMMARY_EMOJI="✅"
SUMMARY_STATUS="all ${#MODELS[@]} models completed"
else
SUMMARY_EMOJI="⚠️"
FAILED_LIST=$(printf ' • %s\n' "${FAILED_MODELS[@]}")
SUMMARY_STATUS="$SUCCEEDED/${#MODELS[@]} succeeded. Failed:
$FAILED_LIST"
fi
# Only a fully clean run earns self-destruction; any failure or fail-fast
# keeps the instance for inspection. Read by the self-destruct section below.
SHOULD_DESTROY=1
if [ "$FAIL_FAST_TRIGGERED" -eq 1 ] || [ ${#FAILED_MODELS[@]} -gt 0 ]; then
SHOULD_DESTROY=0
fi
if [ "$SHOULD_DESTROY" -eq 1 ]; then
LIFECYCLE_NOTE="Destroying instance now."
else
LIFECYCLE_NOTE="Instance retained for debugging (safety-net auto-delete is still scheduled)."
fi
# Optional per-model results section (entries collected in the benchmark loop).
RESULTS_SECTION=""
if [ ${#RESULT_URLS[@]} -gt 0 ]; then
RESULTS_SECTION="
Results:
$(printf ' • %s\n' "${RESULT_URLS[@]}")"
fi
slack_notify "$SUMMARY_EMOJI *bench-runner done* on \`$(hostname)\` ($INSTANCE_ID)
IP: \`$INSTANCE_IP\`
$SUMMARY_STATUS
${CLAIM_URL:+Claim URL: $CLAIM_URL}$RESULTS_SECTION
$LIFECYCLE_NOTE"
# ── Self-destruct ──
# Clean runs reclaim their instance; failed runs leave it alive so the logs
# can be inspected (the 'at' safety net still deletes it eventually).
if [ "$SHOULD_DESTROY" -ne 1 ]; then
  echo ""
  echo "=== Instance retained for debugging: $INSTANCE_ID ($INSTANCE_IP) ==="
  echo "Run when done: vultr instance delete $INSTANCE_ID"
else
  echo ""
  echo "=== Deleting instance $INSTANCE_ID ==="
  if vultr instance delete "$INSTANCE_ID"; then
    echo "✓ Instance deletion requested"
  else
    echo "WARNING: Self-destruct failed — instance $INSTANCE_ID may need manual cleanup"
    echo "Run: vultr instance delete $INSTANCE_ID"
  fi
fi