Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion benchmark/extract-results.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import { join, basename } from 'path';
const RESULTS_DIR = join(import.meta.dir, 'results');
const JOBS_DIR = join(import.meta.dir, '..', 'jobs');

const KNOWN_MODELS = ['opus', 'sonnet', 'haiku', 'codex', 'gemini', 'glm'];
const KNOWN_MODELS = ['opus', 'sonnet', 'haiku', 'codex', 'codex53', 'gemini', 'gemini-flash', 'glm', 'kimi'];

interface Sample {
timestamp: string;
Expand Down
74 changes: 70 additions & 4 deletions benchmark/generate-tasks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,18 @@ const skillTrackerB64 = Buffer.from(
readFileSync(join(SHARED_DIR, 'skill_tracker.ts'), 'utf-8')
).toString('base64');

// GP task assets: each file is read from SHARED_DIR as UTF-8 text and re-encoded
// as base64. GP_DOCKERFILE embeds these strings in `echo '<b64>' | base64 -d`
// commands, so the file contents never have to be shell-quoted.

// Per-loop instructions; decoded to /app/gp_loop_instruction.md in the image.
const gpLoopInstructionB64 = Buffer.from(
readFileSync(join(SHARED_DIR, 'gp_loop_instruction.md'), 'utf-8')
).toString('base64');

// Save-file generator script; decoded to /app/benchmark/shared/generate_gp_saves.ts.
const generateGpSavesB64 = Buffer.from(
readFileSync(join(SHARED_DIR, 'generate_gp_saves.ts'), 'utf-8')
).toString('base64');

// GP verifier script; decoded to /app/benchmark/shared/check_gp.ts.
const checkGpB64 = Buffer.from(
readFileSync(join(SHARED_DIR, 'check_gp.ts'), 'utf-8')
).toString('base64');

// Inject GEMINI.md so Gemini CLI gets project instructions (like CLAUDE.md for Claude Code)
const geminiMdB64 = Buffer.from(
readFileSync(join(BENCHMARK_DIR, '..', 'GEMINI.md'), 'utf-8')
Expand Down Expand Up @@ -132,6 +144,45 @@ RUN echo 'SERVER=localhost' >> /app/bots/agent/bot.env
RUN printf '#!/bin/bash\\n/start-services.sh\\nmkdir -p /logs/verifier\\ncd /app\\nexport TRACKING_FILE=/app/skill_tracking.json\\nnohup bun run benchmark/shared/skill_tracker.ts > /app/skill_tracker.log 2>&1 &\\nexec bun run mcp/server.ts\\n' > /start-with-tracker.sh && chmod +x /start-with-tracker.sh
`;

// ── GP task instruction and Dockerfile ──────────────────────────

// Top-level prompt for the GP task (used as the `taskDescription` of the
// 'gp-10k-ticks' variant). It tells the orchestrating agent to run 5 sequential
// loops, each in a fresh sub-agent that reads the loop instructions, the SDK
// reference and the shared learnings file, and must update the learnings file
// before it finishes.
// NOTE(review): `\\\`` inside this template literal evaluates to a backslash
// followed by a backtick, so the prompt text contains literal backslashes before
// each backtick — confirm this is what the downstream consumer expects.
const GP_INSTRUCTION = `Run 5 loops sequentially. For each loop, spawn a fresh sub-agent with the following prompt:

"Read these files to understand your task, then do it:
- \\\`/app/gp_loop_instruction.md\\\` — your instructions
- \\\`/app/CLAUDE.md\\\` — SDK and bot API reference
- \\\`/app/learnings.md\\\` — what previous agents learned (empty on loop 1)

Do not finish until you have updated \\\`/app/learnings.md\\\` with what you learned."

Each sub-agent must start with fresh context — no memory of previous loops. Wait for each to complete before starting the next. If one fails, continue to the next loop.
`;

// Builds the Dockerfile text for the GP task environment on top of DOCKER_IMAGE.
// The generated image:
//   1. creates bots/l{loop}a{bot}/bot.env for loops 1-5 × bots 1-5 (25 bots),
//   2. decodes the base64-embedded loop instruction, save generator and verifier
//      into /app,
//   3. creates an empty /app/learnings.md,
//   4. runs generate_gp_saves.ts at build time to create the bot save files.
// Escaping: `\$` keeps `$` away from template-literal interpolation so the shell
// sees it, and a trailing `\\` emits a single backslash (line continuation).
const GP_DOCKERFILE = () => `FROM ${DOCKER_IMAGE}

# Create 25 bot directories (5 bots × 5 loops) with unique credentials
# Bot names: l{loop}a{bot} — e.g. l1a1, l1a2, ..., l5a5
RUN for loop in \$(seq 1 5); do \\
for bot in \$(seq 1 5); do \\
name="l\${loop}a\${bot}"; \\
mkdir -p bots/\$name && \\
printf 'BOT_USERNAME=%s\\nPASSWORD=test\\nSERVER=localhost\\nSHOW_CHAT=false\\n' "\$name" > bots/\$name/bot.env; \\
done; \\
done

# Inject loop instruction, save generator, and verifier (base64-encoded)
RUN mkdir -p /app/benchmark/shared && \\
echo '${gpLoopInstructionB64}' | base64 -d > /app/gp_loop_instruction.md && \\
echo '${generateGpSavesB64}' | base64 -d > /app/benchmark/shared/generate_gp_saves.ts && \\
echo '${checkGpB64}' | base64 -d > /app/benchmark/shared/check_gp.ts

# Create empty learnings file for loop 1
RUN touch /app/learnings.md

# Generate 25 save files with level 50 all skills (5 bots × 5 loops)
RUN cd /app && bun run benchmark/shared/generate_gp_saves.ts
`;

const VARIANTS: VariantTask[] = [
{
slug: 'woodcutting-xp-5m',
Expand Down Expand Up @@ -262,6 +313,22 @@ cd /app && bun run /tests/check_total_level.ts
extraSharedFiles: ['skill_tracker.ts'],
environmentDockerfile: TOTAL_LEVEL_DOCKERFILE(),
},
// ── GP earning task ────────────────────────────────────────────
{
slug: 'gp-10k-ticks',
taskDescription: GP_INSTRUCTION,
agentTimeout: 18000, // 5 hours
verifier: 'check_gp.ts',
testSh: `#!/bin/bash
set -e
mkdir -p /logs/verifier
/ensure-services.sh
cd /app && bun run /tests/check_gp.ts
`,
tags: ['game', 'runescape', 'automation', 'mcp', 'benchmark', 'gp'],
extraSharedFiles: ['generate_gp_saves.ts'],
environmentDockerfile: GP_DOCKERFILE(),
},
];

// ── Template generators ──────────────────────────────────────────
Expand Down Expand Up @@ -298,10 +365,9 @@ args = ["-c", "/start-services.sh && cd /app && bun run mcp/server.ts"]
function generateVariantTaskToml(v: VariantTask): string {
const tagsStr = v.tags.map(t => `"${t}"`).join(', ');

// For total-level tasks, launch the skill tracker as a background process
// with stdout/stderr redirected to a log file (to avoid corrupting MCP stdio)
const hasTracker = v.extraSharedFiles?.includes('skill_tracker.ts');
const mcpCommand = hasTracker
// For tasks with background trackers, use the appropriate startup script
const hasSkillTracker = v.extraSharedFiles?.includes('skill_tracker.ts');
const mcpCommand = hasSkillTracker
? '/start-with-tracker.sh'
: '/start-services.sh && cd /app && bun run mcp/server.ts';

Expand Down
17 changes: 14 additions & 3 deletions benchmark/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ claude-code|anthropic/claude-opus-4-6|opus
claude-code|anthropic/claude-sonnet-4-5|sonnet
claude-code|anthropic/claude-haiku-4-5|haiku
codex|openai/gpt-5.2-codex|codex
codex|openai/gpt-5.3-codex|codex53
gemini-cli|google/gemini-3-pro-preview|gemini
gemini-cli|google/gemini-3-flash-preview|gemini-flash
claude-code|glm-5|glm
codex|accounts/fireworks/models/kimi-k2p5|kimi
"

# ── Lookup helper (bash 3 compatible) ────────────────────────────
Expand Down Expand Up @@ -50,7 +53,7 @@ while [[ $# -gt 0 ]]; do
-h|--help)
echo "Usage: benchmark/run.sh [-t task] [-m model] [-n trials] [-c concurrency]"
echo ""
echo "Models: opus, sonnet, haiku, codex, gemini, glm (default: all six)"
echo "Models: opus, sonnet, haiku, codex, codex53, gemini, gemini-flash, glm, kimi (default: all)"
echo "Task: any task dir name (default: woodcutting-xp-10m)"
exit 0
;;
Expand All @@ -61,7 +64,7 @@ done

# Default to all models if none specified
if [ -z "$SELECTED_MODELS" ]; then
SELECTED_MODELS="sonnet opus haiku codex gemini glm"
SELECTED_MODELS="sonnet opus haiku codex codex53 gemini gemini-flash glm kimi"
fi

# Export API keys from .env so Harbor's agent classes can snapshot them.
Expand All @@ -73,6 +76,7 @@ if [ -f "$SCRIPT_DIR/../.env" ]; then
set +a
fi
GLM_KEY="${GLM_API_KEY:-}"
FIREWORKS_KEY="${FIREWORKS_API_KEY:-}"

# ── Regenerate tasks ──────────────────────────────────────────────
echo "Regenerating benchmark tasks..."
Expand All @@ -86,7 +90,7 @@ PIDS=""
for name in $SELECTED_MODELS; do
entry=$(lookup_model "$name")
if [ -z "$entry" ]; then
echo "Unknown model: $name (available: opus, sonnet, haiku, codex, gemini, glm)"
# Keep this list in sync with the model table and the --help text.
echo "Unknown model: $name (available: opus, sonnet, haiku, codex, codex53, gemini, gemini-flash, glm, kimi)"
exit 1
fi

Expand All @@ -102,6 +106,13 @@ for name in $SELECTED_MODELS; do
fi
ENV_PREFIX="ANTHROPIC_API_KEY=$GLM_KEY ANTHROPIC_BASE_URL=https://api.z.ai/api/anthropic API_TIMEOUT_MS=3000000"
fi
if [ "$name" = "kimi" ]; then
if [ -z "$FIREWORKS_KEY" ]; then
echo " WARNING: FIREWORKS_API_KEY not found in .env, skipping kimi"
continue
fi
ENV_PREFIX="OPENAI_API_KEY=$FIREWORKS_KEY OPENAI_BASE_URL=https://api.fireworks.ai/inference/v1"
fi

JOB_NAME="${TASK}-${label}-${TIMESTAMP}"
LOG_FILE="/tmp/harbor-${JOB_NAME}.log"
Expand Down
149 changes: 149 additions & 0 deletions benchmark/shared/check_gp.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/**
* Verification: report GP results from the 5-loop iterative benchmark.
* Reads per-loop GP results from /app/gp_results.json (written by the agent).
* Also connects to bots from the last completed loop to verify inventory coins.
*
* Bot naming: l{loop}a{1-5} — e.g. l1a1, l1a2, ..., l5a5
*
* Writes the reward (the larger of the best reported loop GP and the verified last-loop inventory total) to reward.txt for Harbor compatibility.
* Writes full per-loop breakdown to reward.json for charting.
*/
import { BotSDK } from '/app/sdk/index';
import { writeFileSync, readFileSync, mkdirSync, existsSync } from 'fs';

// Item id that getCoinsForBot matches against inventory entries when summing coins.
const COINS_ID = 995;

// Location of the per-loop results file (written by the agent, read by main()).
const GP_RESULTS_PATH = '/app/gp_results.json';

/** One loop's outcome as recorded in gp_results.json. */
interface LoopResult {
  /** Loop number; verification derives the bot names l{loop}a1..l{loop}a5 from it. */
  loop: number;
  /** Total GP reported for the loop; loops are ranked by this value. */
  totalGp: number;
  /** Optional per-bot GP map; echoed into reward.json as part of the loop entry. */
  perBot?: { [botName: string]: number };
  /** Optional description of the money-making method; only printed in the summary. */
  method?: string;
  /** Optional GP-per-tick figure; only printed in the summary. */
  gpPerTick?: number;
}

/** Top-level shape of gp_results.json. */
interface GpResults {
  loops: Array<LoopResult>;
}

/**
 * Connect to a single bot in observe mode and count the coins in its inventory.
 *
 * @param botName Bot username to connect as (e.g. `l3a2`).
 * @returns The summed `count` of every inventory entry whose id is COINS_ID, or
 *          0 when connecting, waiting for the in-game state, or reading the
 *          inventory fails. This function never rejects for those failures.
 */
async function getCoinsForBot(botName: string): Promise<number> {
  const sdk = new BotSDK({
    botUsername: botName,
    password: 'test',
    gatewayUrl: 'ws://localhost:7780',
    connectionMode: 'observe',
    autoLaunchBrowser: false,
    autoReconnect: false,
  });

  try {
    await sdk.connect();
    // Wait (up to 15 s) until the bot reports being in game with skills loaded.
    await sdk.waitForCondition(s => s.inGame && s.skills.length > 0, 15000);

    const inv = sdk.getInventory();
    let coins = 0;
    if (inv) {
      // Sum every matching entry rather than stopping at the first one.
      for (const item of inv) {
        if (item.id === COINS_ID) {
          coins += item.count;
        }
      }
    }

    console.log(` ${botName}: ${coins} coins`);
    return coins;
  } catch (err: unknown) {
    // Anything may be thrown; only Error instances are guaranteed a message.
    const reason = err instanceof Error ? err.message : String(err);
    console.log(` ${botName}: not connected (0 GP) — ${reason}`);
    return 0;
  } finally {
    // Always release the connection, on success and on failure.
    sdk.disconnect();
  }
}

/** True when `value` is a plain (non-array) object whose values are all numbers. */
function isGpByBot(value: unknown): value is Record<string, number> {
  return (
    typeof value === 'object' &&
    value !== null &&
    !Array.isArray(value) &&
    Object.values(value).every(v => typeof v === 'number')
  );
}

/**
 * Validate the parsed contents of gp_results.json and keep only usable entries.
 *
 * The file is written by the agent under test, so its shape is not trusted.
 * An entry is kept only when `loop` is an integer and `totalGp` is a finite
 * number; optional fields of the wrong type are dropped from the entry. Any
 * additional fields on a kept entry are passed through unchanged.
 *
 * @param raw Result of `JSON.parse` on the results file.
 * @returns A results object whose `loops` array is safe to rank, print and verify.
 */
function parseGpResults(raw: unknown): GpResults {
  const candidates =
    typeof raw === 'object' && raw !== null ? (raw as { loops?: unknown }).loops : undefined;
  if (!Array.isArray(candidates)) {
    console.error(`${GP_RESULTS_PATH} has no "loops" array; treating it as 0 completed loops`);
    return { loops: [] };
  }

  const loops: LoopResult[] = [];
  candidates.forEach((entry: unknown, index: number) => {
    const fields: Record<string, unknown> =
      typeof entry === 'object' && entry !== null ? (entry as Record<string, unknown>) : {};
    const { loop, totalGp, perBot, method, gpPerTick } = fields;

    if (
      typeof loop !== 'number' ||
      !Number.isInteger(loop) ||
      typeof totalGp !== 'number' ||
      !Number.isFinite(totalGp)
    ) {
      console.error(`Ignoring malformed loop entry at index ${index}:`, entry);
      return;
    }

    loops.push({
      ...fields,
      loop,
      totalGp,
      // Undefined optional fields are omitted again by JSON.stringify.
      perBot: isGpByBot(perBot) ? perBot : undefined,
      method: typeof method === 'string' ? method : undefined,
      gpPerTick: typeof gpPerTick === 'number' && Number.isFinite(gpPerTick) ? gpPerTick : undefined,
    });
  });

  return { loops };
}

/**
 * Entry point of the GP verifier.
 *
 * Reads the per-loop results from GP_RESULTS_PATH, connects to the five bots of
 * the last listed loop to count their coins, then writes the reward to
 * /logs/verifier/reward.txt and a full breakdown to /logs/verifier/reward.json.
 * A missing or unparsable results file is treated as zero completed loops.
 */
async function main(): Promise<void> {
  console.log('Reading GP results from iterative benchmark...');

  // Read per-loop results written by the agent
  let loops: LoopResult[] = [];
  if (existsSync(GP_RESULTS_PATH)) {
    try {
      loops = parseGpResults(JSON.parse(readFileSync(GP_RESULTS_PATH, 'utf-8'))).loops;
      console.log(`Found ${loops.length} loop results`);
    } catch (err) {
      console.error(`Failed to parse ${GP_RESULTS_PATH}:`, err);
    }
  }

  // Try to verify the last loop's bots by connecting to them
  const lastLoop = loops.length > 0 ? loops[loops.length - 1] : null;
  const verifiedGp: Record<string, number> = {};
  let verifiedTotal = 0;

  if (lastLoop) {
    const loopNum = lastLoop.loop;
    console.log(`\nVerifying loop ${loopNum} bots (l${loopNum}a1 - l${loopNum}a5)...`);
    // Bots are checked one at a time, in order.
    for (let i = 1; i <= 5; i++) {
      const botName = `l${loopNum}a${i}`;
      const coins = await getCoinsForBot(botName);
      verifiedGp[botName] = coins;
      verifiedTotal += coins;
    }
    console.log(`Verified inventory total for loop ${loopNum}: ${verifiedTotal}`);
  }

  // Best loop = highest reported totalGp; loops with totalGp <= 0 never qualify.
  const bestLoop = loops.reduce(
    (best, loop) => (loop.totalGp > (best?.totalGp ?? 0) ? loop : best),
    null as LoopResult | null
  );
  const bestGp = bestLoop?.totalGp ?? 0;

  // Reward = the larger of the best reported loop GP and the verified last-loop total.
  const reward = Math.max(bestGp, verifiedTotal);

  console.log(`\nResults:`);
  console.log(` Loops completed: ${loops.length}`);
  if (bestLoop) {
    console.log(` Best loop: #${bestLoop.loop} — ${bestLoop.totalGp} GP (${bestLoop.method ?? 'unknown method'})`);
  }
  console.log(` Reward (best): ${reward} GP`);

  // Print per-loop summary
  if (loops.length > 0) {
    console.log('\n Per-loop breakdown:');
    for (const loop of loops) {
      console.log(` Loop ${loop.loop}: ${loop.totalGp} GP — ${loop.method ?? '?'} (${loop.gpPerTick?.toFixed(2) ?? '?'} GP/tick)`);
    }
  }

  mkdirSync('/logs/verifier', { recursive: true });

  writeFileSync('/logs/verifier/reward.txt', reward.toString());
  writeFileSync('/logs/verifier/reward.json', JSON.stringify({
    reward,
    bestLoop,
    loopsCompleted: loops.length,
    loops,
    verifiedLastLoop: lastLoop ? { loopNum: lastLoop.loop, totalGp: verifiedTotal, perBot: verifiedGp } : null,
  }, null, 2));

  console.log(`\nReward: ${reward}`);
}

// Last-resort handler: if verification itself fails, record a zero reward
// (best effort) and exit with a non-zero status.
main().catch((failure) => {
  console.error('Verification error:', failure);

  try {
    mkdirSync('/logs/verifier', { recursive: true });
    writeFileSync('/logs/verifier/reward.txt', '0');

    // Built after reward.txt is written so a bad `failure` value cannot block it.
    const fallbackReport = JSON.stringify({
      reward: 0,
      loopsCompleted: 0,
      loops: [],
      error: failure.message,
    });
    writeFileSync('/logs/verifier/reward.json', fallbackReport);
  } catch {
    // Writing the fallback files is best effort; the exit code below still signals failure.
  }

  process.exit(1);
});
56 changes: 56 additions & 0 deletions benchmark/shared/generate_gp_saves.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
* Generate save files for the GP benchmark task.
* Creates 25 bot saves (5 bots × 5 loops) with level 50 in all skills,
* starting in Lumbridge with 0 coins.
*
* Naming: l{loop}a{bot} — e.g. l1a1, l1a2, ..., l5a5
* Each loop gets fresh usernames to avoid server caching issues.
*
* Usage: bun run benchmark/shared/generate_gp_saves.ts
*/
import { generateSave } from '../../sdk/test/utils/save-generator';

// Skill-name → level map passed as the `skills` option of every generated save.
// Every listed skill gets the same level; insertion order follows the list below.
const ALL_SKILLS: Record<string, number> = {};
for (const skillName of [
  'ATTACK',
  'STRENGTH',
  'DEFENCE',
  'HITPOINTS',
  'MAGIC',
  'RANGED',
  'PRAYER',
  'WOODCUTTING',
  'FISHING',
  'MINING',
  'COOKING',
  'CRAFTING',
  'SMITHING',
  'FIREMAKING',
  'FLETCHING',
  'THIEVING',
  'RUNECRAFT',
  'HERBLORE',
  'AGILITY',
]) {
  ALL_SKILLS[skillName] = 50;
}

// Number of loops to generate saves for; the loop index is the {loop} part of l{loop}a{bot}.
const LOOPS = 5;
// Number of bot saves generated per loop; the bot index is the {bot} part of l{loop}a{bot}.
const BOTS_PER_LOOP = 5;

/**
 * Generate one save per bot for every loop (LOOPS × BOTS_PER_LOOP saves).
 *
 * Usernames follow l{loop}a{bot}. Saves are generated one at a time, in
 * loop-major order, each with the ALL_SKILLS levels and the Lumbridge position.
 */
async function main() {
  // Enumerate the usernames first, loop-major, so generation order stays the same.
  const usernames: string[] = [];
  for (let loopIdx = 1; loopIdx <= LOOPS; loopIdx += 1) {
    for (let botIdx = 1; botIdx <= BOTS_PER_LOOP; botIdx += 1) {
      usernames.push(`l${loopIdx}a${botIdx}`);
    }
  }

  let created = 0;
  for (const username of usernames) {
    // Lumbridge
    const lumbridge = { x: 3222, z: 3218 };
    await generateSave(username, { skills: ALL_SKILLS, position: lumbridge });
    created += 1;
  }

  console.log(`[generate_gp_saves] Created ${created} saves (${BOTS_PER_LOOP} bots × ${LOOPS} loops)`);
}

// Any failure while generating saves is fatal: log it and exit with a non-zero
// status so the `RUN` step that invokes this script fails.
main().catch((fatal) => {
  console.error('[generate_gp_saves] Fatal:', fatal);
  process.exit(1);
});
Loading