Skip to content

Commit 9d76ff0

Browse files
authored
Merge pull request #6 from ghostwright/feat/activate-evolution-judges
feat: activate LLM judges for self-evolution engine
2 parents 8693453 + 9484ddb commit 9d76ff0

18 files changed

Lines changed: 701 additions & 42 deletions

config/evolution.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,17 @@ reflection:
2828
effort: "high"
2929
max_budget_usd: 0.50
3030

31+
# LLM Judge Configuration
32+
judges:
33+
# "auto" enables when ANTHROPIC_API_KEY is available
34+
# "always" enables unconditionally
35+
# "never" disables unconditionally
36+
enabled: "auto"
37+
# Safety net against runaway costs (daily reset)
38+
cost_cap_usd_per_day: 50.0
39+
# Maximum golden suite entries (prune oldest when exceeded)
40+
max_golden_suite_size: 50
41+
3142
# Directory paths (relative to project root)
3243
paths:
3344
config_dir: "phantom-config"

src/evolution/__tests__/application.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ function testConfig(): EvolutionConfig {
1212
cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 },
1313
gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 },
1414
reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 },
15+
judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 },
1516
paths: {
1617
config_dir: TEST_DIR,
1718
constitution: `${TEST_DIR}/constitution.md`,

src/evolution/__tests__/constitution.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ function testConfig(): EvolutionConfig {
1111
cadence: { reflection_interval: 1, consolidation_interval: 10, full_review_interval: 50, drift_check_interval: 20 },
1212
gates: { drift_threshold: 0.7, max_file_lines: 200, auto_rollback_threshold: 0.1, auto_rollback_window: 5 },
1313
reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 },
14+
judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 },
1415
paths: {
1516
config_dir: TEST_DIR,
1617
constitution: `${TEST_DIR}/constitution.md`,
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
2+
import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
3+
import { EvolutionEngine } from "../engine.ts";
4+
import type { SessionSummary } from "../types.ts";
5+
6+
/** Scratch directory that holds every fixture written by this test file. */
const TEST_DIR = "/tmp/phantom-test-cost-cap";
/** Path of the evolution YAML config generated for each test. */
const CONFIG_PATH = `${TEST_DIR}/config/evolution.yaml`;

/** ANTHROPIC_API_KEY as it was before a test ran, so it can be put back afterwards. */
let savedApiKey: string | undefined;

/**
 * Writes a complete fixture tree under TEST_DIR: the evolution YAML config
 * (judges set to "never", with the given daily cost cap), the phantom-config
 * markdown files, the JSONL logs, and zeroed version/metrics JSON files.
 *
 * @param costCap Value written to `judges.cost_cap_usd_per_day` in the YAML.
 */
function setupTestEnv(costCap: number): void {
	const phantomDir = `${TEST_DIR}/phantom-config`;

	const directories = [
		`${TEST_DIR}/config`,
		`${phantomDir}/meta`,
		`${phantomDir}/strategies`,
		`${phantomDir}/memory`,
	];
	for (const directory of directories) {
		mkdirSync(directory, { recursive: true });
	}

	const yamlLines = [
		"cadence:",
		" reflection_interval: 1",
		" consolidation_interval: 10",
		"gates:",
		" drift_threshold: 0.7",
		" max_file_lines: 200",
		" auto_rollback_threshold: 0.1",
		" auto_rollback_window: 5",
		"judges:",
		' enabled: "never"',
		` cost_cap_usd_per_day: ${costCap}`,
		" max_golden_suite_size: 50",
		"paths:",
		` config_dir: "${phantomDir}"`,
		` constitution: "${phantomDir}/constitution.md"`,
		` version_file: "${phantomDir}/meta/version.json"`,
		` metrics_file: "${phantomDir}/meta/metrics.json"`,
		` evolution_log: "${phantomDir}/meta/evolution-log.jsonl"`,
		` golden_suite: "${phantomDir}/meta/golden-suite.jsonl"`,
		` session_log: "${phantomDir}/memory/session-log.jsonl"`,
	];

	// Version 0 with no parent and no changes: the state before any evolution has happened.
	const initialVersion = JSON.stringify({
		version: 0,
		parent: null,
		timestamp: new Date().toISOString(),
		changes: [],
		metrics_at_change: { session_count: 0, success_rate_7d: 0, correction_rate_7d: 0 },
	});

	// Every counter and rate starts at zero; the "last seen" timestamps start as null.
	const initialMetrics = JSON.stringify({
		session_count: 0,
		success_count: 0,
		failure_count: 0,
		correction_count: 0,
		evolution_count: 0,
		rollback_count: 0,
		last_session_at: null,
		last_evolution_at: null,
		success_rate_7d: 0,
		correction_rate_7d: 0,
		sessions_since_consolidation: 0,
	});

	// [path, content] pairs, written in order.
	const fixtures: Array<[string, string]> = [
		[CONFIG_PATH, yamlLines.join("\n")],
		[`${phantomDir}/constitution.md`, "# Constitution\n1. Be honest.\n"],
		[`${phantomDir}/persona.md`, ""],
		[`${phantomDir}/user-profile.md`, "# User Profile\n"],
		[`${phantomDir}/domain-knowledge.md`, ""],
		[`${phantomDir}/strategies/task-patterns.md`, ""],
		[`${phantomDir}/strategies/tool-preferences.md`, ""],
		[`${phantomDir}/strategies/error-recovery.md`, ""],
		[`${phantomDir}/memory/session-log.jsonl`, ""],
		[`${phantomDir}/meta/version.json`, initialVersion],
		[`${phantomDir}/meta/metrics.json`, initialMetrics],
		[`${phantomDir}/meta/evolution-log.jsonl`, ""],
		[`${phantomDir}/meta/golden-suite.jsonl`, ""],
	];
	for (const [filePath, content] of fixtures) {
		writeFileSync(filePath, content, "utf-8");
	}
}
83+
84+
/**
 * Builds a SessionSummary fixture for a short, successful CLI session.
 *
 * The default user message mentions TypeScript; the heuristic-path test in
 * this file expects that word to end up in user-profile.md.
 *
 * @param overrides Fields that replace the corresponding defaults.
 * @returns A fresh summary object; `session_id` embeds the current time in ms.
 */
function makeSession(overrides: Partial<SessionSummary> = {}): SessionSummary {
	const defaults: SessionSummary = {
		session_id: `session-${Date.now()}`,
		session_key: "cli:main",
		user_id: "user-1",
		user_messages: ["No, use TypeScript not JavaScript"],
		assistant_messages: ["Got it."],
		tools_used: [],
		files_tracked: [],
		outcome: "success",
		cost_usd: 0.05,
		started_at: "2026-03-25T10:00:00Z",
		ended_at: "2026-03-25T10:05:00Z",
	};
	// Later spread wins, so any override replaces its default.
	return { ...defaults, ...overrides };
}
100+
101+
/**
 * Tests for the judges cost-cap configuration: parsing it from YAML, the
 * default when the `judges` section is absent, and the heuristic path taken
 * when judges are disabled.
 */
describe("Cost Cap", () => {
	beforeEach(() => {
		// Remember the caller's API key so afterEach can restore the environment exactly.
		savedApiKey = process.env.ANTHROPIC_API_KEY;
	});

	afterEach(() => {
		if (savedApiKey === undefined) {
			// The variable was not set before the test, so remove it. Assigning
			// `undefined` is not equivalent: Node coerces values assigned to
			// process.env to strings, which would leave the truthy string
			// "undefined" behind for later tests.
			delete process.env.ANTHROPIC_API_KEY;
		} else {
			process.env.ANTHROPIC_API_KEY = savedApiKey;
		}
		rmSync(TEST_DIR, { recursive: true, force: true });
	});

	test("cost cap config is parsed from YAML", () => {
		setupTestEnv(10.0);
		const engine = new EvolutionEngine(CONFIG_PATH);
		const config = engine.getEvolutionConfig();
		expect(config.judges.cost_cap_usd_per_day).toBe(10.0);
	});

	test("cost cap defaults to 50 when not configured", () => {
		// Build the shared fixture tree with a deliberately non-default cap, then
		// overwrite the config with one that has only `paths`. If the overwrite
		// did not take effect the assertion below would see 1, not 50.
		setupTestEnv(1.0);
		writeFileSync(
			CONFIG_PATH,
			[
				"paths:",
				` config_dir: "${TEST_DIR}/phantom-config"`,
				` constitution: "${TEST_DIR}/phantom-config/constitution.md"`,
				` version_file: "${TEST_DIR}/phantom-config/meta/version.json"`,
				` metrics_file: "${TEST_DIR}/phantom-config/meta/metrics.json"`,
				` evolution_log: "${TEST_DIR}/phantom-config/meta/evolution-log.jsonl"`,
				` golden_suite: "${TEST_DIR}/phantom-config/meta/golden-suite.jsonl"`,
				` session_log: "${TEST_DIR}/phantom-config/memory/session-log.jsonl"`,
			].join("\n"),
			"utf-8",
		);

		const engine = new EvolutionEngine(CONFIG_PATH);
		expect(engine.getEvolutionConfig().judges.cost_cap_usd_per_day).toBe(50.0);
	});

	test("engine uses heuristic path when judges are disabled", async () => {
		setupTestEnv(50.0);
		const engine = new EvolutionEngine(CONFIG_PATH);

		// setupTestEnv writes judges.enabled: "never", so LLM judges must be off.
		expect(engine.usesLLMJudges()).toBe(false);

		const result = await engine.afterSession(makeSession());
		// The session must still produce applied changes without LLM judges.
		expect(result.changes_applied.length).toBeGreaterThan(0);

		const userProfile = readFileSync(`${TEST_DIR}/phantom-config/user-profile.md`, "utf-8");
		expect(userProfile).toContain("TypeScript");
	});
});

src/evolution/__tests__/engine.test.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ function setupTestEnvironment(): void {
2727
" auto_rollback_window: 5",
2828
"reflection:",
2929
' model: "claude-sonnet-4-20250514"',
30+
"judges:",
31+
' enabled: "never"',
3032
"paths:",
3133
` config_dir: "${TEST_DIR}/phantom-config"`,
3234
` constitution: "${TEST_DIR}/phantom-config/constitution.md"`,
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
2+
import { mkdirSync, rmSync, writeFileSync } from "node:fs";
3+
import type { EvolutionConfig } from "../config.ts";
4+
import { addCase, loadSuite, pruneSuite } from "../golden-suite.ts";
5+
import type { GoldenCase } from "../types.ts";
6+
7+
/** Scratch directory that holds the golden-suite fixtures for this test file. */
const TEST_DIR = "/tmp/phantom-test-golden-cap";

/**
 * Returns a fresh EvolutionConfig whose file paths all live under TEST_DIR.
 * Judges are set to "auto" with a daily cost cap of 50 and a golden-suite cap of 50.
 */
function testConfig(): EvolutionConfig {
	const under = (relativePath: string): string => `${TEST_DIR}/${relativePath}`;

	const paths: EvolutionConfig["paths"] = {
		config_dir: TEST_DIR,
		constitution: under("constitution.md"),
		version_file: under("meta/version.json"),
		metrics_file: under("meta/metrics.json"),
		evolution_log: under("meta/evolution-log.jsonl"),
		golden_suite: under("meta/golden-suite.jsonl"),
		session_log: under("memory/session-log.jsonl"),
	};

	return {
		cadence: {
			reflection_interval: 1,
			consolidation_interval: 10,
			full_review_interval: 50,
			drift_check_interval: 20,
		},
		gates: {
			drift_threshold: 0.7,
			max_file_lines: 200,
			auto_rollback_threshold: 0.1,
			auto_rollback_window: 5,
		},
		reflection: { model: "claude-sonnet-4-20250514", effort: "high", max_budget_usd: 0.5 },
		judges: { enabled: "auto", cost_cap_usd_per_day: 50.0, max_golden_suite_size: 50 },
		paths,
	};
}
26+
27+
/**
 * Builds a GoldenCase fixture whose id, description, lesson and session id
 * are all derived from `index`.
 *
 * @param index Number embedded in every derived field.
 * @param daysAgo How many calendar days before now `created_at` is set (default 0).
 */
function makeGoldenCase(index: number, daysAgo = 0): GoldenCase {
	// Shift the day-of-month on a Date for "now"; setDate rolls over month boundaries.
	const createdAt = new Date();
	createdAt.setDate(createdAt.getDate() - daysAgo);

	const label = String(index);
	return {
		id: "golden-" + label,
		description: "Correction " + label,
		lesson: "Lesson for correction " + label,
		session_id: "session-" + label,
		created_at: createdAt.toISOString(),
	};
}
38+
39+
/**
 * Tests for pruneSuite: it should leave suites at or under the cap alone and
 * drop the oldest entries when the cap is exceeded.
 */
describe("Golden Suite Cap", () => {
	/** Adds `count` cases with ids 0..count-1; `daysAgoFor` picks each case's age in days. */
	const seedCases = (
		config: EvolutionConfig,
		count: number,
		daysAgoFor: (index: number) => number = () => 0,
	): void => {
		for (let index = 0; index < count; index++) {
			addCase(config, makeGoldenCase(index, daysAgoFor(index)));
		}
	};

	beforeEach(() => {
		// Start every test from an empty suite file.
		mkdirSync(`${TEST_DIR}/meta`, { recursive: true });
		writeFileSync(`${TEST_DIR}/meta/golden-suite.jsonl`, "", "utf-8");
	});

	afterEach(() => {
		rmSync(TEST_DIR, { recursive: true, force: true });
	});

	test("pruneSuite is a no-op when suite is under the cap", () => {
		const config = testConfig();
		seedCases(config, 5);

		expect(pruneSuite(config, 50)).toBe(0);
		expect(loadSuite(config)).toHaveLength(5);
	});

	test("pruneSuite removes oldest entries when suite exceeds cap", () => {
		const config = testConfig();
		// Ten cases whose age grows with the index: case 0 is the newest, case 9 the oldest.
		seedCases(config, 10, (index) => index);
		expect(loadSuite(config)).toHaveLength(10);

		const removedCount = pruneSuite(config, 5);
		expect(removedCount).toBe(5);

		const survivors = loadSuite(config);
		expect(survivors).toHaveLength(5);

		// Only the five newest cases (ids 0-4) may survive.
		survivors.forEach((entry) => {
			const numericId = Number.parseInt(entry.id.replace("golden-", ""), 10);
			expect(numericId).toBeLessThan(5);
		});
	});

	test("pruneSuite with max_golden_suite_size defaults to 50", () => {
		// Checks the cap value carried by the test config.
		const { judges } = testConfig();
		expect(judges.max_golden_suite_size).toBe(50);
	});

	test("pruneSuite handles empty suite", () => {
		const config = testConfig();
		expect(pruneSuite(config, 50)).toBe(0);
	});

	test("pruneSuite handles suite at exactly the cap", () => {
		const config = testConfig();
		seedCases(config, 5);

		expect(pruneSuite(config, 5)).toBe(0);
		expect(loadSuite(config)).toHaveLength(5);
	});

	test("pruneSuite keeps newest entries when exceeding cap by 1", () => {
		const config = testConfig();
		// Insert the old case first so that insertion order alone cannot explain the result.
		addCase(config, makeGoldenCase(0, 10));
		addCase(config, makeGoldenCase(1, 0));

		const removedCount = pruneSuite(config, 1);
		expect(removedCount).toBe(1);

		const survivors = loadSuite(config);
		expect(survivors).toHaveLength(1);
		// The case created today (id golden-1) is the one that must remain.
		const [survivor] = survivors;
		expect(survivor.id).toBe("golden-1");
	});
});

0 commit comments

Comments
 (0)