Skip to content

Commit 593ce56

Browse files
committed
fix(eval): ground LLM judge with command reference to prevent false negatives
The skill eval judge (Haiku 4.5) had no context about the sentry CLI and was hallucinating that valid commands don't exist, confusing it with the legacy sentry-cli. This caused Opus 4.6 to fail 3/8 eval cases (62.5%, below the 75% threshold) on the overall-quality criterion. Extract the Command Reference section from SKILL.md and inject it into the judge prompt so it can verify planned commands against actual CLI capabilities.
1 parent 33b21a9 commit 593ce56

File tree

2 files changed

+37
-6
lines changed

2 files changed

+37
-6
lines changed

script/eval-skill.ts

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,26 @@ const VERSION_RE = /^version:\s*(.+)$/m;
4646
*/
4747
const DEFAULT_THRESHOLD = 0.75;
4848

49+
/**
 * Extract the "## Command Reference" section from SKILL.md for the judge.
 *
 * The section starts at the first line that begins with
 * `## Command Reference` and runs up to (but not including) the newline that
 * precedes the next level-2 heading (`## `), or to the end of the document
 * when no further level-2 heading follows.
 *
 * @param skillContent - Full text of SKILL.md.
 * @returns The section text including its heading line, or an empty string
 *   when the document has no such section.
 */
function extractCommandReference(skillContent: string): string {
  // Anchor to the start of a line: a plain substring search would also hit
  // "### Command Reference" (at offset 1) or an inline mention of the heading
  // in running text, and the slice would then begin in the wrong place.
  const heading = /^## Command Reference/m.exec(skillContent);
  if (heading === null) {
    return "";
  }
  const start = heading.index;
  // `start` points at '#', so searching from it cannot re-match the heading
  // itself; the first hit is the next level-2 heading after the section.
  const end = skillContent.indexOf("\n## ", start);
  return end === -1
    ? skillContent.slice(start)
    : skillContent.slice(start, end);
}
60+
4961
/** Run all eval cases against a single model */
5062
async function evalModel(
5163
client: Awaited<ReturnType<typeof createClient>>,
5264
model: string,
5365
skillContent: string,
5466
testCases: TestCase[]
5567
): Promise<ModelResult> {
68+
const commandReference = extractCommandReference(skillContent);
5669
console.log(`\nEvaluating: ${model}`);
5770
console.log("─".repeat(40));
5871

@@ -67,7 +80,7 @@ async function evalModel(
6780
skillContent,
6881
testCase.prompt
6982
);
70-
const result = await judgePlan(client, testCase, plan);
83+
const result = await judgePlan(client, testCase, plan, commandReference);
7184
results.push(result);
7285

7386
const icon = result.passed ? "✓" : "✗";

test/skill-eval/helpers/judge.ts

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,19 +74,26 @@ function evaluateDeterministic(
7474

7575
/**
7676
* Use the LLM judge to evaluate overall plan quality.
77-
* Returns null if the judge call fails.
77+
* The command reference (extracted from SKILL.md) grounds the judge so it
78+
* doesn't hallucinate that valid `sentry` commands don't exist.
7879
*/
7980
async function evaluateWithLLMJudge(
8081
client: LLMClient,
8182
prompt: string,
82-
plan: AgentPlan
83+
plan: AgentPlan,
84+
commandReference: string
8385
): Promise<CriterionResult> {
8486
const commandList = plan.commands
8587
.map((c, i) => `${i + 1}. \`${c.command}\` — ${c.purpose}`)
8688
.join("\n");
8789

8890
const judgePrompt = `You are evaluating whether an AI agent's CLI command plan is good.
8991
92+
The agent was given a skill guide for the \`sentry\` CLI (not the legacy \`sentry-cli\`).
93+
Here are the valid commands from that guide:
94+
95+
${commandReference}
96+
9097
The user asked: "${prompt}"
9198
9299
The agent's plan:
@@ -96,11 +103,13 @@ ${commandList}
96103
Notes: ${plan.notes}
97104
98105
Evaluate the plan on overall quality. A good plan:
99-
- Uses the right Sentry CLI commands for the task
106+
- Uses commands that exist in the reference above
100107
- Would actually work if executed
101108
- Is efficient (no unnecessary commands)
102109
- Directly addresses what the user asked for
103110
111+
Do NOT penalize commands that appear in the reference above. This is a real CLI tool.
112+
104113
Return ONLY valid JSON:
105114
{"pass": true, "reason": "Brief explanation"}
106115
@@ -146,11 +155,15 @@ or
146155
/**
147156
* Evaluate a test case's plan against all its criteria.
148157
* Runs deterministic checks first, then the LLM judge for overall quality.
158+
*
159+
* @param commandReference - The Command Reference section from SKILL.md,
160+
* injected into the judge prompt so it can verify commands exist.
149161
*/
150162
export async function judgePlan(
151163
client: LLMClient,
152164
testCase: TestCase,
153-
plan: AgentPlan | null
165+
plan: AgentPlan | null,
166+
commandReference: string
154167
): Promise<CaseResult> {
155168
// If the planner failed to produce a plan, fail all criteria
156169
if (!plan) {
@@ -181,7 +194,12 @@ export async function judgePlan(
181194
}
182195

183196
// Run LLM judge for overall quality
184-
const llmVerdict = await evaluateWithLLMJudge(client, testCase.prompt, plan);
197+
const llmVerdict = await evaluateWithLLMJudge(
198+
client,
199+
testCase.prompt,
200+
plan,
201+
commandReference
202+
);
185203
criteria.push(llmVerdict);
186204

187205
// Compute score: fraction of criteria that passed

0 commit comments

Comments
 (0)