fix(eval): ground LLM judge with command reference to prevent false negatives

BYK · BYK · commit 593ce56906e1 · 2026-04-10T10:17:46.000Z
The skill eval judge (Haiku 4.5) had no context about the sentry CLI and
was hallucinating that valid commands don't exist, confusing the CLI with
the legacy sentry-cli. This caused Opus 4.6 to fail 3 of 8 eval cases (a
62.5% pass rate, below the 75% threshold) on the overall-quality criterion.

Extract the Command Reference section from SKILL.md and inject it into the
judge prompt so it can verify planned commands against actual CLI capabilities.
diff --git a/script/eval-skill.ts b/script/eval-skill.ts
@@ -46,13 +46,26 @@ const VERSION_RE = /^version:\s*(.+)$/m;
  */
 const DEFAULT_THRESHOLD = 0.75;
 
+/** Extract the "## Command Reference" section from SKILL.md (heading line included) for the judge prompt; returns "" when the heading text is not found. */
+function extractCommandReference(skillContent: string): string {
+  const start = skillContent.indexOf("## Command Reference"); // first occurrence anywhere in the text; the match is not anchored to the start of a line
+  if (start === -1) {
+    return "";
+  }
+  const end = skillContent.indexOf("\n## ", start + 1); // the section stops just before the next line that begins with "## "
+  return end === -1
+    ? skillContent.slice(start) // no later "## " heading: the section runs to the end of the content
+    : skillContent.slice(start, end);
+}
+
 /** Run all eval cases against a single model */
 async function evalModel(
   client: Awaited<ReturnType<typeof createClient>>,
   model: string,
   skillContent: string,
   testCases: TestCase[]
 ): Promise<ModelResult> {
+  const commandReference = extractCommandReference(skillContent);
   console.log(`\nEvaluating: ${model}`);
   console.log("─".repeat(40));
 
@@ -67,7 +80,7 @@ async function evalModel(
       skillContent,
       testCase.prompt
     );
-    const result = await judgePlan(client, testCase, plan);
+    const result = await judgePlan(client, testCase, plan, commandReference);
     results.push(result);
 
     const icon = result.passed ? "✓" : "✗";
diff --git a/test/skill-eval/helpers/judge.ts b/test/skill-eval/helpers/judge.ts
@@ -74,19 +74,26 @@ function evaluateDeterministic(
 
 /**
  * Use the LLM judge to evaluate overall plan quality.
- * Returns null if the judge call fails.
+ * The command reference (extracted from SKILL.md) grounds the judge so it
+ * doesn't hallucinate that valid `sentry` commands don't exist.
  */
 async function evaluateWithLLMJudge(
   client: LLMClient,
   prompt: string,
-  plan: AgentPlan
+  plan: AgentPlan,
+  commandReference: string
 ): Promise<CriterionResult> {
   const commandList = plan.commands
     .map((c, i) => `${i + 1}. \`${c.command}\` — ${c.purpose}`)
     .join("\n");
 
   const judgePrompt = `You are evaluating whether an AI agent's CLI command plan is good.
 
+The agent was given a skill guide for the \`sentry\` CLI (not the legacy \`sentry-cli\`).
+Here are the valid commands from that guide:
+
+${commandReference}
+
 The user asked: "${prompt}"
 
 The agent's plan:
@@ -96,11 +103,13 @@ ${commandList}
 Notes: ${plan.notes}
 
 Evaluate the plan on overall quality. A good plan:
-- Uses the right Sentry CLI commands for the task
+- Uses commands that exist in the reference above
 - Would actually work if executed
 - Is efficient (no unnecessary commands)
 - Directly addresses what the user asked for
 
+Do NOT penalize commands that appear in the reference above. This is a real CLI tool.
+
 Return ONLY valid JSON:
 {"pass": true, "reason": "Brief explanation"}
 
@@ -146,11 +155,15 @@ or
 /**
  * Evaluate a test case's plan against all its criteria.
  * Runs deterministic checks first, then the LLM judge for overall quality.
+ *
+ * @param commandReference - The Command Reference section from SKILL.md,
+ *   injected into the judge prompt so it can verify commands exist.
  */
 export async function judgePlan(
   client: LLMClient,
   testCase: TestCase,
-  plan: AgentPlan | null
+  plan: AgentPlan | null,
+  commandReference: string
 ): Promise<CaseResult> {
   // If the planner failed to produce a plan, fail all criteria
   if (!plan) {
@@ -181,7 +194,12 @@ export async function judgePlan(
   }
 
   // Run LLM judge for overall quality
-  const llmVerdict = await evaluateWithLLMJudge(client, testCase.prompt, plan);
+  const llmVerdict = await evaluateWithLLMJudge(
+    client,
+    testCase.prompt,
+    plan,
+    commandReference
+  );
   criteria.push(llmVerdict);
 
   // Compute score: fraction of criteria that passed