refactor(eval): replace OpenAI with Anthropic SDK in init-eval judge (#683)

betegon · claude · web-flow · commit 79e72c10d403 · 2026-04-07T17:40:32.000+02:00
## Summary

Standardizes all evals on the Anthropic SDK. The skill-eval already used `@anthropic-ai/sdk`; this switches the init-eval judge from OpenAI (`gpt-4o`) to Anthropic (`claude-sonnet-4-6`) and drops the `openai` dependency.

## Changes

- `test/init-eval/helpers/judge.ts`: swap OpenAI client/API for Anthropic Messages API
- `package.json`: remove `openai` from devDependencies
- `OPENAI_API_KEY` → `ANTHROPIC_API_KEY` env var (already required by skill-eval)

## Test plan

- [x] `bun eval:skill` passes (sonnet 100%, opus 87.5%)
- [x] `bun test:init-eval` — judge calls succeed with Anthropic (wizard auth is a separate issue)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/bun.lock b/bun.lock
diff --git a/package.json b/package.json
@@ -30,7 +30,6 @@
     "http-cache-semantics": "^4.2.0",
     "ignore": "^7.0.5",
     "marked": "^15",
-    "openai": "^6.22.0",
     "p-limit": "^7.2.0",
     "picomatch": "^4.0.3",
     "pretty-ms": "^9.3.0",
diff --git a/test/init-eval/helpers/judge.ts b/test/init-eval/helpers/judge.ts
@@ -17,7 +17,7 @@ export type JudgeVerdict = {
 
 /**
  * Use an LLM judge to evaluate whether a **single feature** was correctly set
- * up by the wizard. Returns null if OPENAI_API_KEY is not set.
+ * up by the wizard. Returns null if ANTHROPIC_API_KEY is not set.
  *
  * `docsContent` is the pre-fetched plain-text documentation to include as
  * ground truth in the prompt.
@@ -28,25 +28,25 @@ export async function judgeFeature(
   feature: FeatureDoc,
   docsContent: string
 ): Promise<JudgeVerdict | null> {
-  const apiKey = process.env.OPENAI_API_KEY;
+  const apiKey = process.env.ANTHROPIC_API_KEY;
   if (!apiKey) {
     console.log(
-      `  [judge:${feature.feature}] Skipping LLM judge (no OPENAI_API_KEY set)`
+      `  [judge:${feature.feature}] Skipping LLM judge (no ANTHROPIC_API_KEY set)`
     );
     return null;
   }
 
   // Restore real fetch — test preload mocks it to catch accidental network
-  // calls, but we need real HTTP for the OpenAI API.
+  // calls, but we need real HTTP for the Anthropic API.
   const realFetch = (globalThis as { __originalFetch?: typeof fetch })
     .__originalFetch;
   if (realFetch) {
     globalThis.fetch = realFetch;
   }
 
   // Dynamic import so we don't fail when the package isn't installed
-  const { default: OpenAI } = await import("openai");
-  const client = new OpenAI({ apiKey });
+  const { default: Anthropic } = await import("@anthropic-ai/sdk");
+  const client = new Anthropic({ apiKey });
 
   const newFilesSection = Object.entries(result.newFiles)
     .map(([path, content]) => `### ${path}\n\`\`\`\n${content}\n\`\`\``)
@@ -86,13 +86,14 @@ Return ONLY valid JSON with this structure:
   "summary": "Brief overall assessment of ${feature.feature} setup"
 }`;
 
-  const response = await client.chat.completions.create({
-    model: "gpt-4o",
+  const response = await client.messages.create({
+    model: "claude-sonnet-4-6",
     max_tokens: 1024,
     messages: [{ role: "user", content: prompt }],
   });
 
-  const text = response.choices[0]?.message?.content ?? "";
+  const textBlock = response.content.find((b) => b.type === "text");
+  const text = textBlock?.text ?? "";
 
   // Extract JSON from response (handle markdown code blocks)
   const jsonMatch = text.match(/\{[\s\S]*\}/);