From 7462fab7b14890a8af8f0d11dd43e881cdef06b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Beteg=C3=B3n?= <miguelbetegongarcia@gmail.com>
Date: Tue, 7 Apr 2026 17:29:50 +0200
Subject: [PATCH] refactor(eval): replace OpenAI with Anthropic SDK in
 init-eval judge

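Standardizes on a single LLM provider across all evals. The skill-eval
already used Anthropic; now init-eval does too, removing the openai
dependency entirely.

The init-eval judge now reads ANTHROPIC_API_KEY instead of
OPENAI_API_KEY (it is still skipped when the key is unset) and calls
claude-sonnet-4-6 in place of gpt-4o.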

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 bun.lock                        |  3 ---
 package.json                    |  1 -
 test/init-eval/helpers/judge.ts | 19 ++++++++++---------
 3 files changed, 10 insertions(+), 13 deletions(-)
diff --git a/bun.lock b/bun.lock
index 12f4b2751..0dd76ba4f 100644
--- a/bun.lock
+++ b/bun.lock
@@ -28,7 +28,6 @@
         "http-cache-semantics": "^4.2.0",
         "ignore": "^7.0.5",
         "marked": "^15",
-        "openai": "^6.22.0",
         "p-limit": "^7.2.0",
         "picomatch": "^4.0.3",
         "pretty-ms": "^9.3.0",
@@ -502,8 +501,6 @@
 
     "once": ["once@1.4.0", "", { "dependencies": { "wrappy": "1" } }, "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w=="],
 
-    "openai": ["openai@6.25.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-mEh6VZ2ds2AGGokWARo18aPISI1OhlgdEIC1ewhkZr8pSIT31dec0ecr9Nhxx0JlybyOgoAT1sWeKtwPZzJyww=="],
-
     "openapi-types": ["openapi-types@12.1.3", "", {}, "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw=="],
 
     "p-limit": ["p-limit@7.2.0", "", { "dependencies": { "yocto-queue": "^1.2.1" } }, "sha512-ATHLtwoTNDloHRFFxFJdHnG6n2WUeFjaR8XQMFdKIv0xkXjrER8/iG9iu265jOM95zXHAfv9oTkqhrfbIzosrQ=="],
diff --git a/package.json b/package.json
index a18d73c4a..12faa5ae4 100644
--- a/package.json
+++ b/package.json
@@ -30,7 +30,6 @@
     "http-cache-semantics": "^4.2.0",
     "ignore": "^7.0.5",
     "marked": "^15",
-    "openai": "^6.22.0",
     "p-limit": "^7.2.0",
     "picomatch": "^4.0.3",
     "pretty-ms": "^9.3.0",
diff --git a/test/init-eval/helpers/judge.ts b/test/init-eval/helpers/judge.ts
index 4b8ee863d..103839021 100644
--- a/test/init-eval/helpers/judge.ts
+++ b/test/init-eval/helpers/judge.ts
@@ -17,7 +17,7 @@ export type JudgeVerdict = {
 
 /**
  * Use an LLM judge to evaluate whether a **single feature** was correctly set
- * up by the wizard. Returns null if OPENAI_API_KEY is not set.
+ * up by the wizard. Returns null if ANTHROPIC_API_KEY is not set.
  *
  * `docsContent` is the pre-fetched plain-text documentation to include as
  * ground truth in the prompt.
@@ -28,16 +28,16 @@ export async function judgeFeature(
   feature: FeatureDoc,
   docsContent: string
 ): Promise<JudgeVerdict | null> {
-  const apiKey = process.env.OPENAI_API_KEY;
+  const apiKey = process.env.ANTHROPIC_API_KEY;
   if (!apiKey) {
     console.log(
-      `  [judge:${feature.feature}] Skipping LLM judge (no OPENAI_API_KEY set)`
+      `  [judge:${feature.feature}] Skipping LLM judge (no ANTHROPIC_API_KEY set)`
     );
     return null;
   }
 
   // Restore real fetch — test preload mocks it to catch accidental network
-  // calls, but we need real HTTP for the OpenAI API.
+  // calls, but we need real HTTP for the Anthropic API.
   const realFetch = (globalThis as { __originalFetch?: typeof fetch })
     .__originalFetch;
   if (realFetch) {
@@ -45,8 +45,8 @@ export async function judgeFeature(
   }
 
   // Dynamic import so we don't fail when the package isn't installed
-  const { default: OpenAI } = await import("openai");
-  const client = new OpenAI({ apiKey });
+  const { default: Anthropic } = await import("@anthropic-ai/sdk");
+  const client = new Anthropic({ apiKey });
 
   const newFilesSection = Object.entries(result.newFiles)
     .map(([path, content]) => `### ${path}\n\`\`\`\n${content}\n\`\`\``)
@@ -86,13 +86,14 @@ Return ONLY valid JSON with this structure:
   "summary": "Brief overall assessment of ${feature.feature} setup"
 }`;
 
-  const response = await client.chat.completions.create({
-    model: "gpt-4o",
+  const response = await client.messages.create({
+    model: "claude-sonnet-4-6",
     max_tokens: 1024,
     messages: [{ role: "user", content: prompt }],
   });
 
-  const text = response.choices[0]?.message?.content ?? "";
+  const textBlock = response.content.find((b) => b.type === "text");
+  const text = textBlock?.text ?? "";
 
   // Extract JSON from response (handle markdown code blocks)
   const jsonMatch = text.match(/\{[\s\S]*\}/);