perstack-ai · FL4TLiN3 · Feb 15, 2026 · Feb 15, 2026 · Feb 15, 2026
diff --git a/e2e/perstack-cli/reasoning-budget.test.ts b/e2e/perstack-cli/reasoning-budget.test.ts
@@ -1,8 +1,8 @@
 /**
  * Reasoning Budget E2E Tests
  *
- * Tests that different reasoning budget levels produce different reasoning token counts.
- * This validates that the reasoningBudget configuration is correctly passed to providers.
+ * Tests that the reasoning budget is correctly passed to each provider
+ * and that each run produces reasoning tokens or thinking text.
  *
  * TOML: e2e/experts/reasoning-budget.toml
  */
@@ -14,22 +14,11 @@ const REASONING_BUDGET_CONFIG = "./e2e/experts/reasoning-budget.toml"
 // Extended thinking requires longer timeout
 const LLM_TIMEOUT = 180000
 
-type BudgetLevel = "minimal" | "low" | "medium" | "high"
-
-interface ReasoningTestResult {
-  budget: BudgetLevel
-  reasoningTokens: number
-  /** Thinking text from extended thinking (Anthropic/Google) */
-  thinking?: string
-  success: boolean
-}
-
 async function runReasoningTest(
   provider: "anthropic" | "openai" | "google",
-  budget: BudgetLevel,
   model: string,
-): Promise<ReasoningTestResult> {
-  const expertKey = `e2e-reasoning-${provider}-${budget}`
+): Promise<{ reasoningTokens: number; thinking?: string; success: boolean }> {
+  const expertKey = `e2e-reasoning-${provider}-medium`
   const cmdResult = await runCli(
     [
       "run",
@@ -42,7 +31,7 @@ async function runReasoningTest(
       "--model",
       model,
       "--reasoning-budget",
-      budget,
+      "medium",
     ],
     { timeout: LLM_TIMEOUT },
   )
@@ -73,7 +62,6 @@ async function runReasoningTest(
   // Get thinking from completeReasoning event or from checkpoint messages
   let thinking = reasoningEvent?.text
   if (!thinking && completeEvent?.checkpoint?.messages) {
-    // Look for thinkingPart in any message
     for (const message of completeEvent.checkpoint.messages) {
       if (message.contents) {
         for (const content of message.contents) {
@@ -87,169 +75,43 @@ async function runReasoningTest(
     }
   }
 
-  return {
-    budget,
-    reasoningTokens,
-    thinking,
-    success: result.exitCode === 0,
-  }
+  return { reasoningTokens, thinking, success: result.exitCode === 0 }
 }
 
-describe("Reasoning Budget", () => {
-  describe("Anthropic Extended Thinking", () => {
-    // Note: Claude claude-haiku-4-5 supports extended thinking
-    const ANTHROPIC_MODEL = "claude-haiku-4-5"
-
-    it(
-      "should produce reasoning tokens when budget is set",
-      async () => {
-        const result = await runReasoningTest("anthropic", "medium", ANTHROPIC_MODEL)
-
-        expect(result.success).toBe(true)
-        // Extended thinking should produce reasoning tokens or thinking text
-        // Note: AI SDK currently doesn't report reasoningTokens for Anthropic,
-        // but we can verify thinking content is generated
-        const hasThinking = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
-        expect(hasThinking).toBe(true)
-      },
-      LLM_TIMEOUT,
-    )
-
-    it(
-      "should produce more reasoning tokens with higher budget",
-      async () => {
-        // Run minimal and high budget tests
-        const minimalResult = await runReasoningTest("anthropic", "minimal", ANTHROPIC_MODEL)
-        const highResult = await runReasoningTest("anthropic", "high", ANTHROPIC_MODEL)
-
-        expect(minimalResult.success).toBe(true)
-        expect(highResult.success).toBe(true)
-
-        // Both should produce reasoning tokens or thinking text
-        // Note: AI SDK currently doesn't report reasoningTokens for Anthropic
-        const minimalHasThinking =
-          minimalResult.reasoningTokens > 0 || (minimalResult.thinking?.length ?? 0) > 0
-        const highHasThinking =
-          highResult.reasoningTokens > 0 || (highResult.thinking?.length ?? 0) > 0
-        expect(minimalHasThinking).toBe(true)
-        expect(highHasThinking).toBe(true)
-      },
-      LLM_TIMEOUT * 2, // Two API calls
-    )
-
-    it(
-      "should complete successfully with all budget levels",
-      async () => {
-        const budgets: BudgetLevel[] = ["minimal", "low", "medium", "high"]
-
-        for (const budget of budgets) {
-          const result = await runReasoningTest("anthropic", budget, ANTHROPIC_MODEL)
-          expect(result.success).toBe(true)
-        }
-      },
-      LLM_TIMEOUT * 4, // Four API calls
-    )
-  })
-
-  describe("OpenAI Reasoning Effort", () => {
-    // Note: gpt-5-nano supports reasoning effort
-    const OPENAI_MODEL = "gpt-5-nano"
-
-    it(
-      "should produce reasoning tokens when budget is set",
-      async () => {
-        const result = await runReasoningTest("openai", "medium", OPENAI_MODEL)
-
-        expect(result.success).toBe(true)
-        // OpenAI reasoning models may not always surface reasoning token counts,
-        // so verify either tokens or thinking text is present
-        const hasReasoning = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
-        expect(hasReasoning).toBe(true)
-      },
-      LLM_TIMEOUT,
-    )
-
-    it(
-      "should produce more reasoning tokens with higher budget",
-      async () => {
-        // Run minimal and high budget tests
-        const minimalResult = await runReasoningTest("openai", "minimal", OPENAI_MODEL)
-        const highResult = await runReasoningTest("openai", "high", OPENAI_MODEL)
-
-        expect(minimalResult.success).toBe(true)
-        expect(highResult.success).toBe(true)
-
-        // At least one budget level should produce reasoning tokens or thinking text
-        const highHasReasoning =
-          highResult.reasoningTokens > 0 || (highResult.thinking?.length ?? 0) > 0
-        expect(highHasReasoning).toBe(true)
-      },
-      LLM_TIMEOUT * 2, // Two API calls
-    )
-
-    it(
-      "should complete successfully with all budget levels",
-      async () => {
-        const budgets: BudgetLevel[] = ["minimal", "low", "medium", "high"]
-
-        for (const budget of budgets) {
-          const result = await runReasoningTest("openai", budget, OPENAI_MODEL)
-          expect(result.success).toBe(true)
-        }
-      },
-      LLM_TIMEOUT * 4, // Four API calls
-    )
-  })
-
-  describe("Google Flash Thinking", () => {
-    // Note: gemini-2.5-flash supports thinking mode
-    const GOOGLE_MODEL = "gemini-2.5-flash"
-
-    it(
-      "should produce reasoning tokens when budget is set",
-      async () => {
-        const result = await runReasoningTest("google", "medium", GOOGLE_MODEL)
-
-        expect(result.success).toBe(true)
-        // Flash thinking should produce reasoning tokens or thinking text
-        const hasThinking = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
-        expect(hasThinking).toBe(true)
-      },
-      LLM_TIMEOUT,
-    )
-
-    it(
-      "should produce more reasoning tokens with higher budget",
-      async () => {
-        // Run minimal and high budget tests
-        const minimalResult = await runReasoningTest("google", "minimal", GOOGLE_MODEL)
-        const highResult = await runReasoningTest("google", "high", GOOGLE_MODEL)
+describe.concurrent("Reasoning Budget", () => {
+  it(
+    "should produce reasoning tokens with Anthropic extended thinking",
+    async () => {
+      const result = await runReasoningTest("anthropic", "claude-haiku-4-5")
+
+      expect(result.success).toBe(true)
+      const hasThinking = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
+      expect(hasThinking).toBe(true)
+    },
+    LLM_TIMEOUT,
+  )
 
-        expect(minimalResult.success).toBe(true)
-        expect(highResult.success).toBe(true)
+  it(
+    "should produce reasoning tokens with OpenAI reasoning effort",
+    async () => {
+      const result = await runReasoningTest("openai", "gpt-5-nano")
 
-        // Both should produce reasoning tokens or thinking text
-        const minimalHasThinking =
-          minimalResult.reasoningTokens > 0 || (minimalResult.thinking?.length ?? 0) > 0
-        const highHasThinking =
-          highResult.reasoningTokens > 0 || (highResult.thinking?.length ?? 0) > 0
-        expect(minimalHasThinking).toBe(true)
-        expect(highHasThinking).toBe(true)
-      },
-      LLM_TIMEOUT * 2, // Two API calls
-    )
+      expect(result.success).toBe(true)
+      const hasReasoning = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
+      expect(hasReasoning).toBe(true)
+    },
+    LLM_TIMEOUT,
+  )
 
-    it(
-      "should complete successfully with all budget levels",
-      async () => {
-        const budgets: BudgetLevel[] = ["minimal", "low", "medium", "high"]
+  it(
+    "should produce reasoning tokens with Google flash thinking",
+    async () => {
+      const result = await runReasoningTest("google", "gemini-2.5-flash")
 
-        for (const budget of budgets) {
-          const result = await runReasoningTest("google", budget, GOOGLE_MODEL)
-          expect(result.success).toBe(true)
-        }
-      },
-      LLM_TIMEOUT * 4, // Four API calls
-    )
-  })
+      expect(result.success).toBe(true)
+      const hasThinking = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
+      expect(hasThinking).toBe(true)
+    },
+    LLM_TIMEOUT,
+  )
 })