Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 38 additions & 176 deletions e2e/perstack-cli/reasoning-budget.test.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/**
* Reasoning Budget E2E Tests
*
* Tests that different reasoning budget levels produce different reasoning token counts.
* This validates that the reasoningBudget configuration is correctly passed to providers.
* Tests that reasoning budget is correctly passed to each provider
* and produces reasoning tokens or thinking text.
*
* TOML: e2e/experts/reasoning-budget.toml
*/
Expand All @@ -14,22 +14,11 @@ const REASONING_BUDGET_CONFIG = "./e2e/experts/reasoning-budget.toml"
// Extended thinking requires longer timeout
const LLM_TIMEOUT = 180000

type BudgetLevel = "minimal" | "low" | "medium" | "high"

interface ReasoningTestResult {
budget: BudgetLevel
reasoningTokens: number
/** Thinking text from extended thinking (Anthropic/Google) */
thinking?: string
success: boolean
}

async function runReasoningTest(
provider: "anthropic" | "openai" | "google",
budget: BudgetLevel,
model: string,
): Promise<ReasoningTestResult> {
const expertKey = `e2e-reasoning-${provider}-${budget}`
): Promise<{ reasoningTokens: number; thinking?: string; success: boolean }> {
const expertKey = `e2e-reasoning-${provider}-medium`
const cmdResult = await runCli(
[
"run",
Expand All @@ -42,7 +31,7 @@ async function runReasoningTest(
"--model",
model,
"--reasoning-budget",
budget,
"medium",
],
{ timeout: LLM_TIMEOUT },
)
Expand Down Expand Up @@ -73,7 +62,6 @@ async function runReasoningTest(
// Get thinking from completeReasoning event or from checkpoint messages
let thinking = reasoningEvent?.text
if (!thinking && completeEvent?.checkpoint?.messages) {
// Look for thinkingPart in any message
for (const message of completeEvent.checkpoint.messages) {
if (message.contents) {
for (const content of message.contents) {
Expand All @@ -87,169 +75,43 @@ async function runReasoningTest(
}
}

return {
budget,
reasoningTokens,
thinking,
success: result.exitCode === 0,
}
return { reasoningTokens, thinking, success: result.exitCode === 0 }
}

describe("Reasoning Budget", () => {
describe("Anthropic Extended Thinking", () => {
// Note: Claude claude-haiku-4-5 supports extended thinking
const ANTHROPIC_MODEL = "claude-haiku-4-5"

it(
"should produce reasoning tokens when budget is set",
async () => {
const result = await runReasoningTest("anthropic", "medium", ANTHROPIC_MODEL)

expect(result.success).toBe(true)
// Extended thinking should produce reasoning tokens or thinking text
// Note: AI SDK currently doesn't report reasoningTokens for Anthropic,
// but we can verify thinking content is generated
const hasThinking = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
expect(hasThinking).toBe(true)
},
LLM_TIMEOUT,
)

it(
"should produce more reasoning tokens with higher budget",
async () => {
// Run minimal and high budget tests
const minimalResult = await runReasoningTest("anthropic", "minimal", ANTHROPIC_MODEL)
const highResult = await runReasoningTest("anthropic", "high", ANTHROPIC_MODEL)

expect(minimalResult.success).toBe(true)
expect(highResult.success).toBe(true)

// Both should produce reasoning tokens or thinking text
// Note: AI SDK currently doesn't report reasoningTokens for Anthropic
const minimalHasThinking =
minimalResult.reasoningTokens > 0 || (minimalResult.thinking?.length ?? 0) > 0
const highHasThinking =
highResult.reasoningTokens > 0 || (highResult.thinking?.length ?? 0) > 0
expect(minimalHasThinking).toBe(true)
expect(highHasThinking).toBe(true)
},
LLM_TIMEOUT * 2, // Two API calls
)

it(
"should complete successfully with all budget levels",
async () => {
const budgets: BudgetLevel[] = ["minimal", "low", "medium", "high"]

for (const budget of budgets) {
const result = await runReasoningTest("anthropic", budget, ANTHROPIC_MODEL)
expect(result.success).toBe(true)
}
},
LLM_TIMEOUT * 4, // Four API calls
)
})

describe("OpenAI Reasoning Effort", () => {
// Note: gpt-5-nano supports reasoning effort
const OPENAI_MODEL = "gpt-5-nano"

it(
"should produce reasoning tokens when budget is set",
async () => {
const result = await runReasoningTest("openai", "medium", OPENAI_MODEL)

expect(result.success).toBe(true)
// OpenAI reasoning models may not always surface reasoning token counts,
// so verify either tokens or thinking text is present
const hasReasoning = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
expect(hasReasoning).toBe(true)
},
LLM_TIMEOUT,
)

it(
"should produce more reasoning tokens with higher budget",
async () => {
// Run minimal and high budget tests
const minimalResult = await runReasoningTest("openai", "minimal", OPENAI_MODEL)
const highResult = await runReasoningTest("openai", "high", OPENAI_MODEL)

expect(minimalResult.success).toBe(true)
expect(highResult.success).toBe(true)

// At least one budget level should produce reasoning tokens or thinking text
const highHasReasoning =
highResult.reasoningTokens > 0 || (highResult.thinking?.length ?? 0) > 0
expect(highHasReasoning).toBe(true)
},
LLM_TIMEOUT * 2, // Two API calls
)

it(
"should complete successfully with all budget levels",
async () => {
const budgets: BudgetLevel[] = ["minimal", "low", "medium", "high"]

for (const budget of budgets) {
const result = await runReasoningTest("openai", budget, OPENAI_MODEL)
expect(result.success).toBe(true)
}
},
LLM_TIMEOUT * 4, // Four API calls
)
})

describe("Google Flash Thinking", () => {
// Note: gemini-2.5-flash supports thinking mode
const GOOGLE_MODEL = "gemini-2.5-flash"

it(
"should produce reasoning tokens when budget is set",
async () => {
const result = await runReasoningTest("google", "medium", GOOGLE_MODEL)

expect(result.success).toBe(true)
// Flash thinking should produce reasoning tokens or thinking text
const hasThinking = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
expect(hasThinking).toBe(true)
},
LLM_TIMEOUT,
)

it(
"should produce more reasoning tokens with higher budget",
async () => {
// Run minimal and high budget tests
const minimalResult = await runReasoningTest("google", "minimal", GOOGLE_MODEL)
const highResult = await runReasoningTest("google", "high", GOOGLE_MODEL)
describe.concurrent("Reasoning Budget", () => {
it(
"should produce reasoning tokens with Anthropic extended thinking",
async () => {
const result = await runReasoningTest("anthropic", "claude-haiku-4-5")

expect(result.success).toBe(true)
const hasThinking = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
expect(hasThinking).toBe(true)
},
LLM_TIMEOUT,
)

expect(minimalResult.success).toBe(true)
expect(highResult.success).toBe(true)
it(
"should produce reasoning tokens with OpenAI reasoning effort",
async () => {
const result = await runReasoningTest("openai", "gpt-5-nano")

// Both should produce reasoning tokens or thinking text
const minimalHasThinking =
minimalResult.reasoningTokens > 0 || (minimalResult.thinking?.length ?? 0) > 0
const highHasThinking =
highResult.reasoningTokens > 0 || (highResult.thinking?.length ?? 0) > 0
expect(minimalHasThinking).toBe(true)
expect(highHasThinking).toBe(true)
},
LLM_TIMEOUT * 2, // Two API calls
)
expect(result.success).toBe(true)
const hasReasoning = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
expect(hasReasoning).toBe(true)
},
LLM_TIMEOUT,
)

it(
"should complete successfully with all budget levels",
async () => {
const budgets: BudgetLevel[] = ["minimal", "low", "medium", "high"]
it(
"should produce reasoning tokens with Google flash thinking",
async () => {
const result = await runReasoningTest("google", "gemini-2.5-flash")

for (const budget of budgets) {
const result = await runReasoningTest("google", budget, GOOGLE_MODEL)
expect(result.success).toBe(true)
}
},
LLM_TIMEOUT * 4, // Four API calls
)
})
expect(result.success).toBe(true)
const hasThinking = result.reasoningTokens > 0 || (result.thinking?.length ?? 0) > 0
expect(hasThinking).toBe(true)
},
LLM_TIMEOUT,
)
})