Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 57 additions & 86 deletions e2e/create-expert/create-expert.test.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,3 @@
/**
* Create Expert E2E Tests
*
* Tests the create-expert agent that creates/modifies perstack.toml files:
* - Creates new expert definitions via planner + definition-writer + expert-tester delegates
* - Investigates MCP registry skills via skill-finder delegate when external integrations needed
* - Adds discovered MCP skills to generated perstack.toml expert definitions
* - Tests experts via addDelegateFromConfig after writing perstack.toml
* - Preserves existing experts when modifying perstack.toml
*
* Binary: apps/create-expert/dist/bin/cli.js (--headless mode)
*/

import { describe, expect, it } from "bun:test"
import { spawn } from "node:child_process"
import fs from "node:fs"
Expand All @@ -23,10 +10,6 @@ import { type CommandResult, type RunResult, withEventParsing } from "../lib/run

const PROJECT_ROOT = path.resolve(process.cwd())
const CLI_PATH = path.join(PROJECT_ROOT, "apps/create-expert/dist/bin/cli.js")
// LLM API calls require extended timeout; delegation adds extra LLM round-trips.
// The create-expert workflow involves multiple delegation round-trips (planner →
// skill-finder → definition-writer → expert-tester, with possible retries) which
// can exceed 10 minutes in CI environments.
const LLM_TIMEOUT = 900_000

function runCreateExpert(query: string, cwd: string, timeout = LLM_TIMEOUT): Promise<RunResult> {
Expand Down Expand Up @@ -64,14 +47,12 @@ function createTempDir(): string {
return fs.mkdtempSync(path.join(os.tmpdir(), "create-expert-"))
}

/**
 * Collects the name of every tool invoked during a run.
 *
 * Visits each `callTools` event found in `result.events`, in order, and gathers
 * the `toolName` of every tool call extracted from that event. Names are
 * returned in encounter order and are not de-duplicated.
 *
 * @param result - Parsed run result whose `events` list is scanned.
 * @returns Flat list of tool names, one entry per tool call.
 */
function getAllCalledToolNames(result: RunResult): string[] {
  const calledNames: string[] = []
  for (const callEvent of filterEventsByType(result.events, "callTools")) {
    for (const toolCall of extractToolCalls(callEvent)) {
      calledNames.push(toolCall.toolName)
    }
  }
  return calledNames
}

/** Build a diagnostic string from RunResult for assertion failure messages */
function diagnostics(result: RunResult): string {
const errorEvents = result.events
.filter((e) => e.type === "stopRunByError")
Expand All @@ -89,68 +70,64 @@ describe("create-expert", () => {
it(
"should create a new perstack.toml with MCP skill integration",
async () => {
const tempDir = createTempDir()

// Request an expert that requires external API integration to trigger skill-finder
const result = await runCreateExpert(
"Create a GitHub repository analyzer expert that reads GitHub issues and pull requests via the GitHub API to generate project status reports",
tempDir,
)

expect(result.exitCode, diagnostics(result)).toBe(0)

// Verify control flow: coordinator starts, delegates, then completes
const controlFlow = result.events
.filter((e) => ["startRun", "stopRunByDelegate", "completeRun"].includes(e.type))
.map((e) => e.type)
expect(controlFlow[0]).toBe("startRun")
expect(controlFlow).toContain("stopRunByDelegate")
expect(controlFlow.at(-1)).toBe("completeRun")

// Verify the coordinator (create-expert) starts and completes
const startEvents = filterEventsByType(result.events, "startRun")
const completeEvents = filterEventsByType(result.events, "completeRun")
expect(
startEvents.some((e) => (e as { expertKey: string }).expertKey === "create-expert"),
).toBe(true)
expect(
completeEvents.some((e) => (e as { expertKey: string }).expertKey === "create-expert"),
).toBe(true)

// Verify delegation: at least 3 completeRun (planner + skill-finder + definition-writer/tester + coordinator)
expect(completeEvents.length).toBeGreaterThanOrEqual(3)
let result: RunResult | undefined
let tomlContent = ""
let nonBaseSkillMatches: RegExpMatchArray | null = null

for (let attempt = 0; attempt < 3; attempt++) {
const tempDir = createTempDir()

result = await runCreateExpert(
"Create a GitHub repository analyzer expert that reads GitHub issues and pull requests via the GitHub API to generate project status reports",
tempDir,
)

expect(result.exitCode, diagnostics(result)).toBe(0)

const controlFlow = result.events
.filter((e) => ["startRun", "stopRunByDelegate", "completeRun"].includes(e.type))
.map((e) => e.type)
expect(controlFlow[0]).toBe("startRun")
expect(controlFlow).toContain("stopRunByDelegate")
expect(controlFlow.at(-1)).toBe("completeRun")

const startEvents = filterEventsByType(result.events, "startRun")
const completeEvents = filterEventsByType(result.events, "completeRun")
expect(
startEvents.some((e) => (e as { expertKey: string }).expertKey === "create-expert"),
).toBe(true)
expect(
completeEvents.some((e) => (e as { expertKey: string }).expertKey === "create-expert"),
).toBe(true)

expect(completeEvents.length).toBeGreaterThanOrEqual(3)

const toolNames = getAllCalledToolNames(result)
expect(toolNames).toContain("writeTextFile")
expect(toolNames).toContain("addDelegateFromConfig")

expect(toolNames, "searchMcpRegistry should be called by skill-finder").toContain(
"searchMcpRegistry",
)

const skillReportPath = path.join(tempDir, "skill-report.md")
expect(fs.existsSync(skillReportPath)).toBe(true)

const tomlPath = path.join(tempDir, "perstack.toml")
expect(fs.existsSync(tomlPath)).toBe(true)
tomlContent = fs.readFileSync(tomlPath, "utf-8")
const expertMatches = tomlContent.match(/\[experts\."[^"]+"\]/g)
expect(expertMatches).not.toBeNull()
expect(expertMatches!.length).toBeGreaterThanOrEqual(1)

nonBaseSkillMatches = tomlContent.match(
/\[experts\."[^"]+".skills\."(?!@perstack\/base")[^"]+"\]/g,
)
if (nonBaseSkillMatches && nonBaseSkillMatches.length > 0) {
break
}
}

// Verify definition-writer writes TOML and expert-tester tests via addDelegateFromConfig
const toolNames = getAllCalledToolNames(result)
expect(toolNames).toContain("writeTextFile")
expect(toolNames).toContain("addDelegateFromConfig")

// Verify skill investigation: skill-finder searched the MCP registry
expect(toolNames, "searchMcpRegistry should be called by skill-finder").toContain(
"searchMcpRegistry",
)

// Verify skill-report.md was created (skill-finder output)
const skillReportPath = path.join(tempDir, "skill-report.md")
expect(
fs.existsSync(skillReportPath),
"skill-report.md should be created by skill-finder",
).toBe(true)

// Verify perstack.toml was created with at least one expert definition
const tomlPath = path.join(tempDir, "perstack.toml")
expect(fs.existsSync(tomlPath)).toBe(true)
const tomlContent = fs.readFileSync(tomlPath, "utf-8")
const expertMatches = tomlContent.match(/\[experts\."[^"]+"\]/g)
expect(expertMatches).not.toBeNull()
expect(expertMatches!.length).toBeGreaterThanOrEqual(1)

// Verify skill addition: at least one expert has a non-base skill (MCP integration).
// This depends on the LLM correctly forwarding skill-report.md through the delegation
// chain (coordinator → definition-writer), which can be non-deterministic with smaller models.
const nonBaseSkillMatches = tomlContent.match(
/\[experts\."[^"]+".skills\."(?!@perstack\/base")[^"]+"\]/g,
)
expect(
nonBaseSkillMatches && nonBaseSkillMatches.length > 0,
"at least one expert should have a non-base MCP skill",
Expand All @@ -164,7 +141,6 @@ describe("create-expert", () => {
async () => {
const tempDir = createTempDir()

// Create an existing perstack.toml with one expert
const existingToml = `model = "claude-sonnet-4-5"

[provider]
Expand All @@ -187,23 +163,18 @@ pick = ["attemptCompletion"]

expect(result.exitCode, diagnostics(result)).toBe(0)

// Verify control flow: start → delegate → complete
expect(
assertEventSequenceContains(result.events, ["startRun", "stopRunByDelegate", "completeRun"])
.passed,
).toBe(true)

// Verify definition-writer writes TOML and expert-tester tests via addDelegateFromConfig
const toolNames = getAllCalledToolNames(result)
expect(toolNames).toContain("writeTextFile")
expect(toolNames).toContain("addDelegateFromConfig")

// Verify perstack.toml was updated with existing + new experts
const tomlPath = path.join(tempDir, "perstack.toml")
const tomlContent = fs.readFileSync(tomlPath, "utf-8")
// Original expert should be preserved
expect(tomlContent).toContain('[experts."existing-expert"]')
// New expert should be added (at least 2 expert sections)
const expertMatches = tomlContent.match(/\[experts\."[^"]+"\]/g)
expect(expertMatches).not.toBeNull()
expect(expertMatches!.length).toBeGreaterThanOrEqual(2)
Expand Down
9 changes: 2 additions & 7 deletions e2e/fixtures/minimal-mcp-server.mjs
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
#!/usr/bin/env bun
/**
* Minimal MCP server for e2e testing.
* Uses newline-delimited JSON (NDJSON) protocol matching the MCP SDK stdio transport.
*/

import { createInterface } from "readline"
import { createInterface } from "node:readline"

const rl = createInterface({ input: process.stdin, terminal: false })

Expand All @@ -21,7 +16,7 @@ rl.on("line", (line) => {
return
}

const { id, method, params } = message
const { id, method } = message

switch (method) {
case "initialize":
Expand Down
13 changes: 1 addition & 12 deletions e2e/lib/event-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ export type CheckpointState = {
partialToolResults: ToolCallInfo[]
}

// Note: callDelegate, callInteractiveTool, finishAllToolCalls were removed in state-machine-redesign
const RELEVANT_EVENT_TYPES = [
"startRun",
"resumeFromStop",
Expand All @@ -26,13 +25,6 @@ const RELEVANT_EVENT_TYPES = [
"resolveToolResults",
] as const

/**
* Parses NDJSON events from CLI output that may contain literal newlines
* inside JSON string values (e.g. base64 data with MIME-style line breaks).
*
* Strategy: group lines by event boundaries (lines starting with '{"type":')
* and rejoin internal lines with escaped newlines before parsing.
*/
export function parseEvents(output: string): ParsedEvent[] {
const events: ParsedEvent[] = []
const lines = output.split("\n")
Expand All @@ -45,7 +37,6 @@ export function parseEvents(output: string): ParsedEvent[] {
}
buffer = line
} else if (buffer) {
// Continuation line (e.g. base64 data with literal newlines) — rejoin with escaped newline
buffer += "\\n" + line
} else {
tryParseEvent(line, events)
Expand All @@ -64,9 +55,7 @@ function tryParseEvent(text: string, events: ParsedEvent[]): void {
if (data.type) {
events.push({ ...data, raw: text })
}
} catch {
// skip unparseable lines
}
} catch {}
}

export function filterEventsByType<T extends RunEvent["type"]>(
Expand Down
3 changes: 0 additions & 3 deletions e2e/lib/round-robin.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
// Fixed provider/model for E2E tests
// - OpenAI: excluded due to reasoning overhead (~64s vs ~17s), see #194
// - Google: excluded due to empty text bug in delegation, see #195
const DEFAULT_PROVIDER = "anthropic"
const DEFAULT_MODEL = "claude-haiku-4-5"

Expand Down
7 changes: 0 additions & 7 deletions e2e/lib/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,6 @@ function buildFinalArgs(args: string[], options?: RunOptions): string[] {
return injectProviderArgs(args)
}

/**
* Retries a CLI run if the LLM doesn't call the expected tool.
* Handles LLM non-determinism where the model sometimes skips tool calls.
*/
export async function runCliUntilToolCalled(
args: string[],
options: RunOptions,
Expand All @@ -64,7 +60,6 @@ export async function runCliUntilToolCalled(
try {
cmdResult = await runCli(args, options)
} catch {
// Timeout or spawn error — retry
continue
}
result = withEventParsing(cmdResult)
Expand All @@ -85,8 +80,6 @@ export async function runCli(args: string[], options?: RunOptions): Promise<Comm
const cwd = options?.cwd ?? process.cwd()
const env = options?.env ?? { ...process.env }
const finalArgs = buildFinalArgs(args, options)
// Redirect stdout to a temp file to avoid Bun pipe buffering issues
// that truncate large outputs (e.g. events with base64-encoded file data)
const stdoutFile = join(
tmpdir(),
`perstack-e2e-${Date.now()}-${Math.random().toString(36).slice(2)}.out`,
Expand Down
16 changes: 1 addition & 15 deletions e2e/perstack-cli/bundled-base.test.ts
Original file line number Diff line number Diff line change
@@ -1,22 +1,12 @@
/**
* Bundled Base Skill E2E Tests
*
* Tests that the bundled @perstack/base skill uses InMemoryTransport
* for near-zero initialization latency.
*
* TOML: e2e/experts/bundled-base.toml
*/
import { describe, expect, it } from "bun:test"
import { assertEventSequenceContains } from "../lib/assertions.js"
import { filterEventsByType } from "../lib/event-parser.js"
import { runCli, withEventParsing } from "../lib/runner.js"

const BUNDLED_BASE_CONFIG = "./e2e/experts/bundled-base.toml"
// LLM API calls require extended timeout
const LLM_TIMEOUT = 120000

describe.concurrent("Bundled Base Skill", () => {
/** Verifies bundled base skill initializes with InMemoryTransport (spawnDurationMs = 0). */
it(
"should use InMemoryTransport for bundled base (spawnDurationMs = 0)",
async () => {
Expand All @@ -31,7 +21,6 @@ describe.concurrent("Bundled Base Skill", () => {
true,
)

// Check that skillConnected event for @perstack/base has spawnDurationMs = 0
const skillConnectedEvents = filterEventsByType(result.events, "skillConnected")
const baseSkillEvent = skillConnectedEvents.find((e) => {
const event = e as { skillName?: string }
Expand All @@ -44,15 +33,13 @@ describe.concurrent("Bundled Base Skill", () => {
spawnDurationMs?: number
totalDurationMs?: number
}
expect(baseEvent.spawnDurationMs).toBe(0) // InMemoryTransport has no spawn
expect(baseEvent.spawnDurationMs).toBe(0)
expect(baseEvent.totalDurationMs).toBeDefined()
// InMemoryTransport should be significantly faster than ~500ms for npx
expect(baseEvent.totalDurationMs).toBeLessThan(100)
},
LLM_TIMEOUT,
)

/** Verifies bundled base skill tools are available. */
it(
"should have all base skill tools available",
async () => {
Expand All @@ -64,7 +51,6 @@ describe.concurrent("Bundled Base Skill", () => {

expect(result.exitCode).toBe(0)

// Check that readTextFile was called (from pick list)
const callToolsEvents = filterEventsByType(result.events, "callTools")
const hasHealthCheck = callToolsEvents.some((e) => {
const event = e as { toolCalls?: Array<{ toolName: string }> }
Expand Down
Loading