perstack-ai · FL4TLiN3 · Feb 25, 2026 · Feb 25, 2026
diff --git a/e2e/create-expert/create-expert.test.ts b/e2e/create-expert/create-expert.test.ts
@@ -1,16 +1,3 @@
-/**
- * Create Expert E2E Tests
- *
- * Tests the create-expert agent that creates/modifies perstack.toml files:
- * - Creates new expert definitions via planner + definition-writer + expert-tester delegates
- * - Investigates MCP registry skills via skill-finder delegate when external integrations needed
- * - Adds discovered MCP skills to generated perstack.toml expert definitions
- * - Tests experts via addDelegateFromConfig after writing perstack.toml
- * - Preserves existing experts when modifying perstack.toml
- *
- * Binary: apps/create-expert/dist/bin/cli.js (--headless mode)
- */
-
 import { describe, expect, it } from "bun:test"
 import { spawn } from "node:child_process"
 import fs from "node:fs"
@@ -23,10 +10,6 @@ import { type CommandResult, type RunResult, withEventParsing } from "../lib/run
 
 const PROJECT_ROOT = path.resolve(process.cwd())
 const CLI_PATH = path.join(PROJECT_ROOT, "apps/create-expert/dist/bin/cli.js")
-// LLM API calls require extended timeout; delegation adds extra LLM round-trips.
-// The create-expert workflow involves multiple delegation round-trips (planner →
-// skill-finder → definition-writer → expert-tester, with possible retries) which
-// can exceed 10 minutes in CI environments.
 const LLM_TIMEOUT = 900_000
 
 function runCreateExpert(query: string, cwd: string, timeout = LLM_TIMEOUT): Promise<RunResult> {
@@ -64,14 +47,12 @@ function createTempDir(): string {
   return fs.mkdtempSync(path.join(os.tmpdir(), "create-expert-"))
 }
 
-/** Extract all tool names called across callTools events */
 function getAllCalledToolNames(result: RunResult): string[] {
   return filterEventsByType(result.events, "callTools").flatMap((e) =>
     extractToolCalls(e).map((tc) => tc.toolName),
   )
 }
 
-/** Build a diagnostic string from RunResult for assertion failure messages */
 function diagnostics(result: RunResult): string {
   const errorEvents = result.events
     .filter((e) => e.type === "stopRunByError")
@@ -89,68 +70,64 @@ describe("create-expert", () => {
   it(
     "should create a new perstack.toml with MCP skill integration",
     async () => {
-      const tempDir = createTempDir()
-
-      // Request an expert that requires external API integration to trigger skill-finder
-      const result = await runCreateExpert(
-        "Create a GitHub repository analyzer expert that reads GitHub issues and pull requests via the GitHub API to generate project status reports",
-        tempDir,
-      )
-
-      expect(result.exitCode, diagnostics(result)).toBe(0)
-
-      // Verify control flow: coordinator starts, delegates, then completes
-      const controlFlow = result.events
-        .filter((e) => ["startRun", "stopRunByDelegate", "completeRun"].includes(e.type))
-        .map((e) => e.type)
-      expect(controlFlow[0]).toBe("startRun")
-      expect(controlFlow).toContain("stopRunByDelegate")
-      expect(controlFlow.at(-1)).toBe("completeRun")
-
-      // Verify the coordinator (create-expert) starts and completes
-      const startEvents = filterEventsByType(result.events, "startRun")
-      const completeEvents = filterEventsByType(result.events, "completeRun")
-      expect(
-        startEvents.some((e) => (e as { expertKey: string }).expertKey === "create-expert"),
-      ).toBe(true)
-      expect(
-        completeEvents.some((e) => (e as { expertKey: string }).expertKey === "create-expert"),
-      ).toBe(true)
-
-      // Verify delegation: at least 3 completeRun (planner + skill-finder + definition-writer/tester + coordinator)
-      expect(completeEvents.length).toBeGreaterThanOrEqual(3)
+      let result: RunResult | undefined
+      let tomlContent = ""
+      let nonBaseSkillMatches: RegExpMatchArray | null = null
+
+      for (let attempt = 0; attempt < 3; attempt++) {
+        const tempDir = createTempDir()
+
+        result = await runCreateExpert(
+          "Create a GitHub repository analyzer expert that reads GitHub issues and pull requests via the GitHub API to generate project status reports",
+          tempDir,
+        )
+
+        expect(result.exitCode, diagnostics(result)).toBe(0)
+
+        const controlFlow = result.events
+          .filter((e) => ["startRun", "stopRunByDelegate", "completeRun"].includes(e.type))
+          .map((e) => e.type)
+        expect(controlFlow[0]).toBe("startRun")
+        expect(controlFlow).toContain("stopRunByDelegate")
+        expect(controlFlow.at(-1)).toBe("completeRun")
+
+        const startEvents = filterEventsByType(result.events, "startRun")
+        const completeEvents = filterEventsByType(result.events, "completeRun")
+        expect(
+          startEvents.some((e) => (e as { expertKey: string }).expertKey === "create-expert"),
+        ).toBe(true)
+        expect(
+          completeEvents.some((e) => (e as { expertKey: string }).expertKey === "create-expert"),
+        ).toBe(true)
+
+        expect(completeEvents.length).toBeGreaterThanOrEqual(3)
+
+        const toolNames = getAllCalledToolNames(result)
+        expect(toolNames).toContain("writeTextFile")
+        expect(toolNames).toContain("addDelegateFromConfig")
+
+        expect(toolNames, "searchMcpRegistry should be called by skill-finder").toContain(
+          "searchMcpRegistry",
+        )
+
+        const skillReportPath = path.join(tempDir, "skill-report.md")
+        expect(fs.existsSync(skillReportPath)).toBe(true)
+
+        const tomlPath = path.join(tempDir, "perstack.toml")
+        expect(fs.existsSync(tomlPath)).toBe(true)
+        tomlContent = fs.readFileSync(tomlPath, "utf-8")
+        const expertMatches = tomlContent.match(/\[experts\."[^"]+"\]/g)
+        expect(expertMatches).not.toBeNull()
+        expect(expertMatches!.length).toBeGreaterThanOrEqual(1)
+
+        nonBaseSkillMatches = tomlContent.match(
+          /\[experts\."[^"]+".skills\."(?!@perstack\/base")[^"]+"\]/g,
+        )
+        if (nonBaseSkillMatches && nonBaseSkillMatches.length > 0) {
+          break
+        }
+      }
 
-      // Verify definition-writer writes TOML and expert-tester tests via addDelegateFromConfig
-      const toolNames = getAllCalledToolNames(result)
-      expect(toolNames).toContain("writeTextFile")
-      expect(toolNames).toContain("addDelegateFromConfig")
-
-      // Verify skill investigation: skill-finder searched the MCP registry
-      expect(toolNames, "searchMcpRegistry should be called by skill-finder").toContain(
-        "searchMcpRegistry",
-      )
-
-      // Verify skill-report.md was created (skill-finder output)
-      const skillReportPath = path.join(tempDir, "skill-report.md")
-      expect(
-        fs.existsSync(skillReportPath),
-        "skill-report.md should be created by skill-finder",
-      ).toBe(true)
-
-      // Verify perstack.toml was created with at least one expert definition
-      const tomlPath = path.join(tempDir, "perstack.toml")
-      expect(fs.existsSync(tomlPath)).toBe(true)
-      const tomlContent = fs.readFileSync(tomlPath, "utf-8")
-      const expertMatches = tomlContent.match(/\[experts\."[^"]+"\]/g)
-      expect(expertMatches).not.toBeNull()
-      expect(expertMatches!.length).toBeGreaterThanOrEqual(1)
-
-      // Verify skill addition: at least one expert has a non-base skill (MCP integration).
-      // This depends on the LLM correctly forwarding skill-report.md through the delegation
-      // chain (coordinator → definition-writer), which can be non-deterministic with smaller models.
-      const nonBaseSkillMatches = tomlContent.match(
-        /\[experts\."[^"]+".skills\."(?!@perstack\/base")[^"]+"\]/g,
-      )
       expect(
         nonBaseSkillMatches && nonBaseSkillMatches.length > 0,
         "at least one expert should have a non-base MCP skill",
@@ -164,7 +141,6 @@ describe("create-expert", () => {
     async () => {
       const tempDir = createTempDir()
 
-      // Create an existing perstack.toml with one expert
       const existingToml = `model = "claude-sonnet-4-5"
 
 [provider]
@@ -187,23 +163,18 @@ pick = ["attemptCompletion"]
 
       expect(result.exitCode, diagnostics(result)).toBe(0)
 
-      // Verify control flow: start → delegate → complete
       expect(
         assertEventSequenceContains(result.events, ["startRun", "stopRunByDelegate", "completeRun"])
           .passed,
       ).toBe(true)
 
-      // Verify definition-writer writes TOML and expert-tester tests via addDelegateFromConfig
       const toolNames = getAllCalledToolNames(result)
       expect(toolNames).toContain("writeTextFile")
       expect(toolNames).toContain("addDelegateFromConfig")
 
-      // Verify perstack.toml was updated with existing + new experts
       const tomlPath = path.join(tempDir, "perstack.toml")
       const tomlContent = fs.readFileSync(tomlPath, "utf-8")
-      // Original expert should be preserved
       expect(tomlContent).toContain('[experts."existing-expert"]')
-      // New expert should be added (at least 2 expert sections)
       const expertMatches = tomlContent.match(/\[experts\."[^"]+"\]/g)
       expect(expertMatches).not.toBeNull()
       expect(expertMatches!.length).toBeGreaterThanOrEqual(2)

diff --git a/e2e/fixtures/minimal-mcp-server.mjs b/e2e/fixtures/minimal-mcp-server.mjs
@@ -1,10 +1,5 @@
 #!/usr/bin/env bun
-/**
- * Minimal MCP server for e2e testing.
- * Uses newline-delimited JSON (NDJSON) protocol matching the MCP SDK stdio transport.
- */
-
-import { createInterface } from "readline"
+import { createInterface } from "node:readline"
 
 const rl = createInterface({ input: process.stdin, terminal: false })
 
@@ -21,7 +16,7 @@ rl.on("line", (line) => {
     return
   }
 
-  const { id, method, params } = message
+  const { id, method } = message
 
   switch (method) {
     case "initialize":

diff --git a/e2e/lib/event-parser.ts b/e2e/lib/event-parser.ts
@@ -14,7 +14,6 @@ export type CheckpointState = {
   partialToolResults: ToolCallInfo[]
 }
 
-// Note: callDelegate, callInteractiveTool, finishAllToolCalls were removed in state-machine-redesign
 const RELEVANT_EVENT_TYPES = [
   "startRun",
   "resumeFromStop",
@@ -26,13 +25,6 @@ const RELEVANT_EVENT_TYPES = [
   "resolveToolResults",
 ] as const
 
-/**
- * Parses NDJSON events from CLI output that may contain literal newlines
- * inside JSON string values (e.g. base64 data with MIME-style line breaks).
- *
- * Strategy: group lines by event boundaries (lines starting with '{"type":')
- * and rejoin internal lines with escaped newlines before parsing.
- */
 export function parseEvents(output: string): ParsedEvent[] {
   const events: ParsedEvent[] = []
   const lines = output.split("\n")
@@ -45,7 +37,6 @@ export function parseEvents(output: string): ParsedEvent[] {
       }
       buffer = line
     } else if (buffer) {
-      // Continuation line (e.g. base64 data with literal newlines) — rejoin with escaped newline
       buffer += "\\n" + line
     } else {
       tryParseEvent(line, events)
@@ -64,9 +55,7 @@ function tryParseEvent(text: string, events: ParsedEvent[]): void {
     if (data.type) {
       events.push({ ...data, raw: text })
     }
-  } catch {
-    // skip unparseable lines
-  }
+  } catch {}
 }
 
 export function filterEventsByType<T extends RunEvent["type"]>(

diff --git a/e2e/lib/round-robin.ts b/e2e/lib/round-robin.ts
@@ -1,6 +1,3 @@
-// Fixed provider/model for E2E tests
-// - OpenAI: excluded due to reasoning overhead (~64s vs ~17s), see #194
-// - Google: excluded due to empty text bug in delegation, see #195
 const DEFAULT_PROVIDER = "anthropic"
 const DEFAULT_MODEL = "claude-haiku-4-5"
 

diff --git a/e2e/lib/runner.ts b/e2e/lib/runner.ts
@@ -48,10 +48,6 @@ function buildFinalArgs(args: string[], options?: RunOptions): string[] {
   return injectProviderArgs(args)
 }
 
-/**
- * Retries a CLI run if the LLM doesn't call the expected tool.
- * Handles LLM non-determinism where the model sometimes skips tool calls.
- */
 export async function runCliUntilToolCalled(
   args: string[],
   options: RunOptions,
@@ -64,7 +60,6 @@ export async function runCliUntilToolCalled(
     try {
       cmdResult = await runCli(args, options)
     } catch {
-      // Timeout or spawn error — retry
       continue
     }
     result = withEventParsing(cmdResult)
@@ -85,8 +80,6 @@ export async function runCli(args: string[], options?: RunOptions): Promise<Comm
   const cwd = options?.cwd ?? process.cwd()
   const env = options?.env ?? { ...process.env }
   const finalArgs = buildFinalArgs(args, options)
-  // Redirect stdout to a temp file to avoid Bun pipe buffering issues
-  // that truncate large outputs (e.g. events with base64-encoded file data)
   const stdoutFile = join(
     tmpdir(),
     `perstack-e2e-${Date.now()}-${Math.random().toString(36).slice(2)}.out`,

diff --git a/e2e/perstack-cli/bundled-base.test.ts b/e2e/perstack-cli/bundled-base.test.ts
@@ -1,22 +1,12 @@
-/**
- * Bundled Base Skill E2E Tests
- *
- * Tests that the bundled @perstack/base skill uses InMemoryTransport
- * for near-zero initialization latency.
- *
- * TOML: e2e/experts/bundled-base.toml
- */
 import { describe, expect, it } from "bun:test"
 import { assertEventSequenceContains } from "../lib/assertions.js"
 import { filterEventsByType } from "../lib/event-parser.js"
 import { runCli, withEventParsing } from "../lib/runner.js"
 
 const BUNDLED_BASE_CONFIG = "./e2e/experts/bundled-base.toml"
-// LLM API calls require extended timeout
 const LLM_TIMEOUT = 120000
 
 describe.concurrent("Bundled Base Skill", () => {
-  /** Verifies bundled base skill initializes with InMemoryTransport (spawnDurationMs = 0). */
   it(
     "should use InMemoryTransport for bundled base (spawnDurationMs = 0)",
     async () => {
@@ -31,7 +21,6 @@ describe.concurrent("Bundled Base Skill", () => {
         true,
       )
 
-      // Check that skillConnected event for @perstack/base has spawnDurationMs = 0
       const skillConnectedEvents = filterEventsByType(result.events, "skillConnected")
       const baseSkillEvent = skillConnectedEvents.find((e) => {
         const event = e as { skillName?: string }
@@ -44,15 +33,13 @@ describe.concurrent("Bundled Base Skill", () => {
         spawnDurationMs?: number
         totalDurationMs?: number
       }
-      expect(baseEvent.spawnDurationMs).toBe(0) // InMemoryTransport has no spawn
+      expect(baseEvent.spawnDurationMs).toBe(0)
       expect(baseEvent.totalDurationMs).toBeDefined()
-      // InMemoryTransport should be significantly faster than ~500ms for npx
       expect(baseEvent.totalDurationMs).toBeLessThan(100)
     },
     LLM_TIMEOUT,
   )
 
-  /** Verifies bundled base skill tools are available. */
   it(
     "should have all base skill tools available",
     async () => {
@@ -64,7 +51,6 @@ describe.concurrent("Bundled Base Skill", () => {
 
       expect(result.exitCode).toBe(0)
 
-      // Check that readTextFile was called (from pick list)
       const callToolsEvents = filterEventsByType(result.events, "callTools")
       const hasHealthCheck = callToolsEvents.some((e) => {
         const event = e as { toolCalls?: Array<{ toolName: string }> }