10 changes: 10 additions & 0 deletions .changeset/fix-runtime-robustness.md
@@ -0,0 +1,10 @@
---
"create-expert": patch
"@perstack/filesystem-storage": patch
"@perstack/runtime": patch
"@perstack/skill-manager": patch
"@perstack/tui-components": patch
"@perstack/tui": patch
---

fix: improve runtime robustness
62 changes: 51 additions & 11 deletions apps/create-expert/perstack.toml
@@ -68,23 +68,62 @@ pick = ["readTextFile", "writeTextFile", "listDirectory", "think", "attemptCompl
3. Based on the user's request, draft the expert definition
4. Create the expert in memory using `createExpert` to validate the definition
5. Add it as a delegate using `addDelegate` so you can test it
6. Test the expert by calling the delegate tool with a simple, realistic query
7. Review the result: check that the expert behaves as expected
8. If the test shows errors or unexpected behavior:
6. **Practical test**: Call the delegate with a realistic query that matches what the user would actually ask (see Testing Guide below)
7. **Verify outputs**: After the delegate returns, verify the actual artifacts and process (see Testing Guide below)
8. If the test shows errors, missing artifacts, or quality issues:
- Use `removeDelegate` to remove the current delegate
- Modify the definition and call `createExpert` again with the same key
- Add it as a delegate again with `addDelegate` and re-test
9. Once the expert works correctly, write the final `perstack.toml` using `writeTextFile`
9. Once the expert produces correct, complete outputs, write the final `perstack.toml` using `writeTextFile`
10. Use `attemptCompletion` when the expert is created and verified
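The create, test, iterate loop in steps 4 through 10 can be sketched as a retry loop. This is a hypothetical sketch: `createExpert`, `addDelegate`, `removeDelegate`, and `testDelegate` below are stand-in callbacks, not the real tool API, and `fixDraft` is an invented helper.

```typescript
// Hypothetical sketch of the workflow in steps 4-10. The callback
// parameters stand in for tool calls; their signatures are assumptions.
type ExpertDraft = { key: string; instruction: string }
type TestResult = { ok: boolean; issues: string[] }

function iterateUntilVerified(
  draft: ExpertDraft,
  createExpert: (d: ExpertDraft) => void,
  addDelegate: (key: string) => void,
  removeDelegate: (key: string) => void,
  testDelegate: (key: string) => TestResult,
  fixDraft: (d: ExpertDraft, issues: string[]) => ExpertDraft,
  maxAttempts = 3,
): ExpertDraft {
  let current = draft
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    createExpert(current) // step 4: validate the definition in memory
    addDelegate(current.key) // step 5: make it callable
    const result = testDelegate(current.key) // steps 6-7: practical test
    if (result.ok) return current // steps 9-10: ready to write perstack.toml
    removeDelegate(current.key) // step 8: roll back before recreating
    current = fixDraft(current, result.issues)
  }
  throw new Error(`Expert "${draft.key}" still failing after ${maxAttempts} attempts`)
}
```

The loop mirrors the rule in step 8: remove the delegate before recreating the expert under the same key.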

## Testing with createExpert + addDelegate
## Testing Guide

After drafting an expert definition, always test it in memory before writing perstack.toml:
- Use `createExpert` with the expert key, instruction, description, skills, and other fields
- Use `addDelegate` with the expert key to make it callable
- Call the delegate tool with a simple query that exercises the expert's core functionality
- Review the result to verify correctness
- If issues arise, iterate: `removeDelegate` -> fix -> `createExpert` -> `addDelegate` -> re-test
You MUST perform practical, end-to-end testing before writing perstack.toml. The test must simulate the user's actual use case, not just check that the expert "runs without errors".

### Step 1: Design a realistic test query

Before calling the delegate, think about what the user will actually ask this expert to do. The test query should be a concrete, representative task — not a trivial or abstract one.

- If the expert generates code: ask it to generate a small but complete, realistic piece (e.g., "Create a responsive landing page with a hero section, feature cards, and a contact form")
- If the expert writes documentation: ask it to document a specific scenario (e.g., "Write API documentation for a user authentication endpoint with examples")
- If the expert performs analysis: give it real-looking data to analyze
- If the expert manages a workflow with sub-experts: give it a task that exercises delegation to at least one sub-expert

### Step 2: Verify the artifacts after delegation

After the delegate returns its text result, you must verify what was actually produced. Do NOT just read the delegate's response text and assume success.

**For experts that create files:**
1. Use `listDirectory` to confirm all expected files were created
2. Use `readTextFile` to read each generated file
3. Check that file contents are correct, complete, and well-structured
4. Verify no placeholder content (e.g., "TODO", "Lorem ipsum" where real content is expected)
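The placeholder check in item 4 above can be sketched as a simple scan. A minimal sketch, assuming an illustrative marker list; the real check should be adapted to the expert's output format:

```typescript
// Sketch: flag generated files that still contain placeholder content.
// The marker list is illustrative, not exhaustive.
const PLACEHOLDER_MARKERS = ["TODO", "FIXME", "Lorem ipsum", "<placeholder>"]

function findPlaceholders(content: string): string[] {
  const lower = content.toLowerCase()
  return PLACEHOLDER_MARKERS.filter((m) => lower.includes(m.toLowerCase()))
}

function verifyGeneratedFile(path: string, content: string): { path: string; issues: string[] } {
  const issues: string[] = []
  if (content.trim().length === 0) issues.push("file is empty")
  for (const marker of findPlaceholders(content)) {
    issues.push(`contains placeholder "${marker}"`)
  }
  return { path, issues }
}
```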

**For experts that modify existing files:**
1. Use `readTextFile` to read the modified files
2. Verify the changes are correct and the file is still valid
3. Check that unrelated parts of the file were not damaged

**For experts that perform tasks (build, test, deploy, etc.):**
1. Use `exec` to run `perstack logs --last` to inspect the execution process
2. Verify the task steps were performed in the correct order
3. Check that the final state matches expectations (files created, commands run, etc.)

**For experts with delegates (coordinator/lead experts):**
1. Use `exec` to run `perstack logs --last` to verify delegation occurred
2. Confirm that each sub-expert was called with appropriate queries
3. Verify the coordinator properly synthesized results from sub-experts

### Step 3: Evaluate quality, not just correctness

Ask yourself: "If I were the user, would I be satisfied with this output?"
- Is the output complete, or are parts missing?
- Is the quality appropriate for the task?
- Does the expert follow its instruction faithfully?
- Would the user need to manually fix or redo anything?

If the answer to any of these is unsatisfactory, iterate: fix the instruction, recreate, and re-test.

## Important Rules

@@ -108,6 +147,7 @@ pick = [
"writeTextFile",
"listDirectory",
"getFileInfo",
"exec",
"think",
"attemptCompletion",
"createExpert",
4 changes: 2 additions & 2 deletions packages/filesystem/README.md
@@ -32,10 +32,10 @@ import {

// Create and store a job
const job = createInitialJob("job-123", "my-expert", 50)
storeJob(job)
await storeJob(job)

// Retrieve a job
const retrieved = retrieveJob("job-123")
const retrieved = await retrieveJob("job-123")

// Get all jobs
const jobs = getAllJobs()
29 changes: 18 additions & 11 deletions packages/filesystem/src/job.ts
@@ -1,4 +1,5 @@
import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs"
import { existsSync, readdirSync, readFileSync } from "node:fs"
import { mkdir, readFile, writeFile } from "node:fs/promises"
import path from "node:path"
import { type Job, jobSchema } from "@perstack/core"

@@ -10,23 +11,29 @@ export function getJobDir(jobId: string): string {
return `${getJobsDir()}/${jobId}`
}

export function storeJob(job: Job): void {
export async function storeJob(job: Job): Promise<void> {
const jobDir = getJobDir(job.id)
if (!existsSync(jobDir)) {
mkdirSync(jobDir, { recursive: true })
}
await mkdir(jobDir, { recursive: true })
const jobPath = path.resolve(jobDir, "job.json")
writeFileSync(jobPath, JSON.stringify(job, null, 2))
await writeFile(jobPath, JSON.stringify(job, null, 2))
}

export function retrieveJob(jobId: string): Job | undefined {
export async function retrieveJob(jobId: string): Promise<Job | undefined> {
const jobDir = getJobDir(jobId)
const jobPath = path.resolve(jobDir, "job.json")
if (!existsSync(jobPath)) {
return undefined
try {
const content = await readFile(jobPath, "utf-8")
return jobSchema.parse(JSON.parse(content))
} catch (error) {
if (
error instanceof Error &&
"code" in error &&
(error as NodeJS.ErrnoException).code === "ENOENT"
) {
return undefined
}
throw error
}
const content = readFileSync(jobPath, "utf-8")
return jobSchema.parse(JSON.parse(content))
}

export function getAllJobs(): Job[] {
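The error handling in the new `retrieveJob` above isolates a reusable pattern: treat ENOENT as "not found" and rethrow everything else. A sketch of that pattern on its own (the helper names here are ours, not part of the package):

```typescript
// Sketch: a missing file is an expected "not found"; every other
// failure (permissions, I/O) should still surface to the caller.
import { readFile } from "node:fs/promises"

function isEnoent(error: unknown): boolean {
  return (
    error instanceof Error &&
    "code" in error &&
    (error as NodeJS.ErrnoException).code === "ENOENT"
  )
}

async function readIfExists(filePath: string): Promise<string | undefined> {
  try {
    return await readFile(filePath, "utf-8")
  } catch (error) {
    if (isEnoent(error)) return undefined
    throw error
  }
}
```

The pre-change code avoided this with an `existsSync` check, which leaves a race window between the check and the read; catching ENOENT closes that gap.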
3 changes: 3 additions & 0 deletions packages/runtime/src/orchestration/delegation-executor.ts
@@ -3,6 +3,7 @@ import type {
Checkpoint,
DelegationTarget,
Expert,
Job,
Message,
RunEvent,
RunParamsInput,
@@ -45,6 +46,8 @@ export type DelegationExecutionResult = {
export type DelegationRunOptions = {
storeCheckpoint?: (checkpoint: Checkpoint) => Promise<void>
retrieveCheckpoint?: (jobId: string, checkpointId: string) => Promise<Checkpoint>
storeJob?: (job: Job) => Promise<void>
retrieveJob?: (jobId: string) => Promise<Job | undefined>
eventListener?: (event: RunEvent | RuntimeEvent) => void
storeEvent?: (event: RunEvent) => Promise<void>
returnOnDelegationComplete?: boolean
6 changes: 3 additions & 3 deletions packages/runtime/src/run.test.ts
@@ -100,7 +100,7 @@ describe("@perstack/runtime: run", () => {
setupMockExecutor([{ checkpoint: completedCheckpoint }])

const storeJob = vi.fn()
const retrieveJob = vi.fn().mockReturnValue(undefined)
const retrieveJob = vi.fn().mockResolvedValue(undefined)

await run({ setting, checkpoint }, { storeJob, retrieveJob })

@@ -131,7 +131,7 @@
},
}
const storeJob = vi.fn()
const retrieveJob = vi.fn().mockReturnValue(existingJob)
const retrieveJob = vi.fn().mockResolvedValue(existingJob)

await run({ setting, checkpoint }, { storeJob, retrieveJob })

@@ -158,7 +158,7 @@
},
}
const storeJob = vi.fn()
const retrieveJob = vi.fn().mockReturnValue(pausedJob)
const retrieveJob = vi.fn().mockResolvedValue(pausedJob)

await run({ setting, checkpoint }, { storeJob, retrieveJob })

29 changes: 15 additions & 14 deletions packages/runtime/src/run.ts
@@ -27,8 +27,8 @@ export type RunOptions = {
retrieveCheckpoint?: (jobId: string, checkpointId: string) => Promise<Checkpoint>
storeCheckpoint?: (checkpoint: Checkpoint) => Promise<void>
storeEvent?: (event: RunEvent) => Promise<void>
storeJob?: (job: Job) => void
retrieveJob?: (jobId: string) => Job | undefined
storeJob?: (job: Job) => Promise<void>
retrieveJob?: (jobId: string) => Promise<Job | undefined>
createJob?: (jobId: string, expertKey: string, maxSteps?: number) => Job
eventListener?: (event: RunEvent | RuntimeEvent) => void
resolveExpertToRun?: ResolveExpertToRunFn
@@ -67,8 +67,8 @@ export async function run(runInput: RunParamsInput, options?: RunOptions): Promi
const runParams = runParamsSchema.parse(runInput)
let { setting, checkpoint } = runParams

const storeJob = options?.storeJob ?? (() => {})
const retrieveJob = options?.retrieveJob ?? (() => undefined)
const storeJob = options?.storeJob ?? (async () => {})
const retrieveJob = options?.retrieveJob ?? (async () => undefined)
const retrieveCheckpoint =
options?.retrieveCheckpoint ??
(async () => {
@@ -77,11 +77,12 @@
const createJob = options?.createJob ?? defaultCreateJob

let job: Job =
retrieveJob(setting.jobId) ?? createJob(setting.jobId, setting.expertKey, setting.maxSteps)
(await retrieveJob(setting.jobId)) ??
createJob(setting.jobId, setting.expertKey, setting.maxSteps)
if (job.status !== "running") {
job = { ...job, status: "running", finishedAt: undefined }
}
storeJob(job)
await storeJob(job)

const runExecutor = new CoordinatorExecutor({
shouldContinueRun: options?.shouldContinueRun,
@@ -105,11 +106,11 @@
switch (resultCheckpoint.status) {
case "completed": {
if (options?.returnOnDelegationComplete) {
storeJob(job)
await storeJob(job)
return resultCheckpoint
}
if (resultCheckpoint.delegatedBy) {
storeJob(job)
await storeJob(job)
const parentCheckpoint = await retrieveCheckpoint(
setting.jobId,
resultCheckpoint.delegatedBy.checkpointId,
@@ -119,17 +120,17 @@
checkpoint = result.checkpoint
break
}
storeJob({ ...job, status: "completed", finishedAt: Date.now() })
await storeJob({ ...job, status: "completed", finishedAt: Date.now() })
return resultCheckpoint
}

case "stoppedByInteractiveTool": {
storeJob({ ...job, status: "stoppedByInteractiveTool" })
await storeJob({ ...job, status: "stoppedByInteractiveTool" })
return resultCheckpoint
}

case "stoppedByDelegate": {
storeJob(job)
await storeJob(job)
const { delegateTo } = resultCheckpoint
if (!delegateTo || delegateTo.length === 0) {
throw new Error("No delegations found in checkpoint")
@@ -155,17 +156,17 @@
}

case "stoppedByExceededMaxSteps": {
storeJob({ ...job, status: "stoppedByMaxSteps", finishedAt: Date.now() })
await storeJob({ ...job, status: "stoppedByMaxSteps", finishedAt: Date.now() })
return resultCheckpoint
}

case "stoppedByError": {
storeJob({ ...job, status: "stoppedByError", finishedAt: Date.now() })
await storeJob({ ...job, status: "stoppedByError", finishedAt: Date.now() })
return resultCheckpoint
}

case "stoppedByCancellation": {
storeJob({ ...job, status: "stoppedByCancellation", finishedAt: Date.now() })
await storeJob({ ...job, status: "stoppedByCancellation", finishedAt: Date.now() })
return resultCheckpoint
}

2 changes: 1 addition & 1 deletion packages/runtime/src/state-machine/machine.ts
@@ -204,7 +204,7 @@ export const runtimeStateMachine = setup({
messages: [...context.checkpoint.messages, event.newMessage],
usage: newUsage,
contextWindowUsage: context.checkpoint.contextWindow
? calculateContextWindowUsage(newUsage, context.checkpoint.contextWindow)
? calculateContextWindowUsage(event.usage, context.checkpoint.contextWindow)
: undefined,
retryCount: 0,
} satisfies Checkpoint
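The one-line fix above changes which figure feeds `calculateContextWindowUsage`: context-window occupancy should reflect the latest step's usage (`event.usage`), not the running total across steps (`newUsage`), since each step's prompt already contains the prior conversation. A sketch of the difference, with `calculateContextWindowUsage` modeled as a simple ratio (an assumption, not the real implementation):

```typescript
// Sketch: why the fix feeds the latest step's usage, not the sum.
type Usage = { totalTokens: number }

// Assumed shape: fraction of the context window a usage figure occupies.
function calculateContextWindowUsage(usage: Usage, contextWindow: number): number {
  return usage.totalTokens / contextWindow
}

const contextWindow = 100_000

// Each step's usage already counts the prompt, i.e. the prior conversation.
const stepUsages: Usage[] = [{ totalTokens: 40_000 }, { totalTokens: 55_000 }]

// Old input: the cumulative sum double-counts the shared prompt tokens.
const cumulative = { totalTokens: stepUsages.reduce((sum, u) => sum + u.totalTokens, 0) }
// Fixed input: the latest step alone reflects actual window occupancy.
const latest = stepUsages[1]
```

With the cumulative figure the window appears nearly full after two moderate steps, which would misreport pressure on long runs.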
@@ -6,7 +6,6 @@ import {
resolveToolResults,
type ToolResult,
} from "@perstack/core"
import { calculateContextWindowUsage } from "../../helpers/model.js"
import { createEmptyUsage, sumUsage } from "../../helpers/usage.js"
import { createExpertMessage, createToolMessage } from "../../messages/message.js"
import { classifyToolCalls, toolExecutorFactory } from "../../tool-execution/index.js"
@@ -115,9 +114,7 @@ export async function callingMcpToolsLogic({
...checkpoint,
messages: [...checkpoint.messages, ...newMessages],
usage: newUsage,
contextWindowUsage: checkpoint.contextWindow
? calculateContextWindowUsage(newUsage, checkpoint.contextWindow)
: undefined,
contextWindowUsage: checkpoint.contextWindowUsage,
status: "completed",
// Clear tool handling state on completion
pendingToolCalls: undefined,
@@ -177,7 +177,7 @@ export async function generatingRunResultLogic({
messages: [...messages, ...newMessages],
usage: newUsage,
contextWindowUsage: checkpoint.contextWindow
? calculateContextWindowUsage(newUsage, checkpoint.contextWindow)
? calculateContextWindowUsage(usage, checkpoint.contextWindow)
: undefined,
status: "completed",
// Clear tool handling state on completion
27 changes: 23 additions & 4 deletions packages/skill-manager/src/skill-manager.ts
@@ -249,12 +249,31 @@ export class SkillManager {
const delegateExpert = experts[key]
if (!delegateExpert) throw new Error(`Expert "${key}" not found`)
await sm.addDelegate(delegateExpert)
// Persist in expert definition so it survives SkillManager recreation across runs
if (!expert.delegates.includes(key)) {
expert.delegates.push(key)
}
const added = sm.getAdapters().get(delegateExpert.name)
const toolName = added?.getToolDefinitions()[0]?.name ?? delegateExpert.name
return { delegateToolName: toolName }
},
removeDelegate: (name) => sm.removeDelegate(name),
removeDelegate: async (name) => {
await sm.removeDelegate(name)
const index = expert.delegates.indexOf(name)
if (index >= 0) {
expert.delegates.splice(index, 1)
}
},
createExpert: async (input) => {
// Validate that all declared delegates exist before creating the expert
if (input.delegates) {
const missing = input.delegates.filter((d) => !experts[d])
if (missing.length > 0) {
throw new Error(
`Cannot create expert "${input.key}": delegates [${missing.join(", ")}] do not exist. Create them with createExpert first.`,
)
}
}
// Ensure @perstack/base is always included in skills
const skills = input.skills
? {
Expand All @@ -267,7 +286,7 @@ export class SkillManager {
...input.skills,
}
: undefined
const expert = expertSchema.parse({
const newExpert = expertSchema.parse({
key: input.key,
name: input.key,
version: input.version ?? "1.0.0",
Expand All @@ -278,8 +297,8 @@ export class SkillManager {
tags: input.tags,
providerTools: input.providerTools,
})
experts[expert.key] = expert
return { expertKey: expert.key }
experts[newExpert.key] = newExpert
return { expertKey: newExpert.key }
},
})
break
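The delegate-existence check added to `createExpert` above is a validate-before-mutate guard: reject unknown keys before the registry changes, so a failed create leaves no partial state. A sketch with the registry shape assumed:

```typescript
// Sketch: reject unknown delegate keys before touching the registry,
// so a failed create leaves no partial state behind.
function validateDelegates(
  experts: Record<string, unknown>,
  key: string,
  delegates: string[] | undefined,
): void {
  if (!delegates) return
  const missing = delegates.filter((d) => !(d in experts))
  if (missing.length > 0) {
    throw new Error(
      `Cannot create expert "${key}": delegates [${missing.join(", ")}] do not exist. Create them with createExpert first.`,
    )
  }
}
```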
1 change: 0 additions & 1 deletion packages/tui-components/src/constants.ts
@@ -26,7 +26,6 @@ export const INDICATOR = {

export const STOP_EVENT_TYPES = [
"stopRunByInteractiveTool",
"stopRunByDelegate",
"stopRunByExceededMaxSteps",
] as const satisfies readonly RunEvent["type"][]
