From ce05848a8a024e823c4cdfbf909497f466576675 Mon Sep 17 00:00:00 2001 From: HiranoMasaaki Date: Sat, 14 Mar 2026 00:10:48 +0000 Subject: [PATCH] fix: delegate file discipline, verify-test artifact reading ban, build loop limits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three issues observed in production: 1. Delegates creating new files instead of editing existing ones → workspace sprawl 2. verify-test reading artifacts and doing content review instead of running commands 3. build loop running indefinitely on soft opinion feedback Fixes: - Delegate meta instruction: prefer editing over creating, no proactive docs - verify-test: explicit ban on reading/reviewing produced artifacts - build: validate CONTINUE feedback (only hard signal failures loop), max 3 iterations - Bump create-expert to 1.0.19 Co-Authored-By: Claude Opus 4.6 (1M context) --- .changeset/delegate-file-discipline.md | 6 +++++ definitions/create-expert/perstack.toml | 27 +++++++++++-------- .../src/messages/instruction-message.ts | 3 +++ 3 files changed, 25 insertions(+), 11 deletions(-) create mode 100644 .changeset/delegate-file-discipline.md diff --git a/.changeset/delegate-file-discipline.md b/.changeset/delegate-file-discipline.md new file mode 100644 index 00000000..380211a1 --- /dev/null +++ b/.changeset/delegate-file-discipline.md @@ -0,0 +1,6 @@ +--- +"perstack": patch +"@perstack/runtime": patch +--- + +fix: add file creation discipline to delegate meta instruction and prevent infinite verify loops diff --git a/definitions/create-expert/perstack.toml b/definitions/create-expert/perstack.toml index 5c7592ae..595731d9 100644 --- a/definitions/create-expert/perstack.toml +++ b/definitions/create-expert/perstack.toml @@ -88,7 +88,7 @@ [experts."create-expert"] defaultModelTier = "high" -version = "1.0.18" +version = "1.0.19" description = "Creates and modifies Perstack expert definitions in perstack.toml" instruction = """ You are the coordinator 
for creating and modifying Perstack expert definitions. perstack.toml is the single source of truth — your job is to produce or modify it according to the user's request. @@ -133,7 +133,7 @@ pick = ["readTextFile", "exec", "attemptCompletion"] [experts."@create-expert/plan"] defaultModelTier = "high" -version = "1.0.18" +version = "1.0.19" description = """ Analyzes the user's request and produces plan.md: domain constraints, test query, verification signals, and role architecture. Provide: (1) what the expert should do, (2) path to existing perstack.toml if one exists. @@ -220,7 +220,7 @@ pick = [ [experts."@create-expert/build"] defaultModelTier = "low" -version = "1.0.18" +version = "1.0.19" description = """ Orchestrates the write → review → test → verify cycle for perstack.toml. Provide: path to plan.md (containing requirements, architecture, test query, and verification signals). @@ -245,11 +245,14 @@ You do NOT write perstack.toml yourself. You do NOT evaluate results yourself. Y 3. If review returns CONTINUE: delegate to write-definition with the review feedback, then restart from step 2 4. If review returns PASS: delegate to test-expert with the test query from plan.md, perstack.toml path, and coordinator expert name (do NOT pass verification signals — test-expert is a pure executor) 5. Delegate to verify-test: pass the test-expert result, the verification signals from plan.md, and the perstack.toml path -6. If verify-test returns CONTINUE: delegate to write-definition with the failure feedback, then restart from step 2 +6. If verify-test returns CONTINUE: validate the feedback (see below), then delegate to write-definition with actionable feedback, restart from step 2 7. If verify-test returns PASS: done — attemptCompletion with the verification evidence -### Why one query is enough -Hard signals are deterministic — same input, same result. 
If all signals pass AND reproduce identically on re-execution (verified by verify-test's reproducibility step), a single query provides the same confidence as multiple runs. Multiple queries compensate for soft signals; hard signals need no compensation. +### Validating CONTINUE feedback +Before looping, check whether verify-test's feedback is a hard signal failure (command X → expected Y → got Z) or a soft opinion (content quality, style, implementation choices). Only hard signal failures and structural check failures are valid loop reasons. If verify-test reports issues that are not tied to a specific command result, do NOT loop — report the current state and note the discrepancy. + +### Maximum 3 iterations +The write → review → test → verify cycle may run at most 3 times. If any signal has still not passed after 3 iterations, attemptCompletion with: what passed, what failed, and the feedback from each iteration. Do NOT continue looping beyond this limit. ### IMPORTANT: One delegate call per response Delegate to exactly ONE delegate per response. Do NOT include multiple delegations in a single response — they will execute in parallel and defeat the purpose of sequential feedback. @@ -278,7 +281,7 @@ pick = ["readTextFile", "exec", "todo", "attemptCompletion"] [experts."@create-expert/write-definition"] defaultModelTier = "low" -version = "1.0.18" +version = "1.0.19" description = """ Writes or modifies a perstack.toml definition from plan.md requirements and architecture. Provide: (1) path to plan.md, (2) optionally path to existing perstack.toml to preserve, (3) optionally feedback from a failed test to address. @@ -381,7 +384,7 @@ pick = [ [experts."@create-expert/review-definition"] defaultModelTier = "low" -version = "1.0.18" +version = "1.0.19" description = """ Reviews perstack.toml against plan.md for domain knowledge alignment and instruction quality. Provide: (1) path to plan.md, (2) path to perstack.toml. 
@@ -430,14 +433,16 @@ pick = ["readTextFile", "todo", "attemptCompletion"] [experts."@create-expert/verify-test"] defaultModelTier = "low" -version = "1.0.18" +version = "1.0.19" description = """ Executes hard signal checks against test-expert's results, verifies their reproducibility, and checks the definition structure. Provide: (1) the test-expert's factual report (query, what was produced, errors), (2) the verification signals from plan.md, (3) path to perstack.toml. Returns a verdict: PASS (all signals pass and reproduce) or CONTINUE (iteration needed). """ instruction = """ -You are the verifier in the build loop. You execute hard signal checks — verification whose results do not depend on your judgment. You run commands, compare outputs, and report pass/fail. You do NOT read artifacts and form opinions about their quality. +You are the verifier in the build loop. You execute hard signal checks — verification whose results do not depend on your judgment. You run commands, compare outputs, and report pass/fail. + +You do NOT read produced artifacts. You do NOT review the content, quality, or style of produced artifacts. You do NOT open artifacts to inspect their contents. Your only inputs are command outputs and their expected results. If an issue is not observable through a command's output, it is not your concern. All three steps below are MANDATORY. Skipping any step is grounds for an invalid verification. @@ -493,7 +498,7 @@ pick = ["readTextFile", "exec", "todo", "attemptCompletion"] [experts."@create-expert/test-expert"] defaultModelTier = "low" -version = "1.0.18" +version = "1.0.19" description = """ Executes a single test query against a Perstack expert definition and reports what happened. Provide: (1) path to perstack.toml, (2) the test query to execute, (3) the coordinator expert name to test. 
diff --git a/packages/runtime/src/messages/instruction-message.ts b/packages/runtime/src/messages/instruction-message.ts index a63e297f..8e3c14c6 100644 --- a/packages/runtime/src/messages/instruction-message.ts +++ b/packages/runtime/src/messages/instruction-message.ts @@ -17,6 +17,9 @@ function getDelegateMetaInstruction(): string { return dedent` Before starting work, investigate the workspace and understand the current state. Then use the todo tool to create a plan of action. Work through the todos step by step, marking each completed as you go. + NEVER create files unless they're absolutely necessary for achieving your goal. ALWAYS prefer editing an existing file to creating a new one. + NEVER proactively create documentation files. Only create documentation files if explicitly requested. + When the task is complete, call attemptCompletion with a result parameter containing your final response. When you cannot help, call attemptCompletion without a result.