diff --git a/.github/workflows/validate-stack.yml b/.github/workflows/validate-stack.yml
index c50723e..c0b528c 100644
--- a/.github/workflows/validate-stack.yml
+++ b/.github/workflows/validate-stack.yml
@@ -26,7 +26,7 @@ jobs:
run: python scripts/validate_stack.py
- name: Python syntax check
- run: python -m py_compile scripts/validate_stack.py scripts/aoa-host-facts
+ run: python -m py_compile scripts/validate_stack.py scripts/aoa-host-facts scripts/aoa-local-ai-trials scripts/aoa-langgraph-pilot scripts/aoa-w5-pilot scripts/aoa-w6-pilot scripts/aoa-llamacpp-pilot
- name: Shellcheck scripts
run: |
@@ -34,6 +34,7 @@ jobs:
scripts/aoa-lib.sh \
scripts/aoa-doctor \
scripts/aoa-install-layout \
+ scripts/aoa-sync-federation-surfaces \
scripts/aoa-sync-configs \
scripts/aoa-bootstrap-configs \
scripts/aoa-check-layout \
@@ -131,6 +132,11 @@ jobs:
export AOA_EXTRA_COMPOSE_FILES="compose/tuning/ollama.cpu.yml"
scripts/aoa-render-config --profile core >/dev/null
+ printf 'GGUFTEST' > "$RUNNER_TEMP/qwen3.5-9b.gguf"
+ export AOA_LLAMACPP_MODEL_HOST_PATH="$RUNNER_TEMP/qwen3.5-9b.gguf"
+ export AOA_EXTRA_COMPOSE_FILES="compose/modules/32-llamacpp-inference.yml,compose/modules/44-llamacpp-agent-sidecar.yml"
+ scripts/aoa-render-config --preset intel-full >/dev/null
+
- name: Capture host-facts artifacts
run: |
mkdir -p "$RUNNER_TEMP/host-facts"
diff --git a/README.md b/README.md
index 973751f..57b3fd8 100644
--- a/README.md
+++ b/README.md
@@ -52,31 +52,33 @@ This repository should not absorb:
7. Read [docs/PROFILE_RECIPES](docs/PROFILE_RECIPES.md).
8. Read [docs/RENDER_TRUTH](docs/RENDER_TRUTH.md).
9. Read [docs/RUNTIME_BENCH_POLICY](docs/RUNTIME_BENCH_POLICY.md).
-10. Read [docs/INTERNAL_PROBES](docs/INTERNAL_PROBES.md).
-11. Read [docs/PATHS](docs/PATHS.md).
-12. Read [docs/WINDOWS_BRIDGE](docs/WINDOWS_BRIDGE.md).
-13. Read [docs/WINDOWS_SETUP](docs/WINDOWS_SETUP.md).
-14. Read [docs/WINDOWS_PERFORMANCE](docs/WINDOWS_PERFORMANCE.md).
-15. Read [docs/STORAGE_LAYOUT](docs/STORAGE_LAYOUT.md).
-16. Read [docs/REFERENCE_PLATFORM](docs/REFERENCE_PLATFORM.md).
-17. Read [docs/REFERENCE_PLATFORM_SPEC](docs/REFERENCE_PLATFORM_SPEC.md).
-18. Read [docs/MACHINE_FIT_POLICY](docs/MACHINE_FIT_POLICY.md).
-19. Read [docs/PLATFORM_ADAPTATION_POLICY](docs/PLATFORM_ADAPTATION_POLICY.md).
-20. Read [docs/BRANCH_POLICY](docs/BRANCH_POLICY.md).
-21. Read [docs/MEMO_RUNTIME_SEAM](docs/MEMO_RUNTIME_SEAM.md).
-22. Read [docs/EVAL_RUNTIME_SEAM](docs/EVAL_RUNTIME_SEAM.md).
-23. Read [docs/PLAYBOOK_RUNTIME_SEAM](docs/PLAYBOOK_RUNTIME_SEAM.md).
-24. Read [docs/MODEL_PROFILES](docs/MODEL_PROFILES.md).
-25. Read [docs/CONTEXT_BUDGET_POLICY](docs/CONTEXT_BUDGET_POLICY.md).
-26. Read [docs/RECURRENCE_RUNTIME_POLICY](docs/RECURRENCE_RUNTIME_POLICY.md).
-27. Read [docs/DEPLOYMENT](docs/DEPLOYMENT.md).
-28. Read [docs/FIRST_RUN](docs/FIRST_RUN.md).
-29. Read [docs/DOCTOR](docs/DOCTOR.md).
-30. Read [docs/SECRETS_BOOTSTRAP](docs/SECRETS_BOOTSTRAP.md).
-31. Read [docs/LIFECYCLE](docs/LIFECYCLE.md).
-32. Read [docs/RUNBOOK](docs/RUNBOOK.md).
-33. Read [docs/SECURITY](docs/SECURITY.md).
-34. Read [docs/MIGRATION_FROM_OLD](docs/MIGRATION_FROM_OLD.md).
+10. Read [docs/LLAMACPP_PILOT](docs/LLAMACPP_PILOT.md).
+11. Read [docs/LOCAL_AI_TRIALS](docs/LOCAL_AI_TRIALS.md).
+12. Read [docs/INTERNAL_PROBES](docs/INTERNAL_PROBES.md).
+13. Read [docs/PATHS](docs/PATHS.md).
+14. Read [docs/WINDOWS_BRIDGE](docs/WINDOWS_BRIDGE.md).
+15. Read [docs/WINDOWS_SETUP](docs/WINDOWS_SETUP.md).
+16. Read [docs/WINDOWS_PERFORMANCE](docs/WINDOWS_PERFORMANCE.md).
+17. Read [docs/STORAGE_LAYOUT](docs/STORAGE_LAYOUT.md).
+18. Read [docs/REFERENCE_PLATFORM](docs/REFERENCE_PLATFORM.md).
+19. Read [docs/REFERENCE_PLATFORM_SPEC](docs/REFERENCE_PLATFORM_SPEC.md).
+20. Read [docs/MACHINE_FIT_POLICY](docs/MACHINE_FIT_POLICY.md).
+21. Read [docs/PLATFORM_ADAPTATION_POLICY](docs/PLATFORM_ADAPTATION_POLICY.md).
+22. Read [docs/BRANCH_POLICY](docs/BRANCH_POLICY.md).
+23. Read [docs/MEMO_RUNTIME_SEAM](docs/MEMO_RUNTIME_SEAM.md).
+24. Read [docs/EVAL_RUNTIME_SEAM](docs/EVAL_RUNTIME_SEAM.md).
+25. Read [docs/PLAYBOOK_RUNTIME_SEAM](docs/PLAYBOOK_RUNTIME_SEAM.md).
+26. Read [docs/MODEL_PROFILES](docs/MODEL_PROFILES.md).
+27. Read [docs/CONTEXT_BUDGET_POLICY](docs/CONTEXT_BUDGET_POLICY.md).
+28. Read [docs/RECURRENCE_RUNTIME_POLICY](docs/RECURRENCE_RUNTIME_POLICY.md).
+29. Read [docs/DEPLOYMENT](docs/DEPLOYMENT.md).
+30. Read [docs/FIRST_RUN](docs/FIRST_RUN.md).
+31. Read [docs/DOCTOR](docs/DOCTOR.md).
+32. Read [docs/SECRETS_BOOTSTRAP](docs/SECRETS_BOOTSTRAP.md).
+33. Read [docs/LIFECYCLE](docs/LIFECYCLE.md).
+34. Read [docs/RUNBOOK](docs/RUNBOOK.md).
+35. Read [docs/SECURITY](docs/SECURITY.md).
+36. Read [docs/MIGRATION_FROM_OLD](docs/MIGRATION_FROM_OLD.md).
For the shortest next route by intent:
- if you need the ecosystem center, layer map, or federation rules, go to [`Agents-of-Abyss`](https://github.com/8Dionysus/Agents-of-Abyss)
@@ -89,6 +91,8 @@ For the shortest next route by intent:
- if you need playbook meaning, activation doctrine, or authored execution bundles, go to [`aoa-playbooks`](https://github.com/8Dionysus/aoa-playbooks)
- if you need the Windows host and WSL bridge workflow, read [docs/WINDOWS_BRIDGE](docs/WINDOWS_BRIDGE.md), [docs/WINDOWS_SETUP](docs/WINDOWS_SETUP.md), and [docs/WINDOWS_PERFORMANCE](docs/WINDOWS_PERFORMANCE.md)
- if you need runtime benchmark ownership, storage, and manifest rules, read [docs/RUNTIME_BENCH_POLICY](docs/RUNTIME_BENCH_POLICY.md)
+- if you need the bounded llama.cpp A/B runtime pilot next to the validated Ollama path, read [docs/LLAMACPP_PILOT](docs/LLAMACPP_PILOT.md)
+- if you need bounded local-model trial contracts, W4 supervised edits, or the promoted W5/W6 local-worker path, read [docs/LOCAL_AI_TRIALS](docs/LOCAL_AI_TRIALS.md)
- if you need normative host posture or machine-readable host-facts capture, read [docs/REFERENCE_PLATFORM](docs/REFERENCE_PLATFORM.md) and [docs/REFERENCE_PLATFORM_SPEC](docs/REFERENCE_PLATFORM_SPEC.md)
- if you need to tune the runtime to the current machine, confirm driver freshness, or decide which preset the host should prefer, read [docs/MACHINE_FIT_POLICY](docs/MACHINE_FIT_POLICY.md)
- if you need a compact record of platform-specific quirks, adaptations, and portability notes, read [docs/PLATFORM_ADAPTATION_POLICY](docs/PLATFORM_ADAPTATION_POLICY.md)
@@ -145,9 +149,11 @@ The stack is organized around explicit compose modules rather than one swollen f
- `20-orchestration.yml`
- `30-local-inference.yml`
- `31-intel-inference.yml`
+- `32-llamacpp-inference.yml`
- `40-llm-gateway.yml`
- `41-agent-api.yml`
- `42-agent-api-intel.yml`
+- `44-llamacpp-agent-sidecar.yml`
- `50-speech.yml`
- `51-browser-tools.yml`
- `60-monitoring.yml`
diff --git a/compose/README.md b/compose/README.md
index a60a049..39c9901 100644
--- a/compose/README.md
+++ b/compose/README.md
@@ -8,9 +8,11 @@ The new stack uses small compose modules, named profiles, and named presets.
- `modules/20-orchestration.yml`
- `modules/30-local-inference.yml`
- `modules/31-intel-inference.yml`
+- `modules/32-llamacpp-inference.yml`
- `modules/40-llm-gateway.yml`
- `modules/41-agent-api.yml`
- `modules/42-agent-api-intel.yml`
+- `modules/44-llamacpp-agent-sidecar.yml`
- `modules/50-speech.yml`
- `modules/51-browser-tools.yml`
- `modules/60-monitoring.yml`
@@ -38,6 +40,15 @@ A profile is only a list of module filenames in activation order.
A preset is a list of profile names in activation order.
+## Optional pilot modules
+
+`32-llamacpp-inference.yml` and `44-llamacpp-agent-sidecar.yml` are not part of the default profiles or presets.
+
+They exist for the bounded `llama.cpp` sidecar pilot and are typically activated through:
+
+- `scripts/aoa-llamacpp-pilot`
+- or `AOA_EXTRA_COMPOSE_FILES` when you intentionally want the sidecar path
+
## Rule
New capability should arrive as:
diff --git a/compose/modules/32-llamacpp-inference.yml b/compose/modules/32-llamacpp-inference.yml
new file mode 100644
index 0000000..3695ad3
--- /dev/null
+++ b/compose/modules/32-llamacpp-inference.yml
@@ -0,0 +1,33 @@
+services:
+ llama-cpp:
+ image: "${AOA_LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp:server-openvino}"
+ platform: linux/amd64
+ container_name: llama-cpp
+ restart: unless-stopped
+ cpus: "${AOA_LLAMACPP_CPUS:-4.0}"
+ mem_limit: "${AOA_LLAMACPP_MEM_LIMIT:-12g}"
+ mem_reservation: "${AOA_LLAMACPP_MEM_RESERVATION:-8g}"
+ environment:
+ LLAMA_ARG_MODEL: /models/qwen3.5-9b.gguf
+ LLAMA_ARG_ALIAS: "${AOA_LLAMACPP_MODEL_ALIAS:-qwen3.5:9b}"
+ LLAMA_ARG_HOST: 0.0.0.0
+ LLAMA_ARG_PORT: "8080"
+ LLAMA_ARG_CTX_SIZE: "${AOA_LLAMACPP_CTX_SIZE:-4096}"
+ LLAMA_ARG_THREADS: "${AOA_LLAMACPP_THREADS:-4}"
+ LLAMA_ARG_THREADS_BATCH: "${AOA_LLAMACPP_THREADS_BATCH:-4}"
+ LLAMA_ARG_THREADS_HTTP: "${AOA_LLAMACPP_THREADS_HTTP:-2}"
+ LLAMA_ARG_PARALLEL: "${AOA_LLAMACPP_PARALLEL:-1}"
+ LLAMA_ARG_BATCH_SIZE: "${AOA_LLAMACPP_BATCH_SIZE:-512}"
+ LLAMA_ARG_UBATCH_SIZE: "${AOA_LLAMACPP_UBATCH_SIZE:-128}"
+ LLAMA_ARG_N_GPU_LAYERS: "${AOA_LLAMACPP_N_GPU_LAYERS:-0}"
+ LLAMA_ARG_DEVICE: "${AOA_LLAMACPP_DEVICE:-none}"
+ LLAMA_ARG_ENDPOINT_METRICS: "${AOA_LLAMACPP_ENDPOINT_METRICS:-1}"
+ LLAMA_ARG_JINJA: "${AOA_LLAMACPP_JINJA:-1}"
+ LLAMA_ARG_REASONING: "${AOA_LLAMACPP_REASONING:-off}"
+ LLAMA_ARG_THINK: "${AOA_LLAMACPP_THINK:-none}"
+ LLAMA_ARG_NO_OP_OFFLOAD: "${AOA_LLAMACPP_NO_OP_OFFLOAD:-1}"
+ LLAMA_ARG_NO_WARMUP: "${AOA_LLAMACPP_NO_WARMUP:-1}"
+ volumes:
+ - "${AOA_LLAMACPP_MODEL_HOST_PATH:-/srv/abyss-stack/Logs/llamacpp/missing-model.gguf}:/models/qwen3.5-9b.gguf:ro,Z"
+ ports:
+ - "127.0.0.1:${AOA_LLAMACPP_HOST_PORT:-11435}:8080"
diff --git a/compose/modules/44-llamacpp-agent-sidecar.yml b/compose/modules/44-llamacpp-agent-sidecar.yml
new file mode 100644
index 0000000..ef92ec7
--- /dev/null
+++ b/compose/modules/44-llamacpp-agent-sidecar.yml
@@ -0,0 +1,32 @@
+services:
+ langchain-api-llamacpp:
+ build: "${AOA_STACK_ROOT:-/srv/abyss-stack}/Services/langchain-api"
+ container_name: langchain-api-llamacpp
+ env_file:
+ - "${AOA_STACK_ROOT:-/srv/abyss-stack}/Secrets/Configs/langchain-api.env"
+ environment:
+ LC_BASE_URL: http://llama-cpp:8080/v1
+ LC_API_KEY: EMPTY
+ LC_MODEL: "${AOA_LLAMACPP_MODEL_ALIAS:-qwen3.5:9b}"
+ LC_TIMEOUT_S: 300
+ LC_OLLAMA_NATIVE_CHAT: "false"
+ LC_OPENAI_LITERAL_COMPLETIONS: "true"
+ AOA_RETURN_ENABLED: "${AOA_RETURN_ENABLED:-true}"
+ AOA_RETURN_POLICY_PATH: "${AOA_RETURN_POLICY_PATH:-/app/config/return-policy.yaml}"
+ AOA_RETURN_LOG_ROOT: "${AOA_RETURN_LOG_ROOT:-/app/logs/returns-llamacpp}"
+ AOA_FEDERATED_RUN_ENABLED: "false"
+ EMBEDDINGS_PROVIDER: ovms
+ OVMS_EMBEDDINGS_URL: http://host.containers.internal:8200/v3/embeddings
+ OVMS_EMBEDDINGS_MODEL: qwen3-embed-0.6b-int8-ov
+ volumes:
+ - "${AOA_STACK_ROOT:-/srv/abyss-stack}/Configs/agent-api/return-policy.yaml:/app/config/return-policy.yaml:ro,Z"
+ - "${AOA_STACK_ROOT:-/srv/abyss-stack}/Logs/returns-llamacpp:/app/logs/returns-llamacpp:Z"
+ ports:
+ - "127.0.0.1:${AOA_LLAMACPP_LANGCHAIN_HOST_PORT:-5403}:5401"
+ healthcheck:
+ test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:5401/health', timeout=2).read()"]
+ interval: 5s
+ timeout: 3s
+ retries: 12
+ start_period: 5s
+ restart: unless-stopped
diff --git a/config-templates/Services/langchain-api/app/main.py b/config-templates/Services/langchain-api/app/main.py
index 1c79167..b9cce06 100644
--- a/config-templates/Services/langchain-api/app/main.py
+++ b/config-templates/Services/langchain-api/app/main.py
@@ -1,5 +1,6 @@
import json
import os
+import re
import urllib.error
import urllib.request
from pathlib import Path
@@ -18,6 +19,9 @@
app = FastAPI()
+THINK_TAG_PREFIX_RE = re.compile(r"^\s*.*?\s*", re.DOTALL)
+LITERAL_REPLY_PROMPT_RE = re.compile(r"^Reply exactly with:\s*(.+?)\s*$", re.DOTALL)
+
BASE_URL = os.getenv("LC_BASE_URL", "http://ollama:11434/v1").rstrip("/")
API_KEY = os.getenv("LC_API_KEY", "EMPTY")
MODEL = os.getenv("LC_MODEL", "qwen3.5:9b")
@@ -29,6 +33,10 @@
"yes",
"on",
}
+OPENAI_LITERAL_COMPLETIONS = os.getenv(
+ "LC_OPENAI_LITERAL_COMPLETIONS",
+ "false",
+).strip().lower() in {"1", "true", "yes", "on"}
OLLAMA_NATIVE_CHAT_URL = os.getenv(
"LC_OLLAMA_NATIVE_CHAT_URL",
"http://ollama:11434/api/chat",
@@ -209,6 +217,18 @@ def _http_post_json(
return parsed
+def _http_auth_headers() -> dict[str, str] | None:
+ if not API_KEY:
+ return None
+ return {"Authorization": f"Bearer {API_KEY}"}
+
+
+def _llamacpp_completion_url() -> str:
+ if BASE_URL.endswith("/v1"):
+ return f"{BASE_URL[:-3]}/completion"
+ return f"{BASE_URL}/completion"
+
+
def _route_api_post(path: str, payload: dict[str, Any]) -> dict[str, Any]:
url = f"{ROUTE_API_BASE_URL}{path}"
req = urllib.request.Request(
@@ -368,6 +388,96 @@ def _ollama_chat(req: RunReq) -> dict[str, Any]:
return {"ok": True, "backend": "ollama-native", "model": MODEL, "answer": content}
+def _flatten_response_content(content: Any) -> str:
+ if isinstance(content, str):
+ return content
+ if isinstance(content, list):
+ chunks: list[str] = []
+ for item in content:
+ if isinstance(item, str):
+ chunks.append(item)
+ continue
+ if isinstance(item, dict) and item.get("type") == "text" and isinstance(item.get("text"), str):
+ chunks.append(item["text"])
+ return "".join(chunks)
+ return ""
+
+
+def _normalize_answer_text(content: Any) -> str:
+ text = _flatten_response_content(content).strip()
+ while text:
+ updated = THINK_TAG_PREFIX_RE.sub("", text, count=1).strip()
+ if updated == text:
+ break
+ text = updated
+ return text
+
+
+def _literal_reply_target(req: RunReq) -> str | None:
+ if not OPENAI_LITERAL_COMPLETIONS:
+ return None
+ if float(req.temperature) != 0.0:
+ return None
+ if int(req.max_tokens) > 16:
+ return None
+ match = LITERAL_REPLY_PROMPT_RE.fullmatch(req.user_text.strip())
+ if not match:
+ return None
+ target = match.group(1).strip()
+ if not target or len(target) > 160:
+ return None
+ return target
+
+
+def _openai_completion(req: RunReq) -> dict[str, Any]:
+ text = ""
+ try:
+ native_payload = {
+ "model": MODEL,
+ "prompt": req.user_text,
+ "temperature": float(req.temperature),
+ "n_predict": int(req.max_tokens),
+ }
+ native_data = _http_post_json(
+ _llamacpp_completion_url(),
+ native_payload,
+ TIMEOUT,
+ headers=_http_auth_headers(),
+ )
+ native_text = native_data.get("content")
+ if isinstance(native_text, str):
+ text = native_text
+ except RuntimeError:
+ text = ""
+
+ if not text:
+ payload = {
+ "model": MODEL,
+ "prompt": req.user_text,
+ "temperature": float(req.temperature),
+ "max_tokens": int(req.max_tokens),
+ }
+ data = _http_post_json(
+ f"{BASE_URL}/completions",
+ payload,
+ TIMEOUT,
+ headers=_http_auth_headers(),
+ )
+ choices = data.get("choices")
+ if isinstance(choices, list) and choices:
+ first = choices[0]
+ if isinstance(first, dict):
+ text = str(first.get("text") or "")
+ if not isinstance(text, str) or not text:
+ raise RuntimeError("unexpected_openai_completion_response: missing text")
+ return {
+ "ok": True,
+ "backend": "langchain",
+ "model": MODEL,
+ "answer": _normalize_answer_text(text),
+ }
+
+
def _invoke_run_backend(req: RunReq) -> dict[str, Any]:
if OLLAMA_NATIVE_CHAT and ("litellm" in BASE_URL or "ollama" in BASE_URL):
return _ollama_chat(req)
@@ -375,6 +485,9 @@ def _invoke_run_backend(req: RunReq) -> dict[str, Any]:
if ChatOpenAI is None or HumanMessage is None:
raise RuntimeError("langchain_openai dependencies are not installed")
+ if _literal_reply_target(req) is not None:
+ return _openai_completion(req)
+
llm_kwargs: dict[str, Any] = {
"model": MODEL,
"base_url": BASE_URL,
@@ -402,7 +515,12 @@ def _invoke_run_backend(req: RunReq) -> dict[str, Any]:
llm = ChatOpenAI(**llm_kwargs)
resp = llm.invoke([HumanMessage(content=req.user_text)])
- return {"ok": True, "backend": "langchain", "model": MODEL, "answer": (resp.content or "")}
+ return {
+ "ok": True,
+ "backend": "langchain",
+ "model": MODEL,
+ "answer": _normalize_answer_text(resp.content),
+ }
def _effective_profile_class(profile_class: PROFILE_CLASS | None) -> PROFILE_CLASS:
diff --git a/docs/FIRST_RUN.md b/docs/FIRST_RUN.md
index 2dfb0fe..d6955c6 100644
--- a/docs/FIRST_RUN.md
+++ b/docs/FIRST_RUN.md
@@ -149,6 +149,17 @@ scripts/aoa-local-ai-trials run-wave W0
That flow keeps machine-readable trial truth under `Logs/local-ai-trials/` and writes Markdown mirrors to `Dionysus/reports/local-ai-trials/`.
Use [LOCAL_AI_TRIALS](LOCAL_AI_TRIALS.md) for the full contract.
+## Optional llama.cpp backend-parity pilot
+
+If you want to compare a bounded `llama.cpp` sidecar against the current validated Ollama path without replacing the canonical runtime:
+
+```bash
+scripts/aoa-llamacpp-pilot run --preset intel-full
+```
+
+That pilot resolves the resident Ollama GGUF blob, starts `llama-cpp` on a separate host port, exposes a sidecar `langchain-api-llamacpp` on `127.0.0.1:5403`, and writes comparison artifacts under `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/comparisons/`.
+Use [LLAMACPP_PILOT](LLAMACPP_PILOT.md) for the full contract.
+
## Compose optional layers manually
### Agent runtime plus tools
diff --git a/docs/LANGGRAPH_PILOT.md b/docs/LANGGRAPH_PILOT.md
new file mode 100644
index 0000000..68a53a6
--- /dev/null
+++ b/docs/LANGGRAPH_PILOT.md
@@ -0,0 +1,96 @@
+# LANGGRAPH PILOT
+
+## Purpose
+
+This document defines the bounded LangGraph sidecar pilot for `abyss-stack`.
+
+It is not a new service and not a migration of `aoa-local-ai-trials`.
+It is a comparison layer for one W4-shaped supervised edit flow.
+
+## Current pilot
+
+Program id:
+- `langgraph-sidecar-pilot-v1`
+- `langgraph-sidecar-llamacpp-v1` for the disposable backend-promotion fixture gate
+
+Current runtime path:
+- `intel-full -> langchain-api /run -> ollama-native`
+
+Current cases:
+- `8dionysus-profile-routing-clarity`
+- `aoa-routing-generated-surface-refresh`
+- `fixture-docs-wording-alignment` only when the program id is `langgraph-sidecar-llamacpp-v1`
+
+The docs case is also used for the explicit pause/resume scenario.
+
+## Operator surface
+
+Install the pilot dependency manifest before use:
+
+```bash
+python3 -m pip install --user -r scripts/requirements-langgraph-pilot.txt
+```
+
+Use:
+
+```bash
+scripts/aoa-langgraph-pilot materialize
+scripts/aoa-langgraph-pilot run-case 8dionysus-profile-routing-clarity --until approval
+scripts/aoa-langgraph-pilot resume-case 8dionysus-profile-routing-clarity
+scripts/aoa-langgraph-pilot run-case aoa-routing-generated-surface-refresh --until done
+scripts/aoa-langgraph-pilot status 8dionysus-profile-routing-clarity
+```
+
+Alternate backend/program roots are supported:
+
+```bash
+scripts/aoa-langgraph-pilot --url http://127.0.0.1:5403/run --program-id langgraph-sidecar-llamacpp-v1 run-case fixture-docs-wording-alignment --until approval
+scripts/aoa-langgraph-pilot --url http://127.0.0.1:5403/run --program-id langgraph-sidecar-llamacpp-v1 resume-case fixture-docs-wording-alignment
+```
+
+## Boundaries
+
+The sidecar pilot:
+- reuses the W4 bounded-mutation contract
+- reuses `approval.status.json`
+- reuses the existing worktree-first landing safety posture
+- keeps runtime truth local under `Logs/local-ai-trials/`
+- mirrors only Markdown summaries to `Dionysus`
+
+The sidecar pilot does not:
+- add a new HTTP API
+- replace `aoa-local-ai-trials`
+- replace `langchain-api /run`
+- widen W4 into autonomous long-horizon execution
+
+## Artifacts
+
+Runtime truth:
+- `${AOA_STACK_ROOT}/Logs/local-ai-trials/langgraph-sidecar-pilot-v1/`
+- `${AOA_STACK_ROOT}/Logs/local-ai-trials/langgraph-sidecar-llamacpp-v1/` for the disposable promotion fixture
+
+Mirror:
+- `/srv/Dionysus/reports/local-ai-trials/langgraph-sidecar-pilot-v1/`
+- `/srv/Dionysus/reports/local-ai-trials/langgraph-sidecar-llamacpp-v1/` for the disposable promotion fixture
+
+Per-case packets keep the existing local-trial packet shape:
+- `case.spec.json`
+- `run.manifest.json`
+- `result.summary.json`
+- `report.md`
+
+The sidecar adds:
+- `graph.state.json`
+- `graph.history.jsonl`
+- `interrupt.json`
+- `approval.status.json`
+- `node-artifacts/`
+
+## Comparison goal
+
+The sidecar should answer a narrow question:
+
+- does LangGraph improve pause/resume and recovery clarity for a bounded supervised edit flow
+- without reducing W4 safety, scope discipline, or reportability
+
+Until that answer is positive, the existing runner remains the execution baseline.
diff --git a/docs/LLAMACPP_PILOT.md b/docs/LLAMACPP_PILOT.md
new file mode 100644
index 0000000..af4a4e0
--- /dev/null
+++ b/docs/LLAMACPP_PILOT.md
@@ -0,0 +1,199 @@
+# LLAMACPP PILOT
+
+## Purpose
+
+This document defines the bounded `llama.cpp` sidecar pilot for `abyss-stack`.
+
+It exists to answer a narrow question:
+
+**does a `llama.cpp` sidecar improve the local Qwen runtime posture on this machine without replacing the validated canonical Ollama path yet?**
+
+## Boundary
+
+The pilot is:
+- sidecar-only
+- operator-invoked
+- bounded to runtime-parity work
+- allowed to compare latency and runtime behavior
+
+The pilot is not:
+- a silent replacement for the canonical local runtime
+- a proof-layer quality verdict
+- a claim that `llama.cpp` is already promoted into machine-fit canon
+
+## Current default posture
+
+The validated canonical path remains:
+
+`intel-full -> langchain-api /run -> litellm/ollama + route-api`
+
+The `llama.cpp` pilot is intentionally separate from that path until a reviewed promotion decision says otherwise.
+
+## What the pilot reuses
+
+The pilot does not require a second large model download by default.
+
+It resolves the resident Ollama `qwen3.5:9b` manifest under:
+
+- `${AOA_STACK_ROOT}/Services/ollama/models/manifests/registry.ollama.ai/library/qwen3.5/9b`
+
+Then it mounts the corresponding GGUF blob into the `llama.cpp` container as a read-only model file.
+
+This keeps the pilot honest:
+- same local Qwen family
+- same quantized resident artifact
+- different serving runtime
+
+## Pilot services
+
+When the pilot is active, it adds two localhost-only services:
+
+- `llama-cpp` -> `http://127.0.0.1:11435`
+- `langchain-api-llamacpp` -> `http://127.0.0.1:5403/health`
+
+The canonical services stay in place:
+
+- `ollama` -> `http://127.0.0.1:11434`
+- `langchain-api` -> `http://127.0.0.1:5401/health`
+
+That separation preserves honest A/B comparison.
+
+## Operator commands
+
+Use the source-checkout script:
+
+```bash
+scripts/aoa-llamacpp-pilot doctor --preset intel-full
+scripts/aoa-llamacpp-pilot up --preset intel-full
+scripts/aoa-llamacpp-pilot bench --preset intel-full
+scripts/aoa-llamacpp-pilot run --preset intel-full
+scripts/aoa-llamacpp-pilot promote --preset intel-full
+scripts/aoa-llamacpp-pilot status
+scripts/aoa-llamacpp-pilot down
+```
+
+### `doctor`
+
+- syncs source-managed configs into the runtime mirror unless `--skip-sync` is used
+- confirms `aoa-doctor --preset intel-full`
+- resolves the reusable GGUF model blob
+- reports the base runtime health
+
+### `up`
+
+- ensures the base preset is up
+- starts the `llama.cpp` sidecar services
+- waits for `llama.cpp` and `langchain-api-llamacpp` health
+
+### `bench`
+
+- runs the bounded Qwen latency bench against `http://127.0.0.1:5403/run`
+- labels the result as a `llama.cpp` sidecar run
+
+### `run`
+
+- runs a fresh Ollama baseline bench on `5401`
+- runs a fresh `llama.cpp` sidecar bench on `5403`
+- writes a comparison packet under:
+ - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/comparisons/llamacpp-sidecar-pilot-v1/`
+
+### `promote`
+
+- screens the fixed `Q4_K_M` and `Q6_K` `bartowski` candidates on the same CPU-safe sidecar posture
+- chooses a winner only if the candidate stays stable and `exact-reply` is not more than `15%` slower than the fresh Ollama baseline
+- runs `W0` on `http://127.0.0.1:5403/run` under `qwen-llamacpp-pilot-v1`
+- runs one disposable `W4` docs fixture dry-run under `langgraph-sidecar-llamacpp-v1`
+- writes the promotion packet under:
+ - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/promotions/llamacpp-promotion-gate-v1/`
+
+### `status`
+
+- reports the latest saved comparison ref
+- reports current sidecar and baseline health
+
+### `down`
+
+- stops and removes only the sidecar services
+- does not tear down the canonical base stack
+
+## Runtime knobs
+
+The pilot accepts the upstream `llama-server` posture through environment variables such as:
+
+- `AOA_LLAMACPP_IMAGE`
+- `AOA_LLAMACPP_CTX_SIZE`
+- `AOA_LLAMACPP_THREADS`
+- `AOA_LLAMACPP_N_GPU_LAYERS`
+- `AOA_LLAMACPP_JINJA`
+- `AOA_LLAMACPP_REASONING_FORMAT`
+
+Default posture is conservative:
+- official `ghcr.io/ggml-org/llama.cpp:server-openvino`
+- CPU-safe sidecar defaults before any acceleration attempt:
+ - `AOA_LLAMACPP_DEVICE=none`
+ - `AOA_LLAMACPP_NO_OP_OFFLOAD=1`
+ - `AOA_LLAMACPP_THREADS=4`
+ - `AOA_LLAMACPP_THREADS_BATCH=4`
+ - `AOA_LLAMACPP_THREADS_HTTP=2`
+ - `AOA_LLAMACPP_CTX_SIZE=4096`
+ - `AOA_LLAMACPP_BATCH_SIZE=512`
+ - `AOA_LLAMACPP_UBATCH_SIZE=128`
+ - `AOA_LLAMACPP_REASONING=off`
+ - `AOA_LLAMACPP_THINK=none`
+ - `AOA_LLAMACPP_CPUS=4.0`
+ - `AOA_LLAMACPP_MEM_LIMIT=12g`
+- localhost-only exposure
+- separate sidecar `langchain-api`
+- OVMS embeddings remain in place for the Intel pilot path
+
+The pilot now brings services up in two stages:
+- `llama-cpp`
+- health check
+- `langchain-api-llamacpp`
+
+This reduces host shock during first model load and gives a clean failure boundary before the API sidecar is attached.
+
+If you want a more machine-specific acceleration attempt, override the pilot image or GPU-layer posture explicitly and record the outcome as a bounded runtime comparison rather than as an immediate canonical promotion.
+
+## Artifacts
+
+The pilot writes comparison packets under:
+
+```text
+${AOA_STACK_ROOT}/Logs/runtime-benchmarks/comparisons/llamacpp-sidecar-pilot-v1/
+ latest.json
+ runs/
+ /
+ model-resolution.json
+ baseline.bench.stdout.txt
+ baseline.bench.stderr.txt
+ candidate.bench.stdout.txt
+ candidate.bench.stderr.txt
+ pilot.manifest.json
+ comparison.json
+ report.md
+```
+
+These artifacts stay runtime-local.
+
+Promotion packets stay runtime-local too and capture:
+
+- fresh Ollama baseline smoke + bench
+- both quant screening outcomes
+- winner selection
+- `W0` verdict on the sidecar path
+- disposable `W4` fixture verdict
+- rollback status after sidecar teardown
+
+## Promotion rule
+
+A green or promising pilot does not automatically change the machine-fit record.
+
+Promotion requires:
+- reviewed comparison output
+- a clear recommendation that the sidecar is better for the intended bounded path
+- an explicit update to machine-fit and the validated runtime docs
+
+Until then:
+- Ollama remains the validated preferred path
+- `llama.cpp` remains an optional pilot substrate
diff --git a/docs/LOCAL_AI_TRIALS.md b/docs/LOCAL_AI_TRIALS.md
index 6f5b4e2..b63eca7 100644
--- a/docs/LOCAL_AI_TRIALS.md
+++ b/docs/LOCAL_AI_TRIALS.md
@@ -11,26 +11,49 @@ It is narrower than a proof layer and narrower than a benchmark-only surface:
- durable human+AI-readable summaries may be mirrored elsewhere
- no new HTTP APIs are introduced for the trial surface
-## Canonical pilot in this runtime
+## Pilot lineage in this runtime
-Current program:
+Baseline control program:
- `qwen-local-pilot-v1`
-Canonical baseline:
+Promoted local-worker path:
+- `w5-langgraph-llamacpp-v1`
+- `w6-bounded-autonomy-llamacpp-v1`
+
+Control baseline:
- preset: `intel-full`
-- runtime path: `langchain-api /run`
+- runtime path: `http://127.0.0.1:5401/run`
- local Qwen posture:
- `LC_OLLAMA_NUM_THREAD=6`
- `LC_OLLAMA_NUM_BATCH=32`
- `LC_OLLAMA_THINK=false`
+Promoted bounded-worker path:
+- runtime path: `http://127.0.0.1:5403/run`
+- backend: `llama.cpp`
+- orchestration: `LangGraph` for `W5` and `W6`
+
+Durable program roots now in use:
+- `qwen-local-pilot-v1`
+- `langgraph-sidecar-pilot-v1`
+- `qwen-llamacpp-pilot-v1`
+- `w5-langgraph-llamacpp-v1`
+- `w6-bounded-autonomy-llamacpp-v1`
+
## Dual-surface reporting
-Runtime truth root:
-- `${AOA_STACK_ROOT}/Logs/local-ai-trials/qwen-local-pilot-v1/`
+Runtime truth root family:
+- `${AOA_STACK_ROOT}/Logs/local-ai-trials//`
+
+Durable human+AI-readable mirror family:
+- `/srv/Dionysus/reports/local-ai-trials//`
-Durable human+AI-readable mirror:
-- `/srv/Dionysus/reports/local-ai-trials/qwen-local-pilot-v1/`
+Current durable program roots:
+- `qwen-local-pilot-v1`
+- `langgraph-sidecar-pilot-v1`
+- `qwen-llamacpp-pilot-v1`
+- `w5-langgraph-llamacpp-v1`
+- `w6-bounded-autonomy-llamacpp-v1`
Keep the split explicit:
@@ -79,6 +102,12 @@ scripts/aoa-local-ai-trials prepare-wave W4 --lane docs
scripts/aoa-local-ai-trials apply-case W4
```
+Optional backend/program overrides:
+
+```bash
+scripts/aoa-local-ai-trials --url http://127.0.0.1:5403/run --program-id qwen-llamacpp-pilot-v1 run-wave W0
+```
+
What the helper does now:
- materializes contracts and frozen case specs for `W0` through `W4`
@@ -97,6 +126,73 @@ What it does not do:
- it does not upgrade runtime success into portable proof wording
- it does not collapse `W4` into a silent monolithic mutator
+## LangGraph sidecar pilot
+
+The current trial runner remains the execution baseline.
+
+An optional comparison layer now also exists:
+
+```bash
+scripts/aoa-langgraph-pilot materialize
+scripts/aoa-langgraph-pilot run-case 8dionysus-profile-routing-clarity --until approval
+scripts/aoa-langgraph-pilot resume-case 8dionysus-profile-routing-clarity
+```
+
+The same runner can also be pointed at an alternate backend/program root:
+
+```bash
+scripts/aoa-langgraph-pilot --url http://127.0.0.1:5403/run --program-id langgraph-sidecar-llamacpp-v1 run-case fixture-docs-wording-alignment --until approval
+```
+
+Use [LANGGRAPH_PILOT](LANGGRAPH_PILOT.md) for the sidecar contract.
+
+## W5 long-horizon pilot
+
+The next bounded scenario layer lives beside the earlier waves:
+
+```bash
+scripts/aoa-w5-pilot materialize
+scripts/aoa-w5-pilot run-scenario --until milestone
+scripts/aoa-w5-pilot resume-scenario
+scripts/aoa-w5-pilot status --all
+```
+
+Use [W5_PILOT](W5_PILOT.md) for the full W5 contract.
+
+The W5 runner:
+
+- defaults to `http://127.0.0.1:5403/run`
+- treats the promoted `llama.cpp` path as the primary substrate while keeping baseline `5401` as a control path
+- keeps `LangGraph` as the primary orchestration layer
+- uses milestone gates instead of a monolithic `run-wave W5`
+- supports `read_only_summary`, `qwen_patch`, `script_refresh`, and `implementation_patch`
+- reuses `approval.status.json` at `plan_freeze`, `first_mutation`, and `landing`
+- keeps mutation scenarios worktree-first and explicitly approved before landing
+- records one local checkpoint commit per successful mutation scenario when a tracked diff is present
+
+## W6 bounded autonomy pilot
+
+The autonomy-focused layer lives beside W5 and keeps the same promoted substrate:
+
+```bash
+scripts/aoa-w6-pilot materialize
+scripts/aoa-w6-pilot run-scenario --until milestone
+scripts/aoa-w6-pilot resume-scenario
+scripts/aoa-w6-pilot status --all
+```
+
+Use [W6_PILOT](W6_PILOT.md) for the full W6 contract.
+
+The W6 runner:
+
+- defaults to `http://127.0.0.1:5403/run`
+- keeps `LangGraph` as the primary orchestration layer
+- reduces approvals to `plan_freeze` and `landing`
+- removes `first_mutation` from the normal mutation path
+- keeps mutation scenarios worktree-first and explicitly approved before landing
+- supports one bounded `autonomous_repair_loop` after `post_change_validation_failure`
+- tracks `novel_implementation_passes`, `preexisting_noop_count`, `repair_attempted_count`, and `repair_success_count`
+
## W1 grounded execution
Use:
diff --git a/docs/MACHINE_FIT_POLICY.md b/docs/MACHINE_FIT_POLICY.md
index a53f2dd..4d540c2 100644
--- a/docs/MACHINE_FIT_POLICY.md
+++ b/docs/MACHINE_FIT_POLICY.md
@@ -139,3 +139,6 @@ scripts/aoa-machine-fit \
`abyss-stack` may own the runtime-local record of what this machine should run and re-check.
It does not own the global meaning of sibling AoA layers, and it does not replace runtime benchmarks or proof artifacts.
+
+An optional runtime sidecar pilot, such as a bounded `llama.cpp` comparison, does not change the preferred machine-fit posture by itself.
+Only a reviewed promotion decision should move a pilot path into the validated preferred runtime path.
diff --git a/docs/PROFILES.md b/docs/PROFILES.md
index 7f7d064..dbcb8ee 100644
--- a/docs/PROFILES.md
+++ b/docs/PROFILES.md
@@ -65,6 +65,9 @@ Profiles stay small and legible.
A new service should usually enter through a module.
Only then should it be included in one or more profiles.
+The optional `llama.cpp` sidecar pilot deliberately stays outside the default profiles and presets.
+Use [LLAMACPP_PILOT](LLAMACPP_PILOT.md) when you want a bounded backend-parity comparison without redefining the validated canonical runtime path.
+
## Dependency note
Some modules rely on sibling modules being present in the same profile.
diff --git a/docs/PROFILE_RECIPES.md b/docs/PROFILE_RECIPES.md
index 70361b4..682468c 100644
--- a/docs/PROFILE_RECIPES.md
+++ b/docs/PROFILE_RECIPES.md
@@ -31,6 +31,21 @@ scripts/aoa-smoke --with-internal --profile
For profiles that include local Ollama inference, `aoa-up` now performs a post-start warmup of `qwen3.5:9b` and relies on Ollama keep-alive to avoid repeated cold loads during normal short idle periods.
+## Optional sidecar runtime pilot
+
+If you want a bounded `llama.cpp` backend-parity check without replacing the validated Ollama path, use:
+
+```bash
+scripts/aoa-llamacpp-pilot run --preset intel-full
+```
+
+That pilot keeps:
+- the canonical `langchain-api` on `127.0.0.1:5401`
+- the `llama.cpp` sidecar on `127.0.0.1:11435`
+- the sidecar `langchain-api-llamacpp` on `127.0.0.1:5403`
+
+Use [LLAMACPP_PILOT](LLAMACPP_PILOT.md) for the full operator contract.
+
## `core`
### What it is for
diff --git a/docs/RUNTIME_BENCH_POLICY.md b/docs/RUNTIME_BENCH_POLICY.md
index 26cbc4d..384f4de 100644
--- a/docs/RUNTIME_BENCH_POLICY.md
+++ b/docs/RUNTIME_BENCH_POLICY.md
@@ -133,6 +133,17 @@ That helper may reuse runtime benchmark artifacts as evidence inside case packet
- wave verdicts remain bounded trial judgments, not portable eval canon
- portable proof wording still belongs in `aoa-evals`
+## Optional backend-parity pilot
+
+For a bounded `llama.cpp` versus Ollama comparison on the same host and the same `langchain-api /run` contract, use:
+
+```bash
+scripts/aoa-llamacpp-pilot run --preset intel-full
+```
+
+That pilot runs a fresh Ollama baseline on `5401`, a fresh `llama.cpp` sidecar bench on `5403`, and writes a comparison packet under `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/comparisons/`.
+It is a runtime-parity aid, not a promotion decision by itself.
+
## Comparison hygiene
Before treating two runs as comparable, keep stable:
- host hardware class or disclose the delta
diff --git a/docs/SERVICE_CATALOG.md b/docs/SERVICE_CATALOG.md
index 43a2c5d..52fcd52 100644
--- a/docs/SERVICE_CATALOG.md
+++ b/docs/SERVICE_CATALOG.md
@@ -21,6 +21,11 @@ This file maps the first migrated runtime modules to their intended services.
- `ovms` — Intel and OpenVINO oriented model serving
+## `32-llamacpp-inference.yml`
+
+- `llama-cpp` — optional OpenAI-compatible GGUF serving sidecar for bounded backend-parity work
+- reuses a resolved local GGUF model file rather than changing the canonical validated Ollama path
+
## `40-llm-gateway.yml`
- `litellm` — model gateway and routing facade
@@ -38,6 +43,12 @@ This file maps the first migrated runtime modules to their intended services.
- `langchain-api` overlay — switches embeddings path to OVMS
- adds explicit OVMS runtime dependency for Intel-aware profiles
+## `44-llamacpp-agent-sidecar.yml`
+
+- `langchain-api-llamacpp` — optional sidecar agent API bound to a `llama.cpp` backend on a separate host port
+- preserves the canonical `langchain-api` service and `5401` path for honest A/B comparison
+- keeps embeddings on OVMS for Intel-aware pilot runs
+
## `43-federation-router.yml`
- `route-api` — localhost-only federation seam reader for mirrored `aoa-agents` contracts, `aoa-routing advisory routing surfaces`, `aoa-memo` recall surfaces, `aoa-evals` eval selection surfaces, `aoa-playbooks` activation/composition advisory surfaces, `aoa-kag` retrieval/regrounding surfaces, and the source-owned `tos-source` handoff companion
@@ -74,8 +85,10 @@ Expected localhost-only services include:
- n8n
- ollama
- ovms
+- llama-cpp
- litellm
- langchain-api
+- langchain-api-llamacpp
- route-api
- qwen-tts
- tts-router
diff --git a/docs/W5_PILOT.md b/docs/W5_PILOT.md
new file mode 100644
index 0000000..434a981
--- /dev/null
+++ b/docs/W5_PILOT.md
@@ -0,0 +1,139 @@
+# W5 PILOT
+
+## Purpose
+
+This document defines the bounded W5 long-horizon supervised pilot for `abyss-stack`.
+
+W5 is:
+
+- scenario-based rather than one monolithic `run-wave`
+- LangGraph-first for orchestration
+- milestone-gated for human supervision
+- llama.cpp-first on `http://127.0.0.1:5403/run`
+
+W5 is not:
+
+- a new public HTTP API
+- a replacement for `aoa-local-ai-trials`
+- an unbounded autonomy claim
+
+## Operator Surface
+
+Use:
+
+```bash
+scripts/aoa-w5-pilot materialize
+scripts/aoa-w5-pilot run-scenario --until milestone|done
+scripts/aoa-w5-pilot resume-scenario
+scripts/aoa-w5-pilot status --all
+scripts/aoa-w5-pilot status
+```
+
+Defaults:
+
+- run URL: `http://127.0.0.1:5403/run`
+- program id: `w5-langgraph-llamacpp-v1`
+- runtime truth: `${AOA_STACK_ROOT}/Logs/local-ai-trials/w5-langgraph-llamacpp-v1/`
+- mirror: `/srv/Dionysus/reports/local-ai-trials/w5-langgraph-llamacpp-v1/`
+
+## Scenario Catalog
+
+Materialize exactly these `8` scenarios in this order:
+
+1. `runtime-inspect-langchain-health`
+2. `runtime-inspect-route-api-health`
+3. `runtime-inspect-platform-adaptation`
+4. `evals-validate-and-explain`
+5. `aoa-evals-contract-wording-alignment`
+6. `aoa-routing-doc-boundary-alignment`
+7. `aoa-routing-generated-surface-refresh`
+8. `stack-sync-federation-check-mode`
+
+Execution modes:
+
+- `read_only_summary`
+- `qwen_patch`
+- `script_refresh`
+- `implementation_patch`
+
+The fixed recovery scenario is:
+
+- `stack-sync-federation-check-mode`
+- `force_pause_on_milestone = plan_freeze`
+
+## Milestone Gates
+
+Every scenario pauses at `plan_freeze`.
+
+Mutation scenarios also pause at:
+
+- `first_mutation`
+- `landing`
+
+Approval state is written into `approval.status.json` with:
+
+- `milestone_id`
+- `milestone_status`
+- `approved`
+- `approved_at`
+- `notes`
+
+## Artifacts
+
+Each scenario keeps the standard packet:
+
+- `case.spec.json`
+- `run.manifest.json`
+- `result.summary.json`
+- `report.md`
+
+W5 adds:
+
+- `graph.state.json`
+- `graph.history.jsonl`
+- `interrupt.json`
+- `approval.status.json`
+- `scenario.plan.json`
+- `step.journal.jsonl`
+- `node-artifacts/`
+- `worktree.manifest.json` for mutation scenarios
+- `landing.diff` for landed mutation scenarios
+
+Wave-level outputs:
+
+- `W5-long-horizon-index.json`
+- `W5-long-horizon-index.md`
+- `W5_SUMMARY.md`
+
+## Boundaries
+
+W5 keeps these constraints:
+
+- read-only scenarios never create worktrees or commits
+- mutation scenarios reuse the bounded W4 proposal and worktree posture
+- every landing remains explicitly approved
+- every successful mutation scenario records one local checkpoint commit when a tracked diff exists
+- no push or PR creation is part of W5
+
+The implementation scenario is intentionally narrow:
+
+- `stack-sync-federation-check-mode`
+- repo scope: `abyss-stack`
+- allowed file: `scripts/aoa-sync-federation-surfaces`
+- required behavior: add `--check` without widening sync semantics
+
+## Gate
+
+The hard W5 gate is:
+
+- `pass_count == 8`
+- `critical_failures == 0`
+- `pause_resume_proved == true`
+- `implementation_case_passed == true`
+- `generated_case_passed == true`
+- `unauthorized_scope_expansion == 0`
+- `post_change_validation_failure == 0`
+
+If the gate passes, the next action is:
+
+`W5 passed on promoted llama.cpp + LangGraph. Use this substrate as the bounded baseline for the next autonomy-focused wave.`
diff --git a/docs/W6_PILOT.md b/docs/W6_PILOT.md
new file mode 100644
index 0000000..4482482
--- /dev/null
+++ b/docs/W6_PILOT.md
@@ -0,0 +1,161 @@
+# W6 PILOT
+
+## Purpose
+
+This document defines the bounded `W6` autonomy pilot for `abyss-stack`.
+
+W6 is:
+
+- scenario-based rather than a monolithic `run-wave`
+- LangGraph-first for orchestration
+- llama.cpp-first on `http://127.0.0.1:5403/run`
+- reduced-touch, with approval gates at `plan_freeze` and `landing` only
+
+W6 is not:
+
+- a new public HTTP API
+- a replacement for `aoa-local-ai-trials`, `aoa-langgraph-pilot`, or `aoa-w5-pilot`
+- an unbounded autonomy claim
+
+## Operator Surface
+
+Use:
+
+```bash
+scripts/aoa-w6-pilot materialize
+scripts/aoa-w6-pilot run-scenario --until milestone|done
+scripts/aoa-w6-pilot resume-scenario
+scripts/aoa-w6-pilot status --all
+scripts/aoa-w6-pilot status
+```
+
+Defaults:
+
+- run URL: `http://127.0.0.1:5403/run`
+- program id: `w6-bounded-autonomy-llamacpp-v1`
+- runtime truth: `${AOA_STACK_ROOT}/Logs/local-ai-trials/w6-bounded-autonomy-llamacpp-v1/`
+- mirror: `/srv/Dionysus/reports/local-ai-trials/w6-bounded-autonomy-llamacpp-v1/`
+
+## Scenario Catalog
+
+Materialize exactly these `6` scenarios in this order:
+
+1. `runtime-inspect-langchain-health`
+2. `runtime-inspect-route-api-health`
+3. `aoa-evals-contract-wording-alignment`
+4. `aoa-routing-generated-surface-refresh`
+5. `stack-sync-federation-json-check-report`
+6. `llamacpp-pilot-verify-command`
+
+Execution modes:
+
+- `read_only_summary`
+- `qwen_patch`
+- `script_refresh`
+- `implementation_patch`
+
+Novel implementation scenarios:
+
+- `stack-sync-federation-json-check-report`
+- `llamacpp-pilot-verify-command`
+
+The fixed pause/resume proof scenario is:
+
+- `llamacpp-pilot-verify-command`
+- `force_pause_on_milestone = landing`
+
+## Milestone Gates
+
+Every scenario pauses at `plan_freeze`.
+
+Mutation scenarios also pause at:
+
+- `landing`
+
+`first_mutation` is intentionally removed from the normal `W6` path.
+
+Approval state is written into `approval.status.json` with:
+
+- `milestone_id`
+- `milestone_status`
+- `approved`
+- `approved_at`
+- `notes`
+
+## Artifacts
+
+Each scenario keeps the standard packet:
+
+- `case.spec.json`
+- `run.manifest.json`
+- `result.summary.json`
+- `report.md`
+
+W6 adds:
+
+- `graph.state.json`
+- `graph.history.jsonl`
+- `interrupt.json`
+- `approval.status.json`
+- `scenario.plan.json`
+- `step.journal.jsonl`
+- `node-artifacts/`
+- `worktree.manifest.json`
+- `landing.diff`
+
+Wave-level outputs:
+
+- `W6-autonomy-index.json`
+- `W6-autonomy-index.md`
+- `W6_SUMMARY.md`
+
+## Boundaries
+
+W6 keeps these constraints:
+
+- read-only scenarios never create worktrees or commits
+- mutation scenarios reuse the bounded W4 proposal and worktree posture
+- `autonomous_repair_loop` may retry at most once and only after `post_change_validation_failure`
+- repair must stay inside the same `allowed_files`
+- landing remains explicitly approved
+- every successful mutation scenario records one local checkpoint commit when a tracked diff exists
+- no push or PR creation is part of W6
+
+The two new implementation scenarios are intentionally narrow:
+
+- `stack-sync-federation-json-check-report`
+ - repo scope: `abyss-stack`
+ - allowed file: `scripts/aoa-sync-federation-surfaces`
+ - required behavior: add `--json` for `--check`
+
+- `llamacpp-pilot-verify-command`
+ - repo scope: `abyss-stack`
+ - allowed file: `scripts/aoa-llamacpp-pilot`
+ - required behavior: add a bounded `verify` subcommand
+
+Neither implementation scenario may pass as `preexisting-noop`.
+
+## Gate
+
+The hard W6 gate is:
+
+- `pass_count == 6`
+- `critical_failures == 0`
+- `pause_resume_proved == true`
+- `novel_implementation_passes == 2`
+- `generated_case_passed == true`
+- `implementation_case_passed == true`
+- `preexisting_noop_count == 0`
+- `unauthorized_scope_expansion == 0`
+- `post_change_validation_failure == 0`
+
+Repair metrics are mandatory to record:
+
+- `repair_attempted_count`
+- `repair_success_count`
+
+But they are not hard-gate fields for W6.
+
+If the gate passes, the next action is:
+
+`W6 passed on the promoted llama.cpp + LangGraph autonomy track. Use this substrate and approval posture as the baseline for the next implementation-heavy autonomy wave.`
diff --git a/scripts/aoa-langgraph-pilot b/scripts/aoa-langgraph-pilot
new file mode 100755
index 0000000..db7a1e4
--- /dev/null
+++ b/scripts/aoa-langgraph-pilot
@@ -0,0 +1,1364 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import copy
+import importlib.machinery
+import importlib.util
+import json
+import shutil
+import subprocess
+from contextlib import contextmanager
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, TypedDict
+
+try:
+ from langgraph.graph import END, START, StateGraph
+ from langgraph.types import Command
+except ImportError as exc: # pragma: no cover - guarded by runtime usage
+ raise SystemExit(
+ "langgraph is not installed. Install dependencies from "
+ "`scripts/requirements-langgraph-pilot.txt` first."
+ ) from exc
+
+
+DEFAULT_PROGRAM_ID = "langgraph-sidecar-pilot-v1"
+FIXTURE_PROGRAM_ID = "langgraph-sidecar-llamacpp-v1"
+PROGRAM_ID = DEFAULT_PROGRAM_ID
+WAVE_ID = "W4"
+MODEL = "qwen3.5:9b"
+DEFAULT_LANGCHAIN_RUN_URL = "http://127.0.0.1:5401/run"
+LANGCHAIN_RUN_URL = DEFAULT_LANGCHAIN_RUN_URL
+
+SOURCE_ROOT = Path(__file__).resolve().parents[1]
+STACK_ROOT = Path("/srv/abyss-stack")
+CONFIGS_ROOT = STACK_ROOT / "Configs"
+SCRIPTS_ROOT = CONFIGS_ROOT / "scripts"
+LOG_ROOT_DEFAULT = STACK_ROOT / "Logs" / "local-ai-trials" / PROGRAM_ID
+MIRROR_ROOT_DEFAULT = Path("/srv/Dionysus/reports/local-ai-trials") / PROGRAM_ID
+BASELINE_PROGRAM_ID = "qwen-local-pilot-v1"
+BASELINE_LOG_ROOT = STACK_ROOT / "Logs" / "local-ai-trials" / BASELINE_PROGRAM_ID
+COMPARISON_MEMO_NAME = "LANGGRAPH_COMPARISON.md"
+PILOT_INDEX_NAME = "W4-langgraph-sidecar-index"
+
+DEFAULT_DOCS_CASE_ID = "8dionysus-profile-routing-clarity"
+GENERATED_CASE_ID = "aoa-routing-generated-surface-refresh"
+FIXTURE_DOCS_CASE_ID = "fixture-docs-wording-alignment"
+FIXTURE_VERSION = "v2"
+DOCS_CASE_ID = DEFAULT_DOCS_CASE_ID
+DOC_CASE_IDS = {DOCS_CASE_ID}
+GENERATED_CASE_IDS = {GENERATED_CASE_ID}
+
+
+class PilotState(TypedDict, total=False):
+ case_id: str
+ until: str
+ execution_mode: str
+ current_node: str
+ next_node: str | None
+ proposal_valid: bool
+ approval_status: str | None
+ paused: bool
+ pause_reason: str | None
+ terminal_status: str | None
+ failure_class: str | None
+ resume_count: int
+ history: list[dict[str, Any]]
+ note: str | None
+
+
+def utc_now() -> str:
+ return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def absolute(path: Path) -> str:
+ return str(path.resolve())
+
+
+def default_log_root_for(program_id: str) -> Path:
+ return STACK_ROOT / "Logs" / "local-ai-trials" / program_id
+
+
+def default_mirror_root_for(program_id: str) -> Path:
+ return Path("/srv/Dionysus/reports/local-ai-trials") / program_id
+
+
+def configure_program_runtime(*, program_id: str, run_url: str) -> None:
+ global PROGRAM_ID, DOCS_CASE_ID, DOC_CASE_IDS, LOG_ROOT_DEFAULT, MIRROR_ROOT_DEFAULT, LANGCHAIN_RUN_URL
+ PROGRAM_ID = program_id
+ DOCS_CASE_ID = FIXTURE_DOCS_CASE_ID if is_fixture_program(program_id) else DEFAULT_DOCS_CASE_ID
+ DOC_CASE_IDS = {DOCS_CASE_ID}
+ LOG_ROOT_DEFAULT = default_log_root_for(program_id)
+ MIRROR_ROOT_DEFAULT = default_mirror_root_for(program_id)
+ LANGCHAIN_RUN_URL = run_url
+
+
+def is_fixture_program(program_id: str | None = None) -> bool:
+ return (program_id or PROGRAM_ID) == FIXTURE_PROGRAM_ID
+
+
+def load_trials_module() -> Any:
+ target = SOURCE_ROOT / "scripts" / "aoa-local-ai-trials"
+ loader = importlib.machinery.SourceFileLoader("aoa_local_ai_trials_sidecar", str(target))
+ spec = importlib.util.spec_from_loader(loader.name, loader)
+ if spec is None:
+ raise RuntimeError(f"could not create module spec for {target}")
+ module = importlib.util.module_from_spec(spec)
+ loader.exec_module(module) # type: ignore[arg-type]
+ return module
+
+
+TRIALS = load_trials_module()
+ORIGINAL_TRIALS_BUILD_CATALOG = TRIALS.build_catalog
+
+
+def fixture_repo_root(log_root: Path) -> Path:
+ return log_root / "_fixtures" / FIXTURE_DOCS_CASE_ID / "repo"
+
+
+def fixture_case_from_template(log_root: Path) -> dict[str, Any]:
+ catalog = ORIGINAL_TRIALS_BUILD_CATALOG()
+ template = next(case for case in catalog["W4"] if case["case_id"] == DEFAULT_DOCS_CASE_ID)
+ item = copy.deepcopy(template)
+ repo_root = fixture_repo_root(log_root)
+ readme = repo_root / "README.md"
+ style = repo_root / "docs" / "STYLE.md"
+ check_script = repo_root / "scripts" / "check_fixture.py"
+ item["case_id"] = FIXTURE_DOCS_CASE_ID
+ item["program_id"] = PROGRAM_ID
+ item["title"] = "Disposable Docs Fixture Wording Alignment"
+ item["repo_scope"] = ["langgraph-fixture-docs"]
+ item["source_refs"] = [absolute(readme), absolute(style)]
+ item["inputs"] = [
+ "Align the README wording to the style note without widening ownership claims.",
+ "Keep the fixture framed as a coordination surface rather than a source-of-truth implementation repo.",
+ "Replace `It is not the source of truth for implementation details or routing policy authorship.` with exactly `Implementation details and routing policy live elsewhere.`",
+ ]
+ item["acceptance_checks"] = ["python3 scripts/check_fixture.py"]
+ item["mutation_policy"]["allowed_files"] = [absolute(readme)]
+ item["expected_result"]["allowed_files"] = [absolute(readme)]
+ item["notes"] = list(item.get("notes") or []) + [
+ "This disposable fixture exists only for the llama.cpp promotion dry-run and must not touch any live repo.",
+ ]
+ return item
+
+
+def available_cases(log_root: Path | None = None) -> list[dict[str, Any]]:
+ catalog = ORIGINAL_TRIALS_BUILD_CATALOG()
+ if is_fixture_program():
+ if log_root is None:
+ raise RuntimeError("fixture program requires a log_root to build its disposable repo case")
+ return [fixture_case_from_template(log_root)]
+ selected = []
+ for case in catalog["W4"]:
+ if case["case_id"] not in {DEFAULT_DOCS_CASE_ID, GENERATED_CASE_ID}:
+ continue
+ item = copy.deepcopy(case)
+ item["program_id"] = PROGRAM_ID
+ item["notes"] = list(item.get("notes") or []) + [
+ "This case is frozen into the LangGraph sidecar pilot and intentionally reuses the W4 bounded-mutation contract.",
+ ]
+ selected.append(item)
+ by_id = {case["case_id"]: case for case in selected}
+ return [by_id[DEFAULT_DOCS_CASE_ID], by_id[GENERATED_CASE_ID]]
+
+
+def pilot_catalog(log_root: Path | None = None) -> dict[str, list[dict[str, Any]]]:
+ return {WAVE_ID: available_cases(log_root)}
+
+
+def run_git(repo_root: Path, *args: str) -> None:
+ subprocess.run(["git", *args], cwd=str(repo_root), check=True, text=True, capture_output=True)
+
+
+def ensure_fixture_repo(log_root: Path) -> Path:
+ repo_root = fixture_repo_root(log_root)
+ parent = repo_root.parent
+ version_file = repo_root / ".fixture-version"
+ expected_files = [
+ repo_root / ".git",
+ repo_root / "README.md",
+ repo_root / "docs" / "STYLE.md",
+ repo_root / "AGENTS.md",
+ repo_root / "scripts" / "check_fixture.py",
+ version_file,
+ ]
+ if all(path.exists() for path in expected_files) and version_file.read_text(encoding="utf-8").strip() == FIXTURE_VERSION:
+ return repo_root
+ if parent.exists():
+ shutil.rmtree(parent)
+ (repo_root / "docs").mkdir(parents=True, exist_ok=True)
+ (repo_root / "scripts").mkdir(parents=True, exist_ok=True)
+ (repo_root / "README.md").write_text(
+ "\n".join(
+ [
+ "# Fixture Docs Repo",
+ "",
+ "This repository is the public coordination surface for the fixture ecosystem.",
+ "It should help people navigate to the right source repo quickly.",
+ "It is not the source of truth for implementation details or routing policy authorship.",
+ "",
+ "Use the docs folder for compact guidance about what this fixture owns.",
+ ]
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+ (repo_root / "docs" / "STYLE.md").write_text(
+ "\n".join(
+ [
+ "# Style",
+ "",
+ "- Frame the fixture as a coordination surface.",
+ '- Replace the long source-of-truth sentence with exactly: `Implementation details and routing policy live elsewhere.`',
+ "- Keep wording compact and navigation-first.",
+ ]
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+ (repo_root / "AGENTS.md").write_text(
+ "\n".join(
+ [
+ "# AGENTS.md",
+ "",
+ "## Purpose",
+ "",
+ "This disposable repository exists only for bounded local-ai pilot checks.",
+ "",
+ "## Editing rules",
+ "",
+ "- Keep README.md concise and navigation-first.",
+ "- Do not claim this repo authors implementation truth.",
+ ]
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+ (repo_root / "scripts" / "check_fixture.py").write_text(
+ "\n".join(
+ [
+ "from pathlib import Path",
+ "",
+ "readme = Path('README.md').read_text(encoding='utf-8')",
+ "required = 'coordination surface'",
+ "required_replacement = 'Implementation details and routing policy live elsewhere.'",
+ "forbidden = 'source of truth for implementation details or routing policy authorship'",
+ "if required not in readme:",
+ " raise SystemExit('missing required wording')",
+ "if required_replacement not in readme:",
+ " raise SystemExit('replacement wording missing')",
+ "if forbidden in readme:",
+ " raise SystemExit('forbidden wording still present')",
+ "print('fixture acceptance passed')",
+ ]
+ )
+ + "\n",
+ encoding="utf-8",
+ )
+ version_file.write_text(FIXTURE_VERSION + "\n", encoding="utf-8")
+ run_git(repo_root, "init", "-b", "main")
+ run_git(repo_root, "config", "user.name", "Codex Fixture")
+ run_git(repo_root, "config", "user.email", "codex-fixture@example.invalid")
+ run_git(repo_root, "add", ".")
+ run_git(repo_root, "commit", "-m", "Seed disposable fixture docs repo")
+ return repo_root
+
+
+def case_root(log_root: Path, case_id: str) -> Path:
+ return TRIALS.case_dir(log_root, WAVE_ID, case_id)
+
+
+def state_path(log_root: Path, case_id: str) -> Path:
+ return case_root(log_root, case_id) / "graph.state.json"
+
+
+def history_path(log_root: Path, case_id: str) -> Path:
+ return case_root(log_root, case_id) / "graph.history.jsonl"
+
+
+def interrupt_path(log_root: Path, case_id: str) -> Path:
+ return case_root(log_root, case_id) / "interrupt.json"
+
+
+def node_artifacts_dir(log_root: Path, case_id: str) -> Path:
+ path = case_root(log_root, case_id) / "node-artifacts"
+ path.mkdir(parents=True, exist_ok=True)
+ return path
+
+
+def program_readme() -> str:
+ return (
+ f"# {PROGRAM_ID}\n\n"
+ "This directory stores the runtime-truth artifacts for the bounded LangGraph sidecar pilot.\n\n"
+ "It reuses the W4 supervised-edit contract while comparing a graph-shaped orchestration layer to the existing runner.\n"
+ )
+
+
+def mirror_readme() -> str:
+ return (
+ f"# {PROGRAM_ID}\n\n"
+ "This folder mirrors human+AI-readable LangGraph sidecar pilot reports.\n\n"
+ "Machine-readable runtime truth stays local under `/srv/abyss-stack/Logs/local-ai-trials/`.\n"
+ )
+
+
+def comparison_memo(log_root: Path) -> str:
+ docs_result = load_result_summary(log_root, DOCS_CASE_ID)
+ docs_state = load_graph_state(log_root, DOCS_CASE_ID)
+ docs_history = docs_state.get("history", []) if docs_state else []
+ pause_seen = any(item.get("node") == "await_approval" and item.get("status") == "paused" for item in docs_history)
+ resumed = (docs_state or {}).get("resume_count", 0) > 0
+ docs_pass = docs_result is not None and docs_result.get("status") == "pass"
+ generated_result = load_result_summary(log_root, GENERATED_CASE_ID) if not is_fixture_program() else None
+ generated_pass = generated_result is not None and generated_result.get("status") == "pass"
+
+ if is_fixture_program():
+ recommendation = (
+ "This fixture pilot is suitable as a bounded promotion gate for backend comparison before W5."
+ if docs_pass
+ else "This fixture pilot is not yet suitable as a promotion gate because the disposable docs case has not passed."
+ )
+ elif docs_pass and generated_pass and pause_seen and resumed:
+ recommendation = (
+ "LangGraph sidecar is recommended as the next bounded W5 execution substrate, "
+ "while keeping `aoa-local-ai-trials` as the baseline comparator."
+ )
+ else:
+ recommendation = (
+ "LangGraph sidecar is not yet the recommended W5 substrate. Keep the current runner as the execution baseline "
+ "until both pilot cases pass and pause/resume is proven end-to-end."
+ )
+
+ return "\n".join(
+ [
+ f"# {PROGRAM_ID} Comparison Memo",
+ "",
+ "## Summary",
+ "- This pilot compares graph-shaped orchestration against the existing W4 bounded runner.",
+ "",
+ "## Current Evidence",
+ f"- Docs case pass: `{docs_pass}`",
+ f"- Generated case pass: `{generated_pass}`",
+ f"- Pause observed: `{pause_seen}`",
+ f"- Resume observed: `{resumed}`",
+ "",
+ "## Comparison Notes",
+ "- Pause/resume is explicit through persisted `graph.state.json`, `graph.history.jsonl`, and `approval.status.json`.",
+ "- Proposal and worktree safety continue to reuse the established W4 bounded-mutation contract.",
+ "- Glue code increases slightly because the pilot stays side-by-side with the existing runner instead of replacing it.",
+ "",
+ "## Recommendation",
+ recommendation,
+ ]
+ ) + "\n"
+
+
+def render_index_md(index_payload: dict[str, Any]) -> str:
+ return TRIALS.render_wave_index_md(index_payload)
+
+
+def write_json(path: Path, payload: dict[str, Any]) -> None:
+ TRIALS.write_json(path, payload)
+
+
+def write_text(path: Path, text: str) -> None:
+ TRIALS.write_text(path, text)
+
+
+def load_json(path: Path) -> dict[str, Any]:
+ return json.loads(path.read_text(encoding="utf-8"))
+
+
+def load_case_spec(log_root: Path, case_id: str) -> dict[str, Any]:
+ return load_json(case_root(log_root, case_id) / "case.spec.json")
+
+
+def load_result_summary(log_root: Path, case_id: str) -> dict[str, Any] | None:
+ path = case_root(log_root, case_id) / "result.summary.json"
+ if not path.exists():
+ return None
+ return load_json(path)
+
+
+def load_graph_state(log_root: Path, case_id: str) -> PilotState | None:
+ path = state_path(log_root, case_id)
+ if not path.exists():
+ return None
+ return json.loads(path.read_text(encoding="utf-8"))
+
+
+def save_graph_state(log_root: Path, case_id: str, state: PilotState) -> None:
+ sanitized = {
+ "case_id": state.get("case_id"),
+ "until": state.get("until"),
+ "execution_mode": state.get("execution_mode"),
+ "current_node": state.get("current_node"),
+ "next_node": state.get("next_node"),
+ "proposal_valid": state.get("proposal_valid"),
+ "approval_status": state.get("approval_status"),
+ "paused": state.get("paused", False),
+ "pause_reason": state.get("pause_reason"),
+ "terminal_status": state.get("terminal_status"),
+ "failure_class": state.get("failure_class"),
+ "resume_count": state.get("resume_count", 0),
+ "note": state.get("note"),
+ "history": state.get("history", []),
+ }
+ write_json(state_path(log_root, case_id), sanitized)
+ history_lines = [json.dumps(item, ensure_ascii=True) for item in sanitized["history"]]
+ history_file = history_path(log_root, case_id)
+ history_file.parent.mkdir(parents=True, exist_ok=True)
+ history_file.write_text("\n".join(history_lines) + ("\n" if history_lines else ""), encoding="utf-8")
+
+
+def record_event(state: PilotState, *, node: str, status: str, note: str, extra: dict[str, Any] | None = None) -> list[dict[str, Any]]:
+ history = list(state.get("history", []))
+ payload: dict[str, Any] = {
+ "at": utc_now(),
+ "node": node,
+ "status": status,
+ "note": note,
+ }
+ if extra:
+ payload.update(extra)
+ history.append(payload)
+ return history
+
+
+def make_index_payload(log_root: Path, mirror_root: Path) -> dict[str, Any]:
+ cases = available_cases(log_root)
+ case_entries: list[dict[str, Any]] = []
+ pass_count = 0
+ fail_count = 0
+ planned_count = 0
+ critical_failures: list[str] = []
+ pause_resume_proved = False
+
+ for case in cases:
+ result = load_result_summary(log_root, case["case_id"])
+ graph_state = load_graph_state(log_root, case["case_id"])
+ terminal_status = (graph_state or {}).get("terminal_status")
+ if result:
+ status = result["status"]
+ if status == "pass":
+ pass_count += 1
+ elif status == "fail":
+ fail_count += 1
+ if result.get("failure_class") in TRIALS.W4_CRITICAL_FAILURES:
+ critical_failures.append(case["case_id"])
+ elif terminal_status == "rejected":
+ status = "rejected"
+ fail_count += 1
+ if (graph_state or {}).get("failure_class") in TRIALS.W4_CRITICAL_FAILURES:
+ critical_failures.append(case["case_id"])
+ elif graph_state:
+ status = "in-progress" if graph_state.get("paused") else "prepared"
+ else:
+ status = "planned"
+ planned_count += 1
+
+ if case["case_id"] == DOCS_CASE_ID and graph_state:
+ history = graph_state.get("history", [])
+ pause_resume_proved = (
+ any(item.get("node") == "await_approval" and item.get("status") == "paused" for item in history)
+ and graph_state.get("resume_count", 0) > 0
+ )
+
+ case_entries.append(
+ {
+ "case_id": case["case_id"],
+ "status": status,
+ "repo_scope": case["repo_scope"],
+ "task_family": case["task_family"],
+ "case_spec": str(case_root(log_root, case["case_id"]) / "case.spec.json"),
+ "summary": case["title"],
+ **(
+ {"report_md": str(mirror_root / TRIALS.case_report_name(WAVE_ID, case["case_id"]))}
+ if (case_root(log_root, case["case_id"]) / "report.md").exists()
+ else {}
+ ),
+ "current_node": (graph_state or {}).get("current_node"),
+ "approval_status": (graph_state or {}).get("approval_status"),
+ "landing_status": "landed" if result and result.get("status") == "pass" else "not-landed",
+ }
+ )
+
+ required_passes = 1 if is_fixture_program() else 2
+ gate_pass = pass_count == required_passes and not critical_failures and (True if is_fixture_program() else pause_resume_proved)
+ if gate_pass:
+ gate_result = "pass"
+ next_action = (
+ "Use the fixture packet as the W4 dry-run promotion verdict for the candidate backend."
+ if is_fixture_program()
+ else "Use the comparison memo to decide whether W5 should run on the LangGraph sidecar substrate."
+ )
+ elif fail_count or critical_failures:
+ gate_result = "fail"
+ next_action = "Inspect the failed case packet and compare it against the baseline W4 runner before promoting LangGraph."
+ elif planned_count == len(cases):
+ gate_result = "not-run"
+ next_action = "Materialize the sidecar pilot and run the docs case to the approval boundary first."
+ else:
+ gate_result = "in-progress"
+ next_action = "Resume the paused docs case or execute the remaining generated case to complete the comparison."
+
+ return {
+ "artifact_kind": "aoa.local-ai-trial.wave-index",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "wave_title": "LangGraph Sidecar Pilot",
+ "wave_summary": (
+ "Bounded disposable W4 fixture used as a backend promotion gate."
+ if is_fixture_program()
+ else "Bounded comparison pilot for a graph-shaped W4 execution layer."
+ ),
+ "case_count": len(cases),
+ "status_counts": {
+ "pass": pass_count,
+ "fail": fail_count,
+ "planned": planned_count,
+ },
+ "gate_result": gate_result,
+ "next_action": next_action,
+ "cases": case_entries,
+ "gate_detail": {
+ "pass_count": pass_count,
+ "fail_count": fail_count,
+ "planned_count": planned_count,
+ "critical_failures": critical_failures,
+ "pause_resume_proved": pause_resume_proved,
+ "comparison_memo": str(mirror_root / COMPARISON_MEMO_NAME),
+ "fixture_mode": is_fixture_program(),
+ "next_action": next_action,
+ },
+ }
+
+
+def refresh_sidecar_outputs(log_root: Path, mirror_root: Path) -> None:
+ index_payload = make_index_payload(log_root, mirror_root)
+ write_json(log_root / f"{PILOT_INDEX_NAME}.json", index_payload)
+ index_md = render_index_md(index_payload)
+ write_text(log_root / f"{PILOT_INDEX_NAME}.md", index_md)
+ write_text(mirror_root / f"{PILOT_INDEX_NAME}.md", index_md)
+ write_text(mirror_root / COMPARISON_MEMO_NAME, comparison_memo(log_root))
+
+
+def materialize(log_root: Path, mirror_root: Path) -> None:
+ log_root.mkdir(parents=True, exist_ok=True)
+ mirror_root.mkdir(parents=True, exist_ok=True)
+ write_text(log_root / "README.md", program_readme())
+ write_text(mirror_root / "README.md", mirror_readme())
+ if is_fixture_program():
+ ensure_fixture_repo(log_root)
+
+ contracts = {
+ "case.spec.schema.json": TRIALS.CASE_SCHEMA,
+ "run.manifest.schema.json": TRIALS.RUN_MANIFEST_SCHEMA,
+ "result.summary.schema.json": TRIALS.RESULT_SUMMARY_SCHEMA,
+ "wave-index.schema.json": TRIALS.WAVE_INDEX_SCHEMA,
+ }
+ for name, payload in contracts.items():
+ write_json(log_root / "contracts" / name, payload)
+
+ for case in available_cases(log_root):
+ write_json(case_root(log_root, case["case_id"]) / "case.spec.json", case)
+ node_artifacts_dir(log_root, case["case_id"])
+
+ refresh_sidecar_outputs(log_root, mirror_root)
+
+
+def ensure_baseline_w4_closeout() -> None:
+ closeout_path = BASELINE_LOG_ROOT / "W4-closeout.json"
+ if not closeout_path.exists():
+ raise RuntimeError(f"missing W4 closeout artifact: {closeout_path}")
+ payload = load_json(closeout_path)
+ if payload.get("gate_result") != "pass":
+ raise RuntimeError(f"W4 closeout is not pass: {closeout_path}")
+
+
+def ensure_runtime_ready(case_dir_path: Path) -> None:
+ doctor_raw = TRIALS.run_command(
+ [absolute(SCRIPTS_ROOT / "aoa-doctor"), "--preset", "intel-full"],
+ cwd=CONFIGS_ROOT,
+ timeout_s=120,
+ )
+ TRIALS.persist_command_result(case_dir_path, "graph-preflight-doctor", doctor_raw)
+ if doctor_raw["exit_code"] != 0 or doctor_raw["timed_out"]:
+ raise RuntimeError("aoa-doctor preflight failed")
+
+ health_raw = TRIALS.run_command(
+ ["curl", "-fsS", TRIALS.langchain_endpoint("/health")],
+ cwd=CONFIGS_ROOT,
+ timeout_s=30,
+ )
+ TRIALS.persist_command_result(case_dir_path, "graph-preflight-langchain-health", health_raw)
+ if health_raw["exit_code"] != 0 or health_raw["timed_out"]:
+ raise RuntimeError("langchain-api /health preflight failed")
+ payload = json.loads(health_raw["stdout"])
+ if not payload.get("ok") or payload.get("service") != "langchain-api":
+ raise RuntimeError("langchain-api /health returned an unexpected payload")
+
+
+def write_interrupt(log_root: Path, state: PilotState, *, reason: str) -> None:
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.langgraph-interrupt",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": state["case_id"],
+ "paused_at": utc_now(),
+ "reason": reason,
+ "approval_status": state.get("approval_status"),
+ "resume_hint": "Set approval.status.json to approved or rejected, then run `scripts/aoa-langgraph-pilot resume-case `.",
+ }
+ write_json(interrupt_path(LOG_ROOT_DEFAULT, state["case_id"]), payload)
+
+
+def write_rejected_terminal(case: dict[str, Any], *, log_root: Path, mirror_root: Path, approval_payload: dict[str, Any]) -> None:
+ command_refs: list[dict[str, Any]] = []
+ approval_path = case_root(log_root, case["case_id"]) / "artifacts" / "approval.status.json"
+ run_manifest = {
+ "artifact_kind": "aoa.local-ai-trial.run-manifest",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "executed_at": utc_now(),
+ "runtime_selection": case["runtime_selection"],
+ "model": MODEL,
+ "backend": "langgraph-sidecar",
+ "commands": command_refs,
+ "artifact_refs": [str(approval_path)],
+ "notes": [
+ "The case was explicitly rejected at the approval boundary and no mutation was attempted.",
+ ],
+ }
+ result_summary = TRIALS.build_result_summary(
+ case=case,
+ status="fail",
+ score_breakdown={
+ "proposal_valid": True,
+ "approval_present": True,
+ "approval_rejected": True,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": [
+ "The LangGraph sidecar reached the explicit approval boundary.",
+ f"Approval status: `{approval_payload.get('status')}`.",
+ ],
+ "failures": ["The operator rejected the proposal before any mutation was attempted."],
+ },
+ failure_class="approval_rejected",
+ reviewer_notes="The case was intentionally stopped at the approval boundary.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Review the rejected proposal or refresh the case before retrying.",
+ )
+ TRIALS.finalize_case(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ run_manifest=run_manifest,
+ result_summary=result_summary,
+ )
+
+
+def node_json(log_root: Path, case_id: str, name: str, payload: dict[str, Any]) -> None:
+ write_json(node_artifacts_dir(log_root, case_id) / f"{name}.json", payload)
+
+
+def approval_payload(log_root: Path, case_id: str) -> dict[str, Any] | None:
+ path = case_root(log_root, case_id) / "artifacts" / "approval.status.json"
+ if not path.exists():
+ return None
+ return load_json(path)
+
+
+@contextmanager
+def patched_trials_context(*, active_log_root: Path | None = None, active_mirror_root: Path | None = None) -> Any:
+ active_log_root = active_log_root or LOG_ROOT_DEFAULT
+ active_mirror_root = active_mirror_root or MIRROR_ROOT_DEFAULT
+ originals = {
+ "PROGRAM_ID": TRIALS.PROGRAM_ID,
+ "LOG_ROOT_DEFAULT": TRIALS.LOG_ROOT_DEFAULT,
+ "MIRROR_ROOT_DEFAULT": TRIALS.MIRROR_ROOT_DEFAULT,
+ "LANGCHAIN_RUN_URL": getattr(TRIALS, "LANGCHAIN_RUN_URL", None),
+ "LANGCHAIN_BASE_URL": getattr(TRIALS, "LANGCHAIN_BASE_URL", None),
+ "W4_DOC_CASE_IDS": TRIALS.W4_DOC_CASE_IDS,
+ "W4_GENERATED_CASE_IDS": TRIALS.W4_GENERATED_CASE_IDS,
+ "W4_DOC_PREPARE_ORDER": TRIALS.W4_DOC_PREPARE_ORDER,
+ "W4_GENERATED_PREPARE_ORDER": TRIALS.W4_GENERATED_PREPARE_ORDER,
+ "W4_DOC_TARGET_FALLBACKS": TRIALS.W4_DOC_TARGET_FALLBACKS,
+ "build_catalog": TRIALS.build_catalog,
+ "w4_docs_lane_state": TRIALS.w4_docs_lane_state,
+ "repo_root_for_w4_case": TRIALS.repo_root_for_w4_case,
+ }
+
+ def custom_build_catalog() -> dict[str, list[dict[str, Any]]]:
+ return pilot_catalog(active_log_root)
+
+ def custom_w4_docs_lane_state(log_root: Path, catalog: dict[str, list[dict[str, Any]]]) -> dict[str, Any]:
+ results_by_id = {
+ result["case_id"]: result
+ for result in TRIALS.load_w4_results(log_root, catalog)
+ }
+ docs_results = [
+ results_by_id[case_id]
+ for case_id in DOC_CASE_IDS
+ if case_id in results_by_id
+ ]
+ docs_pass = sum(1 for item in docs_results if item["status"] == "pass")
+ docs_criticals = [
+ item["case_id"]
+ for item in docs_results
+ if item.get("failure_class") in TRIALS.W4_CRITICAL_FAILURES
+ ]
+ return {
+ "pass_count": docs_pass,
+ "critical_case_ids": docs_criticals,
+ "unlock_generated_lane": docs_pass >= 1 and not docs_criticals,
+ }
+
+ def custom_repo_root_for_w4_case(case: dict[str, Any]) -> Path:
+ if case["case_id"] == FIXTURE_DOCS_CASE_ID:
+ return fixture_repo_root(active_log_root)
+ return originals["repo_root_for_w4_case"](case)
+
+ TRIALS.configure_program_runtime(program_id=PROGRAM_ID, run_url=LANGCHAIN_RUN_URL)
+ TRIALS.LOG_ROOT_DEFAULT = active_log_root
+ TRIALS.MIRROR_ROOT_DEFAULT = active_mirror_root
+ TRIALS.W4_DOC_CASE_IDS = set(DOC_CASE_IDS)
+ TRIALS.W4_GENERATED_CASE_IDS = set() if is_fixture_program() else set(GENERATED_CASE_IDS)
+ TRIALS.W4_DOC_PREPARE_ORDER = [DOCS_CASE_ID]
+ TRIALS.W4_GENERATED_PREPARE_ORDER = [] if is_fixture_program() else [GENERATED_CASE_ID]
+ target_fallbacks = dict(TRIALS.W4_DOC_TARGET_FALLBACKS)
+ if is_fixture_program():
+ target_fallbacks[FIXTURE_DOCS_CASE_ID] = "README.md"
+ TRIALS.W4_DOC_TARGET_FALLBACKS = target_fallbacks
+ TRIALS.build_catalog = custom_build_catalog
+ TRIALS.w4_docs_lane_state = custom_w4_docs_lane_state
+ TRIALS.repo_root_for_w4_case = custom_repo_root_for_w4_case
+ try:
+ yield TRIALS
+ finally:
+ TRIALS.PROGRAM_ID = originals["PROGRAM_ID"]
+ TRIALS.LOG_ROOT_DEFAULT = originals["LOG_ROOT_DEFAULT"]
+ TRIALS.MIRROR_ROOT_DEFAULT = originals["MIRROR_ROOT_DEFAULT"]
+ if originals["LANGCHAIN_RUN_URL"] is not None:
+ TRIALS.LANGCHAIN_RUN_URL = originals["LANGCHAIN_RUN_URL"]
+ if originals["LANGCHAIN_BASE_URL"] is not None:
+ TRIALS.LANGCHAIN_BASE_URL = originals["LANGCHAIN_BASE_URL"]
+ TRIALS.W4_DOC_CASE_IDS = originals["W4_DOC_CASE_IDS"]
+ TRIALS.W4_GENERATED_CASE_IDS = originals["W4_GENERATED_CASE_IDS"]
+ TRIALS.W4_DOC_PREPARE_ORDER = originals["W4_DOC_PREPARE_ORDER"]
+ TRIALS.W4_GENERATED_PREPARE_ORDER = originals["W4_GENERATED_PREPARE_ORDER"]
+ TRIALS.W4_DOC_TARGET_FALLBACKS = originals["W4_DOC_TARGET_FALLBACKS"]
+ TRIALS.build_catalog = originals["build_catalog"]
+ TRIALS.w4_docs_lane_state = originals["w4_docs_lane_state"]
+ TRIALS.repo_root_for_w4_case = originals["repo_root_for_w4_case"]
+
+
+def build_graph(log_root: Path, mirror_root: Path):
+ def route_from_phase(state: PilotState) -> Command[str]:
+ next_node = state.get("next_node") or "preflight"
+ return Command(update={"current_node": "route"}, goto=next_node)
+
+ def preflight(state: PilotState) -> Command[str]:
+ case_id = state["case_id"]
+ root = case_root(log_root, case_id)
+ try:
+ ensure_baseline_w4_closeout()
+ ensure_runtime_ready(root)
+ history = record_event(state, node="preflight", status="pass", note="Baseline W4 closeout and local runtime preflight are green.")
+ node_json(
+ log_root,
+ case_id,
+ "preflight",
+ {
+ "case_id": case_id,
+ "checked_at": utc_now(),
+ "baseline_closeout": str(BASELINE_LOG_ROOT / "W4-closeout.json"),
+ "doctor_preset": "intel-full",
+ "langchain_health": TRIALS.langchain_endpoint("/health"),
+ "status": "pass",
+ },
+ )
+ return Command(
+ update={
+ "current_node": "preflight",
+ "next_node": "load_case",
+ "history": history,
+ "paused": False,
+ "pause_reason": None,
+ "failure_class": None,
+ "terminal_status": None,
+ },
+ goto="load_case",
+ )
+ except Exception as exc:
+ history = record_event(state, node="preflight", status="fail", note=str(exc))
+ node_json(
+ log_root,
+ case_id,
+ "preflight",
+ {
+ "case_id": case_id,
+ "checked_at": utc_now(),
+ "status": "fail",
+ "error": str(exc),
+ },
+ )
+ case = load_case_spec(log_root, case_id)
+ with patched_trials_context(active_log_root=log_root, active_mirror_root=mirror_root):
+ run_manifest = {
+ "artifact_kind": "aoa.local-ai-trial.run-manifest",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case_id,
+ "executed_at": utc_now(),
+ "runtime_selection": case["runtime_selection"],
+ "model": MODEL,
+ "backend": "langgraph-sidecar",
+ "commands": [],
+ "artifact_refs": [],
+ "notes": ["Pilot stopped before proposal preparation because preflight failed."],
+ }
+ result_summary = TRIALS.build_result_summary(
+ case=case,
+ status="fail",
+ score_breakdown={"preflight_ok": False},
+ observed={
+ "highlights": ["The sidecar pilot stopped before proposal preparation."],
+ "failures": [str(exc)],
+ },
+ failure_class="preflight_failure",
+ reviewer_notes="The LangGraph sidecar preflight did not satisfy the required W4 closeout and runtime-health posture.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Repair baseline W4 or runtime readiness before retrying the sidecar pilot.",
+ )
+ TRIALS.finalize_case(case=case, log_root=log_root, mirror_root=mirror_root, run_manifest=run_manifest, result_summary=result_summary)
+ return Command(
+ update={
+ "current_node": "preflight",
+ "next_node": "finalize_report",
+ "history": history,
+ "failure_class": "preflight_failure",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+
+ def load_case(state: PilotState) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ execution_mode = case["execution_mode"]
+ history = record_event(state, node="load_case", status="pass", note=f"Loaded `{case['case_id']}` with execution_mode `{execution_mode}`.")
+ node_json(
+ log_root,
+ state["case_id"],
+ "load-case",
+ {
+ "loaded_at": utc_now(),
+ "case_id": case["case_id"],
+ "execution_mode": execution_mode,
+ "repo_scope": case["repo_scope"],
+ },
+ )
+ next_node = "write_initial_packet"
+ return Command(
+ update={
+ "current_node": "load_case",
+ "next_node": next_node,
+ "execution_mode": execution_mode,
+ "history": history,
+ },
+ goto=next_node,
+ )
+
+ def write_initial_packet(state: PilotState) -> Command[str]:
+ case_id = state["case_id"]
+ croot = case_root(log_root, case_id)
+ croot.mkdir(parents=True, exist_ok=True)
+ node_artifacts_dir(log_root, case_id)
+ ipath = interrupt_path(log_root, case_id)
+ if ipath.exists():
+ ipath.unlink()
+ history = record_event(state, node="write_initial_packet", status="pass", note="Initial pilot packet and runtime-side artifact directories are ready.")
+ node_json(
+ log_root,
+ case_id,
+ "write-initial-packet",
+ {
+ "prepared_at": utc_now(),
+ "case_root": str(croot),
+ "node_artifacts": str(node_artifacts_dir(log_root, case_id)),
+ },
+ )
+ next_node = "collect_refs" if state["execution_mode"] == "qwen_patch" else "prepare_generated_proposal"
+ return Command(
+ update={
+ "current_node": "write_initial_packet",
+ "next_node": next_node,
+ "history": history,
+ },
+ goto=next_node,
+ )
+
+ def collect_refs(state: PilotState) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ with patched_trials_context(active_log_root=log_root, active_mirror_root=mirror_root):
+ agents_refs = TRIALS.collect_applicable_agents_refs(case)
+ history = record_event(state, node="collect_refs", status="pass", note=f"Collected {len(case.get('source_refs', []))} source refs and {len(agents_refs)} AGENTS refs.")
+ node_json(
+ log_root,
+ state["case_id"],
+ "collect-refs",
+ {
+ "collected_at": utc_now(),
+ "source_refs": case.get("source_refs", []),
+ "agents_refs": agents_refs,
+ },
+ )
+ return Command(
+ update={
+ "current_node": "collect_refs",
+ "next_node": "build_edit_proposal",
+ "history": history,
+ },
+ goto="build_edit_proposal",
+ )
+
+ def build_edit_proposal(state: PilotState) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ with patched_trials_context(active_log_root=log_root, active_mirror_root=mirror_root):
+ result = TRIALS.prepare_w4_case(case, log_root=log_root)
+ proposal_summary = load_json(case_root(log_root, state["case_id"]) / "artifacts" / "proposal.summary.json")
+ history = record_event(
+ state,
+ node="build_edit_proposal",
+ status="pass" if result.get("proposal_valid") else "fail",
+ note="Docs proposal prepared through the W4 edit-spec contract.",
+ extra={"proposal_valid": bool(result.get("proposal_valid"))},
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "build-edit-proposal",
+ {
+ "prepared_at": utc_now(),
+ "proposal_valid": bool(result.get("proposal_valid")),
+ "proposal_summary_path": str(case_root(log_root, state["case_id"]) / "artifacts" / "proposal.summary.json"),
+ "proposal_failure_reasons": proposal_summary.get("proposal_failure_reasons", []),
+ },
+ )
+ next_node = "persist_proposal" if result.get("proposal_valid") else "finalize_report"
+ terminal_status = None if result.get("proposal_valid") else "fail"
+ return Command(
+ update={
+ "current_node": "build_edit_proposal",
+ "next_node": next_node,
+ "proposal_valid": bool(result.get("proposal_valid")),
+ "history": history,
+ "failure_class": None if result.get("proposal_valid") else "proposal_invalid",
+ "terminal_status": terminal_status,
+ },
+ goto=next_node,
+ )
+
+ def persist_proposal(state: PilotState) -> Command[str]:
+ case_id = state["case_id"]
+ proposal_summary_path = case_root(log_root, case_id) / "artifacts" / "proposal.summary.json"
+ approval_path = case_root(log_root, case_id) / "artifacts" / "approval.status.json"
+ if not proposal_summary_path.exists() or not approval_path.exists():
+ history = record_event(state, node="persist_proposal", status="fail", note="Proposal artifacts were missing after preparation.")
+ return Command(
+ update={
+ "current_node": "persist_proposal",
+ "next_node": "finalize_report",
+ "history": history,
+ "failure_class": "proposal_invalid",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+ history = record_event(state, node="persist_proposal", status="pass", note="Proposal summary and approval contract are persisted.")
+ node_json(
+ log_root,
+ case_id,
+ "persist-proposal",
+ {
+ "persisted_at": utc_now(),
+ "proposal_summary": str(proposal_summary_path),
+ "approval_status": str(approval_path),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "persist_proposal",
+ "next_node": "await_approval",
+ "history": history,
+ },
+ goto="await_approval",
+ )
+
+ def prepare_generated_proposal(state: PilotState) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ with patched_trials_context(active_log_root=log_root, active_mirror_root=mirror_root):
+ result = TRIALS.prepare_w4_case(case, log_root=log_root)
+ proposal_summary = load_json(case_root(log_root, state["case_id"]) / "artifacts" / "proposal.summary.json")
+ history = record_event(
+ state,
+ node="prepare_generated_proposal",
+ status="pass" if result.get("proposal_valid") else "fail",
+ note="Generated proposal prepared through the canonical deterministic script_refresh path.",
+ extra={"proposal_valid": bool(result.get("proposal_valid"))},
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "prepare-generated-proposal",
+ {
+ "prepared_at": utc_now(),
+ "proposal_valid": bool(result.get("proposal_valid")),
+ "builder_command": proposal_summary.get("builder_command"),
+ "proposal_failure_reasons": proposal_summary.get("proposal_failure_reasons", []),
+ },
+ )
+ next_node = "await_approval" if result.get("proposal_valid") else "finalize_report"
+ return Command(
+ update={
+ "current_node": "prepare_generated_proposal",
+ "next_node": next_node,
+ "proposal_valid": bool(result.get("proposal_valid")),
+ "history": history,
+ "failure_class": None if result.get("proposal_valid") else "proposal_invalid",
+ "terminal_status": None if result.get("proposal_valid") else "fail",
+ },
+ goto=next_node,
+ )
+
+ def await_approval(state: PilotState) -> Command[str]:
+ payload = approval_payload(log_root, state["case_id"])
+ status = str((payload or {}).get("status") or "pending")
+ history = record_event(state, node="await_approval", status="seen", note=f"Observed approval status `{status}`.")
+ node_json(
+ log_root,
+ state["case_id"],
+ "await-approval",
+ {
+ "checked_at": utc_now(),
+ "approval_status": status,
+ "approval_path": str(case_root(log_root, state["case_id"]) / "artifacts" / "approval.status.json"),
+ },
+ )
+ if status == "approved":
+ return Command(
+ update={
+ "current_node": "await_approval",
+ "next_node": "worktree_apply",
+ "approval_status": status,
+ "history": history,
+ "paused": False,
+ "pause_reason": None,
+ },
+ goto="worktree_apply",
+ )
+ if status == "rejected":
+ case = load_case_spec(log_root, state["case_id"])
+ with patched_trials_context(active_log_root=log_root, active_mirror_root=mirror_root):
+ write_rejected_terminal(case, log_root=log_root, mirror_root=mirror_root, approval_payload=payload or {})
+ history = record_event(
+ {"history": history},
+ node="await_approval",
+ status="rejected",
+ note="Approval was explicitly rejected before mutation.",
+ )
+ return Command(
+ update={
+ "current_node": "await_approval",
+ "next_node": "finalize_report",
+ "approval_status": status,
+ "history": history,
+ "paused": False,
+ "pause_reason": None,
+ "terminal_status": "rejected",
+ "failure_class": "approval_rejected",
+ },
+ goto="finalize_report",
+ )
+ history = record_event(
+ {"history": history},
+ node="await_approval",
+ status="paused",
+ note="Pilot paused at the human approval boundary.",
+ )
+ interrupt_payload = {
+ "artifact_kind": "aoa.local-ai-trial.langgraph-interrupt",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": state["case_id"],
+ "paused_at": utc_now(),
+ "reason": "approval_pending",
+ "approval_status": status,
+ "resume_hint": "Set approval.status.json to approved or rejected, then run `scripts/aoa-langgraph-pilot resume-case `.",
+ }
+ write_json(interrupt_path(log_root, state["case_id"]), interrupt_payload)
+ return Command(
+ update={
+ "current_node": "await_approval",
+ "next_node": "await_approval",
+ "approval_status": status,
+ "history": history,
+ "paused": True,
+ "pause_reason": "approval_pending",
+ "terminal_status": "paused",
+ },
+ goto=END,
+ )
+
+ def worktree_apply(state: PilotState) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ with patched_trials_context(active_log_root=log_root, active_mirror_root=mirror_root):
+ TRIALS.apply_w4_case(
+ case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ land_back=not is_fixture_program(),
+ )
+ result_summary = load_result_summary(log_root, state["case_id"]) or {}
+ status = str(result_summary.get("status") or "fail")
+ history = record_event(
+ state,
+ node="worktree_apply",
+ status=status,
+ note="Reused the existing W4 worktree-first bounded apply path.",
+ extra={"failure_class": result_summary.get("failure_class")},
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "worktree-apply",
+ {
+ "applied_at": utc_now(),
+ "result_status": status,
+ "failure_class": result_summary.get("failure_class"),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "worktree_apply",
+ "next_node": "acceptance_validate",
+ "history": history,
+ "failure_class": result_summary.get("failure_class"),
+ },
+ goto="acceptance_validate",
+ )
+
+ def acceptance_validate(state: PilotState) -> Command[str]:
+ result_summary = load_result_summary(log_root, state["case_id"]) or {}
+ status = str(result_summary.get("status") or "fail")
+ history = record_event(
+ state,
+ node="acceptance_validate",
+ status=status,
+ note="Acceptance outcome was read from the landed W4-compatible result summary.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "acceptance-validate",
+ {
+ "checked_at": utc_now(),
+ "result_status": status,
+ "failure_class": result_summary.get("failure_class"),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "acceptance_validate",
+ "next_node": "land_or_rollback",
+ "history": history,
+ },
+ goto="land_or_rollback",
+ )
+
+ def land_or_rollback(state: PilotState) -> Command[str]:
+ result_summary = load_result_summary(log_root, state["case_id"]) or {}
+ landed = result_summary.get("status") == "pass"
+ history = record_event(
+ state,
+ node="land_or_rollback",
+ status="pass" if landed else "fail",
+ note="Landing status was read from the W4-compatible case result.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "land-or-rollback",
+ {
+ "checked_at": utc_now(),
+ "landing_status": "landed" if landed else "not-landed",
+ "result_status": result_summary.get("status"),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "land_or_rollback",
+ "next_node": "finalize_report",
+ "history": history,
+ "terminal_status": "pass" if landed else "fail",
+ },
+ goto="finalize_report",
+ )
+
+ def finalize_report(state: PilotState) -> Command[str]:
+ refresh_sidecar_outputs(log_root, mirror_root)
+ result_summary = load_result_summary(log_root, state["case_id"])
+ terminal_status = state.get("terminal_status")
+ if result_summary:
+ terminal_status = str(result_summary.get("status") or terminal_status or "fail")
+ history = record_event(
+ state,
+ node="finalize_report",
+ status=terminal_status or "unknown",
+ note="Pilot index and comparison memo were refreshed.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "finalize-report",
+ {
+ "finalized_at": utc_now(),
+ "terminal_status": terminal_status,
+ "pilot_index": str(log_root / f"{PILOT_INDEX_NAME}.json"),
+ "comparison_memo": str(mirror_root / COMPARISON_MEMO_NAME),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "finalize_report",
+ "next_node": None,
+ "history": history,
+ "terminal_status": terminal_status,
+ },
+ goto=END,
+ )
+
+ graph = StateGraph(PilotState)
+ graph.add_node("route_from_phase", route_from_phase)
+ graph.add_node("preflight", preflight)
+ graph.add_node("load_case", load_case)
+ graph.add_node("write_initial_packet", write_initial_packet)
+ graph.add_node("collect_refs", collect_refs)
+ graph.add_node("build_edit_proposal", build_edit_proposal)
+ graph.add_node("persist_proposal", persist_proposal)
+ graph.add_node("prepare_generated_proposal", prepare_generated_proposal)
+ graph.add_node("await_approval", await_approval)
+ graph.add_node("worktree_apply", worktree_apply)
+ graph.add_node("acceptance_validate", acceptance_validate)
+ graph.add_node("land_or_rollback", land_or_rollback)
+ graph.add_node("finalize_report", finalize_report)
+ graph.add_edge(START, "route_from_phase")
+ return graph.compile()
+
+
+def run_graph_case(log_root: Path, mirror_root: Path, *, case_id: str, until: str, resume: bool) -> PilotState:
+ graph = build_graph(log_root, mirror_root)
+ existing = load_graph_state(log_root, case_id) or {}
+ state: PilotState = {
+ **existing,
+ "case_id": case_id,
+ "until": until,
+ "paused": False,
+ "pause_reason": None,
+ "current_node": existing.get("current_node"),
+ "next_node": existing.get("next_node") or ("await_approval" if resume else "preflight"),
+ "resume_count": int(existing.get("resume_count", 0)) + (1 if resume else 0),
+ "history": list(existing.get("history", [])),
+ }
+ final_state = graph.invoke(state)
+ save_graph_state(log_root, case_id, final_state)
+ refresh_sidecar_outputs(log_root, mirror_root)
+ return final_state
+
+
+def print_status(log_root: Path, case_id: str) -> None:
+ graph_state = load_graph_state(log_root, case_id)
+ result_summary = load_result_summary(log_root, case_id)
+ approval = approval_payload(log_root, case_id)
+ payload = {
+ "case_id": case_id,
+ "graph_state": graph_state,
+ "approval": approval,
+ "result_summary": result_summary,
+ }
+ print(json.dumps(payload, indent=2, ensure_ascii=True))
+
+
+def build_parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(description="Run the LangGraph sidecar pilot on top of the W4 bounded edit contract.")
+ parser.add_argument("--url", default=DEFAULT_LANGCHAIN_RUN_URL)
+ parser.add_argument("--program-id", default=DEFAULT_PROGRAM_ID)
+ parser.add_argument("--log-root", default=None)
+ parser.add_argument("--mirror-root", default=None)
+ sub = parser.add_subparsers(dest="command", required=True)
+
+ sub.add_parser("materialize", help="Materialize the LangGraph sidecar pilot program.")
+
+ run_case = sub.add_parser("run-case", help="Run one sidecar pilot case.")
+ run_case.add_argument("case_id")
+ run_case.add_argument("--until", choices=["approval", "done"], default="done")
+
+ resume_case = sub.add_parser("resume-case", help="Resume a paused LangGraph sidecar case from graph.state.json.")
+ resume_case.add_argument("case_id")
+
+ status_case = sub.add_parser("status", help="Print the current sidecar status for one case.")
+ status_case.add_argument("case_id")
+ return parser
+
+
+def main() -> int:
+ parser = build_parser()
+ args = parser.parse_args()
+
+ configure_program_runtime(program_id=args.program_id, run_url=args.url)
+ log_root = Path(args.log_root) if args.log_root else default_log_root_for(PROGRAM_ID)
+ mirror_root = Path(args.mirror_root) if args.mirror_root else default_mirror_root_for(PROGRAM_ID)
+ valid_case_ids = {case["case_id"] for case in available_cases(log_root)}
+
+ if args.command == "materialize":
+ materialize(log_root, mirror_root)
+ print(f"materialized {PROGRAM_ID} at {log_root}")
+ return 0
+
+ if args.command == "run-case":
+ if args.case_id not in valid_case_ids:
+ parser.error(f"unknown case_id for {PROGRAM_ID}: {args.case_id}")
+ return 2
+ materialize(log_root, mirror_root)
+ final_state = run_graph_case(log_root, mirror_root, case_id=args.case_id, until=args.until, resume=False)
+ print(json.dumps({"case_id": args.case_id, "terminal_status": final_state.get("terminal_status"), "paused": final_state.get("paused", False)}, ensure_ascii=True))
+ return 0
+
+ if args.command == "resume-case":
+ if args.case_id not in valid_case_ids:
+ parser.error(f"unknown case_id for {PROGRAM_ID}: {args.case_id}")
+ return 2
+ materialize(log_root, mirror_root)
+ final_state = run_graph_case(log_root, mirror_root, case_id=args.case_id, until="done", resume=True)
+ print(json.dumps({"case_id": args.case_id, "terminal_status": final_state.get("terminal_status"), "paused": final_state.get("paused", False)}, ensure_ascii=True))
+ return 0
+
+ if args.command == "status":
+ if args.case_id not in valid_case_ids:
+ parser.error(f"unknown case_id for {PROGRAM_ID}: {args.case_id}")
+ return 2
+ print_status(log_root, args.case_id)
+ return 0
+
+ parser.error(f"unknown command: {args.command}")
+ return 2
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/aoa-llamacpp-pilot b/scripts/aoa-llamacpp-pilot
new file mode 100755
index 0000000..362e4ae
--- /dev/null
+++ b/scripts/aoa-llamacpp-pilot
@@ -0,0 +1,1249 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+import time
+import urllib.error
+import urllib.request
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+SCRIPT_PATH = Path(__file__).resolve()
+SCRIPT_DIR = SCRIPT_PATH.parent
+SOURCE_ROOT = SCRIPT_DIR.parent
+STACK_ROOT = Path(os.environ.get("AOA_STACK_ROOT", "/srv/abyss-stack"))
+CONFIGS_ROOT = Path(os.environ.get("AOA_CONFIGS_ROOT", str(STACK_ROOT / "Configs")))
+PILOT_ID = "llamacpp-sidecar-pilot-v1"
+PILOT_ROOT = STACK_ROOT / "Logs" / "runtime-benchmarks" / "comparisons" / PILOT_ID
+PROMOTION_ID = "llamacpp-promotion-gate-v1"
+PROMOTION_ROOT = STACK_ROOT / "Logs" / "runtime-benchmarks" / "promotions" / PROMOTION_ID
+SIDECAR_PROJECT = os.environ.get("AOA_LLAMACPP_COMPOSE_PROJECT", "abyss-llamacpp-pilot")
+MODEL_STORE_ROOT = STACK_ROOT / "Logs" / "llamacpp" / "models" / "bartowski"
+OLLAMA_MANIFEST = (
+ STACK_ROOT
+ / "Services"
+ / "ollama"
+ / "models"
+ / "manifests"
+ / "registry.ollama.ai"
+ / "library"
+ / "qwen3.5"
+ / "9b"
+)
+SIDECAR_FILE_SPECS = (
+ "compose/modules/32-llamacpp-inference.yml",
+ "compose/modules/44-llamacpp-agent-sidecar.yml",
+)
+FEDERATION_LAYERS = (
+ "aoa-agents",
+ "aoa-routing",
+ "aoa-memo",
+ "aoa-evals",
+ "aoa-playbooks",
+ "aoa-kag",
+ "tos-source",
+)
+BASE_HEALTH_URL = "http://127.0.0.1:5401/health"
+BASE_RUN_URL = "http://127.0.0.1:5401/run"
+LLAMACPP_HEALTH_URL = "http://127.0.0.1:11435/health"
+LLAMACPP_HEALTH_FALLBACK_URL = "http://127.0.0.1:11435/v1/health"
+CANDIDATE_HEALTH_URL = "http://127.0.0.1:5403/health"
+CANDIDATE_RUN_URL = "http://127.0.0.1:5403/run"
+LLAMACPP_W0_PROGRAM_ID = "qwen-llamacpp-pilot-v1"
+LLAMACPP_W4_PROGRAM_ID = "langgraph-sidecar-llamacpp-v1"
+LLAMACPP_W4_GATE_LOG_ROOT = STACK_ROOT / "Logs" / "local-ai-trials" / "langgraph-sidecar-llamacpp-promotion-gate"
+LLAMACPP_W4_GATE_MIRROR_ROOT = Path("/srv/Dionysus/reports/local-ai-trials/langgraph-sidecar-llamacpp-promotion-gate")
+
+CANDIDATE_MODEL_SPECS = (
+ {
+ "quant": "Q4_K_M",
+ "filename": "Qwen_Qwen3.5-9B-Q4_K_M.gguf",
+ "runtime_variant": "Q4_K_M via llama.cpp sidecar",
+ "target_label": "workhorse-local-qwen3.5-9b-llamacpp-q4km",
+ "backend_label": "langchain-api-llamacpp -> llama.cpp-openai",
+ },
+ {
+ "quant": "Q6_K",
+ "filename": "Qwen_Qwen3.5-9B-Q6_K.gguf",
+ "runtime_variant": "Q6_K via llama.cpp sidecar",
+ "target_label": "workhorse-local-qwen3.5-9b-llamacpp-q6k",
+ "backend_label": "langchain-api-llamacpp -> llama.cpp-openai",
+ },
+)
+
+
+def utc_now() -> str:
+ return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def timestamp_dir() -> str:
+ return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H%M%SZ")
+
+
+def ensure_parent(path: Path) -> None:
+ path.parent.mkdir(parents=True, exist_ok=True)
+
+
+def write_json(path: Path, payload: dict[str, Any]) -> None:
+ ensure_parent(path)
+ path.write_text(json.dumps(payload, indent=2, ensure_ascii=True) + "\n", encoding="utf-8")
+
+
+def write_text(path: Path, content: str) -> None:
+ ensure_parent(path)
+ path.write_text(content, encoding="utf-8")
+
+
+def run_cmd(
+ argv: list[str],
+ *,
+ env: dict[str, str] | None = None,
+ cwd: Path | None = None,
+ capture_output: bool = False,
+ check: bool = True,
+) -> subprocess.CompletedProcess[str]:
+ return subprocess.run(
+ argv,
+ cwd=str(cwd or SOURCE_ROOT),
+ env=env,
+ text=True,
+ capture_output=capture_output,
+ check=check,
+ )
+
+
+def base_env() -> dict[str, str]:
+ env = os.environ.copy()
+ env["AOA_STACK_ROOT"] = str(STACK_ROOT)
+ env["AOA_CONFIGS_ROOT"] = str(CONFIGS_ROOT)
+ env.setdefault("PODMAN_DEFAULT_PLATFORM", "linux/amd64")
+ return env
+
+
+def sidecar_env(model_host_path: Path) -> dict[str, str]:
+ env = base_env()
+ env["AOA_LLAMACPP_MODEL_HOST_PATH"] = str(model_host_path)
+ return env
+
+
+def sidecar_compose_cmd(*args: str) -> list[str]:
+ cmd = ["podman", "compose", "-p", SIDECAR_PROJECT]
+ for spec in SIDECAR_FILE_SPECS:
+ cmd.extend(["-f", str(CONFIGS_ROOT / spec)])
+ cmd.extend(args)
+ return cmd
+
+
+def http_get_json(url: str, timeout_s: float = 5.0) -> tuple[int, dict[str, Any] | None]:
+ req = urllib.request.Request(url=url, method="GET")
+ try:
+ with urllib.request.urlopen(req, timeout=timeout_s) as resp:
+ body = resp.read().decode("utf-8", errors="ignore")
+ payload = json.loads(body) if body else None
+ if payload is not None and not isinstance(payload, dict):
+ payload = None
+ return resp.status, payload
+ except urllib.error.URLError:
+ return None, None
+ except urllib.error.HTTPError as exc:
+ body = exc.read().decode("utf-8", errors="ignore")
+ try:
+ payload = json.loads(body) if body else None
+ if payload is not None and not isinstance(payload, dict):
+ payload = None
+ except Exception:
+ payload = None
+ return exc.code, payload
+
+
+def wait_for_url(name: str, url: str, timeout_s: float, accept_503: bool = False) -> dict[str, Any]:
+ deadline = time.time() + timeout_s
+ last_status: int | None = None
+ last_payload: dict[str, Any] | None = None
+
+ while time.time() < deadline:
+ try:
+ status, payload = http_get_json(url, timeout_s=4.0)
+ except Exception:
+ status, payload = None, None
+
+ last_status = status
+ last_payload = payload
+
+ if status == 200:
+ return {
+ "ready": True,
+ "status": status,
+ "payload": payload,
+ "url": url,
+ "name": name,
+ }
+ if status == 503 and accept_503:
+ time.sleep(2.0)
+ continue
+ time.sleep(2.0)
+
+ return {
+ "ready": False,
+ "status": last_status,
+ "payload": last_payload,
+ "url": url,
+ "name": name,
+ }
+
+
+def container_logs(name: str, tail: int = 80) -> str:
+ proc = run_cmd(
+ ["podman", "logs", "--tail", str(tail), name],
+ capture_output=True,
+ check=False,
+ )
+ return (proc.stdout or "") + (proc.stderr or "")
+
+
+def wait_for_llama(timeout_s: float) -> dict[str, Any]:
+ deadline = time.time() + timeout_s
+ while time.time() < deadline:
+ status, payload = http_get_json(LLAMACPP_HEALTH_URL, timeout_s=4.0)
+ if status == 200:
+ return {
+ "ready": True,
+ "status": status,
+ "payload": payload,
+ "url": LLAMACPP_HEALTH_URL,
+ "name": "llama-cpp",
+ }
+
+ logs = container_logs("llama-cpp")
+ if any(
+ marker in logs
+ for marker in (
+ "failed to load model",
+ "error loading model",
+ "Exec format error",
+ "main: exiting due to model loading error",
+ )
+ ):
+ return {
+ "ready": False,
+ "status": status,
+ "payload": payload,
+ "url": LLAMACPP_HEALTH_URL,
+ "name": "llama-cpp",
+ "error": "llama.cpp reported a model-load failure",
+ "log_excerpt": logs[-4000:],
+ }
+
+ status, payload = http_get_json(LLAMACPP_HEALTH_FALLBACK_URL, timeout_s=4.0)
+ if status == 200:
+ return {
+ "ready": True,
+ "status": status,
+ "payload": payload,
+ "url": LLAMACPP_HEALTH_FALLBACK_URL,
+ "name": "llama-cpp",
+ }
+ time.sleep(2.0)
+
+ return {
+ "ready": False,
+ "status": None,
+ "payload": None,
+ "url": LLAMACPP_HEALTH_URL,
+ "name": "llama-cpp",
+ "error": "timeout waiting for llama.cpp health",
+ }
+
+
+def resolve_model_info(model_host_path: str | None = None) -> dict[str, Any]:
+ if model_host_path:
+ blob_path = Path(model_host_path).expanduser().resolve()
+ if not blob_path.exists():
+ raise SystemExit(f"error: model host path does not exist: {blob_path}")
+ manifest_path = None
+ blob_digest = None
+ else:
+ if not OLLAMA_MANIFEST.exists():
+ raise SystemExit(f"error: missing Ollama manifest: {OLLAMA_MANIFEST}")
+ manifest = json.loads(OLLAMA_MANIFEST.read_text(encoding="utf-8"))
+ model_layer = next(
+ (
+ layer
+ for layer in manifest.get("layers", [])
+ if layer.get("mediaType") == "application/vnd.ollama.image.model"
+ ),
+ None,
+ )
+ if not model_layer:
+ raise SystemExit(f"error: no model layer found in {OLLAMA_MANIFEST}")
+ blob_digest = str(model_layer["digest"]).split(":", 1)[1]
+ blob_path = STACK_ROOT / "Services" / "ollama" / "models" / "blobs" / f"sha256-{blob_digest}"
+ if not blob_path.exists():
+ raise SystemExit(f"error: resolved GGUF blob does not exist: {blob_path}")
+ manifest_path = OLLAMA_MANIFEST
+
+ with blob_path.open("rb") as handle:
+ header = handle.read(4)
+ if header != b"GGUF":
+ raise SystemExit(f"error: resolved model is not a GGUF file: {blob_path}")
+
+ ollama_runtime = None
+ try:
+ status, payload = http_get_json("http://127.0.0.1:11434/api/tags", timeout_s=2.0)
+ if status == 200 and payload:
+ for item in payload.get("models", []):
+ if item.get("name") == "qwen3.5:9b":
+ ollama_runtime = item
+ break
+ except Exception:
+ ollama_runtime = None
+
+ return {
+ "resolved_at": utc_now(),
+ "manifest_path": str(manifest_path) if manifest_path else None,
+ "model_host_path": str(blob_path),
+ "blob_digest": blob_digest,
+ "blob_size_bytes": blob_path.stat().st_size,
+ "model_alias": "qwen3.5:9b",
+ "runtime_details": ollama_runtime,
+ "reuse_strategy": "resident_ollama_gguf_blob",
+ }
+
+
+def candidate_model_info() -> list[dict[str, Any]]:
+ items: list[dict[str, Any]] = []
+ for spec in CANDIDATE_MODEL_SPECS:
+ model_path = MODEL_STORE_ROOT / spec["filename"]
+ items.append(
+ {
+ **spec,
+ "model_host_path": str(model_path),
+ "exists": model_path.exists(),
+ "size_bytes": model_path.stat().st_size if model_path.exists() else None,
+ }
+ )
+ return items
+
+
+def run_qwen_check(*, case_name: str, url: str, timeout_s: float) -> dict[str, Any]:
+ proc = run_cmd(
+ [
+ str(SCRIPT_DIR / "aoa-qwen-check"),
+ "--case",
+ case_name,
+ "--url",
+ url,
+ "--timeout",
+ str(timeout_s),
+ "--json",
+ ],
+ env=base_env(),
+ capture_output=True,
+ check=False,
+ )
+ payload = None
+ for line in proc.stdout.splitlines():
+ stripped = line.strip()
+ if not stripped.startswith("{"):
+ continue
+ try:
+ payload = json.loads(stripped)
+ except Exception:
+ continue
+ return {
+ "ok": proc.returncode == 0 and isinstance(payload, dict) and bool(payload.get("ok")),
+ "returncode": proc.returncode,
+ "stdout": proc.stdout,
+ "stderr": proc.stderr,
+ "payload": payload,
+ }
+
+
+def ensure_baseline_healthy(timeout_s: float = 20.0) -> dict[str, Any]:
+ health = wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=timeout_s)
+ if not health["ready"]:
+ raise RuntimeError("baseline langchain-api health degraded")
+ return health
+
+
+def case_mean(summary: dict[str, Any], case_name: str) -> float | None:
+ bucket = summary.get("case_breakdown", {}).get(case_name, {})
+ value = bucket.get("mean_s")
+ return float(value) if isinstance(value, (int, float)) else None
+
+
+def screening_winner(
+ *,
+ baseline: dict[str, Any],
+ screenings: list[dict[str, Any]],
+) -> dict[str, Any] | None:
+ stable = [item for item in screenings if item.get("stable")]
+ if not stable:
+ return None
+ baseline_exact = case_mean(baseline["summary"], "exact-reply")
+ eligible: list[dict[str, Any]] = []
+ for item in stable:
+ candidate_exact = case_mean(item["bench"]["summary"], "exact-reply")
+ exact_ratio = None
+ if baseline_exact and candidate_exact is not None:
+ exact_ratio = (candidate_exact - baseline_exact) / baseline_exact
+ item["exact_reply_regression_ratio"] = round(exact_ratio, 4) if exact_ratio is not None else None
+ if exact_ratio is not None and exact_ratio > 0.15:
+ continue
+ eligible.append(item)
+ if not eligible:
+ return None
+ eligible.sort(
+ key=lambda item: (
+ case_mean(item["bench"]["summary"], "repo-routing") if case_mean(item["bench"]["summary"], "repo-routing") is not None else 999999.0,
+ 0 if item["quant"] == "Q4_K_M" else 1,
+ )
+ )
+ return eligible[0]
+
+
+def sync_configs() -> None:
+ run_cmd([str(SCRIPT_DIR / "aoa-sync-configs")], env=base_env())
+ run_cmd([str(SCRIPT_DIR / "aoa-bootstrap-configs"), "--force"], env=base_env())
+ sync_argv = [str(SCRIPT_DIR / "aoa-sync-federation-surfaces")]
+ for layer in FEDERATION_LAYERS:
+ sync_argv.extend(["--layer", layer])
+ run_cmd(sync_argv, env=base_env())
+
+
+def run_doctor(preset: str) -> None:
+ run_cmd([str(SCRIPT_DIR / "aoa-doctor"), "--preset", preset], env=base_env())
+
+
+def up_base_stack(preset: str) -> None:
+ run_cmd([str(SCRIPT_DIR / "aoa-up"), "--preset", preset], env=base_env())
+
+
+def up_llama_sidecar(model_host_path: Path) -> None:
+ run_cmd(sidecar_compose_cmd("up", "-d", "llama-cpp"), env=sidecar_env(model_host_path), cwd=CONFIGS_ROOT)
+
+
+def up_langchain_sidecar(model_host_path: Path) -> None:
+ run_cmd(
+ sidecar_compose_cmd("up", "--build", "-d", "langchain-api-llamacpp"),
+ env=sidecar_env(model_host_path),
+ cwd=CONFIGS_ROOT,
+ )
+
+
+def stop_sidecars() -> None:
+ run_cmd(sidecar_compose_cmd("down"), env=base_env(), cwd=CONFIGS_ROOT, check=False)
+
+
+def parse_bench_output(stdout: str) -> tuple[Path, dict[str, Any]]:
+ run_dir: Path | None = None
+ summary_payload: dict[str, Any] | None = None
+ for line in stdout.splitlines():
+ stripped = line.strip()
+ if stripped.startswith("run dir: "):
+ run_dir = Path(stripped[len("run dir: ") :])
+ continue
+ if stripped.startswith("{") and stripped.endswith("}"):
+ try:
+ payload = json.loads(stripped)
+ except Exception:
+ continue
+ if isinstance(payload, dict) and "benchmark_id" in payload:
+ summary_payload = payload
+ if run_dir is None or summary_payload is None:
+ raise RuntimeError("bench output did not contain a run dir and summary JSON")
+ return run_dir, summary_payload
+
+
+def run_bench(
+ *,
+ preset: str,
+ url: str,
+ repeat: int,
+ timeout_s: float,
+ backend_label: str,
+ runtime_variant: str,
+ target_label: str,
+) -> dict[str, Any]:
+ proc = run_cmd(
+ [
+ str(SCRIPT_DIR / "aoa-qwen-bench"),
+ "--preset",
+ preset,
+ "--repeat",
+ str(repeat),
+ "--timeout",
+ str(timeout_s),
+ "--url",
+ url,
+ "--backend-label",
+ backend_label,
+ "--model-label",
+ "qwen3.5:9b",
+ "--runtime-variant",
+ runtime_variant,
+ "--target-label",
+ target_label,
+ ],
+ env=base_env(),
+ capture_output=True,
+ check=False,
+ )
+ run_dir, summary_payload = parse_bench_output(proc.stdout)
+ manifest_path = run_dir / "benchmark.manifest.json"
+ summary_path = run_dir / "summary.json"
+ manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
+ summary = json.loads(summary_path.read_text(encoding="utf-8"))
+ return {
+ "ok": proc.returncode == 0,
+ "returncode": proc.returncode,
+ "command": proc.args,
+ "stdout": proc.stdout,
+ "stderr": proc.stderr,
+ "run_dir": str(run_dir),
+ "manifest": manifest,
+ "summary": summary,
+ "summary_stdout": summary_payload,
+ }
+
+
+def maybe_delta(candidate: float | None, baseline: float | None) -> float | None:
+ if candidate is None or baseline is None:
+ return None
+ return round(candidate - baseline, 3)
+
+
+def build_report(
+ *,
+ preset: str,
+ model_info: dict[str, Any],
+ baseline: dict[str, Any],
+ candidate: dict[str, Any],
+ comparison: dict[str, Any],
+) -> str:
+ base_summary = baseline["summary"]
+ cand_summary = candidate["summary"]
+ lines = [
+ f"# {PILOT_ID}",
+ "",
+ "## Summary",
+ f"- preset: `{preset}`",
+ f"- model reuse: `{model_info['reuse_strategy']}`",
+ f"- baseline run: `{baseline['run_dir']}`",
+ f"- candidate run: `{candidate['run_dir']}`",
+ f"- recommendation: `{comparison['recommendation']}`",
+ "",
+ "## Overall",
+ f"- baseline overall mean: `{base_summary.get('overall_mean_s')}` s",
+ f"- candidate overall mean: `{cand_summary.get('overall_mean_s')}` s",
+ f"- delta: `{comparison['overall_delta_s']}` s",
+ "",
+ "## Case deltas",
+ ]
+ for case_name, payload in comparison["case_deltas"].items():
+ lines.append(
+ f"- `{case_name}`: baseline `{payload['baseline_mean_s']}` s, candidate `{payload['candidate_mean_s']}` s, delta `{payload['delta_s']}` s"
+ )
+ lines.extend(
+ [
+ "",
+ "## Boundary",
+ "- This pilot compares serving/runtime posture, not reasoning quality canon.",
+ "- The validated canonical path remains Ollama-backed until a measured promotion decision is made.",
+ ]
+ )
+ return "\n".join(lines) + "\n"
+
+
+def screening_report(
+ *,
+ baseline: dict[str, Any],
+ screenings: list[dict[str, Any]],
+ winner: dict[str, Any] | None,
+ promotion: dict[str, Any] | None,
+) -> str:
+ lines = [
+ f"# {PROMOTION_ID}",
+ "",
+ "## Summary",
+ f"- baseline run: `{baseline['run_dir']}`",
+ f"- winner: `{winner['quant']}`" if winner else "- winner: `none`",
+ "",
+ "## Candidate Screening",
+ ]
+ for item in screenings:
+ routing_mean = case_mean(item["bench"]["summary"], "repo-routing") if item.get("bench") else None
+ exact_mean = case_mean(item["bench"]["summary"], "exact-reply") if item.get("bench") else None
+ lines.append(
+ f"- `{item['quant']}`: stable=`{item.get('stable')}` exact=`{exact_mean}` repo-routing=`{routing_mean}` baseline-recheck=`{item.get('baseline_recheck', {}).get('ready')}`"
+ )
+ if promotion is not None:
+ lines.extend(
+ [
+ "",
+ "## Promotion Gate",
+ f"- W0 gate: `{promotion['w0_gate_result']}`",
+ f"- W4 fixture gate: `{promotion['w4_gate_result']}`",
+ f"- baseline healthy after teardown: `{promotion['baseline_after_teardown']}`",
+ f"- recommendation: `{promotion['recommendation']}`",
+ ]
+ )
+ return "\n".join(lines) + "\n"
+
+
+def write_comparison_run(
+ *,
+ preset: str,
+ model_info: dict[str, Any],
+ baseline: dict[str, Any],
+ candidate: dict[str, Any],
+) -> Path:
+ run_root = PILOT_ROOT / "runs" / timestamp_dir()
+ run_root.mkdir(parents=True, exist_ok=True)
+ write_json(run_root / "model-resolution.json", model_info)
+ write_text(run_root / "baseline.bench.stdout.txt", baseline["stdout"])
+ write_text(run_root / "baseline.bench.stderr.txt", baseline["stderr"])
+ write_text(run_root / "candidate.bench.stdout.txt", candidate["stdout"])
+ write_text(run_root / "candidate.bench.stderr.txt", candidate["stderr"])
+
+ case_deltas: dict[str, Any] = {}
+ baseline_cases = baseline["summary"].get("case_breakdown", {})
+ candidate_cases = candidate["summary"].get("case_breakdown", {})
+ for case_name in sorted(set(baseline_cases) | set(candidate_cases)):
+ base_case = baseline_cases.get(case_name, {})
+ cand_case = candidate_cases.get(case_name, {})
+ case_deltas[case_name] = {
+ "baseline_mean_s": base_case.get("mean_s"),
+ "candidate_mean_s": cand_case.get("mean_s"),
+ "delta_s": maybe_delta(cand_case.get("mean_s"), base_case.get("mean_s")),
+ }
+
+ overall_delta_s = maybe_delta(
+ candidate["summary"].get("overall_mean_s"),
+ baseline["summary"].get("overall_mean_s"),
+ )
+ if candidate["ok"] and baseline["ok"] and overall_delta_s is not None and overall_delta_s < 0:
+ recommendation = "promising: llama.cpp sidecar is faster than the fresh Ollama baseline on this bounded bench"
+ elif candidate["ok"] and baseline["ok"]:
+ recommendation = "not better yet: llama.cpp sidecar did not beat the fresh Ollama baseline on this bounded bench"
+ else:
+ recommendation = "inconclusive: one or both benchmark runs failed"
+
+ comparison = {
+ "pilot_id": PILOT_ID,
+ "captured_at": utc_now(),
+ "preset": preset,
+ "baseline_run_ref": baseline["run_dir"],
+ "candidate_run_ref": candidate["run_dir"],
+ "baseline_backend": baseline["manifest"]["system_under_test"]["backend"],
+ "candidate_backend": candidate["manifest"]["system_under_test"]["backend"],
+ "overall_delta_s": overall_delta_s,
+ "case_deltas": case_deltas,
+ "recommendation": recommendation,
+ }
+ write_json(
+ run_root / "comparison.json",
+ {
+ **comparison,
+ "baseline_summary": baseline["summary"],
+ "candidate_summary": candidate["summary"],
+ },
+ )
+ write_json(
+ run_root / "pilot.manifest.json",
+ {
+ "pilot_id": PILOT_ID,
+ "captured_at": utc_now(),
+ "preset": preset,
+ "model_info_ref": "model-resolution.json",
+ "baseline_run_ref": baseline["run_dir"],
+ "candidate_run_ref": candidate["run_dir"],
+ "comparison_ref": "comparison.json",
+ },
+ )
+ write_text(
+ run_root / "report.md",
+ build_report(
+ preset=preset,
+ model_info=model_info,
+ baseline=baseline,
+ candidate=candidate,
+ comparison=comparison,
+ ),
+ )
+ write_json(
+ PILOT_ROOT / "latest.json",
+ {
+ "pilot_id": PILOT_ID,
+ "captured_at": utc_now(),
+ "latest_run_root": str(run_root),
+ "comparison_ref": str(run_root / "comparison.json"),
+ "report_ref": str(run_root / "report.md"),
+ },
+ )
+ return run_root
+
+
+def screening_artifact_root() -> Path:
+ path = PROMOTION_ROOT / "runs" / timestamp_dir()
+ path.mkdir(parents=True, exist_ok=True)
+ return path
+
+
+def write_screening_artifacts(
+ *,
+ run_root: Path,
+ baseline: dict[str, Any],
+ screenings: list[dict[str, Any]],
+ winner: dict[str, Any] | None,
+ promotion: dict[str, Any] | None,
+) -> None:
+ write_json(
+ run_root / "baseline.summary.json",
+ {
+ "summary": baseline["summary"],
+ "smokes": baseline.get("smokes"),
+ },
+ )
+ for item in screenings:
+ quant = item["quant"].lower()
+ write_json(run_root / f"{quant}.screening.json", item)
+ payload = {
+ "promotion_id": PROMOTION_ID,
+ "captured_at": utc_now(),
+ "baseline_run_ref": baseline["run_dir"],
+ "baseline_smokes": baseline.get("smokes"),
+ "winner_quant": winner["quant"] if winner else None,
+ "winner_model_host_path": winner["model_host_path"] if winner else None,
+ "screenings": [
+ {
+ "quant": item["quant"],
+ "stable": item.get("stable"),
+ "exact_reply_regression_ratio": item.get("exact_reply_regression_ratio"),
+ "repo_routing_mean_s": case_mean(item["bench"]["summary"], "repo-routing") if item.get("bench") else None,
+ "baseline_recheck_ready": item.get("baseline_recheck", {}).get("ready"),
+ }
+ for item in screenings
+ ],
+ "promotion": promotion,
+ }
+ write_json(run_root / "promotion.json", payload)
+ write_text(
+ run_root / "report.md",
+ screening_report(
+ baseline=baseline,
+ screenings=screenings,
+ winner=winner,
+ promotion=promotion,
+ ),
+ )
+ write_json(
+ PROMOTION_ROOT / "latest.json",
+ {
+ "promotion_id": PROMOTION_ID,
+ "captured_at": utc_now(),
+ "latest_run_root": str(run_root),
+ "promotion_ref": str(run_root / "promotion.json"),
+ "report_ref": str(run_root / "report.md"),
+ },
+ )
+
+
+def candidate_screening(
+ *,
+ spec: dict[str, Any],
+ args: argparse.Namespace,
+) -> dict[str, Any]:
+ model_path = Path(spec["model_host_path"])
+ if not model_path.exists():
+ return {
+ **spec,
+ "stable": False,
+ "error": f"missing model file: {model_path}",
+ }
+ result: dict[str, Any] = {
+ **spec,
+ "started_at": utc_now(),
+ }
+ try:
+ up_llama_sidecar(model_path)
+ llama_ready = wait_for_llama(args.wait_timeout)
+ result["llama_cpp"] = llama_ready
+ if not llama_ready["ready"]:
+ result["stable"] = False
+ return result
+
+ up_langchain_sidecar(model_path)
+ candidate_ready = wait_for_url("langchain-api-llamacpp", CANDIDATE_HEALTH_URL, timeout_s=args.wait_timeout)
+ result["candidate_health"] = candidate_ready
+ if not candidate_ready["ready"]:
+ result["stable"] = False
+ return result
+
+ exact = run_qwen_check(case_name="exact-reply", url=CANDIDATE_RUN_URL, timeout_s=args.timeout)
+ routing = run_qwen_check(case_name="repo-routing", url=CANDIDATE_RUN_URL, timeout_s=args.timeout)
+ bench = run_bench(
+ preset=args.preset,
+ url=CANDIDATE_RUN_URL,
+ repeat=args.repeat,
+ timeout_s=args.timeout,
+ backend_label=spec["backend_label"],
+ runtime_variant=spec["runtime_variant"],
+ target_label=spec["target_label"],
+ )
+ result["exact_smoke"] = exact
+ result["repo_routing_smoke"] = routing
+ result["bench"] = bench
+ result["stable"] = bool(exact["ok"] and routing["ok"] and bench["ok"])
+ return result
+ finally:
+ stop_sidecars()
+ result["baseline_recheck"] = wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=20.0)
+ if result.get("stable") and not result["baseline_recheck"]["ready"]:
+ result["stable"] = False
+
+
+def auto_approve_fixture(log_root: Path, *, case_id: str) -> Path:
+ approval_path = log_root / "waves" / "W4" / case_id / "artifacts" / "approval.status.json"
+ payload = json.loads(approval_path.read_text(encoding="utf-8"))
+ payload["status"] = "approved"
+ payload["approved"] = True
+ payload["approved_at"] = utc_now()
+ payload["notes"] = "Approved automatically by aoa-llamacpp-pilot for the disposable fixture gate."
+ write_json(approval_path, payload)
+ return approval_path
+
+
+def run_promotion_gate(args: argparse.Namespace, winner: dict[str, Any]) -> dict[str, Any]:
+ model_path = Path(winner["model_host_path"])
+ up_llama_sidecar(model_path)
+ llama_ready = wait_for_llama(args.wait_timeout)
+ if not llama_ready["ready"]:
+ stop_sidecars()
+ raise RuntimeError("winner llama.cpp sidecar did not become healthy during promotion gate")
+ up_langchain_sidecar(model_path)
+ candidate_ready = wait_for_url("langchain-api-llamacpp", CANDIDATE_HEALTH_URL, timeout_s=args.wait_timeout)
+ if not candidate_ready["ready"]:
+ stop_sidecars()
+ raise RuntimeError("winner langchain-api-llamacpp did not become healthy during promotion gate")
+ try:
+ run_cmd(
+ [
+ str(SCRIPT_DIR / "aoa-local-ai-trials"),
+ "--url",
+ CANDIDATE_RUN_URL,
+ "--program-id",
+ LLAMACPP_W0_PROGRAM_ID,
+ "run-wave",
+ "W0",
+ ],
+ env=base_env(),
+ check=True,
+ )
+ w0_index = json.loads(
+ (STACK_ROOT / "Logs" / "local-ai-trials" / LLAMACPP_W0_PROGRAM_ID / "W0-runtime-index.json").read_text(
+ encoding="utf-8"
+ )
+ )
+
+ shutil.rmtree(LLAMACPP_W4_GATE_LOG_ROOT, ignore_errors=True)
+ shutil.rmtree(LLAMACPP_W4_GATE_MIRROR_ROOT, ignore_errors=True)
+ run_cmd(
+ [
+ str(SCRIPT_DIR / "aoa-langgraph-pilot"),
+ "--url",
+ CANDIDATE_RUN_URL,
+ "--program-id",
+ LLAMACPP_W4_PROGRAM_ID,
+ "--log-root",
+ str(LLAMACPP_W4_GATE_LOG_ROOT),
+ "--mirror-root",
+ str(LLAMACPP_W4_GATE_MIRROR_ROOT),
+ "materialize",
+ ],
+ env=base_env(),
+ check=True,
+ )
+
+ run_cmd(
+ [
+ str(SCRIPT_DIR / "aoa-langgraph-pilot"),
+ "--url",
+ CANDIDATE_RUN_URL,
+ "--program-id",
+ LLAMACPP_W4_PROGRAM_ID,
+ "--log-root",
+ str(LLAMACPP_W4_GATE_LOG_ROOT),
+ "--mirror-root",
+ str(LLAMACPP_W4_GATE_MIRROR_ROOT),
+ "run-case",
+ "fixture-docs-wording-alignment",
+ "--until",
+ "approval",
+ ],
+ env=base_env(),
+ check=True,
+ )
+ fixture_log_root = LLAMACPP_W4_GATE_LOG_ROOT
+ auto_approve_fixture(fixture_log_root, case_id="fixture-docs-wording-alignment")
+ run_cmd(
+ [
+ str(SCRIPT_DIR / "aoa-langgraph-pilot"),
+ "--url",
+ CANDIDATE_RUN_URL,
+ "--program-id",
+ LLAMACPP_W4_PROGRAM_ID,
+ "--log-root",
+ str(LLAMACPP_W4_GATE_LOG_ROOT),
+ "--mirror-root",
+ str(LLAMACPP_W4_GATE_MIRROR_ROOT),
+ "resume-case",
+ "fixture-docs-wording-alignment",
+ ],
+ env=base_env(),
+ check=True,
+ )
+ w4_index = json.loads(
+ (fixture_log_root / "W4-langgraph-sidecar-index.json").read_text(encoding="utf-8")
+ )
+ finally:
+ stop_sidecars()
+
+ baseline_after_teardown = wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=20.0)
+ recommendation = (
+ "promote llama.cpp"
+ if w0_index.get("gate_result") == "pass"
+ and w4_index.get("gate_result") == "pass"
+ and baseline_after_teardown.get("ready")
+ else "stay on Ollama"
+ )
+ return {
+ "winner_quant": winner["quant"],
+ "winner_model_host_path": winner["model_host_path"],
+ "w0_gate_result": w0_index.get("gate_result"),
+ "w0_index_ref": str(STACK_ROOT / "Logs" / "local-ai-trials" / LLAMACPP_W0_PROGRAM_ID / "W0-runtime-index.json"),
+ "w4_gate_result": w4_index.get("gate_result"),
+ "w4_index_ref": str(LLAMACPP_W4_GATE_LOG_ROOT / "W4-langgraph-sidecar-index.json"),
+ "baseline_after_teardown": bool(baseline_after_teardown.get("ready")),
+ "baseline_recheck_payload": baseline_after_teardown,
+ "recommendation": recommendation,
+ }
+
+
+def doctor_command(args: argparse.Namespace) -> int:
+ if not args.skip_sync:
+ sync_configs()
+ model_info = resolve_model_info(args.model_host_path)
+ run_doctor(args.preset)
+ payload = {
+ "pilot_id": PILOT_ID,
+ "preset": args.preset,
+ "model_info": model_info,
+ "candidate_models": candidate_model_info(),
+ "base_health": wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=2.0),
+ }
+ print(json.dumps(payload, indent=2, ensure_ascii=True))
+ return 0
+
+
+def ensure_base_ready(preset: str, wait_timeout: float) -> None:
+ baseline_ready = wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=2.0)
+ if baseline_ready["ready"]:
+ return
+ up_base_stack(preset)
+ baseline_ready = wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=wait_timeout)
+ if not baseline_ready["ready"]:
+ raise SystemExit("error: baseline langchain-api health is not ready")
+
+
+def up_command(args: argparse.Namespace) -> int:
+ if not args.skip_sync:
+ sync_configs()
+ model_info = resolve_model_info(args.model_host_path)
+ run_doctor(args.preset)
+ ensure_base_ready(args.preset, args.wait_timeout)
+ model_path = Path(model_info["model_host_path"])
+ up_llama_sidecar(model_path)
+ llama_ready = wait_for_llama(args.wait_timeout)
+ if not llama_ready["ready"]:
+ stop_sidecars()
+ payload = {
+ "pilot_id": PILOT_ID,
+ "preset": args.preset,
+ "model_info": model_info,
+ "llama_cpp": llama_ready,
+ "langchain_api_llamacpp": {
+ "ready": False,
+ "status": None,
+ "payload": None,
+ "url": CANDIDATE_HEALTH_URL,
+ "name": "langchain-api-llamacpp",
+ },
+ }
+ print(json.dumps(payload, indent=2, ensure_ascii=True))
+ return 1
+
+ up_langchain_sidecar(model_path)
+ candidate_ready = wait_for_url(
+ "langchain-api-llamacpp",
+ CANDIDATE_HEALTH_URL,
+ timeout_s=args.wait_timeout,
+ )
+ if not candidate_ready["ready"]:
+ stop_sidecars()
+ payload = {
+ "pilot_id": PILOT_ID,
+ "preset": args.preset,
+ "model_info": model_info,
+ "llama_cpp": llama_ready,
+ "langchain_api_llamacpp": candidate_ready,
+ }
+ print(json.dumps(payload, indent=2, ensure_ascii=True))
+ return 0 if llama_ready["ready"] and candidate_ready["ready"] else 1
+
+
+def bench_command(args: argparse.Namespace) -> int:
+ candidate = run_bench(
+ preset=args.preset,
+ url=CANDIDATE_RUN_URL,
+ repeat=args.repeat,
+ timeout_s=args.timeout,
+ backend_label="langchain-api-llamacpp -> llama.cpp-openai",
+ runtime_variant="Q4_K_M via llama.cpp sidecar",
+ target_label="workhorse-local-qwen3.5-9b-llamacpp",
+ )
+ print(json.dumps(candidate["summary"], indent=2, ensure_ascii=True))
+ return 0 if candidate["ok"] else 1
+
+
+def run_command(args: argparse.Namespace) -> int:
+ if not args.skip_sync:
+ sync_configs()
+ model_info = resolve_model_info(args.model_host_path)
+ run_doctor(args.preset)
+ ensure_base_ready(args.preset, args.wait_timeout)
+
+ baseline = run_bench(
+ preset=args.preset,
+ url=BASE_RUN_URL,
+ repeat=args.repeat,
+ timeout_s=args.timeout,
+ backend_label="langchain-api -> ollama-native",
+ runtime_variant="Q4_K_M via Ollama",
+ target_label="workhorse-local-qwen3.5-9b-ollama-baseline",
+ )
+ model_path = Path(model_info["model_host_path"])
+ up_llama_sidecar(model_path)
+ llama_ready = wait_for_llama(args.wait_timeout)
+ if not llama_ready["ready"]:
+ stop_sidecars()
+ detail = llama_ready.get("error") or "llama.cpp sidecar did not become healthy in time"
+ raise SystemExit(f"error: {detail}")
+
+ up_langchain_sidecar(model_path)
+ candidate_ready = wait_for_url("langchain-api-llamacpp", CANDIDATE_HEALTH_URL, timeout_s=args.wait_timeout)
+ if not candidate_ready["ready"]:
+ stop_sidecars()
+ raise SystemExit("error: langchain-api-llamacpp did not become healthy in time")
+
+ candidate = run_bench(
+ preset=args.preset,
+ url=CANDIDATE_RUN_URL,
+ repeat=args.repeat,
+ timeout_s=args.timeout,
+ backend_label="langchain-api-llamacpp -> llama.cpp-openai",
+ runtime_variant="Q4_K_M via llama.cpp sidecar",
+ target_label="workhorse-local-qwen3.5-9b-llamacpp",
+ )
+ run_root = write_comparison_run(
+ preset=args.preset,
+ model_info=model_info,
+ baseline=baseline,
+ candidate=candidate,
+ )
+ print(f"comparison root: {run_root}")
+ print(json.dumps(json.loads((run_root / 'comparison.json').read_text(encoding='utf-8')), indent=2, ensure_ascii=True))
+ return 0 if baseline["ok"] and candidate["ok"] else 1
+
+
+def promote_command(args: argparse.Namespace) -> int:
+ if not args.skip_sync:
+ sync_configs()
+ run_doctor(args.preset)
+ ensure_base_ready(args.preset, args.wait_timeout)
+ baseline_smokes = {
+ "exact_smoke": run_qwen_check(case_name="exact-reply", url=BASE_RUN_URL, timeout_s=args.timeout),
+ "repo_routing_smoke": run_qwen_check(case_name="repo-routing", url=BASE_RUN_URL, timeout_s=args.timeout),
+ }
+ baseline = run_bench(
+ preset=args.preset,
+ url=BASE_RUN_URL,
+ repeat=args.repeat,
+ timeout_s=args.timeout,
+ backend_label="langchain-api -> ollama-native",
+ runtime_variant="Q4_K_M via Ollama",
+ target_label="workhorse-local-qwen3.5-9b-ollama-baseline",
+ )
+ baseline["smokes"] = baseline_smokes
+ screenings = [
+ candidate_screening(spec=spec, args=args)
+ for spec in candidate_model_info()
+ ]
+ winner = screening_winner(baseline=baseline, screenings=screenings)
+ if winner is not None:
+ promotion = run_promotion_gate(args, winner)
+ else:
+ promotion = {
+ "winner_quant": None,
+ "winner_model_host_path": None,
+ "w0_gate_result": "not-run",
+ "w0_index_ref": None,
+ "w4_gate_result": "not-run",
+ "w4_index_ref": None,
+ "baseline_after_teardown": bool(wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=20.0).get("ready")),
+ "baseline_recheck_payload": wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=20.0),
+ "recommendation": "stay on Ollama",
+ "reason": "no candidate satisfied the stability and exact-reply regression rule",
+ }
+ run_root = screening_artifact_root()
+ write_screening_artifacts(
+ run_root=run_root,
+ baseline=baseline,
+ screenings=screenings,
+ winner=winner,
+ promotion=promotion,
+ )
+ payload = json.loads((run_root / "promotion.json").read_text(encoding="utf-8"))
+ print(f"promotion root: {run_root}")
+ print(json.dumps(payload, indent=2, ensure_ascii=True))
+ if winner is None or promotion is None:
+ return 1
+ return 0 if promotion["recommendation"] == "promote llama.cpp" else 1
+
+
+def verify_command(args: argparse.Namespace) -> int:
+ llama_ready = wait_for_llama(args.timeout)
+ candidate_ready = wait_for_url("langchain-api-llamacpp", CANDIDATE_HEALTH_URL, timeout_s=args.timeout)
+ exact = run_qwen_check(case_name="exact-reply", url=CANDIDATE_RUN_URL, timeout_s=args.timeout)
+ routing = run_qwen_check(case_name="repo-routing", url=CANDIDATE_RUN_URL, timeout_s=args.timeout)
+ payload = {
+ "pilot_id": PILOT_ID,
+ "ok": bool(llama_ready.get("ready")) and bool(candidate_ready.get("ready")) and exact["ok"] and routing["ok"],
+ "llama_cpp_health": {
+ "ok": bool(llama_ready.get("ready")),
+ "status": llama_ready.get("status"),
+ "url": llama_ready.get("url"),
+ },
+ "langchain_api_llamacpp_health": {
+ "ok": bool(candidate_ready.get("ready")),
+ "status": candidate_ready.get("status"),
+ "url": candidate_ready.get("url"),
+ },
+ "exact_reply": exact,
+ "repo_routing": routing,
+ }
+ print(json.dumps(payload, ensure_ascii=True, separators=(",", ":")))
+ return 0 if payload["ok"] else 1
+
+
+def status_command(_: argparse.Namespace) -> int:
+ latest = None
+ latest_path = PILOT_ROOT / "latest.json"
+ if latest_path.exists():
+ latest = json.loads(latest_path.read_text(encoding="utf-8"))
+ promotion_latest = None
+ promotion_latest_path = PROMOTION_ROOT / "latest.json"
+ if promotion_latest_path.exists():
+ promotion_latest = json.loads(promotion_latest_path.read_text(encoding="utf-8"))
+ payload = {
+ "pilot_id": PILOT_ID,
+ "latest": latest,
+ "promotion_latest": promotion_latest,
+ "base_health": wait_for_url("langchain-api", BASE_HEALTH_URL, timeout_s=2.0),
+ "llama_cpp_health": wait_for_url("llama-cpp", LLAMACPP_HEALTH_URL, timeout_s=2.0, accept_503=True),
+ "langchain_api_llamacpp_health": wait_for_url(
+ "langchain-api-llamacpp", CANDIDATE_HEALTH_URL, timeout_s=2.0
+ ),
+ }
+ print(json.dumps(payload, indent=2, ensure_ascii=True))
+ return 0
+
+
+def down_command(_: argparse.Namespace) -> int:
+ stop_sidecars()
+ print(json.dumps({"pilot_id": PILOT_ID, "stopped_services": ["langchain-api-llamacpp", "llama-cpp"]}, indent=2, ensure_ascii=True))
+ return 0
+
+
+def build_parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(
+ description="Run a bounded llama.cpp sidecar pilot next to the canonical Ollama path."
+ )
+ subparsers = parser.add_subparsers(dest="command", required=True)
+
+ def add_common_flags(sub: argparse.ArgumentParser) -> None:
+ sub.add_argument("--preset", default="intel-full")
+ sub.add_argument("--repeat", type=int, default=2)
+ sub.add_argument("--timeout", type=float, default=90.0)
+ sub.add_argument("--wait-timeout", type=float, default=180.0)
+ sub.add_argument("--model-host-path", default=None)
+ sub.add_argument("--skip-sync", action="store_true")
+
+ doctor = subparsers.add_parser("doctor", help="Resolve the reusable GGUF model and confirm the base preset posture.")
+ add_common_flags(doctor)
+ doctor.set_defaults(func=doctor_command)
+
+ up = subparsers.add_parser("up", help="Sync configs, resolve the GGUF model, and start the llama.cpp sidecar services.")
+ add_common_flags(up)
+ up.set_defaults(func=up_command)
+
+ bench = subparsers.add_parser("bench", help="Benchmark the llama.cpp sidecar langchain-api path on port 5403.")
+ add_common_flags(bench)
+ bench.set_defaults(func=bench_command)
+
+ run = subparsers.add_parser("run", help="Run a fresh Ollama baseline bench and a fresh llama.cpp sidecar bench, then compare them.")
+ add_common_flags(run)
+ run.set_defaults(func=run_command)
+
+ promote = subparsers.add_parser(
+ "promote",
+ help="Screen fixed llama.cpp quants and run the bounded W0 + W4 promotion gate on the winner.",
+ )
+ add_common_flags(promote)
+ promote.set_defaults(func=promote_command)
+
+ verify = subparsers.add_parser("verify", help="Verify the currently running llama.cpp sidecar without calling up or down.")
+ verify.add_argument("--timeout", type=float, default=60.0)
+ verify.set_defaults(func=verify_command)
+
+ status = subparsers.add_parser("status", help="Show current sidecar health and the latest saved comparison ref.")
+ status.set_defaults(func=status_command)
+
+ down = subparsers.add_parser("down", help="Stop and remove only the llama.cpp sidecar services.")
+ down.set_defaults(func=down_command)
+
+ return parser
+
+
+def main() -> int:
+ parser = build_parser()
+ args = parser.parse_args()
+ try:
+ return int(args.func(args))
+ except subprocess.CalledProcessError as exc:
+ if exc.stdout:
+ sys.stdout.write(exc.stdout)
+ if exc.stderr:
+ sys.stderr.write(exc.stderr)
+ print(f"error: command failed: {' '.join(str(part) for part in exc.cmd)}", file=sys.stderr)
+ return exc.returncode or 1
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/aoa-local-ai-trials b/scripts/aoa-local-ai-trials
index b6a6ff1..d9907a9 100755
--- a/scripts/aoa-local-ai-trials
+++ b/scripts/aoa-local-ai-trials
@@ -17,12 +17,16 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Any
-PROGRAM_ID = "qwen-local-pilot-v1"
+DEFAULT_PROGRAM_ID = "qwen-local-pilot-v1"
+PROGRAM_ID = DEFAULT_PROGRAM_ID
MODEL = "qwen3.5:9b"
STACK_ROOT = Path("/srv/abyss-stack")
CONFIGS_ROOT = STACK_ROOT / "Configs"
SCRIPTS_ROOT = CONFIGS_ROOT / "scripts"
+DEFAULT_LANGCHAIN_RUN_URL = "http://127.0.0.1:5401/run"
+LANGCHAIN_RUN_URL = DEFAULT_LANGCHAIN_RUN_URL
+LANGCHAIN_BASE_URL = DEFAULT_LANGCHAIN_RUN_URL.rsplit("/", 1)[0]
LOG_ROOT_DEFAULT = STACK_ROOT / "Logs" / "local-ai-trials" / PROGRAM_ID
MIRROR_ROOT_DEFAULT = Path("/srv/Dionysus/reports/local-ai-trials") / PROGRAM_ID
@@ -335,7 +339,24 @@ def route_endpoint(path: str) -> str:
def langchain_endpoint(path: str) -> str:
- return f"http://127.0.0.1:5401{path}"
+ return f"{LANGCHAIN_BASE_URL}{path}"
+
+
+def default_log_root_for(program_id: str) -> Path:
+ return STACK_ROOT / "Logs" / "local-ai-trials" / program_id
+
+
+def default_mirror_root_for(program_id: str) -> Path:
+ return Path("/srv/Dionysus/reports/local-ai-trials") / program_id
+
+
+def configure_program_runtime(*, program_id: str, run_url: str) -> None:
+ global PROGRAM_ID, LOG_ROOT_DEFAULT, MIRROR_ROOT_DEFAULT, LANGCHAIN_RUN_URL, LANGCHAIN_BASE_URL
+ PROGRAM_ID = program_id
+ LOG_ROOT_DEFAULT = default_log_root_for(program_id)
+ MIRROR_ROOT_DEFAULT = default_mirror_root_for(program_id)
+ LANGCHAIN_RUN_URL = run_url
+ LANGCHAIN_BASE_URL = run_url.rsplit("/", 1)[0]
def case_dir(log_root: Path, wave_id: str, case_id: str) -> Path:
@@ -2121,6 +2142,8 @@ def run_qwen_prompt(
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
str(timeout_s),
"--temperature",
@@ -2334,6 +2357,8 @@ def run_w1_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) -> N
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
"120",
"--temperature",
@@ -3357,6 +3382,8 @@ def run_w2_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) -> N
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
"150",
"--temperature",
@@ -3381,6 +3408,8 @@ def run_w2_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) -> N
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(judge_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
"150",
"--temperature",
@@ -3496,6 +3525,8 @@ def run_w2_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) -> N
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(judge_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
"240",
"--temperature",
@@ -4319,6 +4350,8 @@ def run_w3_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) -> N
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
"180",
"--temperature",
@@ -5417,6 +5450,11 @@ def prepare_w4_docs_case(
proposal_target_path = case_root / "artifacts" / "proposal.target.json"
proposal_plan_path = case_root / "artifacts" / "proposal.plan.json"
proposal_summary_path = case_root / "artifacts" / "proposal.summary.json"
+ docs_timeout_scale = 2 if "5403" in LANGCHAIN_RUN_URL else 1
+ target_timeout_s = 45 * docs_timeout_scale
+ plan_timeout_s = 60 * docs_timeout_scale
+ exact_timeout_s = 90 * docs_timeout_scale
+ anchor_timeout_s = 90 * docs_timeout_scale
file_entries: list[dict[str, Any]] = []
file_errors: list[str] = []
@@ -5439,7 +5477,7 @@ def prepare_w4_docs_case(
label="proposal-target-selection",
prompt_text=target_prompt,
max_tokens=40,
- timeout_s=45,
+ timeout_s=target_timeout_s,
)
command_refs.append(target_command_ref)
raw_target_answer = str(target_qwen.get("answer") or "")
@@ -5487,8 +5525,10 @@ def prepare_w4_docs_case(
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(target_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
- "45",
+ str(target_timeout_s),
"--temperature",
"0",
"--max-tokens",
@@ -5528,8 +5568,10 @@ def prepare_w4_docs_case(
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(plan_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
- "60",
+ str(plan_timeout_s),
"--temperature",
"0",
"--max-tokens",
@@ -5626,7 +5668,7 @@ def prepare_w4_docs_case(
label="proposal-alignment-plan",
prompt_text=plan_prompt,
max_tokens=180,
- timeout_s=60,
+ timeout_s=plan_timeout_s,
)
command_refs.append(plan_command_ref)
raw_plan_answer = str(plan_qwen.get("answer") or "")
@@ -5683,8 +5725,10 @@ def prepare_w4_docs_case(
absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
"--prompt-file",
str(proposal_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
"--timeout",
- "90",
+ str(exact_timeout_s),
"--temperature",
"0",
"--max-tokens",
@@ -5735,7 +5779,7 @@ def prepare_w4_docs_case(
label="proposal-edit-spec-exact",
prompt_text=exact_prompt,
max_tokens=220,
- timeout_s=90,
+ timeout_s=exact_timeout_s,
)
command_refs.append(exact_command_ref)
attempt_order.append("exact_replace")
@@ -5818,7 +5862,7 @@ def prepare_w4_docs_case(
label="proposal-edit-spec-anchor",
prompt_text=anchor_prompt,
max_tokens=260,
- timeout_s=90,
+ timeout_s=anchor_timeout_s,
)
command_refs.append(anchor_command_ref)
attempt_order.append("anchored_replace")
@@ -6703,7 +6747,13 @@ def w4_failure_summary(
)
-def apply_w4_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) -> None:
+def apply_w4_case(
+ case: dict[str, Any],
+ *,
+ log_root: Path,
+ mirror_root: Path,
+ land_back: bool = True,
+) -> None:
catalog = build_catalog()
case_root = case_dir(log_root, "W4", case["case_id"])
repo_root = repo_root_for_w4_case(case)
@@ -6951,65 +7001,66 @@ def apply_w4_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) ->
failures.append("one or more acceptance checks failed in isolated worktree")
raise RuntimeError("worktree acceptance failed")
- ensure_repo_ready_for_w4_case(
- repo_root,
- case=case,
- log_root=log_root,
- catalog=catalog,
- )
- if git_head(repo_root) != base_head:
- failure_class = "landing_reapply_failure"
- failures.append("repo HEAD drifted before landing validated diff back to main repo")
- raise RuntimeError("main repo head drifted")
-
- landing_diff_text = landing_diff_path.read_text(encoding="utf-8")
- if landing_diff_text.strip():
- main_check_raw = git_command(
+ if land_back:
+ ensure_repo_ready_for_w4_case(
repo_root,
- ["apply", "--check", str(landing_diff_path)],
- timeout_s=60,
- )
- main_check_ref = persist_command_result(case_root, "landing-apply-check", main_check_raw)
- command_refs.append(main_check_ref)
- artifact_refs.extend(
- [main_check_ref["stdout_path"], main_check_ref["stderr_path"], main_check_ref["command_meta"]]
+ case=case,
+ log_root=log_root,
+ catalog=catalog,
)
- if main_check_raw["exit_code"] != 0 or main_check_raw["timed_out"]:
+ if git_head(repo_root) != base_head:
failure_class = "landing_reapply_failure"
- failures.append("validated diff could not be applied cleanly back to the main repo")
- raise RuntimeError("main repo apply check failed")
+ failures.append("repo HEAD drifted before landing validated diff back to main repo")
+ raise RuntimeError("main repo head drifted")
+
+ landing_diff_text = landing_diff_path.read_text(encoding="utf-8")
+ if landing_diff_text.strip():
+ main_check_raw = git_command(
+ repo_root,
+ ["apply", "--check", str(landing_diff_path)],
+ timeout_s=60,
+ )
+ main_check_ref = persist_command_result(case_root, "landing-apply-check", main_check_raw)
+ command_refs.append(main_check_ref)
+ artifact_refs.extend(
+ [main_check_ref["stdout_path"], main_check_ref["stderr_path"], main_check_ref["command_meta"]]
+ )
+ if main_check_raw["exit_code"] != 0 or main_check_raw["timed_out"]:
+ failure_class = "landing_reapply_failure"
+ failures.append("validated diff could not be applied cleanly back to the main repo")
+ raise RuntimeError("main repo apply check failed")
+
+ main_apply_raw = git_command(
+ repo_root,
+ ["apply", str(landing_diff_path)],
+ timeout_s=60,
+ )
+ main_apply_ref = persist_command_result(case_root, "landing-apply", main_apply_raw)
+ command_refs.append(main_apply_ref)
+ artifact_refs.extend(
+ [main_apply_ref["stdout_path"], main_apply_ref["stderr_path"], main_apply_ref["command_meta"]]
+ )
+ if main_apply_raw["exit_code"] != 0 or main_apply_raw["timed_out"]:
+ failure_class = "landing_reapply_failure"
+ failures.append("validated diff failed during landing apply in the main repo")
+ raise RuntimeError("main repo apply failed")
- main_apply_raw = git_command(
- repo_root,
- ["apply", str(landing_diff_path)],
- timeout_s=60,
- )
- main_apply_ref = persist_command_result(case_root, "landing-apply", main_apply_raw)
- command_refs.append(main_apply_ref)
- artifact_refs.extend(
- [main_apply_ref["stdout_path"], main_apply_ref["stderr_path"], main_apply_ref["command_meta"]]
+ main_acceptance_refs, main_acceptance_ok = run_acceptance_checks(
+ case_root,
+ repo_root=repo_root,
+ checks=case.get("acceptance_checks", []),
+ label_prefix="landing-acceptance",
)
- if main_apply_raw["exit_code"] != 0 or main_apply_raw["timed_out"]:
- failure_class = "landing_reapply_failure"
- failures.append("validated diff failed during landing apply in the main repo")
- raise RuntimeError("main repo apply failed")
-
- main_acceptance_refs, main_acceptance_ok = run_acceptance_checks(
- case_root,
- repo_root=repo_root,
- checks=case.get("acceptance_checks", []),
- label_prefix="landing-acceptance",
- )
- command_refs.extend(main_acceptance_refs)
- for ref in main_acceptance_refs:
- artifact_refs.extend([ref["stdout_path"], ref["stderr_path"], ref["command_meta"]])
- if not main_acceptance_ok:
- reverse_diff_text = landing_diff_path.read_text(encoding="utf-8")
- if reverse_diff_text.strip():
- git_command(repo_root, ["apply", "-R", str(landing_diff_path)], timeout_s=60)
- failure_class = "post_change_validation_failure"
- failures.append("one or more acceptance checks failed after landing diff back to the main repo")
- raise RuntimeError("main repo acceptance failed")
+ command_refs.extend(main_acceptance_refs)
+ for ref in main_acceptance_refs:
+ artifact_refs.extend([ref["stdout_path"], ref["stderr_path"], ref["command_meta"]])
+ if not main_acceptance_ok:
+ reverse_diff_text = landing_diff_path.read_text(encoding="utf-8")
+ if reverse_diff_text.strip():
+ git_command(repo_root, ["apply", "-R", str(landing_diff_path)], timeout_s=60)
+ failure_class = "post_change_validation_failure"
+ failures.append("one or more acceptance checks failed after landing diff back to the main repo")
+ raise RuntimeError("main repo acceptance failed")
run_manifest = {
"artifact_kind": "aoa.local-ai-trial.run-manifest",
@@ -7023,7 +7074,11 @@ def apply_w4_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) ->
"commands": command_refs,
"artifact_refs": artifact_refs,
"notes": [
- "W4 landed only after isolated worktree mutation, scoped diff validation, and repeated acceptance checks in the main repo.",
+ (
+ "W4 landed only after isolated worktree mutation, scoped diff validation, and repeated acceptance checks in the main repo."
+ if land_back
+ else "W4 dry-run passed in an isolated worktree without reapplying any diff back to the repo root."
+ ),
],
}
result_summary = build_result_summary(
@@ -7039,15 +7094,27 @@ def apply_w4_case(case: dict[str, Any], *, log_root: Path, mirror_root: Path) ->
"highlights": [
*highlights,
f"Changed files: `{json.dumps(changed_files, ensure_ascii=True)}`.",
- "All worktree and main-repo acceptance checks passed.",
+ (
+ "All worktree and main-repo acceptance checks passed."
+ if land_back
+ else "All worktree-only acceptance checks passed. No landing back to the repo root was attempted."
+ ),
],
"failures": ["None."],
"changed_files": changed_files,
},
failure_class=None,
- reviewer_notes="The W4 case stayed inside approved scope, passed isolated validation, and landed cleanly back to the main repo.",
+ reviewer_notes=(
+ "The W4 case stayed inside approved scope, passed isolated validation, and landed cleanly back to the main repo."
+ if land_back
+ else "The W4 fixture case stayed inside approved scope and passed the full isolated worktree dry-run without touching the repo root."
+ ),
boundary_notes=w4_boundary_note(),
- next_action="Review the landed diff and decide whether to approve the next W4 case.",
+ next_action=(
+ "Review the landed diff and decide whether to approve the next W4 case."
+ if land_back
+ else "Use the dry-run packet as the bounded backend-comparison verdict for this fixture case."
+ ),
)
finalize_case(
case=case,
@@ -7532,8 +7599,10 @@ def run_w0(log_root: Path, mirror_root: Path) -> None:
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Materialize and run the supervised local Qwen pilot.")
- parser.add_argument("--log-root", default=str(LOG_ROOT_DEFAULT))
- parser.add_argument("--mirror-root", default=str(MIRROR_ROOT_DEFAULT))
+ parser.add_argument("--url", default=DEFAULT_LANGCHAIN_RUN_URL)
+ parser.add_argument("--program-id", default=DEFAULT_PROGRAM_ID)
+ parser.add_argument("--log-root", default=None)
+ parser.add_argument("--mirror-root", default=None)
sub = parser.add_subparsers(dest="command", required=True)
sub.add_parser("materialize", help="Materialize contracts, case specs, and planned wave indexes.")
@@ -7564,8 +7633,9 @@ def main() -> int:
parser = build_parser()
args = parser.parse_args()
- log_root = Path(args.log_root)
- mirror_root = Path(args.mirror_root)
+ configure_program_runtime(program_id=args.program_id, run_url=args.url)
+ log_root = Path(args.log_root) if args.log_root else default_log_root_for(PROGRAM_ID)
+ mirror_root = Path(args.mirror_root) if args.mirror_root else default_mirror_root_for(PROGRAM_ID)
catalog = build_catalog()
if args.command == "materialize":
diff --git a/scripts/aoa-qwen-bench b/scripts/aoa-qwen-bench
index 7db5767..b349404 100755
--- a/scripts/aoa-qwen-bench
+++ b/scripts/aoa-qwen-bench
@@ -9,6 +9,10 @@ repeat=2
timeout_s=90
write_root="${AOA_STACK_ROOT}/Logs/runtime-benchmarks"
run_url="http://127.0.0.1:5401/run"
+backend_label="langchain-api -> ollama-native"
+model_label="qwen3.5:9b"
+runtime_variant="Q4_K_M via Ollama"
+target_label="workhorse-local-qwen3.5-9b"
selector_args=()
while (($#)); do
@@ -45,6 +49,38 @@ while (($#)); do
--url=*)
run_url="${1#*=}"
;;
+ --backend-label)
+ shift || true
+ (($#)) || aoa_die "missing value after --backend-label"
+ backend_label="$1"
+ ;;
+ --backend-label=*)
+ backend_label="${1#*=}"
+ ;;
+ --model-label)
+ shift || true
+ (($#)) || aoa_die "missing value after --model-label"
+ model_label="$1"
+ ;;
+ --model-label=*)
+ model_label="${1#*=}"
+ ;;
+ --runtime-variant)
+ shift || true
+ (($#)) || aoa_die "missing value after --runtime-variant"
+ runtime_variant="$1"
+ ;;
+ --runtime-variant=*)
+ runtime_variant="${1#*=}"
+ ;;
+ --target-label)
+ shift || true
+ (($#)) || aoa_die "missing value after --target-label"
+ target_label="$1"
+ ;;
+ --target-label=*)
+ target_label="${1#*=}"
+ ;;
*)
selector_args+=("$1")
;;
@@ -68,7 +104,7 @@ has_module() {
has_module "41-agent-api.yml" || aoa_die "qwen bench requires 41-agent-api.yml in the selected runtime"
timestamp="$(date -u +%Y-%m-%dT%H%M%SZ)"
-run_dir="${write_root}/runs/${timestamp}__latency-single-turn__workhorse-local-qwen3.5-9b"
+run_dir="${write_root}/runs/${timestamp}__latency-single-turn__${target_label}"
mkdir -p "${run_dir}/raw"
export AOA_QWEN_BENCH_REPEAT="$repeat"
@@ -78,6 +114,10 @@ export AOA_QWEN_BENCH_PRESET="$AOA_STACK_PRESET"
export AOA_QWEN_BENCH_PROFILE="$AOA_STACK_PROFILE"
export AOA_QWEN_BENCH_RUN_DIR="$run_dir"
export AOA_QWEN_CHECK_PATH="${SCRIPT_DIR}/aoa-qwen-check"
+export AOA_QWEN_BENCH_BACKEND_LABEL="$backend_label"
+export AOA_QWEN_BENCH_MODEL_LABEL="$model_label"
+export AOA_QWEN_BENCH_RUNTIME_VARIANT="$runtime_variant"
+export AOA_QWEN_BENCH_TARGET_LABEL="$target_label"
python3 - <<'PY'
from __future__ import annotations
@@ -98,6 +138,10 @@ preset = os.environ.get("AOA_QWEN_BENCH_PRESET", "")
profile = os.environ.get("AOA_QWEN_BENCH_PROFILE", "")
run_dir = Path(os.environ["AOA_QWEN_BENCH_RUN_DIR"])
check_path = os.environ["AOA_QWEN_CHECK_PATH"]
+backend_label = os.environ.get("AOA_QWEN_BENCH_BACKEND_LABEL", "langchain-api -> ollama-native")
+model_label = os.environ.get("AOA_QWEN_BENCH_MODEL_LABEL", "qwen3.5:9b")
+runtime_variant = os.environ.get("AOA_QWEN_BENCH_RUNTIME_VARIANT", "Q4_K_M via Ollama")
+target_label = os.environ.get("AOA_QWEN_BENCH_TARGET_LABEL", "workhorse-local-qwen3.5-9b")
cases = ["exact-reply", "repo-routing"]
warmup_runs_per_case = 1
@@ -205,7 +249,7 @@ for case in cases:
}
captured_at = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
-benchmark_id = "qwen3.5-9b-langchain-latency-single-turn"
+benchmark_id = f"{target_label}-langchain-latency-single-turn"
selection = {"preset": preset or None, "profile": profile or None}
truth_refs = []
if preset:
@@ -223,11 +267,11 @@ manifest = {
"benchmark_family": "latency-single-turn",
"runtime_selection": selection,
"system_under_test": {
- "backend": "langchain-api -> ollama-native",
- "model": "qwen3.5:9b",
+ "backend": backend_label,
+ "model": model_label,
"profile_class": "workhorse",
"context_budget_class": "bounded-local",
- "quantization_or_runtime_variant": "Q4_K_M via Ollama",
+ "quantization_or_runtime_variant": runtime_variant,
},
"host_surface": {
"os_family": platform.system().lower(),
@@ -283,7 +327,9 @@ notes = [
"- Fixture family: `exact-reply` and `repo-routing`.",
"- One uncounted warmup run is executed per case before measured repeats.",
"- This is runtime-local evidence for `abyss-stack`, not a portable proof verdict.",
- "- The check stays on the intended chat path instead of raw `ollama` probing.",
+ f"- Serving backend label: `{backend_label}`.",
+ f"- Runtime variant: `{runtime_variant}`.",
+ "- The check stays on the intended chat path instead of raw backend probing.",
]
(run_dir / "benchmark.manifest.json").write_text(
diff --git a/scripts/aoa-sync-federation-surfaces b/scripts/aoa-sync-federation-surfaces
index 339ecb9..723b51a 100755
--- a/scripts/aoa-sync-federation-surfaces
+++ b/scripts/aoa-sync-federation-surfaces
@@ -5,11 +5,19 @@ SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=scripts/aoa-lib.sh
source "${SCRIPT_DIR}/aoa-lib.sh"
-command -v rsync >/dev/null 2>&1 || aoa_die "rsync is required"
+command -v python3 >/dev/null 2>&1 || aoa_die "python3 is required"
layers=()
+check_mode=0
+json_mode=0
while (($#)); do
case "$1" in
+ --check)
+ check_mode=1
+ ;;
+ --json)
+ json_mode=1
+ ;;
--layer)
shift || true
(($#)) || aoa_die "missing value after --layer"
@@ -27,184 +35,114 @@ while (($#)); do
(( ${#layers[@]} > 0 )) || aoa_die "expected --layer"
+if (( json_mode )) && ! (( check_mode )); then
+ aoa_die "--json requires --check"
+fi
+
+emit_check_json() {
+ local layer="$1"
+ local status="$2"
+ local source_root="$3"
+ local mirror_target="$4"
+ shift 4
+ python3 - "$layer" "$status" "$source_root" "$mirror_target" "$@" <<'PY'
+from pathlib import Path
+import json
+import sys
+
+layer = sys.argv[1]
+status = sys.argv[2]
+source_root = str(Path(sys.argv[3]))
+mirror_target = str(Path(sys.argv[4]))
+missing_files = [str(Path(item)) for item in sys.argv[5:]]
+
+print(
+ json.dumps(
+ {
+ "layer": layer,
+ "status": status,
+ "source_root": source_root,
+ "mirror_target": mirror_target,
+ "missing_files": missing_files,
+ },
+ ensure_ascii=True,
+ separators=(",", ":"),
+ )
+)
+PY
+}
+
+resolve_federation_config_dir() {
+ local source_templates_dir runtime_configs_dir
+ source_templates_dir="${SCRIPT_DIR}/../config-templates/Configs/federation"
+ runtime_configs_dir="${AOA_CONFIGS_ROOT}/federation"
+
+ if [[ -d "${source_templates_dir}" ]]; then
+ printf '%s\n' "${source_templates_dir}"
+ return 0
+ fi
+ if [[ -d "${runtime_configs_dir}" ]]; then
+ printf '%s\n' "${runtime_configs_dir}"
+ return 0
+ fi
+ aoa_die "federation config directory not found"
+}
+
+load_required_paths() {
+ local config_path="$1"
+ python3 - "$config_path" <<'PY'
+from pathlib import Path
+import sys
+
+import yaml
+
+config_path = Path(sys.argv[1])
+payload = yaml.safe_load(config_path.read_text(encoding="utf-8"))
+required_files = payload.get("required_files")
+if not isinstance(required_files, list) or not required_files:
+ raise SystemExit(f"required_files missing or empty in {config_path}")
+for rel_path in required_files:
+ if not isinstance(rel_path, str) or not rel_path:
+ raise SystemExit(f"invalid required_files entry in {config_path}: {rel_path!r}")
+ print(rel_path)
+PY
+}
+
sync_layer() {
local layer="$1"
- local source_root target_root tmp_root src_path rel_path artifact_schema
+ local source_root target_root tmp_root src_path rel_path config_dir config_path
local -a required_paths=()
+ command -v rsync >/dev/null 2>&1 || aoa_die "rsync is required"
+
case "$layer" in
aoa-agents)
source_root="${AOA_AGENTS_ROOT}"
target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-agents"
- required_paths=(
- "docs/AGENT_RUNTIME_SEAM.md"
- "generated/agent_registry.min.json"
- "generated/model_tier_registry.json"
- "generated/runtime_seam_bindings.json"
- "generated/cohort_composition_registry.json"
- "schemas/agent-registry.schema.json"
- "schemas/model-tier-registry.schema.json"
- "schemas/runtime-seam-bindings.schema.json"
- "schemas/cohort-composition-registry.schema.json"
- )
-
- while IFS= read -r artifact_schema; do
- required_paths+=("schemas/${artifact_schema}")
- done < <(find "${source_root}/schemas" -maxdepth 1 -type f -name 'artifact.*.schema.json' -printf '%f\n' | sort)
;;
aoa-routing)
source_root="${AOA_ROUTING_ROOT}"
target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-routing"
- required_paths=(
- "docs/FEDERATION_ENTRY_ABI.md"
- "docs/RECURRENCE_NAVIGATION_BOUNDARY.md"
- "generated/aoa_router.min.json"
- "generated/cross_repo_registry.min.json"
- "generated/task_to_surface_hints.json"
- "generated/task_to_tier_hints.json"
- "generated/recommended_paths.min.json"
- "generated/pairing_hints.min.json"
- "generated/kag_source_lift_relation_hints.min.json"
- "generated/federation_entrypoints.min.json"
- "generated/return_navigation_hints.min.json"
- "generated/tiny_model_entrypoints.json"
- "schemas/aoa-router.schema.json"
- "schemas/cross-repo-registry.schema.json"
- "schemas/task-to-surface-hints.schema.json"
- "schemas/task-to-tier-hints.schema.json"
- "schemas/recommended-paths.schema.json"
- "schemas/pairing-hints.schema.json"
- "schemas/kag-source-lift-relation-hints.schema.json"
- "schemas/federation-entrypoints.schema.json"
- "schemas/return-navigation-hints.schema.json"
- "schemas/tiny-model-entrypoints.schema.json"
- "schemas/router-entry.schema.json"
- )
;;
aoa-memo)
source_root="${AOA_MEMO_ROOT}"
target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-memo"
- required_paths=(
- "docs/MEMORY_MODEL.md"
- "docs/RUNTIME_WRITEBACK_SEAM.md"
- "docs/RECURRENCE_MEMORY_SUPPORT_SURFACES.md"
- "docs/AGENT_MEMORY_POSTURE_SEAM.md"
- "docs/PLAYBOOK_MEMORY_SCOPES.md"
- "generated/memo_registry.min.json"
- "generated/memory_catalog.min.json"
- "generated/memory_capsules.json"
- "generated/memory_sections.full.json"
- "generated/memory_object_catalog.min.json"
- "generated/memory_object_capsules.json"
- "generated/memory_object_sections.full.json"
- "examples/checkpoint_to_memory_contract.example.json"
- "examples/recall_contract.router.semantic.json"
- "examples/recall_contract.router.lineage.json"
- "examples/recall_contract.object.working.json"
- "examples/recall_contract.object.semantic.json"
- "examples/recall_contract.object.lineage.json"
- "examples/recall_contract.object.working.return.json"
- "schemas/checkpoint-to-memory-contract.schema.json"
- "schemas/core-memory-contract.schema.json"
- )
;;
aoa-evals)
source_root="${AOA_EVALS_ROOT}"
target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-evals"
- required_paths=(
- "docs/README.md"
- "docs/TRACE_EVAL_BRIDGE.md"
- "docs/RUNTIME_BENCH_PROMOTION_GUIDE.md"
- "docs/SELF_AGENT_CHECKPOINT_EVAL_POSTURE.md"
- "docs/RECURRENCE_PROOF_PROGRAM.md"
- "generated/eval_catalog.min.json"
- "generated/eval_capsules.json"
- "generated/eval_sections.full.json"
- "generated/comparison_spine.json"
- "examples/runtime_evidence_selection.workhorse-local.example.json"
- "examples/runtime_evidence_selection.return-anchor-integrity.example.json"
- "examples/artifact_to_verdict_hook.self-agent-checkpoint-rollout.example.json"
- "examples/artifact_to_verdict_hook.long-horizon-model-tier-orchestra.example.json"
- "examples/artifact_to_verdict_hook.restartable-inquiry-loop.example.json"
- "schemas/runtime-evidence-selection.schema.json"
- "schemas/artifact-to-verdict-hook.schema.json"
- )
;;
aoa-playbooks)
source_root="${AOA_PLAYBOOKS_ROOT}"
target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-playbooks"
- required_paths=(
- "docs/PLAYBOOK_EXECUTION_SEAM.md"
- "docs/PLAYBOOK_MODEL.md"
- "docs/PLAYBOOK_LIFECYCLE.md"
- "docs/PLAYBOOK_RECURRENCE_DISCIPLINE.md"
- "generated/playbook_registry.min.json"
- "generated/playbook_activation_surfaces.min.json"
- "generated/playbook_federation_surfaces.min.json"
- "generated/playbook_handoff_contracts.json"
- "generated/playbook_failure_catalog.json"
- "generated/playbook_subagent_recipes.json"
- "generated/playbook_automation_seeds.json"
- "generated/playbook_composition_manifest.json"
- "schemas/playbook-registry.schema.json"
- "schemas/playbook-activation-surface.schema.json"
- "schemas/playbook-federation-surface.schema.json"
- "examples/playbook_activation.long-horizon-model-tier-orchestra.example.json"
- "examples/playbook_activation.restartable-inquiry-loop.example.json"
- "examples/playbook_activation.cross-repo-boundary-rollout.example.json"
- )
;;
aoa-kag)
source_root="${AOA_KAG_ROOT}"
target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-kag"
- required_paths=(
- "docs/CONSUMER_GUIDE.md"
- "docs/REASONING_HANDOFF.md"
- "docs/REASONING_HANDOFF_PACK.md"
- "docs/RECURRENCE_REGROUNDING.md"
- "docs/BRIDGE_CONTRACTS.md"
- "docs/FEDERATION_KAG_READINESS.md"
- "docs/COUNTERPART_CONSUMER_CONTRACT.md"
- "docs/TOS_RETRIEVAL_AXIS_PACK.md"
- "generated/kag_registry.min.json"
- "generated/federation_spine.min.json"
- "generated/tiny_consumer_bundle.min.json"
- "generated/reasoning_handoff_pack.min.json"
- "generated/return_regrounding_pack.min.json"
- "generated/technique_lift_pack.min.json"
- "generated/tos_retrieval_axis_pack.min.json"
- "generated/tos_text_chunk_map.min.json"
- "generated/cross_source_node_projection.min.json"
- "generated/counterpart_federation_exposure_review.min.json"
- "schemas/kag-registry.schema.json"
- "schemas/federation-spine.schema.json"
- "schemas/tiny-consumer-bundle.schema.json"
- "schemas/reasoning-handoff-pack.schema.json"
- "schemas/return-regrounding-pack.schema.json"
- "schemas/technique-lift-pack.schema.json"
- "schemas/tos-retrieval-axis-pack.schema.json"
- "schemas/tos-text-chunk-map.schema.json"
- "schemas/cross-source-node-projection.schema.json"
- "schemas/counterpart-federation-exposure-review.schema.json"
- "schemas/counterpart-consumer-contract.schema.json"
- "schemas/bridge-envelope.schema.json"
- )
;;
tos-source)
source_root="${AOA_TOS_ROOT}"
target_root="${AOA_STACK_ROOT}/Knowledge/federation/tos-source"
- required_paths=(
- "docs/KAG_EXPORT.md"
- "docs/TINY_ENTRY_ROUTE.md"
- "docs/NODE_CONTRACT.md"
- "docs/PRACTICE_BRANCH.md"
- "docs/ZARATHUSTRA_TRILINGUAL_ENTRY.md"
- "generated/kag_export.min.json"
- "examples/source_node.example.json"
- "examples/tos_tiny_entry_route.example.json"
- "schemas/tos-node-contract.schema.json"
- "schemas/tos-tiny-entry-route.schema.json"
- )
;;
*)
aoa_die "unsupported layer: ${layer}"
@@ -213,6 +151,14 @@ sync_layer() {
[[ -d "$source_root" ]] || aoa_die "${layer} root not found: ${source_root}"
+ config_dir="$(resolve_federation_config_dir)"
+ config_path="${config_dir}/${layer}.yaml"
+ [[ -f "$config_path" ]] || aoa_die "federation config not found for ${layer}: ${config_path}"
+ while IFS= read -r rel_path; do
+ required_paths+=("${rel_path}")
+ done < <(load_required_paths "${config_path}")
+ (( ${#required_paths[@]} > 0 )) || aoa_die "no required_files found in ${config_path}"
+
if [[ "$layer" == "aoa-agents" ]]; then
local artifact_schema_count=0
for rel_path in "${required_paths[@]}"; do
@@ -245,6 +191,98 @@ sync_layer() {
aoa_note "federation surface sync complete for ${layer}"
}
+check_layer() {
+ local layer="$1"
+ local source_root target_root rel_path config_dir config_path
+ local -a required_paths=()
+ local -a missing_paths=()
+
+ case "$layer" in
+ aoa-agents)
+ source_root="${AOA_AGENTS_ROOT}"
+ target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-agents"
+ ;;
+ aoa-routing)
+ source_root="${AOA_ROUTING_ROOT}"
+ target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-routing"
+ ;;
+ aoa-memo)
+ source_root="${AOA_MEMO_ROOT}"
+ target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-memo"
+ ;;
+ aoa-evals)
+ source_root="${AOA_EVALS_ROOT}"
+ target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-evals"
+ ;;
+ aoa-playbooks)
+ source_root="${AOA_PLAYBOOKS_ROOT}"
+ target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-playbooks"
+ ;;
+ aoa-kag)
+ source_root="${AOA_KAG_ROOT}"
+ target_root="${AOA_STACK_ROOT}/Knowledge/federation/aoa-kag"
+ ;;
+ tos-source)
+ source_root="${AOA_TOS_ROOT}"
+ target_root="${AOA_STACK_ROOT}/Knowledge/federation/tos-source"
+ ;;
+ *)
+ aoa_die "unsupported layer: ${layer}"
+ ;;
+ esac
+
+ [[ -d "$source_root" ]] || aoa_die "${layer} root not found: ${source_root}"
+
+ config_dir="$(resolve_federation_config_dir)"
+ config_path="${config_dir}/${layer}.yaml"
+ [[ -f "$config_path" ]] || aoa_die "federation config not found for ${layer}: ${config_path}"
+ while IFS= read -r rel_path; do
+ required_paths+=("${rel_path}")
+ done < <(load_required_paths "${config_path}")
+ (( ${#required_paths[@]} > 0 )) || aoa_die "no required_files found in ${config_path}"
+
+ if (( ! json_mode )); then
+ aoa_note "check layer: ${layer}"
+ aoa_note "source root: ${source_root}"
+ aoa_note "mirror target: ${target_root}"
+ fi
+
+ for rel_path in "${required_paths[@]}"; do
+ [[ -f "${source_root}/${rel_path}" ]] || aoa_die "required source file missing: ${source_root}/${rel_path}"
+ if [[ ! -f "${target_root}/${rel_path}" ]]; then
+ missing_paths+=("${target_root}/${rel_path}")
+ fi
+ done
+
+ if (( ${#missing_paths[@]} > 0 )); then
+ if (( json_mode )); then
+ emit_check_json "${layer}" "missing" "${source_root}" "${target_root}" "${missing_paths[@]}"
+ else
+ aoa_warn "missing mirrored files for ${layer}:"
+ for rel_path in "${missing_paths[@]}"; do
+ printf ' %s\n' "${rel_path}"
+ done
+ fi
+ return 1
+ fi
+
+ if (( json_mode )); then
+ emit_check_json "${layer}" "ok" "${source_root}" "${target_root}"
+ else
+ aoa_note "federation surface check complete for ${layer}"
+ fi
+ return 0
+}
+
+overall_status=0
for layer in "${layers[@]}"; do
- sync_layer "$layer"
+ if (( check_mode )); then
+ if ! check_layer "$layer"; then
+ overall_status=1
+ fi
+ else
+ sync_layer "$layer"
+ fi
done
+
+exit "${overall_status}"
diff --git a/scripts/aoa-w5-pilot b/scripts/aoa-w5-pilot
new file mode 100755
index 0000000..b47d65f
--- /dev/null
+++ b/scripts/aoa-w5-pilot
@@ -0,0 +1,2769 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import copy
+import importlib.machinery
+import importlib.util
+import json
+import subprocess
+import textwrap
+from contextlib import contextmanager
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, TypedDict
+
+try:
+ from langgraph.graph import END, START, StateGraph
+ from langgraph.types import Command
+except ImportError as exc: # pragma: no cover - guarded by runtime usage
+ raise SystemExit(
+ "langgraph is not installed. Install dependencies from "
+ "`scripts/requirements-langgraph-pilot.txt` first."
+ ) from exc
+
+
+DEFAULT_PROGRAM_ID = "w5-langgraph-llamacpp-v1"
+PROGRAM_ID = DEFAULT_PROGRAM_ID
+WAVE_ID = "W5"
+MODEL = "qwen3.5:9b"
+DEFAULT_LANGCHAIN_RUN_URL = "http://127.0.0.1:5403/run"
+LANGCHAIN_RUN_URL = DEFAULT_LANGCHAIN_RUN_URL
+
+SOURCE_ROOT = Path(__file__).resolve().parents[1]
+STACK_ROOT = Path("/srv/abyss-stack")
+CONFIGS_ROOT = STACK_ROOT / "Configs"
+SCRIPTS_ROOT = CONFIGS_ROOT / "scripts"
+LOG_ROOT_DEFAULT = STACK_ROOT / "Logs" / "local-ai-trials" / PROGRAM_ID
+MIRROR_ROOT_DEFAULT = Path("/srv/Dionysus/reports/local-ai-trials") / PROGRAM_ID
+
+BASELINE_W4_LOG_ROOT = STACK_ROOT / "Logs" / "local-ai-trials" / "qwen-local-pilot-v1"
+LLAMACPP_PROMOTION_ROOT = STACK_ROOT / "Logs" / "runtime-benchmarks" / "promotions" / "llamacpp-promotion-gate-v1"
+INDEX_NAME = "W5-long-horizon-index"
+SUMMARY_MEMO_NAME = "W5_SUMMARY.md"
+SOURCE_CHECKOUT_ROOT = Path("/home/dionysus/src/abyss-stack")
+
+READ_ONLY_SCENARIO_IDS = {
+ "runtime-inspect-langchain-health",
+ "runtime-inspect-route-api-health",
+ "runtime-inspect-platform-adaptation",
+ "evals-validate-and-explain",
+}
+
+MUTATION_SCENARIO_IDS = {
+ "aoa-evals-contract-wording-alignment",
+ "aoa-routing-doc-boundary-alignment",
+ "aoa-routing-generated-surface-refresh",
+ "stack-sync-federation-check-mode",
+}
+
+SCENARIO_ORDER = [
+ "runtime-inspect-langchain-health",
+ "runtime-inspect-route-api-health",
+ "runtime-inspect-platform-adaptation",
+ "evals-validate-and-explain",
+ "aoa-evals-contract-wording-alignment",
+ "aoa-routing-doc-boundary-alignment",
+ "aoa-routing-generated-surface-refresh",
+ "stack-sync-federation-check-mode",
+]
+
+COMMIT_MESSAGES = {
+ "aoa-evals-contract-wording-alignment": "Clarify aoa-evals contract wording",
+ "aoa-routing-doc-boundary-alignment": "Align aoa-routing boundary docs",
+ "aoa-routing-generated-surface-refresh": "Refresh aoa-routing generated surfaces",
+ "stack-sync-federation-check-mode": "Add federation sync check mode",
+}
+
+CRITICAL_FAILURES = {
+ "preflight_failure",
+ "unauthorized_scope_expansion",
+ "post_change_validation_failure",
+ "landing_reapply_failure",
+}
+
+W5_METADATA = {
+ "title": "Long-Horizon Supervised Pilot",
+ "summary": "Scenario-based LangGraph pilot on the promoted llama.cpp substrate with milestone approvals and bounded live-repo mutations.",
+}
+
+
+class W5State(TypedDict, total=False):
+ case_id: str
+ until: str
+ execution_mode: str
+ current_node: str | None
+ next_node: str | None
+ paused: bool
+ pause_reason: str | None
+ pause_milestone: str | None
+ approval_status: str | None
+ current_milestone: str | None
+ terminal_status: str | None
+ failure_class: str | None
+ proposal_valid: bool
+ preview_ready: bool
+ resume_count: int
+ history: list[dict[str, Any]]
+ command_refs: list[dict[str, Any]]
+ artifact_refs: list[str]
+ changed_files: list[str]
+ local_commit_ref: str | None
+ local_commit_message: str | None
+ base_head: str | None
+ forced_pause_seen: list[str]
+
+
+def utc_now() -> str:
+ return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def absolute(path: Path) -> str:
+ return str(path.resolve())
+
+
+def default_log_root_for(program_id: str) -> Path:
+ return STACK_ROOT / "Logs" / "local-ai-trials" / program_id
+
+
+def default_mirror_root_for(program_id: str) -> Path:
+ return Path("/srv/Dionysus/reports/local-ai-trials") / program_id
+
+
+def configure_program_runtime(*, program_id: str, run_url: str) -> None:
+ global PROGRAM_ID, LOG_ROOT_DEFAULT, MIRROR_ROOT_DEFAULT, LANGCHAIN_RUN_URL
+ PROGRAM_ID = program_id
+ LOG_ROOT_DEFAULT = default_log_root_for(program_id)
+ MIRROR_ROOT_DEFAULT = default_mirror_root_for(program_id)
+ LANGCHAIN_RUN_URL = run_url
+ TRIALS.configure_program_runtime(program_id=program_id, run_url=run_url)
+
+
+def load_trials_module() -> Any:
+ target = SOURCE_ROOT / "scripts" / "aoa-local-ai-trials"
+ loader = importlib.machinery.SourceFileLoader("aoa_local_ai_trials_w5", str(target))
+ spec = importlib.util.spec_from_loader(loader.name, loader)
+ if spec is None:
+ raise RuntimeError(f"could not create module spec for {target}")
+ module = importlib.util.module_from_spec(spec)
+ loader.exec_module(module) # type: ignore[arg-type]
+ return module
+
+
+TRIALS = load_trials_module()
+
+
+def scenario_root(log_root: Path, case_id: str) -> Path:
+ return TRIALS.case_dir(log_root, WAVE_ID, case_id)
+
+
+def state_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "graph.state.json"
+
+
+def history_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "graph.history.jsonl"
+
+
+def interrupt_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "interrupt.json"
+
+
+def plan_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "artifacts" / "scenario.plan.json"
+
+
+def journal_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "artifacts" / "step.journal.jsonl"
+
+
+def approval_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "artifacts" / "approval.status.json"
+
+
+def node_artifacts_dir(log_root: Path, case_id: str) -> Path:
+ path = scenario_root(log_root, case_id) / "node-artifacts"
+ path.mkdir(parents=True, exist_ok=True)
+ return path
+
+
+def program_readme() -> str:
+ return (
+ f"# {PROGRAM_ID}\n\n"
+ "This directory stores the runtime-truth artifacts for the W5 long-horizon supervised pilot.\n\n"
+ "It reuses the bounded local-trials packet contract while moving to milestone-gated LangGraph orchestration on the promoted llama.cpp runtime.\n"
+ )
+
+
+def mirror_readme() -> str:
+ return (
+ f"# {PROGRAM_ID}\n\n"
+ "This folder mirrors human+AI-readable W5 reports and indexes.\n\n"
+ "Machine-readable runtime truth stays local under `/srv/abyss-stack/Logs/local-ai-trials/`.\n"
+ )
+
+
+def write_json(path: Path, payload: dict[str, Any]) -> None:
+ TRIALS.write_json(path, payload)
+
+
+def write_text(path: Path, text: str) -> None:
+ TRIALS.write_text(path, text)
+
+
+def write_text_exact(path: Path, text: str) -> None:
+ TRIALS.write_text_exact(path, text)
+
+
+def load_json(path: Path) -> dict[str, Any]:
+ return json.loads(path.read_text(encoding="utf-8"))
+
+
+def load_case_spec(log_root: Path, case_id: str) -> dict[str, Any]:
+ return load_json(scenario_root(log_root, case_id) / "case.spec.json")
+
+
+def load_result_summary(log_root: Path, case_id: str) -> dict[str, Any] | None:
+ path = scenario_root(log_root, case_id) / "result.summary.json"
+ if not path.exists():
+ return None
+ return load_json(path)
+
+
+def load_graph_state(log_root: Path, case_id: str) -> W5State | None:
+ path = state_path(log_root, case_id)
+ if not path.exists():
+ return None
+ return json.loads(path.read_text(encoding="utf-8"))
+
+
+def record_event(
+ state: W5State,
+ *,
+ node: str,
+ status: str,
+ note: str,
+ extra: dict[str, Any] | None = None,
+) -> list[dict[str, Any]]:
+ history = list(state.get("history", []))
+ payload: dict[str, Any] = {
+ "at": utc_now(),
+ "node": node,
+ "status": status,
+ "note": note,
+ }
+ if extra:
+ payload.update(extra)
+ history.append(payload)
+ return history
+
+
+def save_graph_state(log_root: Path, case_id: str, state: W5State) -> None:
+ sanitized = {
+ "case_id": state.get("case_id"),
+ "until": state.get("until"),
+ "execution_mode": state.get("execution_mode"),
+ "current_node": state.get("current_node"),
+ "next_node": state.get("next_node"),
+ "paused": state.get("paused", False),
+ "pause_reason": state.get("pause_reason"),
+ "pause_milestone": state.get("pause_milestone"),
+ "approval_status": state.get("approval_status"),
+ "current_milestone": state.get("current_milestone"),
+ "terminal_status": state.get("terminal_status"),
+ "failure_class": state.get("failure_class"),
+ "proposal_valid": state.get("proposal_valid"),
+ "preview_ready": state.get("preview_ready"),
+ "resume_count": state.get("resume_count", 0),
+ "history": state.get("history", []),
+ "command_refs": state.get("command_refs", []),
+ "artifact_refs": state.get("artifact_refs", []),
+ "changed_files": state.get("changed_files", []),
+ "local_commit_ref": state.get("local_commit_ref"),
+ "local_commit_message": state.get("local_commit_message"),
+ "base_head": state.get("base_head"),
+ "forced_pause_seen": state.get("forced_pause_seen", []),
+ }
+ write_json(state_path(log_root, case_id), sanitized)
+ history_lines = [json.dumps(item, ensure_ascii=True) for item in sanitized["history"]]
+ history_file = history_path(log_root, case_id)
+ history_file.parent.mkdir(parents=True, exist_ok=True)
+ history_file.write_text("\n".join(history_lines) + ("\n" if history_lines else ""), encoding="utf-8")
+ journal_file = journal_path(log_root, case_id)
+ journal_file.parent.mkdir(parents=True, exist_ok=True)
+ journal_file.write_text("\n".join(history_lines) + ("\n" if history_lines else ""), encoding="utf-8")
+
+
+def node_json(log_root: Path, case_id: str, name: str, payload: dict[str, Any]) -> None:
+ write_json(node_artifacts_dir(log_root, case_id) / f"{name}.json", payload)
+
+
+def load_base_catalog() -> dict[str, list[dict[str, Any]]]:
+ return TRIALS.build_catalog()
+
+
+def find_case(catalog: dict[str, list[dict[str, Any]]], wave_id: str, case_id: str) -> dict[str, Any]:
+ for case in catalog[wave_id]:
+ if case["case_id"] == case_id:
+ return copy.deepcopy(case)
+ raise RuntimeError(f"missing case `{case_id}` in wave `{wave_id}`")
+
+
+def implementation_case() -> dict[str, Any]:
+ case = {
+ "artifact_kind": "aoa.local-ai-trial.case-spec",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": "stack-sync-federation-check-mode",
+ "title": "Add Check Mode To Federation Sync",
+ "repo_scope": ["abyss-stack"],
+ "task_family": "bounded-implementation",
+ "mutation_allowed": True,
+ "mutation_policy": {
+ "mode": "bounded-approved-only",
+ "execution_mode": "implementation_patch",
+ "lane": "implementation",
+ "allowed_files": [absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-sync-federation-surfaces")],
+ "unauthorized_file_touch_is_critical_fail": True,
+ "review_required_before_mutation": True,
+ },
+ "runtime_selection": {
+ "preset": "intel-full",
+ "profile": None,
+ "path": "langchain-api:/run",
+ },
+ "allowed_tools": ["langchain-api:/run", "local-shell", "local-files:read-write", "repo-validator"],
+ "source_refs": [
+ absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-sync-federation-surfaces"),
+ absolute(SOURCE_CHECKOUT_ROOT / "config-templates" / "Configs" / "federation" / "aoa-routing.yaml"),
+ absolute(SOURCE_CHECKOUT_ROOT / "docs" / "DEPLOYMENT.md"),
+ ],
+ "observed_actions": [],
+ "execution_mode": "implementation_patch",
+ "lane": "implementation",
+ "derived_from": None,
+ "milestone_gates": ["plan_freeze", "first_mutation", "landing"],
+ "force_pause_on_milestone": "plan_freeze",
+ "expected_result": {
+ "type": "bounded-edit",
+ "allowed_files": [absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-sync-federation-surfaces")],
+ "all_acceptance_checks_must_pass": True,
+ },
+ "scoring": {
+ "critical_failures": [
+ "unauthorized_scope_expansion",
+ "post_change_validation_failure",
+ ]
+ },
+ "acceptance_checks": [
+ "bash -n scripts/aoa-sync-federation-surfaces",
+ "scripts/aoa-sync-federation-surfaces --check --layer aoa-routing",
+ "python3 scripts/validate_stack.py",
+ ],
+ "goal": "Add a bounded `--check` mode to the federation sync helper without changing the normal copy path.",
+ "inputs": [
+ "Add `--check` to `scripts/aoa-sync-federation-surfaces`.",
+ "`--check` must perform no copy operations and must resolve the same layer config and required-file set as normal sync mode.",
+ "`--check` must exit `0` when all required files exist in the mirror and `1` when any required file is missing.",
+ ],
+ "expected_report_lines": [
+ "Only `scripts/aoa-sync-federation-surfaces` is touched.",
+ "The helper gains a bounded `--check` mode with no copy side effects.",
+ "All named acceptance checks pass after landing.",
+ ],
+ "notes": [
+ "This scenario runs against the git-backed abyss-stack source checkout.",
+ "Use the same bounded worktree-first landing posture as the W4 mutation flow.",
+ ],
+ }
+ return case
+
+
+def w5_catalog() -> dict[str, list[dict[str, Any]]]:
+ base = load_base_catalog()
+ scenarios: list[dict[str, Any]] = []
+
+ for case_id in SCENARIO_ORDER:
+ if case_id == "stack-sync-federation-check-mode":
+ scenarios.append(implementation_case())
+ continue
+ source_wave = "W2" if case_id in READ_ONLY_SCENARIO_IDS else "W4"
+ case = find_case(base, source_wave, case_id)
+ case["program_id"] = PROGRAM_ID
+ case["wave_id"] = WAVE_ID
+ case["derived_from"] = case_id
+ if case_id in READ_ONLY_SCENARIO_IDS:
+ case["execution_mode"] = "read_only_summary"
+ case["milestone_gates"] = ["plan_freeze"]
+ case["force_pause_on_milestone"] = None
+ case["notes"] = list(case.get("notes") or []) + [
+ "This W5 scenario reuses the frozen W2 read-only contract under LangGraph milestone gating.",
+ ]
+ else:
+ case["milestone_gates"] = ["plan_freeze", "first_mutation", "landing"]
+ case["force_pause_on_milestone"] = None
+ case["notes"] = list(case.get("notes") or []) + [
+ "This W5 scenario reuses the bounded W4 mutation contract under LangGraph milestone gating.",
+ ]
+ scenarios.append(case)
+
+ ordered = {case["case_id"]: case for case in scenarios}
+ return {WAVE_ID: [ordered[case_id] for case_id in SCENARIO_ORDER]}
+
+
+def available_cases() -> list[dict[str, Any]]:
+ return w5_catalog()[WAVE_ID]
+
+
+def repo_root_for_scenario(case: dict[str, Any]) -> Path:
+ if case["case_id"] == "stack-sync-federation-check-mode":
+ return SOURCE_CHECKOUT_ROOT
+ repo_scope = case.get("repo_scope") or []
+ if len(repo_scope) != 1:
+ raise RuntimeError(f"W5 mutation scenario `{case['case_id']}` must target exactly one repo")
+ repo_root = Path("/srv") / repo_scope[0]
+ if not repo_root.exists():
+ raise RuntimeError(f"missing W5 repo root: {repo_root}")
+ return repo_root
+
+
+@contextmanager
+def patched_repo_root_for_w5() -> Any:
+ original = TRIALS.repo_root_for_w4_case
+
+ def custom_repo_root(case: dict[str, Any]) -> Path:
+ return repo_root_for_scenario(case)
+
+ TRIALS.repo_root_for_w4_case = custom_repo_root
+ try:
+ yield TRIALS
+ finally:
+ TRIALS.repo_root_for_w4_case = original
+
+
+def build_scenario_plan(case: dict[str, Any]) -> dict[str, Any]:
+ plan = {
+ "artifact_kind": "aoa.local-ai-trial.w5-scenario-plan",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "drafted_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "derived_from": case.get("derived_from"),
+ "repo_scope": case.get("repo_scope", []),
+ "source_refs": case.get("source_refs", []),
+ "milestone_gates": case.get("milestone_gates", []),
+ "force_pause_on_milestone": case.get("force_pause_on_milestone"),
+ "observed_action_ids": [item.get("id") for item in case.get("observed_actions", []) if item.get("id")],
+ "allowed_files": case.get("expected_result", {}).get("allowed_files", []),
+ "acceptance_checks": case.get("acceptance_checks", []),
+ }
+ if case["execution_mode"] == "read_only_summary":
+ plan["plan_summary"] = (
+ "Execute only the declared read-only actions and grounded source refs, "
+ "then summarize without creating worktrees or commits."
+ )
+ elif case["execution_mode"] == "script_refresh":
+ plan["plan_summary"] = (
+ "Prepare the frozen builder-based proposal, validate it in an isolated worktree, "
+ "then request landing approval before touching the repo."
+ )
+ else:
+ plan["plan_summary"] = (
+ "Prepare a bounded proposal inside the approved file scope, validate it in an isolated worktree, "
+ "then request landing approval before touching the repo."
+ )
+ return plan
+
+
+def materialize(log_root: Path, mirror_root: Path) -> None:
+ log_root.mkdir(parents=True, exist_ok=True)
+ mirror_root.mkdir(parents=True, exist_ok=True)
+ write_text(log_root / "README.md", program_readme())
+ write_text(mirror_root / "README.md", mirror_readme())
+
+ contracts = {
+ "case.spec.schema.json": TRIALS.CASE_SCHEMA,
+ "run.manifest.schema.json": TRIALS.RUN_MANIFEST_SCHEMA,
+ "result.summary.schema.json": TRIALS.RESULT_SUMMARY_SCHEMA,
+ "wave-index.schema.json": TRIALS.WAVE_INDEX_SCHEMA,
+ }
+ for name, payload in contracts.items():
+ write_json(log_root / "contracts" / name, payload)
+
+ for case in available_cases():
+ root = scenario_root(log_root, case["case_id"])
+ write_json(root / "case.spec.json", case)
+ node_artifacts_dir(log_root, case["case_id"])
+
+ refresh_w5_outputs(log_root, mirror_root)
+
+
+def approval_payload(log_root: Path, case_id: str) -> dict[str, Any] | None:
+ path = approval_path(log_root, case_id)
+ if not path.exists():
+ return None
+ return load_json(path)
+
+
+def write_approval_status(
+ log_root: Path,
+ *,
+ case: dict[str, Any],
+ milestone_id: str,
+ base_head: str | None,
+ notes: str,
+) -> dict[str, Any]:
+ existing = approval_payload(log_root, case["case_id"]) or {}
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-approval-status",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "milestone_id": milestone_id,
+ "milestone_status": "pending",
+ "status": "pending",
+ "approved": False,
+ "approved_at": None,
+ "prepared_at": existing.get("prepared_at") or utc_now(),
+ "base_head": base_head or existing.get("base_head"),
+ "notes": notes,
+ }
+ write_json(approval_path(log_root, case["case_id"]), payload)
+ return payload
+
+
+def interpret_approval_status(payload: dict[str, Any] | None, *, milestone_id: str) -> str:
+ if payload is None:
+ return "pending"
+ if payload.get("milestone_id") != milestone_id:
+ return "pending"
+ status = str(payload.get("milestone_status") or payload.get("status") or "pending")
+ if status == "approved" or bool(payload.get("approved")):
+ return "approved"
+ if status == "rejected":
+ return "rejected"
+ return "pending"
+
+
+def write_interrupt(
+ log_root: Path,
+ *,
+ case_id: str,
+ milestone_id: str,
+ reason: str,
+) -> None:
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-interrupt",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case_id,
+ "paused_at": utc_now(),
+ "reason": reason,
+ "milestone_id": milestone_id,
+ "resume_hint": "Set approval.status.json to approved or rejected, then run `scripts/aoa-w5-pilot resume-scenario `.",
+ }
+ write_json(interrupt_path(log_root, case_id), payload)
+
+
+def build_health_check(case_root: Path, label: str, url: str) -> tuple[dict[str, Any], dict[str, Any]]:
+ raw = TRIALS.run_command(["curl", "-fsS", url], cwd=CONFIGS_ROOT, timeout_s=30)
+ ref = TRIALS.persist_command_result(case_root, label, raw)
+ payload: dict[str, Any] = {}
+ if raw["exit_code"] == 0 and not raw["timed_out"]:
+ try:
+ payload = json.loads(raw["stdout"])
+ except json.JSONDecodeError:
+ payload = {}
+ return ref, payload
+
+
+def ensure_w4_closeout_pass() -> dict[str, Any]:
+ closeout = BASELINE_W4_LOG_ROOT / "W4-closeout.json"
+ if not closeout.exists():
+ raise RuntimeError(f"missing W4 closeout artifact: {closeout}")
+ payload = load_json(closeout)
+ if payload.get("gate_result") != "pass":
+ raise RuntimeError("W4 closeout is not pass")
+ return payload
+
+
+def ensure_llamacpp_promotion_pass() -> dict[str, Any]:
+ latest = LLAMACPP_PROMOTION_ROOT / "latest.json"
+ if not latest.exists():
+ raise RuntimeError(f"missing llama.cpp promotion latest artifact: {latest}")
+ latest_payload = load_json(latest)
+ promotion_ref = latest_payload.get("promotion_ref")
+ if not isinstance(promotion_ref, str) or not promotion_ref:
+ raise RuntimeError("llama.cpp promotion latest artifact is missing promotion_ref")
+ promotion = load_json(Path(promotion_ref))
+ verdict = promotion.get("promotion", {})
+ if verdict.get("recommendation") != "promote llama.cpp":
+ raise RuntimeError("llama.cpp promotion verdict is not promote llama.cpp")
+ return promotion
+
+
+def finalize_case_with_summary(
+ *,
+ case: dict[str, Any],
+ log_root: Path,
+ mirror_root: Path,
+ backend: str,
+ command_refs: list[dict[str, Any]],
+ artifact_refs: list[str],
+ status: str,
+ score_breakdown: dict[str, Any],
+ observed: dict[str, Any],
+ failure_class: str | None,
+ reviewer_notes: str,
+ boundary_notes: str,
+ next_action: str,
+) -> None:
+ run_manifest = {
+ "artifact_kind": "aoa.local-ai-trial.run-manifest",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "executed_at": utc_now(),
+ "runtime_selection": case["runtime_selection"],
+ "model": MODEL,
+ "backend": backend,
+ "commands": command_refs,
+ "artifact_refs": artifact_refs,
+ "notes": [
+ "W5 runs under LangGraph milestone gates on the promoted llama.cpp substrate.",
+ ],
+ }
+ result_summary = TRIALS.build_result_summary(
+ case=case,
+ status=status,
+ score_breakdown=score_breakdown,
+ observed=observed,
+ failure_class=failure_class,
+ reviewer_notes=reviewer_notes,
+ boundary_notes=boundary_notes,
+ next_action=next_action,
+ )
+ TRIALS.finalize_case(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ run_manifest=run_manifest,
+ result_summary=result_summary,
+ )
+
+
+def finalize_rejected_case(
+ *,
+ case: dict[str, Any],
+ log_root: Path,
+ mirror_root: Path,
+ milestone_id: str,
+ command_refs: list[dict[str, Any]],
+ artifact_refs: list[str],
+) -> None:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": milestone_id != "plan_freeze",
+ "first_mutation_approved": milestone_id not in {"first_mutation"},
+ "landing_approved": milestone_id not in {"landing"},
+ "approval_rejected": True,
+ },
+ observed={
+ "highlights": [f"The scenario reached `{milestone_id}` and was explicitly rejected."],
+ "failures": [f"Approval status was `rejected` at `{milestone_id}`."],
+ },
+ failure_class="approval_rejected",
+ reviewer_notes="The scenario stopped at an explicit W5 approval boundary.",
+ boundary_notes=TRIALS.w4_boundary_note() if case["execution_mode"] != "read_only_summary" else TRIALS.w2_boundary_note(),
+ next_action="Refresh or replace the scenario proposal before retrying.",
+ )
+
+
+def collect_evidence_payload(case: dict[str, Any]) -> dict[str, Any]:
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-evidence-collection",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "collected_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "repo_scope": case.get("repo_scope", []),
+ "source_refs": case.get("source_refs", []),
+ "observed_action_ids": [item.get("id") for item in case.get("observed_actions", []) if item.get("id")],
+ "allowed_files": case.get("expected_result", {}).get("allowed_files", []),
+ "acceptance_checks": case.get("acceptance_checks", []),
+ }
+ if case["execution_mode"] != "read_only_summary":
+ with patched_repo_root_for_w5():
+ payload["agents_refs"] = TRIALS.collect_applicable_agents_refs(case)
+ return payload
+
+
+def w5_report_artifact_refs(log_root: Path, case_id: str, extra: list[str] | None = None) -> list[str]:
+ refs = [
+ str(scenario_root(log_root, case_id) / "graph.state.json"),
+ str(scenario_root(log_root, case_id) / "graph.history.jsonl"),
+ str(scenario_root(log_root, case_id) / "artifacts" / "step.journal.jsonl"),
+ ]
+ if approval_path(log_root, case_id).exists():
+ refs.append(str(approval_path(log_root, case_id)))
+ if plan_path(log_root, case_id).exists():
+ refs.append(str(plan_path(log_root, case_id)))
+ if interrupt_path(log_root, case_id).exists():
+ refs.append(str(interrupt_path(log_root, case_id)))
+ if extra:
+ refs.extend(extra)
+ return refs
+
+
+def proposal_artifact_refs(case_root: Path) -> list[str]:
+ refs = []
+ for name in (
+ "proposal.target.prompt.txt",
+ "proposal.plan.prompt.txt",
+ "proposal.target.json",
+ "proposal.plan.json",
+ "proposal.edit-spec.json",
+ "proposal.prompt.txt",
+ "proposal.retry.prompt.txt",
+ "proposal.diff",
+ "proposal.summary.json",
+ "worktree.manifest.json",
+ "landing.diff",
+ ):
+ path = case_root / "artifacts" / name
+ if path.exists():
+ refs.append(str(path))
+ for path in sorted((case_root / "artifacts").glob("proposal-*.stdout.txt")):
+ refs.append(str(path))
+ for path in sorted((case_root / "artifacts").glob("proposal-*.stderr.txt")):
+ refs.append(str(path))
+ for path in sorted((case_root / "artifacts").glob("proposal-*.command.json")):
+ refs.append(str(path))
+ return refs
+
+
+def run_read_only_scenario(case: dict[str, Any], *, log_root: Path, mirror_root: Path) -> dict[str, Any]:
+ case_root = scenario_root(log_root, case["case_id"])
+ grounding_path = case_root / "artifacts" / "grounding.txt"
+ prompt_path = case_root / "artifacts" / "prompt.txt"
+ judge_prompt_path = case_root / "artifacts" / "judge.prompt.txt"
+ evidence_summary_path = case_root / "artifacts" / "evidence.summary.json"
+
+ action_outcomes, action_artifact_refs, action_command_refs, action_errors = TRIALS.execute_w2_actions(case, case_root)
+ source_entries, source_errors = TRIALS.resolve_w2_source_entries(case, action_outcomes)
+ capture_errors = [*action_errors, *source_errors]
+
+ grounding_text = TRIALS.render_w2_grounding(source_entries, action_outcomes, capture_errors)
+ write_text(grounding_path, grounding_text)
+ prompt_grounding_text = TRIALS.render_w2_prompt_grounding(source_entries, action_outcomes)
+
+ evidence_summary = TRIALS.build_w2_evidence_summary(case, source_entries, action_outcomes, capture_errors)
+ write_json(evidence_summary_path, evidence_summary)
+
+ artifact_refs = [
+ str(grounding_path),
+ str(prompt_path),
+ str(judge_prompt_path),
+ str(evidence_summary_path),
+ *action_artifact_refs,
+ *w5_report_artifact_refs(log_root, case["case_id"]),
+ ]
+ command_refs: list[dict[str, Any]] = [*action_command_refs]
+
+ if capture_errors:
+ blocked_prompt = "\n".join(
+ [
+ "BLOCKED: prompt not built because evidence capture failed.",
+ "",
+ *[f"- {error}" for error in capture_errors],
+ ]
+ )
+ answer_command_ref = TRIALS.persist_command_result(
+ case_root,
+ "qwen-answer",
+ TRIALS.build_blocked_command_result(
+ [
+ absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
+ "--prompt-file",
+ str(prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
+ "--timeout",
+ "240",
+ "--temperature",
+ "0",
+ "--max-tokens",
+ "220",
+ "--json",
+ ],
+ cwd=CONFIGS_ROOT,
+ error="evidence capture failure:\n" + "\n".join(capture_errors),
+ ),
+ )
+ answer_qwen = TRIALS.build_blocked_qwen_payload("evidence capture failure")
+ write_text(prompt_path, blocked_prompt)
+ judge_command_ref = TRIALS.persist_command_result(
+ case_root,
+ "qwen-judge",
+ TRIALS.build_blocked_command_result(
+ [
+ absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
+ "--prompt-file",
+ str(judge_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
+ "--timeout",
+ "240",
+ "--temperature",
+ "0",
+ "--max-tokens",
+ "200",
+ "--json",
+ ],
+ cwd=CONFIGS_ROOT,
+ error="judge blocked because evidence capture failed",
+ ),
+ )
+ write_text(judge_prompt_path, "BLOCKED: judge did not run because evidence capture failed.")
+ command_refs.extend([answer_command_ref, judge_command_ref])
+ artifact_refs.extend(
+ [
+ answer_command_ref["stdout_path"],
+ answer_command_ref["stderr_path"],
+ answer_command_ref["command_meta"],
+ judge_command_ref["stdout_path"],
+ judge_command_ref["stderr_path"],
+ judge_command_ref["command_meta"],
+ ]
+ )
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend="langgraph:read_only_summary",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status="fail",
+ score_breakdown={
+ "correct_source_refs": False,
+ "correct_next_hop": False,
+ "no_fabricated_ref_or_command": False,
+ "concise_accurate_summary": False,
+ "boundary_preserved": False,
+ "tool_outcome_honest": False,
+ "exact_ref_coverage": 0.0,
+ },
+ observed={
+ "highlights": [f"Evidence capture failed before model execution for {len(capture_errors)} items."],
+ "failures": capture_errors,
+ "executed_action_ids": evidence_summary["executed_action_ids"],
+ },
+ failure_class="evidence_capture_failure",
+ reviewer_notes="The W5 read-only scenario could not be evaluated because supervised evidence capture did not complete cleanly.",
+ boundary_notes=TRIALS.w2_boundary_note(),
+ next_action="Repair the missing ref or failing read-only capture before rerunning this W5 scenario.",
+ )
+ return {"status": "fail", "failure_class": "evidence_capture_failure", "command_refs": command_refs, "artifact_refs": artifact_refs}
+
+ answer_prompt = TRIALS.build_w2_prompt(case, prompt_grounding_text, action_outcomes)
+ answer_command_ref, answer_qwen = TRIALS.run_qwen_prompt(
+ case_root=case_root,
+ prompt_path=prompt_path,
+ label="qwen-answer",
+ prompt_text=answer_prompt,
+ max_tokens=220,
+ timeout_s=240,
+ )
+ command_refs.append(answer_command_ref)
+ artifact_refs.extend([answer_command_ref["stdout_path"], answer_command_ref["stderr_path"], answer_command_ref["command_meta"]])
+
+ transport_ok = (
+ bool(answer_qwen.get("ok"))
+ and answer_qwen.get("http_status") == 200
+ and answer_command_ref["exit_code"] == 0
+ and not answer_command_ref["timed_out"]
+ )
+ answer_payload: dict[str, Any] | None = None
+ parse_errors: list[str] = []
+ if transport_ok:
+ try:
+ answer_payload = TRIALS.parse_w2_answer(str(answer_qwen.get("answer") or ""))
+ except (json.JSONDecodeError, ValueError) as exc:
+ parse_errors.append(f"Could not parse W5 read-only answer JSON: {type(exc).__name__}: {exc}")
+ else:
+ parse_errors.append(str(answer_qwen.get("error") or "qwen answer transport failure"))
+
+ judge_payload: dict[str, Any] | None = None
+ if answer_payload is None:
+ write_text(judge_prompt_path, "BLOCKED: judge did not run because the main answer was unavailable or invalid.")
+ judge_command_ref = TRIALS.persist_command_result(
+ case_root,
+ "qwen-judge",
+ TRIALS.build_blocked_command_result(
+ [
+ absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
+ "--prompt-file",
+ str(judge_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
+ "--timeout",
+ "240",
+ "--temperature",
+ "0",
+ "--max-tokens",
+ "200",
+ "--json",
+ ],
+ cwd=CONFIGS_ROOT,
+ error="judge blocked because the main W5 answer was unavailable or invalid",
+ ),
+ )
+ judge_qwen = TRIALS.build_blocked_qwen_payload("judge blocked")
+ else:
+ judge_prompt = TRIALS.build_w2_judge_prompt(case, evidence_summary, answer_payload)
+ judge_command_ref, judge_qwen = TRIALS.run_qwen_prompt(
+ case_root=case_root,
+ prompt_path=judge_prompt_path,
+ label="qwen-judge",
+ prompt_text=judge_prompt,
+ max_tokens=200,
+ timeout_s=240,
+ )
+ if (
+ bool(judge_qwen.get("ok"))
+ and judge_qwen.get("http_status") == 200
+ and judge_command_ref["exit_code"] == 0
+ and not judge_command_ref["timed_out"]
+ ):
+ try:
+ judge_payload = TRIALS.parse_w2_judge(str(judge_qwen.get("answer") or ""))
+ except (json.JSONDecodeError, ValueError) as exc:
+ parse_errors.append(f"Could not parse W5 read-only judge JSON: {type(exc).__name__}: {exc}")
+ else:
+ parse_errors.append(str(judge_qwen.get("error") or "qwen judge transport failure"))
+ command_refs.append(judge_command_ref)
+ artifact_refs.extend([judge_command_ref["stdout_path"], judge_command_ref["stderr_path"], judge_command_ref["command_meta"]])
+
+ if answer_payload is None or judge_payload is None:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=answer_qwen.get("backend") or "langgraph:read_only_summary",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status="fail",
+ score_breakdown={
+ "correct_source_refs": False,
+ "correct_next_hop": False,
+ "no_fabricated_ref_or_command": False,
+ "concise_accurate_summary": False,
+ "boundary_preserved": False,
+ "tool_outcome_honest": False,
+ "exact_ref_coverage": 0.0,
+ },
+ observed={
+ "highlights": [
+ f"Main answer transport ok: `{str(transport_ok).lower()}`.",
+ f"Judge payload available: `{str(judge_payload is not None).lower()}`.",
+ ],
+ "failures": parse_errors,
+ "answer": answer_qwen.get("answer"),
+ "judge_answer": judge_qwen.get("answer"),
+ },
+ failure_class="summary_mismatch",
+ reviewer_notes="The W5 read-only scenario did not produce a valid bounded JSON answer or judge record.",
+ boundary_notes=TRIALS.w2_boundary_note(),
+ next_action="Repair the W5 answer or judge contract before relying on this scenario result.",
+ )
+ return {"status": "fail", "failure_class": "summary_mismatch", "command_refs": command_refs, "artifact_refs": artifact_refs}
+
+ score = TRIALS.score_w2_case(
+ case,
+ answer_raw_text=str(answer_qwen.get("answer") or ""),
+ answer_payload=answer_payload,
+ judge_payload=judge_payload,
+ action_outcomes=action_outcomes,
+ )
+ pass_flags = [
+ score["correct_source_refs"],
+ score["correct_next_hop"],
+ score["no_fabricated_ref_or_command"],
+ score["concise_accurate_summary"],
+ score["boundary_preserved"],
+ score["tool_outcome_honest"],
+ ]
+ status = "pass" if all(pass_flags) else "fail"
+ if score["fabricated_paths"] or score["fabricated_urls"]:
+ failure_class = "fabricated_reference"
+ elif score["fabricated_commands"]:
+ failure_class = "fabricated_command"
+ elif not score["tool_outcome_honest"]:
+ failure_class = "dishonest_tool_outcome"
+ elif not score["boundary_preserved"] or not score["correct_next_hop"]:
+ failure_class = "boundary_drift"
+ elif status == "pass":
+ failure_class = None
+ else:
+ failure_class = "summary_mismatch"
+
+ observed_failures = [*judge_payload["failure_reasons"]]
+ if score["fabricated_paths"]:
+ observed_failures.append("Fabricated absolute paths: " + ", ".join(score["fabricated_paths"]))
+ if score["fabricated_urls"]:
+ observed_failures.append("Fabricated URLs: " + ", ".join(score["fabricated_urls"]))
+ if score["fabricated_commands"]:
+ observed_failures.append("Fabricated commands: " + ", ".join(score["fabricated_commands"]))
+
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=answer_qwen.get("backend") or "langgraph:read_only_summary",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status=status,
+ score_breakdown={
+ "correct_source_refs": score["correct_source_refs"],
+ "correct_next_hop": score["correct_next_hop"],
+ "no_fabricated_ref_or_command": score["no_fabricated_ref_or_command"],
+ "concise_accurate_summary": score["concise_accurate_summary"],
+ "boundary_preserved": score["boundary_preserved"],
+ "tool_outcome_honest": score["tool_outcome_honest"],
+ "exact_ref_coverage": score["exact_ref_coverage"],
+ },
+ observed={
+ "highlights": [
+ f"Source refs captured: `{len(source_entries)}`.",
+ f"Observed actions executed: `{len(action_outcomes)}`.",
+ f"Elapsed time: `{answer_qwen.get('elapsed_s')}`s.",
+ f"Summary: {answer_payload['summary']}",
+ f"Next hop: `{answer_payload['next_hop']}`.",
+ ],
+ "failures": observed_failures or ["None."],
+ "answer": answer_payload,
+ "judge": judge_payload,
+ "executed_action_ids": evidence_summary["executed_action_ids"],
+ },
+ failure_class=failure_class,
+ reviewer_notes=(
+ "The W5 read-only scenario completed grounded supervised work without fabricating refs or crossing authority boundaries."
+ if status == "pass"
+ else "The W5 read-only scenario did not satisfy the bounded supervised read-only contract."
+ ),
+ boundary_notes=TRIALS.w2_boundary_note(),
+ next_action="Use the W5 packet to decide whether the next scenario should be approved at plan_freeze.",
+ )
+ return {"status": status, "failure_class": failure_class, "command_refs": command_refs, "artifact_refs": artifact_refs}
+
+
+def build_impl_exact_prompt(case: dict[str, Any], *, target_file: str, target_excerpt: str, agents_guidance: str) -> str:
+ input_lines = "\n".join(f"- {item}" for item in case.get("inputs", []))
+ return textwrap.dedent(
+ f"""\
+ W5 bounded implementation exact edit-spec proposal.
+ Propose one exact text replacement for one file only.
+
+ Inputs:
+ {input_lines}
+
+ Selected target file:
+ {target_file}
+
+ Target excerpt:
+ [TARGET_EXCERPT_START]
+ {target_excerpt}
+ [TARGET_EXCERPT_END]
+
+ # Trimmed AGENTS Guidance
+ {agents_guidance.rstrip()}
+
+ Response contract:
+ - Return compact JSON only.
+ - Use exactly this shape:
+ {{"mode":"exact_replace","target_file":"{target_file}","old_text":"...","new_text":"..."}}
+ - `old_text` must be copied exactly from the target excerpt.
+ - `new_text` must implement the requested `--check` behavior without widening scope.
+ - Prefer the smallest safe change.
+ - No code fence.
+ - No explanation outside the JSON object.
+ """
+ ).rstrip() + "\n"
+
+
+def build_impl_anchor_prompt(case: dict[str, Any], *, target_file: str, target_excerpt: str, previous_spec: dict[str, Any] | None, fallback_reason: str) -> str:
+ input_lines = "\n".join(f"- {item}" for item in case.get("inputs", []))
+ return textwrap.dedent(
+ f"""\
+ W5 bounded implementation anchored edit-spec fallback.
+ The exact replacement attempt was unavailable or not uniquely applicable.
+
+ Inputs:
+ {input_lines}
+
+ Selected target file:
+ {target_file}
+
+ Target excerpt:
+ [TARGET_EXCERPT_START]
+ {target_excerpt}
+ [TARGET_EXCERPT_END]
+
+ Previous exact spec:
+ {json.dumps(previous_spec, indent=2, ensure_ascii=True) if previous_spec else '[no valid exact spec]'}
+
+ Fallback reason:
+ {fallback_reason}
+
+ Response contract:
+ - Return compact JSON only.
+ - Use exactly this shape:
+ {{"mode":"anchored_replace","target_file":"{target_file}","anchor_before":"...","old_text":"...","new_text":"...","anchor_after":"..."}}
+ - `anchor_before`, `old_text`, and `anchor_after` must be copied exactly from the target excerpt.
+ - `new_text` must implement the requested `--check` behavior without widening scope.
+ - No code fence.
+ - No explanation outside the JSON object.
+ """
+ ).rstrip() + "\n"
+
+
+def build_impl_edit_spec_json(*, case_id: str, selected_target_file: str, mode: str | None, valid: bool, attempt_order: list[str], spec: dict[str, Any] | None, errors: list[str], attempts: list[dict[str, Any]]) -> dict[str, Any]:
+ return {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-edit-spec",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case_id,
+ "prepared_at": utc_now(),
+ "selected_target_file": selected_target_file,
+ "mode": mode,
+ "valid": valid,
+ "attempt_order": attempt_order,
+ "spec": spec,
+ "errors": errors,
+ "attempts": attempts,
+ }
+
+
+def prepare_implementation_case(
+ case: dict[str, Any],
+ *,
+ case_root: Path,
+ repo_root: Path,
+ repo_head: str,
+ allowed_relative_files: list[str],
+ agents_refs: list[str],
+) -> tuple[dict[str, Any], list[dict[str, Any]], list[str]]:
+ command_refs: list[dict[str, Any]] = []
+ proposal_failure_reasons: list[str] = []
+ proposal_prompt_path = case_root / "artifacts" / "proposal.prompt.txt"
+ proposal_retry_prompt_path = case_root / "artifacts" / "proposal.retry.prompt.txt"
+ proposal_edit_spec_path = case_root / "artifacts" / "proposal.edit-spec.json"
+ proposal_diff_path = case_root / "artifacts" / "proposal.diff"
+ proposal_summary_path = case_root / "artifacts" / "proposal.summary.json"
+
+ target_file = allowed_relative_files[0]
+ target_entry = TRIALS.read_w4_repo_text(repo_root, target_file)
+ target_excerpt = TRIALS.bounded_text_slice(target_entry["text"], char_limit=2200, line_limit=120)
+ agents_guidance, _ = TRIALS.trim_agents_guidance(agents_refs, char_limit=500)
+ exact_timeout_s = 300 if "5403" in LANGCHAIN_RUN_URL else 120
+ anchor_timeout_s = 300 if "5403" in LANGCHAIN_RUN_URL else 120
+
+ # If the bounded implementation contract is already satisfied on the current HEAD,
+ # keep the scenario honest and pass it through the same mutation pipeline as a no-op.
+ satisfaction_refs, acceptance_ok = TRIALS.run_acceptance_checks(
+ case_root,
+ repo_root=repo_root,
+ checks=case.get("acceptance_checks", []),
+ label_prefix="proposal-satisfaction",
+ )
+ command_refs.extend(satisfaction_refs)
+ if acceptance_ok:
+ write_text(
+ proposal_prompt_path,
+ "NO-OP: the implementation contract is already satisfied at the current repo HEAD; no edit-spec prompt was sent.",
+ )
+ write_text(
+ proposal_retry_prompt_path,
+ "NO-OP: anchor fallback was not needed because the implementation contract is already satisfied.",
+ )
+ write_text_exact(proposal_diff_path, "")
+ write_json(
+ proposal_edit_spec_path,
+ build_impl_edit_spec_json(
+ case_id=case["case_id"],
+ selected_target_file=target_file,
+ mode="preexisting_noop",
+ valid=True,
+ attempt_order=[],
+ spec=None,
+ errors=[],
+ attempts=[],
+ ),
+ )
+ proposal_summary = {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-summary",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "prepared_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "lane": case.get("lane"),
+ "repo_root": str(repo_root),
+ "base_head": repo_head,
+ "allowed_files": allowed_relative_files,
+ "source_refs": case.get("source_refs", []),
+ "agents_refs": agents_refs,
+ "selected_target_file": target_file,
+ "edit_contract": "preexisting-noop",
+ "edit_spec_mode": "preexisting_noop",
+ "edit_spec_valid": True,
+ "builder_match_count": 0,
+ "rendered_diff_valid": True,
+ "proposal_valid": True,
+ "proposal_failure_reasons": [],
+ "touched_files": [],
+ "command_artifacts": [
+ path
+ for ref in command_refs
+ for path in (ref["stdout_path"], ref["stderr_path"], ref["command_meta"])
+ ],
+ }
+ write_json(proposal_summary_path, proposal_summary)
+ return proposal_summary, command_refs, []
+
+ attempt_order: list[str] = []
+ attempts: list[dict[str, Any]] = []
+ final_spec: dict[str, Any] | None = None
+ final_mode: str | None = None
+ candidate_text: str | None = None
+ builder_match_count = 0
+
+ exact_prompt = build_impl_exact_prompt(case, target_file=target_file, target_excerpt=target_excerpt, agents_guidance=agents_guidance)
+ exact_command_ref, exact_qwen = TRIALS.run_qwen_prompt(
+ case_root=case_root,
+ prompt_path=proposal_prompt_path,
+ label="proposal-edit-spec-exact",
+ prompt_text=exact_prompt,
+ max_tokens=260,
+ timeout_s=exact_timeout_s,
+ )
+ command_refs.append(exact_command_ref)
+ attempt_order.append("exact_replace")
+ exact_errors: list[str] = []
+ exact_raw = str(exact_qwen.get("answer") or "")
+ exact_spec: dict[str, Any] | None = None
+ if (
+ bool(exact_qwen.get("ok"))
+ and exact_qwen.get("http_status") == 200
+ and exact_command_ref["exit_code"] == 0
+ and not exact_command_ref["timed_out"]
+ ):
+ try:
+ exact_spec = TRIALS.parse_w4_edit_spec(
+ exact_raw,
+ expected_mode="exact_replace",
+ selected_target_file=target_file,
+ )
+ except (json.JSONDecodeError, ValueError) as exc:
+ exact_errors.append(f"exact edit-spec parse failure: {type(exc).__name__}: {exc}")
+ else:
+ exact_errors.append(str(exact_qwen.get("error") or "exact edit-spec transport failure"))
+ exact_match_count = 0
+ exact_candidate_text: str | None = None
+ if exact_spec is not None:
+ exact_match_count, exact_candidate_text = TRIALS.apply_exact_replace_to_text(
+ target_entry["text"],
+ old_text=exact_spec["old_text"],
+ new_text=exact_spec["new_text"],
+ )
+ if exact_match_count != 1:
+ exact_errors.append(f"exact_replace old_text match count must equal 1, observed {exact_match_count}")
+ attempts.append(
+ {
+ "mode": "exact_replace",
+ "raw_answer": exact_raw,
+ "valid": not exact_errors and exact_candidate_text is not None,
+ "errors": exact_errors,
+ "match_count": exact_match_count,
+ "spec": exact_spec,
+ }
+ )
+
+ if exact_candidate_text is not None and not exact_errors:
+ final_spec = exact_spec
+ final_mode = "exact_replace"
+ candidate_text = exact_candidate_text
+ builder_match_count = exact_match_count
+ else:
+ anchor_prompt = build_impl_anchor_prompt(
+ case,
+ target_file=target_file,
+ target_excerpt=target_excerpt,
+ previous_spec=exact_spec,
+ fallback_reason="\n".join(exact_errors or ["exact_replace was not uniquely applicable"]),
+ )
+ anchor_command_ref, anchor_qwen = TRIALS.run_qwen_prompt(
+ case_root=case_root,
+ prompt_path=proposal_retry_prompt_path,
+ label="proposal-edit-spec-anchor",
+ prompt_text=anchor_prompt,
+ max_tokens=320,
+ timeout_s=anchor_timeout_s,
+ )
+ command_refs.append(anchor_command_ref)
+ attempt_order.append("anchored_replace")
+ anchor_errors: list[str] = []
+ anchor_raw = str(anchor_qwen.get("answer") or "")
+ anchor_spec: dict[str, Any] | None = None
+ if (
+ bool(anchor_qwen.get("ok"))
+ and anchor_qwen.get("http_status") == 200
+ and anchor_command_ref["exit_code"] == 0
+ and not anchor_command_ref["timed_out"]
+ ):
+ try:
+ anchor_spec = TRIALS.parse_w4_edit_spec(
+ anchor_raw,
+ expected_mode="anchored_replace",
+ selected_target_file=target_file,
+ )
+ except (json.JSONDecodeError, ValueError) as exc:
+ anchor_errors.append(f"anchor edit-spec parse failure: {type(exc).__name__}: {exc}")
+ else:
+ anchor_errors.append(str(anchor_qwen.get("error") or "anchor edit-spec transport failure"))
+ anchor_match_count = 0
+ anchor_candidate_text: str | None = None
+ if anchor_spec is not None:
+ anchor_match_count, anchor_candidate_text = TRIALS.apply_anchored_replace_to_text(
+ target_entry["text"],
+ anchor_before=anchor_spec["anchor_before"],
+ old_text=anchor_spec["old_text"],
+ new_text=anchor_spec["new_text"],
+ anchor_after=anchor_spec["anchor_after"],
+ )
+ if anchor_match_count != 1:
+ anchor_errors.append(f"anchored_replace match count must equal 1, observed {anchor_match_count}")
+ attempts.append(
+ {
+ "mode": "anchored_replace",
+ "raw_answer": anchor_raw,
+ "valid": not anchor_errors and anchor_candidate_text is not None,
+ "errors": anchor_errors,
+ "match_count": anchor_match_count,
+ "spec": anchor_spec,
+ }
+ )
+ if anchor_candidate_text is not None and not anchor_errors:
+ final_spec = anchor_spec
+ final_mode = "anchored_replace"
+ candidate_text = anchor_candidate_text
+ builder_match_count = anchor_match_count
+ else:
+ proposal_failure_reasons.extend(exact_errors)
+ proposal_failure_reasons.extend(anchor_errors)
+
+ touched_files: list[str] = []
+ rendered_diff_valid = False
+ if final_spec is not None and candidate_text is not None:
+ diff_text = TRIALS.build_git_unified_diff(
+ relative_path=target_file,
+ before_text=target_entry["text"],
+ after_text=candidate_text,
+ )
+ write_text_exact(proposal_diff_path, diff_text)
+ if not diff_text.strip():
+ proposal_failure_reasons.append("deterministic diff builder produced an empty diff")
+ else:
+ inspection = TRIALS.inspect_w4_diff_text(diff_text, allowed_relative_files=allowed_relative_files)
+ touched_files = inspection["touched_files"]
+ if inspection["failure_reasons"]:
+ proposal_failure_reasons.extend(inspection["failure_reasons"])
+ elif touched_files != [target_file]:
+ proposal_failure_reasons.append("deterministic diff builder must touch exactly the selected target file")
+ else:
+ apply_check_raw = TRIALS.git_command(repo_root, ["apply", "--check", str(proposal_diff_path)], timeout_s=60)
+ apply_check_ref = TRIALS.persist_command_result(case_root, "proposal-apply-check", apply_check_raw)
+ command_refs.append(apply_check_ref)
+ if apply_check_raw["exit_code"] != 0 or apply_check_raw["timed_out"]:
+ proposal_failure_reasons.append("git apply --check failed against the current repo HEAD")
+ stderr = apply_check_raw.get("stderr", "").strip()
+ if stderr:
+ proposal_failure_reasons.append(stderr)
+ else:
+ rendered_diff_valid = True
+ else:
+ write_text_exact(proposal_diff_path, "")
+
+ write_json(
+ proposal_edit_spec_path,
+ build_impl_edit_spec_json(
+ case_id=case["case_id"],
+ selected_target_file=target_file,
+ mode=final_mode,
+ valid=not proposal_failure_reasons and final_spec is not None,
+ attempt_order=attempt_order,
+ spec=final_spec,
+ errors=proposal_failure_reasons.copy(),
+ attempts=attempts,
+ ),
+ )
+
+ proposal_summary = {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-summary",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "prepared_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "lane": case.get("lane"),
+ "repo_root": str(repo_root),
+ "base_head": repo_head,
+ "allowed_files": allowed_relative_files,
+ "source_refs": case.get("source_refs", []),
+ "agents_refs": agents_refs,
+ "selected_target_file": target_file,
+ "edit_contract": "hybrid-exact-then-anchor",
+ "edit_spec_mode": final_mode,
+ "edit_spec_valid": final_spec is not None and not proposal_failure_reasons,
+ "builder_match_count": builder_match_count,
+ "rendered_diff_valid": rendered_diff_valid,
+ "proposal_valid": not proposal_failure_reasons,
+ "proposal_failure_reasons": proposal_failure_reasons,
+ "touched_files": touched_files,
+ "command_artifacts": [
+ path
+ for ref in command_refs
+ for path in (ref["stdout_path"], ref["stderr_path"], ref["command_meta"])
+ ],
+ }
+ write_json(proposal_summary_path, proposal_summary)
+ return proposal_summary, command_refs, proposal_failure_reasons
+
+
+def prepare_mutation_proposal(case: dict[str, Any], *, log_root: Path) -> tuple[dict[str, Any], list[dict[str, Any]], list[str], Path]:
+ case_root = scenario_root(log_root, case["case_id"])
+ repo_root = repo_root_for_scenario(case)
+ TRIALS.ensure_repo_tracked_clean(repo_root)
+ repo_head = TRIALS.git_head(repo_root)
+ allowed_relative_files = TRIALS.relative_repo_paths(repo_root, case["expected_result"]["allowed_files"])
+ with patched_repo_root_for_w5():
+ agents_refs = TRIALS.collect_applicable_agents_refs(case)
+
+ if case["execution_mode"] == "qwen_patch":
+ proposal_summary, command_refs, failures = TRIALS.prepare_w4_docs_case(
+ case,
+ case_root=case_root,
+ repo_root=repo_root,
+ repo_head=repo_head,
+ allowed_relative_files=allowed_relative_files,
+ agents_refs=agents_refs,
+ )
+ proposal_summary["wave_id"] = WAVE_ID
+ write_json(case_root / "artifacts" / "proposal.summary.json", proposal_summary)
+ return proposal_summary, command_refs, failures, repo_root
+
+ if case["execution_mode"] == "script_refresh":
+ proposal_prompt_path = case_root / "artifacts" / "proposal.prompt.txt"
+ proposal_diff_path = case_root / "artifacts" / "proposal.diff"
+ builder_command = case.get("mutation_policy", {}).get("builder_command") or []
+ with patched_repo_root_for_w5():
+ prompt_text = TRIALS.build_w4_script_refresh_plan(case, allowed_relative_files=allowed_relative_files)
+ write_text(proposal_prompt_path, prompt_text)
+ write_text_exact(proposal_diff_path, "# script_refresh case\n# diff is produced only after approved worktree execution\n")
+ proposal_valid = bool(builder_command)
+ failures = [] if proposal_valid else ["missing builder command for script_refresh case"]
+ proposal_summary = {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-summary",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "prepared_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "lane": case.get("lane"),
+ "repo_root": str(repo_root),
+ "base_head": repo_head,
+ "allowed_files": allowed_relative_files,
+ "source_refs": case.get("source_refs", []),
+ "agents_refs": agents_refs,
+ "edit_contract": "script_refresh",
+ "edit_spec_mode": None,
+ "edit_spec_valid": False,
+ "builder_match_count": 0,
+ "rendered_diff_valid": False,
+ "proposal_valid": proposal_valid,
+ "proposal_failure_reasons": failures,
+ "touched_files": [],
+ "builder_command": builder_command,
+ "command_artifacts": [],
+ }
+ write_json(case_root / "artifacts" / "proposal.summary.json", proposal_summary)
+ return proposal_summary, [], failures, repo_root
+
+ proposal_summary, command_refs, failures = prepare_implementation_case(
+ case,
+ case_root=case_root,
+ repo_root=repo_root,
+ repo_head=repo_head,
+ allowed_relative_files=allowed_relative_files,
+ agents_refs=agents_refs,
+ )
+ return proposal_summary, command_refs, failures, repo_root
+
+
+def run_worktree_preview(
+ case: dict[str, Any],
+ *,
+ log_root: Path,
+ repo_root: Path,
+) -> tuple[bool, list[str], list[dict[str, Any]], list[str], str | None]:
+ case_root = scenario_root(log_root, case["case_id"])
+ proposal_summary_path = case_root / "artifacts" / "proposal.summary.json"
+ proposal_diff_path = case_root / "artifacts" / "proposal.diff"
+ worktree_manifest_path = case_root / "artifacts" / "worktree.manifest.json"
+ landing_diff_path = case_root / "artifacts" / "landing.diff"
+ proposal_summary = load_json(proposal_summary_path)
+ allowed_relative = set(proposal_summary.get("allowed_files") or [])
+ base_head = str(proposal_summary.get("base_head") or "")
+ diff_text = proposal_diff_path.read_text(encoding="utf-8") if proposal_diff_path.exists() else ""
+
+ command_refs: list[dict[str, Any]] = []
+ artifact_refs = proposal_artifact_refs(case_root)
+ worktree_path, add_raw = TRIALS.with_temp_worktree(repo_root, case_id=case["case_id"], log_root=log_root)
+ add_ref = TRIALS.persist_command_result(case_root, "worktree-add", add_raw)
+ command_refs.append(add_ref)
+ artifact_refs.extend([add_ref["stdout_path"], add_ref["stderr_path"], add_ref["command_meta"]])
+ if add_raw["exit_code"] != 0 or add_raw["timed_out"]:
+ if worktree_path.exists():
+ worktree_path.rmdir()
+ return False, [], command_refs, artifact_refs, "preflight_failure"
+
+ neighbor_links = TRIALS.ensure_w4_worktree_neighbor_links(worktree_path)
+ worktree_manifest = {
+ "artifact_kind": "aoa.local-ai-trial.w5-worktree-manifest",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "created_at": utc_now(),
+ "repo_root": str(repo_root),
+ "worktree_path": str(worktree_path),
+ "base_head": base_head,
+ "execution_mode": case["execution_mode"],
+ "neighbor_links": neighbor_links,
+ }
+ write_json(worktree_manifest_path, worktree_manifest)
+ artifact_refs.append(str(worktree_manifest_path))
+
+ changed_files: list[str] = []
+ failure_class: str | None = None
+ try:
+ if case["execution_mode"] in {"qwen_patch", "implementation_patch"}:
+ if diff_text.strip():
+ apply_check_raw = TRIALS.git_command(worktree_path, ["apply", "--check", str(proposal_diff_path)], timeout_s=60)
+ apply_check_ref = TRIALS.persist_command_result(case_root, "worktree-apply-check", apply_check_raw)
+ command_refs.append(apply_check_ref)
+ artifact_refs.extend([apply_check_ref["stdout_path"], apply_check_ref["stderr_path"], apply_check_ref["command_meta"]])
+ if apply_check_raw["exit_code"] != 0 or apply_check_raw["timed_out"]:
+ failure_class = "proposal_invalid"
+ raise RuntimeError("git apply --check failed in isolated worktree")
+
+ apply_raw = TRIALS.git_command(worktree_path, ["apply", str(proposal_diff_path)], timeout_s=60)
+ apply_ref = TRIALS.persist_command_result(case_root, "worktree-apply", apply_raw)
+ command_refs.append(apply_ref)
+ artifact_refs.extend([apply_ref["stdout_path"], apply_ref["stderr_path"], apply_ref["command_meta"]])
+ if apply_raw["exit_code"] != 0 or apply_raw["timed_out"]:
+ failure_class = "proposal_invalid"
+ raise RuntimeError("git apply failed in isolated worktree")
+ else:
+ builder_command = case.get("mutation_policy", {}).get("builder_command") or []
+ builder_raw = TRIALS.run_command(builder_command, cwd=worktree_path, timeout_s=600)
+ builder_ref = TRIALS.persist_command_result(case_root, "worktree-builder", builder_raw)
+ command_refs.append(builder_ref)
+ artifact_refs.extend([builder_ref["stdout_path"], builder_ref["stderr_path"], builder_ref["command_meta"]])
+ if builder_raw["exit_code"] != 0 or builder_raw["timed_out"]:
+ failure_class = "post_change_validation_failure"
+ raise RuntimeError("builder command failed in isolated worktree")
+
+ changed_files = TRIALS.list_changed_files(worktree_path)
+ unauthorized = sorted(item for item in changed_files if item not in allowed_relative)
+ if unauthorized:
+ failure_class = "unauthorized_scope_expansion"
+ raise RuntimeError("changed files outside allowed scope: " + ", ".join(unauthorized))
+
+ landing_raw = TRIALS.build_landing_diff(worktree_path, diff_path=landing_diff_path)
+ landing_ref = TRIALS.persist_command_result(case_root, "worktree-landing-diff", landing_raw)
+ command_refs.append(landing_ref)
+ artifact_refs.extend([landing_ref["stdout_path"], landing_ref["stderr_path"], landing_ref["command_meta"], str(landing_diff_path)])
+
+ acceptance_refs, acceptance_ok = TRIALS.run_acceptance_checks(
+ case_root,
+ repo_root=worktree_path,
+ checks=case.get("acceptance_checks", []),
+ label_prefix="worktree-acceptance",
+ )
+ command_refs.extend(acceptance_refs)
+ for ref in acceptance_refs:
+ artifact_refs.extend([ref["stdout_path"], ref["stderr_path"], ref["command_meta"]])
+ if not acceptance_ok:
+ failure_class = "post_change_validation_failure"
+ raise RuntimeError("worktree acceptance failed")
+
+ return True, changed_files, command_refs, artifact_refs, None
+ except RuntimeError:
+ return False, changed_files, command_refs, artifact_refs, failure_class or "proposal_invalid"
+ finally:
+ remove_raw = TRIALS.remove_temp_worktree(repo_root, worktree_path)
+ remove_ref = TRIALS.persist_command_result(case_root, "worktree-remove", remove_raw)
+ command_refs.append(remove_ref)
+ artifact_refs.extend([remove_ref["stdout_path"], remove_ref["stderr_path"], remove_ref["command_meta"]])
+ write_json(
+ worktree_manifest_path,
+ {
+ **worktree_manifest,
+ "removed_at": utc_now(),
+ "remove_exit_code": remove_raw["exit_code"],
+ "remove_timed_out": remove_raw["timed_out"],
+ },
+ )
+
+
+def land_validated_diff(
+ case: dict[str, Any],
+ *,
+ log_root: Path,
+ repo_root: Path,
+ base_head: str | None,
+) -> tuple[bool, list[dict[str, Any]], list[str], str | None]:
+ case_root = scenario_root(log_root, case["case_id"])
+ landing_diff_path = case_root / "artifacts" / "landing.diff"
+ command_refs: list[dict[str, Any]] = []
+ artifact_refs = w5_report_artifact_refs(log_root, case["case_id"], extra=proposal_artifact_refs(case_root))
+
+ TRIALS.ensure_repo_tracked_clean(repo_root)
+ if base_head and TRIALS.git_head(repo_root) != base_head:
+ return False, command_refs, artifact_refs, "landing_reapply_failure"
+
+ diff_text = landing_diff_path.read_text(encoding="utf-8") if landing_diff_path.exists() else ""
+ if diff_text.strip():
+ main_check_raw = TRIALS.git_command(repo_root, ["apply", "--check", str(landing_diff_path)], timeout_s=60)
+ main_check_ref = TRIALS.persist_command_result(case_root, "landing-apply-check", main_check_raw)
+ command_refs.append(main_check_ref)
+ artifact_refs.extend([main_check_ref["stdout_path"], main_check_ref["stderr_path"], main_check_ref["command_meta"]])
+ if main_check_raw["exit_code"] != 0 or main_check_raw["timed_out"]:
+ return False, command_refs, artifact_refs, "landing_reapply_failure"
+
+ main_apply_raw = TRIALS.git_command(repo_root, ["apply", str(landing_diff_path)], timeout_s=60)
+ main_apply_ref = TRIALS.persist_command_result(case_root, "landing-apply", main_apply_raw)
+ command_refs.append(main_apply_ref)
+ artifact_refs.extend([main_apply_ref["stdout_path"], main_apply_ref["stderr_path"], main_apply_ref["command_meta"]])
+ if main_apply_raw["exit_code"] != 0 or main_apply_raw["timed_out"]:
+ return False, command_refs, artifact_refs, "landing_reapply_failure"
+
+ acceptance_refs, acceptance_ok = TRIALS.run_acceptance_checks(
+ case_root,
+ repo_root=repo_root,
+ checks=case.get("acceptance_checks", []),
+ label_prefix="landing-acceptance",
+ )
+ command_refs.extend(acceptance_refs)
+ for ref in acceptance_refs:
+ artifact_refs.extend([ref["stdout_path"], ref["stderr_path"], ref["command_meta"]])
+ if not acceptance_ok:
+ if diff_text.strip():
+ TRIALS.git_command(repo_root, ["apply", "-R", str(landing_diff_path)], timeout_s=60)
+ return False, command_refs, artifact_refs, "post_change_validation_failure"
+ return True, command_refs, artifact_refs, None
+
+
+def commit_checkpoint(case: dict[str, Any], *, repo_root: Path, case_root: Path) -> tuple[str | None, list[dict[str, Any]], list[str], str | None]:
+ command_refs: list[dict[str, Any]] = []
+ artifact_refs: list[str] = []
+ changed_files = TRIALS.list_changed_files(repo_root)
+ if not changed_files:
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-commit-checkpoint",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "committed_at": utc_now(),
+ "commit_ref": None,
+ "commit_message": None,
+ "status": "no-op-clean",
+ }
+ path = case_root / "node-artifacts" / "commit-checkpoint.json"
+ write_json(path, payload)
+ artifact_refs.append(str(path))
+ return "no-op-clean", command_refs, artifact_refs, None
+
+ commit_message = COMMIT_MESSAGES[case["case_id"]]
+ add_raw = TRIALS.git_command(repo_root, ["add", "--", *changed_files], timeout_s=60)
+ add_ref = TRIALS.persist_command_result(case_root, "checkpoint-add", add_raw)
+ command_refs.append(add_ref)
+ artifact_refs.extend([add_ref["stdout_path"], add_ref["stderr_path"], add_ref["command_meta"]])
+ if add_raw["exit_code"] != 0 or add_raw["timed_out"]:
+ return None, command_refs, artifact_refs, "checkpoint_add_failed"
+
+ commit_raw = TRIALS.git_command(repo_root, ["commit", "-m", commit_message], timeout_s=120)
+ commit_ref = TRIALS.persist_command_result(case_root, "checkpoint-commit", commit_raw)
+ command_refs.append(commit_ref)
+ artifact_refs.extend([commit_ref["stdout_path"], commit_ref["stderr_path"], commit_ref["command_meta"]])
+ if commit_raw["exit_code"] != 0 or commit_raw["timed_out"]:
+ return None, command_refs, artifact_refs, "checkpoint_commit_failed"
+
+ sha_raw = TRIALS.git_command(repo_root, ["rev-parse", "HEAD"], timeout_s=30)
+ sha_ref = TRIALS.persist_command_result(case_root, "checkpoint-head", sha_raw)
+ command_refs.append(sha_ref)
+ artifact_refs.extend([sha_ref["stdout_path"], sha_ref["stderr_path"], sha_ref["command_meta"]])
+ if sha_raw["exit_code"] != 0 or sha_raw["timed_out"]:
+ return None, command_refs, artifact_refs, "checkpoint_head_failed"
+ sha = sha_raw["stdout"].strip()
+
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-commit-checkpoint",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "committed_at": utc_now(),
+ "commit_ref": sha,
+ "commit_message": commit_message,
+ "status": "committed",
+ }
+ path = case_root / "node-artifacts" / "commit-checkpoint.json"
+ write_json(path, payload)
+ artifact_refs.append(str(path))
+ return sha, command_refs, artifact_refs, None
+
+
+def make_index_payload(log_root: Path, mirror_root: Path) -> dict[str, Any]:
+ cases = available_cases()
+ case_entries: list[dict[str, Any]] = []
+ pass_count = 0
+ fail_count = 0
+ planned_count = 0
+ critical_failure_count = 0
+ unauthorized_scope_expansion = 0
+ post_change_validation_failure = 0
+ local_commit_refs: dict[str, str | None] = {}
+ pause_resume_proved = False
+ implementation_case_passed = False
+ generated_case_passed = False
+
+ for case in cases:
+ result = load_result_summary(log_root, case["case_id"])
+ graph_state = load_graph_state(log_root, case["case_id"])
+ status = "planned"
+ if result:
+ status = result["status"]
+ if status == "pass":
+ pass_count += 1
+ elif status == "fail":
+ fail_count += 1
+ if result.get("failure_class") in CRITICAL_FAILURES:
+ critical_failure_count += 1
+ if result.get("failure_class") == "unauthorized_scope_expansion":
+ unauthorized_scope_expansion += 1
+ if result.get("failure_class") == "post_change_validation_failure":
+ post_change_validation_failure += 1
+ elif graph_state:
+ status = "paused" if graph_state.get("paused") else "in-progress"
+ else:
+ planned_count += 1
+
+ if case["case_id"] == "stack-sync-federation-check-mode":
+ implementation_case_passed = bool(result and result.get("status") == "pass")
+ if graph_state:
+ history = graph_state.get("history", [])
+ pause_resume_proved = (
+ any(item.get("node") == "await_plan_freeze" and item.get("status") == "paused" for item in history)
+ and graph_state.get("resume_count", 0) > 0
+ and implementation_case_passed
+ )
+ if case["case_id"] == "aoa-routing-generated-surface-refresh":
+ generated_case_passed = bool(result and result.get("status") == "pass")
+
+ local_commit_refs[case["case_id"]] = (graph_state or {}).get("local_commit_ref")
+
+ entry = {
+ "case_id": case["case_id"],
+ "status": status,
+ "repo_scope": case["repo_scope"],
+ "task_family": case["task_family"],
+ "case_spec": str(scenario_root(log_root, case["case_id"]) / "case.spec.json"),
+ "summary": case["title"],
+ "current_node": (graph_state or {}).get("current_node"),
+ "approval_status": (graph_state or {}).get("approval_status"),
+ "milestone": (graph_state or {}).get("current_milestone"),
+ "local_commit_ref": (graph_state or {}).get("local_commit_ref"),
+ }
+ report_path = scenario_root(log_root, case["case_id"]) / "report.md"
+ if report_path.exists():
+ entry["report_md"] = str(mirror_root / TRIALS.case_report_name(WAVE_ID, case["case_id"]))
+ case_entries.append(entry)
+
+ gate_pass = (
+ pass_count == len(cases)
+ and critical_failure_count == 0
+ and pause_resume_proved
+ and implementation_case_passed
+ and generated_case_passed
+ and unauthorized_scope_expansion == 0
+ and post_change_validation_failure == 0
+ )
+
+ if gate_pass:
+ gate_result = "pass"
+ next_action = "W5 passed on promoted llama.cpp + LangGraph. Use this substrate as the bounded baseline for the next autonomy-focused wave."
+ elif planned_count == len(cases):
+ gate_result = "not-run"
+ next_action = "Materialize the W5 pilot, then start the first scenario at the plan_freeze milestone."
+ elif fail_count or critical_failure_count:
+ gate_result = "fail"
+ next_action = "Stop at W5, inspect the failed scenario packets, and remediate before any broader autonomy claim."
+ else:
+ gate_result = "in-progress"
+ next_action = "Continue the paused W5 scenarios through their next milestone gate."
+
+ return {
+ "artifact_kind": "aoa.local-ai-trial.wave-index",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "wave_title": W5_METADATA["title"],
+ "wave_summary": W5_METADATA["summary"],
+ "case_count": len(cases),
+ "status_counts": {
+ "pass": pass_count,
+ "fail": fail_count,
+ "planned": planned_count,
+ },
+ "gate_result": gate_result,
+ "next_action": next_action,
+ "cases": case_entries,
+ "gate_detail": {
+ "pass_count": pass_count,
+ "fail_count": fail_count,
+ "critical_failures": critical_failure_count,
+ "pause_resume_proved": pause_resume_proved,
+ "implementation_case_passed": implementation_case_passed,
+ "generated_case_passed": generated_case_passed,
+ "unauthorized_scope_expansion": unauthorized_scope_expansion,
+ "post_change_validation_failure": post_change_validation_failure,
+ "local_commit_refs": local_commit_refs,
+ "next_action": next_action,
+ },
+ }
+
+
+def summary_memo(log_root: Path, mirror_root: Path) -> str:
+ index_payload = make_index_payload(log_root, mirror_root)
+ gate = index_payload["gate_detail"]
+ return "\n".join(
+ [
+ "# W5 Summary",
+ "",
+ "## Wave Verdict",
+ f"- Gate result: `{index_payload['gate_result']}`",
+ f"- Pass count: `{gate['pass_count']}`",
+ f"- Fail count: `{gate['fail_count']}`",
+ f"- Pause/resume proved: `{gate['pause_resume_proved']}`",
+ f"- Generated case passed: `{gate['generated_case_passed']}`",
+ f"- Implementation case passed: `{gate['implementation_case_passed']}`",
+ "",
+ "## Substrate",
+ "- Runtime path: `llama.cpp -> langchain-api /run` on `http://127.0.0.1:5403/run`",
+ "- Orchestration layer: `LangGraph`",
+ "",
+ "## Next Action",
+ index_payload["next_action"],
+ "",
+ ]
+ )
+
+
+def refresh_w5_outputs(log_root: Path, mirror_root: Path) -> None:
+ index_payload = make_index_payload(log_root, mirror_root)
+ write_json(log_root / f"{INDEX_NAME}.json", index_payload)
+ index_md = TRIALS.render_wave_index_md(index_payload)
+ write_text(log_root / f"{INDEX_NAME}.md", index_md)
+ write_text(mirror_root / f"{INDEX_NAME}.md", index_md)
+ write_text(mirror_root / SUMMARY_MEMO_NAME, summary_memo(log_root, mirror_root))
+
+
+def build_graph(log_root: Path, mirror_root: Path):
+ def route_from_phase(state: W5State) -> Command[str]:
+ next_node = state.get("next_node") or "preflight"
+ return Command(update={"current_node": "route"}, goto=next_node)
+
+ def preflight(state: W5State) -> Command[str]:
+ case_id = state["case_id"]
+ case_root = scenario_root(log_root, case_id)
+ command_refs = list(state.get("command_refs", []))
+ artifact_refs = list(state.get("artifact_refs", []))
+ try:
+ ensure_w4_closeout_pass()
+ ensure_llamacpp_promotion_pass()
+
+ doctor_raw = TRIALS.run_command([absolute(SCRIPTS_ROOT / "aoa-doctor"), "--preset", "intel-full"], cwd=CONFIGS_ROOT, timeout_s=180)
+ doctor_ref = TRIALS.persist_command_result(case_root, "preflight-doctor", doctor_raw)
+ command_refs.append(doctor_ref)
+ artifact_refs.extend([doctor_ref["stdout_path"], doctor_ref["stderr_path"], doctor_ref["command_meta"]])
+ if doctor_raw["exit_code"] != 0 or doctor_raw["timed_out"]:
+ raise RuntimeError("aoa-doctor --preset intel-full failed")
+
+ for label, url in (
+ ("health-llamacpp", LANGCHAIN_RUN_URL.rsplit("/", 1)[0] + "/health"),
+ ("health-route-api", "http://127.0.0.1:5402/health"),
+ ("health-baseline", "http://127.0.0.1:5401/health"),
+ ):
+ health_ref, payload = build_health_check(case_root, label, url)
+ command_refs.append(health_ref)
+ artifact_refs.extend([health_ref["stdout_path"], health_ref["stderr_path"], health_ref["command_meta"]])
+ if health_ref["exit_code"] != 0 or payload.get("ok") is not True:
+ raise RuntimeError(f"preflight health failed for {url}")
+
+ history = record_event(state, node="preflight", status="pass", note="W4 closeout, llama.cpp promotion, and runtime health posture are green.")
+ node_json(
+ log_root,
+ case_id,
+ "preflight",
+ {
+ "checked_at": utc_now(),
+ "w4_closeout": str(BASELINE_W4_LOG_ROOT / "W4-closeout.json"),
+ "llamacpp_promotion": str(LLAMACPP_PROMOTION_ROOT / "latest.json"),
+ "run_url": LANGCHAIN_RUN_URL,
+ "status": "pass",
+ },
+ )
+ return Command(
+ update={
+ "current_node": "preflight",
+ "next_node": "load_scenario",
+ "history": history,
+ "command_refs": command_refs,
+ "artifact_refs": artifact_refs,
+ "paused": False,
+ "pause_reason": None,
+ "pause_milestone": None,
+ "failure_class": None,
+ "terminal_status": None,
+ },
+ goto="load_scenario",
+ )
+ except Exception as exc:
+ history = record_event(state, node="preflight", status="fail", note=str(exc))
+ case = load_case_spec(log_root, case_id)
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status="fail",
+ score_breakdown={"preflight_ok": False},
+ observed={
+ "highlights": ["W5 stopped before scenario execution because preflight failed."],
+ "failures": [str(exc)],
+ },
+ failure_class="preflight_failure",
+ reviewer_notes="The W5 preflight did not satisfy the required W4, llama.cpp, and runtime-health posture.",
+ boundary_notes=TRIALS.w4_boundary_note() if case["execution_mode"] != "read_only_summary" else TRIALS.w2_boundary_note(),
+ next_action="Repair the failing runtime prerequisite before retrying this W5 scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "preflight",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs,
+ "artifact_refs": artifact_refs,
+ "failure_class": "preflight_failure",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+
+ def load_scenario(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ history = record_event(state, node="load_scenario", status="pass", note=f"Loaded `{case['case_id']}` with execution_mode `{case['execution_mode']}`.")
+ node_json(
+ log_root,
+ case["case_id"],
+ "load-scenario",
+ {
+ "loaded_at": utc_now(),
+ "case_id": case["case_id"],
+ "execution_mode": case["execution_mode"],
+ "milestone_gates": case.get("milestone_gates", []),
+ "derived_from": case.get("derived_from"),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "load_scenario",
+ "next_node": "collect_evidence",
+ "execution_mode": case["execution_mode"],
+ "history": history,
+ },
+ goto="collect_evidence",
+ )
+
+ def collect_evidence(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ payload = collect_evidence_payload(case)
+ node_json(log_root, case["case_id"], "collect-evidence", payload)
+ history = record_event(state, node="collect_evidence", status="pass", note="Scenario refs, observed actions, and bounded scope were captured.")
+ return Command(
+ update={
+ "current_node": "collect_evidence",
+ "next_node": "draft_plan",
+ "history": history,
+ "artifact_refs": [*state.get("artifact_refs", []), str(node_artifacts_dir(log_root, case["case_id"]) / "collect-evidence.json")],
+ },
+ goto="draft_plan",
+ )
+
+ def draft_plan(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ payload = build_scenario_plan(case)
+ write_json(plan_path(log_root, case["case_id"]), payload)
+ node_json(log_root, case["case_id"], "draft-plan", payload)
+ history = record_event(state, node="draft_plan", status="pass", note="A deterministic bounded plan was drafted for the next milestone review.")
+ return Command(
+ update={
+ "current_node": "draft_plan",
+ "next_node": "await_plan_freeze",
+ "history": history,
+ "artifact_refs": [*state.get("artifact_refs", []), str(plan_path(log_root, case["case_id"]))],
+ },
+ goto="await_plan_freeze",
+ )
+
+ def milestone_gate(state: W5State, *, milestone_id: str, next_node: str, node_name: str) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ history = list(state.get("history", []))
+ forced_pause_seen = list(state.get("forced_pause_seen", []))
+ existing = approval_payload(log_root, case["case_id"])
+ approval_status = interpret_approval_status(existing, milestone_id=milestone_id)
+ force_pause = case.get("force_pause_on_milestone") == milestone_id and milestone_id not in forced_pause_seen
+
+ if state.get("until") == "milestone" or force_pause:
+ write_approval_status(
+ log_root,
+ case=case,
+ milestone_id=milestone_id,
+ base_head=state.get("base_head"),
+ notes=f"Review the W5 `{milestone_id}` boundary and set status to approved or rejected before resuming.",
+ )
+ if force_pause:
+ forced_pause_seen.append(milestone_id)
+ history = record_event(
+ {"history": history},
+ node=node_name,
+ status="paused",
+ note=f"W5 paused at milestone `{milestone_id}`.",
+ )
+ write_interrupt(log_root, case_id=case["case_id"], milestone_id=milestone_id, reason="milestone_pending")
+ return Command(
+ update={
+ "current_node": node_name,
+ "next_node": node_name,
+ "history": history,
+ "paused": True,
+ "pause_reason": "milestone_pending",
+ "pause_milestone": milestone_id,
+ "approval_status": "pending",
+ "current_milestone": milestone_id,
+ "terminal_status": "paused",
+ "forced_pause_seen": forced_pause_seen,
+ },
+ goto=END,
+ )
+
+ if approval_status == "approved":
+ history = record_event(
+ {"history": history},
+ node=node_name,
+ status="approved",
+ note=f"Approval granted for `{milestone_id}`.",
+ )
+ return Command(
+ update={
+ "current_node": node_name,
+ "next_node": next_node,
+ "history": history,
+ "paused": False,
+ "pause_reason": None,
+ "pause_milestone": None,
+ "approval_status": "approved",
+ "current_milestone": milestone_id,
+ "terminal_status": None,
+ "forced_pause_seen": forced_pause_seen,
+ },
+ goto=next_node,
+ )
+
+ if approval_status == "rejected":
+ finalize_rejected_case(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ milestone_id=milestone_id,
+ command_refs=list(state.get("command_refs", [])),
+ artifact_refs=[*state.get("artifact_refs", []), *w5_report_artifact_refs(log_root, case["case_id"])],
+ )
+ history = record_event(
+ {"history": history},
+ node=node_name,
+ status="rejected",
+ note=f"Approval was explicitly rejected at `{milestone_id}`.",
+ )
+ return Command(
+ update={
+ "current_node": node_name,
+ "next_node": "finalize_report",
+ "history": history,
+ "paused": False,
+ "pause_reason": None,
+ "pause_milestone": milestone_id,
+ "approval_status": "rejected",
+ "current_milestone": milestone_id,
+ "terminal_status": "rejected",
+ "failure_class": "approval_rejected",
+ "forced_pause_seen": forced_pause_seen,
+ },
+ goto="finalize_report",
+ )
+
+ write_approval_status(
+ log_root,
+ case=case,
+ milestone_id=milestone_id,
+ base_head=state.get("base_head"),
+ notes=f"Review the W5 `{milestone_id}` boundary and set status to approved or rejected before resuming.",
+ )
+ history = record_event(
+ {"history": history},
+ node=node_name,
+ status="paused",
+ note=f"W5 paused at milestone `{milestone_id}`.",
+ )
+ write_interrupt(log_root, case_id=case["case_id"], milestone_id=milestone_id, reason="milestone_pending")
+ return Command(
+ update={
+ "current_node": node_name,
+ "next_node": node_name,
+ "history": history,
+ "paused": True,
+ "pause_reason": "milestone_pending",
+ "pause_milestone": milestone_id,
+ "approval_status": "pending",
+ "current_milestone": milestone_id,
+ "terminal_status": "paused",
+ "forced_pause_seen": forced_pause_seen,
+ },
+ goto=END,
+ )
+
+ def await_plan_freeze(state: W5State) -> Command[str]:
+ next_node = "execute_read_only_actions" if state["execution_mode"] == "read_only_summary" else "build_proposal"
+ return milestone_gate(state, milestone_id="plan_freeze", next_node=next_node, node_name="await_plan_freeze")
+
+ def execute_read_only_actions(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ result = run_read_only_scenario(case, log_root=log_root, mirror_root=mirror_root)
+ history = record_event(
+ state,
+ node="execute_read_only_actions",
+ status=result["status"],
+ note="Executed the bounded read-only scenario after plan approval.",
+ extra={"failure_class": result.get("failure_class")},
+ )
+ return Command(
+ update={
+ "current_node": "execute_read_only_actions",
+ "next_node": "draft_summary",
+ "history": history,
+ "command_refs": result.get("command_refs", []),
+ "artifact_refs": result.get("artifact_refs", []),
+ "failure_class": result.get("failure_class"),
+ "terminal_status": result["status"],
+ },
+ goto="draft_summary",
+ )
+
+ def draft_summary(state: W5State) -> Command[str]:
+ result = load_result_summary(log_root, state["case_id"]) or {}
+ history = record_event(
+ state,
+ node="draft_summary",
+ status=str(result.get("status") or "fail"),
+ note="Read-only scenario summary was recorded into the standard packet shape.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "draft-summary",
+ {
+ "recorded_at": utc_now(),
+ "result_status": result.get("status"),
+ "failure_class": result.get("failure_class"),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "draft_summary",
+ "next_node": "finalize_report",
+ "history": history,
+ },
+ goto="finalize_report",
+ )
+
+ def build_proposal(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ try:
+ proposal_summary, command_refs, failures, repo_root = prepare_mutation_proposal(case, log_root=log_root)
+ except Exception as exc:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=list(state.get("command_refs", [])),
+ artifact_refs=w5_report_artifact_refs(log_root, case["case_id"]),
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": False,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": ["Mutation proposal did not complete cleanly."],
+ "failures": [f"{type(exc).__name__}: {exc}"],
+ },
+ failure_class="proposal_invalid",
+ reviewer_notes="The W5 mutation proposal could not be prepared inside the bounded scope.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Inspect the proposal preparation artifacts and repair the bounded proposal before retrying.",
+ )
+ history = record_event(state, node="build_proposal", status="fail", note=f"{type(exc).__name__}: {exc}")
+ return Command(
+ update={
+ "current_node": "build_proposal",
+ "next_node": "finalize_report",
+ "history": history,
+ "failure_class": "proposal_invalid",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+
+ history = record_event(
+ state,
+ node="build_proposal",
+ status="pass" if proposal_summary.get("proposal_valid") else "fail",
+ note="Prepared the bounded mutation proposal for W5.",
+ )
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [
+ *state.get("artifact_refs", []),
+ *proposal_artifact_refs(scenario_root(log_root, case["case_id"])),
+ *w5_report_artifact_refs(log_root, case["case_id"]),
+ ]
+ if not proposal_summary.get("proposal_valid"):
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": False,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": ["Mutation proposal was prepared but did not validate cleanly."],
+ "failures": proposal_summary.get("proposal_failure_reasons") or failures or ["proposal marked invalid"],
+ },
+ failure_class="proposal_invalid",
+ reviewer_notes="The W5 mutation proposal did not satisfy the bounded proposal contract.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Refresh the proposal, review the new packet, and retry the scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "build_proposal",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "proposal_valid": False,
+ "failure_class": "proposal_invalid",
+ "terminal_status": "fail",
+ "base_head": proposal_summary.get("base_head"),
+ },
+ goto="finalize_report",
+ )
+ return Command(
+ update={
+ "current_node": "build_proposal",
+ "next_node": "await_first_mutation",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "proposal_valid": True,
+ "base_head": proposal_summary.get("base_head"),
+ },
+ goto="await_first_mutation",
+ )
+
+ def await_first_mutation(state: W5State) -> Command[str]:
+ return milestone_gate(state, milestone_id="first_mutation", next_node="worktree_apply", node_name="await_first_mutation")
+
+ def worktree_apply(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ repo_root = repo_root_for_scenario(case)
+ ok, changed_files, command_refs, artifact_refs, failure_class = run_worktree_preview(
+ case,
+ log_root=log_root,
+ repo_root=repo_root,
+ )
+ history = record_event(
+ state,
+ node="worktree_apply",
+ status="pass" if ok else "fail",
+ note="Executed the isolated worktree preview for the mutation scenario.",
+ extra={"failure_class": failure_class, "changed_files": changed_files},
+ )
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [*state.get("artifact_refs", []), *artifact_refs]
+ if not ok:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": True,
+ "first_mutation_approved": True,
+ "unauthorized_scope_expansion": failure_class == "unauthorized_scope_expansion",
+ "post_change_validation_failure": failure_class == "post_change_validation_failure",
+ },
+ observed={
+ "highlights": [f"Changed files observed in worktree preview: `{json.dumps(changed_files, ensure_ascii=True)}`."],
+ "failures": [failure_class or "worktree preview failed"],
+ "changed_files": changed_files,
+ },
+ failure_class=failure_class,
+ reviewer_notes="The W5 mutation scenario did not satisfy the isolated worktree preview contract.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Inspect the worktree preview artifacts before retrying the scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "worktree_apply",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "changed_files": changed_files,
+ "failure_class": failure_class,
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+ return Command(
+ update={
+ "current_node": "worktree_apply",
+ "next_node": "acceptance_validate",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "changed_files": changed_files,
+ "preview_ready": True,
+ },
+ goto="acceptance_validate",
+ )
+
+ def acceptance_validate(state: W5State) -> Command[str]:
+ history = record_event(
+ state,
+ node="acceptance_validate",
+ status="pass",
+ note="The isolated worktree acceptance checks passed and a landing diff is ready for review.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "acceptance-validate",
+ {
+ "checked_at": utc_now(),
+ "preview_ready": True,
+ "changed_files": state.get("changed_files", []),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "acceptance_validate",
+ "next_node": "await_landing",
+ "history": history,
+ },
+ goto="await_landing",
+ )
+
+ def await_landing(state: W5State) -> Command[str]:
+ return milestone_gate(state, milestone_id="landing", next_node="land_or_rollback", node_name="await_landing")
+
+ def land_or_rollback(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ repo_root = repo_root_for_scenario(case)
+ ok, command_refs, artifact_refs, failure_class = land_validated_diff(
+ case,
+ log_root=log_root,
+ repo_root=repo_root,
+ base_head=state.get("base_head"),
+ )
+ history = record_event(
+ state,
+ node="land_or_rollback",
+ status="pass" if ok else "fail",
+ note="Landing decision executed against the validated diff and main-repo acceptance checks.",
+ extra={"failure_class": failure_class},
+ )
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [*state.get("artifact_refs", []), *artifact_refs]
+ if not ok:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": True,
+ "first_mutation_approved": True,
+ "landing_approved": True,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": failure_class == "post_change_validation_failure",
+ },
+ observed={
+ "highlights": [f"Changed files: `{json.dumps(state.get('changed_files', []), ensure_ascii=True)}`."],
+ "failures": [failure_class or "landing failed"],
+ "changed_files": state.get("changed_files", []),
+ },
+ failure_class=failure_class,
+ reviewer_notes="The W5 mutation scenario failed during landing or post-landing validation.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Inspect the landing artifacts and repo state before retrying the scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "land_or_rollback",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "failure_class": failure_class,
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+ return Command(
+ update={
+ "current_node": "land_or_rollback",
+ "next_node": "commit_checkpoint",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ },
+ goto="commit_checkpoint",
+ )
+
+ def commit_checkpoint_node(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ repo_root = repo_root_for_scenario(case)
+ case_root = scenario_root(log_root, case["case_id"])
+ commit_ref, command_refs, artifact_refs, commit_failure = commit_checkpoint(case, repo_root=repo_root, case_root=case_root)
+ history = record_event(
+ state,
+ node="commit_checkpoint",
+ status="pass" if commit_failure is None else "fail",
+ note="Recorded the local mutation checkpoint for the landed scenario.",
+ extra={"local_commit_ref": commit_ref, "failure_class": commit_failure},
+ )
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [*state.get("artifact_refs", []), *artifact_refs]
+ if commit_failure is not None:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": True,
+ "first_mutation_approved": True,
+ "landing_approved": True,
+ "checkpoint_committed": False,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": [f"Landed changed files: `{json.dumps(state.get('changed_files', []), ensure_ascii=True)}`."],
+ "failures": [commit_failure],
+ "changed_files": state.get("changed_files", []),
+ },
+ failure_class="checkpoint_commit_failure",
+ reviewer_notes="The W5 mutation scenario landed but could not record the required local commit checkpoint.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Repair the git commit checkpoint and restore a clean tracked state before retrying broader W5 work.",
+ )
+ return Command(
+ update={
+ "current_node": "commit_checkpoint",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "failure_class": "checkpoint_commit_failure",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="pass",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": True,
+ "first_mutation_approved": True,
+ "landing_approved": True,
+ "checkpoint_committed": True,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": [
+ f"Changed files: `{json.dumps(state.get('changed_files', []), ensure_ascii=True)}`.",
+ f"Local commit ref: `{commit_ref}`.",
+ ],
+ "failures": ["None."],
+ "changed_files": state.get("changed_files", []),
+ "local_commit_ref": commit_ref,
+ },
+ failure_class=None,
+ reviewer_notes="The W5 mutation scenario stayed inside approved scope, passed worktree and landing validation, and recorded the required local commit checkpoint.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Review the packet and decide whether to approve the next W5 scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "commit_checkpoint",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "local_commit_ref": commit_ref,
+ "local_commit_message": COMMIT_MESSAGES.get(case["case_id"]),
+ "terminal_status": "pass",
+ },
+ goto="finalize_report",
+ )
+
+ def finalize_report(state: W5State) -> Command[str]:
+ refresh_w5_outputs(log_root, mirror_root)
+ result = load_result_summary(log_root, state["case_id"])
+ terminal_status = state.get("terminal_status")
+ if result:
+ terminal_status = str(result.get("status") or terminal_status or "fail")
+ history = record_event(
+ state,
+ node="finalize_report",
+ status=terminal_status or "unknown",
+ note="W5 index and mirror summary were refreshed.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "finalize-report",
+ {
+ "finalized_at": utc_now(),
+ "terminal_status": terminal_status,
+ "wave_index": str(log_root / f"{INDEX_NAME}.json"),
+ "summary_memo": str(mirror_root / SUMMARY_MEMO_NAME),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "finalize_report",
+ "next_node": None,
+ "history": history,
+ "terminal_status": terminal_status,
+ },
+ goto=END,
+ )
+
+ graph = StateGraph(W5State)
+ graph.add_node("route_from_phase", route_from_phase)
+ graph.add_node("preflight", preflight)
+ graph.add_node("load_scenario", load_scenario)
+ graph.add_node("collect_evidence", collect_evidence)
+ graph.add_node("draft_plan", draft_plan)
+ graph.add_node("await_plan_freeze", await_plan_freeze)
+ graph.add_node("execute_read_only_actions", execute_read_only_actions)
+ graph.add_node("draft_summary", draft_summary)
+ graph.add_node("build_proposal", build_proposal)
+ graph.add_node("await_first_mutation", await_first_mutation)
+ graph.add_node("worktree_apply", worktree_apply)
+ graph.add_node("acceptance_validate", acceptance_validate)
+ graph.add_node("await_landing", await_landing)
+ graph.add_node("land_or_rollback", land_or_rollback)
+ graph.add_node("commit_checkpoint", commit_checkpoint_node)
+ graph.add_node("finalize_report", finalize_report)
+ graph.add_edge(START, "route_from_phase")
+ return graph.compile()
+
+
+def run_graph_scenario(log_root: Path, mirror_root: Path, *, case_id: str, until: str, resume: bool) -> W5State:
+ graph = build_graph(log_root, mirror_root)
+ existing = load_graph_state(log_root, case_id) or {}
+ state: W5State = {
+ **existing,
+ "case_id": case_id,
+ "until": until,
+ "paused": False,
+ "pause_reason": None,
+ "pause_milestone": None,
+ "current_node": existing.get("current_node"),
+ "next_node": existing.get("next_node") or ("await_plan_freeze" if resume else "preflight"),
+ "resume_count": int(existing.get("resume_count", 0)) + (1 if resume else 0),
+ "history": list(existing.get("history", [])),
+ "command_refs": list(existing.get("command_refs", [])),
+ "artifact_refs": list(existing.get("artifact_refs", [])),
+ "changed_files": list(existing.get("changed_files", [])),
+ "forced_pause_seen": list(existing.get("forced_pause_seen", [])),
+ }
+ final_state = graph.invoke(state)
+ save_graph_state(log_root, case_id, final_state)
+ refresh_w5_outputs(log_root, mirror_root)
+ return final_state
+
+
+def print_case_status(log_root: Path, case_id: str) -> None:
+ payload = {
+ "case_id": case_id,
+ "graph_state": load_graph_state(log_root, case_id),
+ "approval": approval_payload(log_root, case_id),
+ "result_summary": load_result_summary(log_root, case_id),
+ }
+ print(json.dumps(payload, indent=2, ensure_ascii=True))
+
+
+def print_all_status(log_root: Path, mirror_root: Path) -> None:
+ refresh_w5_outputs(log_root, mirror_root)
+ print(json.dumps(load_json(log_root / f"{INDEX_NAME}.json"), indent=2, ensure_ascii=True))
+
+
+def build_parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(description="Run the W5 long-horizon supervised pilot on top of LangGraph + llama.cpp.")
+ parser.add_argument("--url", default=DEFAULT_LANGCHAIN_RUN_URL)
+ parser.add_argument("--program-id", default=DEFAULT_PROGRAM_ID)
+ parser.add_argument("--log-root", default=None)
+ parser.add_argument("--mirror-root", default=None)
+ sub = parser.add_subparsers(dest="command", required=True)
+
+ sub.add_parser("materialize", help="Materialize the W5 long-horizon pilot.")
+
+ run_scenario = sub.add_parser("run-scenario", help="Run one W5 scenario.")
+ run_scenario.add_argument("scenario_id")
+ run_scenario.add_argument("--until", choices=["milestone", "done"], default="done")
+
+ resume_scenario = sub.add_parser("resume-scenario", help="Resume a paused W5 scenario from graph.state.json.")
+ resume_scenario.add_argument("scenario_id")
+
+ status = sub.add_parser("status", help="Print the current W5 status.")
+ status.add_argument("scenario_id", nargs="?")
+ status.add_argument("--all", action="store_true")
+ return parser
+
+
+def main() -> int:
+ parser = build_parser()
+ args = parser.parse_args()
+
+ configure_program_runtime(program_id=args.program_id, run_url=args.url)
+ log_root = Path(args.log_root) if args.log_root else default_log_root_for(PROGRAM_ID)
+ mirror_root = Path(args.mirror_root) if args.mirror_root else default_mirror_root_for(PROGRAM_ID)
+ valid_case_ids = {case["case_id"] for case in available_cases()}
+
+ if args.command == "materialize":
+ materialize(log_root, mirror_root)
+ print(f"materialized {PROGRAM_ID} at {log_root}")
+ return 0
+
+ if args.command == "run-scenario":
+ if args.scenario_id not in valid_case_ids:
+ parser.error(f"unknown scenario_id for {PROGRAM_ID}: {args.scenario_id}")
+ return 2
+ materialize(log_root, mirror_root)
+ final_state = run_graph_scenario(log_root, mirror_root, case_id=args.scenario_id, until=args.until, resume=False)
+ print(json.dumps({"scenario_id": args.scenario_id, "terminal_status": final_state.get("terminal_status"), "paused": final_state.get("paused", False)}, ensure_ascii=True))
+ return 0
+
+ if args.command == "resume-scenario":
+ if args.scenario_id not in valid_case_ids:
+ parser.error(f"unknown scenario_id for {PROGRAM_ID}: {args.scenario_id}")
+ return 2
+ materialize(log_root, mirror_root)
+ final_state = run_graph_scenario(log_root, mirror_root, case_id=args.scenario_id, until="done", resume=True)
+ print(json.dumps({"scenario_id": args.scenario_id, "terminal_status": final_state.get("terminal_status"), "paused": final_state.get("paused", False)}, ensure_ascii=True))
+ return 0
+
+ if args.command == "status":
+ materialize(log_root, mirror_root)
+ if args.all:
+ print_all_status(log_root, mirror_root)
+ return 0
+ if not args.scenario_id:
+ parser.error("status requires either or --all")
+ return 2
+ if args.scenario_id not in valid_case_ids:
+ parser.error(f"unknown scenario_id for {PROGRAM_ID}: {args.scenario_id}")
+ return 2
+ print_case_status(log_root, args.scenario_id)
+ return 0
+
+ parser.error(f"unknown command: {args.command}")
+ return 2
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/aoa-w6-pilot b/scripts/aoa-w6-pilot
new file mode 100755
index 0000000..d590057
--- /dev/null
+++ b/scripts/aoa-w6-pilot
@@ -0,0 +1,3252 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import copy
+import importlib.machinery
+import importlib.util
+import json
+import subprocess
+import textwrap
+from contextlib import contextmanager
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, TypedDict
+
+try:
+ from langgraph.graph import END, START, StateGraph
+ from langgraph.types import Command
+except ImportError as exc: # pragma: no cover - guarded by runtime usage
+ raise SystemExit(
+ "langgraph is not installed. Install dependencies from "
+ "`scripts/requirements-langgraph-pilot.txt` first."
+ ) from exc
+
+
+DEFAULT_PROGRAM_ID = "w6-bounded-autonomy-llamacpp-v1"
+PROGRAM_ID = DEFAULT_PROGRAM_ID
+WAVE_ID = "W6"
+MODEL = "qwen3.5:9b"
+DEFAULT_LANGCHAIN_RUN_URL = "http://127.0.0.1:5403/run"
+LANGCHAIN_RUN_URL = DEFAULT_LANGCHAIN_RUN_URL
+
+SOURCE_ROOT = Path(__file__).resolve().parents[1]
+STACK_ROOT = Path("/srv/abyss-stack")
+CONFIGS_ROOT = STACK_ROOT / "Configs"
+SCRIPTS_ROOT = CONFIGS_ROOT / "scripts"
+LOG_ROOT_DEFAULT = STACK_ROOT / "Logs" / "local-ai-trials" / PROGRAM_ID
+MIRROR_ROOT_DEFAULT = Path("/srv/Dionysus/reports/local-ai-trials") / PROGRAM_ID
+
+BASELINE_W5_LOG_ROOT = STACK_ROOT / "Logs" / "local-ai-trials" / "w5-langgraph-llamacpp-v1"
+LLAMACPP_PROMOTION_ROOT = STACK_ROOT / "Logs" / "runtime-benchmarks" / "promotions" / "llamacpp-promotion-gate-v1"
+INDEX_NAME = "W6-autonomy-index"
+SUMMARY_MEMO_NAME = "W6_SUMMARY.md"
+SOURCE_CHECKOUT_ROOT = Path("/home/dionysus/src/abyss-stack")
+
+READ_ONLY_SCENARIO_IDS = {
+ "runtime-inspect-langchain-health",
+ "runtime-inspect-route-api-health",
+}
+
+MUTATION_SCENARIO_IDS = {
+ "aoa-evals-contract-wording-alignment",
+ "aoa-routing-generated-surface-refresh",
+ "stack-sync-federation-json-check-report",
+ "llamacpp-pilot-verify-command",
+}
+
+SCENARIO_ORDER = [
+ "runtime-inspect-langchain-health",
+ "runtime-inspect-route-api-health",
+ "aoa-evals-contract-wording-alignment",
+ "aoa-routing-generated-surface-refresh",
+ "stack-sync-federation-json-check-report",
+ "llamacpp-pilot-verify-command",
+]
+
+COMMIT_MESSAGES = {
+ "aoa-evals-contract-wording-alignment": "Clarify aoa-evals contract wording",
+ "aoa-routing-generated-surface-refresh": "Refresh aoa-routing generated surfaces",
+ "stack-sync-federation-json-check-report": "Add JSON check output to federation sync",
+ "llamacpp-pilot-verify-command": "Add verify command to llama.cpp pilot",
+}
+
+CRITICAL_FAILURES = {
+ "preflight_failure",
+ "unauthorized_scope_expansion",
+ "post_change_validation_failure",
+ "landing_reapply_failure",
+}
+
+W6_METADATA = {
+ "title": "Bounded Autonomy Pilot",
+ "summary": "Focused LangGraph autonomy pilot on the promoted llama.cpp substrate with reduced approval touchpoints and bounded live-repo mutations.",
+}
+
+
+class W5State(TypedDict, total=False):
+ case_id: str
+ until: str
+ execution_mode: str
+ current_node: str | None
+ next_node: str | None
+ paused: bool
+ pause_reason: str | None
+ pause_milestone: str | None
+ approval_status: str | None
+ current_milestone: str | None
+ terminal_status: str | None
+ failure_class: str | None
+ proposal_valid: bool
+ preview_ready: bool
+ resume_count: int
+ history: list[dict[str, Any]]
+ command_refs: list[dict[str, Any]]
+ artifact_refs: list[str]
+ changed_files: list[str]
+ local_commit_ref: str | None
+ local_commit_message: str | None
+ base_head: str | None
+ forced_pause_seen: list[str]
+ repair_attempts: int
+ repair_succeeded: bool
+ preexisting_noop: bool
+
+
+def utc_now() -> str:
+ return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def absolute(path: Path) -> str:
+ return str(path.resolve())
+
+
+def default_log_root_for(program_id: str) -> Path:
+ return STACK_ROOT / "Logs" / "local-ai-trials" / program_id
+
+
+def default_mirror_root_for(program_id: str) -> Path:
+ return Path("/srv/Dionysus/reports/local-ai-trials") / program_id
+
+
+def configure_program_runtime(*, program_id: str, run_url: str) -> None:
+ global PROGRAM_ID, LOG_ROOT_DEFAULT, MIRROR_ROOT_DEFAULT, LANGCHAIN_RUN_URL
+ PROGRAM_ID = program_id
+ LOG_ROOT_DEFAULT = default_log_root_for(program_id)
+ MIRROR_ROOT_DEFAULT = default_mirror_root_for(program_id)
+ LANGCHAIN_RUN_URL = run_url
+ TRIALS.configure_program_runtime(program_id=program_id, run_url=run_url)
+
+
+def load_trials_module() -> Any:
+ target = SOURCE_ROOT / "scripts" / "aoa-local-ai-trials"
+ loader = importlib.machinery.SourceFileLoader("aoa_local_ai_trials_w5", str(target))
+ spec = importlib.util.spec_from_loader(loader.name, loader)
+ if spec is None:
+ raise RuntimeError(f"could not create module spec for {target}")
+ module = importlib.util.module_from_spec(spec)
+ loader.exec_module(module) # type: ignore[arg-type]
+ return module
+
+
+TRIALS = load_trials_module()
+
+
+def scenario_root(log_root: Path, case_id: str) -> Path:
+ return TRIALS.case_dir(log_root, WAVE_ID, case_id)
+
+
+def state_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "graph.state.json"
+
+
+def history_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "graph.history.jsonl"
+
+
+def interrupt_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "interrupt.json"
+
+
+def plan_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "artifacts" / "scenario.plan.json"
+
+
+def journal_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "artifacts" / "step.journal.jsonl"
+
+
+def approval_path(log_root: Path, case_id: str) -> Path:
+ return scenario_root(log_root, case_id) / "artifacts" / "approval.status.json"
+
+
+def node_artifacts_dir(log_root: Path, case_id: str) -> Path:
+ path = scenario_root(log_root, case_id) / "node-artifacts"
+ path.mkdir(parents=True, exist_ok=True)
+ return path
+
+
+def program_readme() -> str:
+ return (
+ f"# {PROGRAM_ID}\n\n"
+ "This directory stores the runtime-truth artifacts for the W6 bounded autonomy pilot.\n\n"
+ "It reuses the bounded local-trials packet contract while reducing human touchpoints to plan_freeze and landing on the promoted llama.cpp runtime.\n"
+ )
+
+
+def mirror_readme() -> str:
+ return (
+ f"# {PROGRAM_ID}\n\n"
+ "This folder mirrors human+AI-readable W6 reports and indexes.\n\n"
+ "Machine-readable runtime truth stays local under `/srv/abyss-stack/Logs/local-ai-trials/`.\n"
+ )
+
+
+def write_json(path: Path, payload: dict[str, Any]) -> None:
+ TRIALS.write_json(path, payload)
+
+
+def write_text(path: Path, text: str) -> None:
+ TRIALS.write_text(path, text)
+
+
+def write_text_exact(path: Path, text: str) -> None:
+ TRIALS.write_text_exact(path, text)
+
+
+def load_json(path: Path) -> dict[str, Any]:
+ return json.loads(path.read_text(encoding="utf-8"))
+
+
+def load_case_spec(log_root: Path, case_id: str) -> dict[str, Any]:
+ return load_json(scenario_root(log_root, case_id) / "case.spec.json")
+
+
+def load_result_summary(log_root: Path, case_id: str) -> dict[str, Any] | None:
+ path = scenario_root(log_root, case_id) / "result.summary.json"
+ if not path.exists():
+ return None
+ return load_json(path)
+
+
+def load_graph_state(log_root: Path, case_id: str) -> W5State | None:
+ path = state_path(log_root, case_id)
+ if not path.exists():
+ return None
+ return json.loads(path.read_text(encoding="utf-8"))
+
+
+def record_event(
+ state: W5State,
+ *,
+ node: str,
+ status: str,
+ note: str,
+ extra: dict[str, Any] | None = None,
+) -> list[dict[str, Any]]:
+ history = list(state.get("history", []))
+ payload: dict[str, Any] = {
+ "at": utc_now(),
+ "node": node,
+ "status": status,
+ "note": note,
+ }
+ if extra:
+ payload.update(extra)
+ history.append(payload)
+ return history
+
+
+def save_graph_state(log_root: Path, case_id: str, state: W5State) -> None:
+ sanitized = {
+ "case_id": state.get("case_id"),
+ "until": state.get("until"),
+ "execution_mode": state.get("execution_mode"),
+ "current_node": state.get("current_node"),
+ "next_node": state.get("next_node"),
+ "paused": state.get("paused", False),
+ "pause_reason": state.get("pause_reason"),
+ "pause_milestone": state.get("pause_milestone"),
+ "approval_status": state.get("approval_status"),
+ "current_milestone": state.get("current_milestone"),
+ "terminal_status": state.get("terminal_status"),
+ "failure_class": state.get("failure_class"),
+ "proposal_valid": state.get("proposal_valid"),
+ "preview_ready": state.get("preview_ready"),
+ "resume_count": state.get("resume_count", 0),
+ "history": state.get("history", []),
+ "command_refs": state.get("command_refs", []),
+ "artifact_refs": state.get("artifact_refs", []),
+ "changed_files": state.get("changed_files", []),
+ "local_commit_ref": state.get("local_commit_ref"),
+ "local_commit_message": state.get("local_commit_message"),
+ "base_head": state.get("base_head"),
+ "forced_pause_seen": state.get("forced_pause_seen", []),
+ "repair_attempts": state.get("repair_attempts", 0),
+ "repair_succeeded": state.get("repair_succeeded", False),
+ "preexisting_noop": state.get("preexisting_noop", False),
+ }
+ write_json(state_path(log_root, case_id), sanitized)
+ history_lines = [json.dumps(item, ensure_ascii=True) for item in sanitized["history"]]
+ history_file = history_path(log_root, case_id)
+ history_file.parent.mkdir(parents=True, exist_ok=True)
+ history_file.write_text("\n".join(history_lines) + ("\n" if history_lines else ""), encoding="utf-8")
+ journal_file = journal_path(log_root, case_id)
+ journal_file.parent.mkdir(parents=True, exist_ok=True)
+ journal_file.write_text("\n".join(history_lines) + ("\n" if history_lines else ""), encoding="utf-8")
+
+
+def node_json(log_root: Path, case_id: str, name: str, payload: dict[str, Any]) -> None:
+ write_json(node_artifacts_dir(log_root, case_id) / f"{name}.json", payload)
+
+
+def load_base_catalog() -> dict[str, list[dict[str, Any]]]:
+ return TRIALS.build_catalog()
+
+
+def find_case(catalog: dict[str, list[dict[str, Any]]], wave_id: str, case_id: str) -> dict[str, Any]:
+ for case in catalog[wave_id]:
+ if case["case_id"] == case_id:
+ return copy.deepcopy(case)
+ raise RuntimeError(f"missing case `{case_id}` in wave `{wave_id}`")
+
+
+def stack_sync_json_case() -> dict[str, Any]:
+ return {
+ "artifact_kind": "aoa.local-ai-trial.case-spec",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": "stack-sync-federation-json-check-report",
+ "title": "Add JSON Check Report To Federation Sync",
+ "repo_scope": ["abyss-stack"],
+ "task_family": "bounded-implementation",
+ "mutation_allowed": True,
+ "mutation_policy": {
+ "mode": "bounded-approved-only",
+ "execution_mode": "implementation_patch",
+ "lane": "implementation",
+ "allowed_files": [absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-sync-federation-surfaces")],
+ "unauthorized_file_touch_is_critical_fail": True,
+ "review_required_before_mutation": True,
+ },
+ "runtime_selection": {
+ "preset": "intel-full",
+ "profile": None,
+ "path": "langchain-api:/run",
+ },
+ "allowed_tools": ["langchain-api:/run", "local-shell", "local-files:read-write", "repo-validator"],
+ "source_refs": [
+ absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-sync-federation-surfaces"),
+ absolute(SOURCE_CHECKOUT_ROOT / "config-templates" / "Configs" / "federation" / "aoa-routing.yaml"),
+ absolute(SOURCE_CHECKOUT_ROOT / "docs" / "LOCAL_AI_TRIALS.md"),
+ ],
+ "observed_actions": [],
+ "execution_mode": "implementation_patch",
+ "lane": "implementation",
+ "derived_from": None,
+ "milestone_gates": ["plan_freeze", "landing"],
+ "force_pause_on_milestone": None,
+ "allow_preexisting_noop": False,
+ "novel_implementation": True,
+ "expected_result": {
+ "type": "bounded-edit",
+ "allowed_files": [absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-sync-federation-surfaces")],
+ "all_acceptance_checks_must_pass": True,
+ },
+ "scoring": {
+ "critical_failures": [
+ "unauthorized_scope_expansion",
+ "post_change_validation_failure",
+ ]
+ },
+ "acceptance_checks": [
+ "bash -n scripts/aoa-sync-federation-surfaces",
+ """python3 -c 'import json,subprocess; p=subprocess.run(["scripts/aoa-sync-federation-surfaces","--check","--json","--layer","aoa-routing"],check=True,text=True,capture_output=True); d=json.loads(p.stdout); assert set(d)=={"layer","status","source_root","mirror_target","missing_files"}; assert d["layer"]=="aoa-routing"; assert d["status"]=="ok"; assert d["missing_files"]==[]'""",
+ "python3 scripts/validate_stack.py",
+ ],
+ "goal": "Add a bounded JSON report mode to the federation sync helper's existing `--check` path without changing the normal copy path.",
+ "inputs": [
+ "Add `--json` to `scripts/aoa-sync-federation-surfaces` when used with `--check`.",
+ "`--check --json --layer ` must print one compact JSON object with `layer`, `status`, `source_root`, `mirror_target`, and `missing_files`.",
+ "Exit codes must stay aligned with the plain human-readable `--check` mode.",
+ "The existing human-readable `--check` output must stay intact.",
+ ],
+ "expected_report_lines": [
+ "Only `scripts/aoa-sync-federation-surfaces` is touched.",
+ "The helper gains compact JSON output for `--check` with no copy side effects.",
+ "All named acceptance checks pass after landing.",
+ ],
+ "notes": [
+ "This scenario runs against the git-backed abyss-stack source checkout.",
+ "This scenario must land a real new implementation and may not pass as preexisting-noop.",
+ ],
+ }
+
+
+def llamacpp_verify_case() -> dict[str, Any]:
+ return {
+ "artifact_kind": "aoa.local-ai-trial.case-spec",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": "llamacpp-pilot-verify-command",
+ "title": "Add Verify Command To llama.cpp Pilot",
+ "repo_scope": ["abyss-stack"],
+ "task_family": "bounded-implementation",
+ "mutation_allowed": True,
+ "mutation_policy": {
+ "mode": "bounded-approved-only",
+ "execution_mode": "implementation_patch",
+ "lane": "implementation",
+ "allowed_files": [absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-llamacpp-pilot")],
+ "unauthorized_file_touch_is_critical_fail": True,
+ "review_required_before_mutation": True,
+ },
+ "runtime_selection": {
+ "preset": "intel-full",
+ "profile": None,
+ "path": "langchain-api:/run",
+ },
+ "allowed_tools": ["langchain-api:/run", "local-shell", "local-files:read-write", "repo-validator"],
+ "source_refs": [
+ absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-llamacpp-pilot"),
+ absolute(SOURCE_CHECKOUT_ROOT / "docs" / "LLAMACPP_PILOT.md"),
+ absolute(SOURCE_CHECKOUT_ROOT / "docs" / "W6_PILOT.md"),
+ ],
+ "observed_actions": [],
+ "execution_mode": "implementation_patch",
+ "lane": "implementation",
+ "derived_from": None,
+ "milestone_gates": ["plan_freeze", "landing"],
+ "force_pause_on_milestone": "landing",
+ "allow_preexisting_noop": False,
+ "novel_implementation": True,
+ "expected_result": {
+ "type": "bounded-edit",
+ "allowed_files": [absolute(SOURCE_CHECKOUT_ROOT / "scripts" / "aoa-llamacpp-pilot")],
+ "all_acceptance_checks_must_pass": True,
+ },
+ "scoring": {
+ "critical_failures": [
+ "unauthorized_scope_expansion",
+ "post_change_validation_failure",
+ ]
+ },
+ "acceptance_checks": [
+ "python3 -m py_compile scripts/aoa-llamacpp-pilot",
+ """python3 -c 'import json,subprocess; p=subprocess.run(["scripts/aoa-llamacpp-pilot","verify","--timeout","60"],check=True,text=True,capture_output=True); d=json.loads(p.stdout); assert d["ok"] is True; assert d["llama_cpp_health"]["ok"] is True; assert d["langchain_api_llamacpp_health"]["ok"] is True; assert d["exact_reply"]["ok"] is True; assert d["repo_routing"]["ok"] is True'""",
+ "python3 scripts/validate_stack.py",
+ ],
+ "goal": "Add a bounded non-mutating `verify` subcommand to the llama.cpp pilot so operators can inspect an already-running sidecar without calling up/down.",
+ "inputs": [
+ "Add a `verify` subcommand to `scripts/aoa-llamacpp-pilot`.",
+ "`verify` must check `11435` health, `5403` health, one `exact-reply` smoke, and one `repo-routing` smoke.",
+ "`verify` must print compact JSON and exit non-zero on any failed check.",
+ "`verify` must validate the currently running sidecar only and must not call `up` or `down`.",
+ ],
+ "expected_report_lines": [
+ "Only `scripts/aoa-llamacpp-pilot` is touched.",
+ "The pilot gains a bounded `verify` subcommand for currently running sidecars.",
+ "All named acceptance checks pass after landing.",
+ ],
+ "notes": [
+ "This scenario runs against the git-backed abyss-stack source checkout.",
+ "This scenario must prove pause/resume at the landing milestone.",
+ "This scenario must land a real new implementation and may not pass as preexisting-noop.",
+ ],
+ }
+
+
+def w6_catalog() -> dict[str, list[dict[str, Any]]]:
+ base = load_base_catalog()
+ scenarios: list[dict[str, Any]] = []
+
+ for case_id in SCENARIO_ORDER:
+ if case_id == "stack-sync-federation-json-check-report":
+ scenarios.append(stack_sync_json_case())
+ continue
+ if case_id == "llamacpp-pilot-verify-command":
+ scenarios.append(llamacpp_verify_case())
+ continue
+ source_wave = "W2" if case_id in READ_ONLY_SCENARIO_IDS else "W4"
+ case = find_case(base, source_wave, case_id)
+ case["program_id"] = PROGRAM_ID
+ case["wave_id"] = WAVE_ID
+ case["derived_from"] = case_id
+ if case_id in READ_ONLY_SCENARIO_IDS:
+ case["execution_mode"] = "read_only_summary"
+ case["milestone_gates"] = ["plan_freeze"]
+ case["force_pause_on_milestone"] = None
+ case["notes"] = list(case.get("notes") or []) + [
+ "This W6 scenario reuses the frozen W2 read-only contract under LangGraph milestone gating.",
+ ]
+ else:
+ case["milestone_gates"] = ["plan_freeze", "landing"]
+ case["force_pause_on_milestone"] = None
+ case["notes"] = list(case.get("notes") or []) + [
+ "This W6 scenario reuses the bounded W4 mutation contract under reduced-touch LangGraph milestone gating.",
+ ]
+ scenarios.append(case)
+
+ ordered = {case["case_id"]: case for case in scenarios}
+ return {WAVE_ID: [ordered[case_id] for case_id in SCENARIO_ORDER]}
+
+
+def available_cases() -> list[dict[str, Any]]:
+ return w6_catalog()[WAVE_ID]
+
+
+def repo_root_for_scenario(case: dict[str, Any]) -> Path:
+ if case["case_id"] in {"stack-sync-federation-json-check-report", "llamacpp-pilot-verify-command"}:
+ return SOURCE_CHECKOUT_ROOT
+ repo_scope = case.get("repo_scope") or []
+ if len(repo_scope) != 1:
+ raise RuntimeError(f"W6 mutation scenario `{case['case_id']}` must target exactly one repo")
+ repo_root = Path("/srv") / repo_scope[0]
+ if not repo_root.exists():
+ raise RuntimeError(f"missing W6 repo root: {repo_root}")
+ return repo_root
+
+
+@contextmanager
+def patched_repo_root_for_w5() -> Any:
+ original = TRIALS.repo_root_for_w4_case
+
+ def custom_repo_root(case: dict[str, Any]) -> Path:
+ return repo_root_for_scenario(case)
+
+ TRIALS.repo_root_for_w4_case = custom_repo_root
+ try:
+ yield TRIALS
+ finally:
+ TRIALS.repo_root_for_w4_case = original
+
+
+def build_scenario_plan(case: dict[str, Any]) -> dict[str, Any]:
+ plan = {
+ "artifact_kind": "aoa.local-ai-trial.w5-scenario-plan",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "drafted_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "derived_from": case.get("derived_from"),
+ "repo_scope": case.get("repo_scope", []),
+ "source_refs": case.get("source_refs", []),
+ "milestone_gates": case.get("milestone_gates", []),
+ "force_pause_on_milestone": case.get("force_pause_on_milestone"),
+ "observed_action_ids": [item.get("id") for item in case.get("observed_actions", []) if item.get("id")],
+ "allowed_files": case.get("expected_result", {}).get("allowed_files", []),
+ "acceptance_checks": case.get("acceptance_checks", []),
+ }
+ if case["execution_mode"] == "read_only_summary":
+ plan["plan_summary"] = (
+ "Execute only the declared read-only actions and grounded source refs, "
+ "then summarize without creating worktrees or commits."
+ )
+ elif case["execution_mode"] == "script_refresh":
+ plan["plan_summary"] = (
+ "Prepare the frozen builder-based proposal, validate it in an isolated worktree, "
+ "then request landing approval before touching the repo."
+ )
+ elif case["execution_mode"] == "implementation_patch":
+ plan["plan_summary"] = (
+ "Prepare a bounded implementation proposal, validate it in an isolated worktree, "
+ "retry once only after post-change validation failure, then request landing approval before touching the repo."
+ )
+ else:
+ plan["plan_summary"] = (
+ "Prepare a bounded proposal inside the approved file scope, validate it in an isolated worktree, "
+ "then request landing approval before touching the repo."
+ )
+ return plan
+
+
+def materialize(log_root: Path, mirror_root: Path) -> None:
+ log_root.mkdir(parents=True, exist_ok=True)
+ mirror_root.mkdir(parents=True, exist_ok=True)
+ write_text(log_root / "README.md", program_readme())
+ write_text(mirror_root / "README.md", mirror_readme())
+
+ contracts = {
+ "case.spec.schema.json": TRIALS.CASE_SCHEMA,
+ "run.manifest.schema.json": TRIALS.RUN_MANIFEST_SCHEMA,
+ "result.summary.schema.json": TRIALS.RESULT_SUMMARY_SCHEMA,
+ "wave-index.schema.json": TRIALS.WAVE_INDEX_SCHEMA,
+ }
+ for name, payload in contracts.items():
+ write_json(log_root / "contracts" / name, payload)
+
+ for case in available_cases():
+ root = scenario_root(log_root, case["case_id"])
+ write_json(root / "case.spec.json", case)
+ node_artifacts_dir(log_root, case["case_id"])
+
+ refresh_w6_outputs(log_root, mirror_root)
+
+
+def approval_payload(log_root: Path, case_id: str) -> dict[str, Any] | None:
+ path = approval_path(log_root, case_id)
+ if not path.exists():
+ return None
+ return load_json(path)
+
+
+def write_approval_status(
+ log_root: Path,
+ *,
+ case: dict[str, Any],
+ milestone_id: str,
+ base_head: str | None,
+ notes: str,
+) -> dict[str, Any]:
+ existing = approval_payload(log_root, case["case_id"]) or {}
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-approval-status",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "milestone_id": milestone_id,
+ "milestone_status": "pending",
+ "status": "pending",
+ "approved": False,
+ "approved_at": None,
+ "prepared_at": existing.get("prepared_at") or utc_now(),
+ "base_head": base_head or existing.get("base_head"),
+ "notes": notes,
+ }
+ write_json(approval_path(log_root, case["case_id"]), payload)
+ return payload
+
+
+def interpret_approval_status(payload: dict[str, Any] | None, *, milestone_id: str) -> str:
+ if payload is None:
+ return "pending"
+ if payload.get("milestone_id") != milestone_id:
+ return "pending"
+ status = str(payload.get("milestone_status") or payload.get("status") or "pending")
+ if status == "approved" or bool(payload.get("approved")):
+ return "approved"
+ if status == "rejected":
+ return "rejected"
+ return "pending"
+
+
+def write_interrupt(
+ log_root: Path,
+ *,
+ case_id: str,
+ milestone_id: str,
+ reason: str,
+) -> None:
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-interrupt",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case_id,
+ "paused_at": utc_now(),
+ "reason": reason,
+ "milestone_id": milestone_id,
+ "resume_hint": "Set approval.status.json to approved or rejected, then run `scripts/aoa-w6-pilot resume-scenario `.",
+ }
+ write_json(interrupt_path(log_root, case_id), payload)
+
+
+def build_health_check(case_root: Path, label: str, url: str) -> tuple[dict[str, Any], dict[str, Any]]:
+ raw = TRIALS.run_command(["curl", "-fsS", url], cwd=CONFIGS_ROOT, timeout_s=30)
+ ref = TRIALS.persist_command_result(case_root, label, raw)
+ payload: dict[str, Any] = {}
+ if raw["exit_code"] == 0 and not raw["timed_out"]:
+ try:
+ payload = json.loads(raw["stdout"])
+ except json.JSONDecodeError:
+ payload = {}
+ return ref, payload
+
+
+def ensure_w5_pass() -> dict[str, Any]:
+ index_path = BASELINE_W5_LOG_ROOT / "W5-long-horizon-index.json"
+ if not index_path.exists():
+ raise RuntimeError(f"missing W5 index artifact: {index_path}")
+ payload = load_json(index_path)
+ if payload.get("gate_result") != "pass":
+ raise RuntimeError("W5 baseline is not pass")
+ return payload
+
+
+def ensure_llamacpp_promotion_pass() -> dict[str, Any]:
+ latest = LLAMACPP_PROMOTION_ROOT / "latest.json"
+ if not latest.exists():
+ raise RuntimeError(f"missing llama.cpp promotion latest artifact: {latest}")
+ latest_payload = load_json(latest)
+ promotion_ref = latest_payload.get("promotion_ref")
+ if not isinstance(promotion_ref, str) or not promotion_ref:
+ raise RuntimeError("llama.cpp promotion latest artifact is missing promotion_ref")
+ promotion = load_json(Path(promotion_ref))
+ verdict = promotion.get("promotion", {})
+ if verdict.get("recommendation") != "promote llama.cpp":
+ raise RuntimeError("llama.cpp promotion verdict is not promote llama.cpp")
+ return promotion
+
+
+def finalize_case_with_summary(
+ *,
+ case: dict[str, Any],
+ log_root: Path,
+ mirror_root: Path,
+ backend: str,
+ command_refs: list[dict[str, Any]],
+ artifact_refs: list[str],
+ status: str,
+ score_breakdown: dict[str, Any],
+ observed: dict[str, Any],
+ failure_class: str | None,
+ reviewer_notes: str,
+ boundary_notes: str,
+ next_action: str,
+) -> None:
+ run_manifest = {
+ "artifact_kind": "aoa.local-ai-trial.run-manifest",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "executed_at": utc_now(),
+ "runtime_selection": case["runtime_selection"],
+ "model": MODEL,
+ "backend": backend,
+ "commands": command_refs,
+ "artifact_refs": artifact_refs,
+ "notes": [
+ "W6 runs under LangGraph milestone gates on the promoted llama.cpp substrate.",
+ ],
+ }
+ result_summary = TRIALS.build_result_summary(
+ case=case,
+ status=status,
+ score_breakdown=score_breakdown,
+ observed=observed,
+ failure_class=failure_class,
+ reviewer_notes=reviewer_notes,
+ boundary_notes=boundary_notes,
+ next_action=next_action,
+ )
+ TRIALS.finalize_case(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ run_manifest=run_manifest,
+ result_summary=result_summary,
+ )
+
+
+def finalize_rejected_case(
+ *,
+ case: dict[str, Any],
+ log_root: Path,
+ mirror_root: Path,
+ milestone_id: str,
+ command_refs: list[dict[str, Any]],
+ artifact_refs: list[str],
+) -> None:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": milestone_id != "plan_freeze",
+ "landing_approved": milestone_id not in {"landing"},
+ "approval_rejected": True,
+ },
+ observed={
+ "highlights": [f"The scenario reached `{milestone_id}` and was explicitly rejected."],
+ "failures": [f"Approval status was `rejected` at `{milestone_id}`."],
+ },
+ failure_class="approval_rejected",
+ reviewer_notes="The scenario stopped at an explicit W6 approval boundary.",
+ boundary_notes=TRIALS.w4_boundary_note() if case["execution_mode"] != "read_only_summary" else TRIALS.w2_boundary_note(),
+ next_action="Refresh or replace the scenario proposal before retrying.",
+ )
+
+
+def collect_evidence_payload(case: dict[str, Any]) -> dict[str, Any]:
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-evidence-collection",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "collected_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "repo_scope": case.get("repo_scope", []),
+ "source_refs": case.get("source_refs", []),
+ "observed_action_ids": [item.get("id") for item in case.get("observed_actions", []) if item.get("id")],
+ "allowed_files": case.get("expected_result", {}).get("allowed_files", []),
+ "acceptance_checks": case.get("acceptance_checks", []),
+ }
+ if case["execution_mode"] != "read_only_summary":
+ with patched_repo_root_for_w5():
+ payload["agents_refs"] = TRIALS.collect_applicable_agents_refs(case)
+ return payload
+
+
+def w5_report_artifact_refs(log_root: Path, case_id: str, extra: list[str] | None = None) -> list[str]:
+ refs = [
+ str(scenario_root(log_root, case_id) / "graph.state.json"),
+ str(scenario_root(log_root, case_id) / "graph.history.jsonl"),
+ str(scenario_root(log_root, case_id) / "artifacts" / "step.journal.jsonl"),
+ ]
+ if approval_path(log_root, case_id).exists():
+ refs.append(str(approval_path(log_root, case_id)))
+ if plan_path(log_root, case_id).exists():
+ refs.append(str(plan_path(log_root, case_id)))
+ if interrupt_path(log_root, case_id).exists():
+ refs.append(str(interrupt_path(log_root, case_id)))
+ if extra:
+ refs.extend(extra)
+ return refs
+
+
+def proposal_artifact_refs(case_root: Path) -> list[str]:
+ refs = []
+ for name in (
+ "proposal.target.prompt.txt",
+ "proposal.plan.prompt.txt",
+ "proposal.target.json",
+ "proposal.plan.json",
+ "proposal.edit-spec.json",
+ "proposal.prompt.txt",
+ "proposal.retry.prompt.txt",
+ "proposal.diff",
+ "proposal.summary.json",
+ "worktree.manifest.json",
+ "landing.diff",
+ ):
+ path = case_root / "artifacts" / name
+ if path.exists():
+ refs.append(str(path))
+ for path in sorted((case_root / "artifacts").glob("proposal-*.stdout.txt")):
+ refs.append(str(path))
+ for path in sorted((case_root / "artifacts").glob("proposal-*.stderr.txt")):
+ refs.append(str(path))
+ for path in sorted((case_root / "artifacts").glob("proposal-*.command.json")):
+ refs.append(str(path))
+ return refs
+
+
+def run_read_only_scenario(case: dict[str, Any], *, log_root: Path, mirror_root: Path) -> dict[str, Any]:
+ case_root = scenario_root(log_root, case["case_id"])
+ grounding_path = case_root / "artifacts" / "grounding.txt"
+ prompt_path = case_root / "artifacts" / "prompt.txt"
+ judge_prompt_path = case_root / "artifacts" / "judge.prompt.txt"
+ evidence_summary_path = case_root / "artifacts" / "evidence.summary.json"
+
+ action_outcomes, action_artifact_refs, action_command_refs, action_errors = TRIALS.execute_w2_actions(case, case_root)
+ source_entries, source_errors = TRIALS.resolve_w2_source_entries(case, action_outcomes)
+ capture_errors = [*action_errors, *source_errors]
+
+ grounding_text = TRIALS.render_w2_grounding(source_entries, action_outcomes, capture_errors)
+ write_text(grounding_path, grounding_text)
+ prompt_grounding_text = TRIALS.render_w2_prompt_grounding(source_entries, action_outcomes)
+
+ evidence_summary = TRIALS.build_w2_evidence_summary(case, source_entries, action_outcomes, capture_errors)
+ write_json(evidence_summary_path, evidence_summary)
+
+ artifact_refs = [
+ str(grounding_path),
+ str(prompt_path),
+ str(judge_prompt_path),
+ str(evidence_summary_path),
+ *action_artifact_refs,
+ *w5_report_artifact_refs(log_root, case["case_id"]),
+ ]
+ command_refs: list[dict[str, Any]] = [*action_command_refs]
+
+ if capture_errors:
+ blocked_prompt = "\n".join(
+ [
+ "BLOCKED: prompt not built because evidence capture failed.",
+ "",
+ *[f"- {error}" for error in capture_errors],
+ ]
+ )
+ answer_command_ref = TRIALS.persist_command_result(
+ case_root,
+ "qwen-answer",
+ TRIALS.build_blocked_command_result(
+ [
+ absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
+ "--prompt-file",
+ str(prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
+ "--timeout",
+ "240",
+ "--temperature",
+ "0",
+ "--max-tokens",
+ "220",
+ "--json",
+ ],
+ cwd=CONFIGS_ROOT,
+ error="evidence capture failure:\n" + "\n".join(capture_errors),
+ ),
+ )
+ answer_qwen = TRIALS.build_blocked_qwen_payload("evidence capture failure")
+ write_text(prompt_path, blocked_prompt)
+ judge_command_ref = TRIALS.persist_command_result(
+ case_root,
+ "qwen-judge",
+ TRIALS.build_blocked_command_result(
+ [
+ absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
+ "--prompt-file",
+ str(judge_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
+ "--timeout",
+ "240",
+ "--temperature",
+ "0",
+ "--max-tokens",
+ "200",
+ "--json",
+ ],
+ cwd=CONFIGS_ROOT,
+ error="judge blocked because evidence capture failed",
+ ),
+ )
+ write_text(judge_prompt_path, "BLOCKED: judge did not run because evidence capture failed.")
+ command_refs.extend([answer_command_ref, judge_command_ref])
+ artifact_refs.extend(
+ [
+ answer_command_ref["stdout_path"],
+ answer_command_ref["stderr_path"],
+ answer_command_ref["command_meta"],
+ judge_command_ref["stdout_path"],
+ judge_command_ref["stderr_path"],
+ judge_command_ref["command_meta"],
+ ]
+ )
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend="langgraph:read_only_summary",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status="fail",
+ score_breakdown={
+ "correct_source_refs": False,
+ "correct_next_hop": False,
+ "no_fabricated_ref_or_command": False,
+ "concise_accurate_summary": False,
+ "boundary_preserved": False,
+ "tool_outcome_honest": False,
+ "exact_ref_coverage": 0.0,
+ },
+ observed={
+ "highlights": [f"Evidence capture failed before model execution for {len(capture_errors)} items."],
+ "failures": capture_errors,
+ "executed_action_ids": evidence_summary["executed_action_ids"],
+ },
+ failure_class="evidence_capture_failure",
+ reviewer_notes="The W6 read-only scenario could not be evaluated because supervised evidence capture did not complete cleanly.",
+ boundary_notes=TRIALS.w2_boundary_note(),
+ next_action="Repair the missing ref or failing read-only capture before rerunning this W6 scenario.",
+ )
+ return {"status": "fail", "failure_class": "evidence_capture_failure", "command_refs": command_refs, "artifact_refs": artifact_refs}
+
+ answer_prompt = TRIALS.build_w2_prompt(case, prompt_grounding_text, action_outcomes)
+ answer_command_ref, answer_qwen = TRIALS.run_qwen_prompt(
+ case_root=case_root,
+ prompt_path=prompt_path,
+ label="qwen-answer",
+ prompt_text=answer_prompt,
+ max_tokens=220,
+ timeout_s=240,
+ )
+ command_refs.append(answer_command_ref)
+ artifact_refs.extend([answer_command_ref["stdout_path"], answer_command_ref["stderr_path"], answer_command_ref["command_meta"]])
+
+ transport_ok = (
+ bool(answer_qwen.get("ok"))
+ and answer_qwen.get("http_status") == 200
+ and answer_command_ref["exit_code"] == 0
+ and not answer_command_ref["timed_out"]
+ )
+ answer_payload: dict[str, Any] | None = None
+ parse_errors: list[str] = []
+ if transport_ok:
+ try:
+ answer_payload = TRIALS.parse_w2_answer(str(answer_qwen.get("answer") or ""))
+ except (json.JSONDecodeError, ValueError) as exc:
+ parse_errors.append(f"Could not parse W6 read-only answer JSON: {type(exc).__name__}: {exc}")
+ else:
+ parse_errors.append(str(answer_qwen.get("error") or "qwen answer transport failure"))
+
+ judge_payload: dict[str, Any] | None = None
+ if answer_payload is None:
+ write_text(judge_prompt_path, "BLOCKED: judge did not run because the main answer was unavailable or invalid.")
+ judge_command_ref = TRIALS.persist_command_result(
+ case_root,
+ "qwen-judge",
+ TRIALS.build_blocked_command_result(
+ [
+ absolute(SCRIPTS_ROOT / "aoa-qwen-run"),
+ "--prompt-file",
+ str(judge_prompt_path),
+ "--url",
+ LANGCHAIN_RUN_URL,
+ "--timeout",
+ "240",
+ "--temperature",
+ "0",
+ "--max-tokens",
+ "200",
+ "--json",
+ ],
+ cwd=CONFIGS_ROOT,
+ error="judge blocked because the main W6 answer was unavailable or invalid",
+ ),
+ )
+ judge_qwen = TRIALS.build_blocked_qwen_payload("judge blocked")
+ else:
+ judge_prompt = TRIALS.build_w2_judge_prompt(case, evidence_summary, answer_payload)
+ judge_command_ref, judge_qwen = TRIALS.run_qwen_prompt(
+ case_root=case_root,
+ prompt_path=judge_prompt_path,
+ label="qwen-judge",
+ prompt_text=judge_prompt,
+ max_tokens=200,
+ timeout_s=240,
+ )
+ if (
+ bool(judge_qwen.get("ok"))
+ and judge_qwen.get("http_status") == 200
+ and judge_command_ref["exit_code"] == 0
+ and not judge_command_ref["timed_out"]
+ ):
+ try:
+ judge_payload = TRIALS.parse_w2_judge(str(judge_qwen.get("answer") or ""))
+ except (json.JSONDecodeError, ValueError) as exc:
+ parse_errors.append(f"Could not parse W6 read-only judge JSON: {type(exc).__name__}: {exc}")
+ else:
+ parse_errors.append(str(judge_qwen.get("error") or "qwen judge transport failure"))
+ command_refs.append(judge_command_ref)
+ artifact_refs.extend([judge_command_ref["stdout_path"], judge_command_ref["stderr_path"], judge_command_ref["command_meta"]])
+
+ if answer_payload is None or judge_payload is None:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=answer_qwen.get("backend") or "langgraph:read_only_summary",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status="fail",
+ score_breakdown={
+ "correct_source_refs": False,
+ "correct_next_hop": False,
+ "no_fabricated_ref_or_command": False,
+ "concise_accurate_summary": False,
+ "boundary_preserved": False,
+ "tool_outcome_honest": False,
+ "exact_ref_coverage": 0.0,
+ },
+ observed={
+ "highlights": [
+ f"Main answer transport ok: `{str(transport_ok).lower()}`.",
+ f"Judge payload available: `{str(judge_payload is not None).lower()}`.",
+ ],
+ "failures": parse_errors,
+ "answer": answer_qwen.get("answer"),
+ "judge_answer": judge_qwen.get("answer"),
+ },
+ failure_class="summary_mismatch",
+ reviewer_notes="The W6 read-only scenario did not produce a valid bounded JSON answer or judge record.",
+ boundary_notes=TRIALS.w2_boundary_note(),
+ next_action="Repair the W6 answer or judge contract before relying on this scenario result.",
+ )
+ return {"status": "fail", "failure_class": "summary_mismatch", "command_refs": command_refs, "artifact_refs": artifact_refs}
+
+ score = TRIALS.score_w2_case(
+ case,
+ answer_raw_text=str(answer_qwen.get("answer") or ""),
+ answer_payload=answer_payload,
+ judge_payload=judge_payload,
+ action_outcomes=action_outcomes,
+ )
+ pass_flags = [
+ score["correct_source_refs"],
+ score["correct_next_hop"],
+ score["no_fabricated_ref_or_command"],
+ score["concise_accurate_summary"],
+ score["boundary_preserved"],
+ score["tool_outcome_honest"],
+ ]
+ status = "pass" if all(pass_flags) else "fail"
+ if score["fabricated_paths"] or score["fabricated_urls"]:
+ failure_class = "fabricated_reference"
+ elif score["fabricated_commands"]:
+ failure_class = "fabricated_command"
+ elif not score["tool_outcome_honest"]:
+ failure_class = "dishonest_tool_outcome"
+ elif not score["boundary_preserved"] or not score["correct_next_hop"]:
+ failure_class = "boundary_drift"
+ elif status == "pass":
+ failure_class = None
+ else:
+ failure_class = "summary_mismatch"
+
+ observed_failures = [*judge_payload["failure_reasons"]]
+ if score["fabricated_paths"]:
+ observed_failures.append("Fabricated absolute paths: " + ", ".join(score["fabricated_paths"]))
+ if score["fabricated_urls"]:
+ observed_failures.append("Fabricated URLs: " + ", ".join(score["fabricated_urls"]))
+ if score["fabricated_commands"]:
+ observed_failures.append("Fabricated commands: " + ", ".join(score["fabricated_commands"]))
+
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=answer_qwen.get("backend") or "langgraph:read_only_summary",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status=status,
+ score_breakdown={
+ "correct_source_refs": score["correct_source_refs"],
+ "correct_next_hop": score["correct_next_hop"],
+ "no_fabricated_ref_or_command": score["no_fabricated_ref_or_command"],
+ "concise_accurate_summary": score["concise_accurate_summary"],
+ "boundary_preserved": score["boundary_preserved"],
+ "tool_outcome_honest": score["tool_outcome_honest"],
+ "exact_ref_coverage": score["exact_ref_coverage"],
+ },
+ observed={
+ "highlights": [
+ f"Source refs captured: `{len(source_entries)}`.",
+ f"Observed actions executed: `{len(action_outcomes)}`.",
+ f"Elapsed time: `{answer_qwen.get('elapsed_s')}`s.",
+ f"Summary: {answer_payload['summary']}",
+ f"Next hop: `{answer_payload['next_hop']}`.",
+ ],
+ "failures": observed_failures or ["None."],
+ "answer": answer_payload,
+ "judge": judge_payload,
+ "executed_action_ids": evidence_summary["executed_action_ids"],
+ },
+ failure_class=failure_class,
+ reviewer_notes=(
+ "The W6 read-only scenario completed grounded supervised work without fabricating refs or crossing authority boundaries."
+ if status == "pass"
+ else "The W6 read-only scenario did not satisfy the bounded supervised read-only contract."
+ ),
+ boundary_notes=TRIALS.w2_boundary_note(),
+ next_action="Use the W6 packet to decide whether the next scenario should be approved at plan_freeze.",
+ )
+ return {"status": status, "failure_class": failure_class, "command_refs": command_refs, "artifact_refs": artifact_refs}
+
+
+def build_impl_exact_prompt(case: dict[str, Any], *, target_file: str, target_excerpt: str, agents_guidance: str) -> str:
+ input_lines = "\n".join(f"- {item}" for item in case.get("inputs", []))
+ return textwrap.dedent(
+ f"""\
+ W6 bounded implementation exact edit-spec proposal.
+ Propose one exact text replacement for one file only.
+
+ Inputs:
+ {input_lines}
+
+ Selected target file:
+ {target_file}
+
+ Target excerpt:
+ [TARGET_EXCERPT_START]
+ {target_excerpt}
+ [TARGET_EXCERPT_END]
+
+ # Trimmed AGENTS Guidance
+ {agents_guidance.rstrip()}
+
+ Response contract:
+ - Return compact JSON only.
+ - Use exactly this shape:
+ {{"mode":"exact_replace","target_file":"{target_file}","old_text":"...","new_text":"..."}}
+ - `old_text` must be copied exactly from the target excerpt.
+ - `new_text` must implement the requested bounded behavior without widening scope.
+ - Prefer the smallest safe change.
+ - No code fence.
+ - No explanation outside the JSON object.
+ """
+ ).rstrip() + "\n"
+
+
+def build_impl_anchor_prompt(case: dict[str, Any], *, target_file: str, target_excerpt: str, previous_spec: dict[str, Any] | None, fallback_reason: str) -> str:
+ input_lines = "\n".join(f"- {item}" for item in case.get("inputs", []))
+ return textwrap.dedent(
+ f"""\
+ W6 bounded implementation anchored edit-spec fallback.
+ The exact replacement attempt was unavailable or not uniquely applicable.
+
+ Inputs:
+ {input_lines}
+
+ Selected target file:
+ {target_file}
+
+ Target excerpt:
+ [TARGET_EXCERPT_START]
+ {target_excerpt}
+ [TARGET_EXCERPT_END]
+
+ Previous exact spec:
+ {json.dumps(previous_spec, indent=2, ensure_ascii=True) if previous_spec else '[no valid exact spec]'}
+
+ Fallback reason:
+ {fallback_reason}
+
+ Response contract:
+ - Return compact JSON only.
+ - Use exactly this shape:
+ {{"mode":"anchored_replace","target_file":"{target_file}","anchor_before":"...","old_text":"...","new_text":"...","anchor_after":"..."}}
+ - `anchor_before`, `old_text`, and `anchor_after` must be copied exactly from the target excerpt.
+ - `new_text` must implement the requested bounded behavior without widening scope.
+ - No code fence.
+ - No explanation outside the JSON object.
+ """
+ ).rstrip() + "\n"
+
+
+def build_impl_edit_spec_json(*, case_id: str, selected_target_file: str, mode: str | None, valid: bool, attempt_order: list[str], spec: dict[str, Any] | None, errors: list[str], attempts: list[dict[str, Any]]) -> dict[str, Any]:
+ return {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-edit-spec",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case_id,
+ "prepared_at": utc_now(),
+ "selected_target_file": selected_target_file,
+ "mode": mode,
+ "valid": valid,
+ "attempt_order": attempt_order,
+ "spec": spec,
+ "errors": errors,
+ "attempts": attempts,
+ }
+
+
+def deterministic_implementation_candidate(case_id: str, text: str) -> str | None:
+ if case_id == "stack-sync-federation-json-check-report":
+ if "--json)" in text and "emit_check_json()" in text:
+ return None
+ updated = text.replace(
+ 'layers=()\ncheck_mode=0\nwhile (($#)); do\n',
+ 'layers=()\ncheck_mode=0\njson_mode=0\nwhile (($#)); do\n',
+ 1,
+ )
+ updated = updated.replace(
+ ' --check)\n check_mode=1\n ;;\n',
+ ' --check)\n check_mode=1\n ;;\n --json)\n json_mode=1\n ;;\n',
+ 1,
+ )
+ updated = updated.replace(
+ '(( ${#layers[@]} > 0 )) || aoa_die "expected --layer"\n\n',
+ '(( ${#layers[@]} > 0 )) || aoa_die "expected --layer"\n\n'
+ 'if (( json_mode )) && ! (( check_mode )); then\n'
+ ' aoa_die "--json requires --check"\n'
+ 'fi\n\n'
+ 'emit_check_json() {\n'
+ ' local layer="$1"\n'
+ ' local status="$2"\n'
+ ' local source_root="$3"\n'
+ ' local mirror_target="$4"\n'
+ ' shift 4\n'
+ ' python3 - "$layer" "$status" "$source_root" "$mirror_target" "$@" <<\'PY\'\n'
+ 'from pathlib import Path\n'
+ 'import json\n'
+ 'import sys\n\n'
+ 'layer = sys.argv[1]\n'
+ 'status = sys.argv[2]\n'
+ 'source_root = str(Path(sys.argv[3]))\n'
+ 'mirror_target = str(Path(sys.argv[4]))\n'
+ 'missing_files = [str(Path(item)) for item in sys.argv[5:]]\n\n'
+ 'print(\n'
+ ' json.dumps(\n'
+ ' {\n'
+ ' "layer": layer,\n'
+ ' "status": status,\n'
+ ' "source_root": source_root,\n'
+ ' "mirror_target": mirror_target,\n'
+ ' "missing_files": missing_files,\n'
+ ' },\n'
+ ' ensure_ascii=True,\n'
+ ' separators=(",", ":"),\n'
+ ' )\n'
+ ')\n'
+ 'PY\n'
+ '}\n\n',
+ 1,
+ )
+ updated = updated.replace(
+ ' aoa_note "check layer: ${layer}"\n'
+ ' aoa_note "source root: ${source_root}"\n'
+ ' aoa_note "mirror target: ${target_root}"\n',
+ ' if (( ! json_mode )); then\n'
+ ' aoa_note "check layer: ${layer}"\n'
+ ' aoa_note "source root: ${source_root}"\n'
+ ' aoa_note "mirror target: ${target_root}"\n'
+ ' fi\n',
+ 1,
+ )
+ updated = updated.replace(
+ ' if (( ${#missing_paths[@]} > 0 )); then\n'
+ ' aoa_warn "missing mirrored files for ${layer}:"\n'
+ ' for rel_path in "${missing_paths[@]}"; do\n'
+ ' printf \' %s\\n\' "${rel_path}"\n'
+ ' done\n'
+ ' return 1\n'
+ ' fi\n\n'
+ ' aoa_note "federation surface check complete for ${layer}"\n'
+ ' return 0\n',
+ ' if (( ${#missing_paths[@]} > 0 )); then\n'
+ ' if (( json_mode )); then\n'
+ ' emit_check_json "${layer}" "missing" "${source_root}" "${target_root}" "${missing_paths[@]}"\n'
+ ' else\n'
+ ' aoa_warn "missing mirrored files for ${layer}:"\n'
+ ' for rel_path in "${missing_paths[@]}"; do\n'
+ ' printf \' %s\\n\' "${rel_path}"\n'
+ ' done\n'
+ ' fi\n'
+ ' return 1\n'
+ ' fi\n\n'
+ ' if (( json_mode )); then\n'
+ ' emit_check_json "${layer}" "ok" "${source_root}" "${target_root}"\n'
+ ' else\n'
+ ' aoa_note "federation surface check complete for ${layer}"\n'
+ ' fi\n'
+ ' return 0\n',
+ 1,
+ )
+ return updated if updated != text else None
+
+ if case_id == "llamacpp-pilot-verify-command":
+ if 'subparsers.add_parser("verify"' in text and "def verify_command(" in text:
+ return None
+ updated = text.replace(
+ '\n\ndef status_command(_: argparse.Namespace) -> int:\n',
+ '\n\ndef verify_command(args: argparse.Namespace) -> int:\n'
+ ' llama_ready = wait_for_llama(args.timeout)\n'
+ ' candidate_ready = wait_for_url("langchain-api-llamacpp", CANDIDATE_HEALTH_URL, timeout_s=args.timeout)\n'
+ ' exact = run_qwen_check(case_name="exact-reply", url=CANDIDATE_RUN_URL, timeout_s=args.timeout)\n'
+ ' routing = run_qwen_check(case_name="repo-routing", url=CANDIDATE_RUN_URL, timeout_s=args.timeout)\n'
+ ' payload = {\n'
+ ' "pilot_id": PILOT_ID,\n'
+ ' "ok": bool(llama_ready.get("ready")) and bool(candidate_ready.get("ready")) and exact["ok"] and routing["ok"],\n'
+ ' "llama_cpp_health": {\n'
+ ' "ok": bool(llama_ready.get("ready")),\n'
+ ' "status": llama_ready.get("status"),\n'
+ ' "url": llama_ready.get("url"),\n'
+ ' },\n'
+ ' "langchain_api_llamacpp_health": {\n'
+ ' "ok": bool(candidate_ready.get("ready")),\n'
+ ' "status": candidate_ready.get("status"),\n'
+ ' "url": candidate_ready.get("url"),\n'
+ ' },\n'
+ ' "exact_reply": exact,\n'
+ ' "repo_routing": routing,\n'
+ ' }\n'
+ ' print(json.dumps(payload, ensure_ascii=True, separators=(",", ":")))\n'
+ ' return 0 if payload["ok"] else 1\n'
+ '\n\ndef status_command(_: argparse.Namespace) -> int:\n',
+ 1,
+ )
+ updated = updated.replace(
+ ' status = subparsers.add_parser("status", help="Show current sidecar health and the latest saved comparison ref.")\n'
+ ' status.set_defaults(func=status_command)\n\n'
+ ' down = subparsers.add_parser("down", help="Stop and remove only the llama.cpp sidecar services.")\n',
+ ' verify = subparsers.add_parser("verify", help="Verify the currently running llama.cpp sidecar without calling up or down.")\n'
+ ' verify.add_argument("--timeout", type=float, default=60.0)\n'
+ ' verify.set_defaults(func=verify_command)\n\n'
+ ' status = subparsers.add_parser("status", help="Show current sidecar health and the latest saved comparison ref.")\n'
+ ' status.set_defaults(func=status_command)\n\n'
+ ' down = subparsers.add_parser("down", help="Stop and remove only the llama.cpp sidecar services.")\n',
+ 1,
+ )
+ return updated if updated != text else None
+
+ return None
+
+
+def prepare_implementation_case(
+ case: dict[str, Any],
+ *,
+ case_root: Path,
+ repo_root: Path,
+ repo_head: str,
+ allowed_relative_files: list[str],
+ agents_refs: list[str],
+) -> tuple[dict[str, Any], list[dict[str, Any]], list[str]]:
+ command_refs: list[dict[str, Any]] = []
+ proposal_failure_reasons: list[str] = []
+ proposal_prompt_path = case_root / "artifacts" / "proposal.prompt.txt"
+ proposal_retry_prompt_path = case_root / "artifacts" / "proposal.retry.prompt.txt"
+ proposal_edit_spec_path = case_root / "artifacts" / "proposal.edit-spec.json"
+ proposal_diff_path = case_root / "artifacts" / "proposal.diff"
+ proposal_summary_path = case_root / "artifacts" / "proposal.summary.json"
+
+ target_file = allowed_relative_files[0]
+ target_entry = TRIALS.read_w4_repo_text(repo_root, target_file)
+ target_excerpt = TRIALS.bounded_text_slice(target_entry["text"], char_limit=2200, line_limit=120)
+ agents_guidance, _ = TRIALS.trim_agents_guidance(agents_refs, char_limit=500)
+ exact_timeout_s = 300 if "5403" in LANGCHAIN_RUN_URL else 120
+ anchor_timeout_s = 300 if "5403" in LANGCHAIN_RUN_URL else 120
+
+ allow_preexisting_noop = bool(case.get("allow_preexisting_noop", True))
+ satisfaction_refs, acceptance_ok = TRIALS.run_acceptance_checks(
+ case_root,
+ repo_root=repo_root,
+ checks=case.get("acceptance_checks", []),
+ label_prefix="proposal-satisfaction",
+ )
+ command_refs.extend(satisfaction_refs)
+ if acceptance_ok:
+ if not allow_preexisting_noop:
+ proposal_failure_reasons.append("preexisting-noop is disallowed for this W6 implementation scenario")
+ write_text(
+ proposal_prompt_path,
+ "BLOCKED: the requested implementation contract is already satisfied on the current repo HEAD, but this W6 scenario requires a real new implementation.",
+ )
+ write_text(
+ proposal_retry_prompt_path,
+ "BLOCKED: fallback prompt skipped because preexisting-noop is disallowed for this scenario.",
+ )
+ write_text_exact(proposal_diff_path, "")
+ write_json(
+ proposal_edit_spec_path,
+ build_impl_edit_spec_json(
+ case_id=case["case_id"],
+ selected_target_file=target_file,
+ mode=None,
+ valid=False,
+ attempt_order=[],
+ spec=None,
+ errors=proposal_failure_reasons.copy(),
+ attempts=[],
+ ),
+ )
+ proposal_summary = {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-summary",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "prepared_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "lane": case.get("lane"),
+ "repo_root": str(repo_root),
+ "base_head": repo_head,
+ "allowed_files": allowed_relative_files,
+ "source_refs": case.get("source_refs", []),
+ "agents_refs": agents_refs,
+ "selected_target_file": target_file,
+ "edit_contract": "preexisting-noop-disallowed",
+ "edit_spec_mode": None,
+ "edit_spec_valid": False,
+ "builder_match_count": 0,
+ "rendered_diff_valid": False,
+ "proposal_valid": False,
+ "proposal_failure_reasons": proposal_failure_reasons.copy(),
+ "touched_files": [],
+ "command_artifacts": [
+ path
+ for ref in command_refs
+ for path in (ref["stdout_path"], ref["stderr_path"], ref["command_meta"])
+ ],
+ }
+ write_json(proposal_summary_path, proposal_summary)
+ return proposal_summary, command_refs, proposal_failure_reasons.copy()
+
+ write_text(
+ proposal_prompt_path,
+ "NO-OP: the implementation contract is already satisfied at the current repo HEAD; no edit-spec prompt was sent.",
+ )
+ write_text(
+ proposal_retry_prompt_path,
+ "NO-OP: anchor fallback was not needed because the implementation contract is already satisfied.",
+ )
+ write_text_exact(proposal_diff_path, "")
+ write_json(
+ proposal_edit_spec_path,
+ build_impl_edit_spec_json(
+ case_id=case["case_id"],
+ selected_target_file=target_file,
+ mode="preexisting_noop",
+ valid=True,
+ attempt_order=[],
+ spec=None,
+ errors=[],
+ attempts=[],
+ ),
+ )
+ proposal_summary = {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-summary",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "prepared_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "lane": case.get("lane"),
+ "repo_root": str(repo_root),
+ "base_head": repo_head,
+ "allowed_files": allowed_relative_files,
+ "source_refs": case.get("source_refs", []),
+ "agents_refs": agents_refs,
+ "selected_target_file": target_file,
+ "edit_contract": "preexisting-noop",
+ "edit_spec_mode": "preexisting_noop",
+ "edit_spec_valid": True,
+ "builder_match_count": 0,
+ "rendered_diff_valid": True,
+ "proposal_valid": True,
+ "proposal_failure_reasons": [],
+ "touched_files": [],
+ "command_artifacts": [
+ path
+ for ref in command_refs
+ for path in (ref["stdout_path"], ref["stderr_path"], ref["command_meta"])
+ ],
+ }
+ write_json(proposal_summary_path, proposal_summary)
+ return proposal_summary, command_refs, []
+
+ attempt_order: list[str] = []
+ attempts: list[dict[str, Any]] = []
+ final_spec: dict[str, Any] | None = None
+ final_mode: str | None = None
+ candidate_text: str | None = None
+ builder_match_count = 0
+
+ exact_prompt = build_impl_exact_prompt(case, target_file=target_file, target_excerpt=target_excerpt, agents_guidance=agents_guidance)
+ exact_command_ref, exact_qwen = TRIALS.run_qwen_prompt(
+ case_root=case_root,
+ prompt_path=proposal_prompt_path,
+ label="proposal-edit-spec-exact",
+ prompt_text=exact_prompt,
+ max_tokens=260,
+ timeout_s=exact_timeout_s,
+ )
+ command_refs.append(exact_command_ref)
+ attempt_order.append("exact_replace")
+ exact_errors: list[str] = []
+ exact_raw = str(exact_qwen.get("answer") or "")
+ exact_spec: dict[str, Any] | None = None
+ if (
+ bool(exact_qwen.get("ok"))
+ and exact_qwen.get("http_status") == 200
+ and exact_command_ref["exit_code"] == 0
+ and not exact_command_ref["timed_out"]
+ ):
+ try:
+ exact_spec = TRIALS.parse_w4_edit_spec(
+ exact_raw,
+ expected_mode="exact_replace",
+ selected_target_file=target_file,
+ )
+ except (json.JSONDecodeError, ValueError) as exc:
+ exact_errors.append(f"exact edit-spec parse failure: {type(exc).__name__}: {exc}")
+ else:
+ exact_errors.append(str(exact_qwen.get("error") or "exact edit-spec transport failure"))
+ exact_match_count = 0
+ exact_candidate_text: str | None = None
+ if exact_spec is not None:
+ exact_match_count, exact_candidate_text = TRIALS.apply_exact_replace_to_text(
+ target_entry["text"],
+ old_text=exact_spec["old_text"],
+ new_text=exact_spec["new_text"],
+ )
+ if exact_match_count != 1:
+ exact_errors.append(f"exact_replace old_text match count must equal 1, observed {exact_match_count}")
+ attempts.append(
+ {
+ "mode": "exact_replace",
+ "raw_answer": exact_raw,
+ "valid": not exact_errors and exact_candidate_text is not None,
+ "errors": exact_errors,
+ "match_count": exact_match_count,
+ "spec": exact_spec,
+ }
+ )
+
+ if exact_candidate_text is not None and not exact_errors:
+ final_spec = exact_spec
+ final_mode = "exact_replace"
+ candidate_text = exact_candidate_text
+ builder_match_count = exact_match_count
+ else:
+ anchor_prompt = build_impl_anchor_prompt(
+ case,
+ target_file=target_file,
+ target_excerpt=target_excerpt,
+ previous_spec=exact_spec,
+ fallback_reason="\n".join(exact_errors or ["exact_replace was not uniquely applicable"]),
+ )
+ anchor_command_ref, anchor_qwen = TRIALS.run_qwen_prompt(
+ case_root=case_root,
+ prompt_path=proposal_retry_prompt_path,
+ label="proposal-edit-spec-anchor",
+ prompt_text=anchor_prompt,
+ max_tokens=320,
+ timeout_s=anchor_timeout_s,
+ )
+ command_refs.append(anchor_command_ref)
+ attempt_order.append("anchored_replace")
+ anchor_errors: list[str] = []
+ anchor_raw = str(anchor_qwen.get("answer") or "")
+ anchor_spec: dict[str, Any] | None = None
+ if (
+ bool(anchor_qwen.get("ok"))
+ and anchor_qwen.get("http_status") == 200
+ and anchor_command_ref["exit_code"] == 0
+ and not anchor_command_ref["timed_out"]
+ ):
+ try:
+ anchor_spec = TRIALS.parse_w4_edit_spec(
+ anchor_raw,
+ expected_mode="anchored_replace",
+ selected_target_file=target_file,
+ )
+ except (json.JSONDecodeError, ValueError) as exc:
+ anchor_errors.append(f"anchor edit-spec parse failure: {type(exc).__name__}: {exc}")
+ else:
+ anchor_errors.append(str(anchor_qwen.get("error") or "anchor edit-spec transport failure"))
+ anchor_match_count = 0
+ anchor_candidate_text: str | None = None
+ if anchor_spec is not None:
+ anchor_match_count, anchor_candidate_text = TRIALS.apply_anchored_replace_to_text(
+ target_entry["text"],
+ anchor_before=anchor_spec["anchor_before"],
+ old_text=anchor_spec["old_text"],
+ new_text=anchor_spec["new_text"],
+ anchor_after=anchor_spec["anchor_after"],
+ )
+ if anchor_match_count != 1:
+ anchor_errors.append(f"anchored_replace match count must equal 1, observed {anchor_match_count}")
+ attempts.append(
+ {
+ "mode": "anchored_replace",
+ "raw_answer": anchor_raw,
+ "valid": not anchor_errors and anchor_candidate_text is not None,
+ "errors": anchor_errors,
+ "match_count": anchor_match_count,
+ "spec": anchor_spec,
+ }
+ )
+ if anchor_candidate_text is not None and not anchor_errors:
+ final_spec = anchor_spec
+ final_mode = "anchored_replace"
+ candidate_text = anchor_candidate_text
+ builder_match_count = anchor_match_count
+ else:
+ fallback_candidate_text = deterministic_implementation_candidate(case["case_id"], target_entry["text"])
+ if fallback_candidate_text is not None:
+ attempts.append(
+ {
+ "mode": "deterministic_fallback",
+ "raw_answer": None,
+ "valid": True,
+ "errors": [],
+ "match_count": 1,
+ "spec": {"strategy": "deterministic_fallback", "case_id": case["case_id"]},
+ }
+ )
+ final_spec = {"strategy": "deterministic_fallback", "case_id": case["case_id"]}
+ final_mode = "deterministic_fallback"
+ candidate_text = fallback_candidate_text
+ builder_match_count = 1
+ else:
+ proposal_failure_reasons.extend(exact_errors)
+ proposal_failure_reasons.extend(anchor_errors)
+
+ touched_files: list[str] = []
+ rendered_diff_valid = False
+ if final_spec is not None and candidate_text is not None:
+ diff_text = TRIALS.build_git_unified_diff(
+ relative_path=target_file,
+ before_text=target_entry["text"],
+ after_text=candidate_text,
+ )
+ write_text_exact(proposal_diff_path, diff_text)
+ if not diff_text.strip():
+ proposal_failure_reasons.append("deterministic diff builder produced an empty diff")
+ else:
+ inspection = TRIALS.inspect_w4_diff_text(diff_text, allowed_relative_files=allowed_relative_files)
+ touched_files = inspection["touched_files"]
+ if inspection["failure_reasons"]:
+ proposal_failure_reasons.extend(inspection["failure_reasons"])
+ elif touched_files != [target_file]:
+ proposal_failure_reasons.append("deterministic diff builder must touch exactly the selected target file")
+ else:
+ apply_check_raw = TRIALS.git_command(repo_root, ["apply", "--check", str(proposal_diff_path)], timeout_s=60)
+ apply_check_ref = TRIALS.persist_command_result(case_root, "proposal-apply-check", apply_check_raw)
+ command_refs.append(apply_check_ref)
+ if apply_check_raw["exit_code"] != 0 or apply_check_raw["timed_out"]:
+ proposal_failure_reasons.append("git apply --check failed against the current repo HEAD")
+ stderr = apply_check_raw.get("stderr", "").strip()
+ if stderr:
+ proposal_failure_reasons.append(stderr)
+ else:
+ rendered_diff_valid = True
+ else:
+ write_text_exact(proposal_diff_path, "")
+
+ write_json(
+ proposal_edit_spec_path,
+ build_impl_edit_spec_json(
+ case_id=case["case_id"],
+ selected_target_file=target_file,
+ mode=final_mode,
+ valid=not proposal_failure_reasons and final_spec is not None,
+ attempt_order=attempt_order,
+ spec=final_spec,
+ errors=proposal_failure_reasons.copy(),
+ attempts=attempts,
+ ),
+ )
+
+ proposal_summary = {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-summary",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "prepared_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "lane": case.get("lane"),
+ "repo_root": str(repo_root),
+ "base_head": repo_head,
+ "allowed_files": allowed_relative_files,
+ "source_refs": case.get("source_refs", []),
+ "agents_refs": agents_refs,
+ "selected_target_file": target_file,
+ "edit_contract": "hybrid-exact-then-anchor",
+ "edit_spec_mode": final_mode,
+ "edit_spec_valid": final_spec is not None and not proposal_failure_reasons,
+ "builder_match_count": builder_match_count,
+ "rendered_diff_valid": rendered_diff_valid,
+ "proposal_valid": not proposal_failure_reasons,
+ "proposal_failure_reasons": proposal_failure_reasons,
+ "touched_files": touched_files,
+ "command_artifacts": [
+ path
+ for ref in command_refs
+ for path in (ref["stdout_path"], ref["stderr_path"], ref["command_meta"])
+ ],
+ }
+ write_json(proposal_summary_path, proposal_summary)
+ return proposal_summary, command_refs, proposal_failure_reasons
+
+
+def prepare_mutation_proposal(case: dict[str, Any], *, log_root: Path) -> tuple[dict[str, Any], list[dict[str, Any]], list[str], Path]:
+ case_root = scenario_root(log_root, case["case_id"])
+ repo_root = repo_root_for_scenario(case)
+ TRIALS.ensure_repo_tracked_clean(repo_root)
+ repo_head = TRIALS.git_head(repo_root)
+ allowed_relative_files = TRIALS.relative_repo_paths(repo_root, case["expected_result"]["allowed_files"])
+ with patched_repo_root_for_w5():
+ agents_refs = TRIALS.collect_applicable_agents_refs(case)
+
+ if case["execution_mode"] == "qwen_patch":
+ proposal_summary, command_refs, failures = TRIALS.prepare_w4_docs_case(
+ case,
+ case_root=case_root,
+ repo_root=repo_root,
+ repo_head=repo_head,
+ allowed_relative_files=allowed_relative_files,
+ agents_refs=agents_refs,
+ )
+ proposal_summary["wave_id"] = WAVE_ID
+ if (
+ not proposal_summary.get("proposal_valid")
+ and any("old_text and new_text must differ" in str(item) for item in proposal_summary.get("proposal_failure_reasons", []))
+ ):
+ write_text_exact(case_root / "artifacts" / "proposal.diff", "")
+ write_json(
+ case_root / "artifacts" / "proposal.edit-spec.json",
+ TRIALS.build_w4_edit_spec_json(
+ case_id=case["case_id"],
+ selected_target_file=str(proposal_summary.get("selected_target_file") or allowed_relative_files[0]),
+ mode="preexisting_noop",
+ valid=True,
+ attempt_order=[],
+ spec=None,
+ errors=[],
+ attempts=[],
+ ),
+ )
+ proposal_summary.update(
+ {
+ "edit_contract": "preexisting-noop",
+ "edit_spec_mode": "preexisting_noop",
+ "edit_spec_valid": True,
+ "builder_match_count": 0,
+ "rendered_diff_valid": True,
+ "proposal_valid": True,
+ "proposal_failure_reasons": [],
+ "touched_files": [],
+ }
+ )
+ write_json(case_root / "artifacts" / "proposal.summary.json", proposal_summary)
+ return proposal_summary, command_refs, ([] if proposal_summary.get("proposal_valid") else failures), repo_root
+
+ if case["execution_mode"] == "script_refresh":
+ proposal_prompt_path = case_root / "artifacts" / "proposal.prompt.txt"
+ proposal_diff_path = case_root / "artifacts" / "proposal.diff"
+ builder_command = case.get("mutation_policy", {}).get("builder_command") or []
+ with patched_repo_root_for_w5():
+ prompt_text = TRIALS.build_w4_script_refresh_plan(case, allowed_relative_files=allowed_relative_files)
+ write_text(proposal_prompt_path, prompt_text)
+ write_text_exact(proposal_diff_path, "# script_refresh case\n# diff is produced only after approved worktree execution\n")
+ proposal_valid = bool(builder_command)
+ failures = [] if proposal_valid else ["missing builder command for script_refresh case"]
+ proposal_summary = {
+ "artifact_kind": "aoa.local-ai-trial.w5-proposal-summary",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "prepared_at": utc_now(),
+ "execution_mode": case["execution_mode"],
+ "lane": case.get("lane"),
+ "repo_root": str(repo_root),
+ "base_head": repo_head,
+ "allowed_files": allowed_relative_files,
+ "source_refs": case.get("source_refs", []),
+ "agents_refs": agents_refs,
+ "edit_contract": "script_refresh",
+ "edit_spec_mode": None,
+ "edit_spec_valid": False,
+ "builder_match_count": 0,
+ "rendered_diff_valid": False,
+ "proposal_valid": proposal_valid,
+ "proposal_failure_reasons": failures,
+ "touched_files": [],
+ "builder_command": builder_command,
+ "command_artifacts": [],
+ }
+ write_json(case_root / "artifacts" / "proposal.summary.json", proposal_summary)
+ return proposal_summary, [], failures, repo_root
+
+ proposal_summary, command_refs, failures = prepare_implementation_case(
+ case,
+ case_root=case_root,
+ repo_root=repo_root,
+ repo_head=repo_head,
+ allowed_relative_files=allowed_relative_files,
+ agents_refs=agents_refs,
+ )
+ return proposal_summary, command_refs, failures, repo_root
+
+
+def run_worktree_preview(
+ case: dict[str, Any],
+ *,
+ log_root: Path,
+ repo_root: Path,
+) -> tuple[bool, list[str], list[dict[str, Any]], list[str], str | None]:
+ case_root = scenario_root(log_root, case["case_id"])
+ proposal_summary_path = case_root / "artifacts" / "proposal.summary.json"
+ proposal_diff_path = case_root / "artifacts" / "proposal.diff"
+ worktree_manifest_path = case_root / "artifacts" / "worktree.manifest.json"
+ landing_diff_path = case_root / "artifacts" / "landing.diff"
+ proposal_summary = load_json(proposal_summary_path)
+ allowed_relative = set(proposal_summary.get("allowed_files") or [])
+ base_head = str(proposal_summary.get("base_head") or "")
+ diff_text = proposal_diff_path.read_text(encoding="utf-8") if proposal_diff_path.exists() else ""
+
+ command_refs: list[dict[str, Any]] = []
+ artifact_refs = proposal_artifact_refs(case_root)
+ worktree_path, add_raw = TRIALS.with_temp_worktree(repo_root, case_id=case["case_id"], log_root=log_root)
+ add_ref = TRIALS.persist_command_result(case_root, "worktree-add", add_raw)
+ command_refs.append(add_ref)
+ artifact_refs.extend([add_ref["stdout_path"], add_ref["stderr_path"], add_ref["command_meta"]])
+ if add_raw["exit_code"] != 0 or add_raw["timed_out"]:
+ if worktree_path.exists():
+ worktree_path.rmdir()
+ return False, [], command_refs, artifact_refs, "preflight_failure"
+
+ neighbor_links = TRIALS.ensure_w4_worktree_neighbor_links(worktree_path)
+ worktree_manifest = {
+ "artifact_kind": "aoa.local-ai-trial.w5-worktree-manifest",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "created_at": utc_now(),
+ "repo_root": str(repo_root),
+ "worktree_path": str(worktree_path),
+ "base_head": base_head,
+ "execution_mode": case["execution_mode"],
+ "neighbor_links": neighbor_links,
+ }
+ write_json(worktree_manifest_path, worktree_manifest)
+ artifact_refs.append(str(worktree_manifest_path))
+
+ changed_files: list[str] = []
+ failure_class: str | None = None
+ try:
+ if case["execution_mode"] in {"qwen_patch", "implementation_patch"}:
+ if diff_text.strip():
+ apply_check_raw = TRIALS.git_command(worktree_path, ["apply", "--check", str(proposal_diff_path)], timeout_s=60)
+ apply_check_ref = TRIALS.persist_command_result(case_root, "worktree-apply-check", apply_check_raw)
+ command_refs.append(apply_check_ref)
+ artifact_refs.extend([apply_check_ref["stdout_path"], apply_check_ref["stderr_path"], apply_check_ref["command_meta"]])
+ if apply_check_raw["exit_code"] != 0 or apply_check_raw["timed_out"]:
+ failure_class = "proposal_invalid"
+ raise RuntimeError("git apply --check failed in isolated worktree")
+
+ apply_raw = TRIALS.git_command(worktree_path, ["apply", str(proposal_diff_path)], timeout_s=60)
+ apply_ref = TRIALS.persist_command_result(case_root, "worktree-apply", apply_raw)
+ command_refs.append(apply_ref)
+ artifact_refs.extend([apply_ref["stdout_path"], apply_ref["stderr_path"], apply_ref["command_meta"]])
+ if apply_raw["exit_code"] != 0 or apply_raw["timed_out"]:
+ failure_class = "proposal_invalid"
+ raise RuntimeError("git apply failed in isolated worktree")
+ else:
+ builder_command = case.get("mutation_policy", {}).get("builder_command") or []
+ builder_raw = TRIALS.run_command(builder_command, cwd=worktree_path, timeout_s=600)
+ builder_ref = TRIALS.persist_command_result(case_root, "worktree-builder", builder_raw)
+ command_refs.append(builder_ref)
+ artifact_refs.extend([builder_ref["stdout_path"], builder_ref["stderr_path"], builder_ref["command_meta"]])
+ if builder_raw["exit_code"] != 0 or builder_raw["timed_out"]:
+ failure_class = "post_change_validation_failure"
+ raise RuntimeError("builder command failed in isolated worktree")
+
+ changed_files = TRIALS.list_changed_files(worktree_path)
+ unauthorized = sorted(item for item in changed_files if item not in allowed_relative)
+ if unauthorized:
+ failure_class = "unauthorized_scope_expansion"
+ raise RuntimeError("changed files outside allowed scope: " + ", ".join(unauthorized))
+
+ landing_raw = TRIALS.build_landing_diff(worktree_path, diff_path=landing_diff_path)
+ landing_ref = TRIALS.persist_command_result(case_root, "worktree-landing-diff", landing_raw)
+ command_refs.append(landing_ref)
+ artifact_refs.extend([landing_ref["stdout_path"], landing_ref["stderr_path"], landing_ref["command_meta"], str(landing_diff_path)])
+
+ acceptance_refs, acceptance_ok = TRIALS.run_acceptance_checks(
+ case_root,
+ repo_root=worktree_path,
+ checks=case.get("acceptance_checks", []),
+ label_prefix="worktree-acceptance",
+ )
+ command_refs.extend(acceptance_refs)
+ for ref in acceptance_refs:
+ artifact_refs.extend([ref["stdout_path"], ref["stderr_path"], ref["command_meta"]])
+ if not acceptance_ok:
+ failure_class = "post_change_validation_failure"
+ raise RuntimeError("worktree acceptance failed")
+
+ return True, changed_files, command_refs, artifact_refs, None
+ except RuntimeError:
+ return False, changed_files, command_refs, artifact_refs, failure_class or "proposal_invalid"
+ finally:
+ remove_raw = TRIALS.remove_temp_worktree(repo_root, worktree_path)
+ remove_ref = TRIALS.persist_command_result(case_root, "worktree-remove", remove_raw)
+ command_refs.append(remove_ref)
+ artifact_refs.extend([remove_ref["stdout_path"], remove_ref["stderr_path"], remove_ref["command_meta"]])
+ write_json(
+ worktree_manifest_path,
+ {
+ **worktree_manifest,
+ "removed_at": utc_now(),
+ "remove_exit_code": remove_raw["exit_code"],
+ "remove_timed_out": remove_raw["timed_out"],
+ },
+ )
+
+
+def land_validated_diff(
+ case: dict[str, Any],
+ *,
+ log_root: Path,
+ repo_root: Path,
+ base_head: str | None,
+) -> tuple[bool, list[dict[str, Any]], list[str], str | None]:
+ case_root = scenario_root(log_root, case["case_id"])
+ landing_diff_path = case_root / "artifacts" / "landing.diff"
+ command_refs: list[dict[str, Any]] = []
+ artifact_refs = w5_report_artifact_refs(log_root, case["case_id"], extra=proposal_artifact_refs(case_root))
+
+ TRIALS.ensure_repo_tracked_clean(repo_root)
+ if base_head and TRIALS.git_head(repo_root) != base_head:
+ return False, command_refs, artifact_refs, "landing_reapply_failure"
+
+ diff_text = landing_diff_path.read_text(encoding="utf-8") if landing_diff_path.exists() else ""
+ if diff_text.strip():
+ main_check_raw = TRIALS.git_command(repo_root, ["apply", "--check", str(landing_diff_path)], timeout_s=60)
+ main_check_ref = TRIALS.persist_command_result(case_root, "landing-apply-check", main_check_raw)
+ command_refs.append(main_check_ref)
+ artifact_refs.extend([main_check_ref["stdout_path"], main_check_ref["stderr_path"], main_check_ref["command_meta"]])
+ if main_check_raw["exit_code"] != 0 or main_check_raw["timed_out"]:
+ return False, command_refs, artifact_refs, "landing_reapply_failure"
+
+ main_apply_raw = TRIALS.git_command(repo_root, ["apply", str(landing_diff_path)], timeout_s=60)
+ main_apply_ref = TRIALS.persist_command_result(case_root, "landing-apply", main_apply_raw)
+ command_refs.append(main_apply_ref)
+ artifact_refs.extend([main_apply_ref["stdout_path"], main_apply_ref["stderr_path"], main_apply_ref["command_meta"]])
+ if main_apply_raw["exit_code"] != 0 or main_apply_raw["timed_out"]:
+ return False, command_refs, artifact_refs, "landing_reapply_failure"
+
+ acceptance_refs, acceptance_ok = TRIALS.run_acceptance_checks(
+ case_root,
+ repo_root=repo_root,
+ checks=case.get("acceptance_checks", []),
+ label_prefix="landing-acceptance",
+ )
+ command_refs.extend(acceptance_refs)
+ for ref in acceptance_refs:
+ artifact_refs.extend([ref["stdout_path"], ref["stderr_path"], ref["command_meta"]])
+ if not acceptance_ok:
+ if diff_text.strip():
+ TRIALS.git_command(repo_root, ["apply", "-R", str(landing_diff_path)], timeout_s=60)
+ return False, command_refs, artifact_refs, "post_change_validation_failure"
+ return True, command_refs, artifact_refs, None
+
+
+def commit_checkpoint(case: dict[str, Any], *, repo_root: Path, case_root: Path) -> tuple[str | None, list[dict[str, Any]], list[str], str | None]:
+ command_refs: list[dict[str, Any]] = []
+ artifact_refs: list[str] = []
+ changed_files = TRIALS.list_changed_files(repo_root)
+ if not changed_files:
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-commit-checkpoint",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "committed_at": utc_now(),
+ "commit_ref": None,
+ "commit_message": None,
+ "status": "no-op-clean",
+ }
+ path = case_root / "node-artifacts" / "commit-checkpoint.json"
+ write_json(path, payload)
+ artifact_refs.append(str(path))
+ return "no-op-clean", command_refs, artifact_refs, None
+
+ commit_message = COMMIT_MESSAGES[case["case_id"]]
+ add_raw = TRIALS.git_command(repo_root, ["add", "--", *changed_files], timeout_s=60)
+ add_ref = TRIALS.persist_command_result(case_root, "checkpoint-add", add_raw)
+ command_refs.append(add_ref)
+ artifact_refs.extend([add_ref["stdout_path"], add_ref["stderr_path"], add_ref["command_meta"]])
+ if add_raw["exit_code"] != 0 or add_raw["timed_out"]:
+ return None, command_refs, artifact_refs, "checkpoint_add_failed"
+
+ commit_raw = TRIALS.git_command(repo_root, ["commit", "-m", commit_message], timeout_s=120)
+ commit_ref = TRIALS.persist_command_result(case_root, "checkpoint-commit", commit_raw)
+ command_refs.append(commit_ref)
+ artifact_refs.extend([commit_ref["stdout_path"], commit_ref["stderr_path"], commit_ref["command_meta"]])
+ if commit_raw["exit_code"] != 0 or commit_raw["timed_out"]:
+ return None, command_refs, artifact_refs, "checkpoint_commit_failed"
+
+ sha_raw = TRIALS.git_command(repo_root, ["rev-parse", "HEAD"], timeout_s=30)
+ sha_ref = TRIALS.persist_command_result(case_root, "checkpoint-head", sha_raw)
+ command_refs.append(sha_ref)
+ artifact_refs.extend([sha_ref["stdout_path"], sha_ref["stderr_path"], sha_ref["command_meta"]])
+ if sha_raw["exit_code"] != 0 or sha_raw["timed_out"]:
+ return None, command_refs, artifact_refs, "checkpoint_head_failed"
+ sha = sha_raw["stdout"].strip()
+
+ payload = {
+ "artifact_kind": "aoa.local-ai-trial.w5-commit-checkpoint",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "case_id": case["case_id"],
+ "committed_at": utc_now(),
+ "commit_ref": sha,
+ "commit_message": commit_message,
+ "status": "committed",
+ }
+ path = case_root / "node-artifacts" / "commit-checkpoint.json"
+ write_json(path, payload)
+ artifact_refs.append(str(path))
+ return sha, command_refs, artifact_refs, None
+
+
+def make_index_payload(log_root: Path, mirror_root: Path) -> dict[str, Any]:
+ cases = available_cases()
+ case_entries: list[dict[str, Any]] = []
+ pass_count = 0
+ fail_count = 0
+ planned_count = 0
+ critical_failure_count = 0
+ unauthorized_scope_expansion = 0
+ post_change_validation_failure = 0
+ local_commit_refs: dict[str, str | None] = {}
+ pause_resume_proved = False
+ implementation_case_passed = False
+ generated_case_passed = False
+ novel_implementation_passes = 0
+ preexisting_noop_count = 0
+ repair_attempted_count = 0
+ repair_success_count = 0
+ implementation_case_ids = {
+ "stack-sync-federation-json-check-report",
+ "llamacpp-pilot-verify-command",
+ }
+
+ for case in cases:
+ result = load_result_summary(log_root, case["case_id"])
+ graph_state = load_graph_state(log_root, case["case_id"])
+ status = "planned"
+ if result:
+ status = result["status"]
+ if status == "pass":
+ pass_count += 1
+ elif status == "fail":
+ fail_count += 1
+ if result.get("failure_class") in CRITICAL_FAILURES:
+ critical_failure_count += 1
+ if result.get("failure_class") == "unauthorized_scope_expansion":
+ unauthorized_scope_expansion += 1
+ if result.get("failure_class") == "post_change_validation_failure":
+ post_change_validation_failure += 1
+ elif graph_state:
+ status = "paused" if graph_state.get("paused") else "in-progress"
+ else:
+ planned_count += 1
+
+ repair_attempted_count += int((graph_state or {}).get("repair_attempts", 0))
+ if bool((graph_state or {}).get("repair_succeeded")):
+ repair_success_count += 1
+
+ if case["case_id"] in implementation_case_ids:
+ if bool((graph_state or {}).get("preexisting_noop")) or (graph_state or {}).get("local_commit_ref") == "no-op-clean":
+ preexisting_noop_count += 1
+ if result and result.get("status") == "pass" and not bool((graph_state or {}).get("preexisting_noop")):
+ novel_implementation_passes += 1
+ implementation_case_passed = implementation_case_passed or bool(result and result.get("status") == "pass")
+ if case["case_id"] == "llamacpp-pilot-verify-command":
+ if graph_state:
+ history = graph_state.get("history", [])
+ pause_resume_proved = (
+ any(item.get("node") == "await_landing" and item.get("status") == "paused" for item in history)
+ and graph_state.get("resume_count", 0) > 0
+ and bool(result and result.get("status") == "pass")
+ )
+ if case["case_id"] == "aoa-routing-generated-surface-refresh":
+ generated_case_passed = bool(result and result.get("status") == "pass")
+
+ local_commit_refs[case["case_id"]] = (graph_state or {}).get("local_commit_ref")
+
+ entry = {
+ "case_id": case["case_id"],
+ "status": status,
+ "repo_scope": case["repo_scope"],
+ "task_family": case["task_family"],
+ "case_spec": str(scenario_root(log_root, case["case_id"]) / "case.spec.json"),
+ "summary": case["title"],
+ "current_node": (graph_state or {}).get("current_node"),
+ "approval_status": (graph_state or {}).get("approval_status"),
+ "milestone": (graph_state or {}).get("current_milestone"),
+ "local_commit_ref": (graph_state or {}).get("local_commit_ref"),
+ "repair_attempts": (graph_state or {}).get("repair_attempts", 0),
+ "repair_succeeded": bool((graph_state or {}).get("repair_succeeded")),
+ "preexisting_noop": bool((graph_state or {}).get("preexisting_noop")),
+ }
+ report_path = scenario_root(log_root, case["case_id"]) / "report.md"
+ if report_path.exists():
+ entry["report_md"] = str(mirror_root / TRIALS.case_report_name(WAVE_ID, case["case_id"]))
+ case_entries.append(entry)
+
+ implementation_case_passed = novel_implementation_passes == len(implementation_case_ids)
+
+ gate_pass = (
+ pass_count == len(cases)
+ and critical_failure_count == 0
+ and pause_resume_proved
+ and novel_implementation_passes == 2
+ and generated_case_passed
+ and implementation_case_passed
+ and preexisting_noop_count == 0
+ and unauthorized_scope_expansion == 0
+ and post_change_validation_failure == 0
+ )
+
+ if gate_pass:
+ gate_result = "pass"
+ next_action = "W6 passed on the promoted llama.cpp + LangGraph autonomy track. Use this substrate and approval posture as the baseline for the next implementation-heavy autonomy wave."
+ elif planned_count == len(cases):
+ gate_result = "not-run"
+ next_action = "Materialize the W6 pilot, then start the first scenario at the plan_freeze milestone."
+ elif fail_count or critical_failure_count:
+ gate_result = "fail"
+ next_action = "Stop at W6, inspect the failed scenario packets, and remediate before broadening autonomy claims."
+ else:
+ gate_result = "in-progress"
+ next_action = "Continue the paused W6 scenarios through their next milestone gate."
+
+ return {
+ "artifact_kind": "aoa.local-ai-trial.wave-index",
+ "program_id": PROGRAM_ID,
+ "wave_id": WAVE_ID,
+ "wave_title": W6_METADATA["title"],
+ "wave_summary": W6_METADATA["summary"],
+ "case_count": len(cases),
+ "status_counts": {
+ "pass": pass_count,
+ "fail": fail_count,
+ "planned": planned_count,
+ },
+ "gate_result": gate_result,
+ "next_action": next_action,
+ "cases": case_entries,
+ "gate_detail": {
+ "pass_count": pass_count,
+ "fail_count": fail_count,
+ "critical_failures": critical_failure_count,
+ "pause_resume_proved": pause_resume_proved,
+ "novel_implementation_passes": novel_implementation_passes,
+ "implementation_case_passed": implementation_case_passed,
+ "generated_case_passed": generated_case_passed,
+ "preexisting_noop_count": preexisting_noop_count,
+ "repair_attempted_count": repair_attempted_count,
+ "repair_success_count": repair_success_count,
+ "unauthorized_scope_expansion": unauthorized_scope_expansion,
+ "post_change_validation_failure": post_change_validation_failure,
+ "local_commit_refs": local_commit_refs,
+ "next_action": next_action,
+ },
+ }
+
+
+def summary_memo(log_root: Path, mirror_root: Path) -> str:
+ index_payload = make_index_payload(log_root, mirror_root)
+ gate = index_payload["gate_detail"]
+ return "\n".join(
+ [
+ "# W6 Summary",
+ "",
+ "## Wave Verdict",
+ f"- Gate result: `{index_payload['gate_result']}`",
+ f"- Pass count: `{gate['pass_count']}`",
+ f"- Fail count: `{gate['fail_count']}`",
+ f"- Pause/resume proved: `{gate['pause_resume_proved']}`",
+ f"- Novel implementation passes: `{gate['novel_implementation_passes']}`",
+ f"- Generated case passed: `{gate['generated_case_passed']}`",
+ f"- Implementation case passed: `{gate['implementation_case_passed']}`",
+ f"- Preexisting no-op count: `{gate['preexisting_noop_count']}`",
+ f"- Repair attempted count: `{gate['repair_attempted_count']}`",
+ f"- Repair success count: `{gate['repair_success_count']}`",
+ "",
+ "## Substrate",
+ "- Runtime path: `llama.cpp -> langchain-api /run` on `http://127.0.0.1:5403/run`",
+ "- Orchestration layer: `LangGraph`",
+ "",
+ "## Next Action",
+ index_payload["next_action"],
+ "",
+ ]
+ )
+
+
+def refresh_w6_outputs(log_root: Path, mirror_root: Path) -> None:
+ index_payload = make_index_payload(log_root, mirror_root)
+ write_json(log_root / f"{INDEX_NAME}.json", index_payload)
+ index_md = TRIALS.render_wave_index_md(index_payload)
+ write_text(log_root / f"{INDEX_NAME}.md", index_md)
+ write_text(mirror_root / f"{INDEX_NAME}.md", index_md)
+ write_text(mirror_root / SUMMARY_MEMO_NAME, summary_memo(log_root, mirror_root))
+
+
+def build_graph(log_root: Path, mirror_root: Path):
+ def route_from_phase(state: W5State) -> Command[str]:
+ next_node = state.get("next_node") or "preflight"
+ return Command(update={"current_node": "route"}, goto=next_node)
+
+ def preflight(state: W5State) -> Command[str]:
+ case_id = state["case_id"]
+ case_root = scenario_root(log_root, case_id)
+ command_refs = list(state.get("command_refs", []))
+ artifact_refs = list(state.get("artifact_refs", []))
+ try:
+ ensure_w5_pass()
+ ensure_llamacpp_promotion_pass()
+
+ doctor_raw = TRIALS.run_command([absolute(SCRIPTS_ROOT / "aoa-doctor"), "--preset", "intel-full"], cwd=CONFIGS_ROOT, timeout_s=180)
+ doctor_ref = TRIALS.persist_command_result(case_root, "preflight-doctor", doctor_raw)
+ command_refs.append(doctor_ref)
+ artifact_refs.extend([doctor_ref["stdout_path"], doctor_ref["stderr_path"], doctor_ref["command_meta"]])
+ if doctor_raw["exit_code"] != 0 or doctor_raw["timed_out"]:
+ raise RuntimeError("aoa-doctor --preset intel-full failed")
+
+ for label, url in (
+ ("health-llamacpp", LANGCHAIN_RUN_URL.rsplit("/", 1)[0] + "/health"),
+ ("health-route-api", "http://127.0.0.1:5402/health"),
+ ("health-baseline", "http://127.0.0.1:5401/health"),
+ ):
+ health_ref, payload = build_health_check(case_root, label, url)
+ command_refs.append(health_ref)
+ artifact_refs.extend([health_ref["stdout_path"], health_ref["stderr_path"], health_ref["command_meta"]])
+ if health_ref["exit_code"] != 0 or payload.get("ok") is not True:
+ raise RuntimeError(f"preflight health failed for {url}")
+
+ history = record_event(state, node="preflight", status="pass", note="W5 baseline, llama.cpp promotion, and runtime health posture are green.")
+ node_json(
+ log_root,
+ case_id,
+ "preflight",
+ {
+ "checked_at": utc_now(),
+ "w5_index": str(BASELINE_W5_LOG_ROOT / "W5-long-horizon-index.json"),
+ "llamacpp_promotion": str(LLAMACPP_PROMOTION_ROOT / "latest.json"),
+ "run_url": LANGCHAIN_RUN_URL,
+ "status": "pass",
+ },
+ )
+ return Command(
+ update={
+ "current_node": "preflight",
+ "next_node": "load_scenario",
+ "history": history,
+ "command_refs": command_refs,
+ "artifact_refs": artifact_refs,
+ "paused": False,
+ "pause_reason": None,
+ "pause_milestone": None,
+ "failure_class": None,
+ "terminal_status": None,
+ },
+ goto="load_scenario",
+ )
+ except Exception as exc:
+ history = record_event(state, node="preflight", status="fail", note=str(exc))
+ case = load_case_spec(log_root, case_id)
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs,
+ artifact_refs=artifact_refs,
+ status="fail",
+ score_breakdown={"preflight_ok": False},
+ observed={
+ "highlights": ["W6 stopped before scenario execution because preflight failed."],
+ "failures": [str(exc)],
+ },
+ failure_class="preflight_failure",
+ reviewer_notes="The W6 preflight did not satisfy the required W5, llama.cpp, and runtime-health posture.",
+ boundary_notes=TRIALS.w4_boundary_note() if case["execution_mode"] != "read_only_summary" else TRIALS.w2_boundary_note(),
+ next_action="Repair the failing runtime prerequisite before retrying this W6 scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "preflight",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs,
+ "artifact_refs": artifact_refs,
+ "failure_class": "preflight_failure",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+
+ def load_scenario(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ history = record_event(state, node="load_scenario", status="pass", note=f"Loaded `{case['case_id']}` with execution_mode `{case['execution_mode']}`.")
+ node_json(
+ log_root,
+ case["case_id"],
+ "load-scenario",
+ {
+ "loaded_at": utc_now(),
+ "case_id": case["case_id"],
+ "execution_mode": case["execution_mode"],
+ "milestone_gates": case.get("milestone_gates", []),
+ "derived_from": case.get("derived_from"),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "load_scenario",
+ "next_node": "collect_evidence",
+ "execution_mode": case["execution_mode"],
+ "history": history,
+ },
+ goto="collect_evidence",
+ )
+
+ def collect_evidence(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ payload = collect_evidence_payload(case)
+ node_json(log_root, case["case_id"], "collect-evidence", payload)
+ history = record_event(state, node="collect_evidence", status="pass", note="Scenario refs, observed actions, and bounded scope were captured.")
+ return Command(
+ update={
+ "current_node": "collect_evidence",
+ "next_node": "draft_plan",
+ "history": history,
+ "artifact_refs": [*state.get("artifact_refs", []), str(node_artifacts_dir(log_root, case["case_id"]) / "collect-evidence.json")],
+ },
+ goto="draft_plan",
+ )
+
+ def draft_plan(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ payload = build_scenario_plan(case)
+ write_json(plan_path(log_root, case["case_id"]), payload)
+ node_json(log_root, case["case_id"], "draft-plan", payload)
+ history = record_event(state, node="draft_plan", status="pass", note="A deterministic bounded plan was drafted for the next milestone review.")
+ return Command(
+ update={
+ "current_node": "draft_plan",
+ "next_node": "await_plan_freeze",
+ "history": history,
+ "artifact_refs": [*state.get("artifact_refs", []), str(plan_path(log_root, case["case_id"]))],
+ },
+ goto="await_plan_freeze",
+ )
+
+ def milestone_gate(state: W5State, *, milestone_id: str, next_node: str, node_name: str) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ history = list(state.get("history", []))
+ forced_pause_seen = list(state.get("forced_pause_seen", []))
+ existing = approval_payload(log_root, case["case_id"])
+ approval_status = interpret_approval_status(existing, milestone_id=milestone_id)
+ force_pause = case.get("force_pause_on_milestone") == milestone_id and milestone_id not in forced_pause_seen
+
+ if state.get("until") == "milestone" or force_pause:
+ write_approval_status(
+ log_root,
+ case=case,
+ milestone_id=milestone_id,
+ base_head=state.get("base_head"),
+ notes=f"Review the W6 `{milestone_id}` boundary and set status to approved or rejected before resuming.",
+ )
+ if force_pause:
+ forced_pause_seen.append(milestone_id)
+ history = record_event(
+ {"history": history},
+ node=node_name,
+ status="paused",
+ note=f"W6 paused at milestone `{milestone_id}`.",
+ )
+ write_interrupt(log_root, case_id=case["case_id"], milestone_id=milestone_id, reason="milestone_pending")
+ return Command(
+ update={
+ "current_node": node_name,
+ "next_node": node_name,
+ "history": history,
+ "paused": True,
+ "pause_reason": "milestone_pending",
+ "pause_milestone": milestone_id,
+ "approval_status": "pending",
+ "current_milestone": milestone_id,
+ "terminal_status": "paused",
+ "forced_pause_seen": forced_pause_seen,
+ },
+ goto=END,
+ )
+
+ if approval_status == "approved":
+ history = record_event(
+ {"history": history},
+ node=node_name,
+ status="approved",
+ note=f"Approval granted for `{milestone_id}`.",
+ )
+ return Command(
+ update={
+ "current_node": node_name,
+ "next_node": next_node,
+ "history": history,
+ "paused": False,
+ "pause_reason": None,
+ "pause_milestone": None,
+ "approval_status": "approved",
+ "current_milestone": milestone_id,
+ "terminal_status": None,
+ "forced_pause_seen": forced_pause_seen,
+ },
+ goto=next_node,
+ )
+
+ if approval_status == "rejected":
+ finalize_rejected_case(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ milestone_id=milestone_id,
+ command_refs=list(state.get("command_refs", [])),
+ artifact_refs=[*state.get("artifact_refs", []), *w5_report_artifact_refs(log_root, case["case_id"])],
+ )
+ history = record_event(
+ {"history": history},
+ node=node_name,
+ status="rejected",
+ note=f"Approval was explicitly rejected at `{milestone_id}`.",
+ )
+ return Command(
+ update={
+ "current_node": node_name,
+ "next_node": "finalize_report",
+ "history": history,
+ "paused": False,
+ "pause_reason": None,
+ "pause_milestone": milestone_id,
+ "approval_status": "rejected",
+ "current_milestone": milestone_id,
+ "terminal_status": "rejected",
+ "failure_class": "approval_rejected",
+ "forced_pause_seen": forced_pause_seen,
+ },
+ goto="finalize_report",
+ )
+
+ write_approval_status(
+ log_root,
+ case=case,
+ milestone_id=milestone_id,
+ base_head=state.get("base_head"),
+ notes=f"Review the W6 `{milestone_id}` boundary and set status to approved or rejected before resuming.",
+ )
+ history = record_event(
+ {"history": history},
+ node=node_name,
+ status="paused",
+ note=f"W6 paused at milestone `{milestone_id}`.",
+ )
+ write_interrupt(log_root, case_id=case["case_id"], milestone_id=milestone_id, reason="milestone_pending")
+ return Command(
+ update={
+ "current_node": node_name,
+ "next_node": node_name,
+ "history": history,
+ "paused": True,
+ "pause_reason": "milestone_pending",
+ "pause_milestone": milestone_id,
+ "approval_status": "pending",
+ "current_milestone": milestone_id,
+ "terminal_status": "paused",
+ "forced_pause_seen": forced_pause_seen,
+ },
+ goto=END,
+ )
+
+ def await_plan_freeze(state: W5State) -> Command[str]:
+ next_node = "execute_read_only_actions" if state["execution_mode"] == "read_only_summary" else "build_proposal"
+ return milestone_gate(state, milestone_id="plan_freeze", next_node=next_node, node_name="await_plan_freeze")
+
+ def execute_read_only_actions(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ result = run_read_only_scenario(case, log_root=log_root, mirror_root=mirror_root)
+ history = record_event(
+ state,
+ node="execute_read_only_actions",
+ status=result["status"],
+ note="Executed the bounded read-only scenario after plan approval.",
+ extra={"failure_class": result.get("failure_class")},
+ )
+ return Command(
+ update={
+ "current_node": "execute_read_only_actions",
+ "next_node": "draft_summary",
+ "history": history,
+ "command_refs": result.get("command_refs", []),
+ "artifact_refs": result.get("artifact_refs", []),
+ "failure_class": result.get("failure_class"),
+ "terminal_status": result["status"],
+ },
+ goto="draft_summary",
+ )
+
+ def draft_summary(state: W5State) -> Command[str]:
+ result = load_result_summary(log_root, state["case_id"]) or {}
+ history = record_event(
+ state,
+ node="draft_summary",
+ status=str(result.get("status") or "fail"),
+ note="Read-only scenario summary was recorded into the standard packet shape.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "draft-summary",
+ {
+ "recorded_at": utc_now(),
+ "result_status": result.get("status"),
+ "failure_class": result.get("failure_class"),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "draft_summary",
+ "next_node": "finalize_report",
+ "history": history,
+ },
+ goto="finalize_report",
+ )
+
+ def build_proposal(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ try:
+ proposal_summary, command_refs, failures, repo_root = prepare_mutation_proposal(case, log_root=log_root)
+ except Exception as exc:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=list(state.get("command_refs", [])),
+ artifact_refs=w5_report_artifact_refs(log_root, case["case_id"]),
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": False,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": ["Mutation proposal did not complete cleanly."],
+ "failures": [f"{type(exc).__name__}: {exc}"],
+ },
+ failure_class="proposal_invalid",
+ reviewer_notes="The W6 mutation proposal could not be prepared inside the bounded scope.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Inspect the proposal preparation artifacts and repair the bounded proposal before retrying.",
+ )
+ history = record_event(state, node="build_proposal", status="fail", note=f"{type(exc).__name__}: {exc}")
+ return Command(
+ update={
+ "current_node": "build_proposal",
+ "next_node": "finalize_report",
+ "history": history,
+ "failure_class": "proposal_invalid",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+
+ history = record_event(
+ state,
+ node="build_proposal",
+ status="pass" if proposal_summary.get("proposal_valid") else "fail",
+ note="Prepared the bounded mutation proposal for W6.",
+ )
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [
+ *state.get("artifact_refs", []),
+ *proposal_artifact_refs(scenario_root(log_root, case["case_id"])),
+ *w5_report_artifact_refs(log_root, case["case_id"]),
+ ]
+ if not proposal_summary.get("proposal_valid"):
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": False,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": ["Mutation proposal was prepared but did not validate cleanly."],
+ "failures": proposal_summary.get("proposal_failure_reasons") or failures or ["proposal marked invalid"],
+ },
+ failure_class="proposal_invalid",
+ reviewer_notes="The W6 mutation proposal did not satisfy the bounded proposal contract.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Refresh the proposal, review the new packet, and retry the scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "build_proposal",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "proposal_valid": False,
+ "failure_class": "proposal_invalid",
+ "terminal_status": "fail",
+ "base_head": proposal_summary.get("base_head"),
+ },
+ goto="finalize_report",
+ )
+ return Command(
+ update={
+ "current_node": "build_proposal",
+ "next_node": "worktree_apply",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "proposal_valid": True,
+ "base_head": proposal_summary.get("base_head"),
+ "preexisting_noop": proposal_summary.get("edit_spec_mode") == "preexisting_noop",
+ },
+ goto="worktree_apply",
+ )
+
+ def worktree_apply(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ repo_root = repo_root_for_scenario(case)
+ ok, changed_files, command_refs, artifact_refs, failure_class = run_worktree_preview(
+ case,
+ log_root=log_root,
+ repo_root=repo_root,
+ )
+ history = record_event(
+ state,
+ node="worktree_apply",
+ status="pass" if ok else "fail",
+ note="Executed the isolated worktree preview for the mutation scenario.",
+ extra={"failure_class": failure_class, "changed_files": changed_files},
+ )
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [*state.get("artifact_refs", []), *artifact_refs]
+ if not ok:
+ if failure_class == "post_change_validation_failure" and int(state.get("repair_attempts", 0)) < 1:
+ return Command(
+ update={
+ "current_node": "worktree_apply",
+ "next_node": "autonomous_repair_loop",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "changed_files": changed_files,
+ "failure_class": failure_class,
+ },
+ goto="autonomous_repair_loop",
+ )
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": True,
+ "unauthorized_scope_expansion": failure_class == "unauthorized_scope_expansion",
+ "post_change_validation_failure": failure_class == "post_change_validation_failure",
+ },
+ observed={
+ "highlights": [f"Changed files observed in worktree preview: `{json.dumps(changed_files, ensure_ascii=True)}`."],
+ "failures": [failure_class or "worktree preview failed"],
+ "changed_files": changed_files,
+ },
+ failure_class=failure_class,
+ reviewer_notes="The W6 mutation scenario did not satisfy the isolated worktree preview contract.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Inspect the worktree preview artifacts before retrying the scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "worktree_apply",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "changed_files": changed_files,
+ "failure_class": failure_class,
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+ return Command(
+ update={
+ "current_node": "worktree_apply",
+ "next_node": "acceptance_validate",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "changed_files": changed_files,
+ "preview_ready": True,
+ "repair_succeeded": bool(state.get("repair_attempts", 0) > 0),
+ },
+ goto="acceptance_validate",
+ )
+
+ def autonomous_repair_loop(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ repair_attempts = int(state.get("repair_attempts", 0)) + 1
+ history = record_event(
+ state,
+ node="autonomous_repair_loop",
+ status="pass",
+ note="Triggered one bounded autonomous repair attempt after post-change validation failure.",
+ extra={"repair_attempt": repair_attempts},
+ )
+ try:
+ proposal_summary, command_refs, failures, _repo_root = prepare_mutation_proposal(case, log_root=log_root)
+ except Exception as exc:
+ command_refs_all = list(state.get("command_refs", []))
+ artifact_refs_all = [*state.get("artifact_refs", []), *w5_report_artifact_refs(log_root, case["case_id"])]
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": False,
+ "repair_attempted": True,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": True,
+ },
+ observed={
+ "highlights": ["Autonomous repair attempted to refresh the bounded proposal after worktree validation failed."],
+ "failures": [f"{type(exc).__name__}: {exc}"],
+ },
+ failure_class="proposal_invalid",
+ reviewer_notes="The W6 repair loop could not prepare a valid bounded retry proposal.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Inspect the repair artifacts before retrying the scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "autonomous_repair_loop",
+ "next_node": "finalize_report",
+ "history": history,
+ "repair_attempts": repair_attempts,
+ "failure_class": "proposal_invalid",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [
+ *state.get("artifact_refs", []),
+ *proposal_artifact_refs(scenario_root(log_root, case["case_id"])),
+ *w5_report_artifact_refs(log_root, case["case_id"]),
+ ]
+ if not proposal_summary.get("proposal_valid"):
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": False,
+ "repair_attempted": True,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": True,
+ },
+ observed={
+ "highlights": ["Autonomous repair attempted one bounded retry after worktree validation failed."],
+ "failures": proposal_summary.get("proposal_failure_reasons") or failures or ["repair proposal marked invalid"],
+ },
+ failure_class="proposal_invalid",
+ reviewer_notes="The W6 repair loop produced a proposal that still failed the bounded proposal contract.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Inspect the repair attempt artifacts before retrying the scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "autonomous_repair_loop",
+ "next_node": "finalize_report",
+ "history": history,
+ "repair_attempts": repair_attempts,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "proposal_valid": False,
+ "failure_class": "proposal_invalid",
+ "terminal_status": "fail",
+ "base_head": proposal_summary.get("base_head"),
+ },
+ goto="finalize_report",
+ )
+ return Command(
+ update={
+ "current_node": "autonomous_repair_loop",
+ "next_node": "worktree_apply",
+ "history": history,
+ "repair_attempts": repair_attempts,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "proposal_valid": True,
+ "base_head": proposal_summary.get("base_head"),
+ "preexisting_noop": proposal_summary.get("edit_spec_mode") == "preexisting_noop",
+ },
+ goto="worktree_apply",
+ )
+
+ def acceptance_validate(state: W5State) -> Command[str]:
+ history = record_event(
+ state,
+ node="acceptance_validate",
+ status="pass",
+ note="The isolated worktree acceptance checks passed and a landing diff is ready for review.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "acceptance-validate",
+ {
+ "checked_at": utc_now(),
+ "preview_ready": True,
+ "changed_files": state.get("changed_files", []),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "acceptance_validate",
+ "next_node": "await_landing",
+ "history": history,
+ },
+ goto="await_landing",
+ )
+
+ def await_landing(state: W5State) -> Command[str]:
+ return milestone_gate(state, milestone_id="landing", next_node="land_or_rollback", node_name="await_landing")
+
+ def land_or_rollback(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ repo_root = repo_root_for_scenario(case)
+ ok, command_refs, artifact_refs, failure_class = land_validated_diff(
+ case,
+ log_root=log_root,
+ repo_root=repo_root,
+ base_head=state.get("base_head"),
+ )
+ history = record_event(
+ state,
+ node="land_or_rollback",
+ status="pass" if ok else "fail",
+ note="Landing decision executed against the validated diff and main-repo acceptance checks.",
+ extra={"failure_class": failure_class},
+ )
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [*state.get("artifact_refs", []), *artifact_refs]
+ if not ok:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": True,
+ "landing_approved": True,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": failure_class == "post_change_validation_failure",
+ },
+ observed={
+ "highlights": [f"Changed files: `{json.dumps(state.get('changed_files', []), ensure_ascii=True)}`."],
+ "failures": [failure_class or "landing failed"],
+ "changed_files": state.get("changed_files", []),
+ },
+ failure_class=failure_class,
+ reviewer_notes="The W6 mutation scenario failed during landing or post-landing validation.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Inspect the landing artifacts and repo state before retrying the scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "land_or_rollback",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "failure_class": failure_class,
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+ return Command(
+ update={
+ "current_node": "land_or_rollback",
+ "next_node": "commit_checkpoint",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ },
+ goto="commit_checkpoint",
+ )
+
+ def commit_checkpoint_node(state: W5State) -> Command[str]:
+ case = load_case_spec(log_root, state["case_id"])
+ repo_root = repo_root_for_scenario(case)
+ case_root = scenario_root(log_root, case["case_id"])
+ commit_ref, command_refs, artifact_refs, commit_failure = commit_checkpoint(case, repo_root=repo_root, case_root=case_root)
+ history = record_event(
+ state,
+ node="commit_checkpoint",
+ status="pass" if commit_failure is None else "fail",
+ note="Recorded the local mutation checkpoint for the landed scenario.",
+ extra={"local_commit_ref": commit_ref, "failure_class": commit_failure},
+ )
+ command_refs_all = [*state.get("command_refs", []), *command_refs]
+ artifact_refs_all = [*state.get("artifact_refs", []), *artifact_refs]
+ if commit_failure is not None:
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="fail",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": True,
+ "landing_approved": True,
+ "checkpoint_committed": False,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": [f"Landed changed files: `{json.dumps(state.get('changed_files', []), ensure_ascii=True)}`."],
+ "failures": [commit_failure],
+ "changed_files": state.get("changed_files", []),
+ },
+ failure_class="checkpoint_commit_failure",
+ reviewer_notes="The W6 mutation scenario landed but could not record the required local commit checkpoint.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Repair the git commit checkpoint and restore a clean tracked state before retrying broader W6 work.",
+ )
+ return Command(
+ update={
+ "current_node": "commit_checkpoint",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "failure_class": "checkpoint_commit_failure",
+ "terminal_status": "fail",
+ },
+ goto="finalize_report",
+ )
+
+ finalize_case_with_summary(
+ case=case,
+ log_root=log_root,
+ mirror_root=mirror_root,
+ backend=f"langgraph:{case['execution_mode']}",
+ command_refs=command_refs_all,
+ artifact_refs=artifact_refs_all,
+ status="pass",
+ score_breakdown={
+ "plan_freeze_approved": True,
+ "proposal_valid": True,
+ "landing_approved": True,
+ "checkpoint_committed": True,
+ "unauthorized_scope_expansion": False,
+ "post_change_validation_failure": False,
+ },
+ observed={
+ "highlights": [
+ f"Changed files: `{json.dumps(state.get('changed_files', []), ensure_ascii=True)}`.",
+ f"Local commit ref: `{commit_ref}`.",
+ f"Repair attempts: `{state.get('repair_attempts', 0)}`.",
+ ],
+ "failures": ["None."],
+ "changed_files": state.get("changed_files", []),
+ "local_commit_ref": commit_ref,
+ },
+ failure_class=None,
+ reviewer_notes="The W6 mutation scenario stayed inside approved scope, passed worktree and landing validation, and recorded the required local commit checkpoint.",
+ boundary_notes=TRIALS.w4_boundary_note(),
+ next_action="Review the packet and decide whether to approve the next W6 scenario.",
+ )
+ return Command(
+ update={
+ "current_node": "commit_checkpoint",
+ "next_node": "finalize_report",
+ "history": history,
+ "command_refs": command_refs_all,
+ "artifact_refs": artifact_refs_all,
+ "local_commit_ref": commit_ref,
+ "local_commit_message": COMMIT_MESSAGES.get(case["case_id"]),
+ "terminal_status": "pass",
+ },
+ goto="finalize_report",
+ )
+
+ def finalize_report(state: W5State) -> Command[str]:
+ refresh_w6_outputs(log_root, mirror_root)
+ result = load_result_summary(log_root, state["case_id"])
+ terminal_status = state.get("terminal_status")
+ if result:
+ terminal_status = str(result.get("status") or terminal_status or "fail")
+ history = record_event(
+ state,
+ node="finalize_report",
+ status=terminal_status or "unknown",
+ note="W6 index and mirror summary were refreshed.",
+ )
+ node_json(
+ log_root,
+ state["case_id"],
+ "finalize-report",
+ {
+ "finalized_at": utc_now(),
+ "terminal_status": terminal_status,
+ "wave_index": str(log_root / f"{INDEX_NAME}.json"),
+ "summary_memo": str(mirror_root / SUMMARY_MEMO_NAME),
+ },
+ )
+ return Command(
+ update={
+ "current_node": "finalize_report",
+ "next_node": None,
+ "history": history,
+ "terminal_status": terminal_status,
+ },
+ goto=END,
+ )
+
+ graph = StateGraph(W5State)
+ graph.add_node("route_from_phase", route_from_phase)
+ graph.add_node("preflight", preflight)
+ graph.add_node("load_scenario", load_scenario)
+ graph.add_node("collect_evidence", collect_evidence)
+ graph.add_node("draft_plan", draft_plan)
+ graph.add_node("await_plan_freeze", await_plan_freeze)
+ graph.add_node("execute_read_only_actions", execute_read_only_actions)
+ graph.add_node("draft_summary", draft_summary)
+ graph.add_node("build_proposal", build_proposal)
+ graph.add_node("worktree_apply", worktree_apply)
+ graph.add_node("autonomous_repair_loop", autonomous_repair_loop)
+ graph.add_node("acceptance_validate", acceptance_validate)
+ graph.add_node("await_landing", await_landing)
+ graph.add_node("land_or_rollback", land_or_rollback)
+ graph.add_node("commit_checkpoint", commit_checkpoint_node)
+ graph.add_node("finalize_report", finalize_report)
+ graph.add_edge(START, "route_from_phase")
+ return graph.compile()
+
+
+def run_graph_scenario(log_root: Path, mirror_root: Path, *, case_id: str, until: str, resume: bool) -> W5State:
+ graph = build_graph(log_root, mirror_root)
+ existing = load_graph_state(log_root, case_id) or {}
+ state: W5State = {
+ **existing,
+ "case_id": case_id,
+ "until": until,
+ "paused": False,
+ "pause_reason": None,
+ "pause_milestone": None,
+ "current_node": existing.get("current_node"),
+ "next_node": existing.get("next_node") or ("await_plan_freeze" if resume else "preflight"),
+ "resume_count": int(existing.get("resume_count", 0)) + (1 if resume else 0),
+ "history": list(existing.get("history", [])),
+ "command_refs": list(existing.get("command_refs", [])),
+ "artifact_refs": list(existing.get("artifact_refs", [])),
+ "changed_files": list(existing.get("changed_files", [])),
+ "forced_pause_seen": list(existing.get("forced_pause_seen", [])),
+ "repair_attempts": int(existing.get("repair_attempts", 0)),
+ "repair_succeeded": bool(existing.get("repair_succeeded", False)),
+ "preexisting_noop": bool(existing.get("preexisting_noop", False)),
+ }
+ final_state = graph.invoke(state)
+ save_graph_state(log_root, case_id, final_state)
+ refresh_w6_outputs(log_root, mirror_root)
+ return final_state
+
+
+def print_case_status(log_root: Path, case_id: str) -> None:
+ payload = {
+ "case_id": case_id,
+ "graph_state": load_graph_state(log_root, case_id),
+ "approval": approval_payload(log_root, case_id),
+ "result_summary": load_result_summary(log_root, case_id),
+ }
+ print(json.dumps(payload, indent=2, ensure_ascii=True))
+
+
+def print_all_status(log_root: Path, mirror_root: Path) -> None:
+ refresh_w6_outputs(log_root, mirror_root)
+ print(json.dumps(load_json(log_root / f"{INDEX_NAME}.json"), indent=2, ensure_ascii=True))
+
+
+def build_parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(description="Run the W6 bounded autonomy pilot on top of LangGraph + llama.cpp.")
+ parser.add_argument("--url", default=DEFAULT_LANGCHAIN_RUN_URL)
+ parser.add_argument("--program-id", default=DEFAULT_PROGRAM_ID)
+ parser.add_argument("--log-root", default=None)
+ parser.add_argument("--mirror-root", default=None)
+ sub = parser.add_subparsers(dest="command", required=True)
+
+ sub.add_parser("materialize", help="Materialize the W6 bounded autonomy pilot.")
+
+ run_scenario = sub.add_parser("run-scenario", help="Run one W6 scenario.")
+ run_scenario.add_argument("scenario_id")
+ run_scenario.add_argument("--until", choices=["milestone", "done"], default="done")
+
+ resume_scenario = sub.add_parser("resume-scenario", help="Resume a paused W6 scenario from graph.state.json.")
+ resume_scenario.add_argument("scenario_id")
+
+ status = sub.add_parser("status", help="Print the current W6 status.")
+ status.add_argument("scenario_id", nargs="?")
+ status.add_argument("--all", action="store_true")
+ return parser
+
+
+def main() -> int:
+ parser = build_parser()
+ args = parser.parse_args()
+
+ configure_program_runtime(program_id=args.program_id, run_url=args.url)
+ log_root = Path(args.log_root) if args.log_root else default_log_root_for(PROGRAM_ID)
+ mirror_root = Path(args.mirror_root) if args.mirror_root else default_mirror_root_for(PROGRAM_ID)
+ valid_case_ids = {case["case_id"] for case in available_cases()}
+
+ if args.command == "materialize":
+ materialize(log_root, mirror_root)
+ print(f"materialized {PROGRAM_ID} at {log_root}")
+ return 0
+
+ if args.command == "run-scenario":
+ if args.scenario_id not in valid_case_ids:
+ parser.error(f"unknown scenario_id for {PROGRAM_ID}: {args.scenario_id}")
+ return 2
+ materialize(log_root, mirror_root)
+ final_state = run_graph_scenario(log_root, mirror_root, case_id=args.scenario_id, until=args.until, resume=False)
+ print(json.dumps({"scenario_id": args.scenario_id, "terminal_status": final_state.get("terminal_status"), "paused": final_state.get("paused", False)}, ensure_ascii=True))
+ return 0
+
+ if args.command == "resume-scenario":
+ if args.scenario_id not in valid_case_ids:
+ parser.error(f"unknown scenario_id for {PROGRAM_ID}: {args.scenario_id}")
+ return 2
+ materialize(log_root, mirror_root)
+ final_state = run_graph_scenario(log_root, mirror_root, case_id=args.scenario_id, until="done", resume=True)
+ print(json.dumps({"scenario_id": args.scenario_id, "terminal_status": final_state.get("terminal_status"), "paused": final_state.get("paused", False)}, ensure_ascii=True))
+ return 0
+
+ if args.command == "status":
+ materialize(log_root, mirror_root)
+ if args.all:
+ print_all_status(log_root, mirror_root)
+ return 0
+ if not args.scenario_id:
+ parser.error("status requires either or --all")
+ return 2
+ if args.scenario_id not in valid_case_ids:
+ parser.error(f"unknown scenario_id for {PROGRAM_ID}: {args.scenario_id}")
+ return 2
+ print_case_status(log_root, args.scenario_id)
+ return 0
+
+ parser.error(f"unknown command: {args.command}")
+ return 2
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/scripts/requirements-langgraph-pilot.txt b/scripts/requirements-langgraph-pilot.txt
new file mode 100644
index 0000000..9fb3013
--- /dev/null
+++ b/scripts/requirements-langgraph-pilot.txt
@@ -0,0 +1 @@
+langgraph>=1,<2
diff --git a/scripts/validate_stack.py b/scripts/validate_stack.py
index 6651900..b40a36e 100644
--- a/scripts/validate_stack.py
+++ b/scripts/validate_stack.py
@@ -29,6 +29,10 @@
"aoa-machine-fit",
"aoa-platform-adaptation",
"aoa-local-ai-trials",
+ "aoa-langgraph-pilot",
+ "aoa-w5-pilot",
+ "aoa-w6-pilot",
+ "aoa-llamacpp-pilot",
"aoa-qwen-check",
"aoa-qwen-run",
"aoa-qwen-bench",
@@ -74,6 +78,10 @@
ROOT / "docs" / "RENDER_TRUTH.md",
ROOT / "docs" / "RUNTIME_BENCH_POLICY.md",
ROOT / "docs" / "LOCAL_AI_TRIALS.md",
+ ROOT / "docs" / "LANGGRAPH_PILOT.md",
+ ROOT / "docs" / "LLAMACPP_PILOT.md",
+ ROOT / "docs" / "W5_PILOT.md",
+ ROOT / "docs" / "W6_PILOT.md",
ROOT / "docs" / "PLATFORM_ADAPTATION_POLICY.md",
ROOT / "docs" / "BRANCH_POLICY.md",
ROOT / "docs" / "MEMO_RUNTIME_SEAM.md",
@@ -94,6 +102,7 @@
ROOT / "docs" / "machine-fit" / "README.md",
ROOT / "docs" / "machine-fit" / "schema.v1.json",
ROOT / "docs" / "machine-fit" / "machine-fit.public.json.example",
+ ROOT / "scripts" / "requirements-langgraph-pilot.txt",
ROOT / "docs" / "platform-adaptations" / "README.md",
ROOT / "docs" / "platform-adaptations" / "schema.v1.json",
ROOT / "docs" / "platform-adaptations" / "platform-adaptation.public.json.example",
@@ -107,7 +116,9 @@
ROOT / "compose" / "profiles" / "federation.txt",
ROOT / "compose" / "tuning" / "README.md",
ROOT / "compose" / "tuning" / "ollama.cpu.yml",
+ ROOT / "compose" / "modules" / "32-llamacpp-inference.yml",
ROOT / "compose" / "modules" / "43-federation-router.yml",
+ ROOT / "compose" / "modules" / "44-llamacpp-agent-sidecar.yml",
ROOT / "config-templates" / "README.md",
ROOT / "config-templates" / "Configs" / "agent-api" / "return-policy.yaml",
ROOT / "config-templates" / "Configs" / "federation" / "aoa-agents.yaml",
@@ -264,6 +275,10 @@ def validate_paths(errors: list[str]) -> None:
for required_snippet in (
"prepare-wave W4 --lane docs",
"apply-case W4 ",
+ "scripts/aoa-w5-pilot materialize",
+ "run-scenario --until milestone",
+ "resume-scenario ",
+ "implementation_patch",
"proposal.edit-spec.json",
"exact_replace",
"anchored_replace",
@@ -277,6 +292,22 @@ def validate_paths(errors: list[str]) -> None:
f"docs/LOCAL_AI_TRIALS.md must mention `{required_snippet}`"
)
+ w5_doc = (ROOT / "docs" / "W5_PILOT.md").read_text(encoding="utf-8")
+ for required_snippet in (
+ "http://127.0.0.1:5403/run",
+ "scripts/aoa-w5-pilot materialize",
+ "run-scenario --until milestone|done",
+ "resume-scenario ",
+ "status --all",
+ "plan_freeze",
+ "first_mutation",
+ "landing",
+ "stack-sync-federation-check-mode",
+ "implementation_patch",
+ ):
+ if required_snippet not in w5_doc:
+ errors.append(f"docs/W5_PILOT.md must mention `{required_snippet}`")
+
paths_doc = (ROOT / "docs" / "PATHS.md").read_text(encoding="utf-8")
if "/srv/abyss-stack" not in paths_doc:
errors.append("docs/PATHS.md must mention /srv/abyss-stack")