Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/validate-stack.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: python scripts/validate_stack.py

- name: Python syntax check
run: python -m py_compile scripts/validate_stack.py scripts/aoa-host-facts scripts/aoa-local-ai-trials scripts/aoa-langgraph-pilot scripts/aoa-w5-pilot scripts/aoa-w6-pilot scripts/aoa-llamacpp-pilot
run: python -m py_compile scripts/validate_stack.py scripts/aoa-host-facts scripts/aoa-local-ai-trials scripts/aoa-langgraph-pilot scripts/aoa-w5-pilot scripts/aoa-w6-pilot scripts/aoa-llamacpp-pilot scripts/aoa-runtime-bench-index

- name: Shellcheck scripts
run: |
Expand Down
21 changes: 21 additions & 0 deletions docs/RUNTIME_BENCH_POLICY.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ Optional heavy-data root:
Recommended active tree:
```text
${AOA_STACK_ROOT}/Logs/runtime-benchmarks/
catalog.json
latest/
index.json
runs/
index.json
2026-03-24T154200Z__latency-single-turn__workhorse-local-q4/
benchmark.manifest.json
summary.json
Expand All @@ -90,6 +94,7 @@ Rules:
- move bulky raw captures to the optional vault when mounted
- never assume `${AOA_VAULT_ROOT}` exists just because the architecture names it
- never commit secret-bearing rendered config or live env material
- keep one generated catalog and one `latest/` pointer layer so repeated runs stay comparable without hand-scanning timestamp directories

## Minimum run outputs
A strong runtime benchmark run should produce:
Expand Down Expand Up @@ -118,6 +123,17 @@ scripts/aoa-qwen-bench --preset intel-full
This runner stays on the intended `langchain-api /run` path and writes machine-local evidence under `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/runs/`.
It performs one uncounted warmup call per case before measured repeats so warm-latency reads stay warm by definition instead of by accident.

Refresh the durable catalog after new runs:

```bash
scripts/aoa-runtime-bench-index
```

That helper writes:
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/catalog.json`
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/latest/index.json`
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/runs/index.json`

## Relationship to local trial programs

If you need a supervised per-case trial program rather than a standalone benchmark run, use:
Expand All @@ -144,6 +160,11 @@ scripts/aoa-llamacpp-pilot run --preset intel-full
That pilot runs a fresh Ollama baseline on `5401`, a fresh `llama.cpp` sidecar bench on `5403`, and writes a comparison packet under `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/comparisons/`.
It is a runtime-parity aid, not a promotion decision by itself.

Use the catalog layer to answer:
- what the latest baseline run was for a target label
- which comparison packet currently represents a pilot family
- which promotion packet currently represents the active substrate verdict

## Comparison hygiene
Before treating two runs as comparable, keep stable:
- host hardware class or disclose the delta
Expand Down
255 changes: 255 additions & 0 deletions scripts/aoa-runtime-bench-index
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# Root of the machine-local stack tree; overridable for tests/alternate hosts.
STACK_ROOT = Path(os.environ.get("AOA_STACK_ROOT", "/srv/abyss-stack"))
# Default location of the runtime-benchmark evidence tree the catalog indexes.
DEFAULT_WRITE_ROOT = STACK_ROOT / "Logs" / "runtime-benchmarks"


def utc_now() -> str:
    """Return the current UTC time as a second-resolution ISO-8601 string ending in 'Z'."""
    stamp = datetime.now(timezone.utc).replace(microsecond=0)
    return stamp.isoformat().replace("+00:00", "Z")


def ensure_parent(path: Path) -> None:
    """Create the parent directory of *path*, including ancestors, if it is missing."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)


def write_json(path: Path, payload: dict[str, Any]) -> None:
    """Serialize *payload* as pretty-printed ASCII JSON (trailing newline) to *path*.

    Parent directories are created on demand so callers can write into fresh trees.
    """
    # Inlined parent-directory creation (same effect as the ensure_parent helper).
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, ensure_ascii=True) + "\n"
    path.write_text(rendered, encoding="utf-8")


def load_json(path: Path) -> dict[str, Any]:
    """Parse *path* as JSON and return it, requiring a top-level object.

    Raises ValueError when the document's top-level value is not a JSON object.
    """
    loaded = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(loaded, dict):
        return loaded
    raise ValueError(f"expected object at {path}")


def parse_run_dir_name(name: str) -> tuple[str | None, str | None, str | None]:
    """Split a run directory name '<timestamp>__<family>__<target>' into its tokens.

    Returns (None, None, None) when the name does not contain at least two
    '__' separators; the third token keeps any further '__' sequences intact.
    """
    pieces = name.split("__", 2)
    if len(pieces) == 3:
        timestamp_token, benchmark_family, target_label = pieces
        return timestamp_token, benchmark_family, target_label
    return None, None, None


def summarize_case_means(case_breakdown: dict[str, Any] | None) -> dict[str, float | None]:
    """Extract each case's numeric mean_s, keyed by case id in sorted order.

    Non-dict inputs yield an empty mapping; non-dict case payloads are skipped;
    a missing or non-numeric mean_s is recorded as None.
    """
    if not isinstance(case_breakdown, dict):
        return {}
    means: dict[str, float | None] = {}
    # Sorting keys gives the same order as sorting items, since dict keys are unique.
    for case_id in sorted(case_breakdown):
        payload = case_breakdown[case_id]
        if not isinstance(payload, dict):
            continue
        mean = payload.get("mean_s")
        means[str(case_id)] = mean if isinstance(mean, (int, float)) else None
    return means


def build_run_entry(summary_path: Path) -> dict[str, Any]:
    """Condense one run's summary.json into a flat catalog entry.

    The run directory name supplies the timestamp/family/target tokens; the
    summary document supplies the measured aggregates. Key order is preserved
    because it flows straight into the rendered index JSON.
    """
    run_root = summary_path.parent
    summary = load_json(summary_path)
    stamp_token, family_token, target_token = parse_run_dir_name(run_root.name)
    entry: dict[str, Any] = {
        "run_ref": str(run_root),
        "summary_ref": str(summary_path),
        "run_dir": run_root.name,
        "timestamp_token": stamp_token,
        "captured_at": summary.get("captured_at"),
        "benchmark_id": summary.get("benchmark_id"),
        "benchmark_family": family_token,
        "target_label": target_token,
        "all_passed": bool(summary.get("all_passed")),
        "runtime_selection": summary.get("runtime_selection"),
        "overall_mean_s": summary.get("overall_mean_s"),
        "overall_best_s": summary.get("overall_best_s"),
        "overall_worst_s": summary.get("overall_worst_s"),
        "case_means_s": summarize_case_means(summary.get("case_breakdown")),
    }
    return entry


def latest_entries_by_key(entries: list[dict[str, Any]], key: str) -> dict[str, dict[str, Any]]:
    """Keep, per distinct value of *key*, the entry with the greatest timestamp.

    Timestamps are compared lexically using captured_at, falling back to
    timestamp_token, then "". Ties favor the later entry in input order.
    Entries whose *key* value is missing, empty, or not a string are skipped.
    The result is returned sorted by key value.
    """

    def stamp_of(entry: dict[str, Any]) -> str:
        # Lexical comparison works for ISO-8601-style tokens of a shared format.
        return str(entry.get("captured_at") or entry.get("timestamp_token") or "")

    winners: dict[str, dict[str, Any]] = {}
    for entry in entries:
        label = entry.get(key)
        if not (isinstance(label, str) and label):
            continue
        current = winners.get(label)
        if current is None or stamp_of(entry) >= stamp_of(current):
            winners[label] = entry
    return dict(sorted(winners.items()))


def load_latest_pointer(pointer_path: Path, kind: str) -> dict[str, Any] | None:
    """Read a latest.json pointer and enrich it with fields from the packet it names.

    Returns None when the pointer file is absent or lacks a usable string
    latest_run_root. For kind "comparison" or "promotion", selected fields are
    copied out of the referenced packet when that file still exists on disk.
    """
    # Missing pointer file means this group has no published "latest" yet.
    if not pointer_path.exists():
        return None
    payload = load_json(pointer_path)
    latest_run_root = payload.get("latest_run_root")
    # A pointer without a string latest_run_root is treated as unusable, not an error.
    if not isinstance(latest_run_root, str):
        return None
    result: dict[str, Any] = {
        "kind": kind,
        "pointer_ref": str(pointer_path),
        "captured_at": payload.get("captured_at"),
        "latest_run_root": latest_run_root,
    }
    if kind == "comparison":
        result["comparison_ref"] = payload.get("comparison_ref")
        result["report_ref"] = payload.get("report_ref")
        comparison_ref = payload.get("comparison_ref")
        # Enrich from the comparison packet only if it is still present on disk.
        if isinstance(comparison_ref, str) and Path(comparison_ref).exists():
            comparison = load_json(Path(comparison_ref))
            result["pilot_id"] = comparison.get("pilot_id")
            result["preset"] = comparison.get("preset")
            result["baseline_backend"] = comparison.get("baseline_backend")
            result["candidate_backend"] = comparison.get("candidate_backend")
            result["overall_delta_s"] = comparison.get("overall_delta_s")
            result["recommendation"] = comparison.get("recommendation")
    if kind == "promotion":
        result["promotion_ref"] = payload.get("promotion_ref")
        result["report_ref"] = payload.get("report_ref")
        promotion_ref = payload.get("promotion_ref")
        # Same pattern for promotion packets; gate results live in a nested block.
        if isinstance(promotion_ref, str) and Path(promotion_ref).exists():
            promotion = load_json(Path(promotion_ref))
            result["promotion_id"] = promotion.get("promotion_id")
            result["winner_quant"] = promotion.get("winner_quant")
            result["winner_model_host_path"] = promotion.get("winner_model_host_path")
            promotion_block = promotion.get("promotion")
            if isinstance(promotion_block, dict):
                result["w0_gate_result"] = promotion_block.get("w0_gate_result")
                result["w4_gate_result"] = promotion_block.get("w4_gate_result")
                result["recommendation"] = promotion_block.get("recommendation")
    return result


def collect_latest_group(root: Path, kind: str) -> dict[str, dict[str, Any]]:
    """Load every <root>/<name>/latest.json pointer, keyed by subdirectory name.

    A missing *root* yields an empty mapping; subdirectories without a usable
    pointer are skipped. Results follow sorted subdirectory order.
    """
    found: dict[str, dict[str, Any]] = {}
    if not root.exists():
        return found
    subdirs = sorted(child for child in root.iterdir() if child.is_dir())
    for subdir in subdirs:
        entry = load_latest_pointer(subdir / "latest.json", kind)
        if entry is not None:
            found[subdir.name] = entry
    return found


def build_catalog(write_root: Path) -> dict[str, Any]:
    """Assemble the full catalog document for one runtime-benchmark tree.

    Scans runs/*/summary.json into flat entries, derives "latest" views per
    target label, benchmark id, and family, and folds in comparison/promotion
    latest pointers. Key order here is the key order of the rendered JSON.
    """
    runs_root = write_root / "runs"
    comparisons_root = write_root / "comparisons"
    promotions_root = write_root / "promotions"

    # One entry per run directory that produced a summary.json.
    run_entries = [
        build_run_entry(path)
        for path in sorted(runs_root.glob("*/summary.json"))
    ]

    latest_by_target_label = latest_entries_by_key(run_entries, "target_label")
    latest_by_benchmark_id = latest_entries_by_key(run_entries, "benchmark_id")
    latest_by_family = latest_entries_by_key(run_entries, "benchmark_family")

    comparisons = collect_latest_group(comparisons_root, "comparison")
    promotions = collect_latest_group(promotions_root, "promotion")

    return {
        "catalog_id": "runtime-benchmarks-catalog-v1",
        "generated_at": utc_now(),
        "write_root": str(write_root),
        "retention_posture": "keep raw runs as evidence, use latest pointers and this catalog for durable navigation",
        "runs": {
            "count": len(run_entries),
            "index_ref": str(write_root / "runs" / "index.json"),
            # Per-view projections keep only the fields a reader needs for navigation.
            "latest_by_target_label": {
                key: {
                    "run_ref": value["run_ref"],
                    "captured_at": value["captured_at"],
                    "overall_mean_s": value["overall_mean_s"],
                    "case_means_s": value["case_means_s"],
                }
                for key, value in latest_by_target_label.items()
            },
            "latest_by_benchmark_id": {
                key: {
                    "run_ref": value["run_ref"],
                    "captured_at": value["captured_at"],
                    "overall_mean_s": value["overall_mean_s"],
                }
                for key, value in latest_by_benchmark_id.items()
            },
            "latest_by_family": {
                key: {
                    "run_ref": value["run_ref"],
                    "target_label": value["target_label"],
                    "captured_at": value["captured_at"],
                }
                for key, value in latest_by_family.items()
            },
        },
        "comparisons": comparisons,
        "promotions": promotions,
    }


def main() -> int:
    """CLI entry point: rebuild catalog.json, latest/index.json, and runs/index.json.

    Prints a one-line JSON status summary on success and returns 0.
    """
    parser = argparse.ArgumentParser(description="Build a durable runtime benchmark catalog")
    parser.add_argument(
        "--write-root",
        default=str(DEFAULT_WRITE_ROOT),
        help="runtime benchmark root (default: %(default)s)",
    )
    options = parser.parse_args()

    write_root = Path(options.write_root).expanduser().resolve()
    catalog = build_catalog(write_root)

    # NOTE(review): this re-reads every runs/*/summary.json that build_catalog
    # already scanned; harmless today, but the two passes could disagree if a
    # run lands mid-invocation — consider having build_catalog expose entries.
    runs_root = write_root / "runs"
    run_entries = [
        build_run_entry(path)
        for path in sorted(runs_root.glob("*/summary.json"))
    ]

    latest_root = write_root / "latest"
    latest_index = {
        "generated_at": catalog["generated_at"],
        "catalog_ref": str(write_root / "catalog.json"),
        "comparison_refs": {
            key: value.get("pointer_ref")
            for key, value in catalog["comparisons"].items()
        },
        "promotion_refs": {
            key: value.get("pointer_ref")
            for key, value in catalog["promotions"].items()
        },
        "latest_runs_by_target_label": catalog["runs"]["latest_by_target_label"],
    }
    run_index = {
        "generated_at": catalog["generated_at"],
        "count": len(run_entries),
        "entries": run_entries,
    }

    write_json(write_root / "catalog.json", catalog)
    write_json(latest_root / "index.json", latest_index)
    write_json(runs_root / "index.json", run_index)

    status = {
        "ok": True,
        "catalog_ref": str(write_root / "catalog.json"),
        "latest_ref": str(latest_root / "index.json"),
        "run_index_ref": str(runs_root / "index.json"),
        "run_count": len(run_entries),
        "comparison_count": len(catalog["comparisons"]),
        "promotion_count": len(catalog["promotions"]),
    }
    print(json.dumps(status, ensure_ascii=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
1 change: 1 addition & 0 deletions scripts/validate_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"aoa-w5-pilot",
"aoa-w6-pilot",
"aoa-llamacpp-pilot",
"aoa-runtime-bench-index",
"aoa-qwen-check",
"aoa-qwen-run",
"aoa-qwen-bench",
Expand Down
Loading