Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/validate-stack.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
run: python scripts/validate_stack.py

- name: Python syntax check
run: python -m py_compile scripts/validate_stack.py scripts/aoa-host-facts scripts/aoa-local-ai-trials scripts/aoa-langgraph-pilot scripts/aoa-w5-pilot scripts/aoa-w6-pilot scripts/aoa-llamacpp-pilot
run: python -m py_compile scripts/validate_stack.py scripts/aoa-host-facts scripts/aoa-local-ai-trials scripts/aoa-langgraph-pilot scripts/aoa-w5-pilot scripts/aoa-w6-pilot scripts/aoa-llamacpp-pilot scripts/aoa-runtime-bench-index

- name: Shellcheck scripts
run: |
Expand Down
21 changes: 21 additions & 0 deletions docs/RUNTIME_BENCH_POLICY.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ Optional heavy-data root:
Recommended active tree:
```text
${AOA_STACK_ROOT}/Logs/runtime-benchmarks/
catalog.json
latest/
index.json
runs/
index.json
2026-03-24T154200Z__latency-single-turn__workhorse-local-q4/
benchmark.manifest.json
summary.json
Expand All @@ -90,6 +94,7 @@ Rules:
- move bulky raw captures to the optional vault when mounted
- never assume `${AOA_VAULT_ROOT}` exists just because the architecture names it
- never commit secret-bearing rendered config or live env material
- keep one generated catalog and one `latest/` pointer layer so repeated runs stay comparable without hand-scanning timestamp directories

## Minimum run outputs
A strong runtime benchmark run should produce:
Expand Down Expand Up @@ -118,6 +123,17 @@ scripts/aoa-qwen-bench --preset intel-full
This runner stays on the intended `langchain-api /run` path and writes machine-local evidence under `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/runs/`.
It performs one uncounted warmup call per case before measured repeats so warm-latency reads stay warm by definition instead of by accident.

Refresh the durable catalog after new runs:

```bash
scripts/aoa-runtime-bench-index
```

That helper writes:
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/catalog.json`
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/latest/index.json`
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/runs/index.json`

## Relationship to local trial programs

If you need a supervised per-case trial program rather than a standalone benchmark run, use:
Expand All @@ -144,6 +160,11 @@ scripts/aoa-llamacpp-pilot run --preset intel-full
That pilot runs a fresh Ollama baseline on `5401`, a fresh `llama.cpp` sidecar bench on `5403`, and writes a comparison packet under `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/comparisons/`.
It is a runtime-parity aid, not a promotion decision by itself.

Use the catalog layer to answer:
- what the latest baseline run was for a target label
- which comparison packet currently represents a pilot family
- which promotion packet currently represents the active substrate verdict

## Comparison hygiene
Before treating two runs as comparable, keep stable:
- host hardware class or disclose the delta
Expand Down
255 changes: 255 additions & 0 deletions scripts/aoa-runtime-bench-index
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# Root of the machine-local stack tree; overridable for tests/alternate hosts.
STACK_ROOT = Path(os.environ.get("AOA_STACK_ROOT", "/srv/abyss-stack"))
# Default location of the runtime-benchmark evidence tree the catalog indexes.
DEFAULT_WRITE_ROOT = STACK_ROOT / "Logs" / "runtime-benchmarks"


def utc_now() -> str:
    """Return the current UTC time as a second-resolution ISO-8601 string ending in 'Z'."""
    stamp = datetime.now(timezone.utc).replace(microsecond=0)
    return stamp.isoformat().replace("+00:00", "Z")


def ensure_parent(path: Path) -> None:
    """Create the parent directory of *path*, including ancestors, if it is missing."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)


def write_json(path: Path, payload: dict[str, Any]) -> None:
    """Serialize *payload* as pretty-printed ASCII JSON (trailing newline) to *path*.

    Parent directories are created on demand so callers can write into fresh trees.
    """
    # Inlined parent-directory creation (same effect as the ensure_parent helper).
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, ensure_ascii=True) + "\n"
    path.write_text(rendered, encoding="utf-8")


def load_json(path: Path) -> dict[str, Any]:
    """Parse *path* as JSON and return it, requiring a top-level object.

    Raises ValueError when the document's top-level value is not a JSON object.
    """
    loaded = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(loaded, dict):
        return loaded
    raise ValueError(f"expected object at {path}")


def parse_run_dir_name(name: str) -> tuple[str | None, str | None, str | None]:
    """Split a run directory name '<timestamp>__<family>__<target>' into its tokens.

    Returns (None, None, None) when the name does not contain at least two
    '__' separators; the third token keeps any further '__' sequences intact.
    """
    pieces = name.split("__", 2)
    if len(pieces) == 3:
        timestamp_token, benchmark_family, target_label = pieces
        return timestamp_token, benchmark_family, target_label
    return None, None, None


def summarize_case_means(case_breakdown: dict[str, Any] | None) -> dict[str, float | None]:
    """Extract each case's numeric mean_s, keyed by case id in sorted order.

    Non-dict inputs yield an empty mapping; non-dict case payloads are skipped;
    a missing or non-numeric mean_s is recorded as None.
    """
    if not isinstance(case_breakdown, dict):
        return {}
    means: dict[str, float | None] = {}
    # Sorting keys gives the same order as sorting items, since dict keys are unique.
    for case_id in sorted(case_breakdown):
        payload = case_breakdown[case_id]
        if not isinstance(payload, dict):
            continue
        mean = payload.get("mean_s")
        means[str(case_id)] = mean if isinstance(mean, (int, float)) else None
    return means


def build_run_entry(summary_path: Path) -> dict[str, Any]:
    """Condense one run's summary.json into a flat catalog entry.

    The run directory name supplies the timestamp/family/target tokens; the
    summary document supplies the measured aggregates. Key order is preserved
    because it flows straight into the rendered index JSON.
    """
    run_root = summary_path.parent
    summary = load_json(summary_path)
    stamp_token, family_token, target_token = parse_run_dir_name(run_root.name)
    entry: dict[str, Any] = {
        "run_ref": str(run_root),
        "summary_ref": str(summary_path),
        "run_dir": run_root.name,
        "timestamp_token": stamp_token,
        "captured_at": summary.get("captured_at"),
        "benchmark_id": summary.get("benchmark_id"),
        "benchmark_family": family_token,
        "target_label": target_token,
        "all_passed": bool(summary.get("all_passed")),
        "runtime_selection": summary.get("runtime_selection"),
        "overall_mean_s": summary.get("overall_mean_s"),
        "overall_best_s": summary.get("overall_best_s"),
        "overall_worst_s": summary.get("overall_worst_s"),
        "case_means_s": summarize_case_means(summary.get("case_breakdown")),
    }
    return entry


def latest_entries_by_key(entries: list[dict[str, Any]], key: str) -> dict[str, dict[str, Any]]:
    """Keep, per distinct value of *key*, the entry with the greatest timestamp.

    Timestamps are compared lexically using captured_at, falling back to
    timestamp_token, then "". Ties favor the later entry in input order.
    Entries whose *key* value is missing, empty, or not a string are skipped.
    The result is returned sorted by key value.
    """

    def stamp_of(entry: dict[str, Any]) -> str:
        # Lexical comparison works for ISO-8601-style tokens of a shared format.
        return str(entry.get("captured_at") or entry.get("timestamp_token") or "")

    winners: dict[str, dict[str, Any]] = {}
    for entry in entries:
        label = entry.get(key)
        if not (isinstance(label, str) and label):
            continue
        current = winners.get(label)
        if current is None or stamp_of(entry) >= stamp_of(current):
            winners[label] = entry
    return dict(sorted(winners.items()))


def load_latest_pointer(pointer_path: Path, kind: str) -> dict[str, Any] | None:
    """Read a latest.json pointer and enrich it with fields from the packet it names.

    Returns None when the pointer file is absent or lacks a usable string
    latest_run_root. For kind "comparison" or "promotion", selected fields are
    copied out of the referenced packet when that file still exists on disk.
    """
    # Missing pointer file means this group has no published "latest" yet.
    if not pointer_path.exists():
        return None
    payload = load_json(pointer_path)
    latest_run_root = payload.get("latest_run_root")
    # A pointer without a string latest_run_root is treated as unusable, not an error.
    if not isinstance(latest_run_root, str):
        return None
    result: dict[str, Any] = {
        "kind": kind,
        "pointer_ref": str(pointer_path),
        "captured_at": payload.get("captured_at"),
        "latest_run_root": latest_run_root,
    }
    if kind == "comparison":
        result["comparison_ref"] = payload.get("comparison_ref")
        result["report_ref"] = payload.get("report_ref")
        comparison_ref = payload.get("comparison_ref")
        # Enrich from the comparison packet only if it is still present on disk.
        if isinstance(comparison_ref, str) and Path(comparison_ref).exists():
            comparison = load_json(Path(comparison_ref))
            result["pilot_id"] = comparison.get("pilot_id")
            result["preset"] = comparison.get("preset")
            result["baseline_backend"] = comparison.get("baseline_backend")
            result["candidate_backend"] = comparison.get("candidate_backend")
            result["overall_delta_s"] = comparison.get("overall_delta_s")
            result["recommendation"] = comparison.get("recommendation")
    if kind == "promotion":
        result["promotion_ref"] = payload.get("promotion_ref")
        result["report_ref"] = payload.get("report_ref")
        promotion_ref = payload.get("promotion_ref")
        # Same pattern for promotion packets; gate results live in a nested block.
        if isinstance(promotion_ref, str) and Path(promotion_ref).exists():
            promotion = load_json(Path(promotion_ref))
            result["promotion_id"] = promotion.get("promotion_id")
            result["winner_quant"] = promotion.get("winner_quant")
            result["winner_model_host_path"] = promotion.get("winner_model_host_path")
            promotion_block = promotion.get("promotion")
            if isinstance(promotion_block, dict):
                result["w0_gate_result"] = promotion_block.get("w0_gate_result")
                result["w4_gate_result"] = promotion_block.get("w4_gate_result")
                result["recommendation"] = promotion_block.get("recommendation")
    return result


def collect_latest_group(root: Path, kind: str) -> dict[str, dict[str, Any]]:
    """Load every <root>/<name>/latest.json pointer, keyed by subdirectory name.

    A missing *root* yields an empty mapping; subdirectories without a usable
    pointer are skipped. Results follow sorted subdirectory order.
    """
    found: dict[str, dict[str, Any]] = {}
    if not root.exists():
        return found
    subdirs = sorted(child for child in root.iterdir() if child.is_dir())
    for subdir in subdirs:
        entry = load_latest_pointer(subdir / "latest.json", kind)
        if entry is not None:
            found[subdir.name] = entry
    return found


def build_catalog(write_root: Path) -> dict[str, Any]:
    """Assemble the full catalog document for one runtime-benchmark tree.

    Scans runs/*/summary.json into flat entries, derives "latest" views per
    target label, benchmark id, and family, and folds in comparison/promotion
    latest pointers. Key order here is the key order of the rendered JSON.
    """
    runs_root = write_root / "runs"
    comparisons_root = write_root / "comparisons"
    promotions_root = write_root / "promotions"

    # One entry per run directory that produced a summary.json.
    run_entries = [
        build_run_entry(path)
        for path in sorted(runs_root.glob("*/summary.json"))
    ]

    latest_by_target_label = latest_entries_by_key(run_entries, "target_label")
    latest_by_benchmark_id = latest_entries_by_key(run_entries, "benchmark_id")
    latest_by_family = latest_entries_by_key(run_entries, "benchmark_family")

    comparisons = collect_latest_group(comparisons_root, "comparison")
    promotions = collect_latest_group(promotions_root, "promotion")

    return {
        "catalog_id": "runtime-benchmarks-catalog-v1",
        "generated_at": utc_now(),
        "write_root": str(write_root),
        "retention_posture": "keep raw runs as evidence, use latest pointers and this catalog for durable navigation",
        "runs": {
            "count": len(run_entries),
            "index_ref": str(write_root / "runs" / "index.json"),
            # Per-view projections keep only the fields a reader needs for navigation.
            "latest_by_target_label": {
                key: {
                    "run_ref": value["run_ref"],
                    "captured_at": value["captured_at"],
                    "overall_mean_s": value["overall_mean_s"],
                    "case_means_s": value["case_means_s"],
                }
                for key, value in latest_by_target_label.items()
            },
            "latest_by_benchmark_id": {
                key: {
                    "run_ref": value["run_ref"],
                    "captured_at": value["captured_at"],
                    "overall_mean_s": value["overall_mean_s"],
                }
                for key, value in latest_by_benchmark_id.items()
            },
            "latest_by_family": {
                key: {
                    "run_ref": value["run_ref"],
                    "target_label": value["target_label"],
                    "captured_at": value["captured_at"],
                }
                for key, value in latest_by_family.items()
            },
        },
        "comparisons": comparisons,
        "promotions": promotions,
    }


def main() -> int:
    """CLI entry point: rebuild catalog.json, latest/index.json, and runs/index.json.

    Prints a one-line JSON status summary on success and returns 0.
    """
    parser = argparse.ArgumentParser(description="Build a durable runtime benchmark catalog")
    parser.add_argument(
        "--write-root",
        default=str(DEFAULT_WRITE_ROOT),
        help="runtime benchmark root (default: %(default)s)",
    )
    options = parser.parse_args()

    write_root = Path(options.write_root).expanduser().resolve()
    catalog = build_catalog(write_root)

    # NOTE(review): this re-reads every runs/*/summary.json that build_catalog
    # already scanned; harmless today, but the two passes could disagree if a
    # run lands mid-invocation — consider having build_catalog expose entries.
    runs_root = write_root / "runs"
    run_entries = [
        build_run_entry(path)
        for path in sorted(runs_root.glob("*/summary.json"))
    ]

    latest_root = write_root / "latest"
    latest_index = {
        "generated_at": catalog["generated_at"],
        "catalog_ref": str(write_root / "catalog.json"),
        "comparison_refs": {
            key: value.get("pointer_ref")
            for key, value in catalog["comparisons"].items()
        },
        "promotion_refs": {
            key: value.get("pointer_ref")
            for key, value in catalog["promotions"].items()
        },
        "latest_runs_by_target_label": catalog["runs"]["latest_by_target_label"],
    }
    run_index = {
        "generated_at": catalog["generated_at"],
        "count": len(run_entries),
        "entries": run_entries,
    }

    write_json(write_root / "catalog.json", catalog)
    write_json(latest_root / "index.json", latest_index)
    write_json(runs_root / "index.json", run_index)

    status = {
        "ok": True,
        "catalog_ref": str(write_root / "catalog.json"),
        "latest_ref": str(latest_root / "index.json"),
        "run_index_ref": str(runs_root / "index.json"),
        "run_count": len(run_entries),
        "comparison_count": len(catalog["comparisons"]),
        "promotion_count": len(catalog["promotions"]),
    }
    print(json.dumps(status, ensure_ascii=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
1 change: 1 addition & 0 deletions scripts/validate_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"aoa-w5-pilot",
"aoa-w6-pilot",
"aoa-llamacpp-pilot",
"aoa-runtime-bench-index",
"aoa-qwen-check",
"aoa-qwen-run",
"aoa-qwen-bench",
Expand Down
Loading