Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions docs/RUNTIME_BENCH_POLICY.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ Recommended active tree:
${AOA_STACK_ROOT}/Logs/runtime-benchmarks/
catalog.json
retention.json
cohorts.json
latest/
index.json
runs/
Expand Down Expand Up @@ -133,6 +134,7 @@ scripts/aoa-runtime-bench-index
That helper writes:
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/catalog.json`
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/retention.json`
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/cohorts.json`
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/latest/index.json`
- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/runs/index.json`

Expand All @@ -144,6 +146,18 @@ Retention classes:
- `exploratory`
Local evidence that is not part of the current default comparison surface.

Cohort layer:
- `current-control`
The default local control-path runs to compare against first.
- `promotion-basis`
The runs directly used by the current comparison/promotion verdict path.
- `current-promoted`
The latest promoted winner runs for the active backend substrate.
- `comparison-challenger`
The latest challenger runs retained beside the promoted winner.
- `legacy-baseline`
Older control-path runs kept for drift review.

## Relationship to local trial programs

If you need a supervised per-case trial program rather than a standalone benchmark run, use:
Expand Down
145 changes: 145 additions & 0 deletions scripts/aoa-runtime-bench-index
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,15 @@ def build_run_entry(summary_path: Path) -> dict[str, Any]:
summary = load_json(summary_path)
run_root = summary_path.parent
timestamp_token, benchmark_family, target_label = parse_run_dir_name(run_root.name)
manifest_path = run_root / "benchmark.manifest.json"
manifest = load_json(manifest_path) if manifest_path.exists() else {}
system_under_test = manifest.get("system_under_test") if isinstance(manifest, dict) else {}
if not isinstance(system_under_test, dict):
system_under_test = {}
return {
"run_ref": str(run_root),
"summary_ref": str(summary_path),
"manifest_ref": str(manifest_path) if manifest_path.exists() else None,
"run_dir": run_root.name,
"timestamp_token": timestamp_token,
"captured_at": summary.get("captured_at"),
Expand All @@ -66,6 +72,9 @@ def build_run_entry(summary_path: Path) -> dict[str, Any]:
"target_label": target_label,
"all_passed": bool(summary.get("all_passed")),
"runtime_selection": summary.get("runtime_selection"),
"backend": system_under_test.get("backend"),
"model": system_under_test.get("model"),
"runtime_variant": system_under_test.get("quantization_or_runtime_variant"),
"overall_mean_s": summary.get("overall_mean_s"),
"overall_best_s": summary.get("overall_best_s"),
"overall_worst_s": summary.get("overall_worst_s"),
Expand Down Expand Up @@ -270,6 +279,119 @@ def classify_retention(
}


def latest_promotion_payload(write_root: Path) -> dict[str, Any] | None:
pointer = write_root / "promotions" / "llamacpp-promotion-gate-v1" / "latest.json"
if not pointer.exists():
return None
payload = load_json(pointer)
promotion_ref = payload.get("promotion_ref")
if not isinstance(promotion_ref, str):
return None
promotion_path = Path(promotion_ref)
if not promotion_path.exists():
return None
promotion = load_json(promotion_path)
promotion["_promotion_path"] = str(promotion_path)
return promotion


def determine_cohorts(
    write_root: Path,
    run_entries: list[dict[str, Any]],
    latest_by_target_label: dict[str, dict[str, Any]],
    latest_packet_refs: dict[str, set[str]],
    retention: dict[str, Any],
) -> dict[str, Any]:
    """Assign benchmark runs to the named cohort classes.

    Builds the cohort layer documented in RUNTIME_BENCH_POLICY.md
    (``current-control``, ``promotion-basis``, ``current-promoted``,
    ``comparison-challenger``, ``legacy-baseline``) from the run entries
    plus the retention classification and the latest promotion verdict.

    Args:
        write_root: Root of the runtime-benchmarks log tree (used to locate
            the latest promotion payload and its screening files).
        run_entries: All indexed run entries; each must carry a ``run_ref``.
        latest_by_target_label: Newest run entry per target label.
        latest_packet_refs: run-ref sets referenced by the latest
            comparison/promotion packets, keyed by packet.
        retention: Output of ``classify_retention`` (its
            ``classes.historical`` rows feed ``legacy-baseline``).

    Returns:
        Dict with ``notes`` (cohort id -> description), ``classes``
        (cohort id -> ordered row summaries), ``counts`` (cohort id ->
        row count), and ``run_memberships`` (run_ref -> cohort ids).
    """
    entries_by_ref = {str(entry["run_ref"]): entry for entry in run_entries}
    # Control-path labels: anything that is not a llama.cpp target, plus the
    # explicit ollama baseline. NOTE(review): the "ollama-baseline" clause
    # only matters if a label can contain both tokens — presumably a
    # defensive inclusion; confirm against actual target labels.
    control_target_labels = {
        label
        for label in latest_by_target_label
        if "llamacpp" not in label or "ollama-baseline" in label
    }
    current_control_refs = sorted(
        str(latest_by_target_label[label]["run_ref"])
        for label in sorted(control_target_labels)
        if label in latest_by_target_label
    )

    # Historical control-path runs kept for lineage/drift review.
    legacy_baseline_refs = sorted(
        row["run_ref"]
        for row in retention["classes"]["historical"]
        if row.get("target_label") in control_target_labels
    )

    # Union of every run referenced by the latest comparison/promotion packets.
    promotion_basis_refs = sorted(set().union(*latest_packet_refs.values()))

    promoted_substrate_refs: list[str] = []
    comparison_challenger_refs: list[str] = []
    promotion = latest_promotion_payload(write_root)
    if promotion:
        winner_quant = promotion.get("winner_quant")
        # Screening files live beside the promotion document; each one ties a
        # quant variant to the benchmark run that screened it.
        screening_paths = sorted(
            Path(str(promotion["_promotion_path"])).parent.glob("*.screening.json")
        )
        for screening_path in screening_paths:
            screening = load_json(screening_path)
            bench = screening.get("bench")
            if not isinstance(bench, dict):
                continue
            run_ref = bench.get("run_dir")
            # Only cohort runs that are actually in the current index.
            if not isinstance(run_ref, str) or run_ref not in entries_by_ref:
                continue
            quant = screening.get("quant")
            if quant == winner_quant:
                promoted_substrate_refs.append(run_ref)
            else:
                comparison_challenger_refs.append(run_ref)

    cohort_order = [
        ("current-control", current_control_refs, "latest control-path runs used as the default local baseline set"),
        ("promotion-basis", promotion_basis_refs, "runs referenced by the current latest comparison or promotion packets"),
        ("current-promoted", sorted(set(promoted_substrate_refs)), "latest promoted llama.cpp winner runs referenced by the current promotion verdict"),
        ("comparison-challenger", sorted(set(comparison_challenger_refs)), "latest challenger runs kept beside the promoted winner for bounded backend comparison"),
        ("legacy-baseline", legacy_baseline_refs, "historical control-path runs kept for baseline lineage and drift review"),
    ]

    classes: dict[str, list[dict[str, Any]]] = {}
    # Every indexed run gets a (possibly empty) membership list so downstream
    # enrichment can index by run_ref without a KeyError.
    run_memberships: dict[str, list[str]] = {str(entry["run_ref"]): [] for entry in run_entries}
    notes: dict[str, str] = {}

    for cohort_id, refs, note in cohort_order:
        notes[cohort_id] = note
        seen: set[str] = set()
        rows: list[dict[str, Any]] = []
        for run_ref in refs:
            # De-duplicate within a cohort while preserving sorted order.
            if run_ref in seen:
                continue
            seen.add(run_ref)
            entry = entries_by_ref.get(run_ref)
            if entry is None:
                # Packet/screening references can point at pruned runs;
                # skip them rather than fabricate a row.
                continue
            rows.append(
                {
                    "run_ref": run_ref,
                    "captured_at": entry.get("captured_at"),
                    "target_label": entry.get("target_label"),
                    "benchmark_id": entry.get("benchmark_id"),
                    "overall_mean_s": entry.get("overall_mean_s"),
                }
            )
            run_memberships[run_ref].append(cohort_id)
        classes[cohort_id] = rows

    return {
        "notes": notes,
        "classes": classes,
        "counts": {key: len(value) for key, value in classes.items()},
        "run_memberships": run_memberships,
    }


def build_catalog(write_root: Path) -> dict[str, Any]:
runs_root = write_root / "runs"
comparisons_root = write_root / "comparisons"
Expand All @@ -285,6 +407,7 @@ def build_catalog(write_root: Path) -> dict[str, Any]:
latest_by_family = latest_entries_by_key(run_entries, "benchmark_family")
latest_packet_refs, all_packet_refs = collect_packet_references(write_root)
retention = classify_retention(run_entries, latest_by_target_label, latest_packet_refs, all_packet_refs)
cohorts = determine_cohorts(write_root, run_entries, latest_by_target_label, latest_packet_refs, retention)

comparisons = collect_latest_group(comparisons_root, "comparison")
promotions = collect_latest_group(promotions_root, "promotion")
Expand All @@ -302,6 +425,11 @@ def build_catalog(write_root: Path) -> dict[str, Any]:
"exploratory": "runs not referenced by durable comparison surfaces",
},
},
"cohorts": {
"counts": cohorts["counts"],
"notes": cohorts["notes"],
"ref": str(write_root / "cohorts.json"),
},
"runs": {
"count": len(run_entries),
"index_ref": str(write_root / "runs" / "index.json"),
Expand Down Expand Up @@ -357,17 +485,20 @@ def main() -> int:
latest_by_target_label = latest_entries_by_key(run_entries, "target_label")
latest_packet_refs, all_packet_refs = collect_packet_references(write_root)
retention = classify_retention(run_entries, latest_by_target_label, latest_packet_refs, all_packet_refs)
cohorts = determine_cohorts(write_root, run_entries, latest_by_target_label, latest_packet_refs, retention)
run_entries_with_retention = []
for entry in run_entries:
enriched = dict(entry)
enriched.update(retention["run_class_map"][str(entry["run_ref"])])
enriched["cohort_memberships"] = cohorts["run_memberships"][str(entry["run_ref"])]
run_entries_with_retention.append(enriched)

latest_root = write_root / "latest"
latest_index = {
"generated_at": catalog["generated_at"],
"catalog_ref": str(write_root / "catalog.json"),
"retention_ref": str(write_root / "retention.json"),
"cohorts_ref": str(write_root / "cohorts.json"),
"comparison_refs": {
key: value.get("pointer_ref")
for key, value in catalog["comparisons"].items()
Expand All @@ -381,6 +512,10 @@ def main() -> int:
item["run_ref"]
for item in retention["classes"]["canonical"]
],
"cohort_refs": {
key: [row["run_ref"] for row in value]
for key, value in cohorts["classes"].items()
},
}
run_index = {
"generated_at": catalog["generated_at"],
Expand All @@ -397,24 +532,34 @@ def main() -> int:
"exploratory": "keep these as local evidence, but do not treat them as the default comparison set",
},
}
cohort_index = {
"generated_at": catalog["generated_at"],
"counts": cohorts["counts"],
"notes": cohorts["notes"],
"classes": cohorts["classes"],
}

write_json(write_root / "catalog.json", catalog)
write_json(latest_root / "index.json", latest_index)
write_json(runs_root / "index.json", run_index)
write_json(write_root / "retention.json", retention_index)
write_json(write_root / "cohorts.json", cohort_index)

print(json.dumps({
"ok": True,
"catalog_ref": str(write_root / "catalog.json"),
"latest_ref": str(latest_root / "index.json"),
"run_index_ref": str(runs_root / "index.json"),
"retention_ref": str(write_root / "retention.json"),
"cohorts_ref": str(write_root / "cohorts.json"),
"run_count": len(run_entries),
"comparison_count": len(catalog["comparisons"]),
"promotion_count": len(catalog["promotions"]),
"canonical_count": retention["counts"]["canonical"],
"historical_count": retention["counts"]["historical"],
"exploratory_count": retention["counts"]["exploratory"],
"current_control_count": cohorts["counts"].get("current-control", 0),
"promotion_basis_count": cohorts["counts"].get("promotion-basis", 0),
}, ensure_ascii=True))
return 0

Expand Down
Loading