From 614cda7658a5c5cd61a324222664d49aed8ec21a Mon Sep 17 00:00:00 2001 From: 8Dionysus Date: Mon, 30 Mar 2026 15:09:01 -0600 Subject: [PATCH] Add benchmark retention classes --- docs/RUNTIME_BENCH_POLICY.md | 10 ++ scripts/aoa-runtime-bench-index | 172 +++++++++++++++++++++++++++++++- 2 files changed, 180 insertions(+), 2 deletions(-) diff --git a/docs/RUNTIME_BENCH_POLICY.md b/docs/RUNTIME_BENCH_POLICY.md index e359971..92bbf23 100644 --- a/docs/RUNTIME_BENCH_POLICY.md +++ b/docs/RUNTIME_BENCH_POLICY.md @@ -73,6 +73,7 @@ Recommended active tree: ```text ${AOA_STACK_ROOT}/Logs/runtime-benchmarks/ catalog.json + retention.json latest/ index.json runs/ @@ -131,9 +132,18 @@ scripts/aoa-runtime-bench-index That helper writes: - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/catalog.json` +- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/retention.json` - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/latest/index.json` - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/runs/index.json` +Retention classes: +- `canonical` + Current latest pointers and the latest run for each active target label. +- `historical` + Older runs that remain part of an active lineage or older comparison/promotion history. +- `exploratory` + Local evidence that is not part of the current default comparison surface. + ## Relationship to local trial programs If you need a supervised per-case trial program rather than a standalone benchmark run, use: diff --git a/scripts/aoa-runtime-bench-index b/scripts/aoa-runtime-bench-index index 5c5c26e..44f85da 100755 --- a/scripts/aoa-runtime-bench-index +++ b/scripts/aoa-runtime-bench-index @@ -73,6 +73,33 @@ def build_run_entry(summary_path: Path) -> dict[str, Any]: } +def referenced_run_refs_from_comparison(path: Path) -> set[str]: + payload = load_json(path) + refs: set[str] = set() + for key in ("baseline_run_ref", "candidate_run_ref"): + value = payload.get(key) + if isinstance(value, str) and value: + refs.add(value) + return refs + + +def referenced_run_refs_from_promotion(path: Path) -> set[str]: + payload = load_json(path) + refs: set[str] = set() + value = payload.get("baseline_run_ref") + if isinstance(value, str) and value: + refs.add(value) + + for screening_path in sorted(path.parent.glob("*.screening.json")): + screening = load_json(screening_path) + bench = screening.get("bench") + if isinstance(bench, dict): + run_dir = bench.get("run_dir") + if isinstance(run_dir, str) and run_dir: + refs.add(run_dir) + return refs + + def latest_entries_by_key(entries: list[dict[str, Any]], key: str) -> dict[str, dict[str, Any]]: latest: dict[str, dict[str, Any]] = {} for entry in entries: @@ -142,6 +169,107 @@ def collect_latest_group(root: Path, kind: str) -> dict[str, dict[str, Any]]: return results +def collect_packet_references(write_root: Path) -> tuple[dict[str, set[str]], dict[str, set[str]]]: + latest_refs = {"comparison": set(), "promotion": set()} + all_refs = {"comparison": set(), "promotion": set()} + + comparisons_root = write_root / "comparisons" + promotions_root = write_root / "promotions" + + for comparison_path in sorted(comparisons_root.glob("*/runs/*/comparison.json")): + refs = referenced_run_refs_from_comparison(comparison_path) + all_refs["comparison"].update(refs) + for promotion_path in sorted(promotions_root.glob("*/runs/*/promotion.json")): + refs = referenced_run_refs_from_promotion(promotion_path) + all_refs["promotion"].update(refs) + + for child in sorted(path for path in comparisons_root.iterdir() if path.is_dir()): + pointer = child / "latest.json" + if not pointer.exists(): + continue + payload = load_json(pointer) + comparison_ref = payload.get("comparison_ref") + if isinstance(comparison_ref, str) and Path(comparison_ref).exists(): + latest_refs["comparison"].update(referenced_run_refs_from_comparison(Path(comparison_ref))) + + for child in sorted(path for path in promotions_root.iterdir() if path.is_dir()): + pointer = child / "latest.json" + if not pointer.exists(): + continue + payload = load_json(pointer) + promotion_ref = payload.get("promotion_ref") + if isinstance(promotion_ref, str) and Path(promotion_ref).exists(): + latest_refs["promotion"].update(referenced_run_refs_from_promotion(Path(promotion_ref))) + + return latest_refs, all_refs + + +def classify_retention( + run_entries: list[dict[str, Any]], + latest_by_target_label: dict[str, dict[str, Any]], + latest_packet_refs: dict[str, set[str]], + all_packet_refs: dict[str, set[str]], +) -> dict[str, Any]: + latest_target_refs = { + str(entry["run_ref"]) + for entry in latest_by_target_label.values() + if isinstance(entry.get("run_ref"), str) + } + latest_pointer_refs = set().union(*latest_packet_refs.values()) + all_pointer_refs = set().union(*all_packet_refs.values()) + active_target_labels = set(latest_by_target_label.keys()) + + by_class: dict[str, list[dict[str, Any]]] = { + "canonical": [], + "historical": [], + "exploratory": [], + } + run_class_map: dict[str, dict[str, str]] = {} + + for entry in run_entries: + run_ref = str(entry["run_ref"]) + target_label = str(entry.get("target_label") or "") + if run_ref in latest_pointer_refs: + retention_class = "canonical" + reason = "referenced by the current latest comparison or promotion packet" + elif run_ref in latest_target_refs: + retention_class = "canonical" + reason = "latest run for its target label" + elif run_ref in all_pointer_refs: + retention_class = "historical" + reason = "referenced by an older comparison or promotion packet" + elif target_label in active_target_labels: + retention_class = "historical" + reason = "older run in an active target lineage" + else: + retention_class = "exploratory" + reason = "not referenced by current or historical durable comparison surfaces" + + retained = { + "run_ref": run_ref, + "captured_at": entry.get("captured_at"), + "target_label": entry.get("target_label"), + "benchmark_id": entry.get("benchmark_id"), + "overall_mean_s": entry.get("overall_mean_s"), + "retention_reason": reason, + } + by_class[retention_class].append(retained) + run_class_map[run_ref] = { + "retention_class": retention_class, + "retention_reason": reason, + } + + return { + "classes": by_class, + "run_class_map": run_class_map, + "counts": { + "canonical": len(by_class["canonical"]), + "historical": len(by_class["historical"]), + "exploratory": len(by_class["exploratory"]), + }, + } + + def build_catalog(write_root: Path) -> dict[str, Any]: runs_root = write_root / "runs" comparisons_root = write_root / "comparisons" @@ -155,6 +283,8 @@ def build_catalog(write_root: Path) -> dict[str, Any]: latest_by_target_label = latest_entries_by_key(run_entries, "target_label") latest_by_benchmark_id = latest_entries_by_key(run_entries, "benchmark_id") latest_by_family = latest_entries_by_key(run_entries, "benchmark_family") + latest_packet_refs, all_packet_refs = collect_packet_references(write_root) + retention = classify_retention(run_entries, latest_by_target_label, latest_packet_refs, all_packet_refs) comparisons = collect_latest_group(comparisons_root, "comparison") promotions = collect_latest_group(promotions_root, "promotion") @@ -164,6 +294,14 @@ def build_catalog(write_root: Path) -> dict[str, Any]: "generated_at": utc_now(), "write_root": str(write_root), "retention_posture": "keep raw runs as evidence, use latest pointers and this catalog for durable navigation", + "retention": { + "counts": retention["counts"], + "policy": { + "canonical": "current latest pointers and latest runs for active target labels", + "historical": "older runs in active lineages or runs referenced by older comparison/promotion packets", + "exploratory": "runs not referenced by durable comparison surfaces", + }, + }, "runs": { "count": len(run_entries), "index_ref": str(write_root / "runs" / "index.json"), @@ -195,6 +333,7 @@ def build_catalog(write_root: Path) -> dict[str, Any]: }, "comparisons": comparisons, "promotions": promotions, + "retention_ref": str(write_root / "retention.json"), } @@ -215,10 +354,20 @@ def main() -> int: build_run_entry(path) for path in sorted(runs_root.glob("*/summary.json")) ] + latest_by_target_label = latest_entries_by_key(run_entries, "target_label") + latest_packet_refs, all_packet_refs = collect_packet_references(write_root) + retention = classify_retention(run_entries, latest_by_target_label, latest_packet_refs, all_packet_refs) + run_entries_with_retention = [] + for entry in run_entries: + enriched = dict(entry) + enriched.update(retention["run_class_map"][str(entry["run_ref"])]) + run_entries_with_retention.append(enriched) + latest_root = write_root / "latest" latest_index = { "generated_at": catalog["generated_at"], "catalog_ref": str(write_root / "catalog.json"), + "retention_ref": str(write_root / "retention.json"), "comparison_refs": { key: value.get("pointer_ref") for key, value in catalog["comparisons"].items() @@ -228,25 +377,44 @@ def main() -> int: for key, value in catalog["promotions"].items() }, "latest_runs_by_target_label": catalog["runs"]["latest_by_target_label"], + "canonical_run_refs": [ + item["run_ref"] + for item in retention["classes"]["canonical"] + ], } run_index = { "generated_at": catalog["generated_at"], - "count": len(run_entries), - "entries": run_entries, + "count": len(run_entries_with_retention), + "entries": run_entries_with_retention, + } + retention_index = { + "generated_at": catalog["generated_at"], + "counts": retention["counts"], + "classes": retention["classes"], + "notes": { + "canonical": "use these first for repeatable control-path and promotion-path comparison", + "historical": "keep these for lineage and reviewable decision history", + "exploratory": "keep these as local evidence, but do not treat them as the default comparison set", + }, } write_json(write_root / "catalog.json", catalog) write_json(latest_root / "index.json", latest_index) write_json(runs_root / "index.json", run_index) + write_json(write_root / "retention.json", retention_index) print(json.dumps({ "ok": True, "catalog_ref": str(write_root / "catalog.json"), "latest_ref": str(latest_root / "index.json"), "run_index_ref": str(runs_root / "index.json"), + "retention_ref": str(write_root / "retention.json"), "run_count": len(run_entries), "comparison_count": len(catalog["comparisons"]), "promotion_count": len(catalog["promotions"]), + "canonical_count": retention["counts"]["canonical"], + "historical_count": retention["counts"]["historical"], + "exploratory_count": retention["counts"]["exploratory"], }, ensure_ascii=True)) return 0