From f969d4c68e91edb7dcfada1affdf104dd8fcf9fd Mon Sep 17 00:00:00 2001 From: 8Dionysus Date: Mon, 30 Mar 2026 15:13:48 -0600 Subject: [PATCH] Add benchmark comparison cohorts --- docs/RUNTIME_BENCH_POLICY.md | 14 +++ scripts/aoa-runtime-bench-index | 145 ++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) diff --git a/docs/RUNTIME_BENCH_POLICY.md b/docs/RUNTIME_BENCH_POLICY.md index 92bbf23..e5c1528 100644 --- a/docs/RUNTIME_BENCH_POLICY.md +++ b/docs/RUNTIME_BENCH_POLICY.md @@ -74,6 +74,7 @@ Recommended active tree: ${AOA_STACK_ROOT}/Logs/runtime-benchmarks/ catalog.json retention.json + cohorts.json latest/ index.json runs/ @@ -133,6 +134,7 @@ scripts/aoa-runtime-bench-index That helper writes: - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/catalog.json` - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/retention.json` +- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/cohorts.json` - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/latest/index.json` - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/runs/index.json` @@ -144,6 +146,18 @@ Retention classes: - `exploratory` Local evidence that is not part of the current default comparison surface. +Cohort layer: +- `current-control` + The default local control-path runs to compare against first. +- `promotion-basis` + The runs directly used by the current comparison/promotion verdict path. +- `current-promoted` + The latest promoted winner runs for the active backend substrate. +- `comparison-challenger` + The latest challenger runs retained beside the promoted winner. +- `legacy-baseline` + Older control-path runs kept for drift review. + ## Relationship to local trial programs If you need a supervised per-case trial program rather than a standalone benchmark run, use: diff --git a/scripts/aoa-runtime-bench-index b/scripts/aoa-runtime-bench-index index 44f85da..f5bb972 100755 --- a/scripts/aoa-runtime-bench-index +++ b/scripts/aoa-runtime-bench-index @@ -55,9 +55,15 @@ def build_run_entry(summary_path: Path) -> dict[str, Any]: summary = load_json(summary_path) run_root = summary_path.parent timestamp_token, benchmark_family, target_label = parse_run_dir_name(run_root.name) + manifest_path = run_root / "benchmark.manifest.json" + manifest = load_json(manifest_path) if manifest_path.exists() else {} + system_under_test = manifest.get("system_under_test") if isinstance(manifest, dict) else {} + if not isinstance(system_under_test, dict): + system_under_test = {} return { "run_ref": str(run_root), "summary_ref": str(summary_path), + "manifest_ref": str(manifest_path) if manifest_path.exists() else None, "run_dir": run_root.name, "timestamp_token": timestamp_token, "captured_at": summary.get("captured_at"), @@ -66,6 +72,9 @@ def build_run_entry(summary_path: Path) -> dict[str, Any]: "target_label": target_label, "all_passed": bool(summary.get("all_passed")), "runtime_selection": summary.get("runtime_selection"), + "backend": system_under_test.get("backend"), + "model": system_under_test.get("model"), + "runtime_variant": system_under_test.get("quantization_or_runtime_variant"), "overall_mean_s": summary.get("overall_mean_s"), "overall_best_s": summary.get("overall_best_s"), "overall_worst_s": summary.get("overall_worst_s"), @@ -270,6 +279,119 @@ def classify_retention( } +def latest_promotion_payload(write_root: Path) -> dict[str, Any] | None: + pointer = write_root / "promotions" / "llamacpp-promotion-gate-v1" / "latest.json" + if not pointer.exists(): + return None + payload = load_json(pointer) + promotion_ref = payload.get("promotion_ref") + if not isinstance(promotion_ref, str): + return None + promotion_path = Path(promotion_ref) + if not promotion_path.exists(): + return None + promotion = load_json(promotion_path) + promotion["_promotion_path"] = str(promotion_path) + return promotion + + +def determine_cohorts( + write_root: Path, + run_entries: list[dict[str, Any]], + latest_by_target_label: dict[str, dict[str, Any]], + latest_packet_refs: dict[str, set[str]], + retention: dict[str, Any], +) -> dict[str, Any]: + entries_by_ref = {str(entry["run_ref"]): entry for entry in run_entries} + latest_target_refs = { + str(entry["run_ref"]) + for entry in latest_by_target_label.values() + if isinstance(entry.get("run_ref"), str) + } + control_target_labels = { + label + for label in latest_by_target_label + if "llamacpp" not in label or "ollama-baseline" in label + } + current_control_refs = sorted( + str(latest_by_target_label[label]["run_ref"]) + for label in sorted(control_target_labels) + if label in latest_by_target_label + ) + + legacy_baseline_refs = sorted( + row["run_ref"] + for row in retention["classes"]["historical"] + if row.get("target_label") in control_target_labels + ) + + promotion_basis_refs = sorted(set().union(*latest_packet_refs.values())) + + promoted_substrate_refs: list[str] = [] + comparison_challenger_refs: list[str] = [] + promotion = latest_promotion_payload(write_root) + if promotion: + winner_quant = promotion.get("winner_quant") + screening_paths = sorted( + Path(str(promotion["_promotion_path"])).parent.glob("*.screening.json") + ) + for screening_path in screening_paths: + screening = load_json(screening_path) + bench = screening.get("bench") + if not isinstance(bench, dict): + continue + run_ref = bench.get("run_dir") + if not isinstance(run_ref, str) or run_ref not in entries_by_ref: + continue + quant = screening.get("quant") + if quant == winner_quant: + promoted_substrate_refs.append(run_ref) + else: + comparison_challenger_refs.append(run_ref) + + cohort_order = [ + ("current-control", current_control_refs, "latest control-path runs used as the default local baseline set"), + ("promotion-basis", promotion_basis_refs, "runs referenced by the current latest comparison or promotion packets"), + ("current-promoted", sorted(set(promoted_substrate_refs)), "latest promoted llama.cpp winner runs referenced by the current promotion verdict"), + ("comparison-challenger", sorted(set(comparison_challenger_refs)), "latest challenger runs kept beside the promoted winner for bounded backend comparison"), + ("legacy-baseline", legacy_baseline_refs, "historical control-path runs kept for baseline lineage and drift review"), + ] + + classes: dict[str, list[dict[str, Any]]] = {} + run_memberships: dict[str, list[str]] = {str(entry["run_ref"]): [] for entry in run_entries} + notes: dict[str, str] = {} + + for cohort_id, refs, note in cohort_order: + notes[cohort_id] = note + seen: set[str] = set() + rows: list[dict[str, Any]] = [] + for run_ref in refs: + if run_ref in seen: + continue + seen.add(run_ref) + entry = entries_by_ref.get(run_ref) + if entry is None: + continue + rows.append( + { + "run_ref": run_ref, + "captured_at": entry.get("captured_at"), + "target_label": entry.get("target_label"), + "benchmark_id": entry.get("benchmark_id"), + "overall_mean_s": entry.get("overall_mean_s"), + } + ) + run_memberships[run_ref].append(cohort_id) + classes[cohort_id] = rows + + return { + "notes": notes, + "classes": classes, + "counts": {key: len(value) for key, value in classes.items()}, + "run_memberships": run_memberships, + } + + def build_catalog(write_root: Path) -> dict[str, Any]: runs_root = write_root / "runs" comparisons_root = write_root / "comparisons" @@ -285,6 +407,7 @@ def build_catalog(write_root: Path) -> dict[str, Any]: latest_by_family = latest_entries_by_key(run_entries, "benchmark_family") latest_packet_refs, all_packet_refs = collect_packet_references(write_root) retention = classify_retention(run_entries, latest_by_target_label, latest_packet_refs, all_packet_refs) + cohorts = determine_cohorts(write_root, run_entries, latest_by_target_label, latest_packet_refs, retention) comparisons = collect_latest_group(comparisons_root, "comparison") promotions = collect_latest_group(promotions_root, "promotion") @@ -302,6 +425,11 @@ def build_catalog(write_root: Path) -> dict[str, Any]: "exploratory": "runs not referenced by durable comparison surfaces", }, }, + "cohorts": { + "counts": cohorts["counts"], + "notes": cohorts["notes"], + "ref": str(write_root / "cohorts.json"), + }, "runs": { "count": len(run_entries), "index_ref": str(write_root / "runs" / "index.json"), @@ -357,10 +485,12 @@ def main() -> int: latest_by_target_label = latest_entries_by_key(run_entries, "target_label") latest_packet_refs, all_packet_refs = collect_packet_references(write_root) retention = classify_retention(run_entries, latest_by_target_label, latest_packet_refs, all_packet_refs) + cohorts = determine_cohorts(write_root, run_entries, latest_by_target_label, latest_packet_refs, retention) run_entries_with_retention = [] for entry in run_entries: enriched = dict(entry) enriched.update(retention["run_class_map"][str(entry["run_ref"])]) + enriched["cohort_memberships"] = cohorts["run_memberships"][str(entry["run_ref"])] run_entries_with_retention.append(enriched) latest_root = write_root / "latest" @@ -368,6 +498,7 @@ def main() -> int: "generated_at": catalog["generated_at"], "catalog_ref": str(write_root / "catalog.json"), "retention_ref": str(write_root / "retention.json"), + "cohorts_ref": str(write_root / "cohorts.json"), "comparison_refs": { key: value.get("pointer_ref") for key, value in catalog["comparisons"].items() @@ -381,6 +512,10 @@ def main() -> int: item["run_ref"] for item in retention["classes"]["canonical"] ], + "cohort_refs": { + key: [row["run_ref"] for row in value] + for key, value in cohorts["classes"].items() + }, } run_index = { "generated_at": catalog["generated_at"], @@ -397,11 +532,18 @@ def main() -> int: "exploratory": "keep these as local evidence, but do not treat them as the default comparison set", }, } + cohort_index = { + "generated_at": catalog["generated_at"], + "counts": cohorts["counts"], + "notes": cohorts["notes"], + "classes": cohorts["classes"], + } write_json(write_root / "catalog.json", catalog) write_json(latest_root / "index.json", latest_index) write_json(runs_root / "index.json", run_index) write_json(write_root / "retention.json", retention_index) + write_json(write_root / "cohorts.json", cohort_index) print(json.dumps({ "ok": True, @@ -409,12 +551,15 @@ def main() -> int: "latest_ref": str(latest_root / "index.json"), "run_index_ref": str(runs_root / "index.json"), "retention_ref": str(write_root / "retention.json"), + "cohorts_ref": str(write_root / "cohorts.json"), "run_count": len(run_entries), "comparison_count": len(catalog["comparisons"]), "promotion_count": len(catalog["promotions"]), "canonical_count": retention["counts"]["canonical"], "historical_count": retention["counts"]["historical"], "exploratory_count": retention["counts"]["exploratory"], + "current_control_count": cohorts["counts"].get("current-control", 0), + "promotion_basis_count": cohorts["counts"].get("promotion-basis", 0), }, ensure_ascii=True)) return 0