From f969d4c68e91edb7dcfada1affdf104dd8fcf9fd Mon Sep 17 00:00:00 2001
From: 8Dionysus <gerhmangrant@gmail.com>
Date: Mon, 30 Mar 2026 15:13:48 -0600
Subject: [PATCH] Add benchmark comparison cohorts

---
 docs/RUNTIME_BENCH_POLICY.md    |  14 +++
 scripts/aoa-runtime-bench-index | 145 ++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)

diff --git a/docs/RUNTIME_BENCH_POLICY.md b/docs/RUNTIME_BENCH_POLICY.md
index 92bbf23..e5c1528 100644
--- a/docs/RUNTIME_BENCH_POLICY.md
+++ b/docs/RUNTIME_BENCH_POLICY.md
@@ -74,6 +74,7 @@ Recommended active tree:
 ${AOA_STACK_ROOT}/Logs/runtime-benchmarks/
   catalog.json
   retention.json
+  cohorts.json
   latest/
     index.json
   runs/
@@ -133,6 +134,7 @@ scripts/aoa-runtime-bench-index
 That helper writes:
 - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/catalog.json`
 - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/retention.json`
+- `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/cohorts.json`
 - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/latest/index.json`
 - `${AOA_STACK_ROOT}/Logs/runtime-benchmarks/runs/index.json`
 
@@ -144,6 +146,18 @@ Retention classes:
 - `exploratory`
   Local evidence that is not part of the current default comparison surface.
 
+Cohort layer:
+- `current-control`
+  The default local control-path runs to compare against first.
+- `promotion-basis`
+  The runs directly used by the current comparison/promotion verdict path.
+- `current-promoted`
+  The latest promoted winner runs for the active backend substrate.
+- `comparison-challenger`
+  The latest challenger runs retained beside the promoted winner.
+- `legacy-baseline`
+  Older control-path runs kept for drift review.
+
 ## Relationship to local trial programs
 
 If you need a supervised per-case trial program rather than a standalone benchmark run, use:
diff --git a/scripts/aoa-runtime-bench-index b/scripts/aoa-runtime-bench-index
index 44f85da..f5bb972 100755
--- a/scripts/aoa-runtime-bench-index
+++ b/scripts/aoa-runtime-bench-index
@@ -55,9 +55,15 @@ def build_run_entry(summary_path: Path) -> dict[str, Any]:
     summary = load_json(summary_path)
     run_root = summary_path.parent
     timestamp_token, benchmark_family, target_label = parse_run_dir_name(run_root.name)
+    manifest_path = run_root / "benchmark.manifest.json"
+    manifest = load_json(manifest_path) if manifest_path.exists() else {}
+    system_under_test = manifest.get("system_under_test") if isinstance(manifest, dict) else {}
+    if not isinstance(system_under_test, dict):
+        system_under_test = {}
     return {
         "run_ref": str(run_root),
         "summary_ref": str(summary_path),
+        "manifest_ref": str(manifest_path) if manifest_path.exists() else None,
         "run_dir": run_root.name,
         "timestamp_token": timestamp_token,
         "captured_at": summary.get("captured_at"),
@@ -66,6 +72,9 @@ def build_run_entry(summary_path: Path) -> dict[str, Any]:
         "target_label": target_label,
         "all_passed": bool(summary.get("all_passed")),
         "runtime_selection": summary.get("runtime_selection"),
+        "backend": system_under_test.get("backend"),
+        "model": system_under_test.get("model"),
+        "runtime_variant": system_under_test.get("quantization_or_runtime_variant"),
         "overall_mean_s": summary.get("overall_mean_s"),
         "overall_best_s": summary.get("overall_best_s"),
         "overall_worst_s": summary.get("overall_worst_s"),
@@ -270,6 +279,119 @@ def classify_retention(
     }
 
 
+def latest_promotion_payload(write_root: Path) -> dict[str, Any] | None:
+    """Load the promotion packet named by the gate's latest.json pointer, else None."""
+    pointer = write_root / "promotions" / "llamacpp-promotion-gate-v1" / "latest.json"
+    payload = load_json(pointer) if pointer.exists() else None
+    promotion_ref = payload.get("promotion_ref") if isinstance(payload, dict) else None
+    if not isinstance(promotion_ref, str):
+        return None
+    promotion_path = Path(promotion_ref)
+    promotion = load_json(promotion_path) if promotion_path.exists() else None
+    if not isinstance(promotion, dict):
+        return None
+    # Tag the packet with its own location so callers can find sibling files.
+    promotion["_promotion_path"] = str(promotion_path)
+    return promotion
+
+
+def determine_cohorts(
+    write_root: Path,
+    run_entries: list[dict[str, Any]],
+    latest_by_target_label: dict[str, dict[str, Any]],
+    latest_packet_refs: dict[str, set[str]],
+    retention: dict[str, Any],
+) -> dict[str, Any]:
+    """Group indexed runs into named comparison cohorts.
+
+    Returns a dict with "notes", "classes", "counts" and "run_memberships";
+    a run may sit in several cohorts, and refs missing from run_entries are dropped.
+    """
+    entries_by_ref = {str(entry["run_ref"]): entry for entry in run_entries}
+    # Control labels: anything without "llamacpp" in it, or tagged "ollama-baseline".
+    control_target_labels = {
+        label
+        for label in latest_by_target_label
+        if "llamacpp" not in label or "ollama-baseline" in label
+    }
+    current_control_refs = sorted(
+        str(latest_by_target_label[label]["run_ref"])
+        for label in control_target_labels
+    )
+
+    legacy_baseline_refs = sorted(
+        row["run_ref"]
+        for row in retention["classes"]["historical"]
+        if row.get("target_label") in control_target_labels
+    )
+
+    promotion_basis_refs = sorted(set().union(*latest_packet_refs.values()))
+
+    promoted_substrate_refs: list[str] = []
+    comparison_challenger_refs: list[str] = []
+    promotion = latest_promotion_payload(write_root)
+    if promotion:
+        winner_quant = promotion.get("winner_quant")
+        screening_paths = sorted(
+            Path(str(promotion["_promotion_path"])).parent.glob("*.screening.json")
+        )
+        for screening_path in screening_paths:
+            screening = load_json(screening_path)
+            bench = screening.get("bench") if isinstance(screening, dict) else None
+            if not isinstance(bench, dict):
+                continue
+            run_ref = bench.get("run_dir")
+            if not isinstance(run_ref, str) or run_ref not in entries_by_ref:
+                continue
+            quant = screening.get("quant")
+            if quant == winner_quant:
+                promoted_substrate_refs.append(run_ref)
+            else:
+                comparison_challenger_refs.append(run_ref)
+
+    cohort_order = [
+        ("current-control", current_control_refs, "latest control-path runs used as the default local baseline set"),
+        ("promotion-basis", promotion_basis_refs, "runs referenced by the current latest comparison or promotion packets"),
+        ("current-promoted", sorted(set(promoted_substrate_refs)), "latest promoted llama.cpp winner runs referenced by the current promotion verdict"),
+        ("comparison-challenger", sorted(set(comparison_challenger_refs)), "latest challenger runs kept beside the promoted winner for bounded backend comparison"),
+        ("legacy-baseline", legacy_baseline_refs, "historical control-path runs kept for baseline lineage and drift review"),
+    ]
+
+    classes: dict[str, list[dict[str, Any]]] = {}
+    run_memberships: dict[str, list[str]] = {str(entry["run_ref"]): [] for entry in run_entries}
+    notes: dict[str, str] = {}
+
+    for cohort_id, refs, note in cohort_order:
+        notes[cohort_id] = note
+        seen: set[str] = set()
+        rows: list[dict[str, Any]] = []
+        for run_ref in refs:
+            if run_ref in seen:
+                continue
+            seen.add(run_ref)
+            entry = entries_by_ref.get(run_ref)
+            if entry is None:
+                continue
+            rows.append(
+                {
+                    "run_ref": run_ref,
+                    "captured_at": entry.get("captured_at"),
+                    "target_label": entry.get("target_label"),
+                    "benchmark_id": entry.get("benchmark_id"),
+                    "overall_mean_s": entry.get("overall_mean_s"),
+                }
+            )
+            run_memberships[run_ref].append(cohort_id)
+        classes[cohort_id] = rows
+
+    return {
+        "notes": notes,
+        "classes": classes,
+        "counts": {key: len(value) for key, value in classes.items()},
+        "run_memberships": run_memberships,
+    }
+
+
 def build_catalog(write_root: Path) -> dict[str, Any]:
     runs_root = write_root / "runs"
     comparisons_root = write_root / "comparisons"
@@ -285,6 +407,7 @@ def build_catalog(write_root: Path) -> dict[str, Any]:
     latest_by_family = latest_entries_by_key(run_entries, "benchmark_family")
     latest_packet_refs, all_packet_refs = collect_packet_references(write_root)
     retention = classify_retention(run_entries, latest_by_target_label, latest_packet_refs, all_packet_refs)
+    cohorts = determine_cohorts(write_root, run_entries, latest_by_target_label, latest_packet_refs, retention)
 
     comparisons = collect_latest_group(comparisons_root, "comparison")
     promotions = collect_latest_group(promotions_root, "promotion")
@@ -302,6 +425,11 @@ def build_catalog(write_root: Path) -> dict[str, Any]:
                 "exploratory": "runs not referenced by durable comparison surfaces",
             },
         },
+        "cohorts": {
+            "counts": cohorts["counts"],
+            "notes": cohorts["notes"],
+            "ref": str(write_root / "cohorts.json"),
+        },
         "runs": {
             "count": len(run_entries),
             "index_ref": str(write_root / "runs" / "index.json"),
@@ -357,10 +485,12 @@ def main() -> int:
     latest_by_target_label = latest_entries_by_key(run_entries, "target_label")
     latest_packet_refs, all_packet_refs = collect_packet_references(write_root)
     retention = classify_retention(run_entries, latest_by_target_label, latest_packet_refs, all_packet_refs)
+    cohorts = determine_cohorts(write_root, run_entries, latest_by_target_label, latest_packet_refs, retention)
     run_entries_with_retention = []
     for entry in run_entries:
         enriched = dict(entry)
         enriched.update(retention["run_class_map"][str(entry["run_ref"])])
+        enriched["cohort_memberships"] = cohorts["run_memberships"][str(entry["run_ref"])]
         run_entries_with_retention.append(enriched)
 
     latest_root = write_root / "latest"
@@ -368,6 +498,7 @@ def main() -> int:
         "generated_at": catalog["generated_at"],
         "catalog_ref": str(write_root / "catalog.json"),
         "retention_ref": str(write_root / "retention.json"),
+        "cohorts_ref": str(write_root / "cohorts.json"),
         "comparison_refs": {
             key: value.get("pointer_ref")
             for key, value in catalog["comparisons"].items()
@@ -381,6 +512,10 @@ def main() -> int:
             item["run_ref"]
             for item in retention["classes"]["canonical"]
         ],
+        "cohort_refs": {
+            key: [row["run_ref"] for row in value]
+            for key, value in cohorts["classes"].items()
+        },
     }
     run_index = {
         "generated_at": catalog["generated_at"],
@@ -397,11 +532,18 @@ def main() -> int:
             "exploratory": "keep these as local evidence, but do not treat them as the default comparison set",
         },
     }
+    cohort_index = {
+        "generated_at": catalog["generated_at"],
+        "counts": cohorts["counts"],
+        "notes": cohorts["notes"],
+        "classes": cohorts["classes"],
+    }
 
     write_json(write_root / "catalog.json", catalog)
     write_json(latest_root / "index.json", latest_index)
     write_json(runs_root / "index.json", run_index)
     write_json(write_root / "retention.json", retention_index)
+    write_json(write_root / "cohorts.json", cohort_index)
 
     print(json.dumps({
         "ok": True,
@@ -409,12 +551,15 @@ def main() -> int:
         "latest_ref": str(latest_root / "index.json"),
         "run_index_ref": str(runs_root / "index.json"),
         "retention_ref": str(write_root / "retention.json"),
+        "cohorts_ref": str(write_root / "cohorts.json"),
         "run_count": len(run_entries),
         "comparison_count": len(catalog["comparisons"]),
         "promotion_count": len(catalog["promotions"]),
         "canonical_count": retention["counts"]["canonical"],
         "historical_count": retention["counts"]["historical"],
         "exploratory_count": retention["counts"]["exploratory"],
+        "current_control_count": cohorts["counts"].get("current-control", 0),
+        "promotion_basis_count": cohorts["counts"].get("promotion-basis", 0),
     }, ensure_ascii=True))
     return 0