criticalml-uw · tomtseng · Feb 14, 2026 · Feb 14, 2026 · Feb 14, 2026 · Feb 14, 2026
diff --git a/scripts/user/tomtseng/chat_template_refactor_testing_260217/analyze_results.py b/scripts/user/tomtseng/chat_template_refactor_testing_260217/analyze_results.py
@@ -0,0 +1,237 @@
+"""Analyze chat-attack regression test results.
+
+Compares evaluation metrics from the new chat-attack branch against the
+baseline metrics stored in nov7_trial best.json for each model-attack pair.
+
+Reports:
+- Side-by-side comparison of old vs new metrics (strong_reject, mmlu_pro_val)
+- Absolute and relative differences
+- Regressions where relative change exceeds a configurable threshold
+
+Usage:
+    uv run scripts/user/tomtseng/chat_template_refactor_testing_260217/analyze_results.py \
+        --source-results-dir /data/saad_hossain/SafeTuneBed/results/nov7_trial \
+        --new-results-dir results/chat_template_refactor_testing_260217
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+from tamperbench.utils import get_repo_root
+from tamperbench.whitebox.utils.names import EvalPath
+
+# Repository root as reported by get_repo_root(); used to build the default
+# --new-results-dir location in main().
+REPO_ROOT = get_repo_root()
+
+# Model aliases to compare. Each alias is used as a sub-directory name under
+# both the baseline and the new results directories.
+MODELS = ["qwen3_0_6b", "qwen3_0_6b_base"]
+
+# Attack names to compare. Each name is used as a sub-directory name under a
+# model's results directory.
+ATTACKS = [
+    "backdoor_finetune",
+    "benign_full_parameter_finetune",
+    "benign_lora_finetune",
+    "competing_objectives_finetune",
+    "full_parameter_finetune",
+    "lora_finetune",
+    "multilingual_finetune",
+    "no_weight_modification",
+    "style_modulation_finetune",
+]
+
+# Eval name -> metric_name in evaluation.json
+# The keys also define which evals are compared and the column order of the report.
+EVAL_METRIC_NAMES: dict[str, str] = {
+    "strong_reject": "strong_reject_score",
+    "mmlu_pro_val": "mmlu_pro_accuracy",
+}
+
+
+def load_old_metrics(source_results_dir: Path, model_alias: str, attack: str) -> dict[str, float] | None:
+    """Load baseline metrics from best.json's top trial.
+
+    Reads ``<source_results_dir>/<model_alias>/<attack>/optuna_single/best.json``
+    and pairs the first entry of ``top_trials`` with its eval names.
+
+    Args:
+        source_results_dir: Directory holding the baseline results tree.
+        model_alias: Model sub-directory name.
+        attack: Attack sub-directory name.
+
+    Returns:
+        Dict mapping eval_name -> metric_value, or None if best.json does not
+        exist or lists no trials.
+    """
+    best_json_path = source_results_dir / model_alias / attack / "optuna_single" / "best.json"
+    if not best_json_path.exists():
+        return None
+    with open(best_json_path, encoding="utf-8") as f:
+        best = json.load(f)
+
+    # A best.json without any recorded trial carries no baseline; treat it the
+    # same as a missing file instead of failing on the [0] lookup.
+    top_trials = best.get("top_trials")
+    if not top_trials:
+        return None
+    top_trial = top_trials[0]
+
+    # Trials that do not record their eval names are assumed to list values in
+    # EVAL_METRIC_NAMES order.
+    eval_names = top_trial.get("eval_names")
+    if eval_names is None:
+        eval_names = list(EVAL_METRIC_NAMES)
+    return dict(zip(eval_names, top_trial["values"], strict=False))
+
+
+def load_new_metrics(new_results_dir: Path, model_alias: str, attack: str) -> dict[str, float] | None:
+    """Load new metrics from regression test evaluation.json files.
+
+    Looks under ``<new_results_dir>/<model_alias>/<attack>`` for a ``trial_*``
+    directory and, for every eval in EVAL_METRIC_NAMES, reads the row of its
+    evaluation file whose ``metric_name`` matches the expected metric.
+
+    Args:
+        new_results_dir: Directory holding the new results tree.
+        model_alias: Model sub-directory name.
+        attack: Attack sub-directory name.
+
+    Returns:
+        Dict mapping eval_name -> metric_value for the evals that were found,
+        or None if the attack directory, a trial directory, or every metric is
+        missing.
+    """
+    attack_dir = new_results_dir / model_alias / attack
+    if not attack_dir.is_dir():
+        return None
+
+    # Find trial directory (there should be exactly one). iterdir() yields
+    # entries in arbitrary order, so sort to keep the choice reproducible if
+    # more than one trial directory is ever present.
+    trial_dirs = sorted(d for d in attack_dir.iterdir() if d.is_dir() and d.name.startswith("trial_"))
+    if not trial_dirs:
+        return None
+    trial_dir = trial_dirs[0]
+
+    metrics: dict[str, float] = {}
+    for eval_name, metric_name in EVAL_METRIC_NAMES.items():
+        eval_json_path = trial_dir / EvalPath.CHECKPOINT_EVAL_DIR / eval_name / EvalPath.EVALUATION
+        if not eval_json_path.exists():
+            # This eval produced no output; leave it out so the caller reports N/A.
+            continue
+        with open(eval_json_path, encoding="utf-8") as f:
+            data = json.load(f)
+        # Take the first row that reports the metric of interest.
+        for row in data:
+            if row.get("metric_name") == metric_name:
+                metrics[eval_name] = float(row["metric_value"])
+                break
+
+    return metrics if metrics else None
+
+
+def _relative_change(old_val: float, new_val: float) -> float:
+    """Return the signed change from ``old_val`` to ``new_val`` relative to ``|old_val|``.
+
+    A (near-)zero baseline has no meaningful relative change, so any real
+    movement away from it is reported as ``inf`` and no movement as ``0.0``.
+    """
+    diff = new_val - old_val
+    if abs(old_val) > 1e-9:
+        return diff / abs(old_val)
+    return float("inf") if abs(diff) > 1e-9 else 0.0
+
+
+def analyze(
+    source_results_dir: Path,
+    new_results_dir: Path,
+    regression_threshold: float,
+) -> None:
+    """Compare old vs new metrics and report regressions.
+
+    For every (model, attack) pair in MODELS x ATTACKS, loads the baseline and
+    the new metrics, then prints a comparison table, per-eval summary
+    statistics, and the list of pairs whose relative change exceeds the
+    threshold in either direction.
+
+    Args:
+        source_results_dir: Directory with baseline results (best.json files).
+        new_results_dir: Directory with the new regression test results.
+        regression_threshold: Absolute relative change above which a metric is
+            flagged (0.05 means 5%).
+    """
+    print(f"Source results: {source_results_dir}")
+    print(f"New results:    {new_results_dir}")
+    print(f"Regression threshold: {regression_threshold:.0%} relative change")
+    print()
+
+    eval_names = list(EVAL_METRIC_NAMES.keys())
+    records: list[dict[str, Any]] = []
+    missing_old = 0
+    missing_new = 0
+    # Metrics present on only one side of an otherwise analyzed pair.
+    uncompared_metrics = 0
+
+    for model in MODELS:
+        for attack in ATTACKS:
+            old_metrics = load_old_metrics(source_results_dir, model, attack)
+            new_metrics = load_new_metrics(new_results_dir, model, attack)
+
+            if old_metrics is None:
+                missing_old += 1
+                continue
+            if new_metrics is None:
+                missing_new += 1
+                continue
+
+            record: dict[str, Any] = {"model": model, "attack": attack}
+            for en in eval_names:
+                old_val = old_metrics.get(en)
+                new_val = new_metrics.get(en)
+                record[f"{en}_old"] = old_val
+                record[f"{en}_new"] = new_val
+                if old_val is None or new_val is None:
+                    uncompared_metrics += 1
+                    continue
+                record[f"{en}_diff"] = new_val - old_val
+                record[f"{en}_rel"] = _relative_change(old_val, new_val)
+            records.append(record)
+
+    print(f"Pairs analyzed: {len(records)}")
+    print(f"Skipped (no old baseline): {missing_old}")
+    print(f"Skipped (no new results):  {missing_new}")
+    print()
+
+    if not records:
+        print("No results to compare.")
+        return
+
+    # Print detailed comparison table
+    header = f"{'Model':<20} {'Attack':<35}"
+    for en in eval_names:
+        header += f"  {'Old':>7}  {'New':>7}  {'Diff':>8}  {'Rel%':>7}"
+    print(header)
+    print("-" * len(header))
+
+    regressions: list[dict[str, Any]] = []
+
+    for r in records:
+        row = f"{r['model']:<20} {r['attack']:<35}"
+        is_regression = False
+        for en in eval_names:
+            old_val = r.get(f"{en}_old")
+            new_val = r.get(f"{en}_new")
+            diff = r.get(f"{en}_diff")
+            rel = r.get(f"{en}_rel")
+
+            old_str = f"{old_val:.4f}" if old_val is not None else "N/A"
+            new_str = f"{new_val:.4f}" if new_val is not None else "N/A"
+            diff_str = f"{diff:+.4f}" if diff is not None else "N/A"
+            # An infinite relative change (zero baseline) has no useful percentage.
+            rel_str = f"{rel:+.1%}" if rel is not None and rel != float("inf") else "N/A"
+
+            row += f"  {old_str:>7}  {new_str:>7}  {diff_str:>8}  {rel_str:>7}"
+
+            # Changes in either direction count: the goal is matching the baseline.
+            if rel is not None and abs(rel) > regression_threshold:
+                is_regression = True
+
+        if is_regression:
+            row += "  *** REGRESSION ***"
+            regressions.append(r)
+        print(row)
+
+    print()
+
+    # Summary statistics
+    for en in eval_names:
+        diffs = [r[f"{en}_diff"] for r in records if r.get(f"{en}_diff") is not None]
+        if diffs:
+            avg_diff = sum(diffs) / len(diffs)
+            # Largest change by magnitude, printed with its sign.
+            max_diff = max(diffs, key=abs)
+            print(f"{en}: avg diff = {avg_diff:+.4f}, max abs diff = {max_diff:+.4f}")
+
+    print()
+
+    # Report regressions
+    if regressions:
+        print(f"*** {len(regressions)} REGRESSIONS DETECTED (>{regression_threshold:.0%} relative change) ***")
+        for r in regressions:
+            print(f"  {r['model']}/{r['attack']}")
+            for en in eval_names:
+                rel = r.get(f"{en}_rel")
+                if rel is not None and abs(rel) > regression_threshold:
+                    print(f"    {en}: {r[f'{en}_old']:.4f} -> {r[f'{en}_new']:.4f} ({rel:+.1%})")
+    else:
+        print("*** NO REGRESSIONS DETECTED ***")
+        skipped_pairs = missing_old + missing_new
+        if skipped_pairs or uncompared_metrics:
+            # Do not claim a clean bill of health for data that was never compared.
+            print(
+                f"WARNING: comparison incomplete ({skipped_pairs} pairs skipped, "
+                f"{uncompared_metrics} metrics missing on one side); "
+                "only the metrics compared above are within tolerance."
+            )
+        else:
+            print("All metrics within acceptable tolerance.")
+
+
+def main() -> None:
+    """Parse the command-line options and run the regression analysis."""
+    baseline_default = Path("/data/saad_hossain/SafeTuneBed/results/nov7_trial")
+    new_default = REPO_ROOT / "results" / "chat_template_refactor_testing_260217"
+
+    cli = argparse.ArgumentParser(description="Analyze chat-attack regression test results")
+    cli.add_argument(
+        "--source-results-dir",
+        help="Directory with baseline results (best.json files)",
+        default=baseline_default,
+        type=Path,
+    )
+    cli.add_argument(
+        "--new-results-dir",
+        help="Directory with new regression test results",
+        default=new_default,
+        type=Path,
+    )
+    cli.add_argument(
+        "--regression-threshold",
+        help="Relative change threshold to flag as regression (default: 0.05 = 5%%)",
+        default=0.05,
+        type=float,
+    )
+    options = cli.parse_args()
+
+    analyze(
+        source_results_dir=options.source_results_dir,
+        new_results_dir=options.new_results_dir,
+        regression_threshold=options.regression_threshold,
+    )
+
+
+# Run the analysis only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/scripts/user/tomtseng/chat_template_refactor_testing_260217/run_one.sh b/scripts/user/tomtseng/chat_template_refactor_testing_260217/run_one.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Run a single model-attack pair for the chat-attack regression test.
+#
+# Usage:
+#   bash scripts/user/tomtseng/chat_template_refactor_testing_260217/run_one.sh <model_alias> <attack>
+#
+# Example:
+#   bash scripts/user/tomtseng/chat_template_refactor_testing_260217/run_one.sh qwen3_0_6b lora_finetune
+#
+# Exits with status 1 (message on stderr) when arguments are missing, the model
+# alias is unknown, or the baseline best.json for the pair does not exist.
+
+set -euo pipefail
+
+if [[ $# -lt 2 ]]; then
+    echo "Usage: $0 <model_alias> <attack>" >&2
+    echo "Example: $0 qwen3_0_6b lora_finetune" >&2
+    exit 1
+fi
+
+MODEL_ALIAS="$1"
+ATTACK="$2"
+
+# Paths differ between the host named "cais_login" and every other machine.
+if [[ "$(hostname)" == "cais_login" ]]; then
+    REPO_DIR="/data/tom_tseng/TamperBench"
+    SOURCE_RESULTS_DIR="/data/saad_hossain/SafeTuneBed/results/nov7_trial"
+    UV="/data/tom_tseng/.local/bin/uv"
+else
+    REPO_DIR="/workspace/TamperBench"
+    SOURCE_RESULTS_DIR="/tamperbench_data/cais_archive/nov7_trial"
+    UV="uv"
+fi
+OUTPUT_RESULTS_DIR="${REPO_DIR}/results/chat_template_refactor_testing_260217"
+SCRIPT="scripts/user/tomtseng/chat_template_refactor_testing_260217/run_regression.py"
+
+# Model alias -> model path passed to the regression script.
+declare -A MODEL_PATHS=(
+    ["qwen3_0_6b"]="Qwen/Qwen3-0.6B"
+    ["qwen3_0_6b_base"]="Qwen/Qwen3-0.6B-Base"
+)
+
+# The "+x" expansion tests whether the key is set without tripping `set -u`.
+if [[ -z "${MODEL_PATHS[$MODEL_ALIAS]+x}" ]]; then
+    echo "Error: Unknown model alias '${MODEL_ALIAS}'" >&2
+    echo "Available: ${!MODEL_PATHS[*]}" >&2
+    exit 1
+fi
+
+MODEL_PATH="${MODEL_PATHS[$MODEL_ALIAS]}"
+
+# Fail before launching anything if there is no baseline for this pair.
+BEST_JSON="${SOURCE_RESULTS_DIR}/${MODEL_ALIAS}/${ATTACK}/optuna_single/best.json"
+if [[ ! -f "$BEST_JSON" ]]; then
+    echo "Error: No best.json at ${BEST_JSON}" >&2
+    exit 1
+fi
+
+# SCRIPT is a repo-relative path, so run from the repository root.
+cd "${REPO_DIR}"
+
+echo "Running regression test: ${MODEL_ALIAS} / ${ATTACK} (${MODEL_PATH})"
+export WANDB_MODE=disabled
+
+"${UV}" run "${SCRIPT}" "${MODEL_PATH}" \
+    --model-alias "${MODEL_ALIAS}" \
+    --attack "${ATTACK}" \
+    --source-results-dir "${SOURCE_RESULTS_DIR}" \
+    --output-results-dir "${OUTPUT_RESULTS_DIR}"