Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
351 changes: 351 additions & 0 deletions notebooks/06_Causal_Intervention_Reproduction.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,351 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Experiment: F1-01a Causal Intervention Reproduction\n",
"\n",
"This notebook reproduces the mini causal-intervention run and will clone the repo automatically on Colab if needed.\n",
"\n",
"Important:\n",
"- Do not use `%run scripts/run_causal_intervention.py` here.\n",
"- Import the module and call its functions directly; that avoids Jupyter's `-f` argument issue.\n",
"- Run this on a T4 GPU if possible.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Setup: install the few runtime dependencies used by the notebook.\n",
"# On Colab this is usually enough; restart only if the runtime asks you to.\n",
"%pip install -q transformers pandas tqdm\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import os\n",
"import random\n",
"import subprocess\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from IPython.display import display\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"\n",
"REPO_URL = \"https://github.com/DaviBonetto/spectralguard.git\"\n",
"REPO_REF = \"codex/fix/colab-causal-repro-notebook\"\n",
"CLONE_DIR = Path(\"/content/spectralguard\")\n",
"\n",
"# If auto-detection fails, set this manually to a local folder that already contains\n",
"# scripts/run_causal_intervention.py.\n",
"REPO_ROOT_OVERRIDE = None\n",
"\n",
"def ensure_repo_root() -> Path:\n",
" candidates = []\n",
" if REPO_ROOT_OVERRIDE is not None:\n",
" candidates.append(Path(REPO_ROOT_OVERRIDE))\n",
" candidates.extend(\n",
" [\n",
" CLONE_DIR,\n",
" Path.cwd(),\n",
" Path(\"/content/drive/MyDrive/Research - code - copy\"),\n",
" Path(\"/content/drive/MyDrive/Área de Trabalho/Research - code - copy\"),\n",
" Path(\"/workspace/Research - code - copy\"),\n",
" ]\n",
" )\n",
" seen = set()\n",
" for candidate in candidates:\n",
" if candidate is None:\n",
" continue\n",
" candidate = candidate.expanduser()\n",
" key = str(candidate)\n",
" if key in seen:\n",
" continue\n",
" seen.add(key)\n",
" if (candidate / \"scripts\" / \"run_causal_intervention.py\").exists():\n",
" return candidate.resolve()\n",
"\n",
" if not (CLONE_DIR / \".git\").exists():\n",
" print(f\"Cloning {REPO_URL} ({REPO_REF}) into {CLONE_DIR} ...\")\n",
" subprocess.run(\n",
" [\n",
" \"git\",\n",
" \"clone\",\n",
" \"--depth\", \"1\",\n",
" \"--branch\", REPO_REF,\n",
" \"--single-branch\",\n",
" REPO_URL,\n",
" str(CLONE_DIR),\n",
" ],\n",
" check=True,\n",
" )\n",
"\n",
" if (CLONE_DIR / \"scripts\" / \"run_causal_intervention.py\").exists():\n",
" return CLONE_DIR.resolve()\n",
"\n",
" raise FileNotFoundError(\n",
" \"Could not find or clone the repository root. If you uploaded the repo to Colab manually, \"\n",
" \"set REPO_ROOT_OVERRIDE to that folder.\"\n",
" )\n",
"\n",
"REPO_ROOT = ensure_repo_root()\n",
"SCRIPTS_DIR = REPO_ROOT / \"scripts\"\n",
"os.chdir(REPO_ROOT)\n",
"if str(REPO_ROOT) not in sys.path:\n",
" sys.path.insert(0, str(REPO_ROOT))\n",
"if str(SCRIPTS_DIR) not in sys.path:\n",
" sys.path.insert(0, str(SCRIPTS_DIR))\n",
"\n",
"print(f\"Repo root: {REPO_ROOT}\")\n",
"print(f\"Scripts dir: {SCRIPTS_DIR}\")\n",
"\n",
"import run_causal_intervention as causal\n",
"print(f\"Imported module: {causal.__file__}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reproducibility and run configuration.\n",
"MODEL_ID = \"state-spaces/mamba-130m-hf\"\n",
"N_SAMPLES = 20\n",
"SEED = 42\n",
"THRESHOLDS = [0.99]\n",
"MAX_ATTEMPTS = 3000\n",
"MAX_NEW_TOKENS = 25\n",
"DEBUG = True\n",
"LOCAL_FILES_ONLY = False # Set True only if the model is already cached in the Colab runtime.\n",
"\n",
"OUTPUT_DIR = REPO_ROOT / \"artifacts\" / \"phase1\"\n",
"OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"print(f\"MODEL_ID={MODEL_ID}\")\n",
"print(f\"N_SAMPLES={N_SAMPLES}\")\n",
"print(f\"THRESHOLDS={THRESHOLDS}\")\n",
"print(f\"OUTPUT_DIR={OUTPUT_DIR}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load tokenizer and model.\n",
"causal.set_seed(SEED)\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"print(f\"Using device: {device}\")\n",
"if device.type == \"cuda\":\n",
" print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\n",
" MODEL_ID,\n",
" local_files_only=LOCAL_FILES_ONLY,\n",
")\n",
"if tokenizer.pad_token is None:\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" MODEL_ID,\n",
" local_files_only=LOCAL_FILES_ONLY,\n",
").to(device)\n",
"causal.tie_lm_head(model)\n",
"model.eval()\n",
"\n",
"print(\"Model loaded and ready.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mine prompts that the baseline model can solve before running the intervention.\n",
"prompts = causal.mine_validated_prompts(\n",
" model,\n",
" tokenizer,\n",
" N_SAMPLES,\n",
" device,\n",
" SEED,\n",
" MAX_ATTEMPTS,\n",
" MAX_NEW_TOKENS,\n",
")\n",
"\n",
"print(f\"Validated prompts: {len(prompts)}\")\n",
"if prompts:\n",
" print(\"Example prompt:\")\n",
" print(prompts[0][0])\n",
" print(f\"Target: {prompts[0][1]}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def run_protocol(protocol: str, layer_idx: int, output_name: str) -> pd.DataFrame:\n",
" n = len(prompts)\n",
" rows = [\n",
" {\n",
" \"experiment_id\": \"causal_clamp\",\n",
" \"model_id\": MODEL_ID,\n",
" \"attacker_type\": \"none\",\n",
" \"split\": \"validated_ar\",\n",
" \"n\": n,\n",
" \"metric_name\": \"accuracy\",\n",
" \"metric_value\": 1.0,\n",
" \"ci_low\": 1.0,\n",
" \"ci_high\": 1.0,\n",
" \"seed\": SEED,\n",
" \"artifact_path\": str(OUTPUT_DIR / output_name),\n",
" \"protocol\": protocol,\n",
" \"rho_target\": \"baseline\",\n",
" \"correct\": n,\n",
" }\n",
" ]\n",
"\n",
" for rho in THRESHOLDS:\n",
" acc, ci_low, ci_high, correct = causal.evaluate_accuracy(\n",
" model,\n",
" tokenizer,\n",
" prompts,\n",
" rho,\n",
" protocol,\n",
" layer_idx,\n",
" device,\n",
" MAX_NEW_TOKENS,\n",
" debug=DEBUG,\n",
" )\n",
" rows.append(\n",
" {\n",
" \"experiment_id\": \"causal_clamp\",\n",
" \"model_id\": MODEL_ID,\n",
" \"attacker_type\": \"none\",\n",
" \"split\": \"validated_ar\",\n",
" \"n\": n,\n",
" \"metric_name\": \"accuracy\",\n",
" \"metric_value\": round(acc, 6),\n",
" \"ci_low\": round(ci_low, 6),\n",
" \"ci_high\": round(ci_high, 6),\n",
" \"seed\": SEED,\n",
" \"artifact_path\": str(OUTPUT_DIR / output_name),\n",
" \"protocol\": protocol,\n",
" \"rho_target\": rho,\n",
" \"correct\": correct,\n",
" }\n",
" )\n",
"\n",
" df = pd.DataFrame(rows)\n",
" out_path = OUTPUT_DIR / output_name\n",
" df.to_csv(out_path, index=False)\n",
" print(f\"Saved: {out_path}\")\n",
" display(df)\n",
" return df\n",
"\n",
"print(\"Helper ready.\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. All-layer run\n",
"\n",
"This is the global intervention path.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_all_layer = run_protocol(\n",
" protocol=\"all_layer\",\n",
" layer_idx=0,\n",
" output_name=\"repro_all_layer.csv\",\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Single-layer run\n",
"\n",
"This is the local intervention path. Use layer 0 unless you want to test another layer.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_single_layer = run_protocol(\n",
" protocol=\"single_layer\",\n",
" layer_idx=0,\n",
" output_name=\"repro_single_layer.csv\",\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Summary and next step\n",
"\n",
"If the notebook stops with a `rho_after` assertion, that is the signal I need. Send me the CSVs plus the full notebook output.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"summary = pd.concat(\n",
" [\n",
" df_all_layer.assign(run=\"all_layer\"),\n",
" df_single_layer.assign(run=\"single_layer\"),\n",
" ],\n",
" ignore_index=True,\n",
")\n",
"\n",
"display(summary[[\"run\", \"protocol\", \"rho_target\", \"metric_value\", \"correct\", \"ci_low\", \"ci_high\"]])\n",
"print(\"Done. Send the CSVs and the full notebook log.\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading
Loading