Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
351 changes: 351 additions & 0 deletions notebooks/06_Causal_Intervention_Reproduction.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,351 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Experiment: F1-01a Causal Intervention Reproduction\n",
"\n",
"This notebook reproduces the mini causal-intervention run and will clone the repo automatically on Colab if needed.\n",
"\n",
"Important:\n",
"- Do not use `%run scripts/run_causal_intervention.py` here.\n",
"- Import the module and call its functions directly; that avoids Jupyter's `-f` argument issue.\n",
"- Run this on a T4 GPU if possible.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Setup: install the few runtime dependencies used by the notebook.\n",
"# On Colab this is usually enough; restart only if the runtime asks you to.\n",
"%pip install -q transformers pandas tqdm\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import os\n",
"import random\n",
"import subprocess\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from IPython.display import display\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"\n",
"REPO_URL = \"https://github.com/DaviBonetto/spectralguard.git\"\n",
"REPO_REF = \"codex/fix/colab-causal-repro-notebook\"\n",
"CLONE_DIR = Path(\"/content/spectralguard\")\n",
"\n",
"# If auto-detection fails, set this manually to a local folder that already contains\n",
"# scripts/run_causal_intervention.py.\n",
"REPO_ROOT_OVERRIDE = None\n",
"\n",
"def ensure_repo_root() -> Path:\n",
" candidates = []\n",
" if REPO_ROOT_OVERRIDE is not None:\n",
" candidates.append(Path(REPO_ROOT_OVERRIDE))\n",
" candidates.extend(\n",
" [\n",
" CLONE_DIR,\n",
" Path.cwd(),\n",
" Path(\"/content/drive/MyDrive/Research - code - copy\"),\n",
" Path(\"/content/drive/MyDrive/Área de Trabalho/Research - code - copy\"),\n",
" Path(\"/workspace/Research - code - copy\"),\n",
" ]\n",
" )\n",
" seen = set()\n",
" for candidate in candidates:\n",
" if candidate is None:\n",
" continue\n",
" candidate = candidate.expanduser()\n",
" key = str(candidate)\n",
" if key in seen:\n",
" continue\n",
" seen.add(key)\n",
" if (candidate / \"scripts\" / \"run_causal_intervention.py\").exists():\n",
" return candidate.resolve()\n",
"\n",
" if not (CLONE_DIR / \".git\").exists():\n",
" print(f\"Cloning {REPO_URL} ({REPO_REF}) into {CLONE_DIR} ...\")\n",
" subprocess.run(\n",
" [\n",
" \"git\",\n",
" \"clone\",\n",
" \"--depth\", \"1\",\n",
" \"--branch\", REPO_REF,\n",
" \"--single-branch\",\n",
" REPO_URL,\n",
" str(CLONE_DIR),\n",
" ],\n",
" check=True,\n",
" )\n",
"\n",
" if (CLONE_DIR / \"scripts\" / \"run_causal_intervention.py\").exists():\n",
" return CLONE_DIR.resolve()\n",
"\n",
" raise FileNotFoundError(\n",
" \"Could not find or clone the repository root. If you uploaded the repo to Colab manually, \"\n",
" \"set REPO_ROOT_OVERRIDE to that folder.\"\n",
" )\n",
"\n",
"REPO_ROOT = ensure_repo_root()\n",
"SCRIPTS_DIR = REPO_ROOT / \"scripts\"\n",
"os.chdir(REPO_ROOT)\n",
"if str(REPO_ROOT) not in sys.path:\n",
" sys.path.insert(0, str(REPO_ROOT))\n",
"if str(SCRIPTS_DIR) not in sys.path:\n",
" sys.path.insert(0, str(SCRIPTS_DIR))\n",
"\n",
"print(f\"Repo root: {REPO_ROOT}\")\n",
"print(f\"Scripts dir: {SCRIPTS_DIR}\")\n",
"\n",
"import run_causal_intervention as causal\n",
"print(f\"Imported module: {causal.__file__}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reproducibility and run configuration.\n",
"MODEL_ID = \"state-spaces/mamba-130m-hf\"\n",
"N_SAMPLES = 20\n",
"SEED = 42\n",
"THRESHOLDS = [0.99]\n",
"MAX_ATTEMPTS = 3000\n",
"MAX_NEW_TOKENS = 25\n",
"DEBUG = True\n",
"LOCAL_FILES_ONLY = False # Set True only if the model is already cached in the Colab runtime.\n",
"\n",
"OUTPUT_DIR = REPO_ROOT / \"artifacts\" / \"phase1\"\n",
"OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"print(f\"MODEL_ID={MODEL_ID}\")\n",
"print(f\"N_SAMPLES={N_SAMPLES}\")\n",
"print(f\"THRESHOLDS={THRESHOLDS}\")\n",
"print(f\"OUTPUT_DIR={OUTPUT_DIR}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load tokenizer and model.\n",
"causal.set_seed(SEED)\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"print(f\"Using device: {device}\")\n",
"if device.type == \"cuda\":\n",
" print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\n",
" MODEL_ID,\n",
" local_files_only=LOCAL_FILES_ONLY,\n",
")\n",
"if tokenizer.pad_token is None:\n",
" tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" MODEL_ID,\n",
" local_files_only=LOCAL_FILES_ONLY,\n",
").to(device)\n",
"causal.tie_lm_head(model)\n",
"model.eval()\n",
"\n",
"print(\"Model loaded and ready.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mine prompts that the baseline model can solve before running the intervention.\n",
"prompts = causal.mine_validated_prompts(\n",
" model,\n",
" tokenizer,\n",
" N_SAMPLES,\n",
" device,\n",
" SEED,\n",
" MAX_ATTEMPTS,\n",
" MAX_NEW_TOKENS,\n",
")\n",
"\n",
"print(f\"Validated prompts: {len(prompts)}\")\n",
"if prompts:\n",
" print(\"Example prompt:\")\n",
" print(prompts[0][0])\n",
" print(f\"Target: {prompts[0][1]}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def run_protocol(protocol: str, layer_idx: int, output_name: str) -> pd.DataFrame:\n",
" n = len(prompts)\n",
" rows = [\n",
" {\n",
" \"experiment_id\": \"causal_clamp\",\n",
" \"model_id\": MODEL_ID,\n",
" \"attacker_type\": \"none\",\n",
" \"split\": \"validated_ar\",\n",
" \"n\": n,\n",
" \"metric_name\": \"accuracy\",\n",
" \"metric_value\": 1.0,\n",
" \"ci_low\": 1.0,\n",
" \"ci_high\": 1.0,\n",
" \"seed\": SEED,\n",
" \"artifact_path\": str(OUTPUT_DIR / output_name),\n",
" \"protocol\": protocol,\n",
" \"rho_target\": \"baseline\",\n",
" \"correct\": n,\n",
" }\n",
" ]\n",
"\n",
" for rho in THRESHOLDS:\n",
" acc, ci_low, ci_high, correct = causal.evaluate_accuracy(\n",
" model,\n",
" tokenizer,\n",
" prompts,\n",
" rho,\n",
" protocol,\n",
" layer_idx,\n",
" device,\n",
" MAX_NEW_TOKENS,\n",
" debug=DEBUG,\n",
" )\n",
" rows.append(\n",
" {\n",
" \"experiment_id\": \"causal_clamp\",\n",
" \"model_id\": MODEL_ID,\n",
" \"attacker_type\": \"none\",\n",
" \"split\": \"validated_ar\",\n",
" \"n\": n,\n",
" \"metric_name\": \"accuracy\",\n",
" \"metric_value\": round(acc, 6),\n",
" \"ci_low\": round(ci_low, 6),\n",
" \"ci_high\": round(ci_high, 6),\n",
" \"seed\": SEED,\n",
" \"artifact_path\": str(OUTPUT_DIR / output_name),\n",
" \"protocol\": protocol,\n",
" \"rho_target\": rho,\n",
" \"correct\": correct,\n",
" }\n",
" )\n",
"\n",
" df = pd.DataFrame(rows)\n",
" out_path = OUTPUT_DIR / output_name\n",
" df.to_csv(out_path, index=False)\n",
" print(f\"Saved: {out_path}\")\n",
" display(df)\n",
" return df\n",
"\n",
"print(\"Helper ready.\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. All-layer run\n",
"\n",
"This is the global intervention path.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_all_layer = run_protocol(\n",
" protocol=\"all_layer\",\n",
" layer_idx=0,\n",
" output_name=\"repro_all_layer.csv\",\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Single-layer run\n",
"\n",
"This is the local intervention path. Use layer 0 unless you want to test another layer.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_single_layer = run_protocol(\n",
" protocol=\"single_layer\",\n",
" layer_idx=0,\n",
" output_name=\"repro_single_layer.csv\",\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Summary and next step\n",
"\n",
"If the notebook stops with a `rho_after` assertion, that is the signal I need. Send me the CSVs plus the full notebook output.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"summary = pd.concat(\n",
" [\n",
" df_all_layer.assign(run=\"all_layer\"),\n",
" df_single_layer.assign(run=\"single_layer\"),\n",
" ],\n",
" ignore_index=True,\n",
")\n",
"\n",
"display(summary[[\"run\", \"protocol\", \"rho_target\", \"metric_value\", \"correct\", \"ci_low\", \"ci_high\"]])\n",
"print(\"Done. Send the CSVs and the full notebook log.\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading
Loading