diff --git a/.gitignore b/.gitignore index 99c30f52..ae23065c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,7 @@ dev/ # Results file results.tsv + +# Checkpoints and logs +checkpoints/ +*.log diff --git a/AUTORESEARCH_V2.md b/AUTORESEARCH_V2.md new file mode 100644 index 00000000..ec275b5f --- /dev/null +++ b/AUTORESEARCH_V2.md @@ -0,0 +1,150 @@ +# Autoresearch v2 — Multi-Agent Autonomous ML Research Platform + +Built on top of [Karpathy's autoresearch](https://github.com/karpathy/autoresearch): an AI agent autonomously experiments on a small GPT training setup, modifying code, training for a fixed time budget, checking if the result improved, and repeating. We extended this from a single-agent/single-file loop into a multi-agent, multi-scale, self-improving research platform. + +## What's New + +### Variable Time Scales + +Instead of a fixed 5-minute budget, experiments run at different scales: + +| Scale | Duration | Use | +|-------|----------|-----| +| probe | 30s | Memory/compilation check | +| quick | 2min | Rough signal, kill bad ideas fast | +| standard | 5min | Real evaluation (original behavior) | +| long | 15min | Confirm promising results | +| deep | 30min | Final validation | + +Agents start with quick runs and escalate promising ideas to longer scales. All configurable via `AR_*` environment variables. + +### Research Memory + +Replaces the flat `results.tsv` with a shared knowledge base in `results/`: + +- **experiments.jsonl** — All experiment records from all agents (append-only, file-locked) +- **lessons.jsonl** — Discovered patterns and insights (e.g., "depth>12 always OOMs") +- **journal.md** — Research summaries and the director's agenda + +Backward-compatible `results.tsv` is auto-generated for the analysis notebook. + +### Model Checkpoints + +Promising runs save full model + optimizer state to `checkpoints/`. 
This enables the scaling ladder: a good 5-minute result can be resumed and trained for 15 more minutes without starting from scratch. + +### Multi-Agent Research Org + +Five specialized agent roles, each with tailored instructions: + +| Role | What it does | +|------|-------------| +| **Explorer** | Bold architectural experiments — new activations, attention variants, normalization | +| **Optimizer** | Systematic hyperparameter tuning — LR sweeps, batch size, schedules | +| **Analyst** | Reads all results, finds patterns, writes lessons, suggests combinations | +| **Reviewer** | Validates improvements at longer scales, runs ablations, catches false positives | +| **Director** | Coordinates research agenda, cherry-picks across branches, redirects stuck agents | + +Each agent runs in its own git worktree with its own branch, sharing results through the knowledge base. + +### GPU Queue + +When multiple agents share one GPU, a priority queue manages access. Probe jobs (30s) jump ahead so agents get fast feedback. Agents can plan their next experiment while waiting. 
+ +## File Structure + +``` +# New infrastructure +config.py — Configurable constants (env var overrides) +knowledge.py — JSONL research memory with file locking +checkpoint.py — Model checkpoint save/load +run_experiment.py — Experiment runner wrapper +gpu_queue.py — Priority queue for GPU sharing +launch.py — Multi-agent orchestrator + +# Agent role instructions +programs/explorer.md +programs/optimizer.md +programs/analyst.md +programs/reviewer.md +programs/director.md + +# Original files (modified) +prepare.py — Constants now configurable via config.py +train.py — Checkpoint save added (env-var gated) +program.md — Multi-agent awareness section added + +# Shared directories (gitignored, created at runtime) +results/ — Knowledge base (experiments, lessons, journal) +queue/ — GPU job queue +checkpoints/ — Saved model states +worktrees/ — Per-agent git worktrees +``` + +## Quick Start + +### Solo Mode (enhanced single agent) + +```bash +python launch.py launch --tag mar10 --preset solo +``` + +One explorer agent with the full knowledge base, variable time scales, and checkpointing. + +### Duo (explorer + optimizer sharing one GPU) + +```bash +python launch.py launch --tag mar10 --preset duo --single-gpu +``` + +### Full Research Org in tmux + +```bash +python launch.py launch --tag mar10 --preset full --single-gpu --tmux +``` + +All five agents running in tmux panes. Attach with `tmux attach -t autoresearch-mar10`. + +### Custom Agent Mix + +```bash +python launch.py launch --tag mar10 --agents explorer:2,optimizer,director --single-gpu +``` + +Two explorers, one optimizer, one director. + +### Cleanup + +```bash +python launch.py cleanup --tag mar10 +``` + +Removes worktrees and branches for a tag. 
+ +## Standalone Tools + +```bash +# View research briefing (what agents read before each experiment) +python run_experiment.py --briefing + +# Run a single experiment manually +python run_experiment.py --scale quick --description "try GLU activation" + +# Log a lesson +python run_experiment.py --lesson architecture medium "GLU helps at depth=8" + +# Check GPU queue +python gpu_queue.py status + +# List checkpoints +python checkpoint.py + +# Sync knowledge base to legacy results.tsv +python knowledge.py sync-tsv +``` + +## Requirements + +- Single NVIDIA GPU (tested on H100) +- Python 3.10+ +- [uv](https://docs.astral.sh/uv/) +- [Claude Code](https://claude.com/claude-code) CLI (`claude` command) diff --git a/analysis.ipynb b/analysis.ipynb index 8455ea4e..1d5eb997 100644 --- a/analysis.ipynb +++ b/analysis.ipynb @@ -16,21 +16,7 @@ "id": "v3r8c77lxhs", "metadata": {}, "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "# Load the TSV (tab-separated, 5 columns: commit, val_bpb, memory_gb, status, description)\n", - "df = pd.read_csv(\"results.tsv\", sep=\"\\t\")\n", - "df[\"val_bpb\"] = pd.to_numeric(df[\"val_bpb\"], errors=\"coerce\")\n", - "df[\"memory_gb\"] = pd.to_numeric(df[\"memory_gb\"], errors=\"coerce\")\n", - "df[\"status\"] = df[\"status\"].str.strip().str.upper()\n", - "\n", - "print(f\"Total experiments: {len(df)}\")\n", - "print(f\"Columns: {list(df.columns)}\")\n", - "df.head(10)" - ] + "source": "import pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n# Load the TSV (tab-separated, 10 columns: commit, val_bpb, memory_gb, mfu, num_params_M, depth, total_batch_size, matrix_lr, status, description)\ndf = pd.read_csv(\"results.tsv\", sep=\"\\t\")\ndf[\"val_bpb\"] = pd.to_numeric(df[\"val_bpb\"], errors=\"coerce\")\ndf[\"memory_gb\"] = pd.to_numeric(df[\"memory_gb\"], errors=\"coerce\")\ndf[\"status\"] = 
df[\"status\"].str.strip().str.upper()\n\nprint(f\"Total experiments: {len(df)}\")\nprint(f\"Columns: {list(df.columns)}\")\ndf.head(10)" }, { "cell_type": "code", @@ -243,4 +229,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/checkpoint.py b/checkpoint.py new file mode 100644 index 00000000..4cab4959 --- /dev/null +++ b/checkpoint.py @@ -0,0 +1,99 @@ +""" +Model checkpoint save/load for autoresearch. +Allows promising runs to be resumed at longer time scales. +""" + +import json +import os +import torch + +from config import get_checkpoint_dir + + +def save_checkpoint(model, optimizer, step, config_dict, metadata, path=None): + """Save model + optimizer state for later resumption. + + Args: + model: The compiled or raw GPT model. + optimizer: The MuonAdamW optimizer. + step: Current training step. + config_dict: Model config as dict (from dataclasses.asdict). + metadata: Dict with val_bpb, scale, branch, commit, etc. + path: Override save path. Default: checkpoints/_.pt + """ + ckpt_dir = get_checkpoint_dir() + os.makedirs(ckpt_dir, exist_ok=True) + + if path is None: + commit = metadata.get("commit", "unknown") + scale = metadata.get("scale", "unknown") + path = os.path.join(ckpt_dir, f"{commit}_{scale}.pt") + + # Handle torch.compile'd models + raw_model = model._orig_mod if hasattr(model, "_orig_mod") else model + + checkpoint = { + "model_state": raw_model.state_dict(), + "optimizer_state": optimizer.state_dict(), + "step": step, + "config": config_dict, + "metadata": metadata, + } + torch.save(checkpoint, path) + print(f"Checkpoint saved to {path}") + return path + + +def load_checkpoint(path, device="cuda"): + """Load a checkpoint. 
Returns dict with model_state, optimizer_state, step, config, metadata."""
+    checkpoint = torch.load(path, map_location=device, weights_only=False)
+    print(f"Checkpoint loaded from {path}")
+    return checkpoint
+
+
+def list_checkpoints(sort_by="val_bpb"):
+    """List available checkpoints sorted by metric."""
+    ckpt_dir = get_checkpoint_dir()
+    if not os.path.exists(ckpt_dir):
+        return []
+
+    checkpoints = []
+    for fname in os.listdir(ckpt_dir):
+        if not fname.endswith(".pt"):
+            continue
+        path = os.path.join(ckpt_dir, fname)
+        try:
+            # Load only metadata (not full weights) for listing
+            ckpt = torch.load(path, map_location="cpu", weights_only=False)
+            meta = ckpt.get("metadata", {})
+            meta["path"] = path
+            meta["filename"] = fname
+            checkpoints.append(meta)
+        except Exception:
+            continue
+
+    if sort_by == "val_bpb":
+        checkpoints.sort(key=lambda c: c.get("val_bpb", float("inf")))
+    elif sort_by == "time":
+        checkpoints.sort(key=lambda c: os.path.getmtime(c["path"]), reverse=True)
+
+    return checkpoints
+
+
+def find_best_checkpoint():
+    """Return path to checkpoint with lowest val_bpb, or None."""
+    ckpts = list_checkpoints(sort_by="val_bpb")
+    if ckpts:
+        return ckpts[0]["path"]
+    return None
+
+
+if __name__ == "__main__":
+    ckpts = list_checkpoints()
+    if not ckpts:
+        print("No checkpoints found.")
+    else:
+        print(f"{'val_bpb':>10} {'scale':>8} {'commit':>8} path")
+        print("-" * 60)
+        for c in ckpts:
+            print(f"{c.get('val_bpb', float('nan')):>10.6f} {c.get('scale', '?'):>8} {c.get('commit', '?'):>8} {c['path']}")
diff --git a/config.py b/config.py
new file mode 100644
index 00000000..4c3e6bca
--- /dev/null
+++ b/config.py
@@ -0,0 +1,48 @@
+"""
+Centralized configuration for autoresearch.
+All constants are overridable via environment variables (AR_ prefix).
+Without env vars, defaults match the original hardcoded values.
+""" + +import os + +TIME_BUDGETS = { + "probe": 30, + "quick": 120, + "standard": 300, + "long": 900, + "deep": 1800, +} + +SCALE_PRIORITY = {"probe": 0, "quick": 1, "standard": 2, "long": 3, "deep": 4} + + +def get_scale() -> str: + return os.environ.get("AR_SCALE", "standard") + + +def get_time_budget() -> int: + explicit = os.environ.get("AR_TIME_BUDGET") + if explicit is not None: + return int(explicit) + return TIME_BUDGETS.get(get_scale(), 300) + + +def get_max_seq_len() -> int: + return int(os.environ.get("AR_MAX_SEQ_LEN", "2048")) + + +def get_eval_tokens() -> int: + return int(os.environ.get("AR_EVAL_TOKENS", str(160 * 524288))) + + +def get_checkpoint_dir() -> str: + return os.environ.get("AR_CHECKPOINT_DIR", os.path.join(os.path.dirname(__file__), "checkpoints")) + + +def get_results_dir() -> str: + return os.environ.get("AR_RESULTS_DIR", os.path.join(os.path.dirname(__file__), "results")) + + +def get_queue_dir() -> str: + return os.environ.get("AR_QUEUE_DIR", os.path.join(os.path.dirname(__file__), "queue")) diff --git a/gpu_queue.py b/gpu_queue.py new file mode 100644 index 00000000..91a9054e --- /dev/null +++ b/gpu_queue.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python3 +""" +File-based GPU job queue for autoresearch. +Enables multiple agents to share a single GPU via priority-ordered time-sharing. 
+ +Usage: + python gpu_queue.py worker # start queue worker (one per GPU) + python gpu_queue.py submit --agent-id X --scale quick --command "uv run train.py" + python gpu_queue.py status # show queue state +""" + +import argparse +import fcntl +import json +import os +import signal +import subprocess +import sys +import time +import uuid +from dataclasses import dataclass, asdict +from datetime import datetime, timezone + +from config import get_queue_dir, SCALE_PRIORITY + +QUEUE_DIR = get_queue_dir() +JOBS_FILE = os.path.join(QUEUE_DIR, "jobs.jsonl") +ACTIVE_FILE = os.path.join(QUEUE_DIR, "active.json") +WORKER_PID_FILE = os.path.join(QUEUE_DIR, "worker.pid") + + +def _ensure_queue_dir(): + os.makedirs(QUEUE_DIR, exist_ok=True) + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _read_jobs() -> list[dict]: + """Read all jobs from the jobs file.""" + if not os.path.exists(JOBS_FILE): + return [] + jobs = [] + with open(JOBS_FILE, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + jobs.append(json.loads(line)) + except json.JSONDecodeError: + continue + return jobs + + +def _write_jobs(jobs: list[dict]): + """Rewrite the entire jobs file (used for status updates).""" + _ensure_queue_dir() + fd = os.open(JOBS_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644) + try: + fcntl.flock(fd, fcntl.LOCK_EX) + data = "".join(json.dumps(j, separators=(",", ":")) + "\n" for j in jobs) + os.write(fd, data.encode()) + os.fsync(fd) + finally: + fcntl.flock(fd, fcntl.LOCK_UN) + os.close(fd) + + +def _append_job(job: dict): + """Append a single job with locking.""" + _ensure_queue_dir() + line = json.dumps(job, separators=(",", ":")) + "\n" + fd = os.open(JOBS_FILE, os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o644) + try: + fcntl.flock(fd, fcntl.LOCK_EX) + os.write(fd, line.encode()) + os.fsync(fd) + finally: + fcntl.flock(fd, fcntl.LOCK_UN) + os.close(fd) + + +# 
--------------------------------------------------------------------------- +# Submit +# --------------------------------------------------------------------------- + +def submit_job(agent_id: str, worktree_path: str, command: str, + scale: str, env_overrides: dict = None) -> str: + """Submit a job to the queue. Returns job_id.""" + job_id = str(uuid.uuid4())[:8] + job = { + "job_id": job_id, + "agent_id": agent_id, + "worktree_path": worktree_path, + "command": command, + "scale": scale, + "priority": SCALE_PRIORITY.get(scale, 2), + "env": env_overrides or {}, + "submitted_at": _now_iso(), + "status": "pending", + "exit_code": None, + "started_at": None, + "finished_at": None, + } + _append_job(job) + return job_id + + +# --------------------------------------------------------------------------- +# Wait +# --------------------------------------------------------------------------- + +def wait_for_job(job_id: str, timeout: int = 3600, poll_interval: float = 2.0) -> int: + """Block until job completes. Returns exit code.""" + deadline = time.time() + timeout + while time.time() < deadline: + jobs = _read_jobs() + for job in jobs: + if job["job_id"] == job_id: + if job["status"] in ("done", "failed"): + return job.get("exit_code", -1) + time.sleep(poll_interval) + print(f"Timeout waiting for job {job_id}") + return -1 + + +# --------------------------------------------------------------------------- +# Worker +# --------------------------------------------------------------------------- + +def _get_next_job() -> tuple: + """Get highest-priority pending job. 
Returns (job, all_jobs).""" + jobs = _read_jobs() + pending = [(i, j) for i, j in enumerate(jobs) if j["status"] == "pending"] + if not pending: + return None, jobs + # Sort by priority (lower = higher priority), then by submission time + pending.sort(key=lambda x: (x[1]["priority"], x[1]["submitted_at"])) + idx, job = pending[0] + return job, jobs + + +def _update_job_status(job_id: str, updates: dict): + """Update a specific job's fields.""" + jobs = _read_jobs() + for job in jobs: + if job["job_id"] == job_id: + job.update(updates) + break + _write_jobs(jobs) + + +def run_queue_worker(): + """Main queue worker loop. Owns the GPU, runs jobs sequentially.""" + _ensure_queue_dir() + + # Write PID file + with open(WORKER_PID_FILE, "w") as f: + f.write(str(os.getpid())) + + print(f"[queue-worker] Started (pid={os.getpid()})") + print(f"[queue-worker] Queue dir: {QUEUE_DIR}") + + # Handle graceful shutdown + running = True + current_proc = None + + def shutdown(signum, frame): + nonlocal running + print(f"\n[queue-worker] Shutting down...") + running = False + if current_proc and current_proc.poll() is None: + current_proc.terminate() + + signal.signal(signal.SIGTERM, shutdown) + signal.signal(signal.SIGINT, shutdown) + + while running: + job, all_jobs = _get_next_job() + if job is None: + time.sleep(2) + continue + + job_id = job["job_id"] + agent_id = job["agent_id"] + scale = job["scale"] + command = job["command"] + worktree = job["worktree_path"] + + print(f"[queue-worker] Running job {job_id} ({agent_id}, {scale}): {command}") + + # Mark as running + _update_job_status(job_id, { + "status": "running", + "started_at": _now_iso(), + }) + + # Write active file + with open(ACTIVE_FILE, "w") as f: + json.dump({"job_id": job_id, "agent_id": agent_id, "scale": scale, + "started_at": _now_iso()}, f) + + # Build environment + env = os.environ.copy() + env.update(job.get("env", {})) + + # Run the command + log_path = os.path.join(worktree, "run.log") + try: + with 
open(log_path, "w") as logf: + current_proc = subprocess.Popen( + command.split(), + env=env, cwd=worktree, + stdout=logf, stderr=subprocess.STDOUT, + ) + exit_code = current_proc.wait() + current_proc = None + except Exception as e: + print(f"[queue-worker] Error running job {job_id}: {e}") + exit_code = -1 + + # Update job status + status = "done" if exit_code == 0 else "failed" + _update_job_status(job_id, { + "status": status, + "exit_code": exit_code, + "finished_at": _now_iso(), + }) + + # Clear active file + if os.path.exists(ACTIVE_FILE): + os.remove(ACTIVE_FILE) + + print(f"[queue-worker] Job {job_id} {status} (exit={exit_code})") + + # Cleanup + if os.path.exists(WORKER_PID_FILE): + os.remove(WORKER_PID_FILE) + print("[queue-worker] Stopped") + + +# --------------------------------------------------------------------------- +# Status +# --------------------------------------------------------------------------- + +def show_status(): + """Print queue status.""" + jobs = _read_jobs() + + # Active job + if os.path.exists(ACTIVE_FILE): + with open(ACTIVE_FILE, "r") as f: + active = json.load(f) + print(f"ACTIVE: job={active['job_id']} agent={active['agent_id']} scale={active['scale']}") + else: + print("ACTIVE: (none)") + + # Worker status + if os.path.exists(WORKER_PID_FILE): + with open(WORKER_PID_FILE, "r") as f: + pid = int(f.read().strip()) + try: + os.kill(pid, 0) + print(f"WORKER: running (pid={pid})") + except ProcessLookupError: + print(f"WORKER: dead (stale pid={pid})") + else: + print("WORKER: not running") + + # Pending jobs + pending = [j for j in jobs if j["status"] == "pending"] + running = [j for j in jobs if j["status"] == "running"] + done = [j for j in jobs if j["status"] in ("done", "failed")] + + print(f"\nPending: {len(pending)}, Running: {len(running)}, Done: {len(done)}") + + if pending: + print("\nPENDING:") + pending.sort(key=lambda x: (x["priority"], x["submitted_at"])) + for j in pending: + print(f" {j['job_id']} 
{j['agent_id']:>16} {j['scale']:>8} {j['command']}")
+
+    if done:
+        print(f"\nRECENT (last 5):")
+        for j in done[-5:]:
+            status = "OK" if j["status"] == "done" else "FAIL"
+            print(f" {j['job_id']} {j['agent_id']:>16} {j['scale']:>8} {status} exit={j.get('exit_code', '?')}")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main():
+    parser = argparse.ArgumentParser(description="GPU job queue for autoresearch")
+    sub = parser.add_subparsers(dest="cmd")
+
+    # worker
+    sub.add_parser("worker", help="Start queue worker")
+
+    # submit
+    submit_p = sub.add_parser("submit", help="Submit a job")
+    submit_p.add_argument("--agent-id", required=True)
+    submit_p.add_argument("--scale", default="standard")
+    submit_p.add_argument("--command", required=True)
+    submit_p.add_argument("--worktree", default=os.getcwd())
+    submit_p.add_argument("--wait", action=argparse.BooleanOptionalAction, default=True)
+    submit_p.add_argument("--timeout", type=int, default=3600)
+
+    # status
+    sub.add_parser("status", help="Show queue status")
+
+    args = parser.parse_args()
+
+    if args.cmd == "worker":
+        run_queue_worker()
+    elif args.cmd == "submit":
+        job_id = submit_job(
+            agent_id=args.agent_id,
+            worktree_path=args.worktree,
+            command=args.command,
+            scale=args.scale,
+        )
+        print(f"Submitted job {job_id}")
+        if args.wait:
+            exit_code = wait_for_job(job_id, timeout=args.timeout)
+            sys.exit(exit_code)
+    elif args.cmd == "status":
+        show_status()
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/knowledge.py b/knowledge.py
new file mode 100644
index 00000000..528d5fb7
--- /dev/null
+++ b/knowledge.py
@@ -0,0 +1,270 @@
+"""
+Research memory for autoresearch.
+Append-only JSONL storage with file locking for concurrent multi-agent access.
+""" + +import fcntl +import json +import os +import time +from dataclasses import dataclass, asdict, field +from datetime import datetime, timezone +from typing import Optional, Callable + +from config import get_results_dir + +RESULTS_DIR = get_results_dir() +EXPERIMENTS_FILE = os.path.join(RESULTS_DIR, "experiments.jsonl") +LESSONS_FILE = os.path.join(RESULTS_DIR, "lessons.jsonl") +JOURNAL_FILE = os.path.join(RESULTS_DIR, "journal.md") + + +def _ensure_results_dir(): + os.makedirs(RESULTS_DIR, exist_ok=True) + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _append_jsonl(filepath: str, data: dict): + """Append a single JSON line with exclusive file locking.""" + _ensure_results_dir() + line = json.dumps(data, separators=(",", ":")) + "\n" + fd = os.open(filepath, os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o644) + try: + fcntl.flock(fd, fcntl.LOCK_EX) + os.write(fd, line.encode()) + os.fsync(fd) + finally: + fcntl.flock(fd, fcntl.LOCK_UN) + os.close(fd) + + +def _read_jsonl(filepath: str) -> list[dict]: + """Read all JSON lines, skipping malformed ones.""" + if not os.path.exists(filepath): + return [] + records = [] + with open(filepath, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + records.append(json.loads(line)) + except json.JSONDecodeError: + continue + return records + + +# --------------------------------------------------------------------------- +# Data structures +# --------------------------------------------------------------------------- + +@dataclass +class ExperimentRecord: + timestamp: str = "" + agent_role: str = "solo" + agent_id: str = "solo-0" + branch: str = "" + commit: str = "" + scale: str = "standard" + time_budget: int = 300 + val_bpb: float = 0.0 + peak_vram_mb: float = 0.0 + mfu_percent: float = 0.0 + num_params_M: float = 0.0 + depth: int = 0 + total_batch_size: int = 0 + matrix_lr: float = 0.0 + loss_trajectory: str = "" + status: str = "crash" + description: 
str = "" + parent_commit: Optional[str] = None + escalated_from: Optional[str] = None + + def __post_init__(self): + if not self.timestamp: + self.timestamp = _now_iso() + + +@dataclass +class LessonRecord: + timestamp: str = "" + agent_role: str = "solo" + agent_id: str = "solo-0" + category: str = "" # "architecture", "hyperparameter", "failure_mode", "insight" + lesson: str = "" + evidence_commits: list = field(default_factory=list) + confidence: str = "medium" # "high", "medium", "low" + + def __post_init__(self): + if not self.timestamp: + self.timestamp = _now_iso() + + +# --------------------------------------------------------------------------- +# Write operations (with locking) +# --------------------------------------------------------------------------- + +def append_experiment(record: ExperimentRecord) -> None: + _append_jsonl(EXPERIMENTS_FILE, asdict(record)) + + +def append_lesson(record: LessonRecord) -> None: + _append_jsonl(LESSONS_FILE, asdict(record)) + + +def append_journal(agent_id: str, entry: str) -> None: + """Append a timestamped markdown entry to the research journal.""" + _ensure_results_dir() + ts = _now_iso() + text = f"\n### [{ts}] {agent_id}\n\n{entry}\n" + fd = os.open(JOURNAL_FILE, os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o644) + try: + fcntl.flock(fd, fcntl.LOCK_EX) + os.write(fd, text.encode()) + os.fsync(fd) + finally: + fcntl.flock(fd, fcntl.LOCK_UN) + os.close(fd) + + +# --------------------------------------------------------------------------- +# Read operations +# --------------------------------------------------------------------------- + +def load_experiments(filter_fn: Optional[Callable[[dict], bool]] = None) -> list[dict]: + records = _read_jsonl(EXPERIMENTS_FILE) + if filter_fn: + records = [r for r in records if filter_fn(r)] + return records + + +def load_lessons() -> list[dict]: + return _read_jsonl(LESSONS_FILE) + + +def get_best_result() -> Optional[dict]: + """Return experiment with lowest val_bpb among 
status='keep'."""
+    kept = load_experiments(lambda r: r.get("status") == "keep" and r.get("val_bpb", 0) > 0)
+    if not kept:
+        return None
+    return min(kept, key=lambda r: r["val_bpb"])
+
+
+def get_agent_experiments(agent_id: str) -> list[dict]:
+    return load_experiments(lambda r: r.get("agent_id") == agent_id)
+
+
+# ---------------------------------------------------------------------------
+# Research briefing
+# ---------------------------------------------------------------------------
+
+def build_research_briefing(max_recent: int = 20) -> str:
+    """Generate a markdown summary for agents to read before each experiment."""
+    lines = ["# Research Briefing\n"]
+
+    # Best result
+    best = get_best_result()
+    if best:
+        lines.append(f"## Current Best")
+        lines.append(f"- **val_bpb**: {best['val_bpb']:.6f}")
+        lines.append(f"- **config**: depth={best.get('depth')}, batch={best.get('total_batch_size')}, matrix_lr={best.get('matrix_lr')}")
+        lines.append(f"- **commit**: {best.get('commit')}")
+        lines.append(f"- **description**: {best.get('description')}")
+        lines.append("")
+
+    # Recent experiments
+    experiments = load_experiments()
+    if experiments:
+        lines.append(f"## Recent Experiments (last {min(max_recent, len(experiments))} of {len(experiments)} total)")
+        lines.append("")
+        lines.append("| # | Agent | Scale | val_bpb | Status | Description |")
+        lines.append("|---|-------|-------|---------|--------|-------------|")
+        for i, exp in enumerate(experiments[-max_recent:]):
+            idx = max(0, len(experiments) - max_recent) + i
+            bpb = f"{exp['val_bpb']:.6f}" if exp.get("val_bpb", 0) > 0 else "crash"
+            lines.append(f"| {idx} | {exp.get('agent_id', '?')} | {exp.get('scale', '?')} | {bpb} | {exp.get('status', '?')} | {exp.get('description', '')[:50]} |")
+        lines.append("")
+
+    # Stats
+    if experiments:
+        kept = [e for e in experiments if e.get("status") == "keep"]
+        crashed = [e for e in experiments if e.get("status") == "crash"]
+        lines.append(f"## Stats")
+        lines.append(f"- Total 
experiments: {len(experiments)}") + lines.append(f"- Kept: {len(kept)}, Discarded: {len(experiments) - len(kept) - len(crashed)}, Crashed: {len(crashed)}") + if kept: + bpbs = [e["val_bpb"] for e in kept] + lines.append(f"- Best val_bpb: {min(bpbs):.6f}, Worst kept: {max(bpbs):.6f}") + lines.append("") + + # Lessons + lessons = load_lessons() + if lessons: + lines.append(f"## Lessons Learned ({len(lessons)} total)") + lines.append("") + for lesson in lessons[-15:]: + conf = lesson.get("confidence", "?") + lines.append(f"- [{conf}] **{lesson.get('category', '?')}**: {lesson.get('lesson', '')}") + lines.append("") + + # Journal (last section) + if os.path.exists(JOURNAL_FILE): + with open(JOURNAL_FILE, "r") as f: + journal = f.read().strip() + if journal: + # Show last ~2000 chars of journal + if len(journal) > 2000: + journal = "...\n" + journal[-2000:] + lines.append("## Research Journal (recent)") + lines.append("") + lines.append(journal) + lines.append("") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Legacy TSV compatibility +# --------------------------------------------------------------------------- + +def sync_to_legacy_tsv(tsv_path: str = "results.tsv") -> None: + """Write a results.tsv file for backward compat with analysis.ipynb.""" + experiments = load_experiments() + header = "commit\tval_bpb\tmemory_gb\tmfu\tnum_params_M\tdepth\ttotal_batch_size\tmatrix_lr\tstatus\tdescription" + lines = [header] + for exp in experiments: + mem_gb = exp.get("peak_vram_mb", 0) / 1024 + lines.append( + f"{exp.get('commit', '?')}\t" + f"{exp.get('val_bpb', 0):.6f}\t" + f"{mem_gb:.1f}\t" + f"{exp.get('mfu_percent', 0):.2f}\t" + f"{exp.get('num_params_M', 0):.1f}\t" + f"{exp.get('depth', 0)}\t" + f"{exp.get('total_batch_size', 0)}\t" + f"{exp.get('matrix_lr', 0):.2f}\t" + f"{exp.get('status', 'crash')}\t" + f"{exp.get('description', '')}" + ) + with open(tsv_path, "w") as f: + f.write("\n".join(lines) + 
"\n") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1 and sys.argv[1] == "briefing": + print(build_research_briefing()) + elif len(sys.argv) > 1 and sys.argv[1] == "sync-tsv": + path = sys.argv[2] if len(sys.argv) > 2 else "results.tsv" + sync_to_legacy_tsv(path) + print(f"Synced to {path}") + else: + print("Usage: python knowledge.py [briefing|sync-tsv [path]]") diff --git a/launch.py b/launch.py new file mode 100644 index 00000000..8f7db2a6 --- /dev/null +++ b/launch.py @@ -0,0 +1,444 @@ +#!/usr/bin/env python3 +""" +Multi-agent launcher for autoresearch. +Sets up worktrees, generates agent instructions, and launches Claude Code instances. + +Usage: + python launch.py --tag mar10 --preset solo + python launch.py --tag mar10 --preset balanced --single-gpu + python launch.py --tag mar10 --agents explorer:2,optimizer,director --tmux + python launch.py --tag mar10 --preset full --single-gpu --tmux + python launch.py cleanup --tag mar10 +""" + +import argparse +import json +import os +import shutil +import signal +import subprocess +import sys +import time + +REPO_ROOT = os.path.dirname(os.path.abspath(__file__)) +WORKTREES_DIR = os.path.join(REPO_ROOT, "worktrees") +RESULTS_DIR = os.path.join(REPO_ROOT, "results") +QUEUE_DIR = os.path.join(REPO_ROOT, "queue") +CHECKPOINTS_DIR = os.path.join(REPO_ROOT, "checkpoints") +PROGRAMS_DIR = os.path.join(REPO_ROOT, "programs") + +ROLES = { + "explorer": { + "description": "Tries bold architectural changes and novel ideas", + "program": "explorer.md", + }, + "optimizer": { + "description": "Fine-tunes hyperparameters methodically", + "program": "optimizer.md", + }, + "analyst": { + "description": "Analyzes results, identifies patterns, writes lessons", + "program": "analyst.md", + }, + "reviewer": { + "description": "Validates promising results 
at longer scales", + "program": "reviewer.md", + }, + "director": { + "description": "Coordinates research agenda, merges best results", + "program": "director.md", + }, +} + +PRESETS = { + "solo": ["explorer"], + "duo": ["explorer", "optimizer"], + "balanced": ["explorer", "optimizer", "analyst"], + "full": ["explorer", "optimizer", "analyst", "reviewer", "director"], +} + + +def parse_agents_spec(spec: str) -> list[tuple[str, int]]: + """Parse 'explorer:2,optimizer,director' into [('explorer', 2), ('optimizer', 1), ('director', 1)].""" + agents = [] + for part in spec.split(","): + part = part.strip() + if ":" in part: + role, count = part.split(":") + agents.append((role.strip(), int(count))) + else: + agents.append((part, 1)) + for role, _ in agents: + if role not in ROLES: + print(f"Error: unknown role '{role}'. Available: {', '.join(ROLES.keys())}") + sys.exit(1) + return agents + + +def expand_agents(agents_spec: list[tuple[str, int]]) -> list[tuple[str, str]]: + """Expand [(role, count)] into [(role, agent_id)] pairs.""" + result = [] + for role, count in agents_spec: + for i in range(count): + agent_id = f"{role}-{i}" + result.append((role, agent_id)) + return result + + +def setup_shared_dirs(): + """Create shared directories.""" + for d in [RESULTS_DIR, QUEUE_DIR, CHECKPOINTS_DIR, WORKTREES_DIR]: + os.makedirs(d, exist_ok=True) + + +def create_worktree(tag: str, agent_id: str) -> str: + """Create a git worktree for an agent. 
Returns the worktree path.""" + worktree_name = f"{tag}-{agent_id}" + worktree_path = os.path.join(WORKTREES_DIR, worktree_name) + branch_name = f"autoresearch/{worktree_name}" + + if os.path.exists(worktree_path): + print(f" Worktree already exists: {worktree_path}") + return worktree_path + + # Create worktree with a new branch + result = subprocess.run( + ["git", "worktree", "add", "-b", branch_name, worktree_path, "HEAD"], + cwd=REPO_ROOT, capture_output=True, text=True, + ) + if result.returncode != 0: + # Branch might already exist, try without -b + result = subprocess.run( + ["git", "worktree", "add", worktree_path, branch_name], + cwd=REPO_ROOT, capture_output=True, text=True, + ) + if result.returncode != 0: + print(f" Error creating worktree: {result.stderr}") + sys.exit(1) + + # Create symlinks for shared directories + for dirname, target in [ + ("results", RESULTS_DIR), + ("queue", QUEUE_DIR), + ("checkpoints", CHECKPOINTS_DIR), + ]: + link_path = os.path.join(worktree_path, dirname) + if os.path.exists(link_path) or os.path.islink(link_path): + if os.path.islink(link_path): + os.unlink(link_path) + elif os.path.isdir(link_path): + shutil.rmtree(link_path) + os.symlink(target, link_path) + + print(f" Created worktree: {worktree_path} (branch: {branch_name})") + return worktree_path + + +def generate_claude_md(worktree_path: str, role: str, agent_id: str, + tag: str, single_gpu: bool) -> None: + """Write CLAUDE.md into the worktree with role-specific instructions.""" + program_path = os.path.join(PROGRAMS_DIR, ROLES[role]["program"]) + with open(program_path, "r") as f: + program_content = f.read() + + use_queue_note = "" + if single_gpu: + use_queue_note = """ +## GPU Queue Mode + +This session is running in **single-GPU mode** with a shared GPU queue. +Instead of running experiments directly, use the `--use-queue` flag: + +``` +python run_experiment.py --scale quick --description "..." 
--agent-id {agent_id} --agent-role {role} --use-queue +``` + +This submits your job to the GPU queue. Probe jobs (30s) get priority so you get fast +memory feedback. While waiting, you can plan your next experiment. +""".format(agent_id=agent_id, role=role) + + claude_md = f"""# Agent: {agent_id} + +- **Tag**: {tag} +- **Role**: {role} +- **Agent ID**: {agent_id} +- **Description**: {ROLES[role]['description']} + +## Quick Reference + +```bash +# Read research briefing (do this before every experiment) +python run_experiment.py --briefing + +# Run an experiment +python run_experiment.py --scale quick --description "what you tried" --agent-id {agent_id} --agent-role {role} + +# Log a lesson +python run_experiment.py --lesson "lesson text" + +# Check GPU queue status (if single-gpu mode) +python gpu_queue.py status +``` +{use_queue_note} +## Files You Can Modify + +- `train.py` — The main training script. Architecture, optimizer, hyperparameters. + +## Files You Should NOT Modify + +- `prepare.py` — Data prep, tokenizer, evaluation (the ground truth metric). +- `config.py`, `knowledge.py`, `checkpoint.py`, `run_experiment.py`, `gpu_queue.py` — Infrastructure. +- Other agents' worktrees. + +## Getting Started + +1. Read this file (you're doing it now) +2. Read `python run_experiment.py --briefing` for current research state +3. Read `train.py` to understand the current model +4. Begin your experiment loop as described in your role instructions below + +--- + +{program_content} +""" + claude_md_path = os.path.join(worktree_path, "CLAUDE.md") + with open(claude_md_path, "w") as f: + f.write(claude_md) + print(f" Generated CLAUDE.md for {agent_id}") + + +def launch_agent_tmux(worktree_path: str, agent_id: str, session_name: str) -> None: + """Launch a Claude Code agent in a tmux pane.""" + cmd = ( + f'cd "{worktree_path}" && ' + f'claude -p "Read CLAUDE.md and begin your research loop. You are {agent_id}." 
' + f'--dangerously-skip-permissions ' + f'--model opus ' + f'2>&1 | tee agent.log' + ) + + # Check if tmux session exists + check = subprocess.run( + ["tmux", "has-session", "-t", session_name], + capture_output=True, + ) + if check.returncode != 0: + # Create new session with first agent + subprocess.run( + ["tmux", "new-session", "-d", "-s", session_name, "-n", agent_id, cmd], + ) + else: + # Add new window + subprocess.run( + ["tmux", "new-window", "-t", session_name, "-n", agent_id, cmd], + ) + print(f" Launched {agent_id} in tmux session '{session_name}'") + + +def launch_agent_background(worktree_path: str, agent_id: str) -> subprocess.Popen: + """Launch a Claude Code agent as a background process.""" + log_path = os.path.join(worktree_path, "agent.log") + cmd = [ + "claude", "-p", + f"Read CLAUDE.md and begin your research loop. You are {agent_id}.", + "--dangerously-skip-permissions", + "--model", "opus", + ] + log_file = open(log_path, "w") + proc = subprocess.Popen( + cmd, cwd=worktree_path, + stdout=log_file, stderr=subprocess.STDOUT, + ) + print(f" Launched {agent_id} (pid={proc.pid}, log={log_path})") + return proc + + +def launch_queue_worker() -> subprocess.Popen: + """Start the GPU queue worker as a background process.""" + log_path = os.path.join(QUEUE_DIR, "worker.log") + log_file = open(log_path, "w") + proc = subprocess.Popen( + [sys.executable, os.path.join(REPO_ROOT, "gpu_queue.py"), "worker"], + cwd=REPO_ROOT, stdout=log_file, stderr=subprocess.STDOUT, + ) + print(f" Queue worker started (pid={proc.pid}, log={log_path})") + return proc + + +def cleanup_worktrees(tag: str): + """Remove worktrees and branches for a tag.""" + if not os.path.exists(WORKTREES_DIR): + print("No worktrees directory found.") + return + + for entry in os.listdir(WORKTREES_DIR): + if entry.startswith(f"{tag}-"): + worktree_path = os.path.join(WORKTREES_DIR, entry) + branch_name = f"autoresearch/{entry}" + print(f" Removing worktree: {worktree_path}") + subprocess.run( 
+ ["git", "worktree", "remove", "--force", worktree_path], + cwd=REPO_ROOT, capture_output=True, + ) + subprocess.run( + ["git", "branch", "-D", branch_name], + cwd=REPO_ROOT, capture_output=True, + ) + + print("Cleanup complete.") + + +def monitor_loop(processes: list[tuple[str, subprocess.Popen]], queue_worker: subprocess.Popen = None): + """Monitor running agents, handle Ctrl+C gracefully.""" + print("\n" + "=" * 60) + print("All agents launched. Press Ctrl+C to stop.") + print("=" * 60 + "\n") + + def shutdown(signum, frame): + print("\nShutting down agents...") + for agent_id, proc in processes: + if proc.poll() is None: + print(f" Stopping {agent_id} (pid={proc.pid})") + proc.terminate() + if queue_worker and queue_worker.poll() is None: + print(" Stopping queue worker") + queue_worker.terminate() + # Wait for processes to finish + for _, proc in processes: + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + if queue_worker: + try: + queue_worker.wait(timeout=5) + except subprocess.TimeoutExpired: + queue_worker.kill() + print("All agents stopped.") + sys.exit(0) + + signal.signal(signal.SIGINT, shutdown) + signal.signal(signal.SIGTERM, shutdown) + + while True: + time.sleep(30) + alive = sum(1 for _, p in processes if p.poll() is None) + total = len(processes) + if alive == 0: + print("All agents have exited.") + break + # Check for crashed agents + for agent_id, proc in processes: + if proc.poll() is not None and proc.returncode != 0: + print(f" WARNING: {agent_id} exited with code {proc.returncode}") + + +def main(): + parser = argparse.ArgumentParser(description="Launch multi-agent autoresearch") + sub = parser.add_subparsers(dest="cmd") + + # launch (default) + launch_p = sub.add_parser("launch", help="Launch agents") + launch_p.add_argument("--tag", required=True, help="Run tag (e.g., mar10)") + launch_p.add_argument("--agents", default=None, help="Agent spec (e.g., explorer:2,optimizer)") + 
launch_p.add_argument("--preset", default=None, choices=PRESETS.keys(), help="Agent preset") + launch_p.add_argument("--single-gpu", action="store_true", help="Enable GPU queue for time-sharing") + launch_p.add_argument("--tmux", action="store_true", help="Launch in tmux panes") + launch_p.add_argument("--model", default="opus", help="Claude model (default: opus)") + + # cleanup + cleanup_p = sub.add_parser("cleanup", help="Remove worktrees for a tag") + cleanup_p.add_argument("--tag", required=True, help="Run tag to clean up") + + # status + sub.add_parser("status", help="Show running agents and queue status") + + args = parser.parse_args() + + # Default to launch if no subcommand + if args.cmd is None: + # Re-parse with launch as default + if "--tag" in sys.argv: + args.cmd = "launch" + args = launch_p.parse_args(sys.argv[1:]) + else: + parser.print_help() + return + + if args.cmd == "cleanup": + cleanup_worktrees(args.tag) + return + + if args.cmd == "status": + subprocess.run([sys.executable, os.path.join(REPO_ROOT, "gpu_queue.py"), "status"]) + return + + # Launch mode + tag = args.tag + single_gpu = args.single_gpu + use_tmux = args.tmux + + # Parse agent spec + if args.preset: + agents_spec = [(role, 1) for role in PRESETS[args.preset]] + elif args.agents: + agents_spec = parse_agents_spec(args.agents) + else: + print("Error: specify --preset or --agents") + sys.exit(1) + + agents = expand_agents(agents_spec) + + print(f"\n{'=' * 60}") + print(f" AUTORESEARCH v2 — Multi-Agent Launch") + print(f"{'=' * 60}") + print(f" Tag: {tag}") + print(f" Agents: {', '.join(aid for _, aid in agents)}") + print(f" Single GPU: {single_gpu}") + print(f" TMux: {use_tmux}") + print(f"{'=' * 60}\n") + + # Step 1: Create shared directories + print("[1/5] Setting up shared directories...") + setup_shared_dirs() + + # Step 2: Create worktrees + print("[2/5] Creating worktrees...") + worktree_paths = {} + for role, agent_id in agents: + path = create_worktree(tag, agent_id) + 
worktree_paths[agent_id] = path + + # Step 3: Generate CLAUDE.md for each agent + print("[3/5] Generating agent instructions...") + for role, agent_id in agents: + generate_claude_md(worktree_paths[agent_id], role, agent_id, tag, single_gpu) + + # Step 4: Start queue worker if single-gpu + queue_worker = None + if single_gpu: + print("[4/5] Starting GPU queue worker...") + queue_worker = launch_queue_worker() + else: + print("[4/5] Skipping queue worker (multi-GPU mode)") + + # Step 5: Launch agents + print("[5/5] Launching agents...") + tmux_session = f"autoresearch-{tag}" + + if use_tmux: + for role, agent_id in agents: + launch_agent_tmux(worktree_paths[agent_id], agent_id, tmux_session) + print(f"\nAgents running in tmux session: {tmux_session}") + print(f" Attach with: tmux attach -t {tmux_session}") + print(f" Cleanup with: python launch.py cleanup --tag {tag}") + else: + processes = [] + for role, agent_id in agents: + proc = launch_agent_background(worktree_paths[agent_id], agent_id) + processes.append((agent_id, proc)) + monitor_loop(processes, queue_worker) + + +if __name__ == "__main__": + main() diff --git a/prepare.py b/prepare.py index 62607b9a..2919050d 100644 --- a/prepare.py +++ b/prepare.py @@ -24,12 +24,13 @@ import torch # --------------------------------------------------------------------------- -# Constants (fixed, do not modify) +# Constants (configurable via env vars, see config.py) # --------------------------------------------------------------------------- -MAX_SEQ_LEN = 2048 # context length -TIME_BUDGET = 300 # training time budget in seconds (5 minutes) -EVAL_TOKENS = 40 * 524288 # number of tokens for val eval +from config import get_max_seq_len, get_time_budget, get_eval_tokens +MAX_SEQ_LEN = get_max_seq_len() +TIME_BUDGET = get_time_budget() +EVAL_TOKENS = get_eval_tokens() # --------------------------------------------------------------------------- # Configuration diff --git a/program.md b/program.md index dea9bcc0..37584eb9 
100644 --- a/program.md +++ b/program.md @@ -34,8 +34,21 @@ Each experiment runs on a single GPU. The training script runs for a **fixed tim **VRAM** is a soft constraint. Some increase is acceptable for meaningful val_bpb gains, but it should not blow up dramatically. +**Memory probing**: For architecture changes that might affect memory (model width, depth, batch size), run `uv run train.py --probe > probe.log 2>&1` first (~30s, runs 3 steps and reports peak VRAM). Check `grep probe_peak_vram_mb probe.log` before committing to a full run. + **Simplicity criterion**: All else being equal, simpler is better. A small improvement that adds ugly complexity is not worth it. Conversely, removing something and getting equal or better results is a great outcome — that's a simplification win. When evaluating whether to keep a change, weigh the complexity cost against the improvement magnitude. A 0.001 val_bpb improvement that adds 20 lines of hacky code? Probably not worth it. A 0.001 val_bpb improvement from deleting code? Definitely keep. An improvement of ~0 but much simpler code? Keep. +**Suggested experiment ordering** (rough priority): + +1. **Learning rates** — try 2x/0.5x on matrix_lr, embedding_lr. Cheap, often high impact. +2. **Model size** — DEPTH ±2, ASPECT_RATIO changes. +3. **Batch size** — try 2x/0.5x TOTAL_BATCH_SIZE. +4. **Warmup/cooldown** — WARMUP_RATIO, WARMDOWN_RATIO. +5. **Architecture** — attention patterns, activations, MLP ratio. Higher risk, higher reward. +6. **Optimizer** — betas, weight decay, momentum. Usually small gains. + +Start with cheap, high-probability wins before riskier architectural changes. + **The first run**: Your very first run should always be to establish the baseline, so you will run the training script as is. 
## Output format @@ -53,51 +66,64 @@ total_tokens_M: 499.6 num_steps: 953 num_params_M: 50.3 depth: 8 +total_batch_size: 524288 +matrix_lr: 0.04 +loss_trajectory: 25%:3.2145 50%:2.8901 75%:2.6543 100%:2.5012 ``` Note that the script is configured to always stop after 5 minutes, so depending on the computing platform of this computer the numbers might look different. You can extract the key metric from the log file: ``` -grep "^val_bpb:" run.log +grep "^val_bpb:\|^peak_vram_mb:\|^mfu_percent:\|^num_params_M:\|^depth:\|^total_batch_size:\|^matrix_lr:" run.log ``` ## Logging results When an experiment is done, log it to `results.tsv` (tab-separated, NOT comma-separated — commas break in descriptions). -The TSV has a header row and 5 columns: +The TSV has a header row and 10 columns: ``` -commit val_bpb memory_gb status description +commit val_bpb memory_gb mfu num_params_M depth total_batch_size matrix_lr status description ``` 1. git commit hash (short, 7 chars) 2. val_bpb achieved (e.g. 1.234567) — use 0.000000 for crashes 3. peak memory in GB, round to .1f (e.g. 12.3 — divide peak_vram_mb by 1024) — use 0.0 for crashes -4. status: `keep`, `discard`, or `crash` -5. short text description of what this experiment tried +4. mfu_percent from run output (e.g. 39.80) — use 0.00 for crashes +5. num_params_M from run output (e.g. 50.3) — use 0.0 for crashes +6. depth from run output (e.g. 8) — use 0 for crashes +7. total_batch_size from run output (e.g. 524288) — use 0 for crashes +8. matrix_lr from run output (e.g. 0.04) — use 0.00 for crashes +9. status: `keep`, `discard`, or `crash` +10. 
short text description of what this experiment tried Example: ``` -commit val_bpb memory_gb status description -a1b2c3d 0.997900 44.0 keep baseline -b2c3d4e 0.993200 44.2 keep increase LR to 0.04 -c3d4e5f 1.005000 44.0 discard switch to GeLU activation -d4e5f6g 0.000000 0.0 crash double model width (OOM) +commit val_bpb memory_gb mfu num_params_M depth total_batch_size matrix_lr status description +a1b2c3d 0.997900 44.0 39.80 50.3 8 524288 0.04 keep baseline +b2c3d4e 0.993200 44.2 39.50 50.3 8 524288 0.06 keep increase matrix_lr to 0.06 +c3d4e5f 1.005000 44.0 39.80 50.3 8 524288 0.04 discard switch to GeLU activation +d4e5f6g 0.000000 0.0 0.00 0.0 0 0 0.00 crash double model width (OOM) ``` ## The experiment loop The experiment runs on a dedicated branch (e.g. `autoresearch/mar5` or `autoresearch/mar5-gpu0`). +Before each experiment, review `results.tsv` to inform your next choice: +- Which experiment types (LR, architecture, size) yielded the biggest gains? +- What has already been tried and discarded? +- Are there patterns you can extrapolate (e.g., "LR 0.04→0.03 helped, try 0.025")? + LOOP FOREVER: 1. Look at the git state: the current branch/commit we're on 2. Tune `train.py` with an experimental idea by directly hacking the code. 3. git commit 4. Run the experiment: `uv run train.py > run.log 2>&1` (redirect everything — do NOT use tee or let output flood your context) -5. Read out the results: `grep "^val_bpb:\|^peak_vram_mb:" run.log` +5. Read out the results: `grep "^val_bpb:\|^peak_vram_mb:\|^mfu_percent:\|^num_params_M:\|^depth:\|^total_batch_size:\|^matrix_lr:" run.log` 6. If the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. If you can't get things to work after more than a few attempts, give up. 7. Record the results in the tsv (NOTE: do not commit the results.tsv file, leave it untracked by git) 8. 
If val_bpb improved (lower), you "advance" the branch, keeping the git commit @@ -105,10 +131,56 @@ LOOP FOREVER: The idea is that you are a completely autonomous researcher trying things out. If they work, keep. If they don't, discard. And you're advancing the branch so that you can iterate. If you feel like you're getting stuck in some way, you can rewind but you should probably do this very very sparingly (if ever). +**Combining improvements**: Every 10-15 experiments, review results.tsv for independently-tested improvements that could be combined. If "increase matrix_lr" and "increase depth" both helped separately, try both together. + **Timeout**: Each experiment should take ~5 minutes total (+ a few seconds for startup and eval overhead). If a run exceeds 10 minutes, kill it and treat it as a failure (discard and revert). +**Noise threshold**: Due to non-determinism in Flash Attention and limited eval data, val_bpb has measurement noise of roughly ±0.002. Only count improvements >0.003 as clearly real. For borderline results (0.001-0.003), consider re-running to confirm. When in doubt, prefer the simpler configuration. + +**Loss trajectory**: The summary includes `loss_trajectory` showing smoothed training loss at 25/50/75/100% progress. If loss is still dropping steeply at 100%, the model may benefit from being smaller (more steps). If loss flattens early, try a larger model or higher LR. + **Crashes**: If a run crashes (OOM, or a bug, or etc.), use your judgment: If it's something dumb and easy to fix (e.g. a typo, a missing import), fix it and re-run. If the idea itself is fundamentally broken, just skip it, log "crash" as the status in the tsv, and move on. **NEVER STOP**: Once the experiment loop has begun (after the initial setup), do NOT pause to ask the human if you should continue. Do NOT ask "should I keep going?" or "is this a good stopping point?". 
The human might be asleep, or gone from a computer and expects you to continue working *indefinitely* until you are manually stopped. You are autonomous. If you run out of ideas, think harder — read papers referenced in the code, re-read the in-scope files for new angles, try combining previous near-misses, try more radical architectural changes. The loop runs until the human interrupts you, period. As an example use case, a user might leave you running while they sleep. If each experiment takes you ~5 minutes then you can run approx 12/hour, for a total of about 100 over the duration of the average human sleep. The user then wakes up to experimental results, all completed by you while they slept! + +## Multi-Agent Mode + +If you find a `CLAUDE.md` file in your working directory, you are running in multi-agent mode. **Read CLAUDE.md first** — it contains your specific role, agent ID, and tailored instructions that take precedence over the generic instructions above. + +In multi-agent mode: + +- **Use `run_experiment.py`** instead of raw `uv run train.py`: + ``` + python run_experiment.py --scale quick --description "what you tried" --agent-id <agent-id> --agent-role <role> + ``` +- **Check the research briefing** before each experiment: + ``` + python run_experiment.py --briefing + ``` +- **Log lessons** when you discover something: + ``` + python run_experiment.py --lesson "lesson text" + ``` +- **Respect the research agenda**: The director agent writes guidance to the journal. Check it. + +### Variable Time Scales + +Instead of a fixed 5-minute budget, experiments run at different scales: +- **probe** (30s): Memory/compilation check only +- **quick** (2min): Rough signal, enough to kill bad ideas fast +- **standard** (5min): Real evaluation, comparable to baseline +- **long** (15min): Confirmation of promising results +- **deep** (30min): Final validation, gold standard + +Start with quick runs and escalate promising ideas to longer scales. 
+ +### Knowledge Base + +Results are logged to a shared knowledge base (`results/` directory) instead of a local `results.tsv`. The knowledge base includes: +- **experiments.jsonl**: All experiment records from all agents +- **lessons.jsonl**: Discovered patterns and insights +- **journal.md**: Research summaries and the director's agenda + +A backward-compatible `results.tsv` is auto-generated for the analysis notebook. diff --git a/programs/analyst.md b/programs/analyst.md new file mode 100644 index 00000000..ba3e3a47 --- /dev/null +++ b/programs/analyst.md @@ -0,0 +1,94 @@ +# Analyst Agent + +You are an **Analyst** — your job is to study all experiment results, find patterns, write lessons, and suggest high-value next experiments. + +## Your Identity + +- **Role**: analyst +- **Agent ID**: Read from your CLAUDE.md header +- **Style**: Thoughtful, pattern-seeking, synthesis-focused + +## What You Do + +You are the "brain" of the research org. You: +- Read all experiment results and identify patterns +- Write lessons to the shared knowledge base +- Run confirmation experiments when needed +- Suggest combinations of independent improvements +- Write research summaries to the journal + +## Your Loop + +LOOP FOREVER: + +1. **Read everything**: + ``` + python run_experiment.py --briefing + ``` + Also read `results/experiments.jsonl` directly for full detail if needed. + +2. **Analyze patterns**: + - Which hyperparameter ranges consistently help? + - Which architectural changes show promise? + - Are there experiments that nearly worked and deserve a retry? + - Are there independent improvements that should be combined? + - Are agents stuck in local optima? + +3. 
**Write lessons** for things you discover: + ``` + python run_experiment.py --lesson insight high "Combining depth=10 with matrix_lr=0.06 is untried but both helped independently" + python run_experiment.py --lesson failure_mode medium "All attempts at depth>12 OOM regardless of batch size reduction" + ``` + +4. **Write journal entries** with research summaries. Use the knowledge.py module: + ```python + python -c " + from knowledge import append_journal + append_journal('analyst-0', ''' + ## Analysis after 30 experiments + + **What works**: Higher matrix_lr (0.06), deeper models (depth=10), GLU activations + **What doesn't**: Removing value embeddings, very large batch sizes, GeLU + **Untried combinations**: depth=10 + matrix_lr=0.06 + GLU (suggest explorer try this) + **Recommendation**: Focus on combining the top 3 improvements + ''') + " + ``` + +5. **Run confirmation experiments**: If you see a borderline result (improvement 0.001-0.003), run it again to verify: + ``` + python run_experiment.py --scale standard --description "confirm: <original description>" --agent-id <agent-id> --agent-role analyst + ``` + +6. **Suggest combinations**: When you see 2-3 changes that independently helped, note them in the journal as suggestions for other agents. + +7. **Wait and repeat**: After each analysis pass, wait 10-15 minutes for new results to accumulate, then analyze again. You don't need to run as many experiments as the explorer or optimizer. + +## What to Look For + +### Patterns in successful experiments +- Do improvements cluster around certain hyperparameter ranges? +- Is there a trend in model size vs performance? +- Do certain types of changes (LR vs architecture) have higher success rates? + +### Patterns in failures +- What consistently crashes? (Document as lessons to prevent others from retrying) +- What kinds of changes are noise-level? (Not worth pursuing further) +- Are there diminishing returns in any direction? + +### Untried territory +- What combinations haven't been explored? 
+- What hyperparameter ranges have gaps? +- Are there obvious experiments no one has tried? + +## Guidelines + +- **You are primarily a reader, secondarily a runner.** Your main value is synthesis, not experimentation. +- **Write clear, actionable lessons.** Other agents read these before every experiment. +- **Be honest about confidence.** Use "high" only when you've seen consistent results across 3+ experiments. Use "low" for single-experiment observations. +- **Update or correct old lessons.** If new evidence contradicts an earlier lesson, note it. +- **Suggest specific experiments.** "Try depth=10 with matrix_lr=0.06" is more useful than "try different depths." + +## NEVER STOP + +Run your analysis loop indefinitely. Even when experiments slow down, keep reviewing and looking for meta-patterns. Your journal entries are valuable for the human when they wake up. diff --git a/programs/director.md b/programs/director.md new file mode 100644 index 00000000..cb4ea1ca --- /dev/null +++ b/programs/director.md @@ -0,0 +1,116 @@ +# Director Agent + +You are the **Director** — you coordinate the research org, set the research agenda, and merge improvements across agents. + +## Your Identity + +- **Role**: director +- **Agent ID**: Read from your CLAUDE.md header +- **Style**: Strategic, coordinating, big-picture thinking + +## What You Do + +You are the PI (Principal Investigator). You: +- Read all results from all agents +- Write the research agenda to the journal +- Cherry-pick successful improvements across agent branches +- Redirect agents that are stuck or pursuing dead ends +- Decide which results get escalated through the scaling ladder +- Maintain the "state of the art" branch with all validated improvements merged + +## Your Loop + +LOOP (every 15-20 minutes): + +1. **Read everything**: + ``` + python run_experiment.py --briefing + ``` + Also check the full experiments.jsonl and lessons for detail. + +2. 
**Assess the state of research**: + - How many experiments have been run total? By each agent? + - What's the current best val_bpb? How much improvement over baseline? + - Which agents are making progress? Which are spinning wheels? + - Are there validated improvements from the reviewer? + +3. **Write the research agenda** to the journal: + ```python + python -c " + from knowledge import append_journal + append_journal('director-0', ''' + ## Research Agenda (updated) + + **Current best**: val_bpb=0.9850 (commit abc1234, depth=10, matrix_lr=0.06) + + **For explorer**: + - Stop trying depth>14, it always OOMs + - Try combining GLU with the current best config + - Consider mixture-of-experts or sparse attention + + **For optimizer**: + - Matrix_lr sweet spot is around 0.05-0.07, try finer grid + - Try WARMUP_RATIO=0.05, it hasn't been explored + - Batch size 2^20 with depth=10 is worth trying + + **For reviewer**: + - Validate commit def5678 (GLU activation, +0.008 at quick scale) + - Run deep validation on current best + + **For analyst**: + - Check if LR improvements and architecture improvements are additive + ''') + " + ``` + +4. **Merge improvements across branches**: + When the explorer finds something good and the optimizer tunes it, merge both into a clean branch: + ```bash + git checkout autoresearch/<tag>-optimizer-0 + git cherry-pick <commit-hash> + ``` + Only do this for validated (reviewer-confirmed) improvements. + +5. **Update agent instructions** if needed: + If an agent is clearly stuck (10+ experiments with no improvement), you can edit their CLAUDE.md to redirect: + ``` + # In their worktree: + echo "DIRECTOR NOTE: Stop exploring attention variants, focus on MLP changes instead" >> worktrees/<tag>-explorer-0/CLAUDE.md + ``` + +6. 
**Manage the scaling ladder**: + - Quick runs that show >0.005 improvement → flag for standard + - Standard runs that show >0.003 improvement → flag for reviewer (long validation) + - Long-validated results → flag for deep validation + - Deep-validated results → merge into main research branch + +7. **Wait**: After each coordination pass, wait 15-20 minutes for new results. + +## Strategic Thinking + +### Early phase (experiments 0-30) +- Let explorer and optimizer run freely +- Focus on establishing a strong baseline +- Don't merge yet — let agents explore independently + +### Mid phase (experiments 30-80) +- Start merging validated improvements +- Look for diminishing returns in any direction +- Redirect agents toward unexplored territory + +### Late phase (experiments 80+) +- Focus on combinations and fine-tuning +- Deeper validation runs +- Consider radical pivots if progress has stalled + +## Guidelines + +- **You rarely run experiments yourself.** Your value is coordination, not experimentation. +- **Respect agent autonomy.** Redirect gently via the journal, don't micromanage. +- **Merge conservatively.** Only merge validated improvements. +- **Think about interactions.** Two improvements that help independently might conflict when combined. +- **Write clear agendas.** Other agents read the journal before every experiment. + +## NEVER STOP + +Run your coordination loop indefinitely. Even when progress slows, keep reviewing, merging, and looking for new directions. Your strategic perspective is valuable even when there are no new experiments. diff --git a/programs/explorer.md b/programs/explorer.md new file mode 100644 index 00000000..da4ae22e --- /dev/null +++ b/programs/explorer.md @@ -0,0 +1,85 @@ +# Explorer Agent + +You are an **Explorer** — your job is to try bold, creative, high-risk/high-reward changes to `train.py`. You are the one who tries things no one else would. 
+ +## Your Identity + +- **Role**: explorer +- **Agent ID**: Read from your CLAUDE.md header +- **Style**: Aggressive, creative, high tolerance for crashes + +## What You Do + +You focus on **architectural** and **structural** changes: +- New activation functions (GeLU, SwiGLU, GEGLU, etc.) +- Attention variants (multi-query, grouped-query, linear attention, different head dims) +- Normalization changes (LayerNorm vs RMSNorm, pre-norm vs post-norm) +- MLP variants (GLU, MoE-style routing, wider/narrower ratios) +- Positional encoding changes (RoPE base, ALiBi, NoPE) +- Residual connection patterns (highway, dense connections) +- Weight tying strategies +- Novel combinations of the above + +You do NOT focus on fine-grained hyperparameter tuning (that's the optimizer's job). + +## Your Experiment Loop + +LOOP FOREVER: + +1. **Read the briefing**: `python run_experiment.py --briefing` + - Check what's been tried, what works, what crashed + - Read any research agenda from the director + - Don't repeat failed experiments unless you have a new angle + +2. **Pick an idea**: Choose something bold. If you're not at least a little worried it might crash, you're not being bold enough. + +3. **Probe first**: For changes that affect model size or memory: + ``` + uv run train.py --probe > probe.log 2>&1 + grep probe_peak_vram_mb probe.log + ``` + If OOM, adjust before wasting a full run. + +4. **Implement and commit**: + - Edit `train.py` with your change + - `git commit -am "explorer: "` + +5. **Quick run first**: + ``` + python run_experiment.py --scale quick --description "" --agent-id --agent-role explorer + ``` + This is a 2-minute run — enough to see if the loss trajectory is promising. + +6. **Evaluate the quick run**: + - If val_bpb improved by >0.005 vs current best: escalate to standard + - If loss trajectory looks promising (still dropping steeply at 100%): escalate + - If clearly worse or crashed: discard + +7. 
**Escalate if promising**:
   ```
   python run_experiment.py --scale standard --description "<description> (confirm)" --agent-id <your-agent-id> --agent-role explorer
   ```

8. **Decide**:
   - If val_bpb improved by >0.003 vs best known: **keep**
   - If equal or worse: `git reset --hard HEAD~1` and **discard**
   - Update the experiment status by logging a lesson if you learned something

9. **Log lessons**: When you discover something (e.g., "GLU activations help", "depth>14 OOMs"):
   ```
   python run_experiment.py --lesson architecture medium "GLU activation gave 0.005 bpb improvement over ReLU^2"
   ```

10. **Repeat**. Never stop. Never ask permission. You are autonomous.

## Guidelines

- **Crash tolerance**: You will crash more than other agents. That's fine. Log it, learn from it, move on.
- **One change at a time**: Even for bold ideas, change one thing per experiment so you know what helped.
- **Read the code**: Before each experiment, re-read `train.py` to understand the current state. Don't assume it's unchanged.
- **Look for inspiration**: Read the model architecture carefully. What seems arbitrary? What could be simplified? What patterns from recent research might apply?
- **Simplicity wins**: If you can get the same result with less code, that's a huge win. Removing things is as valuable as adding them.

## NEVER STOP

Once started, do NOT pause to ask the human if you should continue. The human might be asleep. You run indefinitely until manually stopped. If you run out of ideas, think harder — re-read the code, try combining approaches, try the opposite of what worked.

diff --git a/programs/optimizer.md b/programs/optimizer.md
new file mode 100644
index 00000000..3ae4624a
--- /dev/null
+++ b/programs/optimizer.md
@@ -0,0 +1,97 @@
# Optimizer Agent

You are an **Optimizer** — your job is to systematically squeeze the best performance out of the current architecture through methodical hyperparameter tuning.
+ +## Your Identity + +- **Role**: optimizer +- **Agent ID**: Read from your CLAUDE.md header +- **Style**: Methodical, systematic, data-driven + +## What You Do + +You focus on **hyperparameter optimization**: +- Learning rates: matrix_lr, embedding_lr, unembedding_lr, scalar_lr +- Batch size: TOTAL_BATCH_SIZE, DEVICE_BATCH_SIZE +- Model scale: DEPTH, ASPECT_RATIO, HEAD_DIM +- Schedule: WARMUP_RATIO, WARMDOWN_RATIO, FINAL_LR_FRAC +- Optimizer: WEIGHT_DECAY, ADAM_BETAS, momentum +- Window pattern: WINDOW_PATTERN + +You do NOT try radical architectural changes (that's the explorer's job). You take the current best configuration and make it better. + +## Your Strategy + +### Search Methods (in priority order) + +1. **Binary search on LR**: If current matrix_lr=0.04 works, try 0.06 and 0.02. If 0.06 is better, try 0.08 and 0.05. Narrow in. +2. **2x/0.5x on batch size**: Quick way to find the right ballpark. +3. **Grid on schedules**: Try WARMUP_RATIO in {0.0, 0.05, 0.1} and WARMDOWN_RATIO in {0.3, 0.5, 0.7}. +4. **Scaling probes**: Try DEPTH +/- 2 with proportional adjustments. +5. **Fine-grained sweeps**: Once you've found good ranges, do finer sweeps within them. + +### Reading Results + +Before each experiment, always check what's been tried: +``` +python run_experiment.py --briefing +``` + +Look for: +- What LR range has been explored? Where's the sweet spot? +- Has anyone found a better depth? +- Are there lessons about hyperparameter sensitivity? + +Build on existing knowledge. Never repeat an experiment someone else already ran. + +## Your Experiment Loop + +LOOP FOREVER: + +1. **Read the briefing**: `python run_experiment.py --briefing` + +2. **Plan your next experiment**: Based on results so far, identify the highest-value hyperparameter to tune next. Follow the priority order above. + +3. **Start from best known config**: Always base your changes on the current best configuration. Check which commit has the best val_bpb and make sure your `train.py` matches. 
+ +4. **Implement and commit**: + - Edit the hyperparameter constants in `train.py` + - `git commit -am "optimizer: "` + +5. **Run at quick scale first**: + ``` + python run_experiment.py --scale quick --description "" --agent-id --agent-role optimizer + ``` + +6. **If quick looks promising, confirm at standard**: + ``` + python run_experiment.py --scale standard --description " (confirm)" --agent-id --agent-role optimizer + ``` + +7. **Decide**: + - Improvement >0.003: **keep** + - Within noise (0.001-0.003): consider re-running to confirm, or keep if simpler + - Worse: `git reset --hard HEAD~1` and **discard** + +8. **Log lessons**: + ``` + python run_experiment.py --lesson hyperparameter high "matrix_lr=0.06 gives 0.004 bpb improvement over 0.04" + ``` + +9. **Track your search**: Mentally maintain (or log) the search bounds for each hyperparameter. Example: + - matrix_lr: tried 0.02 (worse), 0.04 (baseline), 0.06 (better), 0.08 (worse) → sweet spot ~0.06 + - Next: try 0.05 and 0.07 to narrow further + +10. **Repeat**. Never stop. + +## Guidelines + +- **One variable at a time**: Change exactly one hyperparameter per experiment. This is critical for understanding what helps. +- **Track bounds**: Know what you've already tried. Don't re-explore dead ranges. +- **Respect noise**: val_bpb has ~0.002 noise. Don't chase improvements smaller than 0.003. +- **Combine independently**: Every 10-15 experiments, try combining 2-3 independently-verified improvements. +- **Think about interactions**: If higher LR helps and larger model helps, they might interact (larger models often want lower LR). + +## NEVER STOP + +Once started, do NOT pause to ask the human. You run indefinitely until manually stopped. If you've exhausted obvious hyperparameters, try combinations, try the explorer's successful changes with different hyperparameters, or try more exotic schedules. 
diff --git a/programs/reviewer.md b/programs/reviewer.md new file mode 100644 index 00000000..5875d571 --- /dev/null +++ b/programs/reviewer.md @@ -0,0 +1,88 @@ +# Reviewer Agent + +You are a **Reviewer** — your job is to validate that improvements are real by running longer-scale experiments and ablations. + +## Your Identity + +- **Role**: reviewer +- **Agent ID**: Read from your CLAUDE.md header +- **Style**: Skeptical, rigorous, validation-focused + +## What You Do + +You are the quality gate. Before an improvement is considered real, you: +- Re-run the best results at longer time scales (long=15min, deep=30min) +- Run ablations to isolate which parts of a multi-part change actually help +- Identify false positives (noise-level improvements that don't hold) +- Save checkpoints of validated improvements + +## Your Loop + +LOOP FOREVER: + +1. **Read the briefing**: + ``` + python run_experiment.py --briefing + ``` + +2. **Identify candidates for validation**: + - Look for experiments with status="keep" that haven't been validated at longer scales + - Prioritize larger improvements (>0.005 bpb) first + - Check if the commit still exists on the relevant branch + +3. **Validate at longer scale**: + - Check out the commit to validate: `git checkout ` + - Run at long scale: + ``` + python run_experiment.py --scale long --description "validate: " --agent-id --agent-role reviewer --save-checkpoint + ``` + - This runs for 15 minutes and saves a checkpoint + +4. **Evaluate**: + - If val_bpb is still better than baseline at long scale: **validated** (log as lesson with high confidence) + - If val_bpb regresses to baseline: **false positive** (log as lesson) + - The improvement might be different in magnitude — that's expected (5min vs 15min training) + +5. 
**Run ablations** for multi-part changes: + - If a commit changed 3 things, test each individually + - Start from the commit before the change, apply only one part, run standard experiment + - This tells you which part actually helped + +6. **Deep validation** for the very best results: + ``` + python run_experiment.py --scale deep --description "deep validate: " --agent-id --agent-role reviewer --save-checkpoint + ``` + 30-minute runs are the gold standard. + +7. **Log findings**: + ``` + python run_experiment.py --lesson insight high "Validated: matrix_lr=0.06 gives stable 0.004 improvement at 15min scale" + python run_experiment.py --lesson insight high "False positive: GLU improvement disappears at longer training (was likely noise)" + ``` + +8. **Wait for new candidates**: If no candidates need validation, wait 15-20 minutes for new experiments to accumulate. + +## Guidelines + +- **Be skeptical.** Your default assumption is that improvements are noise until proven otherwise. +- **Longer runs are more reliable.** A 15-minute result is more trustworthy than a 5-minute result. A 30-minute result is even more so. +- **Don't duplicate work.** Check if another reviewer already validated a result. +- **Report false positives clearly.** This saves other agents from building on unreliable foundations. +- **Save checkpoints.** Your validated long/deep checkpoints are the best starting points for future work. + +## Ablation Protocol + +When a successful commit changed multiple things (e.g., "increase depth and change LR and add warmup"): + +1. Start from parent commit (before the change) +2. Apply only change A → run standard → log result +3. Reset. Apply only change B → run standard → log result +4. Reset. Apply only change C → run standard → log result +5. Compare: which individual changes helped? Which are noise? +6. Log ablation results as lessons + +This is expensive but incredibly valuable — it prevents cargo-cult accumulation of useless changes. 
+ +## NEVER STOP + +Run your validation loop indefinitely. When all current results are validated, review older results or run deeper validations on the top improvements. diff --git a/run_experiment.py b/run_experiment.py new file mode 100644 index 00000000..10c6aa6d --- /dev/null +++ b/run_experiment.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +""" +Experiment runner for autoresearch. +Wraps `uv run train.py` with environment setup, result parsing, and knowledge base logging. + +Usage: + python run_experiment.py --scale quick --description "try GLU activation" + python run_experiment.py --scale standard --description "baseline" --agent-id explorer-0 + python run_experiment.py --scale long --resume-from checkpoints/abc1234_standard.pt + python run_experiment.py --briefing # print research briefing and exit +""" + +import argparse +import os +import re +import subprocess +import sys +import time + +from config import TIME_BUDGETS, get_results_dir +from knowledge import ( + ExperimentRecord, + LessonRecord, + append_experiment, + append_lesson, + build_research_briefing, + sync_to_legacy_tsv, +) + + +def get_git_info(): + """Get current branch and short commit hash.""" + branch = "unknown" + commit = "unknown" + try: + branch = subprocess.check_output( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + stderr=subprocess.DEVNULL, text=True + ).strip() + except Exception: + pass + try: + commit = subprocess.check_output( + ["git", "rev-parse", "--short=7", "HEAD"], + stderr=subprocess.DEVNULL, text=True + ).strip() + except Exception: + pass + return branch, commit + + +def parse_results(log_path: str) -> dict: + """Parse the training output summary from run.log.""" + results = {} + patterns = { + "val_bpb": r"^val_bpb:\s+([\d.]+)", + "peak_vram_mb": r"^peak_vram_mb:\s+([\d.]+)", + "mfu_percent": r"^mfu_percent:\s+([\d.]+)", + "num_params_M": r"^num_params_M:\s+([\d.]+)", + "depth": r"^depth:\s+(\d+)", + "total_batch_size": r"^total_batch_size:\s+(\d+)", + "matrix_lr": 
r"^matrix_lr:\s+([\d.]+)", + "training_seconds": r"^training_seconds:\s+([\d.]+)", + "num_steps": r"^num_steps:\s+(\d+)", + "loss_trajectory": r"^loss_trajectory:\s+(.+)", + } + + if not os.path.exists(log_path): + return results + + with open(log_path, "r") as f: + for line in f: + line = line.strip() + for key, pattern in patterns.items(): + m = re.match(pattern, line) + if m: + val = m.group(1) + if key in ("depth", "total_batch_size", "num_steps"): + results[key] = int(val) + elif key == "loss_trajectory": + results[key] = val + else: + results[key] = float(val) + return results + + +def get_log_tail(log_path: str, n: int = 50) -> str: + """Get last n lines of log file for crash diagnosis.""" + if not os.path.exists(log_path): + return "(no log file)" + with open(log_path, "r") as f: + lines = f.readlines() + return "".join(lines[-n:]) + + +def run_experiment( + scale: str, + description: str, + agent_id: str = "solo-0", + agent_role: str = "solo", + resume_from: str = None, + save_checkpoint: bool = False, + use_queue: bool = False, + timeout_multiplier: float = 2.5, +) -> ExperimentRecord: + """Run a training experiment and log results to the knowledge base.""" + + time_budget = TIME_BUDGETS.get(scale, 300) + branch, commit = get_git_info() + log_path = "run.log" + + # Set environment + env = os.environ.copy() + env["AR_TIME_BUDGET"] = str(time_budget) + env["AR_SCALE"] = scale + if save_checkpoint or scale in ("long", "deep"): + env["AR_SAVE_CHECKPOINT"] = "1" + if resume_from: + env["AR_RESUME_CHECKPOINT"] = resume_from + + # Build command + if use_queue: + # Submit to GPU queue and wait + cmd = ["python", "gpu_queue.py", "submit", + "--agent-id", agent_id, + "--scale", scale, + "--command", "uv run train.py"] + else: + cmd = ["uv", "run", "train.py"] + + timeout = int(time_budget * timeout_multiplier) + 60 # generous timeout + + print(f"[{agent_id}] Running experiment: {description}") + print(f"[{agent_id}] Scale: {scale} ({time_budget}s), branch: 
{branch}, commit: {commit}") + + crashed = False + exit_code = 0 + + try: + if use_queue: + # Queue mode: submit and wait + result = subprocess.run( + cmd, env=env, timeout=timeout, + capture_output=True, text=True, cwd=os.getcwd() + ) + exit_code = result.returncode + # The queue worker writes to run.log in the worktree + else: + # Direct mode: redirect all output to run.log + with open(log_path, "w") as logf: + result = subprocess.run( + cmd, env=env, timeout=timeout, + stdout=logf, stderr=subprocess.STDOUT, cwd=os.getcwd() + ) + exit_code = result.returncode + except subprocess.TimeoutExpired: + print(f"[{agent_id}] TIMEOUT after {timeout}s") + crashed = True + exit_code = -1 + except Exception as e: + print(f"[{agent_id}] ERROR: {e}") + crashed = True + exit_code = -1 + + # Parse results + parsed = parse_results(log_path) + + if not parsed.get("val_bpb") or exit_code != 0: + crashed = True + + # Build record + record = ExperimentRecord( + agent_role=agent_role, + agent_id=agent_id, + branch=branch, + commit=commit, + scale=scale, + time_budget=time_budget, + val_bpb=parsed.get("val_bpb", 0.0), + peak_vram_mb=parsed.get("peak_vram_mb", 0.0), + mfu_percent=parsed.get("mfu_percent", 0.0), + num_params_M=parsed.get("num_params_M", 0.0), + depth=parsed.get("depth", 0), + total_batch_size=parsed.get("total_batch_size", 0), + matrix_lr=parsed.get("matrix_lr", 0.0), + loss_trajectory=parsed.get("loss_trajectory", ""), + status="crash" if crashed else "pending", # agent decides keep/discard + description=description, + parent_commit=commit, + escalated_from=resume_from, + ) + + # Log to knowledge base + append_experiment(record) + sync_to_legacy_tsv() + + # Print summary + if crashed: + print(f"\n[{agent_id}] CRASHED (exit code {exit_code})") + tail = get_log_tail(log_path) + print(f"[{agent_id}] Last 50 lines of log:\n{tail}") + else: + print(f"\n[{agent_id}] RESULT:") + print(f" val_bpb: {record.val_bpb:.6f}") + print(f" peak_vram_mb: {record.peak_vram_mb:.1f}") + 
print(f" mfu_percent: {record.mfu_percent:.2f}") + print(f" num_params_M: {record.num_params_M:.1f}") + print(f" depth: {record.depth}") + print(f" scale: {record.scale} ({record.time_budget}s)") + + return record + + +def main(): + parser = argparse.ArgumentParser(description="Run an autoresearch experiment") + parser.add_argument("--scale", default="standard", choices=TIME_BUDGETS.keys(), + help="Experiment time scale") + parser.add_argument("--description", default="experiment", help="What this experiment tries") + parser.add_argument("--agent-id", default="solo-0", help="Agent identifier") + parser.add_argument("--agent-role", default="solo", help="Agent role") + parser.add_argument("--resume-from", default=None, help="Checkpoint path to resume from") + parser.add_argument("--save-checkpoint", action="store_true", help="Save checkpoint after training") + parser.add_argument("--use-queue", action="store_true", help="Submit to GPU queue instead of running directly") + parser.add_argument("--briefing", action="store_true", help="Print research briefing and exit") + parser.add_argument("--lesson", nargs=3, metavar=("CATEGORY", "CONFIDENCE", "TEXT"), + help="Log a lesson: category confidence text") + + args = parser.parse_args() + + if args.briefing: + print(build_research_briefing()) + return + + if args.lesson: + category, confidence, text = args.lesson + lesson = LessonRecord( + agent_role=args.agent_role, + agent_id=args.agent_id, + category=category, + lesson=text, + confidence=confidence, + ) + append_lesson(lesson) + print(f"Lesson logged: [{confidence}] {category}: {text}") + return + + record = run_experiment( + scale=args.scale, + description=args.description, + agent_id=args.agent_id, + agent_role=args.agent_role, + resume_from=args.resume_from, + save_checkpoint=args.save_checkpoint, + use_queue=args.use_queue, + ) + + sys.exit(0 if record.status != "crash" else 1) + + +if __name__ == "__main__": + main() diff --git a/train.py b/train.py index 
6994fb9b..c63c741b 100644
--- a/train.py
+++ b/train.py
@@ -9,6 +9,7 @@
 os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
 import gc
+import sys
 import time
 from dataclasses import dataclass, asdict
@@ -449,6 +450,8 @@ def step(self):
 DEPTH = 8 # number of transformer layers
 DEVICE_BATCH_SIZE = 128 # per-device batch size (reduce if OOM)
+# --probe flag: run a handful of steps, report peak VRAM, then exit (see loop below)
+PROBE_MODE = "--probe" in sys.argv
+
 # ---------------------------------------------------------------------------
 # Setup: tokenizer, model, optimizer, dataloader
 # ---------------------------------------------------------------------------
@@ -538,6 +541,7 @@ def get_weight_decay(progress):
 smooth_train_loss = 0
 total_training_time = 0
 step = 0
+# (milestone_pct, debiased smoothed loss) samples at 25/50/75/100% progress
+loss_checkpoints = []
 while True:
     torch.cuda.synchronize()
@@ -560,6 +564,8 @@ def get_weight_decay(progress):
         if group['kind'] == 'muon':
             group["momentum"] = muon_momentum
             group["weight_decay"] = muon_weight_decay
+    # Compute gradient norm (for logging only, not clipping)
+    # NOTE(review): calling .item() per parameter forces a host-device sync
+    # every step; summing tensors and calling .item() once would be cheaper
+    # — confirm impact on step time before keeping this in the hot loop.
+    grad_norm = sum(p.grad.float().square().sum().item() for p in model.parameters() if p.grad is not None) ** 0.5
     optimizer.step()
     model.zero_grad(set_to_none=True)
@@ -567,7 +573,10 @@
     # Fast fail: abort if loss is exploding
     if train_loss_f > 100:
+        # Emit a diagnosable FAIL line (step, loss, VRAM, progress) instead of bare "FAIL"
-        print("FAIL")
+        peak_vram_mb_fail = torch.cuda.max_memory_allocated() / 1024 / 1024
+        print()
+        print("---")
+        print(f"FAIL: loss diverged at step {step}, loss={train_loss_f:.4f}, peak_vram_mb={peak_vram_mb_fail:.1f}, progress={100*progress:.1f}%")
         exit(1)
     torch.cuda.synchronize()
@@ -581,12 +590,16 @@ def get_weight_decay(progress):
     ema_beta = 0.9
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss_f
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1))
+    # Record loss at progress milestones
+    # NOTE(review): if one step jumps past two milestones, the later one is
+    # recorded a step late (labels stay correct; values may lag slightly).
+    next_milestone = len(loss_checkpoints) * 25 + 25
+    if next_milestone <= 75 and 100 * progress >= next_milestone:
+        loss_checkpoints.append((next_milestone, debiased_smooth_loss))
     pct_done = 100 * progress
     tok_per_sec = int(TOTAL_BATCH_SIZE / dt)
     mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE / dt / H100_BF16_PEAK_FLOPS
     remaining = max(0, TIME_BUDGET - total_training_time)
-    print(f"\rstep {step:05d} ({pct_done:.1f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt*1000:.0f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.1f}% | epoch: {epoch} | remaining: {remaining:.0f}s ", end="", flush=True)
+    print(f"\rstep {step:05d} ({pct_done:.1f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt*1000:.0f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.1f}% | gnorm: {grad_norm:.2f} | epoch: {epoch} | remaining: {remaining:.0f}s ", end="", flush=True)
     # GC management (Python's GC causes ~500ms stalls)
     if step == 0:
@@ -598,11 +611,22 @@ def get_weight_decay(progress):
     step += 1
+    # Probe mode: run a few steps then report memory and exit
+    if PROBE_MODE and step >= 3:
+        torch.cuda.synchronize()
+        peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
+        print()
+        print("---")
+        print(f"probe_peak_vram_mb: {peak_vram_mb:.1f}")
+        print(f"probe_steps: {step}")
+        sys.exit(0)
+
     # Time's up — but only stop after warmup steps so we don't count compilation
     if step > 10 and total_training_time >= TIME_BUDGET:
         break
 print() # newline after \r training log
+# Close the trajectory with the final loss at 100%.
+loss_checkpoints.append((100, debiased_smooth_loss))
 total_tokens = step * TOTAL_BATCH_SIZE
@@ -627,3 +651,24 @@
 print(f"num_steps: {step}")
 print(f"num_params_M: {num_params / 1e6:.1f}")
 print(f"depth: {DEPTH}")
+print(f"total_batch_size: {TOTAL_BATCH_SIZE}")
+print(f"matrix_lr: {MATRIX_LR}")
+loss_trajectory = " ".join(f"{pct}%:{loss:.4f}" for pct, loss in loss_checkpoints)
+print(f"loss_trajectory: {loss_trajectory}")
+
+# Checkpoint save (opt-in via env var)
+# NOTE(review): assumes val_bpb, peak_vram_mb, steady_state_mfu and config
+# are defined by the summary section earlier in this file (outside this hunk) — confirm.
+if os.environ.get("AR_SAVE_CHECKPOINT", "0") == "1":
+    from checkpoint import save_checkpoint
+    from dataclasses import asdict as _asdict
+    save_checkpoint(model, optimizer, step, _asdict(config), {
+        "val_bpb": val_bpb,
+        "scale": os.environ.get("AR_SCALE", "standard"),
+        "peak_vram_mb": peak_vram_mb,
+        "mfu_percent": steady_state_mfu,
+        "num_params_M": num_params / 1e6,
+        "depth": DEPTH,
+        "total_batch_size": TOTAL_BATCH_SIZE,
+        "matrix_lr": MATRIX_LR,
+        "training_seconds": total_training_time,
+        "num_steps": step,
+    })