From 467269418d3a565cc02584dde85a57508e8e61df Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Sun, 22 Feb 2026 13:15:30 +1100
Subject: [PATCH 01/17] feat(agents): basic deep-research example

---
 examples/deep-research/deep-research.mjs | 1096 ++++++++++++++++++++++
 test/examples.js                         |   53 ++
 2 files changed, 1149 insertions(+)
 create mode 100644 examples/deep-research/deep-research.mjs
diff --git a/examples/deep-research/deep-research.mjs b/examples/deep-research/deep-research.mjs
new file mode 100644
index 0000000..7667fa7
--- /dev/null
+++ b/examples/deep-research/deep-research.mjs
@@ -0,0 +1,1096 @@
+#!/usr/bin/env node
+/**
+ * Deep Research with Tool-Calling Agents via BranchStore
+ *
+ * Demonstrates three fork patterns in a multi-agent research pipeline:
+ *
+ * 1. PLAN:     Branch.create() + grammar — single constrained generation
+ * 2. RESEARCH: fork() + prefill() divergent suffixes — content-based divergence
+ *              from shared prefix, with tool-calling agentic loop
+ * 3. VERIFY:   fork() + reseed() same prompt — stochastic divergence for
+ *              convergence checking, then model-as-judge eval fork
+ *
+ * Search uses a Qwen3-Reranker-0.6B cross-encoder for semantic relevance
+ * scoring over a local corpus of markdown files. Both models (generative +
+ * reranker) are loaded simultaneously — Qwen3 family shares vocabulary.
+ *
+ * The key performance insight: BranchStore.commit() packs N branches into
+ * ONE llama_decode() call. N agents generate in lockstep with O(1) GPU
+ * dispatches per step, regardless of branch count.
+ *
+ * Usage:
+ *   node deep-research.mjs [model-path] --corpus <path> --query <text> [options]
+ *
+ * Required:
+ *   --corpus  path   Directory of .md files (or single .md file) to research
+ *   --query   text   Research question
+ *
+ * Options:
+ *   [model-path]     Path to generative model (default: models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf)
+ *   --reranker path  Reranker model path (default: qwen3-reranker-0.6b)
+ *   --jsonl          JSONL output for testing
+ *   --verbose        Show native llama.cpp logs
+ *
+ * Example:
+ *   node deep-research.mjs ./models/Qwen3-4B.gguf \
+ *     --corpus ~/docs --query "How does the auth system work?"
+ */
+
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import * as readline from 'node:readline';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const DEFAULT_MODEL = path.resolve(
+  __dirname,
+  '../../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf'
+);
+const DEFAULT_RERANKER = path.resolve(
+  __dirname,
+  '../../models/qwen3-reranker-0.6b-q4_k_m.gguf'
+);
+
+const args = process.argv.slice(2);
+const jsonlMode = args.includes('--jsonl');
+const verbose = args.includes('--verbose');
+
+function argVal(flag) {
+  const i = args.indexOf(flag);
+  return i !== -1 ? args[i + 1] : null;
+}
+const flagIndices = new Set(
+  ['--reranker', '--corpus', '--query'].flatMap((f) => {
+    const i = args.indexOf(f);
+    return i !== -1 ? [i, i + 1] : [];
+  })
+);
+
+const rerankModelPath = argVal('--reranker') || DEFAULT_RERANKER;
+const corpusDir = argVal('--corpus');
+const QUERY = argVal('--query');
+const modelPath = args.find((a, i) =>
+  !a.startsWith('--') && !flagIndices.has(i)
+) || DEFAULT_MODEL;
+
+if (!corpusDir || !QUERY) {
+  const missing = [
+    !corpusDir && '--corpus',
+    !QUERY && '--query',
+  ].filter(Boolean);
+  process.stdout.write(
+    `Usage: node deep-research.mjs [model-path] --corpus <path> --query <text> [--reranker <path>]\n` +
+    `Missing: ${missing.join(', ')}\n`
+  );
+  process.exit(1);
+}
+
+// ================================================================
+// Suppress native llama.cpp logs (C-level stderr) for clean output.
+// The native binary hasn't loaded yet (lazy on first createContext),
+// so redirecting fd 2 here catches all ggml/llama init logs.
+// Use --verbose to see them.
+// ================================================================
+if (!verbose && !jsonlMode) {
+  try {
+    fs.closeSync(2);
+    fs.openSync(process.platform === 'win32' ? '\\\\.\\NUL' : '/dev/null', 'w');
+  } catch { /* non-fatal — logs will show */ }
+}
+
+// ================================================================
+// DISPLAY — ANSI formatting for terminal output
+// ================================================================
+
+const isTTY = process.stdout.isTTY;
+const c = isTTY ? {
+  bold: '\x1b[1m', dim: '\x1b[2m', reset: '\x1b[0m',
+  green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
+} : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' };
+
+const log = (...a) => { if (!jsonlMode) console.log(...a); }; // human-readable output; silent in --jsonl mode
+
+function emit(event, data) { // machine-readable output: one JSON object per line, only in --jsonl mode
+  if (jsonlMode) console.log(JSON.stringify({ event, ...data }));
+}
+
+// ================================================================
+// CONSTANTS
+// ================================================================
+
+const AGENT_COUNT = 3;
+const VERIFY_COUNT = 3;
+const MAX_TOOL_TURNS = 6;
+
+// ================================================================
+// CORPUS — load and chunk at ## boundaries
+// ================================================================
+
+function loadCorpus() {
+  if (!fs.existsSync(corpusDir)) {
+    process.stdout.write(`Error: corpus not found: ${corpusDir}\n`);
+    process.exit(1);
+  }
+  const stat = fs.statSync(corpusDir);
+  if (stat.isFile()) {
+    return [{ name: path.basename(corpusDir), content: fs.readFileSync(corpusDir, 'utf8') }];
+  }
+  const files = fs.readdirSync(corpusDir).filter((f) => f.endsWith('.md'));
+  if (!files.length) {
+    process.stdout.write(`Error: no .md files in: ${corpusDir}\n`);
+    process.exit(1);
+  }
+  return files.map((f) => ({
+    name: f,
+    content: fs.readFileSync(path.join(corpusDir, f), 'utf8'),
+  }));
+}
+
+// Max chars per chunk — conservative estimate at ~3 chars/token for code-heavy
+// content, leaving room for reranker template overhead (~130 tokens).
+// With reranker nCtx=8192: budget ≈ 8000 tokens × 3 = 24000 chars.
+const CHUNK_CHAR_LIMIT = 24000;
+
+function chunkCorpus(files) {
+  const out = [];
+  for (const file of files) {
+    for (const section of file.content.split(/(?=^## )/m)) {
+      const heading = (section.match(/^##?\s+(.+)/m) || [, file.name])[1];
+      const trimmed = section.trim();
+      if (trimmed.length <= CHUNK_CHAR_LIMIT) {
+        out.push({ file: file.name, heading, text: trimmed });
+        continue;
+      }
+      // Sub-split oversized sections: ### → paragraph → hard truncate
+      for (const sub of subChunk(trimmed, heading)) {
+        out.push({ file: file.name, heading: sub.heading, text: sub.text });
+      }
+    }
+  }
+  return out;
+}
+
+function subChunk(text, parentHeading) {
+  // Try splitting at ### boundaries first
+  const subSections = text.split(/(?=^### )/m);
+  if (subSections.length > 1) {
+    const results = [];
+    for (const sub of subSections) {
+      const subHeading = (sub.match(/^###?\s+(.+)/m) || [, parentHeading])[1];
+      const trimmed = sub.trim();
+      if (trimmed.length <= CHUNK_CHAR_LIMIT) {
+        results.push({ heading: subHeading, text: trimmed });
+      } else {
+        // Still too large — fall through to paragraph splitting
+        results.push(...splitByParagraph(trimmed, subHeading));
+      }
+    }
+    return results;
+  }
+  // No ### headings — split by paragraphs
+  return splitByParagraph(text, parentHeading);
+}
+
+function splitByParagraph(text, heading) {
+  const paragraphs = text.split(/\n\n+/);
+  const results = [];
+  let current = '';
+  let partIndex = 0;
+
+  for (const para of paragraphs) {
+    if (current.length + para.length + 2 > CHUNK_CHAR_LIMIT && current.length > 0) {
+      results.push({ heading: `${heading} (${++partIndex})`, text: current.trim() });
+      current = '';
+    }
+    // Single paragraph exceeds limit — hard truncate
+    if (para.length > CHUNK_CHAR_LIMIT) {
+      if (current.length > 0) {
+        results.push({ heading: `${heading} (${++partIndex})`, text: current.trim() });
+        current = '';
+      }
+      results.push({ heading: `${heading} (${++partIndex})`, text: para.slice(0, CHUNK_CHAR_LIMIT) });
+      continue;
+    }
+    current += (current ? '\n\n' : '') + para;
+  }
+  if (current.trim()) {
+    results.push({ heading: `${heading} (${partIndex > 0 ? ++partIndex : ''})`.replace(/ \(\)$/, ''), text: current.trim() });
+  }
+  return results;
+}
+
+const corpus = loadCorpus();
+const chunks = chunkCorpus(corpus);
+
+// ================================================================
+// RERANKER — Qwen3-Reranker cross-encoder scoring via Branch API
+// ================================================================
+
+// Prompt template from Qwen3-Reranker model card: system (yes/no judge) +
+// user (<Instruct> + <Query> + <Document>) + empty think block prefix.
+const RERANK_PREFIX =
+  '<|im_start|>system\n' +
+  'Judge whether the Document meets the requirements based on the Query ' +
+  'and the Instruct provided. Note that the answer can only be "yes" or "no".' +
+  '<|im_end|>\n<|im_start|>user\n' +
+  '<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n\n' +
+  '<Query>: ';
+const RERANK_MID = '\n\n<Document>: ';
+const RERANK_SUFFIX = '<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n';
+
+let rerankCtx = null;
+let yesId = 0;
+let noId = 0;
+
+// Pre-tokenized template segments — populated after reranker loads.
+let rerankPrefixTokens = null; // RERANK_PREFIX (with BOS)
+let rerankMidTokens = null;    // RERANK_MID
+let rerankSuffixTokens = null; // RERANK_SUFFIX
+
+function rerankScore(logits) {
+  const max = Math.max(logits[yesId], logits[noId]);
+  const yesExp = Math.exp(logits[yesId] - max);
+  const noExp = Math.exp(logits[noId] - max);
+  return yesExp / (yesExp + noExp);
+}
+
+// ================================================================
+// TOOLS — reranker-backed search + snippet extraction
+// ================================================================
+
+async function toolSearch(query) {
+  const queryTokens = await rerankCtx.tokenize(query, false);
+  const scored = [];
+  for (const chunk of chunks) {
+    // Pre-tokenized segments — no string concat, no per-chunk tokenize().
+    // Boundary safety: all joints are at special tokens or newlines,
+    // which are explicit token boundaries in Qwen3's BPE vocabulary.
+    const tokens = [
+      ...rerankPrefixTokens, ...queryTokens,
+      ...rerankMidTokens, ...chunk.tokens,
+      ...rerankSuffixTokens,
+    ];
+    // Fresh branch per chunk — position must start at 0 each time.
+    const branch = Branch.create(rerankCtx, 0, { temperature: 0 });
+    await branch.prefill(tokens);
+    const score = rerankScore(branch.getLogits());
+    await branch.prune();
+    scored.push({ file: chunk.file, heading: chunk.heading, score: Math.round(score * 1000) / 1000 });
+  }
+  return scored.sort((a, b) => b.score - a.score).slice(0, 5);
+}
+
+function toolReadFile(filename, query) {
+  const file = corpus.find((f) => f.name === filename);
+  if (!file) {
+    return { error: `File not found: ${filename}. Available: ${corpus.map((f) => f.name).join(', ')}` };
+  }
+  if (!query) return { file: file.name, content: file.content.slice(0, 800) };
+  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);
+  const lines = file.content.split('\n');
+  const snippets = [];
+  const seen = new Set();
+  for (let i = 0; i < lines.length; i++) {
+    if (!terms.some((t) => lines[i].toLowerCase().includes(t))) continue;
+    const start = Math.max(0, i - 1);
+    const end = Math.min(lines.length, i + 4);
+    if (seen.has(start)) continue;
+    seen.add(start);
+    snippets.push(lines.slice(start, end).join('\n'));
+    if (snippets.length >= 3) break;
+  }
+  return snippets.length > 0
+    ? { file: file.name, snippets }
+    : { file: file.name, snippets: ['No matches for: ' + query] };
+}
+
+async function executeTool(name, toolArgs) {
+  switch (name) {
+    case 'search':
+      return toolSearch(toolArgs.query || '');
+    case 'read_file':
+      return toolReadFile(toolArgs.filename || toolArgs.path || '', toolArgs.query || '');
+    case 'report':
+      return { acknowledged: true };
+    default:
+      return { error: `Unknown tool: ${name}` };
+  }
+}
+
+const TOOLS = [
+  {
+    type: 'function',
+    function: {
+      name: 'search',
+      description: 'Search the knowledge base for relevant content. Returns sections ranked by semantic relevance.',
+      parameters: {
+        type: 'object',
+        properties: { query: { type: 'string', description: 'Search query' } },
+        required: ['query'],
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'read_file',
+      description: 'Extract relevant snippets from a specific file. Use query to target specific content.',
+      parameters: {
+        type: 'object',
+        properties: {
+          filename: { type: 'string', description: 'Filename from search results (e.g. "api-security.md")' },
+          query: { type: 'string', description: 'What to extract from the file' },
+        },
+        required: ['filename'],
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'report',
+      description: 'Submit your final research findings. Call this when you have gathered enough information to answer the question.',
+      parameters: {
+        type: 'object',
+        properties: { findings: { type: 'string', description: 'Your research findings and answer' } },
+        required: ['findings'],
+      },
+    },
+  },
+];
+
+const TOOLS_JSON = JSON.stringify(TOOLS);
+
+const AGENT_SYSTEM_PROMPT =
+  'You are a research assistant with access to a knowledge base. ' +
+  'Use the search and read_file tools to find information, then call report with your findings. ' +
+  'Be thorough: search first, read relevant files, then report. ' +
+  'Available files: ' + corpus.map((f) => f.name).join(', ');
+
+// ================================================================
+// HELPERS
+// ================================================================
+
+const sec = (a, b) => ((b - a) / 1000).toFixed(1);
+const pad = (s, n) => String(s).padStart(n);
+const fmtSize = (bytes) => bytes > 1e9
+  ? (bytes / 1e9).toFixed(1) + ' GB'
+  : (bytes / 1e6).toFixed(0) + ' MB';
+
+// ================================================================
+// MAIN
+// ================================================================
+
+// Dynamic import — native module loads here, after fd 2 redirect
+const { createContext, Branch, BranchStore } = await import('../../lib/index.js');
+
+async function main() {
+  const t0 = performance.now();
+
+  const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, '');
+  const rerankName = path.basename(rerankModelPath).replace(/-q\w+\.gguf$/i, '');
+  const modelSize = fmtSize(fs.statSync(modelPath).size);
+  const rerankSize = fmtSize(fs.statSync(rerankModelPath).size);
+
+  log();
+  log(`${c.bold}  Deep Research${c.reset} ${c.dim}— BranchStore Tool-Calling Agents${c.reset}`);
+  log();
+
+  emit('start', {
+    model: path.basename(modelPath),
+    reranker: path.basename(rerankModelPath),
+    query: QUERY,
+    agentCount: AGENT_COUNT,
+    verifyCount: VERIFY_COUNT,
+    chunks: chunks.length,
+  });
+
+  log(`  ${c.green}●${c.reset} Loading ${c.bold}${modelName}${c.reset} ${c.dim}(${modelSize}, KV: Q4_0)${c.reset}`);
+
+  // Load generative model
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '16384', 10);
+  const ctx = await createContext({
+    modelPath,
+    nCtx,
+    nSeqMax: AGENT_COUNT + 1,
+    typeK: 'q4_0',
+    typeV: 'q4_0',
+  });
+
+  log(`  ${c.green}●${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${rerankSize}, reranker)${c.reset}`);
+
+  // Load reranker (small — ~300 MB alongside the 4B generative model)
+  rerankCtx = await createContext({
+    modelPath: rerankModelPath,
+    nCtx: 8192,
+    nSeqMax: AGENT_COUNT,
+  });
+
+  // Pre-tokenize reranker template segments + chunk texts.
+  // Done once — saves N_chunks × tokenize() calls per search.
+  [yesId] = await rerankCtx.tokenize('yes', false);
+  [noId] = await rerankCtx.tokenize('no', false);
+  rerankPrefixTokens = await rerankCtx.tokenize(RERANK_PREFIX, true);
+  rerankMidTokens = await rerankCtx.tokenize(RERANK_MID, false);
+  rerankSuffixTokens = await rerankCtx.tokenize(RERANK_SUFFIX, false);
+  for (const chunk of chunks) {
+    chunk.tokens = await rerankCtx.tokenize(chunk.text, false);
+  }
+
+  const corpusIsFile = corpus.length === 1 && fs.statSync(corpusDir).isFile();
+  const corpusLabel = corpusIsFile
+    ? path.basename(corpusDir)
+    : `${path.basename(corpusDir)}/ — ${corpus.length} files`;
+  log(`  ${c.dim}  Corpus: ${corpusLabel} → ${chunks.length} chunks${c.reset}`);
+
+  const store = new BranchStore(ctx);
+  const sep = ctx.getTurnSeparator();
+
+  log();
+  log(`  ${c.dim}Query${c.reset}`);
+  log(`  ${c.bold}${QUERY}${c.reset}`);
+
+  // ================================================================
+  // PHASE 1: PLAN — Branch.create() + grammar
+  // ================================================================
+  const tPlan = performance.now();
+
+  const planSchema = {
+    type: 'object',
+    properties: {
+      questions: {
+        type: 'array',
+        items: { type: 'string' },
+        minItems: 2,
+        maxItems: AGENT_COUNT,
+      },
+    },
+    required: ['questions'],
+  };
+  const planGrammar = await ctx.jsonSchemaToGrammar(JSON.stringify(planSchema));
+
+  const planMessages = [
+    { role: 'system', content: 'You break research queries into sub-questions. Output JSON only.' },
+    { role: 'user', content: `Break this into ${AGENT_COUNT} independent sub-questions for parallel research: "${QUERY}"` },
+  ];
+  const { prompt: planPrompt } = await ctx.formatChat(JSON.stringify(planMessages));
+  const planTokens = await ctx.tokenize(planPrompt);
+
+  const lead = Branch.create(ctx, 0, { temperature: 0.3 }, undefined, planGrammar);
+  await lead.prefill(planTokens);
+
+  let planOutput = '';
+  let planTokenCount = 0;
+  for await (const { text } of lead) {
+    planOutput += text;
+    planTokenCount++;
+  }
+  await lead.prune();
+
+  let questions;
+  try {
+    const plan = JSON.parse(planOutput);
+    questions = plan.questions.slice(0, AGENT_COUNT);
+    if (!questions.length) throw new Error('empty questions');
+  } catch {
+    questions = Array.from({ length: AGENT_COUNT }, (_, i) => `${QUERY} (aspect ${i + 1})`);
+  }
+
+  emit('plan', { questions, planTokens: planTokenCount });
+
+  // ================================================================
+  // PHASE 2: RESEARCH — fork() + prefill() divergent suffixes + tools
+  // ================================================================
+  const tResearch = performance.now();
+
+  log();
+  log(`  ${c.green}●${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${planTokenCount} tok · ${sec(tPlan, tResearch)}s${c.reset}`);
+  questions.forEach((q, i) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
+
+  // Shared prefix: system prompt + tool definitions, NO assistant prompt
+  const sharedMessages = [{ role: 'system', content: AGENT_SYSTEM_PROMPT }];
+  const sharedFmt = await ctx.formatChat(
+    JSON.stringify(sharedMessages),
+    { tools: TOOLS_JSON, addGenerationPrompt: false }
+  );
+  const sharedTokens = await ctx.tokenize(sharedFmt.prompt);
+
+  // Root branch — prefill shared prefix once
+  const agentRoot = Branch.create(ctx, 0, { temperature: 0.5 });
+  await agentRoot.prefill(sharedTokens);
+
+  // Fork N agents, compute divergent suffixes via token slicing
+  const agents = [];
+  for (const q of questions) {
+    const branch = await agentRoot.fork();
+
+    const fullMessages = [
+      { role: 'system', content: AGENT_SYSTEM_PROMPT },
+      { role: 'user', content: q },
+    ];
+    const fmt = await ctx.formatChat(JSON.stringify(fullMessages), { tools: TOOLS_JSON });
+    const fullTokens = await ctx.tokenize(fmt.prompt);
+    const suffixTokens = fullTokens.slice(sharedTokens.length);
+
+    agents.push({
+      branch,
+      suffixTokens,
+      fmt: {
+        format: fmt.format,
+        reasoningFormat: fmt.reasoningFormat,
+        thinkingForcedOpen: fmt.thinkingForcedOpen,
+        parser: fmt.parser,
+      },
+      messages: [...fullMessages],
+      rawOutput: '',
+      done: false,
+      tokenCount: 0,
+      toolCallCount: 0,
+      turns: 0,
+      findings: null,
+    });
+  }
+  // agentRoot pruned after agents are done (can't prune parent with live children)
+
+  // Batched prefill — only the unique suffixes
+  await store.prefill(agents.map((w) => [w.branch, w.suffixTokens]));
+
+  emit('research_start', {
+    agentCount: agents.length,
+    sharedPrefixTokens: sharedTokens.length,
+  });
+
+  log();
+  log(`  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${agents.length} agents · shared prefix ${sharedTokens.length} tok${c.reset}`);
+
+  // Reranker mutex — serializes llama_decode calls on rerankCtx so
+  // concurrent fire-and-forget searches don't race.
+  let rerankLock = Promise.resolve(); // tail of the wait chain; starts out already resolved
+  function withRerankLock(fn) {
+    const prev = rerankLock; // promise installed by the call queued immediately before this one
+    let release;
+    rerankLock = new Promise((r) => { release = r; }); // later callers wait on this until release() runs
+    return prev.then(fn).finally(release); // run fn after prev; release whether fn resolves or rejects
+  }
+
+  // ── runAgentSwarm — reusable agentic loop ──
+  //
+  // Three-phase tick:
+  //   PRODUCE — produceSync() on active (non-pending, non-done) agents
+  //   COMMIT  — store.commit() for tokens produced this tick
+  //   SETTLE  — non-blocking check for resolved tools, batch warm-prefill
+  //
+  // Tool calls are dispatched as promises and don't block generation.
+  // Active agents keep producing tokens while tools run in background.
+  // When tools resolve, their agents get batched warm-prefilled back in.
+  async function runAgentSwarm(agents) {
+    let steps = 0;
+    let totalToolCalls = 0;
+    const counters = {
+      warmPrefillCalls: 0,
+      warmPrefillBranches: 0,
+      stalledTicks: 0,
+      maxConcurrentTools: 0,
+      idleTicks: 0,
+    };
+
+    // pendingTools: Map<agentIndex, { promise, name }>
+    const pendingTools = new Map();
+
+    function dispatchTool(ai, w, tc, parsed) {
+      let toolArgs;
+      try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
+      const callId = tc.id || `call_${w.toolCallCount}`;
+
+      w.toolCallCount++;
+      totalToolCalls++;
+      w.turns++;
+
+      emit('tool_call', { agentIndex: ai, toolName: tc.name, arguments: tc.arguments });
+      const argSummary = tc.name === 'search'
+        ? `"${toolArgs.query || ''}"`
+        : toolArgs.filename + (toolArgs.query ? `, "${toolArgs.query}"` : '');
+      log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${tc.name}${c.reset}(${argSummary})`);
+
+      const promise = (async () => {
+        try {
+          const result = tc.name === 'search'
+            ? await withRerankLock(() => executeTool(tc.name, toolArgs))
+            : await executeTool(tc.name, toolArgs);
+
+          const resultStr = JSON.stringify(result);
+          emit('tool_result', {
+            agentIndex: ai, toolName: tc.name,
+            result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr,
+          });
+          log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${tc.name} ${resultStr.length}b${c.reset}`);
+
+          w.messages.push({
+            role: 'assistant', content: parsed.content,
+            ...(parsed.reasoningContent && { reasoning_content: parsed.reasoningContent }),
+            tool_calls: [{ type: 'function', function: { name: tc.name, arguments: tc.arguments }, id: callId }],
+          });
+          w.messages.push({ role: 'tool', content: resultStr, tool_call_id: callId });
+
+          // Format warm prefill tokens — the assistant's tool-call turn is
+          // already in KV from generation; `sep` closes it. This formats just
+          // the tool-result turn + new assistant prompt.
+          const { prompt } = await ctx.formatChat(
+            JSON.stringify([
+              { role: 'system', content: '' },
+              { role: 'tool', content: resultStr, tool_call_id: callId },
+            ])
+          );
+          const delta = await ctx.tokenize(prompt, false);
+          return { ai: ai, prefillTokens: [...sep, ...delta] };
+        } catch (err) {
+          w.done = true;
+          w.findings = `Tool error: ${err.message}`;
+          return { ai: ai, prefillTokens: null };
+        }
+      })();
+
+      pendingTools.set(ai, { promise, name: tc.name });
+      counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingTools.size);
+    }
+
+    for (;;) {
+      // ── Phase 1: PRODUCE — sample from active agents ──
+      const entries = [];
+      for (let ai = 0; ai < agents.length; ai++) {
+        const w = agents[ai];
+        if (w.done || pendingTools.has(ai)) continue;
+
+        const { token, text, isStop } = w.branch.produceSync();
+        if (isStop) {
+          const parsed = ctx.parseChatOutput(w.rawOutput, w.fmt.format, {
+            reasoningFormat: w.fmt.reasoningFormat,
+            thinkingForcedOpen: w.fmt.thinkingForcedOpen,
+            parser: w.fmt.parser,
+          });
+
+          const tc = parsed.toolCalls[0];
+          if (!tc || w.turns >= MAX_TOOL_TURNS) {
+            w.done = true;
+            if (!w.findings && parsed.content) w.findings = parsed.content;
+            continue;
+          }
+
+          if (tc.name === 'report') {
+            try { w.findings = JSON.parse(tc.arguments).findings; } catch { w.findings = tc.arguments; }
+            w.done = true;
+            w.toolCallCount++;
+            totalToolCalls++;
+            emit('tool_call', { agentIndex: ai, toolName: 'report', arguments: tc.arguments });
+            log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}report${c.reset}`);
+            continue;
+          }
+
+          // Fire-and-forget — dispatch tool without blocking the decode loop
+          dispatchTool(ai, w, tc, parsed);
+          w.rawOutput = '';
+          continue;
+        }
+
+        entries.push([w.branch, token]);
+        w.rawOutput += text;
+        w.tokenCount++;
+      }
+
+      // ── Phase 2: COMMIT — batch-decode produced tokens ──
+      if (entries.length > 0) {
+        await store.commit(entries);
+        steps++;
+      }
+
+      // ── Phase 3: SETTLE — non-blocking check for resolved tools ──
+      const prefillPairs = [];
+      for (const [ai, info] of pendingTools) {
+        const result = await Promise.race([info.promise, Promise.resolve(null)]);
+        if (result !== null) {
+          pendingTools.delete(ai);
+          if (result.prefillTokens) {
+            prefillPairs.push([agents[result.ai].branch, result.prefillTokens]);
+          }
+        }
+      }
+
+      if (prefillPairs.length > 0) {
+        await store.prefill(prefillPairs);
+        counters.warmPrefillCalls++;
+        counters.warmPrefillBranches += prefillPairs.length;
+      }
+
+      // ── Termination + idle yield ──
+      const allDone = agents.every((w) => w.done) && pendingTools.size === 0;
+      if (allDone) break;
+
+      if (entries.length === 0 && pendingTools.size > 0) {
+        counters.stalledTicks++;
+        if (prefillPairs.length === 0) {
+          // Nothing produced, nothing settled — yield until a tool resolves
+          await Promise.race([...pendingTools.values()].map((i) => i.promise));
+          counters.idleTicks++;
+        }
+      }
+    }
+
+    const totalTokens = agents.reduce((s, w) => s + w.tokenCount, 0);
+    return { totalTokens, totalToolCalls, steps, counters };
+  }
+
+  // ── forkAgent — fork from conversation trunk with own system prompt ──
+  //
+  // Different from Phase 2's shared-prefix optimization (agentRoot + slice).
+  // This pattern injects a full system prompt + tools via sep + delta onto
+  // an existing conversation branch — used for follow-up research agents.
+  async function forkAgent(trunk, systemPrompt, task, opts = {}) {
+    const branch = await trunk.fork();
+    const messages = [
+      { role: 'system', content: systemPrompt },
+      { role: 'user', content: task },
+    ];
+    const fmtOpts = opts.tools ? { tools: opts.tools } : {};
+    const fmt = await ctx.formatChat(JSON.stringify(messages), fmtOpts);
+    const suffixTokens = [...sep, ...await ctx.tokenize(fmt.prompt, false)];
+    return {
+      branch, suffixTokens,
+      fmt: { format: fmt.format, reasoningFormat: fmt.reasoningFormat,
+             thinkingForcedOpen: fmt.thinkingForcedOpen, parser: fmt.parser },
+      messages: [...messages],
+      rawOutput: '', done: false, tokenCount: 0,
+      toolCallCount: 0, turns: 0, findings: null,
+    };
+  }
+
+  const { totalTokens: totalAgentTokens, totalToolCalls, steps: researchSteps, counters } =
+    await runAgentSwarm(agents);
+
+  for (let i = 0; i < agents.length; i++) {
+    const w = agents[i];
+    const isLast = i === agents.length - 1;
+    const branch = isLast ? '└' : '├';
+
+    emit('agent_done', {
+      index: i,
+      question: questions[i],
+      findings: (w.findings || '').slice(0, 500),
+      toolCalls: w.toolCallCount,
+      turns: w.turns,
+      tokenCount: w.tokenCount,
+    });
+
+    log(`    ${c.dim}${branch}${c.reset} ${c.yellow}${i}${c.reset} ${c.green}done${c.reset} ${c.dim}${w.tokenCount} tok · ${w.toolCallCount} tools${c.reset}`);
+
+    await w.branch.prune();
+  }
+  await agentRoot.prune();
+
+  // ================================================================
+  // PHASE 3: VERIFY — fork() + reseed() + eval fork
+  // ================================================================
+  const tVerify = performance.now();
+
+  log(`    ${c.dim}${totalAgentTokens} tok · ${totalToolCalls} tools · ${sec(tResearch, tVerify)}s${c.reset}`);
+
+  const findingsText = agents
+    .map((w, i) => `Q: ${questions[i]}\nA: ${(w.findings || '').trim()}`)
+    .join('\n\n');
+
+  const synthMessages = [
+    { role: 'system', content: 'Synthesize the research findings into a coherent, concise summary.' },
+    { role: 'user', content: `Research findings:\n\n${findingsText}\n\nSynthesize these into a brief summary answering: "${QUERY}"` },
+  ];
+  const { prompt: synthPrompt } = await ctx.formatChat(JSON.stringify(synthMessages));
+  const synthTokens = await ctx.tokenize(synthPrompt);
+
+  const synthRoot = Branch.create(ctx, 0, { temperature: 0.7 });
+  await synthRoot.prefill(synthTokens);
+
+  emit('verify_start', {
+    attemptCount: VERIFY_COUNT,
+    prefixTokens: synthTokens.length,
+  });
+
+  log();
+  log(`  ${c.green}●${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${VERIFY_COUNT} attempts · shared prefix ${synthTokens.length} tok${c.reset}`);
+
+  const attempts = [];
+  for (let i = 0; i < VERIFY_COUNT; i++) {
+    const branch = await synthRoot.fork();
+    branch.reseedSampler(2000 + i);
+    attempts.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity });
+  }
+  // synthRoot pruned after attempts are done (can't prune parent with live children)
+
+  let verifySteps = 0;
+  for (;;) {
+    const entries = [];
+    for (const a of attempts) {
+      if (a.done) continue;
+      const { token, text, isStop } = a.branch.produceSync();
+      if (isStop) {
+        const p = a.branch.perplexity;
+        a.ppl = Number.isFinite(p) ? p : Infinity;
+        a.done = true;
+        continue;
+      }
+      entries.push([a.branch, token]);
+      a.output += text;
+      a.tokenCount++;
+    }
+    if (entries.length === 0) break;
+    await store.commit(entries);
+    verifySteps++;
+  }
+
+  const totalVerifyTokens = attempts.reduce((s, a) => s + a.tokenCount, 0);
+  for (let i = 0; i < attempts.length; i++) {
+    const isLast = i === attempts.length - 1;
+    const branch = isLast ? '└' : '├';
+
+    emit('attempt_done', {
+      index: i,
+      output: attempts[i].output.trim().slice(0, 500),
+      tokenCount: attempts[i].tokenCount,
+      ppl: attempts[i].ppl,
+    });
+
+    log(`    ${c.dim}${branch} ${attempts[i].tokenCount} tok · ppl ${attempts[i].ppl.toFixed(2)}${c.reset}`);
+  }
+
+  // Pick lowest perplexity synthesis (most coherent) — same as best-of-n.mjs
+  // Selected before pruning so we can keep the best branch alive for follow-up.
+  const bestAttempt = attempts.reduce((a, b) => a.ppl <= b.ppl ? a : b);
+
+  for (const a of attempts) { if (a !== bestAttempt) await a.branch.prune(); }
+  // synthRoot is still alive here: the surviving attempt shares its physical
+  // KV entries via seq_id tags. It is released below, after winner selection.
+
+  // Eval fork — model-as-judge
+  const tEval = performance.now();
+
+  log(`    ${c.dim}${totalVerifyTokens} tok · ${sec(tVerify, tEval)}s${c.reset}`);
+
+  const responsesText = attempts
+    .map((a, i) => `Response ${i + 1}: ${a.output.trim()}`)
+    .join('\n\n');
+
+  const evalMessages = [
+    {
+      role: 'system',
+      content: 'You are a consistency checker. Compare the responses and determine if they convey the same core meaning. Output JSON only.',
+    },
+    {
+      role: 'user',
+      content: `Do these responses agree on the key points?\n\n${responsesText}`,
+    },
+  ];
+
+  const evalSchema = {
+    type: 'object',
+    properties: { converged: { type: 'boolean' } },
+    required: ['converged'],
+  };
+  const evalGrammar = await ctx.jsonSchemaToGrammar(JSON.stringify(evalSchema));
+
+  const { prompt: evalPrompt } = await ctx.formatChat(JSON.stringify(evalMessages));
+  const evalTokens = await ctx.tokenize(evalPrompt);
+
+  const evalBranch = Branch.create(ctx, 0, { temperature: 0 }, undefined, evalGrammar);
+  await evalBranch.prefill(evalTokens);
+
+  let evalOutput = '';
+  let evalTokenCount = 0;
+  for await (const { text } of evalBranch) {
+    evalOutput += text;
+    evalTokenCount++;
+  }
+  await evalBranch.prune();
+
+  let converged;
+  try {
+    converged = JSON.parse(evalOutput).converged;
+  } catch {
+    converged = null;
+  }
+
+  emit('convergence', { evalOutput, evalTokens: evalTokenCount, converged });
+
+  // ================================================================
+  // COMPLETE
+  // ================================================================
+  const tEnd = performance.now();
+
+  const verdict = converged === true ? `${c.green}yes${c.reset}` : converged === false ? `${c.red}no${c.reset}` : `${c.yellow}unknown${c.reset}`;
+  log();
+  log(`  ${c.green}●${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${evalTokenCount} tok · ${sec(tEval, tEnd)}s${c.reset}`);
+  log(`    Converged: ${verdict}`);
+
+  log();
+  log(`  ${c.dim}${'─'.repeat(58)}${c.reset}`);
+  log();
+  const prose = bestAttempt.output.trim()
+    .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`)
+    .split('\n').map((l) => `  ${l}`).join('\n');
+  log(prose);
+  log();
+
+  const totalTokens = planTokenCount + totalAgentTokens + totalVerifyTokens + evalTokenCount;
+
+  emit('complete', {
+    planTokens: planTokenCount,
+    agentTokens: totalAgentTokens,
+    researchSteps,
+    verifyTokens: totalVerifyTokens,
+    verifySteps,
+    evalTokens: evalTokenCount,
+    converged,
+    totalToolCalls,
+    prefixTokens: synthTokens.length,
+    sharedPrefixTokens: sharedTokens.length,
+    agentCount: questions.length,
+    attemptCount: attempts.length,
+    wallTimeMs: Math.round(tEnd - t0),
+    planMs: Math.round(tResearch - tPlan),
+    researchMs: Math.round(tVerify - tResearch),
+    verifyMs: Math.round(tEval - tVerify),
+    evalMs: Math.round(tEnd - tEval),
+    ...counters,
+  });
+
+  log();
+  log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
+  log(`  ${c.dim}Plan       ${pad(planTokenCount, 5)} tok${' '.repeat(30)}${pad(sec(tPlan, tResearch), 6)}s${c.reset}`);
+  log(`  ${c.dim}Research   ${pad(totalAgentTokens, 5)} tok  (${agents.map((w) => w.tokenCount).join(' + ')})  ${pad(totalToolCalls, 2)} tools  ${pad(sec(tResearch, tVerify), 6)}s${c.reset}`);
+  log(`  ${c.dim}Verify     ${pad(totalVerifyTokens, 5)} tok  (${attempts.map((a) => a.tokenCount).join(' + ')})${' '.repeat(11)}${pad(sec(tVerify, tEval), 6)}s${c.reset}`);
+  log(`  ${c.dim}Eval       ${pad(evalTokenCount, 5)} tok  converged: ${(converged === true ? 'yes' : converged === false ? 'no' : 'unknown').padEnd(14)}${pad(sec(tEval, tEnd), 6)}s${c.reset}`);
+  const kvSaved = sharedTokens.length * (agents.length - 1) + synthTokens.length * (attempts.length - 1);
+  log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
+  log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok  ${c.dim}${agents.length} agents · ${totalToolCalls} tools${c.reset}         ${c.bold}${pad(sec(t0, tEnd), 6)}s${c.reset}`);
+  log(`  ${c.dim}KV shared    ${sharedTokens.length} × ${agents.length - 1} + ${synthTokens.length} × ${attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved${c.reset}`);
+  log();
+
+  if (jsonlMode) {
+    await bestAttempt.branch.prune();
+    await synthRoot.prune();
+    rerankCtx.dispose();
+    ctx.dispose();
+    return;
+  }
+
+  // ================================================================
+  // INTERACTIVE — readline follow-up loop with agent-swarm research
+  // ================================================================
+
+  // retainOnly strips all sequences except the winner from KV in one pass.
+  // synthRoot's slot is freed, winner's topology resets to root (standalone).
+  // This frees AGENT_COUNT seq_ids for follow-up research agents.
+  await store.retainOnly(bestAttempt.branch);
+
+  log(`  ${c.dim}Ask a follow-up question or /quit to exit${c.reset}`);
+  log();
+
+  await new Promise((resolve) => {
+    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
+    const followUpBranch = bestAttempt.branch;
+    let exiting = false;
+    let generating = false;
+    let eofWhileGenerating = false;
+
+    async function exit() {
+      if (exiting) return;
+      exiting = true;
+      rl.close();
+      await followUpBranch.prune();
+      rerankCtx.dispose();
+      ctx.dispose();
+      resolve();
+    }
+
+    const ask = () => {
+      if (exiting) return;
+      rl.question(`  ${c.dim}>${c.reset} `, handleInput);
+    };
+
+    async function handleInput(input) {
+      const trimmed = input.trim();
+      if (!trimmed || trimmed === '/quit') {
+        await exit();
+        return;
+      }
+
+      generating = true;
+
+      // Fork AGENT_COUNT research agents from the conversation trunk.
+      // Each agent inherits full conversation KV (back-references resolve
+      // naturally), gets reseeded for search diversity.
+      log(`  ${c.dim}  researching...${c.reset}`);
+
+      const followUpAgents = [];
+      for (let i = 0; i < AGENT_COUNT; i++) {
+        const agent = await forkAgent(followUpBranch, AGENT_SYSTEM_PROMPT, trimmed, { tools: TOOLS_JSON });
+        agent.branch.reseedSampler(Date.now() + i);
+        followUpAgents.push(agent);
+      }
+
+      // Batch prefill all agents' divergent suffixes
+      await store.prefill(followUpAgents.map((a) => [a.branch, a.suffixTokens]));
+
+      // Run parallel research with batched decode
+      const swarmResult = await runAgentSwarm(followUpAgents);
+
+      log(`  ${c.dim}  ${swarmResult.totalToolCalls} tools · ${swarmResult.totalTokens} tok${c.reset}`);
+
+      // Collect findings from all agents
+      const agentFindings = followUpAgents
+        .map((a, i) => a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
+        .filter(Boolean)
+        .join('\n\n');
+
+      // Prune all agent branches — their findings are captured
+      for (const a of followUpAgents) await a.branch.prune();
+
+      // Format findings + question as user turn, prefill into trunk
+      const groundedContent = agentFindings
+        ? `Research findings:\n${agentFindings}\n\nUser question: ${trimmed}\n\nAnswer based on the research findings above.`
+        : trimmed;
+
+      const { prompt } = await ctx.formatChat(
+        JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content: groundedContent }])
+      );
+      const delta = await ctx.tokenize(prompt, false);
+      await followUpBranch.prefill([...sep, ...delta]);
+
+      // Generate grounded response
+      process.stdout.write(`  ${c.dim}<${c.reset} `);
+      for await (const { text } of followUpBranch) {
+        process.stdout.write(text);
+      }
+      console.log('\n');
+
+      generating = false;
+
+      if (eofWhileGenerating) {
+        await exit();
+      } else {
+        ask();
+      }
+    }
+
+    rl.on('close', () => {
+      if (generating) {
+        eofWhileGenerating = true;
+      } else {
+        exit();
+      }
+    });
+    ask();
+  });
+}
+
+main().catch((err) => {
+  // stderr is redirected in quiet mode — use stdout for errors (non-Error throws are stringified)
+  process.stdout.write(err instanceof Error ? `Error: ${err.message}\n${err.stack}\n` : `Error: ${String(err)}\n`);
+  process.exit(1);
+});
diff --git a/test/examples.js b/test/examples.js
index e2ab9ba..dedce3c 100644
--- a/test/examples.js
+++ b/test/examples.js
@@ -27,6 +27,16 @@ const EMBED_MODEL_PATH = process.env.EMBED_MODEL_PATH
   ? path.resolve(process.env.EMBED_MODEL_PATH)
   : path.join(__dirname, '../liblloyal/tests/fixtures/nomic-embed-text-v1.5.Q4_K_M.gguf');
 
+// Qwen3 model for deep-research (tool-calling, instruct model)
+const QWEN3_PATH = process.env.QWEN3_MODEL
+  ? path.resolve(process.env.QWEN3_MODEL)
+  : path.join(__dirname, '../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf');
+
+// Qwen3 reranker for deep-research semantic search
+const RERANKER_PATH = process.env.RERANKER_MODEL
+  ? path.resolve(process.env.RERANKER_MODEL)
+  : path.join(__dirname, '../models/qwen3-reranker-0.6b-q4_k_m.gguf');
+
 
 if (!fs.existsSync(MODEL_PATH)) {
   console.error('❌ Test model not found!');
@@ -296,6 +306,49 @@ const EXAMPLES = {
       assert(complete, 'should have complete event');
     },
   },
+
+  'deep-research': {
+    path: 'deep-research/deep-research.mjs',
+    timeout: 300000,
+    modelPath: QWEN3_PATH,
+    extraArgs: [
+      '--reranker', RERANKER_PATH,
+      '--corpus', process.env.DEEP_RESEARCH_CORPUS || '',
+      '--query', process.env.DEEP_RESEARCH_QUERY || '',
+    ],
+    skip: !fs.existsSync(QWEN3_PATH) || !fs.existsSync(RERANKER_PATH)
+      || !process.env.DEEP_RESEARCH_CORPUS || !process.env.DEEP_RESEARCH_QUERY,
+    skipReason: 'Requires QWEN3_MODEL, RERANKER_MODEL, DEEP_RESEARCH_CORPUS, and DEEP_RESEARCH_QUERY env vars',
+    validate(events) {
+      const start = events.find(e => e.event === 'start');
+      assert(start, 'should have start event');
+      assert(start.agentCount === 3, 'should have 3 agents');
+      assert(start.chunks > 0, 'should have corpus chunks');
+
+      const plan = events.find(e => e.event === 'plan');
+      assert(plan, 'should have plan event');
+      assert(plan.questions.length >= 2, 'should plan at least 2 sub-questions');
+
+      const researchStart = events.find(e => e.event === 'research_start');
+      assert(researchStart, 'should have research_start event');
+      assert(researchStart.sharedPrefixTokens > 0, 'should have shared prefix');
+
+      const toolCalls = events.filter(e => e.event === 'tool_call');
+      assert(toolCalls.length > 0, 'should make at least one tool call');
+
+      const agentsDone = events.filter(e => e.event === 'agent_done');
+      assert(agentsDone.length === 3, 'all 3 agents should finish');
+      for (const a of agentsDone) {
+        assert(a.tokenCount > 0, `agent ${a.index} should generate tokens`);
+      }
+
+      const complete = events.find(e => e.event === 'complete');
+      assert(complete, 'should have complete event');
+      assert(complete.totalToolCalls > 0, 'should have tool calls');
+      assert(complete.wallTimeMs > 0, 'should have wall time');
+      assert(complete.converged !== undefined, 'should have convergence result');
+    },
+  },
 };
 
 async function runTest(name, config) {

From 682523d9f099bca0461241416c742a48cfa75a89 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Sun, 22 Feb 2026 21:04:17 +1100
Subject: [PATCH 02/17] feat(agents): initial abstractions

---
 examples/deep-research/deep-research.mjs | 284 ++++++-----------------
 lib/Agent.js                             | 216 +++++++++++++++++
 lib/Session.js                           |  93 ++++++++
 lib/index.d.ts                           | 223 ++++++++++++++++++
 lib/index.js                             |  17 ++
 5 files changed, 622 insertions(+), 211 deletions(-)
 create mode 100644 lib/Agent.js
 create mode 100644 lib/Session.js

diff --git a/examples/deep-research/deep-research.mjs b/examples/deep-research/deep-research.mjs
index 7667fa7..1e29bfd 100644
--- a/examples/deep-research/deep-research.mjs
+++ b/examples/deep-research/deep-research.mjs
@@ -382,7 +382,7 @@ const fmtSize = (bytes) => bytes > 1e9
 // ================================================================
 
 // Dynamic import — native module loads here, after fd 2 redirect
-const { createContext, Branch, BranchStore } = await import('../../lib/index.js');
+const { createContext, Branch, BranchStore, Session, forkAgent, runAgents } = await import('../../lib/index.js');
 
 async function main() {
   const t0 = performance.now();
@@ -444,7 +444,6 @@ async function main() {
   log(`  ${c.dim}  Corpus: ${corpusLabel} → ${chunks.length} chunks${c.reset}`);
 
   const store = new BranchStore(ctx);
-  const sep = ctx.getTurnSeparator();
 
   log();
   log(`  ${c.dim}Query${c.reset}`);
@@ -541,7 +540,6 @@ async function main() {
         thinkingForcedOpen: fmt.thinkingForcedOpen,
         parser: fmt.parser,
       },
-      messages: [...fullMessages],
       rawOutput: '',
       done: false,
       tokenCount: 0,
@@ -563,8 +561,12 @@ async function main() {
   log();
   log(`  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${agents.length} agents · shared prefix ${sharedTokens.length} tok${c.reset}`);
 
-  // Reranker mutex — serializes llama_decode calls on rerankCtx so
-  // concurrent fire-and-forget searches don't race.
+  // Reranker mutex — serializes llama_decode calls on rerankCtx.
+  // Fire-and-forget tool dispatch means multiple agents can dispatch search
+  // concurrently; _branchPrefill runs on the libuv thread pool, so concurrent
+  // calls race llama_decode on the same llama_context. BranchStore serializes
+  // via batched decode (one llama_decode per commit/prefill), but individual
+  // Branch.prefill calls on rerankCtx bypass that.
   let rerankLock = Promise.resolve();
   function withRerankLock(fn) {
     const prev = rerankLock;
@@ -573,198 +575,34 @@ async function main() {
     return prev.then(fn).finally(release);
   }
 
-  // ── runAgentSwarm — reusable agentic loop ──
-  //
-  // Three-phase tick:
-  //   PRODUCE — produceSync() on active (non-pending, non-done) agents
-  //   COMMIT  — store.commit() for tokens produced this tick
-  //   SETTLE  — non-blocking check for resolved tools, batch warm-prefill
-  //
-  // Tool calls are dispatched as promises and don't block generation.
-  // Active agents keep producing tokens while tools run in background.
-  // When tools resolve, their agents get batched warm-prefilled back in.
-  async function runAgentSwarm(agents) {
-    let steps = 0;
-    let totalToolCalls = 0;
-    const counters = {
-      warmPrefillCalls: 0,
-      warmPrefillBranches: 0,
-      stalledTicks: 0,
-      maxConcurrentTools: 0,
-      idleTicks: 0,
-    };
-
-    // pendingTools: Map<agentIndex, { promise, name }>
-    const pendingTools = new Map();
-
-    function dispatchTool(ai, w, tc, parsed) {
-      let toolArgs;
-      try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
-      const callId = tc.id || `call_${w.toolCallCount}`;
-
-      w.toolCallCount++;
-      totalToolCalls++;
-      w.turns++;
-
-      emit('tool_call', { agentIndex: ai, toolName: tc.name, arguments: tc.arguments });
-      const argSummary = tc.name === 'search'
-        ? `"${toolArgs.query || ''}"`
-        : toolArgs.filename + (toolArgs.query ? `, "${toolArgs.query}"` : '');
-      log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${tc.name}${c.reset}(${argSummary})`);
-
-      const promise = (async () => {
-        try {
-          const result = tc.name === 'search'
-            ? await withRerankLock(() => executeTool(tc.name, toolArgs))
-            : await executeTool(tc.name, toolArgs);
-
-          const resultStr = JSON.stringify(result);
-          emit('tool_result', {
-            agentIndex: ai, toolName: tc.name,
-            result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr,
-          });
-          log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${tc.name} ${resultStr.length}b${c.reset}`);
-
-          w.messages.push({
-            role: 'assistant', content: parsed.content,
-            ...(parsed.reasoningContent && { reasoning_content: parsed.reasoningContent }),
-            tool_calls: [{ type: 'function', function: { name: tc.name, arguments: tc.arguments }, id: callId }],
-          });
-          w.messages.push({ role: 'tool', content: resultStr, tool_call_id: callId });
-
-          // Format warm prefill tokens — the assistant's tool-call turn is
-          // already in KV from generation; `sep` closes it. This formats just
-          // the tool-result turn + new assistant prompt.
-          const { prompt } = await ctx.formatChat(
-            JSON.stringify([
-              { role: 'system', content: '' },
-              { role: 'tool', content: resultStr, tool_call_id: callId },
-            ])
-          );
-          const delta = await ctx.tokenize(prompt, false);
-          return { ai: ai, prefillTokens: [...sep, ...delta] };
-        } catch (err) {
-          w.done = true;
-          w.findings = `Tool error: ${err.message}`;
-          return { ai: ai, prefillTokens: null };
-        }
-      })();
-
-      pendingTools.set(ai, { promise, name: tc.name });
-      counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingTools.size);
-    }
-
-    for (;;) {
-      // ── Phase 1: PRODUCE — sample from active agents ──
-      const entries = [];
-      for (let ai = 0; ai < agents.length; ai++) {
-        const w = agents[ai];
-        if (w.done || pendingTools.has(ai)) continue;
-
-        const { token, text, isStop } = w.branch.produceSync();
-        if (isStop) {
-          const parsed = ctx.parseChatOutput(w.rawOutput, w.fmt.format, {
-            reasoningFormat: w.fmt.reasoningFormat,
-            thinkingForcedOpen: w.fmt.thinkingForcedOpen,
-            parser: w.fmt.parser,
-          });
-
-          const tc = parsed.toolCalls[0];
-          if (!tc || w.turns >= MAX_TOOL_TURNS) {
-            w.done = true;
-            if (!w.findings && parsed.content) w.findings = parsed.content;
-            continue;
-          }
-
-          if (tc.name === 'report') {
-            try { w.findings = JSON.parse(tc.arguments).findings; } catch { w.findings = tc.arguments; }
-            w.done = true;
-            w.toolCallCount++;
-            totalToolCalls++;
-            emit('tool_call', { agentIndex: ai, toolName: 'report', arguments: tc.arguments });
-            log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}report${c.reset}`);
-            continue;
-          }
-
-          // Fire-and-forget — dispatch tool without blocking the decode loop
-          dispatchTool(ai, w, tc, parsed);
-          w.rawOutput = '';
-          continue;
-        }
-
-        entries.push([w.branch, token]);
-        w.rawOutput += text;
-        w.tokenCount++;
-      }
-
-      // ── Phase 2: COMMIT — batch-decode produced tokens ──
-      if (entries.length > 0) {
-        await store.commit(entries);
-        steps++;
-      }
-
-      // ── Phase 3: SETTLE — non-blocking check for resolved tools ──
-      const prefillPairs = [];
-      for (const [ai, info] of pendingTools) {
-        const result = await Promise.race([info.promise, Promise.resolve(null)]);
-        if (result !== null) {
-          pendingTools.delete(ai);
-          if (result.prefillTokens) {
-            prefillPairs.push([agents[result.ai].branch, result.prefillTokens]);
-          }
-        }
-      }
-
-      if (prefillPairs.length > 0) {
-        await store.prefill(prefillPairs);
-        counters.warmPrefillCalls++;
-        counters.warmPrefillBranches += prefillPairs.length;
-      }
-
-      // ── Termination + idle yield ──
-      const allDone = agents.every((w) => w.done) && pendingTools.size === 0;
-      if (allDone) break;
-
-      if (entries.length === 0 && pendingTools.size > 0) {
-        counters.stalledTicks++;
-        if (prefillPairs.length === 0) {
-          // Nothing produced, nothing settled — yield until a tool resolves
-          await Promise.race([...pendingTools.values()].map((i) => i.promise));
-          counters.idleTicks++;
-        }
-      }
-    }
-
-    const totalTokens = agents.reduce((s, w) => s + w.tokenCount, 0);
-    return { totalTokens, totalToolCalls, steps, counters };
-  }
-
-  // ── forkAgent — fork from conversation trunk with own system prompt ──
-  //
-  // Different from Phase 2's shared-prefix optimization (agentRoot + slice).
-  // This pattern injects a full system prompt + tools via sep + delta onto
-  // an existing conversation branch — used for follow-up research agents.
-  async function forkAgent(trunk, systemPrompt, task, opts = {}) {
-    const branch = await trunk.fork();
-    const messages = [
-      { role: 'system', content: systemPrompt },
-      { role: 'user', content: task },
-    ];
-    const fmtOpts = opts.tools ? { tools: opts.tools } : {};
-    const fmt = await ctx.formatChat(JSON.stringify(messages), fmtOpts);
-    const suffixTokens = [...sep, ...await ctx.tokenize(fmt.prompt, false)];
-    return {
-      branch, suffixTokens,
-      fmt: { format: fmt.format, reasoningFormat: fmt.reasoningFormat,
-             thinkingForcedOpen: fmt.thinkingForcedOpen, parser: fmt.parser },
-      messages: [...messages],
-      rawOutput: '', done: false, tokenCount: 0,
-      toolCallCount: 0, turns: 0, findings: null,
-    };
-  }
+  const executeToolLocked = (name, args) =>
+    name === 'search'
+      ? withRerankLock(() => executeTool(name, args))
+      : executeTool(name, args);
 
   const { totalTokens: totalAgentTokens, totalToolCalls, steps: researchSteps, counters } =
-    await runAgentSwarm(agents);
+    await runAgents(agents, {
+      store, ctx,
+      executeTool: executeToolLocked,
+      maxTurns: MAX_TOOL_TURNS,
+      onToolCall(ai, toolName, args) {
+        emit('tool_call', { agentIndex: ai, toolName, arguments: args });
+        let toolArgs;
+        try { toolArgs = JSON.parse(args); } catch { toolArgs = {}; }
+        const argSummary = toolName === 'search'
+          ? `"${toolArgs.query || ''}"`
+          : toolName === 'report' ? ''
+          : toolArgs.filename + (toolArgs.query ? `, "${toolArgs.query}"` : '');
+        log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+      },
+      onToolResult(ai, toolName, resultStr) {
+        emit('tool_result', {
+          agentIndex: ai, toolName,
+          result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr,
+        });
+        log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`);
+      },
+    });
 
   for (let i = 0; i < agents.length; i++) {
     const w = agents[i];
@@ -983,17 +821,16 @@ async function main() {
   // INTERACTIVE — readline follow-up loop with agent-swarm research
   // ================================================================
 
-  // retainOnly strips all sequences except the winner from KV in one pass.
-  // synthRoot's slot is freed, winner's topology resets to root (standalone).
-  // This frees AGENT_COUNT seq_ids for follow-up research agents.
-  await store.retainOnly(bestAttempt.branch);
+  // Session manages trunk lifecycle — promote crowns winner, freeing
+  // AGENT_COUNT seq_ids for follow-up research agents.
+  const session = new Session({ ctx, store });
+  await session.promote(bestAttempt.branch);
 
   log(`  ${c.dim}Ask a follow-up question or /quit to exit${c.reset}`);
   log();
 
   await new Promise((resolve) => {
     const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
-    const followUpBranch = bestAttempt.branch;
     let exiting = false;
     let generating = false;
     let eofWhileGenerating = false;
@@ -1002,7 +839,7 @@ async function main() {
       if (exiting) return;
       exiting = true;
       rl.close();
-      await followUpBranch.prune();
+      await session.dispose();
       rerankCtx.dispose();
       ctx.dispose();
       resolve();
@@ -1014,6 +851,7 @@ async function main() {
     };
 
     async function handleInput(input) {
+      try {
       const trimmed = input.trim();
       if (!trimmed || trimmed === '/quit') {
         await exit();
@@ -1029,8 +867,12 @@ async function main() {
 
       const followUpAgents = [];
       for (let i = 0; i < AGENT_COUNT; i++) {
-        const agent = await forkAgent(followUpBranch, AGENT_SYSTEM_PROMPT, trimmed, { tools: TOOLS_JSON });
-        agent.branch.reseedSampler(Date.now() + i);
+        const agent = await forkAgent(session.trunk, {
+          systemPrompt: AGENT_SYSTEM_PROMPT,
+          content: trimmed,
+          tools: TOOLS_JSON,
+          seed: Date.now() + i,
+        }, ctx);
         followUpAgents.push(agent);
       }
 
@@ -1038,7 +880,26 @@ async function main() {
       await store.prefill(followUpAgents.map((a) => [a.branch, a.suffixTokens]));
 
       // Run parallel research with batched decode
-      const swarmResult = await runAgentSwarm(followUpAgents);
+      const swarmResult = await runAgents(followUpAgents, {
+        store, ctx,
+        executeTool: executeToolLocked,
+        maxTurns: MAX_TOOL_TURNS,
+        onToolCall(ai, toolName, args) {
+          emit('tool_call', { agentIndex: ai, toolName, arguments: args });
+          let toolArgs;
+          try { toolArgs = JSON.parse(args); } catch { toolArgs = {}; }
+          const argSummary = toolName === 'search'
+            ? `"${toolArgs.query || ''}"`
+            : toolName === 'report' ? ''
+            : toolArgs.filename + (toolArgs.query ? `, "${toolArgs.query}"` : '');
+          log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+        },
+        onToolResult(ai, toolName, resultStr) {
+          emit('tool_result', { agentIndex: ai, toolName,
+            result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr });
+          log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`);
+        },
+      });
 
       log(`  ${c.dim}  ${swarmResult.totalToolCalls} tools · ${swarmResult.totalTokens} tok${c.reset}`);
 
@@ -1051,20 +912,16 @@ async function main() {
       // Prune all agent branches — their findings are captured
       for (const a of followUpAgents) await a.branch.prune();
 
-      // Format findings + question as user turn, prefill into trunk
+      // Format findings + question as user turn, prefill into trunk via Session
       const groundedContent = agentFindings
         ? `Research findings:\n${agentFindings}\n\nUser question: ${trimmed}\n\nAnswer based on the research findings above.`
         : trimmed;
 
-      const { prompt } = await ctx.formatChat(
-        JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content: groundedContent }])
-      );
-      const delta = await ctx.tokenize(prompt, false);
-      await followUpBranch.prefill([...sep, ...delta]);
+      await session.prefillUser(groundedContent);
 
       // Generate grounded response
       process.stdout.write(`  ${c.dim}<${c.reset} `);
-      for await (const { text } of followUpBranch) {
+      for await (const { text } of session.trunk) {
         process.stdout.write(text);
       }
       console.log('\n');
@@ -1076,6 +933,11 @@ async function main() {
       } else {
         ask();
       }
+      } catch (err) {
+        log(`  ${c.red}Error: ${err.message}${c.reset}`);
+        generating = false;
+        if (eofWhileGenerating) await exit(); else ask(); // stdin closed mid-run: rl.question would fail, so shut down instead
+      }
     }
 
     rl.on('close', () => {
diff --git a/lib/Agent.js b/lib/Agent.js
new file mode 100644
index 0000000..4f05a2a
--- /dev/null
+++ b/lib/Agent.js
@@ -0,0 +1,216 @@
+/**
+ * Agent - forkAgent + runAgents
+ *
+ * Two exported functions for the agentic loop pattern:
+ * - forkAgent: fork from parent, format task, compute suffix tokens
+ * - runAgents: three-phase tick loop (PRODUCE -> COMMIT -> SETTLE)
+ *
+ * Decoupled from Session — takes ctx directly, operates on agent branches.
+ * Consumer wires tool dispatch, callbacks, and Session separately.
+ */
+
+/**
+ * Fork an agent from a parent branch with its own system prompt + task.
+ *
+ * Always prepends getTurnSeparator() — forces clean break before agent's
+ * system prompt. Returns AgentState ready for store.prefill().
+ *
+ * @param {Branch} parent - Branch to fork from
+ * @param {{ systemPrompt: string, content: string, tools?: string, seed?: number }} task
+ * @param {SessionContext} ctx
+ * @returns {Promise<AgentState>}
+ */
+async function forkAgent(parent, task, ctx) {
+  const branch = await parent.fork();
+  const messages = [
+    { role: 'system', content: task.systemPrompt },
+    { role: 'user', content: task.content },
+  ];
+  const fmtOpts = task.tools ? { tools: task.tools } : {};
+  const fmt = await ctx.formatChat(JSON.stringify(messages), fmtOpts);
+  const sep = ctx.getTurnSeparator();
+  const suffixTokens = [...sep, ...await ctx.tokenize(fmt.prompt, false)];
+  if (task.seed != null) branch.reseedSampler(task.seed);
+  return {
+    branch,
+    suffixTokens,
+    fmt: {
+      format: fmt.format,
+      reasoningFormat: fmt.reasoningFormat,
+      thinkingForcedOpen: fmt.thinkingForcedOpen,
+      parser: fmt.parser,
+    },
+    rawOutput: '',
+    done: false,
+    tokenCount: 0,
+    toolCallCount: 0,
+    turns: 0,
+    findings: null,
+  };
+}
+
+/**
+ * Run agents in a batched three-phase tick loop.
+ *
+ * Mechanics preserved from runAgentSwarm:
+ * - Three-phase tick: PRODUCE -> COMMIT -> SETTLE
+ * - Fire-and-forget tool dispatch (tools run while other agents generate)
+ * - Warm prefill with sep + delta when tools resolve
+ * - `report` tool as completion signal (not dispatched to executeTool)
+ * - Non-blocking settle via Promise.race
+ * - Idle yield when all active agents are pending tools
+ *
+ * @param {AgentState[]} agents
+ * @param {{
+ *   store: BranchStore,
+ *   ctx: SessionContext,
+ *   executeTool: (name: string, args: object) => Promise<any>,
+ *   maxTurns?: number,
+ *   onToolCall?: (agentIndex: number, toolName: string, args: string) => void,
+ *   onToolResult?: (agentIndex: number, toolName: string, resultStr: string) => void,
+ *   onReport?: (agentIndex: number, findings: string) => void,
+ * }} opts
+ * @returns {Promise<{ totalTokens: number, totalToolCalls: number, steps: number, counters: object }>}
+ */
+async function runAgents(agents, opts) {
+  const { store, ctx, executeTool, maxTurns = 6, onToolCall, onToolResult, onReport } = opts;
+  const sep = ctx.getTurnSeparator();
+
+  let steps = 0;
+  let totalToolCalls = 0;
+  const counters = {
+    warmPrefillCalls: 0,
+    warmPrefillBranches: 0,
+    stalledTicks: 0,
+    maxConcurrentTools: 0,
+    idleTicks: 0,
+  };
+
+  // pendingTools: Map<agentIndex, { promise, name }>
+  const pendingTools = new Map();
+
+  function dispatchTool(ai, w, tc) {
+    let toolArgs;
+    try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
+    const callId = tc.id || `call_${w.toolCallCount}`;
+
+    w.toolCallCount++;
+    totalToolCalls++;
+    w.turns++;
+
+    if (onToolCall) onToolCall(ai, tc.name, tc.arguments);
+
+    const promise = (async () => {
+      try {
+        const result = await executeTool(tc.name, toolArgs);
+        const resultStr = JSON.stringify(result);
+
+        if (onToolResult) onToolResult(ai, tc.name, resultStr);
+
+        // Format warm prefill tokens — the assistant's tool-call turn is
+        // already in KV from generation; sep closes it.
+        const { prompt } = await ctx.formatChat(
+          JSON.stringify([
+            { role: 'system', content: '' },
+            { role: 'tool', content: resultStr, tool_call_id: callId },
+          ])
+        );
+        const delta = await ctx.tokenize(prompt, false);
+        return { ai, prefillTokens: [...sep, ...delta] };
+      } catch (err) {
+        w.done = true;
+        w.findings = `Tool error: ${err.message}`;
+        return { ai, prefillTokens: null };
+      }
+    })();
+
+    pendingTools.set(ai, { promise, name: tc.name });
+    counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingTools.size);
+  }
+
+  for (;;) {
+    // -- Phase 1: PRODUCE -- sample from active agents
+    const entries = [];
+    for (let ai = 0; ai < agents.length; ai++) {
+      const w = agents[ai];
+      if (w.done || pendingTools.has(ai)) continue;
+
+      const { token, text, isStop } = w.branch.produceSync();
+      if (isStop) {
+        const parsed = ctx.parseChatOutput(w.rawOutput, w.fmt.format, {
+          reasoningFormat: w.fmt.reasoningFormat,
+          thinkingForcedOpen: w.fmt.thinkingForcedOpen,
+          parser: w.fmt.parser,
+        });
+
+        const tc = parsed.toolCalls[0];
+        if (!tc || w.turns >= maxTurns) {
+          w.done = true;
+          if (!w.findings && parsed.content) w.findings = parsed.content;
+          continue;
+        }
+
+        if (tc.name === 'report') {
+          try { w.findings = JSON.parse(tc.arguments).findings; } catch { w.findings = tc.arguments; }
+          w.done = true;
+          w.toolCallCount++;
+          totalToolCalls++;
+          if (onToolCall) onToolCall(ai, 'report', tc.arguments);
+          if (onReport) onReport(ai, w.findings);
+          continue;
+        }
+
+        // Fire-and-forget — dispatch tool without blocking the decode loop
+        dispatchTool(ai, w, tc);
+        w.rawOutput = '';
+        continue;
+      }
+
+      entries.push([w.branch, token]);
+      w.rawOutput += text;
+      w.tokenCount++;
+    }
+
+    // -- Phase 2: COMMIT -- batch-decode produced tokens
+    if (entries.length > 0) {
+      await store.commit(entries);
+      steps++;
+    }
+
+    // -- Phase 3: SETTLE -- non-blocking check for resolved tools
+    const prefillPairs = [];
+    for (const [ai, info] of pendingTools) {
+      const result = await Promise.race([info.promise, Promise.resolve(null)]);
+      if (result !== null) {
+        pendingTools.delete(ai);
+        if (result.prefillTokens) {
+          prefillPairs.push([agents[result.ai].branch, result.prefillTokens]);
+        }
+      }
+    }
+
+    if (prefillPairs.length > 0) {
+      await store.prefill(prefillPairs);
+      counters.warmPrefillCalls++;
+      counters.warmPrefillBranches += prefillPairs.length;
+    }
+
+    // -- Termination + idle yield
+    const allDone = agents.every((w) => w.done) && pendingTools.size === 0;
+    if (allDone) break;
+
+    if (entries.length === 0 && pendingTools.size > 0) {
+      counters.stalledTicks++;
+      if (prefillPairs.length === 0) {
+        // Nothing produced, nothing settled — yield until a tool resolves
+        await Promise.race([...pendingTools.values()].map((i) => i.promise));
+        counters.idleTicks++;
+      }
+    }
+  }
+
+  const totalTokens = agents.reduce((s, w) => s + w.tokenCount, 0);
+  return { totalTokens, totalToolCalls, steps, counters };
+}
+
+module.exports = { forkAgent, runAgents };
diff --git a/lib/Session.js b/lib/Session.js
new file mode 100644
index 0000000..64b019a
--- /dev/null
+++ b/lib/Session.js
@@ -0,0 +1,93 @@
+/**
+ * Session - Trunk lifecycle + conversation delta helpers
+ *
+ * Owns the current "trunk" branch — the single conversation thread that
+ * persists across agent swarms and follow-up turns. Provides promote()
+ * to crown a winner (retainOnly + reassign), and delta helpers that
+ * centralize the sep + formatChat + tokenize + prefill pattern.
+ *
+ * Session does NOT own the SessionContext or BranchStore — the consumer
+ * creates those and passes them in. dispose() prunes trunk only.
+ */
+class Session {
+  /**
+   * @param {{ ctx: SessionContext, store: BranchStore }} opts
+   */
+  constructor({ ctx, store }) {
+    this._ctx = ctx;
+    this._store = store;
+    this._trunk = null;
+  }
+
+  /** @returns {Branch|null} Current trunk branch */
+  get trunk() {
+    return this._trunk;
+  }
+
+  /** @param {Branch} branch - Assign initial trunk (no promote) */
+  set trunk(branch) {
+    this._trunk = branch;
+  }
+
+  /**
+   * Promote a winner to trunk — retainOnly + reassign
+   *
+   * Safe even if winner is the only branch (resets topology, no-op on KV).
+   * @param {Branch} winner
+   */
+  async promote(winner) {
+    await this._store.retainOnly(winner);
+    this._trunk = winner;
+  }
+
+  /**
+   * Dispose trunk only — consumer owns ctx and other resources
+   */
+  async dispose() {
+    if (this._trunk && !this._trunk.disposed) {
+      await this._trunk.prune();
+    }
+    this._trunk = null;
+  }
+
+  /**
+   * Prefill a user turn into trunk
+   *
+   * Centralizes: sep + formatChat([system:'', user:content]) + tokenize(false) + prefill
+   *
+   * @param {string} content - User message content
+   * @param {{ tools?: string }} [opts]
+   */
+  async prefillUser(content, opts = {}) {
+    const sep = this._ctx.getTurnSeparator();
+    const fmtOpts = opts.tools ? { tools: opts.tools } : {};
+    const { prompt } = await this._ctx.formatChat(
+      JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]),
+      fmtOpts
+    );
+    const delta = await this._ctx.tokenize(prompt, false);
+    await this._trunk.prefill([...sep, ...delta]);
+  }
+
+  /**
+   * Prefill a tool result turn into trunk
+   *
+   * Centralizes: sep + formatChat([system:'', tool:result]) + tokenize(false) + prefill
+   *
+   * @param {string} resultStr - JSON-stringified tool result
+   * @param {string} callId - Tool call ID
+   */
+  async prefillToolResult(resultStr, callId) {
+    const sep = this._ctx.getTurnSeparator();
+    const { prompt } = await this._ctx.formatChat(
+      JSON.stringify([
+        { role: 'system', content: '' },
+        { role: 'tool', content: resultStr, tool_call_id: callId },
+      ])
+    );
+    const delta = await this._ctx.tokenize(prompt, false);
+    await this._trunk.prefill([...sep, ...delta]);
+  }
+}
+
+module.exports = { Session };
diff --git a/lib/index.d.ts b/lib/index.d.ts
index 6c79a1c..b10d48e 100644
--- a/lib/index.d.ts
+++ b/lib/index.d.ts
@@ -2099,3 +2099,226 @@ export class BranchStore {
   /** Number of available seq_id leases */
   readonly available: number;
 }
+
+// ================================================================
+// Agent primitives
+// ================================================================
+
+/**
+ * Task description for forkAgent
+ *
+ * @category Branching
+ */
+export interface AgentTask {
+  /** System prompt for the agent (sent as the `system` message) */
+  systemPrompt: string;
+  /** User content / question for the agent (sent as the `user` message) */
+  content: string;
+  /** JSON-stringified tool definitions, forwarded to formatChat when set (optional) */
+  tools?: string;
+  /** Sampler seed; when not null/undefined the forked branch is reseeded with it (optional) */
+  seed?: number;
+}
+
+/**
+ * State of a single agent in a runAgents loop (mutated in place by runAgents)
+ *
+ * Returned by forkAgent(). Also constructible manually for shared-prefix
+ * patterns (Phase 2 agentRoot + slice).
+ *
+ * @category Branching
+ */
+export interface AgentState {
+  /** The agent's branch */
+  branch: Branch;
+  /** Tokens to prefill before the loop starts (forkAgent: turn separator + tokenized prompt) */
+  suffixTokens: number[];
+  /** Format metadata from formatChat, passed to parseChatOutput */
+  fmt: {
+    format: ChatFormat;
+    reasoningFormat: ReasoningFormat;
+    thinkingForcedOpen: boolean;
+    parser: string;
+  };
+  /** Raw output text accumulated for the current turn (cleared after each tool dispatch) */
+  rawOutput: string;
+  /** Whether the agent has finished (no tool call, maxTurns reached, report, or tool error) */
+  done: boolean;
+  /** Number of tokens generated (the stop token itself is not counted) */
+  tokenCount: number;
+  /** Number of tool calls made, including a final `report` call */
+  toolCallCount: number;
+  /** Number of tool calls dispatched to executeTool so far (`report` is not counted); checked against maxTurns */
+  turns: number;
+  /** Final findings (report tool arguments, fallback content, or a `Tool error: …` message) */
+  findings: string | null;
+}
+
+/**
+ * Options for runAgents
+ *
+ * @category Branching
+ */
+export interface RunAgentsOptions {
+  /** BranchStore for commit/prefill */
+  store: BranchStore;
+  /** SessionContext for parseChatOutput, formatChat, tokenize, getTurnSeparator */
+  ctx: SessionContext;
+  /** Tool executor — consumer wraps with locks as needed; a rejection ends that agent with a `Tool error: …` finding */
+  executeTool: (name: string, args: Record<string, unknown>) => Promise<unknown>;
+  /** Maximum tool-call turns per agent (default: 6) */
+  maxTurns?: number;
+  /** Called when an agent dispatches a tool call (also fired for `report`); args is the raw JSON string */
+  onToolCall?: (agentIndex: number, toolName: string, args: string) => void;
+  /** Called when a tool result returns, with the JSON-stringified result */
+  onToolResult?: (agentIndex: number, toolName: string, resultStr: string) => void;
+  /** Called when an agent submits a report */
+  onReport?: (agentIndex: number, findings: string) => void;
+}
+
+/**
+ * Result from runAgents
+ *
+ * @category Branching
+ */
+export interface RunAgentsResult {
+  /** Total tokens generated across all agents */
+  totalTokens: number;
+  /** Total tool calls across all agents (including `report` calls) */
+  totalToolCalls: number;
+  /** Number of batched decode steps (store.commit calls) */
+  steps: number;
+  /** Performance counters */
+  counters: {
+    /** store.prefill batches issued for resolved tool results */ warmPrefillCalls: number;
+    /** Total branches prefilled across those batches */ warmPrefillBranches: number;
+    /** Ticks that produced no tokens while tools were pending */ stalledTicks: number;
+    /** Peak number of simultaneously pending tools */ maxConcurrentTools: number;
+    /** Stalled ticks where nothing settled either, so the loop awaited a tool */ idleTicks: number;
+  };
+}
+
+/**
+ * Fork an agent from a parent branch with its own system prompt + task
+ *
+ * Always prepends getTurnSeparator() for a clean structural break before
+ * the agent's system prompt. Returns AgentState ready for store.prefill().
+ *
+ * @param parent - Branch to fork from
+ * @param task - Agent task description (prompts, optional tools JSON and sampler seed)
+ * @param ctx - SessionContext for formatting and tokenization
+ * @returns AgentState with branch and suffixTokens; nothing has been prefilled yet
+ *
+ * @example
+ * ```typescript
+ * const agent = await forkAgent(trunk, {
+ *   systemPrompt: 'You are a research assistant.',
+ *   content: 'What is X?',
+ *   tools: toolsJson,
+ *   seed: Date.now(),
+ * }, ctx);
+ * await store.prefill([[agent.branch, agent.suffixTokens]]);
+ * ```
+ *
+ * @category Branching
+ */
+export function forkAgent(
+  parent: Branch,
+  task: AgentTask,
+  ctx: SessionContext
+): Promise<AgentState>;
+
+/**
+ * Run agents in a batched three-phase tick loop (PRODUCE -> COMMIT -> SETTLE)
+ *
+ * Each tick samples one token per active agent, batch-decodes them with
+ * store.commit(), then prefills results of resolved fire-and-forget tools.
+ *
+ * @param agents - Array of AgentState (from forkAgent or manual construction); runAgents never prefills suffixTokens, do that first
+ * @param opts - Configuration including store, ctx, executeTool, and callbacks
+ * @returns Aggregate statistics; per-agent outcome is left on each AgentState (done, findings, counts)
+ *
+ * @example
+ * ```typescript
+ * const result = await runAgents(agents, {
+ *   store, ctx,
+ *   executeTool: (name, args) => myToolDispatch(name, args),
+ *   maxTurns: 6,
+ *   onToolCall(ai, name, args) { console.log(`Agent ${ai}: ${name}`); },
+ * });
+ * ```
+ *
+ * @category Branching
+ */
+export function runAgents(
+  agents: AgentState[],
+  opts: RunAgentsOptions
+): Promise<RunAgentsResult>;
+
+/**
+ * Session - Trunk lifecycle + conversation delta helpers
+ *
+ * Owns the current "trunk" branch and provides promote() to crown a winner,
+ * plus delta helpers that centralize the sep + formatChat + tokenize + prefill
+ * pattern for injecting new turns into an ongoing conversation.
+ *
+ * Session does NOT own the SessionContext or BranchStore — the consumer
+ * creates those and passes them in. dispose() prunes trunk only.
+ *
+ * @example
+ * ```typescript
+ * const session = new Session({ ctx, store });
+ * session.trunk = initialBranch;
+ *
+ * // After verification, promote the best attempt
+ * await session.promote(bestAttempt.branch);
+ *
+ * // Inject a user turn and generate
+ * await session.prefillUser('What about X?');
+ * for await (const { text } of session.trunk) {
+ *   process.stdout.write(text);
+ * }
+ *
+ * // Cleanup
+ * await session.dispose();
+ * ctx.dispose();
+ * ```
+ *
+ * @category Branching
+ */
+export class Session {
+  constructor(opts: { ctx: SessionContext; store: BranchStore });
+
+  /** Current trunk branch (null before assignment and after dispose()) */
+  get trunk(): Branch | null;
+
+  /** Assign initial trunk without promote (no retainOnly is performed) */
+  set trunk(branch: Branch | null);
+
+  /**
+   * Promote a winner to trunk — retainOnly + reassign
+   *
+   * Calls store.retainOnly(winner), then sets trunk = winner.
+   * Safe even if winner is the only branch.
+   */
+  promote(winner: Branch): Promise<void>;
+
+  /**
+   * Dispose trunk only (pruned unless already disposed, then cleared) — consumer owns ctx and other resources
+   */
+  dispose(): Promise<void>;
+
+  /**
+   * Prefill a user turn into trunk
+   *
+   * Centralizes: sep + formatChat + tokenize(false) + prefill. Rejects if no trunk is assigned.
+   */
+  prefillUser(content: string, opts?: { tools?: string }): Promise<void>;
+
+  /**
+   * Prefill a tool result turn into trunk
+   *
+   * Centralizes: sep + formatChat + tokenize(false) + prefill. Rejects if no trunk is assigned.
+   */
+  prefillToolResult(resultStr: string, callId: string): Promise<void>;
+}
diff --git a/lib/index.js b/lib/index.js
index e7719bc..144bbf0 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -197,6 +197,8 @@ const getBinary = () => {
 
 const { Branch } = require('./Branch');
 const { BranchStore } = require('./BranchStore');
+const { Session } = require('./Session');
+const { forkAgent, runAgents } = require('./Agent');
 
 module.exports = {
   /**
@@ -209,6 +211,21 @@ module.exports = {
    * @see BranchStore
    */
   BranchStore,
+  /**
+   * Session class for trunk lifecycle + conversation deltas
+   * @see Session
+   */
+  Session,
+  /**
+   * Fork an agent from a parent branch with task context
+   * @see forkAgent
+   */
+  forkAgent,
+  /**
+   * Run agents in a batched three-phase tick loop
+   * @see runAgents
+   */
+  runAgents,
   /**
    * Create a new inference context
    *

From d82fb3be5a58087ac7f0123c66bd1715e4d88bff Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Mon, 23 Feb 2026 01:05:32 +1100
Subject: [PATCH 03/17] refactor(major): TS migration

---
 .github/workflows/docs.yml                    |   4 +-
 .github/workflows/gpu-test.yml                |   1 -
 .github/workflows/release.yml                 |   4 +-
 .github/workflows/tests.yml                   |   6 +-
 .gitignore                                    |   1 +
 .npmignore                                    |   3 +-
 README.md                                     |   2 +-
 examples/best-of-n/README.md                  | 118 ---
 examples/best-of-n/best-of-n.mjs              | 244 -----
 examples/chat/{chat.mjs => chat.ts}           |  39 +-
 .../{deep-research.mjs => deep-research.ts}   | 195 ++--
 examples/embed/{embed.mjs => embed.ts}        |  38 +-
 examples/entropy/{entropy.mjs => entropy.ts}  |  31 +-
 examples/grammar/README.md                    |  77 --
 examples/grammar/grammar.mjs                  | 177 ----
 examples/speculative/README.md                | 117 --
 examples/speculative/speculative.mjs          | 271 -----
 examples/streaming/README.md                  | 217 ----
 examples/streaming/streaming-summary.mjs      | 552 ----------
 examples/streaming/streaming-tsampler.mjs     | 326 ------
 examples/streaming/streaming.mjs              | 185 ----
 lib/Branch.js                                 | 471 ---------
 lib/BranchStore.js                            |  43 -
 lib/Session.js                                |  93 --
 lib/index.js                                  | 276 -----
 package-lock.json                             | 571 +++++++++-
 package.json                                  |  23 +-
 lib/Agent.js => src/Agent.ts                  | 116 +-
 src/Branch.ts                                 | 565 ++++++++++
 src/BranchStore.ts                            | 155 +++
 src/Session.ts                                | 115 ++
 src/index.ts                                  | 279 +++++
 lib/index.d.ts => src/types.ts                | 998 ++----------------
 test/examples.js                              | 463 --------
 test/examples.ts                              | 339 ++++++
 test/{integration.js => integration.ts}       | 627 +++++------
 tsconfig.json                                 |  18 +
 typedoc.json                                  |   2 +-
 38 files changed, 2712 insertions(+), 5050 deletions(-)
 delete mode 100644 examples/best-of-n/README.md
 delete mode 100644 examples/best-of-n/best-of-n.mjs
 rename examples/chat/{chat.mjs => chat.ts} (72%)
 rename examples/deep-research/{deep-research.mjs => deep-research.ts} (85%)
 rename examples/embed/{embed.mjs => embed.ts} (84%)
 rename examples/entropy/{entropy.mjs => entropy.ts} (84%)
 delete mode 100644 examples/grammar/README.md
 delete mode 100644 examples/grammar/grammar.mjs
 delete mode 100644 examples/speculative/README.md
 delete mode 100644 examples/speculative/speculative.mjs
 delete mode 100644 examples/streaming/README.md
 delete mode 100644 examples/streaming/streaming-summary.mjs
 delete mode 100644 examples/streaming/streaming-tsampler.mjs
 delete mode 100644 examples/streaming/streaming.mjs
 delete mode 100644 lib/Branch.js
 delete mode 100644 lib/BranchStore.js
 delete mode 100644 lib/Session.js
 delete mode 100644 lib/index.js
 rename lib/Agent.js => src/Agent.ts (67%)
 create mode 100644 src/Branch.ts
 create mode 100644 src/BranchStore.ts
 create mode 100644 src/Session.ts
 create mode 100644 src/index.ts
 rename lib/index.d.ts => src/types.ts (60%)
 delete mode 100644 test/examples.js
 create mode 100644 test/examples.ts
 rename test/{integration.js => integration.ts} (76%)
 create mode 100644 tsconfig.json

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index f49b678..aff706f 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -4,8 +4,8 @@ on:
   push:
     branches: [main]
     paths:
-      - 'lib/index.d.ts'
-      - 'lib/Branch.js'
+      - 'src/types.ts'
+      - 'src/Branch.ts'
       - 'package.json'
       - 'typedoc.json'
       - 'README.md'
diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
index 37d9fd5..56e7192 100644
--- a/.github/workflows/gpu-test.yml
+++ b/.github/workflows/gpu-test.yml
@@ -6,7 +6,6 @@ on:
     paths:
       - 'liblloyal'
       - 'llama.cpp'
-      - 'lib/**'
       - 'src/**'
       - 'test/**'
       - 'CMakeLists.txt'
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 980703b..608301a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -218,7 +218,7 @@ jobs:
           export LD_LIBRARY_PATH="${PKG_BIN}:${LD_LIBRARY_PATH:-}"
           echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
           node -e "
-            const { loadBinary } = require('./lib');
+            const { loadBinary } = require('./dist');
             const addon = loadBinary();
             console.log('✓ Platform package loaded successfully');
             console.log('  Exports:', Object.keys(addon));
@@ -255,7 +255,7 @@ jobs:
           Write-Host "VULKAN_SDK: $env:VULKAN_SDK"
           Write-Host "CUDA_PATH: $env:CUDA_PATH"
           node -e "
-            const { loadBinary } = require('./lib');
+            const { loadBinary } = require('./dist');
             const addon = loadBinary();
             console.log('✓ Platform package loaded successfully');
             console.log('  Exports:', Object.keys(addon));
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b8e01c8..cdad345 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -141,9 +141,9 @@ jobs:
           echo "📦 Package contents:"
           cat package-contents.txt
 
-          # Verify lib/ JavaScript is included
-          if ! grep -q "package/lib/index.js" package-contents.txt; then
-            echo "❌ ERROR: lib/index.js not in package!"
+          # Verify dist/ JavaScript is included
+          if ! grep -q "package/dist/index.js" package-contents.txt; then
+            echo "❌ ERROR: dist/index.js not in package!"
             exit 1
           fi
 
diff --git a/.gitignore b/.gitignore
index 44b0e3f..1ec8cb6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Build outputs
 build/
+dist/
 prebuilds/
 *.node
 /include/
diff --git a/.npmignore b/.npmignore
index 95ac8ef..2662537 100644
--- a/.npmignore
+++ b/.npmignore
@@ -34,8 +34,9 @@ tests/
 examples/
 docs/
 
-# C++ source files (users get prebuilt binaries, not source)
+# C++ and TS source files (users get prebuilt binaries + compiled JS, not source)
 src/
+tsconfig.json
 
 # Test models (too large for npm)
 models/
diff --git a/README.md b/README.md
index 49fb41d..a455bc5 100644
--- a/README.md
+++ b/README.md
@@ -218,7 +218,7 @@ See [`examples/grammar/`](./examples/grammar/) for the full branch fork pattern.
 
 Full API documentation: **[lloyal-ai.github.io/lloyal.node](https://lloyal-ai.github.io/lloyal.node/)**
 
-Generated from [`lib/index.d.ts`](./lib/index.d.ts) with TypeDoc.
+Generated from [`src/types.ts`](./src/types.ts) with TypeDoc.
 
 ---
 
diff --git a/examples/best-of-n/README.md b/examples/best-of-n/README.md
deleted file mode 100644
index e9d973d..0000000
--- a/examples/best-of-n/README.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# Best-of-N Sampling with Perplexity Selection
-
-Demonstrates why best-of-n beats single generation: generate N diverse candidates, select the most coherent by perplexity.
-
-## Run It
-
-```bash
-node best-of-n.mjs
-```
-
-## What You'll See
-
-```
-BASELINE: Single generation (T=0.3)
-  PPL: 2.07 | "In the realm where the moon dipped..."
-
-BEST-OF-5: Generate 5 candidates (T=0.9), select lowest PPL
-  [1] PPL:  2.95 | "In the heart of a moonlit forest..."
-  [2] PPL:  4.41 | "Under the cloak of a midnight moon..."
-  [3] PPL:  3.09 | "As the last wisps of sunlight..."
-  [4] PPL:  3.42 | "Under the moon's silvery glow..."
-  [5] PPL:  3.46 | "Under the emerald canopy..."
-
-RESULTS
-  Best candidate [1] (PPL 2.95)
-  PPL range: 2.95 - 4.41 (Δ1.46)
-```
-
-## How It Works
-
-| Step | What Happens |
-|------|--------------|
-| 1. Prefill | Decode prompt on seq 0 |
-| 2. Capture logits | Copy logits buffer (critical for fair comparison) |
-| 3. Generate N candidates | Each forks KV, samples from captured logits, then continues |
-| 4. Track PPL | Accumulate surprisal per candidate |
-| 5. Select best | Lowest perplexity wins |
-
-## Key Implementation Detail
-
-After prefilling, the logits buffer contains P(next_token | prompt). When we fork to multiple sequences, **each candidate's first token must sample from these same captured logits**:
-
-```javascript
-// Capture after prefill
-const capturedLogits = new Float32Array(ctx.getLogits());
-
-// Each candidate:
-// 1. Sample first token from captured logits (tsampler)
-const token = sampleWithStrategy(capturedLogits, { params, workspace, prng });
-
-// 2. Compute surprisal from captured logits (native C++)
-const surprisal = ctx.modelSurprisal(token, 'nats', capturedLogits);
-```
-
-Without this, later candidates would sample from earlier candidates' states - unfair comparison.
-
-## Why Perplexity?
-
-```
-PPL = exp(average surprisal) = "how surprised is the model?"
-```
-
-| PPL | Meaning |
-|-----|---------|
-| Low | Model is confident in what it wrote |
-| High | Model was uncertain, may have inconsistencies |
-
-Best-of-N trades compute for quality:
-- High temp generates **diverse** candidates (explore)
-- PPL filtering selects **coherent** ones (exploit)
-
-## Key APIs
-
-| Method | Description |
-|--------|-------------|
-| `kvSeqCopy(src, dst)` | Fork KV cache (O(1) tag copy) |
-| `getLogits()` | Get raw logits (zero-copy view) |
-| `modelSurprisal(token, base?, logits?)` | Surprisal from current or captured logits |
-| `createPerplexityTracker()` | Create tracker handle |
-| `addSurprisal(tracker, value)` | Accumulate to tracker |
-| `getPerplexity(tracker)` | Get current PPL |
-
-## Native Metrics API
-
-The native `modelSurprisal()` accepts an optional `logits` parameter for captured logits:
-
-```javascript
-// First token: surprisal from captured logits
-const firstSurprisal = ctx.modelSurprisal(token, 'nats', capturedLogits);
-
-// Subsequent tokens: current context logits (default)
-const surprisal = ctx.modelSurprisal(token);
-```
-
-All math runs in C++ - no JS overhead for softmax/log operations.
-
-## tsampler Integration
-
-[@lloyal-labs/tsampler](https://www.npmjs.com/package/@lloyal-labs/tsampler) handles sampling from captured logits:
-
-```javascript
-import { sampleWithStrategy, SamplerWorkspace, Xoroshiro128Plus } from '@lloyal-labs/tsampler';
-
-const token = sampleWithStrategy(capturedLogits, {
-  params: { temperature: 0.9, topP: 0.95 },
-  workspace,
-  prng,
-});
-```
-
-**Division of labor:**
-- **tsampler**: Sampling (temperature, topP, topK) from arbitrary logits
-- **Native API**: Metrics (surprisal, entropy, perplexity) from arbitrary logits
-
-## References
-
-- [Stiennon et al. 2020](https://arxiv.org/abs/2009.01325) - "Learning to summarize from human feedback" (Best-of-N in RLHF)
-- [tsampler](https://github.com/lloyal-ai/tsampler) - Pure TypeScript sampling with llama.cpp parity
diff --git a/examples/best-of-n/best-of-n.mjs b/examples/best-of-n/best-of-n.mjs
deleted file mode 100644
index 22c9328..0000000
--- a/examples/best-of-n/best-of-n.mjs
+++ /dev/null
@@ -1,244 +0,0 @@
-#!/usr/bin/env node
-/**
- * Best-of-N Sampling with Perplexity Selection (Parallel Streaming)
- *
- * Demonstrates why best-of-n beats single generation:
- * - Generate N candidates with high temperature (diverse)
- * - Select best by perplexity (model's confidence in its output)
- * - Lower perplexity = more coherent, higher quality
- *
- * Based on: "Best-of-N" / "Rejection Sampling" used in RLHF pipelines
- * See: Stiennon et al. 2020 "Learning to summarize from human feedback"
- *
- * KEY IMPLEMENTATION DETAIL:
- * Uses the Branch API for parallel generation. The root branch prefills the
- * prompt and captures logits. When forking to multiple candidates, each fork
- * inherits the root's logits snapshot, ensuring all candidates start from
- * the same probability distribution.
- *
- * Usage:
- *   node best-of-n.mjs [model-path]          # Human-readable output
- *   node best-of-n.mjs [model-path] --jsonl  # JSONL output for testing
- */
-
-import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext, Branch } from '../../lib/index.js';
-
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
-const DEFAULT_MODEL = path.resolve(
-  __dirname,
-  '../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
-);
-
-// Parse args
-const args = process.argv.slice(2);
-const jsonlMode = args.includes('--jsonl');
-const modelPath = args.find(a => !a.startsWith('--')) || DEFAULT_MODEL;
-
-/** Emit output - JSONL or human-readable */
-function emit(event, data) {
-  if (jsonlMode) {
-    console.log(JSON.stringify({ event, ...data }));
-  }
-}
-
-/** Collect tokens from a branch's async iterator, return text + perplexity. */
-async function generateWithBranch(branch, maxTokens, ctx) {
-  const tokens = [];
-  for await (const { token } of branch) {
-    tokens.push(token);
-    if (tokens.length >= maxTokens) break;
-  }
-  const ppl = branch.perplexity;
-  return {
-    text: await ctx.detokenize(tokens),
-    ppl: Number.isFinite(ppl) ? ppl : 999,
-    tokenCount: tokens.length,
-  };
-}
-
-async function main() {
-  const N = 5;           // Number of candidates
-  const MAX_TOKENS = 60;
-  const HIGH_TEMP = 0.9; // High temp for diversity
-  const LOW_TEMP = 0.3;  // Low temp for single baseline
-
-  if (!jsonlMode) {
-    console.log('Best-of-N Sampling Demo (Parallel Streaming)');
-    console.log('=============================================\n');
-    console.log('Why best-of-n works:');
-    console.log('  1. Generate N candidates with HIGH temperature (diverse)');
-    console.log('  2. Score each by perplexity (model confidence)');
-    console.log('  3. Select LOWEST perplexity (most coherent)\n');
-    console.log(`Loading model: ${path.basename(modelPath)}`);
-  }
-
-  emit('start', { model: path.basename(modelPath), n: N, maxTokens: MAX_TOKENS, highTemp: HIGH_TEMP, lowTemp: LOW_TEMP });
-
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
-  const ctx = await createContext({
-    modelPath,
-    nCtx,
-    nSeqMax: N + 2, // Need slots for N candidates + baseline + trunk
-  });
-
-  // Use chat template for consistent behavior
-  const userPrompt = 'Write a creative opening sentence for a fantasy novel.';
-  const messages = [{ role: 'user', content: userPrompt }];
-  const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-
-  if (!jsonlMode) {
-    console.log(`\nPrompt: "${userPrompt}"`);
-  }
-
-  // Prefill prompt via root branch
-  const promptTokens = await ctx.tokenize(prompt);
-
-  const root = Branch.create(ctx, 0, {
-    temperature: HIGH_TEMP,
-    topP: 0.95,
-  });
-  await root.prefill(promptTokens);
-
-  if (!jsonlMode) {
-    console.log(`\nPrefill complete. Prompt length: ${promptTokens.length} tokens`);
-  }
-
-  // === Baseline: Single generation with low temperature ===
-  if (!jsonlMode) {
-    console.log('\n' + '='.repeat(70));
-    console.log('BASELINE: Single generation (forked from root)');
-    console.log('='.repeat(70));
-  }
-
-  // Fork baseline from root — inherits KV prefix + logits snapshot
-  const baselineBranch = await root.fork();
-
-  const baseline = await generateWithBranch(baselineBranch, MAX_TOKENS, ctx);
-
-  emit('baseline', { ppl: baseline.ppl, text: baseline.text, tokenCount: baseline.tokenCount });
-
-  if (!jsonlMode) {
-    console.log(`  PPL: ${baseline.ppl.toFixed(2)} | "${baseline.text}"`);
-  }
-
-  await baselineBranch.prune();
-
-  // === Best-of-N: Parallel candidates with high temperature ===
-  if (!jsonlMode) {
-    console.log('\n' + '='.repeat(70));
-    console.log(`BEST-OF-${N}: Generate ${N} candidates in parallel (T=${HIGH_TEMP})`);
-    console.log('='.repeat(70));
-  }
-
-  // Fork N candidate branches from root
-  // Each fork gets: copied logits snapshot + copied KV cache + copied sampler
-  // CRITICAL: Reseed each branch's sampler for diversity (otherwise all produce identical output)
-  const branches = [];
-  for (let i = 0; i < N; i++) {
-    const branch = await root.fork();
-    branch.reseedSampler(1000 + i);  // Unique seed per branch
-    branches.push(branch);
-  }
-
-  // Generate each candidate sequentially — same total GPU work, simpler flow
-  const candidates = [];
-  for (let i = 0; i < N; i++) {
-    const tokens = [];
-    for await (const { token, text } of branches[i]) {
-      tokens.push(token);
-      emit('token', { candidateIndex: i, text, index: tokens.length - 1 });
-      if (tokens.length >= MAX_TOKENS) break;
-    }
-
-    const ppl = branches[i].perplexity;
-    const text = await ctx.detokenize(tokens);
-    candidates.push({
-      text,
-      ppl: Number.isFinite(ppl) ? ppl : 999,
-      tokenCount: tokens.length,
-    });
-
-    emit('candidate', { index: i + 1, ppl: candidates[i].ppl, text, tokenCount: tokens.length });
-
-    if (!jsonlMode) {
-      const truncated = text.length > 55 ? text.slice(0, 55) + '...' : text;
-      console.log(`  [${i + 1}] PPL: ${candidates[i].ppl.toFixed(2).padStart(6)} | "${truncated}"`);
-    }
-
-    await branches[i].prune();
-  }
-  await root.prune();
-
-  // Select best
-  const best = candidates.reduce((a, b) => (a.ppl < b.ppl ? a : b));
-  const worst = candidates.reduce((a, b) => (a.ppl > b.ppl ? a : b));
-  const bestIdx = candidates.indexOf(best) + 1;
-
-  // Analysis
-  const improvement = (baseline.ppl - best.ppl) / baseline.ppl;
-  const pplRange = worst.ppl - best.ppl;
-
-  emit('complete', {
-    bestIndex: bestIdx,
-    bestPpl: best.ppl,
-    bestText: best.text,
-    worstPpl: worst.ppl,
-    baselinePpl: baseline.ppl,
-    pplRange,
-    improvement,
-    bestBeatBaseline: best.ppl < baseline.ppl,
-  });
-
-  if (!jsonlMode) {
-    // === Results ===
-    console.log('\n' + '='.repeat(70));
-    console.log('RESULTS');
-    console.log('='.repeat(70));
-
-    console.log(`\n  Best candidate [${bestIdx}] (PPL ${best.ppl.toFixed(2)}):`);
-    console.log(`    "${best.text}"`);
-
-    console.log(`\n  Baseline (PPL ${baseline.ppl.toFixed(2)}):`);
-    console.log(`    "${baseline.text}"`);
-
-    console.log('\n  Analysis:');
-    console.log(`    - PPL range across candidates: ${best.ppl.toFixed(2)} - ${worst.ppl.toFixed(2)} (Δ${pplRange.toFixed(2)})`);
-    if (best.ppl < baseline.ppl) {
-      console.log(`    - Best-of-${N} beat baseline by ${(improvement * 100).toFixed(1)}% lower PPL`);
-    } else {
-      console.log(`    - Baseline was already good (low temp = focused)`);
-    }
-
-    console.log('\n' + '='.repeat(70));
-    console.log('KEY INSIGHT');
-    console.log('='.repeat(70));
-    console.log(`
-  Perplexity = exp(average surprisal) = "how surprised is the model?"
-
-  Lower PPL = model is confident in what it wrote = usually more coherent
-  Higher PPL = model was uncertain = may have inconsistencies
-
-  Best-of-N trades compute for quality:
-    - High temp generates diverse candidates (explore the space)
-    - PPL filtering selects the coherent ones (exploit quality)
-
-  Implementation note:
-    Uses the Branch API for parallel generation. After prefilling the
-    prompt, we create a root branch and capture its logits. When forking
-    to N candidates, each fork inherits the root's logits snapshot,
-    ensuring all candidates start from the same probability distribution.
-    Generation happens in round-robin fashion, interleaving tokens across
-    all candidates.
-`);
-  }
-
-  ctx.dispose();
-}
-
-main().catch((err) => {
-  console.error('Error:', err.message);
-  console.error(err.stack);
-  process.exit(1);
-});
diff --git a/examples/chat/chat.mjs b/examples/chat/chat.ts
similarity index 72%
rename from examples/chat/chat.mjs
rename to examples/chat/chat.ts
index 4ec2ea0..a4cd00b 100644
--- a/examples/chat/chat.mjs
+++ b/examples/chat/chat.ts
@@ -3,8 +3,8 @@
  * Simple chat example using lloyal.node
  *
  * Usage:
- *   node chat.mjs /path/to/model.gguf
- *   node chat.mjs  # uses default model path
+ *   npx tsx chat.ts /path/to/model.gguf
+ *   npx tsx chat.ts  # uses default model path
  *
  * This example demonstrates:
  * - Branch API for token generation (produce/commit two-phase)
@@ -15,23 +15,22 @@
 
 import * as readline from "node:readline";
 import * as path from "node:path";
-import { fileURLToPath } from "node:url";
-import { createContext, Branch } from "../../lib/index.js";
+import { createContext, Branch } from "../../dist/index.js";
+import type { SessionContext, FormattedChatResult } from "../../dist/index.js";
 
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
   __dirname,
   "../../models/Phi-3.5-mini-instruct-Q4_K_M.gguf",
 );
 
-async function main() {
+async function main(): Promise<void> {
   const modelPath = process.argv[2] || DEFAULT_MODEL;
 
   console.log(`Loading model: ${modelPath}`);
   console.log("This may take a moment...\n");
 
   const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
-  const ctx = await createContext({
+  const ctx: SessionContext = await createContext({
     modelPath,
     nCtx,
     threads: 4,
@@ -40,19 +39,19 @@ async function main() {
   console.log("Model loaded! Type your message and press Enter.");
   console.log("Commands: /clear to reset, /quit to exit\n");
 
-  const messages = [];
-  let branch = null;
-  let fmt = null;
-  const sep = ctx.getTurnSeparator();
+  const messages: Array<{role: string; content: string; reasoning_content?: string}> = [];
+  let branch: InstanceType<typeof Branch> | null = null;
+  let fmt: FormattedChatResult | null = null;
+  const sep: number[] = ctx.getTurnSeparator();
 
   const rl = readline.createInterface({
     input: process.stdin,
     output: process.stdout,
   });
 
-  const askUser = () => rl.question("> ", handleInput);
+  const askUser = (): void => { rl.question("> ", handleInput); };
 
-  async function handleInput(input) {
+  async function handleInput(input: string): Promise<void> {
     const trimmed = input.trim();
 
     if (trimmed === "/quit" || trimmed === "/exit") {
@@ -111,13 +110,13 @@ async function main() {
     console.log("\n");
 
     // Parse output: separates reasoning from content for thinking models
-    const parsed = ctx.parseChatOutput(rawOutput, fmt.format, {
-      reasoningFormat: fmt.reasoningFormat,
-      thinkingForcedOpen: fmt.thinkingForcedOpen,
-      parser: fmt.parser,
+    const parsed = ctx.parseChatOutput(rawOutput, fmt!.format, {
+      reasoningFormat: fmt!.reasoningFormat,
+      thinkingForcedOpen: fmt!.thinkingForcedOpen,
+      parser: fmt!.parser,
     });
 
-    const msg = { role: "assistant", content: parsed.content };
+    const msg: {role: string; content: string; reasoning_content?: string} = { role: "assistant", content: parsed.content };
     if (parsed.reasoningContent) {
       msg.reasoning_content = parsed.reasoningContent;
     }
@@ -129,7 +128,7 @@ async function main() {
   askUser();
 }
 
-main().catch((err) => {
-  console.error("Error:", err.message);
+main().catch((err: unknown) => {
+  console.error("Error:", err instanceof Error ? err.message : String(err)); // non-Error rejections still print something useful
   process.exit(1);
 });
diff --git a/examples/deep-research/deep-research.mjs b/examples/deep-research/deep-research.ts
similarity index 85%
rename from examples/deep-research/deep-research.mjs
rename to examples/deep-research/deep-research.ts
index 1e29bfd..8cb1f91 100644
--- a/examples/deep-research/deep-research.mjs
+++ b/examples/deep-research/deep-research.ts
@@ -19,7 +19,7 @@
  * dispatches per step, regardless of branch count.
  *
  * Usage:
- *   node deep-research.mjs <model-path> --corpus <path> --query <text> [options]
+ *   npx tsx deep-research.ts <model-path> --corpus <path> --query <text> [options]
  *
  * Required:
  *   <model-path>     Path to generative model (e.g. Qwen3-4B-Instruct)
@@ -32,16 +32,22 @@
  *   --verbose        Show native llama.cpp logs
  *
  * Example:
- *   node deep-research.mjs ./models/Qwen3-4B.gguf \
+ *   npx tsx deep-research.ts ./models/Qwen3-4B.gguf \
  *     --corpus ~/docs --query "How does the auth system work?"
  */
 
 import * as fs from 'node:fs';
 import * as path from 'node:path';
 import * as readline from 'node:readline';
-import { fileURLToPath } from 'node:url';
+import {
+  createContext, Branch, BranchStore, Session, forkAgent, runAgents,
+} from '../../dist/index.js';
+import type { SessionContext, AgentState } from '../../dist/index.js';
+
+// ================================================================
+// CLI ARGS
+// ================================================================
 
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
   __dirname,
   '../../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf'
@@ -55,7 +61,7 @@ const args = process.argv.slice(2);
 const jsonlMode = args.includes('--jsonl');
 const verbose = args.includes('--verbose');
 
-function argVal(flag) {
+function argVal(flag: string): string | null {
   const i = args.indexOf(flag);
-  return i !== -1 ? args[i + 1] : null;
+  return i !== -1 ? (args[i + 1] ?? null) : null; // flag given as last arg has no value: honor the declared null, not undefined
 }
@@ -79,7 +85,7 @@ if (!corpusDir || !QUERY) {
     !QUERY && '--query',
   ].filter(Boolean);
   process.stdout.write(
-    `Usage: node deep-research.mjs [model-path] --corpus <path> --query <text> [--reranker <path>]\n` +
+    `Usage: npx tsx deep-research.ts [model-path] --corpus <path> --query <text> [--reranker <path>]\n` +
     `Missing: ${missing.join(', ')}\n`
   );
   process.exit(1);
@@ -108,9 +114,9 @@ const c = isTTY ? {
   green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
 } : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' };
 
-const log = (...a) => { if (!jsonlMode) console.log(...a); };
+const log = (...a: unknown[]): void => { if (!jsonlMode) console.log(...a); };
 
-function emit(event, data) {
+function emit(event: string, data: Record<string, unknown>): void {
   if (jsonlMode) console.log(JSON.stringify({ event, ...data }));
 }
 
@@ -126,23 +132,27 @@ const MAX_TOOL_TURNS = 6;
 // CORPUS — load and chunk at ## boundaries
 // ================================================================
 
-function loadCorpus() {
-  if (!fs.existsSync(corpusDir)) {
+interface CorpusFile { name: string; content: string }
+interface Chunk { file: string; heading: string; text: string; tokens: number[] }
+interface SubChunk { heading: string; text: string }
+
+function loadCorpus(): CorpusFile[] {
+  if (!fs.existsSync(corpusDir!)) {
     process.stdout.write(`Error: corpus not found: ${corpusDir}\n`);
     process.exit(1);
   }
-  const stat = fs.statSync(corpusDir);
+  const stat = fs.statSync(corpusDir!);
   if (stat.isFile()) {
-    return [{ name: path.basename(corpusDir), content: fs.readFileSync(corpusDir, 'utf8') }];
+    return [{ name: path.basename(corpusDir!), content: fs.readFileSync(corpusDir!, 'utf8') }];
   }
-  const files = fs.readdirSync(corpusDir).filter((f) => f.endsWith('.md'));
+  const files = fs.readdirSync(corpusDir!).filter((f) => f.endsWith('.md'));
   if (!files.length) {
     process.stdout.write(`Error: no .md files in: ${corpusDir}\n`);
     process.exit(1);
   }
   return files.map((f) => ({
     name: f,
-    content: fs.readFileSync(path.join(corpusDir, f), 'utf8'),
+    content: fs.readFileSync(path.join(corpusDir!, f), 'utf8'),
   }));
 }
 
@@ -151,32 +161,32 @@ function loadCorpus() {
 // With reranker nCtx=8192: budget ≈ 8000 tokens × 3 = 24000 chars.
 const CHUNK_CHAR_LIMIT = 24000;
 
-function chunkCorpus(files) {
-  const out = [];
+function chunkCorpus(files: CorpusFile[]): Chunk[] {
+  const out: Chunk[] = [];
   for (const file of files) {
     for (const section of file.content.split(/(?=^## )/m)) {
-      const heading = (section.match(/^##?\s+(.+)/m) || [, file.name])[1];
+      const heading = (section.match(/^##?\s+(.+)/m) || [, file.name])[1]!;
       const trimmed = section.trim();
       if (trimmed.length <= CHUNK_CHAR_LIMIT) {
-        out.push({ file: file.name, heading, text: trimmed });
+        out.push({ file: file.name, heading, text: trimmed, tokens: [] });
         continue;
       }
       // Sub-split oversized sections: ### → paragraph → hard truncate
       for (const sub of subChunk(trimmed, heading)) {
-        out.push({ file: file.name, heading: sub.heading, text: sub.text });
+        out.push({ file: file.name, heading: sub.heading, text: sub.text, tokens: [] });
       }
     }
   }
   return out;
 }
 
-function subChunk(text, parentHeading) {
+function subChunk(text: string, parentHeading: string): SubChunk[] {
   // Try splitting at ### boundaries first
   const subSections = text.split(/(?=^### )/m);
   if (subSections.length > 1) {
-    const results = [];
+    const results: SubChunk[] = [];
     for (const sub of subSections) {
-      const subHeading = (sub.match(/^###?\s+(.+)/m) || [, parentHeading])[1];
+      const subHeading = (sub.match(/^###?\s+(.+)/m) || [, parentHeading])[1]!;
       const trimmed = sub.trim();
       if (trimmed.length <= CHUNK_CHAR_LIMIT) {
         results.push({ heading: subHeading, text: trimmed });
@@ -191,9 +201,9 @@ function subChunk(text, parentHeading) {
   return splitByParagraph(text, parentHeading);
 }
 
-function splitByParagraph(text, heading) {
+function splitByParagraph(text: string, heading: string): SubChunk[] {
   const paragraphs = text.split(/\n\n+/);
-  const results = [];
+  const results: SubChunk[] = [];
   let current = '';
   let partIndex = 0;
 
@@ -220,7 +230,7 @@ function splitByParagraph(text, heading) {
 }
 
 const corpus = loadCorpus();
-const chunks = chunkCorpus(corpus);
+const chunks: Chunk[] = chunkCorpus(corpus);
 
 // ================================================================
 // RERANKER — Qwen3-Reranker cross-encoder scoring via Branch API
@@ -238,16 +248,16 @@ const RERANK_PREFIX =
 const RERANK_MID = '\n\n<Document>: ';
 const RERANK_SUFFIX = '<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n';
 
-let rerankCtx = null;
+let rerankCtx: SessionContext | null = null;
 let yesId = 0;
 let noId = 0;
 
 // Pre-tokenized template segments — populated after reranker loads.
-let rerankPrefixTokens = null; // RERANK_PREFIX (with BOS)
-let rerankMidTokens = null;    // RERANK_MID
-let rerankSuffixTokens = null; // RERANK_SUFFIX
+let rerankPrefixTokens: number[] | null = null; // RERANK_PREFIX (with BOS)
+let rerankMidTokens: number[] | null = null;    // RERANK_MID
+let rerankSuffixTokens: number[] | null = null; // RERANK_SUFFIX
 
-function rerankScore(logits) {
+function rerankScore(logits: Float32Array): number {
   const max = Math.max(logits[yesId], logits[noId]);
   const yesExp = Math.exp(logits[yesId] - max);
   const noExp = Math.exp(logits[noId] - max);
@@ -258,20 +268,22 @@ function rerankScore(logits) {
 // TOOLS — reranker-backed search + snippet extraction
 // ================================================================
 
-async function toolSearch(query) {
-  const queryTokens = await rerankCtx.tokenize(query, false);
-  const scored = [];
+interface ScoredChunk { file: string; heading: string; score: number }
+
+async function toolSearch(query: string): Promise<ScoredChunk[]> {
+  const queryTokens = await rerankCtx!.tokenize(query, false);
+  const scored: ScoredChunk[] = [];
   for (const chunk of chunks) {
     // Pre-tokenized segments — no string concat, no per-chunk tokenize().
     // Boundary safety: all joints are at special tokens or newlines,
     // which are explicit token boundaries in Qwen3's BPE vocabulary.
     const tokens = [
-      ...rerankPrefixTokens, ...queryTokens,
-      ...rerankMidTokens, ...chunk.tokens,
-      ...rerankSuffixTokens,
+      ...rerankPrefixTokens!, ...queryTokens,
+      ...rerankMidTokens!, ...chunk.tokens,
+      ...rerankSuffixTokens!,
     ];
     // Fresh branch per chunk — position must start at 0 each time.
-    const branch = Branch.create(rerankCtx, 0, { temperature: 0 });
+    const branch = Branch.create(rerankCtx!, 0, { temperature: 0 });
     await branch.prefill(tokens);
     const score = rerankScore(branch.getLogits());
     await branch.prune();
@@ -280,7 +292,14 @@ async function toolSearch(query) {
   return scored.sort((a, b) => b.score - a.score).slice(0, 5);
 }
 
-function toolReadFile(filename, query) {
+interface ReadFileResult {
+  file: string;
+  content?: string;
+  snippets?: string[];
+  error?: string;
+}
+
+function toolReadFile(filename: string, query: string): ReadFileResult | { error: string } {
   const file = corpus.find((f) => f.name === filename);
   if (!file) {
     return { error: `File not found: ${filename}. Available: ${corpus.map((f) => f.name).join(', ')}` };
@@ -288,8 +307,8 @@ function toolReadFile(filename, query) {
   if (!query) return { file: file.name, content: file.content.slice(0, 800) };
   const terms = query.toLowerCase().split(/\s+/).filter(Boolean);
   const lines = file.content.split('\n');
-  const snippets = [];
-  const seen = new Set();
+  const snippets: string[] = [];
+  const seen = new Set<number>();
   for (let i = 0; i < lines.length; i++) {
     if (!terms.some((t) => lines[i].toLowerCase().includes(t))) continue;
     const start = Math.max(0, i - 1);
@@ -304,12 +323,15 @@ function toolReadFile(filename, query) {
     : { file: file.name, snippets: ['No matches for: ' + query] };
 }
 
-async function executeTool(name, toolArgs) {
+async function executeTool(name: string, toolArgs: Record<string, unknown>): Promise<unknown> {
   switch (name) {
     case 'search':
-      return toolSearch(toolArgs.query || '');
+      return toolSearch((toolArgs.query as string) || '');
     case 'read_file':
-      return toolReadFile(toolArgs.filename || toolArgs.path || '', toolArgs.query || '');
+      return toolReadFile(
+        (toolArgs.filename as string) || (toolArgs.path as string) || '',
+        (toolArgs.query as string) || ''
+      );
     case 'report':
       return { acknowledged: true };
     default:
@@ -371,9 +393,9 @@ const AGENT_SYSTEM_PROMPT =
 // HELPERS
 // ================================================================
 
-const sec = (a, b) => ((b - a) / 1000).toFixed(1);
-const pad = (s, n) => String(s).padStart(n);
-const fmtSize = (bytes) => bytes > 1e9
+const sec = (a: number, b: number): string => ((b - a) / 1000).toFixed(1);
+const pad = (s: unknown, n: number): string => String(s).padStart(n);
+const fmtSize = (bytes: number): string => bytes > 1e9
   ? (bytes / 1e9).toFixed(1) + ' GB'
   : (bytes / 1e6).toFixed(0) + ' MB';
 
@@ -381,10 +403,15 @@ const fmtSize = (bytes) => bytes > 1e9
 // MAIN
 // ================================================================
 
-// Dynamic import — native module loads here, after fd 2 redirect
-const { createContext, Branch, BranchStore, Session, forkAgent, runAgents } = await import('../../lib/index.js');
+interface Attempt {
+  branch: InstanceType<typeof Branch>;
+  output: string;
+  done: boolean;
+  tokenCount: number;
+  ppl: number;
+}
 
-async function main() {
+async function main(): Promise<void> {
   const t0 = performance.now();
 
   const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, '');
@@ -399,7 +426,7 @@ async function main() {
   emit('start', {
     model: path.basename(modelPath),
     reranker: path.basename(rerankModelPath),
-    query: QUERY,
+    query: QUERY!,
     agentCount: AGENT_COUNT,
     verifyCount: VERIFY_COUNT,
     chunks: chunks.length,
@@ -437,10 +464,10 @@ async function main() {
     chunk.tokens = await rerankCtx.tokenize(chunk.text, false);
   }
 
-  const corpusIsFile = corpus.length === 1 && fs.statSync(corpusDir).isFile();
+  const corpusIsFile = corpus.length === 1 && fs.statSync(corpusDir!).isFile();
   const corpusLabel = corpusIsFile
-    ? path.basename(corpusDir)
-    : `${path.basename(corpusDir)}/ — ${corpus.length} files`;
+    ? path.basename(corpusDir!)
+    : `${path.basename(corpusDir!)}/ — ${corpus.length} files`;
   log(`  ${c.dim}  Corpus: ${corpusLabel} → ${chunks.length} chunks${c.reset}`);
 
   const store = new BranchStore(ctx);
@@ -486,7 +513,7 @@ async function main() {
   }
   await lead.prune();
 
-  let questions;
+  let questions: string[];
   try {
     const plan = JSON.parse(planOutput);
     questions = plan.questions.slice(0, AGENT_COUNT);
@@ -519,7 +546,7 @@ async function main() {
   await agentRoot.prefill(sharedTokens);
 
   // Fork N agents, compute divergent suffixes via token slicing
-  const agents = [];
+  const agents: AgentState[] = [];
   for (const q of questions) {
     const branch = await agentRoot.fork();
 
@@ -568,14 +595,14 @@ async function main() {
   // via batched decode (one llama_decode per commit/prefill), but individual
   // Branch.prefill calls on rerankCtx bypass that.
   let rerankLock = Promise.resolve();
-  function withRerankLock(fn) {
+  function withRerankLock<T>(fn: () => Promise<T>): Promise<T> {
     const prev = rerankLock;
-    let release;
+    let release: () => void;
     rerankLock = new Promise((r) => { release = r; });
-    return prev.then(fn).finally(release);
+    return prev.then(fn).finally(release!);
   }
 
-  const executeToolLocked = (name, args) =>
+  const executeToolLocked = (name: string, args: Record<string, unknown>): Promise<unknown> =>
     name === 'search'
       ? withRerankLock(() => executeTool(name, args))
       : executeTool(name, args);
@@ -585,9 +612,9 @@ async function main() {
       store, ctx,
       executeTool: executeToolLocked,
       maxTurns: MAX_TOOL_TURNS,
-      onToolCall(ai, toolName, args) {
+      onToolCall(ai: number, toolName: string, args: string) {
         emit('tool_call', { agentIndex: ai, toolName, arguments: args });
-        let toolArgs;
+        let toolArgs: Record<string, string>;
         try { toolArgs = JSON.parse(args); } catch { toolArgs = {}; }
         const argSummary = toolName === 'search'
           ? `"${toolArgs.query || ''}"`
@@ -595,7 +622,7 @@ async function main() {
           : toolArgs.filename + (toolArgs.query ? `, "${toolArgs.query}"` : '');
         log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
       },
-      onToolResult(ai, toolName, resultStr) {
+      onToolResult(ai: number, toolName: string, resultStr: string) {
         emit('tool_result', {
           agentIndex: ai, toolName,
           result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr,
@@ -607,7 +634,7 @@ async function main() {
   for (let i = 0; i < agents.length; i++) {
     const w = agents[i];
     const isLast = i === agents.length - 1;
-    const branch = isLast ? '└' : '├';
+    const branchChar = isLast ? '└' : '├';
 
     emit('agent_done', {
       index: i,
@@ -618,7 +645,7 @@ async function main() {
       tokenCount: w.tokenCount,
     });
 
-    log(`    ${c.dim}${branch}${c.reset} ${c.yellow}${i}${c.reset} ${c.green}done${c.reset} ${c.dim}${w.tokenCount} tok · ${w.toolCallCount} tools${c.reset}`);
+    log(`    ${c.dim}${branchChar}${c.reset} ${c.yellow}${i}${c.reset} ${c.green}done${c.reset} ${c.dim}${w.tokenCount} tok · ${w.toolCallCount} tools${c.reset}`);
 
     await w.branch.prune();
   }
@@ -653,7 +680,7 @@ async function main() {
   log();
   log(`  ${c.green}●${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${VERIFY_COUNT} attempts · shared prefix ${synthTokens.length} tok${c.reset}`);
 
-  const attempts = [];
+  const attempts: Attempt[] = [];
   for (let i = 0; i < VERIFY_COUNT; i++) {
     const branch = await synthRoot.fork();
     branch.reseedSampler(2000 + i);
@@ -663,7 +690,7 @@ async function main() {
 
   let verifySteps = 0;
   for (;;) {
-    const entries = [];
+    const entries: [InstanceType<typeof Branch>, number][] = [];
     for (const a of attempts) {
       if (a.done) continue;
       const { token, text, isStop } = a.branch.produceSync();
@@ -685,7 +712,7 @@ async function main() {
   const totalVerifyTokens = attempts.reduce((s, a) => s + a.tokenCount, 0);
   for (let i = 0; i < attempts.length; i++) {
     const isLast = i === attempts.length - 1;
-    const branch = isLast ? '└' : '├';
+    const branchChar = isLast ? '└' : '├';
 
     emit('attempt_done', {
       index: i,
@@ -694,7 +721,7 @@ async function main() {
       ppl: attempts[i].ppl,
     });
 
-    log(`    ${c.dim}${branch} ${attempts[i].tokenCount} tok · ppl ${attempts[i].ppl.toFixed(2)}${c.reset}`);
+    log(`    ${c.dim}${branchChar} ${attempts[i].tokenCount} tok · ppl ${attempts[i].ppl.toFixed(2)}${c.reset}`);
   }
 
-  // Pick lowest perplexity synthesis (most coherent) — same as best-of-n.mjs
+  // Pick lowest perplexity synthesis (most coherent) — standard best-of-N selection
@@ -746,7 +773,7 @@ async function main() {
   }
   await evalBranch.prune();
 
-  let converged;
+  let converged: boolean | null;
   try {
     converged = JSON.parse(evalOutput).converged;
   } catch {
@@ -812,7 +839,7 @@ async function main() {
   if (jsonlMode) {
     await bestAttempt.branch.prune();
     await synthRoot.prune();
-    rerankCtx.dispose();
+    rerankCtx!.dispose();
     ctx.dispose();
     return;
   }
@@ -829,28 +856,28 @@ async function main() {
   log(`  ${c.dim}Ask a follow-up question or /quit to exit${c.reset}`);
   log();
 
-  await new Promise((resolve) => {
+  await new Promise<void>((resolve) => {
     const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
     let exiting = false;
     let generating = false;
     let eofWhileGenerating = false;
 
-    async function exit() {
+    async function exit(): Promise<void> {
       if (exiting) return;
       exiting = true;
       rl.close();
       await session.dispose();
-      rerankCtx.dispose();
+      rerankCtx!.dispose();
       ctx.dispose();
       resolve();
     }
 
-    const ask = () => {
+    const ask = (): void => {
       if (exiting) return;
       rl.question(`  ${c.dim}>${c.reset} `, handleInput);
     };
 
-    async function handleInput(input) {
+    async function handleInput(input: string): Promise<void> {
       try {
       const trimmed = input.trim();
       if (!trimmed || trimmed === '/quit') {
@@ -865,9 +892,9 @@ async function main() {
       // naturally), gets reseeded for search diversity.
       log(`  ${c.dim}  researching...${c.reset}`);
 
-      const followUpAgents = [];
+      const followUpAgents: AgentState[] = [];
       for (let i = 0; i < AGENT_COUNT; i++) {
-        const agent = await forkAgent(session.trunk, {
+        const agent = await forkAgent(session.trunk!, {
           systemPrompt: AGENT_SYSTEM_PROMPT,
           content: trimmed,
           tools: TOOLS_JSON,
@@ -884,9 +911,9 @@ async function main() {
         store, ctx,
         executeTool: executeToolLocked,
         maxTurns: MAX_TOOL_TURNS,
-        onToolCall(ai, toolName, args) {
+        onToolCall(ai: number, toolName: string, args: string) {
           emit('tool_call', { agentIndex: ai, toolName, arguments: args });
-          let toolArgs;
+          let toolArgs: Record<string, string>;
           try { toolArgs = JSON.parse(args); } catch { toolArgs = {}; }
           const argSummary = toolName === 'search'
             ? `"${toolArgs.query || ''}"`
@@ -894,7 +921,7 @@ async function main() {
             : toolArgs.filename + (toolArgs.query ? `, "${toolArgs.query}"` : '');
           log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
         },
-        onToolResult(ai, toolName, resultStr) {
+        onToolResult(ai: number, toolName: string, resultStr: string) {
           emit('tool_result', { agentIndex: ai, toolName,
             result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr });
           log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`);
@@ -921,7 +948,7 @@ async function main() {
 
       // Generate grounded response
       process.stdout.write(`  ${c.dim}<${c.reset} `);
-      for await (const { text } of session.trunk) {
+      for await (const { text } of session.trunk!) {
         process.stdout.write(text);
       }
       console.log('\n');
@@ -934,7 +961,7 @@ async function main() {
         ask();
       }
       } catch (err) {
-        log(`  ${c.red}Error: ${err.message}${c.reset}`);
+        log(`  ${c.red}Error: ${(err as Error).message}${c.reset}`);
         generating = false;
         ask();
       }
@@ -951,8 +978,8 @@ async function main() {
   });
 }
 
-main().catch((err) => {
+main().catch((err: unknown) => {
   // stderr is redirected in quiet mode — use stdout for errors
-  process.stdout.write(`Error: ${err.message}\n${err.stack}\n`);
+  process.stdout.write(`Error: ${err instanceof Error ? `${err.message}\n${err.stack}` : String(err)}\n`);
   process.exit(1);
 });
diff --git a/examples/embed/embed.mjs b/examples/embed/embed.ts
similarity index 84%
rename from examples/embed/embed.mjs
rename to examples/embed/embed.ts
index ce79da0..bf4ac09 100644
--- a/examples/embed/embed.mjs
+++ b/examples/embed/embed.ts
@@ -3,9 +3,9 @@
  * Embedding extraction example using lloyal.node
  *
  * Usage:
- *   node embed.mjs /path/to/embedding-model.gguf          # Human-readable output
- *   node embed.mjs /path/to/embedding-model.gguf --jsonl  # JSONL output for testing
- *   node embed.mjs  # uses default nomic-embed model path
+ *   npx tsx embed.ts /path/to/embedding-model.gguf          # Human-readable output
+ *   npx tsx embed.ts /path/to/embedding-model.gguf --jsonl  # JSONL output for testing
+ *   npx tsx embed.ts  # uses default nomic-embed model path
  *
  * This example demonstrates:
  * - Creating an embedding context with pooling enabled
@@ -14,10 +14,8 @@
  */
 
 import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext } from '../../lib/index.js';
-
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
+import { createContext, PoolingType } from '../../dist/index.js';
+import type { SessionContext } from '../../dist/index.js';
 
 // Default to nomic-embed-text model in fixtures
 const DEFAULT_MODEL = path.resolve(
@@ -31,24 +29,16 @@ const jsonlMode = args.includes('--jsonl');
 const modelPath = args.find(a => !a.startsWith('--')) || DEFAULT_MODEL;
 
 /** Emit output - JSONL or human-readable */
-function emit(event, data) {
+function emit(event: string, data: Record<string, unknown>): void {
   if (jsonlMode) {
     console.log(JSON.stringify({ event, ...data }));
   }
 }
 
-// Pooling types (matches llama.cpp LLAMA_POOLING_TYPE_*)
-const PoolingType = {
-  NONE: 0,
-  MEAN: 1,
-  CLS: 2,
-  LAST: 3,
-};
-
 /**
  * Compute cosine similarity between two vectors
  */
-function cosineSimilarity(a, b) {
+function cosineSimilarity(a: Float32Array, b: Float32Array): number {
   if (a.length !== b.length) {
     throw new Error('Vectors must have same dimension');
   }
@@ -73,7 +63,7 @@ function cosineSimilarity(a, b) {
 /**
  * Get embedding for a text
  */
-async function getEmbedding(ctx, text) {
+async function getEmbedding(ctx: SessionContext, text: string): Promise<Float32Array> {
   // Tokenize the text
   const tokens = await ctx.tokenize(text);
 
@@ -89,7 +79,7 @@ async function getEmbedding(ctx, text) {
   return embedding;
 }
 
-async function main() {
+async function main(): Promise<void> {
   if (!jsonlMode) {
     console.log('='.repeat(60));
     console.log('lloyal.node Embedding Example');
@@ -134,7 +124,7 @@ async function main() {
   }
 
   // Get embeddings for all texts
-  const embeddings = [];
+  const embeddings: { text: string; embedding: Float32Array }[] = [];
   for (const text of texts) {
     const start = performance.now();
     const embedding = await getEmbedding(ctx, text);
@@ -167,7 +157,7 @@ async function main() {
       emit('similarity', { i, j, similarity: sim });
 
       if (!jsonlMode) {
-        const bar = '█'.repeat(Math.round(sim * 20));
+        const bar = '\u2588'.repeat(Math.round(sim * 20));
         console.log(`  [${i}] vs [${j}]: ${sim.toFixed(4)} ${bar}`);
         console.log(`      "${texts[i].substring(0, 30)}..."`);
         console.log(`      "${texts[j].substring(0, 30)}..."`);
@@ -204,7 +194,7 @@ async function main() {
   if (!jsonlMode) {
     console.log('Results (ranked by similarity):\n');
     ranked.forEach((result, i) => {
-      const bar = '█'.repeat(Math.round(result.similarity * 20));
+      const bar = '\u2588'.repeat(Math.round(result.similarity * 20));
       console.log(`  ${i + 1}. ${result.similarity.toFixed(4)} ${bar}`);
       console.log(`     "${result.text}"`);
       console.log();
@@ -222,7 +212,7 @@ async function main() {
 }
 
 main().catch((err) => {
-  console.error('Error:', err.message);
-  console.error(err.stack);
+  console.error('Error:', err instanceof Error ? err.message : String(err));
+  if (err instanceof Error) console.error(err.stack);
   process.exit(1);
 });
diff --git a/examples/entropy/entropy.mjs b/examples/entropy/entropy.ts
similarity index 84%
rename from examples/entropy/entropy.mjs
rename to examples/entropy/entropy.ts
index 7618567..cdfd5fd 100644
--- a/examples/entropy/entropy.mjs
+++ b/examples/entropy/entropy.ts
@@ -14,15 +14,14 @@
  *
  *
  * Usage:
- *   node entropy.mjs [model-path]          # Human-readable output
- *   node entropy.mjs [model-path] --jsonl  # JSONL output for testing
+ *   npx tsx entropy.ts [model-path]          # Human-readable output
+ *   npx tsx entropy.ts [model-path] --jsonl  # JSONL output for testing
  */
 
 import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext, Branch } from '../../lib/index.js';
+import { createContext, Branch } from '../../dist/index.js';
+import type { SessionContext } from '../../dist/index.js';
 
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
   __dirname,
   '../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
@@ -34,7 +33,7 @@ const jsonlMode = args.includes('--jsonl');
 const modelPath = args.find(a => !a.startsWith('--')) || DEFAULT_MODEL;
 
 /** Emit output - JSONL or human-readable */
-function emit(event, data) {
+function emit(event: string, data: Record<string, unknown>): void {
   if (jsonlMode) {
     console.log(JSON.stringify({ event, ...data }));
   }
@@ -48,7 +47,7 @@ const THETA = 1.5; // Scale factor
 /**
  * Calculate EDT temperature from entropy
  */
-function edtTemperature(entropy) {
+function edtTemperature(entropy: number): number {
   const safeEntropy = Math.max(entropy, 0.1);
   return T0 * Math.pow(N, THETA / safeEntropy);
 }
@@ -59,7 +58,7 @@ function edtTemperature(entropy) {
  * Uses Branch API with per-token setSamplerParams() for EDT adaptation.
  * Each token gets a temperature computed from the current logit entropy.
  */
-async function generate(ctx, prompt, strategy, strategyName, maxTokens = 50) {
+async function generate(ctx: SessionContext, prompt: string, strategy: number | 'edt', strategyName: string, maxTokens: number = 50): Promise<{text: string; avgEntropy: number; avgTemp: number; tokenCount: number; temps: number[]; entropies: number[]}> {
   const messages = [{ role: 'user', content: prompt }];
   const { prompt: formatted } = await ctx.formatChat(JSON.stringify(messages));
   const tokens = await ctx.tokenize(formatted);
@@ -68,9 +67,9 @@ async function generate(ctx, prompt, strategy, strategyName, maxTokens = 50) {
   const branch = Branch.create(ctx, 0, { temperature: baseTemp, topP: 0.9 });
   await branch.prefill(tokens);
 
-  const output = [];
-  const temps = [];
-  const entropies = [];
+  const output: number[] = [];
+  const temps: number[] = [];
+  const entropies: number[] = [];
 
   for (let i = 0; i < maxTokens; i++) {
     const entropy = branch.modelEntropy('nats');
@@ -100,10 +99,12 @@ async function generate(ctx, prompt, strategy, strategyName, maxTokens = 50) {
   return { text, avgEntropy, avgTemp, tokenCount: output.length, temps, entropies };
 }
 
+type GenerateResult = Awaited<ReturnType<typeof generate>>;
+
 /**
  * Run comparison for a single prompt
  */
-async function compareStrategies(ctx, prompt, label) {
+async function compareStrategies(ctx: SessionContext, prompt: string, label: string): Promise<{fixed: GenerateResult; edt: GenerateResult}> {
   if (!jsonlMode) {
     console.log(`\n${'='.repeat(70)}`);
     console.log(`${label}: "${prompt}"`);
@@ -152,7 +153,7 @@ async function compareStrategies(ctx, prompt, label) {
   return { fixed, edt };
 }
 
-async function main() {
+async function main(): Promise<void> {
   if (!jsonlMode) {
     console.log('EDT vs Fixed Temperature Comparison');
     console.log('Based on Zhang et al. 2024: https://arxiv.org/abs/2403.14541\n');
@@ -209,7 +210,7 @@ don't add randomness - let it output what it knows.
 }
 
 main().catch((err) => {
-  console.error('Error:', err.message);
-  console.error(err.stack);
+  console.error('Error:', err instanceof Error ? err.message : String(err));
+  if (err instanceof Error) console.error(err.stack);
   process.exit(1);
 });
diff --git a/examples/grammar/README.md b/examples/grammar/README.md
deleted file mode 100644
index 57ac23a..0000000
--- a/examples/grammar/README.md
+++ /dev/null
@@ -1,77 +0,0 @@
-# Grammar-Constrained Generation with Branch Forking
-
-Demonstrates grammar-constrained generation using the Branch API with automatic grammar cloning on fork.
-
-## Run It
-
-```bash
-node grammar.mjs
-```
-
-## What You'll See
-
-```
-Generating until "city" field...
-  {
-  "name": "John Doe",
-  "age": 30,
-  "city":
-
-Forking into 3 branches at branch point...
-
-  [NYC branch]: { "name": "John Doe", "age": 30, "city": "Seattle" }
-  [LA branch]: { "name": "John Doe", "age": 30, "city": "Chicago" }
-  [Chicago branch]: { "name": "John Doe", "age": 30, "city": "LA" }
-```
-
-## The Branch Fork Pattern
-
-Grammar state is integrated into the branch and cloned automatically on fork:
-
-```javascript
-// Create root branch with grammar constraint
-const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
-const root = Branch.create(ctx, 0, params, undefined, grammar);
-await root.prefill(promptTokens);
-
-// Generate until branch point
-for (let i = 0; i < 100; i++) {
-  const { token, text, isStop } = await root.produce();
-  if (isStop) break;
-  await root.commit(token);
-  if (accumulated.includes('"city"')) break;
-}
-
-// Fork — grammar state cloned automatically
-for (const city of cities) {
-  const child = await root.fork();
-  child.reseedSampler(seed++);
-
-  for await (const { text } of child) {
-    // Each branch generates independently with its own grammar state
-  }
-  await child.prune();
-}
-await root.prune();
-```
-
-## Why Branch Fork Here?
-
-For grammar-constrained branching, fork handles everything atomically:
-- **KV cache**: Shared prefix, divergent-only storage per branch
-- **Grammar state**: Parser position cloned automatically
-- **Sampler chain**: Penalties and PRNG cloned and reseeded
-
-No manual KV save/load or grammar cloning needed — `fork()` is a single operation.
-
-## Key APIs
-
-| Method | Description |
-|--------|-------------|
-| `Branch.create(ctx, pos, params, nBatch, grammar)` | Create branch with grammar constraint |
-| `branch.fork()` | Clone branch: KV prefix + grammar + sampler |
-| `branch.reseedSampler(seed)` | Diversify forked branch's PRNG |
-| `branch.produce()` | Sample grammar-valid token |
-| `branch.commit(token)` | Advance grammar + KV state |
-| `branch.prune()` | Clean up branch resources |
-| `ctx.jsonSchemaToGrammar(json)` | Convert JSON schema to GBNF grammar |
diff --git a/examples/grammar/grammar.mjs b/examples/grammar/grammar.mjs
deleted file mode 100644
index 6f96f2c..0000000
--- a/examples/grammar/grammar.mjs
+++ /dev/null
@@ -1,177 +0,0 @@
-#!/usr/bin/env node
-/**
- * Grammar-constrained generation with forkable state
- *
- * Uses Branch API for grammar-constrained generation with tree branching.
- * Grammar state is automatically cloned on fork(), so each branch can
- * diverge independently while maintaining valid JSON output.
- *
- * Usage:
- *   node grammar.mjs [model-path]          # Human-readable output
- *   node grammar.mjs [model-path] --jsonl  # JSONL output for testing
- */
-
-import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext, Branch } from '../../lib/index.js';
-
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
-const DEFAULT_MODEL = path.resolve(
-  __dirname,
-  '../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
-);
-
-// Parse args
-const args = process.argv.slice(2);
-const jsonlMode = args.includes('--jsonl');
-const modelPath = args.find(a => !a.startsWith('--')) || DEFAULT_MODEL;
-
-/** Emit output - JSONL or human-readable */
-function emit(event, data) {
-  if (jsonlMode) {
-    console.log(JSON.stringify({ event, ...data }));
-  }
-}
-
-async function main() {
-  if (!jsonlMode) {
-    console.log(`Loading model: ${path.basename(modelPath)}`);
-  }
-
-  emit('start', { model: path.basename(modelPath) });
-
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
-  const ctx = await createContext({
-    modelPath,
-    nCtx,
-    nSeqMax: 4,
-  });
-
-  // JSON schema with enum for branching demo
-  const schema = {
-    type: 'object',
-    properties: {
-      name: { type: 'string' },
-      age: { type: 'number' },
-      city: { enum: ['NYC', 'LA', 'Chicago', 'Seattle'] },
-    },
-    required: ['name', 'age', 'city'],
-  };
-
-  if (!jsonlMode) {
-    console.log('\nJSON Schema:');
-    console.log(JSON.stringify(schema, null, 2));
-  }
-
-  const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
-  if (!jsonlMode) {
-    console.log('\nGBNF Grammar (first 200 chars):');
-    console.log(grammar.slice(0, 200) + '...\n');
-  }
-
-  const prompt = 'Generate a person as JSON:\n';
-  if (!jsonlMode) {
-    console.log(`Prompt: "${prompt}"`);
-  }
-
-  const tokens = await ctx.tokenize(prompt);
-
-  // Root branch with grammar constraint — grammar state cloned automatically on fork()
-  const root = Branch.create(ctx, 0, { temperature: 0.7, topP: 0.9 }, undefined, grammar);
-  await root.prefill(tokens);
-
-  // ===== PHASE 1: Generate until we see "city" key =====
-  if (!jsonlMode) {
-    console.log('\nGenerating until "city" field...');
-    process.stdout.write('  ');
-  }
-
-  let accumulated = '';
-
-  for (let i = 0; i < 100; i++) {
-    const { token, text, isStop } = await root.produce();
-    if (isStop) break;
-
-    accumulated += text;
-    if (!jsonlMode) {
-      process.stdout.write(text);
-    }
-    emit('token', { phase: 'prefix', token, text });
-
-    await root.commit(token);
-
-    // Stop when we see "city": - we want to branch here
-    if (accumulated.includes('"city"')) {
-      break;
-    }
-  }
-  if (!jsonlMode) {
-    console.log('\n');
-  }
-
-  // ===== PHASE 2: Fork and complete with different branches =====
-  const cities = ['NYC', 'LA', 'Chicago'];
-  if (!jsonlMode) {
-    console.log(`Forking into ${cities.length} branches at branch point...\n`);
-  }
-
-  emit('branch_point', { prefix: accumulated, position: root.position });
-
-  const results = [];
-  for (const city of cities) {
-    const child = await root.fork();
-    child.reseedSampler(results.length + 42);
-
-    let branchText = '';
-    for (let i = 0; i < 30; i++) {
-      const { token, text, isStop } = await child.produce();
-      if (isStop) break;
-
-      branchText += text;
-      emit('token', { phase: 'branch', city, token, text });
-
-      await child.commit(token);
-    }
-
-    const fullOutput = accumulated + branchText;
-    results.push({ city, output: fullOutput });
-
-    if (!jsonlMode) {
-      console.log(`  [${city} branch]: ${fullOutput}`);
-    }
-    emit('branch_complete', { city, output: fullOutput });
-
-    await child.prune();
-  }
-
-  await root.prune();
-
-  // Validate JSON outputs
-  let validJsonCount = 0;
-  for (const b of results) {
-    try {
-      JSON.parse(b.output);
-      validJsonCount++;
-    } catch {
-      // Invalid JSON
-    }
-  }
-
-  emit('complete', {
-    branchCount: results.length,
-    validJsonCount,
-    branches: results.map(b => ({ city: b.city, output: b.output })),
-  });
-
-  ctx.dispose();
-
-  if (!jsonlMode) {
-    console.log('\nDone.');
-  }
-}
-
-main().catch((err) => {
-  console.error('Error:', err.message);
-  console.error(err.stack);
-  process.exit(1);
-});
diff --git a/examples/speculative/README.md b/examples/speculative/README.md
deleted file mode 100644
index 7433c77..0000000
--- a/examples/speculative/README.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Speculative Decoding with Branch API
-
-Demonstrates speculative decoding using the Branch primitive: fork a draft, verify, accept/reject, sample bonus token.
-
-## Run It
-
-```bash
-node speculative.mjs
-```
-
-## What You'll See
-
-```
-Prompt: "The quick brown fox"
-
-Generating 30 tokens with speculative decoding...
-
-The quick brown fox jumps over the lazy dog. The dog...
-
-==================================================
-Statistics
-==================================================
-  Iterations: 13
-  Tokens drafted: 48
-  Tokens accepted: 6
-  Accept rate: 12.5%
-  Output tokens: 30
-```
-
-## How It Works
-
-| Phase | What Happens |
-|-------|--------------|
-| **1. MAIN** | Create main branch tracking committed state |
-| **2. FORK** | Fork draft branch (shares KV prefix with main) |
-| **3. DRAFT** | produce/commit N tokens on draft branch |
-| **4. VERIFY** | Check draft confidence (entropy threshold) |
-| **5. PRUNE** | Remove draft branch (cleans up divergent KV) |
-| **6. ACCEPT** | Commit accepted tokens to main branch |
-| **7. BONUS** | Sample one token from main at rejection point |
-
-## Key Pattern: Fork/Draft/Verify with Branch API
-
-```javascript
-// Main branch tracks committed state
-const main = Branch.create(ctx, 0, { temperature: 0.7 });
-await main.prefill(promptTokens);
-
-while (output.length < maxTokens) {
-  // Fork draft from main — shares KV prefix
-  const draft = await main.fork();
-  draft.reseedSampler(iteration);
-
-  // Draft N tokens
-  const drafts = [];
-  for (let i = 0; i < N; i++) {
-    const entropy = ctx.modelEntropy('nats', draft.getLogits());
-    const { token, text, isStop } = draft.produceSync();
-    if (isStop) break;
-    drafts.push({ token, text, entropy });
-    await draft.commit(token);
-  }
-
-  // Verify and prune draft
-  const acceptedCount = verify(drafts);
-  await draft.prune();
-
-  // Commit accepted tokens to main
-  for (const d of drafts.slice(0, acceptedCount)) {
-    await main.commit(d.token);
-  }
-
-  // Bonus token from main at rejection point
-  if (acceptedCount < drafts.length) {
-    const { token } = main.produceSync();
-    await main.commit(token);
-  }
-}
-await main.prune();
-```
-
-## Why Branch API?
-
-The produce/commit separation is what makes speculative decoding natural:
-
-- **produce()** samples without writing to KV — inspect before deciding
-- **commit()** accepts + decodes — advance state only for accepted tokens
-- **fork()** shares KV prefix — draft branch doesn't duplicate the prompt
-- **prune()** removes divergent KV — clean rejection without manual bookkeeping
-
-## Key APIs
-
-| Method | Description |
-|--------|-------------|
-| `Branch.create(ctx, pos, params)` | Create branch at position |
-| `branch.fork()` | Fork: shared KV prefix + cloned sampler |
-| `branch.produce()` | Sample without KV write |
-| `branch.commit(token)` | Accept + decode into KV |
-| `branch.prune()` | Remove divergent KV entries |
-| `branch.reseedSampler(seed)` | Diversify forked branch |
-| `ctx.modelEntropy('nats', logits)` | Check draft confidence |
-
-## Accept Rate
-
-The accept rate determines speedup:
-
-| Accept Rate | Meaning |
-|-------------|---------|
-| High (>70%) | Draft model matches target well - good speedup |
-| Low (<30%) | Draft model diverges - minimal speedup |
-
-This example uses entropy-based simulation (not a real draft model), so accept rates are low. With a properly trained draft model, rates of 60-80% are achievable.
-
-## References
-
-- [Leviathan et al. 2023](https://arxiv.org/abs/2211.17192) - "Fast Inference from Transformers via Speculative Decoding"
-- [Chen et al. 2023](https://arxiv.org/abs/2302.01318) - "Accelerating LLM Decoding with Speculative Sampling"
diff --git a/examples/speculative/speculative.mjs b/examples/speculative/speculative.mjs
deleted file mode 100644
index 93bc111..0000000
--- a/examples/speculative/speculative.mjs
+++ /dev/null
@@ -1,271 +0,0 @@
-#!/usr/bin/env node
-/**
- * Speculative Decoding with Branch API
- *
- * This example demonstrates speculative decoding using the Branch primitive:
- * - Main branch tracks committed state
- * - Fork a draft branch for speculative generation
- * - Prune draft on rejection, commit accepted tokens to main
- * - Sample bonus token from main at rejection point
- *
- * Real speculative decoding uses a small "draft" model and large "target" model.
- * This example uses the same model for both (demonstrating the mechanics, not speedup).
- *
- * Branch API Benefits:
- * - Atomic fork: KV + logits + sampler + perplexity cloned together
- * - produce/commit separation: sample without KV write, then commit
- * - Shared prefix: forked branches share KV for common prefix
- * - Clean cleanup: prune() removes divergent KV entries
- *
- * References:
- * - Leviathan et al. 2023 "Fast Inference from Transformers via Speculative Decoding"
- * - Chen et al. 2023 "Accelerating Large Language Model Decoding with Speculative Sampling"
- *
- * Usage:
- *   node speculative.mjs [model-path]          # Human-readable output
- *   node speculative.mjs [model-path] --jsonl  # JSONL output for testing
- */
-
-import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext, Branch } from '../../lib/index.js';
-
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
-const DEFAULT_MODEL = path.resolve(
-  __dirname,
-  '../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
-);
-
-// Parse args
-const args = process.argv.slice(2);
-const jsonlMode = args.includes('--jsonl');
-const modelPath = args.find((a) => !a.startsWith('--')) || DEFAULT_MODEL;
-
-/** Emit output - JSONL or human-readable */
-function emit(event, data) {
-  if (jsonlMode) {
-    console.log(JSON.stringify({ event, ...data }));
-  }
-}
-
-/**
- * Simulate speculative decoding verification
- *
- * In real speculative decoding:
- * - Draft model generates N tokens quickly (small model or n-gram)
- * - Target model scores all N tokens in a single batch
- * - Compare: if target agrees with draft, accept; else reject and use target's token
- *
- * Here we simulate by accepting tokens with probability based on draft confidence.
- */
-function simulateVerification(drafts) {
-  // In production: compare draft probabilities to target probabilities
-  // Here: accept high-confidence drafts (low entropy), reject uncertain ones
-  let accepted = 0;
-
-  for (const draft of drafts) {
-    // Simulate: accept if draft was "confident" (entropy < threshold)
-    // Real implementation would compare P_target(token) vs P_draft(token)
-    if (draft.entropy < 2.0) {
-      accepted++;
-    } else {
-      break; // First rejection stops the chain
-    }
-  }
-
-  return accepted;
-}
-
-async function main() {
-  const DRAFT_COUNT = 4;
-  const GENERATION_LENGTH = 30;
-
-  if (!jsonlMode) {
-    console.log('Speculative Decoding Demo (Branch API)');
-    console.log('======================================\n');
-    console.log(`Loading model: ${path.basename(modelPath)}`);
-  }
-
-  emit('start', {
-    model: path.basename(modelPath),
-    draftCount: DRAFT_COUNT,
-    generationLength: GENERATION_LENGTH,
-  });
-
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
-  const ctx = await createContext({
-    modelPath,
-    nCtx,
-    nSeqMax: 4, // Enable multi-sequence for fork/verify pattern
-  });
-
-  const prompt = 'The quick brown fox';
-  if (!jsonlMode) {
-    console.log(`\nPrompt: "${prompt}"`);
-  }
-
-  // Prefill prompt via main branch
-  const promptTokens = await ctx.tokenize(prompt);
-
-  const main = Branch.create(ctx, 0, {
-    temperature: 0.7, // For bonus token sampling
-  });
-  await main.prefill(promptTokens);
-
-  const output = [];
-  let totalDrafted = 0;
-  let totalAccepted = 0;
-  let iterations = 0;
-
-  if (!jsonlMode) {
-    console.log(
-      `\nGenerating ${GENERATION_LENGTH} tokens with speculative decoding...\n`
-    );
-    process.stdout.write(prompt);
-  }
-
-  while (output.length < GENERATION_LENGTH) {
-    iterations++;
-
-    // === DRAFT PHASE ===
-    // Fork main branch for speculative drafting
-    // Draft branch shares KV prefix with main, diverges as it generates
-    const draft = await main.fork();
-    draft.reseedSampler(iterations); // Different seed each iteration for diversity
-
-    const drafts = [];
-
-    for (let i = 0; i < DRAFT_COUNT && output.length + drafts.length < GENERATION_LENGTH; i++) {
-      // Get entropy BEFORE sampling (from draft branch's logits snapshot)
-      const entropy = draft.modelEntropy('nats');
-
-      // produce() samples from captured logits (no KV write yet)
-      const { token, text, isStop } = draft.produceSync();
-
-      if (isStop) break;
-
-      drafts.push({ token, text, entropy });
-
-      // commit() accepts token + decodes + captures new logits
-      await draft.commit(token);
-    }
-
-    if (drafts.length === 0) {
-      await draft.prune();
-      break;
-    }
-    totalDrafted += drafts.length;
-
-    // === VERIFY PHASE ===
-    // Simulate verification - in production this compares draft vs target distributions
-    const acceptedCount = simulateVerification(drafts);
-    totalAccepted += acceptedCount;
-
-    // === CLEANUP DRAFT ===
-    // Prune draft branch - removes its divergent KV entries
-    // Main branch is unchanged (still at pre-draft position)
-    await draft.prune();
-
-    // === ACCEPT PHASE ===
-    // Commit accepted tokens to main branch
-    const accepted = drafts.slice(0, acceptedCount);
-    for (const d of accepted) {
-      await main.commit(d.token);
-      if (!jsonlMode) {
-        process.stdout.write(d.text);
-      }
-      emit('token', {
-        token: d.token,
-        text: d.text,
-        entropy: d.entropy,
-        accepted: true,
-      });
-      output.push(d.token);
-    }
-
-    // === BONUS TOKEN ===
-    // If we rejected some drafts, sample a bonus token from main
-    // Main is now at the accepted position with fresh logits
-    const rejected = drafts.slice(acceptedCount);
-    if (rejected.length > 0) {
-      // produce() samples from main's current logits (at rejection point)
-      const { token: bonusToken, text: bonusText, isStop } = main.produceSync();
-
-      if (!isStop) {
-        await main.commit(bonusToken);
-        if (!jsonlMode) {
-          process.stdout.write(bonusText);
-        }
-        emit('token', { token: bonusToken, text: bonusText, bonus: true });
-        output.push(bonusToken);
-      }
-    }
-
-    emit('iteration', {
-      iteration: iterations,
-      drafted: drafts.length,
-      accepted: acceptedCount,
-      rejected: rejected.length,
-      hasBonus: rejected.length > 0,
-    });
-
-    // Check for natural stopping
-    if (output.length > 0 && ctx.isStopToken(output[output.length - 1])) {
-      break;
-    }
-  }
-
-  // Cleanup main branch
-  await main.prune();
-
-  // Statistics
-  const acceptRate = totalDrafted > 0 ? totalAccepted / totalDrafted : 0;
-
-  emit('complete', {
-    iterations,
-    totalDrafted,
-    totalAccepted,
-    acceptRate,
-    outputTokens: output.length,
-  });
-
-  if (!jsonlMode) {
-    console.log('\n');
-    console.log('='.repeat(50));
-    console.log('Statistics');
-    console.log('='.repeat(50));
-    console.log(`  Iterations: ${iterations}`);
-    console.log(`  Tokens drafted: ${totalDrafted}`);
-    console.log(`  Tokens accepted: ${totalAccepted}`);
-    console.log(`  Accept rate: ${(acceptRate * 100).toFixed(1)}%`);
-    console.log(`  Output tokens: ${output.length}`);
-
-    console.log('\n' + '='.repeat(50));
-    console.log('How Speculative Decoding Works (Branch API)');
-    console.log('='.repeat(50));
-    console.log(`
-  1. MAIN:   Create main branch tracking committed state
-  2. FORK:   Fork draft branch (shares KV prefix with main)
-  3. DRAFT:  produce/commit N tokens on draft branch
-  4. VERIFY: Check draft confidence (entropy threshold)
-  5. PRUNE:  Remove draft branch (cleans up divergent KV)
-  6. COMMIT: Apply accepted tokens to main branch
-  7. BONUS:  Sample one token from main at rejection point
-  8. REPEAT: Continue from main's new position
-
-  Branch API Advantages:
-  - Atomic fork: KV + logits + sampler copied together
-  - Shared prefix: Only divergent KV uses extra memory
-  - Clean separation: produce() samples, commit() writes
-  - Easy cleanup: prune() handles KV removal
-`);
-  }
-
-  ctx.dispose();
-}
-
-main().catch((err) => {
-  console.error('Error:', err.message);
-  console.error(err.stack);
-  process.exit(1);
-});
diff --git a/examples/streaming/README.md b/examples/streaming/README.md
deleted file mode 100644
index 2352f60..0000000
--- a/examples/streaming/README.md
+++ /dev/null
@@ -1,217 +0,0 @@
-# Streaming Examples
-
-Advanced streaming patterns for long-form generation with quality preservation.
-
-## Examples Overview
-
-| Example | Purpose | Key Pattern |
-|---------|---------|-------------|
-| `streaming.mjs` | Infinite context generation | BlinkKV reseeding |
-| `streaming-tsampler.mjs` | TypeScript sampling with N-gram tracking | TTA (Test-Time Alignment) |
-| `streaming-summary.mjs` | Dynamic summary sinks | BlinkKV + summary sidecar |
-
----
-
-## streaming.mjs - BlinkKV Infinite Context
-
-Demonstrates generating beyond the context window limit using the BlinkKV reseeding pattern.
-
-### Usage
-
-```bash
-node streaming.mjs /path/to/model.gguf
-```
-
-### Parameters (from BlinkKV paper)
-
-| Parameter | Value | Description |
-|-----------|-------|-------------|
-| Context size | 2048 | Model's context window |
-| Sink tokens | prompt | Structural anchor (entire prompt) |
-| Tail size | 256 | Most recent tokens to retain |
-
-### BlinkKV Pattern
-
-When the KV cache fills:
-1. **Clear** the entire KV cache
-2. **Re-decode sinks** (prompt tokens) at positions [0..N]
-3. **Re-decode tail** (256 most recent) at positions [N+1..N+256]
-4. **Continue** from position N+257
-
-This maintains cache-local position contiguity, which is necessary and sufficient for streaming quality.
-
-### Key APIs
-
-| Method | Description |
-|--------|-------------|
-| `clearAndReseed(sinks, tail)` | Clear cache, re-decode at local positions |
-| `modelSurprisal(token)` | Measure prediction error |
-| `createPerplexityTracker()` | Track quality across stream |
-
----
-
-## streaming-tsampler.mjs - TypeScript Sampling with N-gram Tracking
-
-Demonstrates using tsampler (TypeScript sampling library) with N-gram sequence tracking for repetition detection.
-
-### Usage
-
-```bash
-node streaming-tsampler.mjs /path/to/model.gguf
-```
-
-### Architecture
-
-```
-┌─────────────────────────────────────────────────────────┐
-│  Native Context (llama.cpp)                             │
-│  - KV cache management                                  │
-│  - Logits computation via decode()                      │
-│  - BlinkKV reseeding                                    │
-└─────────────────────────────────────────────────────────┘
-                    │ ctx.getLogits()
-                    ▼
-┌─────────────────────────────────────────────────────────┐
-│  tsampler (TypeScript)                                  │
-│  - sampleWithStrategy() for token selection             │
-│  - Temperature, top-p, top-k filtering                  │
-│  - Xoroshiro128Plus PRNG for reproducibility            │
-└─────────────────────────────────────────────────────────┘
-                    │ sampled token
-                    ▼
-┌─────────────────────────────────────────────────────────┐
-│  NgramTracker (App-level)                               │
-│  - Tracks N-gram sequences (configurable N)             │
-│  - Threshold-based blocking (block after K repeats)     │
-│  - Logit steering: blocked token → -Infinity            │
-└─────────────────────────────────────────────────────────┘
-```
-
-### Key Insight: Token vs Sequence Penalties
-
-llama.cpp's built-in repetition penalties operate at the **token level**, penalizing individual words regardless of context. This degrades prose quality over long generations as common words ("the", "is", "a") accumulate penalties.
-
-Instead, tsampler + N-gram tracking operates at the **sequence level**:
-- Only blocks when an exact N-token sequence repeats
-- Threshold-based: only blocks after K occurrences (not first occurrence)
-- Preserves natural word reuse while preventing actual loops
-
-### tsampler Integration
-
-```javascript
-import {
-  sampleWithStrategy,
-  Xoroshiro128Plus,
-  SamplerWorkspace,
-} from 'tsampler';
-
-const prng = new Xoroshiro128Plus(42);  // Deterministic seed
-const workspace = new SamplerWorkspace(256);
-
-// Get logits from native layer
-const logits = new Float32Array(ctx.getLogits());
-
-// Apply N-gram blocking before sampling
-const blockedToken = ngramTracker.getBlockedToken();
-if (blockedToken !== null) {
-  logits[blockedToken] = -Infinity;
-}
-
-// Sample with tsampler
-const token = sampleWithStrategy(logits, {
-  params: { temperature: 0.8, topP: 0.9 },
-  workspace,
-  prng,
-});
-```
-
-### Configuration
-
-| Parameter | Default | Description |
-|-----------|---------|-------------|
-| `NGRAM_SIZE` | 6 | N-gram length for sequence tracking |
-| `BLOCK_THRESHOLD` | 2 | Block after K occurrences of same pattern |
-
----
-
-## streaming-summary.mjs - Dynamic Summary Sinks
-
-Extends BlinkKV with a slim-summary sidecar that generates cumulative summaries of evicted content. Summaries become sink tokens on reseed, giving the model compressed semantic memory of what it generated beyond the visible tail.
-
-### Usage
-
-```bash
-node streaming-summary.mjs /path/to/model.gguf
-node streaming-summary.mjs /path/to/model.gguf --jsonl
-```
-
-### Architecture
-
-```
-┌─────────────────────────────────────────────────────────┐
-│  Main Context (llama.cpp)                                │
-│  - KV cache management + BlinkKV reseeding               │
-│  - Token generation loop                                 │
-│  - clearAndReseed(sinks, tail) with dynamic sinks        │
-└─────────────────────────────────────────────────────────┘
-           │ evicted text                    │ reseed
-           ▼                                 ▲ sink tokens
-┌─────────────────────────────────┐          │
-│  Summary Sidecar (slim-summary)  │──────────┘
-│  - slim-summarize.gguf (1.7GB)  │
-│  - Prompt: <human>/<summarize>  │
-│  - Output: Python-style list    │
-└─────────────────────────────────┘
-
-After reseed, KV cache layout:
-┌──────────┬─────────────┬───────────────┐
-│  anchor  │   summary   │     tail      │
-│ (prompt) │ (evicted→)  │ (256 recent)  │
-└──────────┴─────────────┴───────────────┘
-```
-
-### Sidecar Prompt Format
-
-The slim-summarize model uses a specific prompt format:
-
-```
-<human>: {text}
-<summarize> key points (5) </summarize>
-<bot>:
-```
-
-Output is a Python-style list: `['point1', 'point2', 'point3']`
-
-When budget is tight, uses `brief description (1)` for a single cohesive summary.
-
-### Budget Management
-
-| Concept | Formula |
-|---------|---------|
-| Max sink tokens | `nCtx * sinkBudgetRatio` (default 0.4 = 819 tokens) |
-| Summary budget | `maxSinkTokens - anchorTokens.length` |
-| Over budget? | Re-summarize with `brief description (1)`, maxTokens=100 |
-
-### Configuration
-
-| Parameter | Default | Description |
-|-----------|---------|-------------|
-| `TAIL_SIZE` | 256 | Most recent tokens to retain |
-| `TARGET_TOKENS` | 5000 | Total tokens to generate |
-| `sinkBudgetRatio` | 0.4 | Fraction of context allocated to sinks |
-| `summaryMaxTokens` | 200 | Max tokens for summary generation |
-
-### Key APIs
-
-| Method | Description |
-|--------|-------------|
-| `clearAndReseed(sinks, tail)` | Clear cache, re-decode sinks + tail |
-| `tokenize(text)` | Tokenize summary text for sink injection |
-| `kvCacheClear()` | Clear sidecar KV before each summary |
-| `formatChat(messages)` | Format anchor message with chat template |
-
----
-
-## References
-
-1. Han et al. 2024 - "LM-Infinite: Zero-Shot Extreme Length Generalization" (BlinkKV)
diff --git a/examples/streaming/streaming-summary.mjs b/examples/streaming/streaming-summary.mjs
deleted file mode 100644
index 27221ca..0000000
--- a/examples/streaming/streaming-summary.mjs
+++ /dev/null
@@ -1,552 +0,0 @@
-#!/usr/bin/env node
-/**
- * Infinite context generation with dynamic summary sinks
- *
- * Usage:
- *   node streaming-summary.mjs [model-path]              # Self-summary (default)
- *   node streaming-summary.mjs [model-path] --sidecar    # Use slim-summarize sidecar
- *   node streaming-summary.mjs [model-path] --jsonl      # JSONL output for testing
- *
- * This example demonstrates:
- * - BlinkKV reseeding with ghostwritten progress sinks
- * - Self-summary: main model summarizes its own evicted content (default)
- * - Sidecar mode: optional slim-summarize model for summarization (--sidecar)
- * - Outline detection with structural progress tracking
- * - Pattern matching (not instruction following) to guide continuation
- * - Branch API for generation (produce/commit loop)
- *
- * After reseed, KV cache contains: [progress][tail]
- * - progress = minimal anchor + checklist of done/current sections + summary
- * - tail     = recent 256 tokens for continuity
- *
- * The progress sink uses "done" / "continue from here" markers that the
- * model pattern-matches against, rather than relying on instruction following.
- */
-
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext, Branch } from '../../lib/index.js';
-
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
-const DEFAULT_MODEL = path.resolve(
-  __dirname,
-  '../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
-);
-const SUMMARY_MODEL = path.resolve(
-  __dirname,
-  '../../models/slim-summarize.gguf'
-);
-
-// Parse args
-const args = process.argv.slice(2);
-const jsonlMode = args.includes('--jsonl');
-const useSidecar = args.includes('--sidecar');
-const modelPath = args.find(a => !a.startsWith('--')) || DEFAULT_MODEL;
-
-// Parse --max-tokens for CI (default 5000)
-const maxTokensArg = args.find(a => a.startsWith('--max-tokens='));
-const TARGET_TOKENS = maxTokensArg ? parseInt(maxTokensArg.split('=')[1], 10) : 5000;
-
-/** Emit output - JSONL or human-readable */
-function emit(event, data) {
-  if (jsonlMode) {
-    console.log(JSON.stringify({ event, ...data }));
-  }
-}
-
-/**
- * Parse slim-summarize output (Python-style list) into readable text
- */
-function parseSummaryOutput(raw) {
-  // Output is Python-style list: ['point1', 'point2', ...]
-  // Items may contain apostrophes (e.g., "It's"), so we can't match between quotes.
-  // Instead, strip outer brackets + quotes, then split on the item boundary: ', '
-  let inner = raw.trim();
-  if (inner.startsWith('[')) inner = inner.slice(1);
-  if (inner.endsWith(']')) inner = inner.slice(0, -1);
-  inner = inner.trim();
-  if (inner.startsWith("'") || inner.startsWith('"')) inner = inner.slice(1);
-  if (inner.endsWith("'") || inner.endsWith('"')) inner = inner.slice(0, -1);
-
-  if (!inner) return raw.trim();
-
-  // Split on quote-comma-quote boundaries (handles apostrophes within items)
-  const items = inner.split(/['"]\s*,\s*['"]/)
-    .map(s => s.trim())
-    .filter(Boolean);
-
-  if (items.length > 0) return items.join('\n');
-  return inner;
-}
-
-/**
- * Generate a summary using sidecar context
- * @param {object} summaryCtx - Context to use for summarization
- * @param {string} text - Text to summarize
- * @param {object} options - Options: maxTokens, brief, format ('self' | 'slim-summarize')
- */
-async function generateSummary(summaryCtx, text, options = {}) {
-  const maxTokens = options.maxTokens || 200;
-  const format = options.format || 'self';
-
-  let tokens;
-
-  if (format === 'self') {
-    // Self-summary: use model's chat template via formatChat()
-    const { prompt } = await summaryCtx.formatChat(
-      JSON.stringify([
-        {
-          role: 'system',
-          content: 'Summarize the following text concisely. List the key points.',
-        },
-        { role: 'user', content: text.slice(-10000) },
-      ])
-    );
-    tokens = await summaryCtx.tokenize(prompt);
-  } else {
-    // slim-summarize prompt format
-    const paramStr = options.brief ? 'brief description (1)' : 'key points (5)';
-    const prompt = `<human> ${text.slice(-10000)}\n<summarize> ${paramStr}</summarize>\n<bot>:`;
-    tokens = await summaryCtx.tokenize(prompt);
-  }
-
-  await summaryCtx.kvCacheClear();
-  const branch = Branch.create(summaryCtx, 0, { temperature: 0.3 });
-  await branch.prefill(tokens);
-
-  let response = '';
-  for (let i = 0; i < maxTokens; i++) {
-    const { token, text: t, isStop } = await branch.produce();
-    if (isStop) break;
-    response += t;
-    await branch.commit(token);
-  }
-  await branch.prune();
-
-  // Only parse slim-summarize Python-style list format
-  return format === 'slim-summarize'
-    ? parseSummaryOutput(response.trim())
-    : response.trim();
-}
-
-/**
- * Parse numbered outline items from prompt text.
- */
-function parseOutline(text) {
-  const items = [];
-  const regex = /^\s*(\d+)\.\s+(.+?)(?:\s*[-–—:]\s*.*)?$/gm;
-  let match;
-  while ((match = regex.exec(text)) !== null) {
-    items.push({
-      number: parseInt(match[1]),
-      title: match[2].trim(),
-    });
-  }
-  return items;
-}
-
-/**
- * Extract instruction part of prompt, before any numbered outline.
- */
-function extractMinimalAnchor(text) {
-  const listMatch = text.match(/^\s*1\.\s/m);
-  if (listMatch && listMatch.index > 0) {
-    return text.slice(0, listMatch.index).trim();
-  }
-  return text.slice(0, 200).trim();
-}
-
-/**
- * Build ghostwritten progress sink.
- * Completed items show "- done", current shows "- continue from here".
- * Model pattern-matches to continue from the right section.
- */
-function buildProgressSink(anchor, outline, allGeneratedText, summaryChain) {
-  const lower = allGeneratedText.toLowerCase();
-
-  let lastCoveredIdx = -1;
-  for (let i = outline.length - 1; i >= 0; i--) {
-    if (lower.includes(outline[i].title.toLowerCase())) {
-      lastCoveredIdx = i;
-      break;
-    }
-  }
-
-  let text = `${anchor}\n\n`;
-
-  for (let i = 0; i < outline.length; i++) {
-    const item = outline[i];
-    if (i < lastCoveredIdx) {
-      text += `${item.number}. ${item.title} - done\n`;
-    } else if (i === lastCoveredIdx) {
-      text += `${item.number}. ${item.title} - continue from here\n`;
-    } else {
-      text += `${item.number}. ${item.title}\n`;
-    }
-  }
-
-  if (summaryChain) {
-    text += `\nKey points so far:\n${summaryChain}\n`;
-  }
-
-  return text;
-}
-
-async function main() {
-  // Constants
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
-  const TAIL_SIZE = 256;
-  const MAX_SINK_RATIO = 0.4;
-  const MAX_SINK_TOKENS = Math.floor(nCtx * MAX_SINK_RATIO);
-  const SUMMARY_MAX_TOKENS = 200;
-
-  // Determine summary mode before emitting start event
-  const summaryFormat = useSidecar ? 'slim-summarize' : 'self';
-
-  if (!jsonlMode) {
-    console.log(`Loading model: ${modelPath}`);
-    console.log(`Summary mode: ${summaryFormat}`);
-  }
-
-  emit('start', {
-    model: path.basename(modelPath),
-    nCtx,
-    tailSize: TAIL_SIZE,
-    maxSinkTokens: MAX_SINK_TOKENS,
-    targetTokens: TARGET_TOKENS,
-    summaryMode: summaryFormat,
-  });
-
-  const ctx = await createContext({
-    modelPath,
-    nCtx,
-  });
-
-  // Summary sidecar — preload in background (overlaps with prompt decode + generation)
-  // Default: "self" mode - second context from same model (weights shared via model_registry)
-  // --sidecar flag: use slim-summarize.gguf instead
-  let summaryCtx = null;
-  let summaryCtxPromise = null;
-  let actualSummaryFormat = summaryFormat;
-
-  if (useSidecar) {
-    // Sidecar mode: use slim-summarize.gguf
-    const summaryModelAvailable = fs.existsSync(SUMMARY_MODEL);
-    if (summaryModelAvailable) {
-      summaryCtxPromise = createContext({ modelPath: SUMMARY_MODEL, nCtx: 4096 });
-    } else {
-      if (!jsonlMode) {
-        console.log('Sidecar model not found - falling back to self-summary');
-      }
-      emit('sidecar_missing', { message: 'slim-summarize.gguf not found, using self-summary' });
-      // Fall back to self mode
-      summaryCtxPromise = createContext({ modelPath, nCtx: 4096 });
-      actualSummaryFormat = 'self';
-    }
-  } else {
-    // Self mode (default): second context from same model
-    // Weights are shared via model_registry — only KV cache is duplicated
-    summaryCtxPromise = createContext({ modelPath, nCtx: 4096 });
-  }
-
-  const prompt = `Write a comprehensive guide to machine learning, covering the following topics in extreme detail with examples, code snippets, and mathematical formulas:
-
-1. Linear Regression - derivation, implementation, regularization
-2. Logistic Regression - binary and multiclass
-3. Neural Networks - backpropagation, activation functions
-4. Convolutional Neural Networks - architectures, pooling, stride
-5. Recurrent Neural Networks - LSTM, GRU, attention
-6. Transformers - self-attention, positional encoding
-7. Optimization - SGD, Adam, learning rate schedules
-8. Regularization - dropout, batch normalization, weight decay
-
-Begin:
-
-# Comprehensive Machine Learning Guide
-
-## Chapter 1: Linear Regression
-
-`;
-
-  // Parse outline for ghostwritten progress sinks
-  const outline = parseOutline(prompt);
-  const minimalAnchor = outline.length > 0
-    ? extractMinimalAnchor(prompt)
-    : null;
-
-  if (!jsonlMode) {
-    console.log(`\nPrompt: "${prompt.slice(0, 100)}..."`);
-    if (outline.length > 0) {
-      console.log(`Outline detected: ${outline.length} sections`);
-      console.log(`Minimal anchor: "${minimalAnchor}"`);
-    }
-  }
-
-  const promptTokens = await ctx.tokenize(prompt);
-
-  // Fallback anchor for prompts without outlines
-  let anchorTokens = null;
-  if (outline.length === 0) {
-    anchorTokens = [...promptTokens];
-  }
-
-  const summaryBudget = outline.length > 0
-    ? MAX_SINK_TOKENS
-    : MAX_SINK_TOKENS - (anchorTokens?.length || 0);
-
-  const samplingParams = { temperature: 0.8, topP: 0.9 };
-  let branch = Branch.create(ctx, 0, samplingParams);
-  await branch.prefill(promptTokens);
-
-  if (!jsonlMode) {
-    console.log(`\nContext size: ${nCtx}`);
-    console.log(`Target tokens: ${TARGET_TOKENS}`);
-    console.log(`Sink budget: ${MAX_SINK_TOKENS} tokens`);
-    console.log(`Tail size: ${TAIL_SIZE}`);
-    console.log(`\nGenerating...\n`);
-    process.stdout.write(prompt);
-  }
-
-  const allTokens = [...promptTokens];
-  // Manual PPL tracking (persists across branch reseeds)
-  let nllSum = 0, nllCount = 0;
-  let reseedCount = 0;
-  let currentSegmentText = '';
-  let allGeneratedText = '';
-  const summaries = [];
-  let pendingSummaryTokens = [];
-
-  for (let t = 0; t < TARGET_TOKENS; t++) {
-    const { token, isStop } = await branch.produce();
-
-    if (isStop) {
-      if (!jsonlMode) {
-        console.log('\n[EOS token reached]');
-      }
-      emit('eos', { tokenIndex: t });
-      break;
-    }
-
-    const surprisal = branch.modelSurprisal(token, 'nats');
-    nllSum += Math.max(0, surprisal);
-    nllCount++;
-
-    const text = ctx.tokenToText(token);
-    if (!jsonlMode) {
-      process.stdout.write(text);
-    }
-    emit('token', { source: 'main', index: t, token, text, surprisal });
-
-    currentSegmentText += text;
-    allGeneratedText += text;
-    allTokens.push(token);
-    await branch.commit(token);
-
-    // Cache full? Reseed with dynamic sinks
-    if (branch.position >= nCtx) {
-      // Estimate evicted portion of current segment only
-      const tailCharsEstimate = TAIL_SIZE * 4;
-      const evictedFromSegment = currentSegmentText.length > tailCharsEstimate
-        ? currentSegmentText.slice(0, -tailCharsEstimate)
-        : '';
-
-      let sinks;
-
-      // Resolve preloaded summary context (should already be loaded by now)
-      if (summaryCtxPromise && !summaryCtx) {
-        summaryCtx = await summaryCtxPromise;
-        const summaryModelName = actualSummaryFormat === 'self' ? path.basename(modelPath) : 'slim-summarize.gguf';
-        if (!jsonlMode) {
-          console.log(`\n  [Summary context loaded: ${summaryModelName} (${actualSummaryFormat} mode)]`);
-        }
-        emit('summary_loaded', { model: summaryModelName, mode: actualSummaryFormat });
-      }
-
-      // Run summary sidecar if available
-      let chainText = null;
-      if (summaryCtx && evictedFromSegment.length > 0) {
-        emit('summary_start', { reseedCount: reseedCount + 1 });
-        const summaryStartTime = Date.now();
-
-        if (!jsonlMode) {
-          process.stdout.write(`\n  [Summarizing ${evictedFromSegment.length} evicted chars (page ${summaries.length + 1})...`);
-        }
-
-        const newPage = await generateSummary(summaryCtx, evictedFromSegment, {
-          maxTokens: SUMMARY_MAX_TOKENS,
-          format: actualSummaryFormat,
-        });
-        summaries.push(newPage);
-        chainText = summaries.join('\n');
-
-        // Fold oldest pages if chain is getting large
-        let testTokens = await ctx.tokenize(chainText);
-        if (testTokens.length > summaryBudget * 0.6) {
-          if (!jsonlMode) {
-            process.stdout.write(' (folding oldest pages)');
-          }
-
-          const foldCount = Math.max(1, Math.ceil(summaries.length / 2));
-          const toFold = summaries.splice(0, foldCount);
-          const folded = await generateSummary(summaryCtx, toFold.join('\n'), {
-            brief: true,
-            maxTokens: 100,
-            format: actualSummaryFormat,
-          });
-          summaries.unshift(folded);
-          chainText = summaries.join('\n');
-        }
-
-        const compressionRatio = evictedFromSegment.length > 0
-          ? (evictedFromSegment.length / newPage.length).toFixed(1)
-          : '0';
-        const durationMs = Date.now() - summaryStartTime;
-
-        emit('summary_complete', {
-          reseedCount: reseedCount + 1,
-          summary: newPage,
-          summaryTokens: (await ctx.tokenize(chainText)).length,
-          compressionRatio: parseFloat(compressionRatio),
-          durationMs,
-          pages: summaries.length,
-        });
-
-        if (!jsonlMode) {
-          process.stdout.write(` ${compressionRatio}x, ${summaries.length} pages]`);
-        }
-      }
-
-      // Build sinks — progress mode (outline detected) or fallback
-      if (outline.length > 0) {
-        const progressText = buildProgressSink(
-          minimalAnchor, outline, allGeneratedText, chainText
-        );
-        let progressTokens = await ctx.tokenize(progressText);
-
-        if (progressTokens.length > MAX_SINK_TOKENS) {
-          // Drop summary details to fit budget
-          const trimmedText = buildProgressSink(
-            minimalAnchor, outline, allGeneratedText, null
-          );
-          progressTokens = await ctx.tokenize(trimmedText);
-        }
-
-        sinks = progressTokens;
-        pendingSummaryTokens = progressTokens;
-
-        if (!jsonlMode) {
-          console.log(`\n  [Progress sink: ${progressTokens.length} tok]`);
-          // Show progress state
-          const lower = allGeneratedText.toLowerCase();
-          let lastIdx = -1;
-          for (let i = outline.length - 1; i >= 0; i--) {
-            if (lower.includes(outline[i].title.toLowerCase())) {
-              lastIdx = i; break;
-            }
-          }
-          if (lastIdx >= 0) {
-            console.log(`  [Sections done: ${lastIdx}, continuing: ${outline[lastIdx].title}]`);
-          }
-        }
-
-        emit('sink_update', {
-          anchorTokens: 0,
-          summaryTokens: progressTokens.length,
-          totalSinkTokens: progressTokens.length,
-          budgetUsed: ((progressTokens.length / MAX_SINK_TOKENS) * 100).toFixed(1),
-          budgetMax: MAX_SINK_TOKENS,
-          pages: summaries.length,
-          mode: 'progress',
-        });
-      } else if (chainText) {
-        const wrapped = `Previously:\n${chainText}\n`;
-        const summaryTokens = await ctx.tokenize(wrapped);
-        sinks = [...anchorTokens, ...summaryTokens];
-        pendingSummaryTokens = summaryTokens;
-
-        if (!jsonlMode) {
-          process.stdout.write(` ${summaryTokens.length} summary tok]`);
-        }
-
-        emit('sink_update', {
-          anchorTokens: anchorTokens.length,
-          summaryTokens: summaryTokens.length,
-          totalSinkTokens: sinks.length,
-          budgetUsed: ((sinks.length / MAX_SINK_TOKENS) * 100).toFixed(1),
-          budgetMax: MAX_SINK_TOKENS,
-          pages: summaries.length,
-          mode: 'anchor',
-        });
-      } else {
-        sinks = [...(anchorTokens || [])];
-      }
-
-      const tail = allTokens.slice(-TAIL_SIZE);
-
-      // Destroy current branch, clear KV, create fresh branch with re-prefill
-      await branch.prune();
-      await ctx.kvCacheClear();
-      branch = Branch.create(ctx, 0, samplingParams);
-      await branch.prefill([...sinks, ...tail]);
-
-      reseedCount++;
-
-      const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
-      emit('reseed', {
-        count: reseedCount,
-        tokenIndex: t + 1,
-        ppl,
-        sinkTokens: sinks.length,
-        tailTokens: TAIL_SIZE,
-        summaryPages: summaries.length,
-        summaryPreview: summaries[summaries.length - 1]?.slice(0, 100) || '',
-      });
-
-      if (!jsonlMode) {
-        console.log(`  [Reseed ${reseedCount} at token ${t + 1}/${TARGET_TOKENS} | PPL: ${ppl.toFixed(2)} | Sinks: ${sinks.length} tok | Pages: ${summaries.length}]`);
-      }
-
-      currentSegmentText = '';
-    }
-
-    // Progress indicator every 1000 tokens
-    if ((t + 1) % 1000 === 0 && reseedCount === 0 && !jsonlMode) {
-      console.log(`\n  [${t + 1}/${TARGET_TOKENS} tokens]`);
-    }
-  }
-
-  const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
-  await branch.prune();
-
-  const generatedTokens = allTokens.length - promptTokens.length;
-  const finalChain = summaries.join('\n');
-  emit('complete', {
-    generatedTokens,
-    reseeds: reseedCount,
-    finalPpl,
-    finalSummary: finalChain.slice(0, 300),
-    finalSummaryTokens: pendingSummaryTokens.length,
-    summaryPages: summaries.length,
-  });
-
-  if (!jsonlMode) {
-    console.log('\n\n' + '='.repeat(50));
-    console.log(`Generated: ${generatedTokens} tokens`);
-    console.log(`Reseeds: ${reseedCount}`);
-    console.log(`Final perplexity: ${finalPpl.toFixed(2)}`);
-    if (summaries.length > 0) {
-      console.log(`Summary pages: ${summaries.length}`);
-      console.log(`Final chain (${pendingSummaryTokens.length} tok): ${finalChain.slice(0, 200)}`);
-    }
-    console.log('='.repeat(50));
-  }
-
-  ctx.dispose();
-  if (summaryCtx) summaryCtx.dispose();
-}
-
-main().catch((err) => {
-  console.error('Error:', err.message);
-  process.exit(1);
-});
diff --git a/examples/streaming/streaming-tsampler.mjs b/examples/streaming/streaming-tsampler.mjs
deleted file mode 100644
index ec41ec5..0000000
--- a/examples/streaming/streaming-tsampler.mjs
+++ /dev/null
@@ -1,326 +0,0 @@
-#!/usr/bin/env node
-/**
- * Infinite context generation with BlinkKV + tsampler N-gram deduplication
- *
- * This example demonstrates:
- * - TypeScript sampling via tsampler (TTA pattern)
- * - N-gram tracking to detect sequence repetition
- * - Logit steering to prevent repeated sequences
- * - Branch API for KV management (prefill/commit)
- * - KV cache clear + re-prefill for infinite context
- *
- * The key insight: llama.cpp's token-level penalties degrade prose quality.
- * Instead, we track N-grams at the app level and steer away from repeats.
- *
- * Usage:
- *   node streaming-tsampler.mjs [model-path]          # Human-readable output
- *   node streaming-tsampler.mjs [model-path] --jsonl  # JSONL output for testing
- */
-
-import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext, Branch } from '../../lib/index.js';
-
-// Import tsampler from npm package
-import {
-  sampleWithStrategy,
-  // TokenHistoryTracker, // Disabled - matching baseline
-  Xoroshiro128Plus,
-  SamplerWorkspace,
-} from '@lloyal-labs/tsampler';
-
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
-const DEFAULT_MODEL = path.resolve(
-  __dirname,
-  '../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
-);
-
-// Parse args
-const args = process.argv.slice(2);
-const jsonlMode = args.includes('--jsonl');
-const modelPath = args.find(a => !a.startsWith('--')) || DEFAULT_MODEL;
-
-// Parse --max-tokens for CI (default 5000)
-const maxTokensArg = args.find(a => a.startsWith('--max-tokens='));
-const TARGET_TOKENS = maxTokensArg ? parseInt(maxTokensArg.split('=')[1], 10) : 5000;
-
-/** Emit output - JSONL or human-readable */
-function emit(event, data) {
-  if (jsonlMode) {
-    console.log(JSON.stringify({ event, ...data }));
-  }
-}
-
-/**
- * N-gram tracker for sequence-level repetition detection (threshold-based)
- *
- * Tracks N-grams and their followers. Only blocks when the SAME N-gram → follower
- * pattern is seen K times (threshold), indicating true looping behavior rather
- * than coincidental reuse.
- */
-class NgramTracker {
-  constructor(n = 4, threshold = 2) {
-    this.n = n;
-    this.threshold = threshold; // Block after seeing same pattern K times
-    this.ngrams = new Map(); // ngram key -> Map<follower, count>
-    this.recentTokens = [];
-  }
-
-  /**
-   * Record a token and update N-gram history
-   */
-  accept(token) {
-    this.recentTokens.push(token);
-
-    // Once we have enough tokens, record the N-gram and what followed
-    if (this.recentTokens.length > this.n) {
-      const ngramTokens = this.recentTokens.slice(-this.n - 1, -1);
-      const ngramKey = ngramTokens.join(',');
-
-      // Get or create follower counts for this N-gram
-      if (!this.ngrams.has(ngramKey)) {
-        this.ngrams.set(ngramKey, new Map());
-      }
-      const followers = this.ngrams.get(ngramKey);
-
-      // Increment count for this follower
-      const count = followers.get(token) || 0;
-      followers.set(token, count + 1);
-    }
-  }
-
-  /**
-   * Check if current context would repeat an N-gram above threshold
-   * @returns {number|null} Token to block, or null if below threshold
-   */
-  getBlockedToken() {
-    if (this.recentTokens.length < this.n) {
-      return null;
-    }
-
-    const currentNgram = this.recentTokens.slice(-this.n);
-    const ngramKey = currentNgram.join(',');
-
-    const followers = this.ngrams.get(ngramKey);
-    if (!followers) {
-      return null;
-    }
-
-    // Find follower that has hit threshold (true loop)
-    for (const [follower, count] of followers) {
-      if (count >= this.threshold) {
-        return follower;
-      }
-    }
-
-    return null;
-  }
-
-  /**
-   * Get stats for logging
-   */
-  stats() {
-    let totalPatterns = 0;
-    for (const followers of this.ngrams.values()) {
-      totalPatterns += followers.size;
-    }
-    return {
-      uniqueNgrams: this.ngrams.size,
-      totalPatterns,
-      totalTokens: this.recentTokens.length,
-    };
-  }
-}
-
-async function main() {
-  // BlinkKV parameters
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
-  const TAIL_SIZE = 256;
-  const NGRAM_SIZE = 6; // Track 6-grams for sequence detection
-  const BLOCK_THRESHOLD = 2; // Only block after seeing same pattern K times
-
-  if (!jsonlMode) {
-    console.log(`Loading model: ${modelPath}`);
-  }
-
-  emit('start', { model: path.basename(modelPath), nCtx, tailSize: TAIL_SIZE, targetTokens: TARGET_TOKENS, ngramSize: NGRAM_SIZE, blockThreshold: BLOCK_THRESHOLD });
-
-  const ctx = await createContext({
-    modelPath,
-    nCtx,
-  });
-
-  const prompt = `Write a comprehensive guide to machine learning, covering the following topics in extreme detail with examples, code snippets, and mathematical formulas:
-
-1. Linear Regression - derivation, implementation, regularization
-2. Logistic Regression - binary and multiclass
-3. Neural Networks - backpropagation, activation functions
-4. Convolutional Neural Networks - architectures, pooling, stride
-5. Recurrent Neural Networks - LSTM, GRU, attention
-6. Transformers - self-attention, positional encoding
-7. Optimization - SGD, Adam, learning rate schedules
-8. Regularization - dropout, batch normalization, weight decay
-
-Begin:
-
-# Comprehensive Machine Learning Guide
-
-## Chapter 1: Linear Regression
-
-`;
-  if (!jsonlMode) {
-    console.log(`\nPrompt: "${prompt.slice(0, 100)}..."`);
-  }
-
-  const promptTokens = await ctx.tokenize(prompt);
-
-  // Track all generated tokens
-  const allTokens = [...promptTokens];
-  const sinks = [...promptTokens]; // Sink the entire prompt
-
-  // tsampler setup
-  const prng = new Xoroshiro128Plus(42); // Fixed seed for reproducibility
-  // const tokenHistory = new TokenHistoryTracker(32); // Disabled - matching baseline
-  const workspace = new SamplerWorkspace(256);
-
-  // N-gram tracker for sequence-level deduplication
-  const ngramTracker = new NgramTracker(NGRAM_SIZE, BLOCK_THRESHOLD);
-
-  // Seed N-gram tracker with prompt tokens
-  for (const token of promptTokens) {
-    ngramTracker.accept(token);
-  }
-
-  if (!jsonlMode) {
-    console.log(`\nContext size: ${nCtx}`);
-    console.log(`Target tokens: ${TARGET_TOKENS}`);
-    console.log(`Sink tokens (prompt): ${sinks.length}`);
-    console.log(`Tail size: ${TAIL_SIZE}`);
-    console.log(`N-gram size: ${NGRAM_SIZE}, block threshold: ${BLOCK_THRESHOLD}`);
-    console.log(`\nGenerating with tsampler + N-gram deduplication (threshold-based)...\n`);
-    process.stdout.write(prompt);
-  }
-
-  // Branch used purely for KV management — sampling done externally via tsampler
-  let branch = Branch.create(ctx, 0, { temperature: 0 });
-  await branch.prefill(promptTokens);
-
-  // Manual PPL tracking (persists across branch reseeds)
-  let nllSum = 0, nllCount = 0;
-  let reseedCount = 0;
-  let blockedCount = 0;
-
-  for (let t = 0; t < TARGET_TOKENS; t++) {
-    // Get logits from branch snapshot
-    const originalLogits = branch.getLogits();
-    const logits = new Float32Array(originalLogits);
-
-    // N-gram deduplication: Check if we're about to repeat a sequence
-    const blockedToken = ngramTracker.getBlockedToken();
-    const wasBlocked = blockedToken !== null && blockedToken < logits.length;
-    if (wasBlocked) {
-      // Steer away from the repeat by setting logit to -Infinity
-      logits[blockedToken] = -Infinity;
-      blockedCount++;
-    }
-
-    // Sample with tsampler (TTA pattern)
-    // Match baseline params exactly: temp 0.8, topP 0.9, no topK, no penalties
-    const token = sampleWithStrategy(logits, {
-      params: {
-        temperature: 0.8,
-        topP: 0.9,
-      },
-      workspace,
-      prng,
-    });
-
-    // Check for EOS
-    if (ctx.isStopToken(token)) {
-      if (!jsonlMode) {
-        console.log('\n[EOS token reached]');
-      }
-      emit('eos', { tokenIndex: t });
-      break;
-    }
-
-    // Accept token into trackers
-    // tokenHistory.accept(token); // Disabled - matching baseline
-    ngramTracker.accept(token);
-
-    // Track surprisal from branch's logits snapshot (before N-gram steering)
-    const surprisal = branch.modelSurprisal(token, 'nats');
-    nllSum += Math.max(0, surprisal);
-    nllCount++;
-
-    // Output token
-    const text = ctx.tokenToText(token);
-    if (!jsonlMode) {
-      process.stdout.write(text);
-    }
-    emit('token', { index: t, token, text, surprisal, blocked: wasBlocked });
-
-    // Store and advance KV (no sampler accept — we're using tsampler externally)
-    allTokens.push(token);
-    await branch.commit(token);
-
-    // Cache full? Reseed at boundary
-    if (branch.position >= nCtx) {
-      const tail = allTokens.slice(-TAIL_SIZE);
-
-      // Destroy current branch, clear KV, create fresh branch with re-prefill
-      await branch.prune();
-      await ctx.kvCacheClear();
-      branch = Branch.create(ctx, 0, { temperature: 0 });
-      await branch.prefill([...sinks, ...tail]);
-
-      reseedCount++;
-
-      const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
-      const stats = ngramTracker.stats();
-
-      emit('reseed', { count: reseedCount, tokenIndex: t + 1, ppl, blockedCount, uniqueNgrams: stats.uniqueNgrams });
-
-      if (!jsonlMode) {
-        console.log(`\n  [Reseed ${reseedCount} at token ${t + 1}/${TARGET_TOKENS} | PPL: ${ppl.toFixed(2)} | Blocked: ${blockedCount} | Unique ${NGRAM_SIZE}-grams: ${stats.uniqueNgrams}]`);
-      }
-    }
-
-    // Progress every 1000 tokens
-    if ((t + 1) % 1000 === 0 && branch.position < nCtx && !jsonlMode) {
-      const stats = ngramTracker.stats();
-      console.log(`\n  [${t + 1}/${TARGET_TOKENS} | Blocked repeats: ${blockedCount} | Unique ${NGRAM_SIZE}-grams: ${stats.uniqueNgrams}]`);
-    }
-  }
-
-  const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
-  const finalStats = ngramTracker.stats();
-  await branch.prune();
-
-  const generatedTokens = allTokens.length - promptTokens.length;
-  emit('complete', {
-    generatedTokens,
-    reseeds: reseedCount,
-    finalPpl,
-    blockedCount,
-    uniqueNgrams: finalStats.uniqueNgrams,
-  });
-
-  if (!jsonlMode) {
-    console.log('\n\n' + '='.repeat(60));
-    console.log(`Generated: ${generatedTokens} tokens`);
-    console.log(`Reseeds: ${reseedCount}`);
-    console.log(`Final perplexity: ${finalPpl.toFixed(2)}`);
-    console.log(`Sequence repeats blocked: ${blockedCount}`);
-    console.log(`Unique ${NGRAM_SIZE}-grams tracked: ${finalStats.uniqueNgrams}`);
-    console.log('='.repeat(60));
-  }
-
-  ctx.dispose();
-}
-
-main().catch((err) => {
-  console.error('Error:', err.message);
-  console.error(err.stack);
-  process.exit(1);
-});
diff --git a/examples/streaming/streaming.mjs b/examples/streaming/streaming.mjs
deleted file mode 100644
index e877e64..0000000
--- a/examples/streaming/streaming.mjs
+++ /dev/null
@@ -1,185 +0,0 @@
-#!/usr/bin/env node
-/**
- * Infinite context generation with BlinkKV
- *
- * Usage:
- *   node streaming.mjs [model-path]          # Human-readable output
- *   node streaming.mjs [model-path] --jsonl  # JSONL output for testing
- *
- * This example demonstrates:
- * - Generating tokens beyond context window limit
- * - KV cache clear + re-prefill for cache-local position reindexing
- * - Per-token perplexity measurement across reseeds
- * - Branch API for generation (produce/commit loop)
- *
- * Parameters from BlinkKV paper: 2048 context, 4 sinks, 256 tail
- */
-
-import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext, Branch } from '../../lib/index.js';
-
-const __dirname = path.dirname(fileURLToPath(import.meta.url));
-const DEFAULT_MODEL = path.resolve(
-  __dirname,
-  '../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
-);
-
-// Parse args
-const args = process.argv.slice(2);
-const jsonlMode = args.includes('--jsonl');
-const modelPath = args.find(a => !a.startsWith('--')) || DEFAULT_MODEL;
-
-// Parse --max-tokens for CI (default 5000)
-const maxTokensArg = args.find(a => a.startsWith('--max-tokens='));
-const TARGET_TOKENS = maxTokensArg ? parseInt(maxTokensArg.split('=')[1], 10) : 5000;
-
-/** Emit output - JSONL or human-readable */
-function emit(event, data) {
-  if (jsonlMode) {
-    console.log(JSON.stringify({ event, ...data }));
-  }
-}
-
-async function main() {
-  // BlinkKV paper parameters: 2048 context, 4 sinks, 256 tail
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
-  const SINK_COUNT = 4;
-  const TAIL_SIZE = 256;
-
-  if (!jsonlMode) {
-    console.log(`Loading model: ${modelPath}`);
-  }
-
-  emit('start', { model: path.basename(modelPath), nCtx, sinkCount: SINK_COUNT, tailSize: TAIL_SIZE, targetTokens: TARGET_TOKENS });
-
-  const ctx = await createContext({
-    modelPath,
-    nCtx,
-  });
-
-  const prompt = `Write a comprehensive guide to machine learning, covering the following topics in extreme detail with examples, code snippets, and mathematical formulas:
-
-1. Linear Regression - derivation, implementation, regularization
-2. Logistic Regression - binary and multiclass
-3. Neural Networks - backpropagation, activation functions
-4. Convolutional Neural Networks - architectures, pooling, stride
-5. Recurrent Neural Networks - LSTM, GRU, attention
-6. Transformers - self-attention, positional encoding
-7. Optimization - SGD, Adam, learning rate schedules
-8. Regularization - dropout, batch normalization, weight decay
-
-Begin:
-
-# Comprehensive Machine Learning Guide
-
-## Chapter 1: Linear Regression
-
-`;
-  if (!jsonlMode) {
-    console.log(`\nPrompt: "${prompt.slice(0, 100)}..."`);
-  }
-
-  const promptTokens = await ctx.tokenize(prompt);
-
-  // Track all generated tokens (needed for reseeding)
-  const allTokens = [...promptTokens];
-  // Sink the entire prompt - it's the structural anchor
-  const sinks = [...promptTokens];
-
-  if (!jsonlMode) {
-    console.log(`\nContext size: ${nCtx}`);
-    console.log(`Target tokens: ${TARGET_TOKENS}`);
-    console.log(`Sink tokens (prompt): ${sinks.length}`);
-    console.log(`Tail size: ${TAIL_SIZE}`);
-    console.log(`Cache size after reseed: ${sinks.length + TAIL_SIZE}`);
-    console.log(`\nGenerating...\n`);
-    process.stdout.write(prompt);
-  }
-
-  const samplingParams = { temperature: 0.8, topP: 0.9 };
-  let branch = Branch.create(ctx, 0, samplingParams);
-  await branch.prefill(promptTokens);
-
-  // Manual PPL tracking (persists across branch reseeds)
-  let nllSum = 0, nllCount = 0;
-  let reseedCount = 0;
-
-  for (let t = 0; t < TARGET_TOKENS; t++) {
-    // NOTE: Token-level repeat penalties are NOT used for long-form generation.
-    // llama.cpp's penalty system penalizes individual tokens (not sequences),
-    // which degrades prose quality over 100+ tokens as common words accumulate
-    // in the penalty buffer. For sequence-level deduplication, use N-gram
-    // tracking with logit steering (TTA pattern) instead.
-    const { token, isStop } = await branch.produce();
-    if (isStop) {
-      if (!jsonlMode) {
-        console.log('\n[EOS token reached]');
-      }
-      emit('eos', { tokenIndex: t });
-      break;
-    }
-
-    // Track surprisal from the logits used by produce()
-    const surprisal = branch.modelSurprisal(token, 'nats');
-    nllSum += Math.max(0, surprisal);
-    nllCount++;
-
-    // Output token
-    const text = ctx.tokenToText(token);
-    if (!jsonlMode) {
-      process.stdout.write(text);
-    }
-    emit('token', { index: t, token, text, surprisal });
-
-    // Store token and commit (decode + capture new logits)
-    allTokens.push(token);
-    await branch.commit(token);
-
-    // Cache full? Reseed at boundary
-    if (branch.position >= nCtx) {
-      const tail = allTokens.slice(-TAIL_SIZE);
-
-      // Destroy current branch, clear KV, create fresh branch with re-prefill
-      await branch.prune();
-      await ctx.kvCacheClear();
-      branch = Branch.create(ctx, 0, samplingParams);
-      await branch.prefill([...sinks, ...tail]);
-
-      reseedCount++;
-
-      const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
-      emit('reseed', { count: reseedCount, tokenIndex: t + 1, ppl });
-
-      if (!jsonlMode) {
-        console.log(`\n  [Reseed ${reseedCount} at token ${t + 1}/${TARGET_TOKENS} | PPL: ${ppl.toFixed(2)}]`);
-      }
-    }
-
-    // Progress indicator every 1000 tokens
-    if ((t + 1) % 1000 === 0 && reseedCount === 0 && !jsonlMode) {
-      console.log(`\n  [${t + 1}/${TARGET_TOKENS} tokens]`);
-    }
-  }
-
-  const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
-  await branch.prune();
-
-  const generatedTokens = allTokens.length - promptTokens.length;
-  emit('complete', { generatedTokens, reseeds: reseedCount, finalPpl });
-
-  if (!jsonlMode) {
-    console.log('\n\n' + '='.repeat(50));
-    console.log(`Generated: ${generatedTokens} tokens`);
-    console.log(`Reseeds: ${reseedCount}`);
-    console.log(`Final perplexity: ${finalPpl.toFixed(2)}`);
-    console.log('='.repeat(50));
-  }
-
-  ctx.dispose();
-}
-
-main().catch((err) => {
-  console.error('Error:', err.message);
-  process.exit(1);
-});
diff --git a/lib/Branch.js b/lib/Branch.js
deleted file mode 100644
index b7ee396..0000000
--- a/lib/Branch.js
+++ /dev/null
@@ -1,471 +0,0 @@
-/**
- * Branch - Forkable inference handle for covalent generation
- *
- * A Branch owns everything needed for independent generation: a KV cache
- * sequence, sampler chain, logits snapshot, and perplexity tracker.
- *
- * Forking is cheap — the KV prefix is shared in memory (metadata-only operation under unified KV —
- * no KV tensor buffers are copied), so sibling branches read from the same physical KV entries.
- * Only tokens decoded after the fork point are exclusive to each branch.
- * This is the covalent property: branches share a bond (common prefix)
- * while diverging independently.
- *
- * Branches form trees, not just flat lists. Fork from root for best-of-N,
- * fork from children for tree search/beam search, fork from a draft for speculative
- * decoding.
- *
- * The produce/commit protocol separates sampling from state advancement:
- * produce() samples without writing to KV, letting you inspect the result
- * before deciding to commit(). This two-phase split is what makes speculative
- * verification and tree search natural.
- *
- * @example Best-of-N with perplexity selection
- * ```js
- * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 });
- * await root.prefill(tokens);
- *
- * const results = [];
- * for (let i = 0; i < 5; i++) {
- *   const branch = await root.fork();
- *   branch.reseedSampler(1000 + i);
- *   const tokens = [];
- *   for await (const { token } of branch) tokens.push(token);
- *   results.push({ branch, tokens, ppl: branch.perplexity });
- * }
- *
- * const best = results.reduce((a, b) => a.ppl < b.ppl ? a : b);
- * for (const r of results) { if (r !== best) await r.branch.prune(); }
- * ```
- */
-
-class Branch {
-  /**
-   * @param {SessionContext} ctx
-   * @param {number} handle
-   */
-  constructor(ctx, handle) {
-    this._ctx = ctx;
-    this._handle = handle;
-    this._disposed = false;
-  }
-
-  /**
-   * Create a root branch at the given position
-   *
-   * The branch takes ownership of the sequence and creates its own sampler
-   * chain from the provided params. Call prefill() to decode prompt tokens
-   * and capture the logit distribution before forking.
-   *
-   * @param {SessionContext} ctx - SessionContext to create branch on
-   * @param {number} position - Starting position (typically prompt token count)
-   * @param {SamplingParams} [params] - Sampling parameters (temperature, topP, etc.)
-   * @param {number} [nBatch] - Per-branch batch size override (defaults to context nBatch).
-   *   Controls chunk size for prefill(). Has no effect on
-   *   single-token commit() which uses a zero-allocation fast path. Useful for tuning
-   *   memory/throughput tradeoff on bulk token decode — e.g. smaller nBatch for cheap
-   *   exploration branches, larger for the trunk.
-   * @param {string} [grammar] - GBNF grammar string for constrained generation.
-   *   When provided, sample() returns only grammar-valid tokens. The grammar state
-   *   is cloned on fork(), so sibling branches can diverge independently.
-   * @returns {Branch} New Branch instance
-   */
-  static create(ctx, position, params, nBatch, grammar) {
-    const handle = ctx._branchCreate(position, params, nBatch, grammar);
-    return new Branch(ctx, handle);
-  }
-
-  /**
-   * Fork this branch to a new sequence
-   *
-   * The child shares the parent's KV prefix in memory (metadata-only under unified KV, no KV buffer copy).
-   * Logits, sampler state, and perplexity tracker are cloned so the child
-   * can diverge independently. Fork from any branch — root or intermediate —
-   * to build arbitrarily deep trees.
-   *
-   * Call reseedSampler() on each child for stochastic diversity.
-   *
-   * @returns {Promise<Branch>} New forked Branch
-   */
-  async fork() {
-    this._ensureNotDisposed();
-    const newHandle = this._ctx._branchFork(this._handle);
-    return new Branch(this._ctx, newHandle);
-  }
-
-  /**
-   * Get a copy of this branch's captured logits snapshot
-   *
-   * Returns n_vocab floats — the raw logit distribution from the last
-   * prefill() or commit() call. Use for distributional analysis
-   * (KL divergence, entropy, top-k overlap) without crossing the
-   * sampling chain.
-   *
-   * @returns {Float32Array} Copy of the logits snapshot (n_vocab elements)
-   * @throws {Error} If no logits have been captured yet
-   */
-  getLogits() {
-    this._ensureNotDisposed();
-    return this._ctx._branchGetLogits(this._handle);
-  }
-
-  /**
-   * Bulk-decode tokens into the branch's KV cache and capture logits
-   *
-   * Feeds an array of tokens through the model. tokens.length is the total
-   * count to process; the branch's nBatch (set at Branch.create) controls
-   * how many are sent per llama_decode call. For example, 500 tokens with
-   * nBatch=64 makes 8 llama_decode calls (7x64 + 1x52). With nBatch=512
-   * it makes 1.
-   *
-   * Advances position by tokens.length and stores the final logits into
-   * the branch's internal snapshot. The next produce()/sample() call reads
-   * from that snapshot — logits never cross the JS boundary.
-   *
-   * Does NOT accept tokens into the sampler's repeat-penalty window — use
-   * this for external tokens (user input between turns), not model-generated
-   * tokens. For model output, use commit() which does accept + decode.
-   *
-   * The primary way to feed tokens into a branch's KV cache.
-   *
-   * @param {number[]} tokens - Token IDs to decode
-   * @returns {Promise<void>}
-   */
-  async prefill(tokens) {
-    this._ensureNotDisposed();
-    await this._ctx._branchPrefill(this._handle, tokens);
-  }
-
-  /**
-   * Sample next token from branch's logits snapshot
-   *
-   * Applies the branch's full sampler chain (top-k, top-p, temperature,
-   * repeat/presence penalties) to the captured logits.
-   *
-   * @returns {number} Sampled token ID
-   */
-  sample() {
-    this._ensureNotDisposed();
-    return this._ctx._branchSample(this._handle);
-  }
-
-  /**
-   * Record token in the sampler's repeat/presence penalty window
-   *
-   * @param {number} token - Token to accept
-   */
-  accept(token) {
-    this._ensureNotDisposed();
-    this._ctx._branchAccept(this._handle, token);
-  }
-
-  /**
-   * Discard this branch entirely — remove its KV entries and free the handle
-   *
-   * Use for losers: branches whose generation you want to erase completely.
-   * Only removes KV entries divergent from the shared prefix; sibling
-   * branches are unaffected.
-   *
-   * @returns {Promise<void>}
-   */
-  async prune() {
-    if (this._disposed) return;
-    this._ctx._branchPrune(this._handle);
-    this._disposed = true;
-  }
-
-  /**
-   * Discard this branch and all its descendants — CASCADE delete
-   *
-   * Iterative post-order traversal: prunes children first, then this branch.
-   * Use when you want to tear down an entire subtree (e.g. abandoned search path).
-   *
-   * @returns {Promise<void>}
-   */
-  async pruneSubtree() {
-    if (this._disposed) return;
-    this._ctx._branchPruneSubtree(this._handle);
-    this._disposed = true;
-  }
-
-  /**
-   * Reseed the sampler's PRNG for diversity after fork()
-   *
-   * CRITICAL for parallel generation: Without reseeding, all forked branches
-   * produce identical outputs because they share the same PRNG state.
-   *
-   * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged.
-   *
-   * @param {number} seed - New seed for the PRNG
-   *
-   * @example
-   * ```js
-   * const root = Branch.create(ctx, pos, { temperature: 0.9 });
-   * await root.prefill(promptTokens);
-   *
-   * // Fork and reseed for diversity
-   * const branches = [];
-   * for (let i = 0; i < 5; i++) {
-   *   const branch = await root.fork();
-   *   branch.reseedSampler(1000 + i);  // Each branch gets unique seed
-   *   branches.push(branch);
-   * }
-   * ```
-   */
-  reseedSampler(seed) {
-    this._ensureNotDisposed();
-    this._ctx._branchSamplerChainReseed(this._handle, seed);
-  }
-
-  /**
-   * Apply dynamic logit adjustments for this branch only
-   *
-   * Unlike logit_bias (which is cloned on fork), steer biases are NOT inherited
-   * by child branches. Each branch manages its own steer state independently.
-   *
-   * Use cases:
-   * - tsampler: Block tokens that would create repeated N-grams (per-path history)
-   * - Tree search: Block already-explored actions at this node (not inherited by children)
-   *
-   * Applied during sample() in order: Grammar -> Logit Bias -> Steer -> Sampler Chain
-   *
-   * @param {Array<{token: number, bias: number}>} biases - Token adjustments.
-   *   Use -Infinity to block a token, positive values to boost.
-   *
-   * @example Block tokens for N-gram deduplication
-   * ```js
-   * // Client computes blocked tokens based on generated text
-   * const blocked = computeNgramBlocks(generatedText);
-   * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
-   *
-   * const { token } = await branch.produce();  // Blocked tokens won't be sampled
-   * await branch.commit(token);
-   *
-   * branch.clearSteer();  // Reset for next iteration
-   * ```
-   */
-  steer(biases) {
-    this._ensureNotDisposed();
-    this._ctx._branchSteer(this._handle, biases);
-  }
-
-  /**
-   * Clear all steer biases from this branch
-   *
-   * Removes any dynamic logit adjustments set by steer().
-   */
-  clearSteer() {
-    this._ensureNotDisposed();
-    this._ctx._branchClearSteer(this._handle);
-  }
-
-  /**
-   * Replace the sampler chain with new parameters (memoized)
-   *
-   * If the new params match the current chain's params, this is a no-op.
-   * Otherwise the old chain is freed and a new one is created.
-   *
-   * @param {SamplingParams} params - New sampling parameters
-   */
-  setSamplerParams(params) {
-    this._ensureNotDisposed();
-    this._ctx._branchSetSamplerParams(this._handle, params);
-  }
-
-  /**
-   * Replace or remove the grammar constraint
-   *
-   * Pass a GBNF grammar string to constrain generation, or empty string / null
-   * to remove the constraint. The grammar state is cloned on fork().
-   *
-   * @param {string} [grammarStr] - GBNF grammar string, or empty/null to remove
-   */
-  setGrammar(grammarStr) {
-    this._ensureNotDisposed();
-    this._ctx._branchSetGrammar(this._handle, grammarStr || '');
-  }
-
-  /**
-   * Sample the next token without advancing state (async)
-   *
-   * No KV write, no position update. Inspect the result before deciding
-   * to commit() — this separation is what enables speculative verification
-   * and conditional branching.
-   *
-   * Async contract: local branches resolve immediately; cloud branches
-   * may perform an HTTP round-trip. Use produceSync() when you know the
-   * branch is local and want zero-overhead sampling.
-   *
-   * @returns {Promise<{ token: number, text: string, isStop: boolean }>}
-   */
-  async produce() {
-    return this.produceSync();
-  }
-
-  /**
-   * Sample the next token without advancing state (sync)
-   *
-   * Same as produce() but synchronous. Use when you know the branch is
-   * local and want to avoid the microtick overhead of a promise.
-   *
-   * @returns {{ token: number, text: string, isStop: boolean }}
-   */
-  produceSync() {
-    this._ensureNotDisposed();
-    const token = this.sample();
-    return {
-      token,
-      text: this._ctx.tokenToText(token),
-      isStop: this._ctx.isStopToken(token),
-    };
-  }
-
-  /**
-   * Accept and decode — update branch state, then write token to KV
-   *
-   * Accepts the token into the sampler penalty window (for correct PPL
-   * measurement), then decodes (writing to KV cache) and captures the
-   * resulting logits for the next produce() call. Accept-first ordering
-   * with rollback: if decode throws, sampler/grammar/metrics are restored
-   * from clones taken before the accept.
-   *
-   * @param {number} token - Token to commit (from produce())
-   * @returns {Promise<void>}
-   */
-  async commit(token) {
-    this._ensureNotDisposed();
-    await this._ctx._storeCommit([this._handle], [token]);
-  }
-
-  // ===== METRICS =====
-
-  /**
-   * Compute entropy of the branch's logits distribution
-   *
-   * @param {'nats'|'bits'} [base='nats']
-   * @returns {number}
-   */
-  modelEntropy(base = 'nats') {
-    this._ensureNotDisposed();
-    return this._ctx._branchModelEntropy(this._handle, base);
-  }
-
-  /**
-   * Compute surprisal for a specific token from the branch's logits
-   *
-   * @param {number} token
-   * @param {'nats'|'bits'} [base='nats']
-   * @returns {number}
-   */
-  modelSurprisal(token, base = 'nats') {
-    this._ensureNotDisposed();
-    return this._ctx._branchModelSurprisal(this._handle, token, base);
-  }
-
-  /**
-   * Sampling-level perplexity (from filtered distribution)
-   *
-   * @returns {number}
-   */
-  get samplingPerplexity() {
-    this._ensureNotDisposed();
-    return this._ctx._branchGetSamplingPerplexity(this._handle);
-  }
-
-  /**
-   * Set static logit biases on this branch (cloned on fork)
-   *
-   * @param {Array<{token: number, bias: number}>} biases
-   */
-  setLogitBias(biases) {
-    this._ensureNotDisposed();
-    this._ctx._branchSetLogitBias(this._handle, biases);
-  }
-
-  /**
-   * Clear all static logit biases from this branch
-   */
-  clearLogitBias() {
-    this._ensureNotDisposed();
-    this._ctx._branchClearLogitBias(this._handle);
-  }
-
-  // ===== ACCESSORS =====
-
-  /** @returns {number} Branch's current position (number of tokens decoded) */
-  get position() {
-    this._ensureNotDisposed();
-    return this._ctx._branchGetPosition(this._handle);
-  }
-
-  /** @returns {number} Branch's perplexity (exp of mean surprisal) */
-  get perplexity() {
-    this._ensureNotDisposed();
-    return this._ctx._branchGetPerplexity(this._handle);
-  }
-
-  /** @returns {number} Internal handle (for debugging) */
-  get handle() {
-    return this._handle;
-  }
-
-  /** @returns {boolean} Whether this branch has been disposed */
-  get disposed() {
-    return this._disposed;
-  }
-
-  /** @returns {number|null} Parent branch handle, or null if root */
-  get parent() {
-    this._ensureNotDisposed();
-    const h = this._ctx._branchParent(this._handle);
-    return h === 0 ? null : h;
-  }
-
-  /** @returns {number[]} Child branch handles */
-  get children() {
-    this._ensureNotDisposed();
-    return this._ctx._branchChildren(this._handle);
-  }
-
-  /** @returns {boolean} True if this branch has no children */
-  get isLeaf() {
-    this._ensureNotDisposed();
-    return this._ctx._branchIsLeaf(this._handle);
-  }
-
-  /** @returns {boolean} True if this branch holds a KV lease */
-  get isActive() {
-    this._ensureNotDisposed();
-    return this._ctx._branchIsActive(this._handle);
-  }
-
-  // ===== ASYNC ITERATION =====
-
-  /**
-   * Async iterator — generate tokens until EOG
-   *
-   * Commit-before-yield: every yielded token is already written to KV and
-   * accepted into the sampler. Breaking out of the loop is clean — no
-   * orphaned uncommitted tokens, perplexity reflects all yielded tokens.
-   *
-   * For inspect-before-commit (speculative decoding, tree search), use
-   * the produce()/commit() protocol directly.
-   */
-  async *[Symbol.asyncIterator]() {
-    while (!this._disposed) {
-      const { token, text, isStop } = await this.produce();
-      if (isStop) return;
-      await this.commit(token);
-      yield { token, text };
-    }
-  }
-
-  // ===== INTERNAL =====
-
-  _ensureNotDisposed() {
-    if (this._disposed) {
-      throw new Error('Branch has been disposed');
-    }
-  }
-}
-
-module.exports = { Branch };
diff --git a/lib/BranchStore.js b/lib/BranchStore.js
deleted file mode 100644
index 8b14030..0000000
--- a/lib/BranchStore.js
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * BranchStore - Batched multi-branch decode operations
- *
- * See index.d.ts for full API documentation.
- */
-class BranchStore {
-  constructor(ctx) {
-    this._ctx = ctx;
-  }
-
-  // entries: [branch, token][] — binding is structural, not positional
-  async commit(entries) {
-    const handles = [], tokens = [];
-    for (const [branch, token] of entries) {
-      if (branch.disposed) throw new Error('BranchStore.commit: branch is disposed');
-      handles.push(branch.handle);
-      tokens.push(token);
-    }
-    await this._ctx._storeCommit(handles, tokens);
-  }
-
-  // entries: [branch, tokens[]][] — binding is structural, not positional
-  async prefill(entries) {
-    const handles = [], tokenArrays = [];
-    for (const [branch, tokens] of entries) {
-      if (branch.disposed) throw new Error('BranchStore.prefill: branch is disposed');
-      handles.push(branch.handle);
-      tokenArrays.push(tokens);
-    }
-    await this._ctx._storePrefill(handles, tokenArrays);
-  }
-
-  async retainOnly(winner) {
-    if (winner.disposed) throw new Error('BranchStore.retainOnly: winner is disposed');
-    this._ctx._storeRetainOnly(winner.handle);
-  }
-
-  get available() {
-    return this._ctx._storeAvailable();
-  }
-}
-
-module.exports = { BranchStore };
diff --git a/lib/Session.js b/lib/Session.js
deleted file mode 100644
index 64b019a..0000000
--- a/lib/Session.js
+++ /dev/null
@@ -1,93 +0,0 @@
-/**
- * Session - Trunk lifecycle + conversation delta helpers
- *
- * Owns the current "trunk" branch — the single conversation thread that
- * persists across agent swarms and follow-up turns. Provides promote()
- * to crown a winner (retainOnly + reassign), and delta helpers that
- * centralize the sep + formatChat + tokenize + prefill pattern.
- *
- * Session does NOT own the SessionContext or BranchStore — the consumer
- * creates those and passes them in. dispose() prunes trunk only.
- */
-class Session {
-  /**
-   * @param {{ ctx: SessionContext, store: BranchStore }} opts
-   */
-  constructor({ ctx, store }) {
-    this._ctx = ctx;
-    this._store = store;
-    this._trunk = null;
-  }
-
-  /** @returns {Branch|null} Current trunk branch */
-  get trunk() {
-    return this._trunk;
-  }
-
-  /** @param {Branch} branch - Assign initial trunk (no promote) */
-  set trunk(branch) {
-    this._trunk = branch;
-  }
-
-  /**
-   * Promote a winner to trunk — retainOnly + reassign
-   *
-   * Safe even if winner is the only branch (resets topology, no-op on KV).
-   * @param {Branch} winner
-   */
-  async promote(winner) {
-    await this._store.retainOnly(winner);
-    this._trunk = winner;
-  }
-
-  /**
-   * Dispose trunk only — consumer owns ctx and other resources
-   */
-  async dispose() {
-    if (this._trunk && !this._trunk.disposed) {
-      await this._trunk.prune();
-    }
-    this._trunk = null;
-  }
-
-  /**
-   * Prefill a user turn into trunk
-   *
-   * Centralizes: sep + formatChat([system:'', user:content]) + tokenize(false) + prefill
-   *
-   * @param {string} content - User message content
-   * @param {{ tools?: string }} [opts]
-   */
-  async prefillUser(content, opts = {}) {
-    const sep = this._ctx.getTurnSeparator();
-    const fmtOpts = opts.tools ? { tools: opts.tools } : {};
-    const { prompt } = await this._ctx.formatChat(
-      JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]),
-      fmtOpts
-    );
-    const delta = await this._ctx.tokenize(prompt, false);
-    await this._trunk.prefill([...sep, ...delta]);
-  }
-
-  /**
-   * Prefill a tool result turn into trunk
-   *
-   * Centralizes: sep + formatChat([system:'', tool:result]) + tokenize(false) + prefill
-   *
-   * @param {string} resultStr - JSON-stringified tool result
-   * @param {string} callId - Tool call ID
-   */
-  async prefillToolResult(resultStr, callId) {
-    const sep = this._ctx.getTurnSeparator();
-    const { prompt } = await this._ctx.formatChat(
-      JSON.stringify([
-        { role: 'system', content: '' },
-        { role: 'tool', content: resultStr, tool_call_id: callId },
-      ])
-    );
-    const delta = await this._ctx.tokenize(prompt, false);
-    await this._trunk.prefill([...sep, ...delta]);
-  }
-}
-
-module.exports = { Session };
diff --git a/lib/index.js b/lib/index.js
deleted file mode 100644
index 144bbf0..0000000
--- a/lib/index.js
+++ /dev/null
@@ -1,276 +0,0 @@
-/**
- * liblloyal-node - Thin N-API wrapper over liblloyal
- *
- * Exposes raw llama.cpp inference primitives for Node.js.
- * Primary use case: Integration testing for tsampler.
- *
- * @example
- * ```js
- * const { createContext } = require('@lloyal-labs/lloyal.node');
- *
- * const ctx = await createContext({
- *   modelPath: './model.gguf',
- *   nCtx: 2048,
- *   nThreads: 4
- * });
- *
- * // Tokenize
- * const tokens = await ctx.tokenize("Hello world");
- *
- * // Generate via Branch API
- * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
- * await branch.prefill(tokens);
- * for await (const { text } of branch) {
- *   process.stdout.write(text);
- * }
- * await branch.prune();
- *
- * // Cleanup
- * ctx.dispose();
- * ```
- *
- * @example GPU variant selection
- * ```js
- * // Option 1: Environment variable (affects all contexts)
- * // Set LLOYAL_GPU=cuda before running
- *
- * // Option 2: Per-context selection (recommended)
- * const ctx = await createContext(
- *   { modelPath: './model.gguf', nCtx: 4096 },
- *   { gpuVariant: 'cuda' }  // Falls back to CPU if CUDA unavailable
- * );
- * ```
- */
-
-/**
- * Platform package naming: @lloyal-labs/lloyal.node-{platform}-{arch}[-{gpu}]
- * @param {string} [variant] - GPU variant: 'cuda', 'vulkan', or undefined for CPU
- * @returns {string} Platform package name
- */
-const getPlatformPackageName = (variant) => {
-  const platform = process.platform;
-  const arch = process.arch;
-  // cpu/metal/default = no suffix, cuda/vulkan = suffix
-  const noSuffix = !variant || variant === 'default' || variant === 'cpu' || variant === 'metal';
-  const suffix = noSuffix ? '' : `-${variant}`;
-  return `@lloyal-labs/lloyal.node-${platform}-${arch}${suffix}`;
-};
-
-/**
- * Try to load a platform package, return null on failure.
- * Failures include: package not installed, missing GPU runtime libs (dlopen fails),
- * or module doesn't export expected interface.
- * @param {string} packageName - Package name to load
- * @param {boolean} [verbose=false] - Log failure reasons
- * @returns {object|null} The native binary module or null
- */
-const tryLoadPackage = (packageName, verbose = false) => {
-  try {
-    const mod = require(packageName);
-    // Validate it's actually a native module with expected exports
-    if (mod && typeof mod.createContext === 'function') {
-      return mod;
-    }
-    if (verbose) {
-      console.warn(`[lloyal.node] ${packageName} loaded but missing createContext export`);
-    }
-    return null;
-  } catch (e) {
-    if (verbose) {
-      console.warn(`[lloyal.node] Failed to load ${packageName}: ${e.message}`);
-    }
-    return null;
-  }
-};
-
-/**
- * Load the native binary with automatic fallback.
- *
- * **Loading Priority:**
- *
- * When `LLOYAL_LOCAL=1`:
- * - Uses local build exclusively (`build/Release/lloyal.node`)
- * - Throws error if not found (no fallback)
- *
- * Otherwise:
- * 1. Requested GPU variant package (if `variant` param or `LLOYAL_GPU` env var specified)
- * 2. Local build (`build/Release/lloyal.node`) — always fresher during development
- * 3. Default platform package (`@lloyal-labs/lloyal.node-{platform}-{arch}`)
- *
- * **Environment Variables:**
- * - `LLOYAL_LOCAL=1` — Use local build exclusively (`build/Release/lloyal.node`).
- *   Throws an error if local build not found. Use during development to test
- *   local changes without uninstalling npm packages.
- * - `LLOYAL_GPU` — GPU variant to load: `'cuda'` or `'vulkan'`. Equivalent to
- *   passing the `variant` parameter.
- * - `LLOYAL_NO_FALLBACK=1` — Disable fallback when GPU variant fails. Throws an
- *   error instead of silently falling back to CPU. Use in CI to ensure the
- *   specific GPU package loads correctly and catch missing runtime libraries.
- *
- * @param {string} [variant] - GPU variant: `'cuda'`, `'vulkan'`, or `undefined` for CPU.
- *   Overrides `LLOYAL_GPU` env var if specified.
- * @returns {object} The native binary module with `createContext` and `SessionContext`
- * @throws {Error} If no binary can be loaded for the current platform
- *
- * @example Development testing with local build
- * ```bash
- * # Build locally, then test without uninstalling npm packages
- * npm run build
- * LLOYAL_LOCAL=1 node my-script.js
- * ```
- *
- * @example GPU variant selection
- * ```bash
- * # Via environment variable
- * LLOYAL_GPU=cuda node my-script.js
- *
- * # Or programmatically
- * const binary = loadBinary('cuda');
- * ```
- *
- * @example CI: Ensure GPU package loads (no silent fallback)
- * ```bash
- * LLOYAL_GPU=cuda LLOYAL_NO_FALLBACK=1 npm test
- * ```
- */
-const loadBinary = (variant) => {
-  // Use env var if no variant specified
-  variant = variant ?? process.env.LLOYAL_GPU;
-  // LLOYAL_NO_FALLBACK=1 disables fallback (for CI testing specific packages)
-  const noFallback = process.env.LLOYAL_NO_FALLBACK === '1';
-  // LLOYAL_LOCAL=1 forces local build first (development)
-  const useLocal = process.env.LLOYAL_LOCAL === '1';
-
-  // 0. Use local build if explicitly requested (no fallback)
-  if (useLocal) {
-    try {
-      return require('../build/Release/lloyal.node');
-    } catch (e) {
-      throw new Error(
-        '[lloyal.node] LLOYAL_LOCAL=1 but local build not found. ' +
-        'Run `npm run build` first.'
-      );
-    }
-  }
-
-  // 1. Try requested variant (if specified)
-  if (variant && variant !== 'default') {
-    const pkgName = getPlatformPackageName(variant);
-    const binary = tryLoadPackage(pkgName, true); // verbose=true to see errors
-    if (binary) return binary;
-
-    if (noFallback) {
-      throw new Error(
-        `[lloyal.node] GPU variant "${variant}" failed to load. ` +
-        `Package: ${pkgName}. Check that runtime libraries are available.`
-      );
-    }
-    console.warn(`[lloyal.node] GPU variant "${variant}" unavailable, falling back to CPU`);
-  }
-
-  // 2. Try local build (always fresher than installed packages during development)
-  try {
-    return require('../build/Release/lloyal.node');
-  } catch (e) {
-    // ignore — no local build
-  }
-
-  // 3. Try default platform package (CPU)
-  const defaultPkg = getPlatformPackageName();
-  const binary = tryLoadPackage(defaultPkg, true); // verbose=true
-  if (binary) return binary;
-
-  throw new Error(
-    `No lloyal.node binary found for ${process.platform}-${process.arch}. ` +
-    `Tried: ${variant ? getPlatformPackageName(variant) + ', ' : ''}${defaultPkg}`
-  );
-};
-
-// Default binary (loaded lazily on first use)
-let _binary = null;
-const getBinary = () => {
-  if (!_binary) {
-    _binary = loadBinary(process.env.LLOYAL_GPU);
-  }
-  return _binary;
-};
-
-const { Branch } = require('./Branch');
-const { BranchStore } = require('./BranchStore');
-const { Session } = require('./Session');
-const { forkAgent, runAgents } = require('./Agent');
-
-module.exports = {
-  /**
-   * Branch class for parallel generation
-   * @see Branch.create()
-   */
-  Branch,
-  /**
-   * BranchStore class for batched multi-branch decode
-   * @see BranchStore
-   */
-  BranchStore,
-  /**
-   * Session class for trunk lifecycle + conversation deltas
-   * @see Session
-   */
-  Session,
-  /**
-   * Fork an agent from a parent branch with task context
-   * @see forkAgent
-   */
-  forkAgent,
-  /**
-   * Run agents in a batched three-phase tick loop
-   * @see runAgents
-   */
-  runAgents,
-  /**
-   * Create a new inference context
-   *
-   * @param {ContextOptions} options - Context configuration
-   * @param {LoadOptions} [loadOptions] - Binary loading options
-   * @returns {Promise<SessionContext>} The inference context
-   *
-   * @example
-   * ```js
-   * // Basic usage
-   * const ctx = await createContext({
-   *   modelPath: './model.gguf',
-   *   nCtx: 2048,
-   *   nThreads: 4
-   * });
-   *
-   * // With GPU variant
-   * const ctx = await createContext(
-   *   { modelPath: './model.gguf' },
-   *   { gpuVariant: 'cuda' }
-   * );
-   * ```
-   */
-  createContext: async (options, loadOptions) => {
-    const variant = loadOptions?.gpuVariant || process.env.LLOYAL_GPU;
-    const binary = variant ? loadBinary(variant) : getBinary();
-    return binary.createContext(options);
-  },
-
-  /**
-   * Load binary for a specific GPU variant.
-   * Useful for checking variant availability before creating context.
-   *
-   * @param {string} [variant] - 'cuda', 'vulkan', or undefined for CPU
-   * @returns {object} Native binary module
-   * @throws {Error} If no binary available for platform
-   *
-   * @example
-   * ```js
-   * // Load default (CPU) binary
-   * const binary = loadBinary();
-   *
-   * // Load CUDA binary (falls back to CPU if unavailable)
-   * const binary = loadBinary('cuda');
-   * ```
-   */
-  loadBinary,
-};
diff --git a/package-lock.json b/package-lock.json
index 09cdb1a..94ddd88 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -13,10 +13,13 @@
         "node-addon-api": "^8.5.0"
       },
       "devDependencies": {
+        "@types/node": "^25.3.0",
         "cmake-js": "^8.0.0",
         "glob": "^11.0.0",
+        "tsx": "^4.21.0",
         "typedoc": "^0.28.16",
-        "typedoc-rhineai-theme": "^1.2.0"
+        "typedoc-rhineai-theme": "^1.2.0",
+        "typescript": "^5.9.3"
       },
       "engines": {
         "node": ">=22.0.0"
@@ -37,6 +40,448 @@
         "@lloyal-labs/lloyal.node-win32-x64-vulkan": "1.6.0"
       }
     },
+    "node_modules/@esbuild/aix-ppc64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.3.tgz",
+      "integrity": "sha512-9fJMTNFTWZMh5qwrBItuziu834eOCUcEqymSH7pY+zoMVEZg3gcPuBNxH1EvfVYe9h0x/Ptw8KBzv7qxb7l8dg==",
+      "cpu": [
+        "ppc64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "aix"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/android-arm": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.3.tgz",
+      "integrity": "sha512-i5D1hPY7GIQmXlXhs2w8AWHhenb00+GxjxRncS2ZM7YNVGNfaMxgzSGuO8o8SJzRc/oZwU2bcScvVERk03QhzA==",
+      "cpu": [
+        "arm"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/android-arm64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.3.tgz",
+      "integrity": "sha512-YdghPYUmj/FX2SYKJ0OZxf+iaKgMsKHVPF1MAq/P8WirnSpCStzKJFjOjzsW0QQ7oIAiccHdcqjbHmJxRb/dmg==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/android-x64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.3.tgz",
+      "integrity": "sha512-IN/0BNTkHtk8lkOM8JWAYFg4ORxBkZQf9zXiEOfERX/CzxW3Vg1ewAhU7QSWQpVIzTW+b8Xy+lGzdYXV6UZObQ==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "android"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/darwin-arm64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.3.tgz",
+      "integrity": "sha512-Re491k7ByTVRy0t3EKWajdLIr0gz2kKKfzafkth4Q8A5n1xTHrkqZgLLjFEHVD+AXdUGgQMq+Godfq45mGpCKg==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/darwin-x64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.3.tgz",
+      "integrity": "sha512-vHk/hA7/1AckjGzRqi6wbo+jaShzRowYip6rt6q7VYEDX4LEy1pZfDpdxCBnGtl+A5zq8iXDcyuxwtv3hNtHFg==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/freebsd-arm64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.3.tgz",
+      "integrity": "sha512-ipTYM2fjt3kQAYOvo6vcxJx3nBYAzPjgTCk7QEgZG8AUO3ydUhvelmhrbOheMnGOlaSFUoHXB6un+A7q4ygY9w==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/freebsd-x64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.3.tgz",
+      "integrity": "sha512-dDk0X87T7mI6U3K9VjWtHOXqwAMJBNN2r7bejDsc+j03SEjtD9HrOl8gVFByeM0aJksoUuUVU9TBaZa2rgj0oA==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "freebsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-arm": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.3.tgz",
+      "integrity": "sha512-s6nPv2QkSupJwLYyfS+gwdirm0ukyTFNl3KTgZEAiJDd+iHZcbTPPcWCcRYH+WlNbwChgH2QkE9NSlNrMT8Gfw==",
+      "cpu": [
+        "arm"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-arm64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.3.tgz",
+      "integrity": "sha512-sZOuFz/xWnZ4KH3YfFrKCf1WyPZHakVzTiqji3WDc0BCl2kBwiJLCXpzLzUBLgmp4veFZdvN5ChW4Eq/8Fc2Fg==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-ia32": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.3.tgz",
+      "integrity": "sha512-yGlQYjdxtLdh0a3jHjuwOrxQjOZYD/C9PfdbgJJF3TIZWnm/tMd/RcNiLngiu4iwcBAOezdnSLAwQDPqTmtTYg==",
+      "cpu": [
+        "ia32"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-loong64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.3.tgz",
+      "integrity": "sha512-WO60Sn8ly3gtzhyjATDgieJNet/KqsDlX5nRC5Y3oTFcS1l0KWba+SEa9Ja1GfDqSF1z6hif/SkpQJbL63cgOA==",
+      "cpu": [
+        "loong64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-mips64el": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.3.tgz",
+      "integrity": "sha512-APsymYA6sGcZ4pD6k+UxbDjOFSvPWyZhjaiPyl/f79xKxwTnrn5QUnXR5prvetuaSMsb4jgeHewIDCIWljrSxw==",
+      "cpu": [
+        "mips64el"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-ppc64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.3.tgz",
+      "integrity": "sha512-eizBnTeBefojtDb9nSh4vvVQ3V9Qf9Df01PfawPcRzJH4gFSgrObw+LveUyDoKU3kxi5+9RJTCWlj4FjYXVPEA==",
+      "cpu": [
+        "ppc64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-riscv64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.3.tgz",
+      "integrity": "sha512-3Emwh0r5wmfm3ssTWRQSyVhbOHvqegUDRd0WhmXKX2mkHJe1SFCMJhagUleMq+Uci34wLSipf8Lagt4LlpRFWQ==",
+      "cpu": [
+        "riscv64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-s390x": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.3.tgz",
+      "integrity": "sha512-pBHUx9LzXWBc7MFIEEL0yD/ZVtNgLytvx60gES28GcWMqil8ElCYR4kvbV2BDqsHOvVDRrOxGySBM9Fcv744hw==",
+      "cpu": [
+        "s390x"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/linux-x64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.3.tgz",
+      "integrity": "sha512-Czi8yzXUWIQYAtL/2y6vogER8pvcsOsk5cpwL4Gk5nJqH5UZiVByIY8Eorm5R13gq+DQKYg0+JyQoytLQas4dA==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/netbsd-arm64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.3.tgz",
+      "integrity": "sha512-sDpk0RgmTCR/5HguIZa9n9u+HVKf40fbEUt+iTzSnCaGvY9kFP0YKBWZtJaraonFnqef5SlJ8/TiPAxzyS+UoA==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "netbsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/netbsd-x64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.3.tgz",
+      "integrity": "sha512-P14lFKJl/DdaE00LItAukUdZO5iqNH7+PjoBm+fLQjtxfcfFE20Xf5CrLsmZdq5LFFZzb5JMZ9grUwvtVYzjiA==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "netbsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/openbsd-arm64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.3.tgz",
+      "integrity": "sha512-AIcMP77AvirGbRl/UZFTq5hjXK+2wC7qFRGoHSDrZ5v5b8DK/GYpXW3CPRL53NkvDqb9D+alBiC/dV0Fb7eJcw==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "openbsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/openbsd-x64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.3.tgz",
+      "integrity": "sha512-DnW2sRrBzA+YnE70LKqnM3P+z8vehfJWHXECbwBmH/CU51z6FiqTQTHFenPlHmo3a8UgpLyH3PT+87OViOh1AQ==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "openbsd"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/openharmony-arm64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.3.tgz",
+      "integrity": "sha512-NinAEgr/etERPTsZJ7aEZQvvg/A6IsZG/LgZy+81wON2huV7SrK3e63dU0XhyZP4RKGyTm7aOgmQk0bGp0fy2g==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "openharmony"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/sunos-x64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.3.tgz",
+      "integrity": "sha512-PanZ+nEz+eWoBJ8/f8HKxTTD172SKwdXebZ0ndd953gt1HRBbhMsaNqjTyYLGLPdoWHy4zLU7bDVJztF5f3BHA==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "sunos"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/win32-arm64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.3.tgz",
+      "integrity": "sha512-B2t59lWWYrbRDw/tjiWOuzSsFh1Y/E95ofKz7rIVYSQkUYBjfSgf6oeYPNWHToFRr2zx52JKApIcAS/D5TUBnA==",
+      "cpu": [
+        "arm64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/win32-ia32": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.3.tgz",
+      "integrity": "sha512-QLKSFeXNS8+tHW7tZpMtjlNb7HKau0QDpwm49u0vUp9y1WOF+PEzkU84y9GqYaAVW8aH8f3GcBck26jh54cX4Q==",
+      "cpu": [
+        "ia32"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@esbuild/win32-x64": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.3.tgz",
+      "integrity": "sha512-4uJGhsxuptu3OcpVAzli+/gWusVGwZZHTlS63hh++ehExkVT8SgiEf7/uC/PclrPPkLhZqGgCTjd0VWLo6xMqA==",
+      "cpu": [
+        "x64"
+      ],
+      "dev": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">=18"
+      }
+    },
     "node_modules/@gerrit0/mini-shiki": {
       "version": "3.22.0",
       "resolved": "https://registry.npmjs.org/@gerrit0/mini-shiki/-/mini-shiki-3.22.0.tgz",
@@ -186,6 +631,16 @@
         "@types/unist": "*"
       }
     },
+    "node_modules/@types/node": {
+      "version": "25.3.0",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-25.3.0.tgz",
+      "integrity": "sha512-4K3bqJpXpqfg2XKGK9bpDTc6xO/xoUP/RBWS7AtRMug6zZFaRekiLzjVtAoZMquxoAbzBvy5nxQ7veS5eYzf8A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "undici-types": "~7.18.0"
+      }
+    },
     "node_modules/@types/unist": {
       "version": "3.0.3",
       "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz",
@@ -484,6 +939,48 @@
         "url": "https://github.com/fb55/entities?sponsor=1"
       }
     },
+    "node_modules/esbuild": {
+      "version": "0.27.3",
+      "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.3.tgz",
+      "integrity": "sha512-8VwMnyGCONIs6cWue2IdpHxHnAjzxnw2Zr7MkVxB2vjmQ2ivqGFb4LEG3SMnv0Gb2F/G/2yA8zUaiL1gywDCCg==",
+      "dev": true,
+      "hasInstallScript": true,
+      "license": "MIT",
+      "bin": {
+        "esbuild": "bin/esbuild"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "optionalDependencies": {
+        "@esbuild/aix-ppc64": "0.27.3",
+        "@esbuild/android-arm": "0.27.3",
+        "@esbuild/android-arm64": "0.27.3",
+        "@esbuild/android-x64": "0.27.3",
+        "@esbuild/darwin-arm64": "0.27.3",
+        "@esbuild/darwin-x64": "0.27.3",
+        "@esbuild/freebsd-arm64": "0.27.3",
+        "@esbuild/freebsd-x64": "0.27.3",
+        "@esbuild/linux-arm": "0.27.3",
+        "@esbuild/linux-arm64": "0.27.3",
+        "@esbuild/linux-ia32": "0.27.3",
+        "@esbuild/linux-loong64": "0.27.3",
+        "@esbuild/linux-mips64el": "0.27.3",
+        "@esbuild/linux-ppc64": "0.27.3",
+        "@esbuild/linux-riscv64": "0.27.3",
+        "@esbuild/linux-s390x": "0.27.3",
+        "@esbuild/linux-x64": "0.27.3",
+        "@esbuild/netbsd-arm64": "0.27.3",
+        "@esbuild/netbsd-x64": "0.27.3",
+        "@esbuild/openbsd-arm64": "0.27.3",
+        "@esbuild/openbsd-x64": "0.27.3",
+        "@esbuild/openharmony-arm64": "0.27.3",
+        "@esbuild/sunos-x64": "0.27.3",
+        "@esbuild/win32-arm64": "0.27.3",
+        "@esbuild/win32-ia32": "0.27.3",
+        "@esbuild/win32-x64": "0.27.3"
+      }
+    },
     "node_modules/escalade": {
       "version": "3.2.0",
       "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
@@ -526,6 +1023,21 @@
         "node": ">=14.14"
       }
     },
+    "node_modules/fsevents": {
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
+      "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
+      "dev": true,
+      "hasInstallScript": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
     "node_modules/get-caller-file": {
       "version": "2.0.5",
       "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
@@ -536,6 +1048,19 @@
         "node": "6.* || 8.* || >= 10.*"
       }
     },
+    "node_modules/get-tsconfig": {
+      "version": "4.13.6",
+      "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.6.tgz",
+      "integrity": "sha512-shZT/QMiSHc/YBLxxOkMtgSid5HFoauqCE3/exfsEcwg1WkeqjG+V40yBbBrsD+jW2HDXcs28xOfcbm2jI8Ddw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "resolve-pkg-maps": "^1.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
+      }
+    },
     "node_modules/glob": {
       "version": "11.1.0",
       "resolved": "https://registry.npmjs.org/glob/-/glob-11.1.0.tgz",
@@ -840,6 +1365,16 @@
         "node": ">=0.10.0"
       }
     },
+    "node_modules/resolve-pkg-maps": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
+      "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==",
+      "dev": true,
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
+      }
+    },
     "node_modules/semver": {
       "version": "7.7.3",
       "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
@@ -1020,6 +1555,26 @@
         "node": ">=18"
       }
     },
+    "node_modules/tsx": {
+      "version": "4.21.0",
+      "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.21.0.tgz",
+      "integrity": "sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "esbuild": "~0.27.0",
+        "get-tsconfig": "^4.7.5"
+      },
+      "bin": {
+        "tsx": "dist/cli.mjs"
+      },
+      "engines": {
+        "node": ">=18.0.0"
+      },
+      "optionalDependencies": {
+        "fsevents": "~2.3.3"
+      }
+    },
     "node_modules/typedoc": {
       "version": "0.28.17",
       "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.28.17.tgz",
@@ -1074,12 +1629,11 @@
       }
     },
     "node_modules/typescript": {
-      "version": "5.8.3",
-      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz",
-      "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==",
+      "version": "5.9.3",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz",
+      "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
       "dev": true,
       "license": "Apache-2.0",
-      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"
@@ -1095,6 +1649,13 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/undici-types": {
+      "version": "7.18.2",
+      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz",
+      "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/universalify": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz",
diff --git a/package.json b/package.json
index de6e841..6dcedfc 100644
--- a/package.json
+++ b/package.json
@@ -2,25 +2,27 @@
   "name": "@lloyal-labs/lloyal.node",
   "version": "1.6.0",
   "description": "Node.js client for liblloyal+llama.cpp",
-  "main": "lib/index.js",
-  "types": "lib/index.d.ts",
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
   "gypfile": false,
   "publishConfig": {
     "access": "public"
   },
   "scripts": {
     "download-models": "bash scripts/download-test-models.sh",
-    "build": "node scripts/build.js",
+    "build:native": "node scripts/build.js",
+    "build:ts": "tsc",
+    "build": "npm run build:ts && npm run build:native",
     "build:debug": "cmake-js compile --debug",
     "rebuild": "cmake-js rebuild",
-    "clean": "cmake-js clean && rm -rf build_test/",
+    "clean": "cmake-js clean && rm -rf build_test/ dist/",
     "version": "node scripts/sync-versions.js && git add -A",
     "docs": "npx typedoc",
     "test": "npm run test:integration",
-    "test:integration": "node test/integration.js",
-    "test:examples": "node test/examples.js",
+    "test:integration": "npx tsx test/integration.ts",
+    "test:examples": "npx tsx test/examples.ts",
     "sync:llama-cpp": "node scripts/sync-llama-cpp.js",
-    "example": "node examples/chat/chat.mjs"
+    "example": "npx tsx examples/chat/chat.ts"
   },
   "repository": {
     "type": "git",
@@ -47,10 +49,13 @@
     "node-addon-api": "^8.5.0"
   },
   "devDependencies": {
+    "@types/node": "^25.3.0",
     "cmake-js": "^8.0.0",
     "glob": "^11.0.0",
+    "tsx": "^4.21.0",
     "typedoc": "^0.28.16",
-    "typedoc-rhineai-theme": "^1.2.0"
+    "typedoc-rhineai-theme": "^1.2.0",
+    "typescript": "^5.9.3"
   },
   "optionalDependencies": {
     "@lloyal-labs/lloyal.node-darwin-arm64": "1.6.0",
@@ -71,7 +76,7 @@
     "node": ">=22.0.0"
   },
   "files": [
-    "lib/",
+    "dist/",
     "scripts/"
   ]
 }
diff --git a/lib/Agent.js b/src/Agent.ts
similarity index 67%
rename from lib/Agent.js
rename to src/Agent.ts
index 4f05a2a..76097bd 100644
--- a/lib/Agent.js
+++ b/src/Agent.ts
@@ -1,26 +1,42 @@
+import type { Branch } from './Branch';
+import type {
+  AgentState,
+  AgentTask,
+  ParsedToolCall,
+  RunAgentsOptions,
+  RunAgentsResult,
+  SessionContext,
+} from './types';
+
 /**
- * Agent - forkAgent + runAgents
+ * Fork an agent from a parent branch with its own system prompt + task
  *
- * Two exported functions for the agentic loop pattern:
- * - forkAgent: fork from parent, format task, compute suffix tokens
- * - runAgents: three-phase tick loop (PRODUCE -> COMMIT -> SETTLE)
+ * Always prepends getTurnSeparator() for a clean structural break before
+ * the agent's system prompt. Returns AgentState ready for store.prefill().
  *
- * Decoupled from Session — takes ctx directly, operates on agent branches.
- * Consumer wires tool dispatch, callbacks, and Session separately.
- */
-
-/**
- * Fork an agent from a parent branch with its own system prompt + task.
+ * @param parent - Branch to fork from
+ * @param task - Agent task description
+ * @param ctx - SessionContext for formatting and tokenization
+ * @returns AgentState with branch and suffixTokens
  *
- * Always prepends getTurnSeparator() — forces clean break before agent's
- * system prompt. Returns AgentState ready for store.prefill().
+ * @example
+ * ```typescript
+ * const agent = await forkAgent(trunk, {
+ *   systemPrompt: 'You are a research assistant.',
+ *   content: 'What is X?',
+ *   tools: toolsJson,
+ *   seed: Date.now(),
+ * }, ctx);
+ * await store.prefill([[agent.branch, agent.suffixTokens]]);
+ * ```
  *
- * @param {Branch} parent - Branch to fork from
- * @param {{ systemPrompt: string, content: string, tools?: string, seed?: number }} task
- * @param {SessionContext} ctx
- * @returns {Promise<AgentState>}
+ * @category Branching
  */
-async function forkAgent(parent, task, ctx) {
+export async function forkAgent(
+  parent: Branch,
+  task: AgentTask,
+  ctx: SessionContext
+): Promise<AgentState> {
   const branch = await parent.fork();
   const messages = [
     { role: 'system', content: task.systemPrompt },
@@ -50,29 +66,31 @@ async function forkAgent(parent, task, ctx) {
 }
 
 /**
- * Run agents in a batched three-phase tick loop.
+ * Run agents in a batched three-phase tick loop
  *
- * Mechanics preserved from runAgentSwarm:
- * - Three-phase tick: PRODUCE -> COMMIT -> SETTLE
- * - Fire-and-forget tool dispatch (tools run while other agents generate)
- * - Warm prefill with sep + delta when tools resolve
- * - `report` tool as completion signal (not dispatched to executeTool)
- * - Non-blocking settle via Promise.race
- * - Idle yield when all active agents are pending tools
+ * Each tick runs PRODUCE -> COMMIT -> SETTLE, keeping BranchStore's efficiencies:
+ * shared-prefix KV, batched decode, fire-and-forget tools, idle yield.
  *
- * @param {AgentState[]} agents
- * @param {{
- *   store: BranchStore,
- *   ctx: SessionContext,
- *   executeTool: (name: string, args: object) => Promise<any>,
- *   maxTurns?: number,
- *   onToolCall?: (agentIndex: number, toolName: string, args: string) => void,
- *   onToolResult?: (agentIndex: number, toolName: string, resultStr: string) => void,
- *   onReport?: (agentIndex: number, findings: string) => void,
- * }} opts
- * @returns {Promise<{ totalTokens: number, totalToolCalls: number, steps: number, counters: object }>}
+ * @param agents - Array of AgentState (from forkAgent or manual construction)
+ * @param opts - Configuration including store, ctx, executeTool, and callbacks
+ * @returns Aggregate statistics
+ *
+ * @example
+ * ```typescript
+ * const result = await runAgents(agents, {
+ *   store, ctx,
+ *   executeTool: (name, args) => myToolDispatch(name, args),
+ *   maxTurns: 6,
+ *   onToolCall(ai, name, args) { console.log(`Agent ${ai}: ${name}`); },
+ * });
+ * ```
+ *
+ * @category Branching
  */
-async function runAgents(agents, opts) {
+export async function runAgents(
+  agents: AgentState[],
+  opts: RunAgentsOptions
+): Promise<RunAgentsResult> {
   const { store, ctx, executeTool, maxTurns = 6, onToolCall, onToolResult, onReport } = opts;
   const sep = ctx.getTurnSeparator();
 
@@ -86,11 +104,13 @@ async function runAgents(agents, opts) {
     idleTicks: 0,
   };
 
-  // pendingTools: Map<agentIndex, { promise, name }>
-  const pendingTools = new Map();
+  const pendingTools = new Map<number, {
+    promise: Promise<{ ai: number; prefillTokens: number[] | null }>;
+    name: string;
+  }>();
 
-  function dispatchTool(ai, w, tc) {
-    let toolArgs;
+  function dispatchTool(ai: number, w: AgentState, tc: ParsedToolCall): void {
+    let toolArgs: Record<string, unknown>;
     try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
     const callId = tc.id || `call_${w.toolCallCount}`;
 
@@ -107,8 +127,6 @@ async function runAgents(agents, opts) {
 
         if (onToolResult) onToolResult(ai, tc.name, resultStr);
 
-        // Format warm prefill tokens — the assistant's tool-call turn is
-        // already in KV from generation; sep closes it.
         const { prompt } = await ctx.formatChat(
           JSON.stringify([
             { role: 'system', content: '' },
@@ -116,10 +134,10 @@ async function runAgents(agents, opts) {
           ])
         );
         const delta = await ctx.tokenize(prompt, false);
-        return { ai, prefillTokens: [...sep, ...delta] };
+        return { ai, prefillTokens: [...sep, ...delta] as number[] | null };
       } catch (err) {
         w.done = true;
-        w.findings = `Tool error: ${err.message}`;
+        w.findings = `Tool error: ${(err as Error).message}`;
         return { ai, prefillTokens: null };
       }
     })();
@@ -130,7 +148,7 @@ async function runAgents(agents, opts) {
 
   for (;;) {
     // -- Phase 1: PRODUCE -- sample from active agents
-    const entries = [];
+    const entries: [Branch, number][] = [];
     for (let ai = 0; ai < agents.length; ai++) {
       const w = agents[ai];
       if (w.done || pendingTools.has(ai)) continue;
@@ -156,7 +174,7 @@ async function runAgents(agents, opts) {
           w.toolCallCount++;
           totalToolCalls++;
           if (onToolCall) onToolCall(ai, 'report', tc.arguments);
-          if (onReport) onReport(ai, w.findings);
+          if (onReport) onReport(ai, w.findings!);
           continue;
         }
 
@@ -178,7 +196,7 @@ async function runAgents(agents, opts) {
     }
 
     // -- Phase 3: SETTLE -- non-blocking check for resolved tools
-    const prefillPairs = [];
+    const prefillPairs: [Branch, number[]][] = [];
     for (const [ai, info] of pendingTools) {
       const result = await Promise.race([info.promise, Promise.resolve(null)]);
       if (result !== null) {
@@ -212,5 +230,3 @@ async function runAgents(agents, opts) {
   const totalTokens = agents.reduce((s, w) => s + w.tokenCount, 0);
   return { totalTokens, totalToolCalls, steps, counters };
 }
-
-module.exports = { forkAgent, runAgents };
diff --git a/src/Branch.ts b/src/Branch.ts
new file mode 100644
index 0000000..d444395
--- /dev/null
+++ b/src/Branch.ts
@@ -0,0 +1,565 @@
+import type { SessionContext, SamplingParams, Produced } from './types';
+
+/**
+ * Forkable inference handle for covalent generation
+ *
+ * A Branch owns everything needed for independent generation: a KV cache
+ * sequence, sampler chain, logits snapshot, and perplexity tracker.
+ *
+ * Forking is cheap — the KV prefix is shared in memory (metadata-only operation under unified KV —
+ * no KV tensor buffers are copied), so sibling branches read from the same physical KV entries.
+ * Only tokens decoded after the fork point are exclusive to each branch.
+ *
+ * Branches form trees, not just flat lists. Fork from root for best-of-N,
+ * fork from children for tree search/beam search, fork from a draft for speculative
+ * decoding.
+ *
+ * The produce/commit protocol separates sampling from state advancement:
+ * produce() samples without writing to KV, letting you inspect the result
+ * before deciding to commit().
+ *
+ * @example Best-of-N with perplexity selection
+ * ```typescript
+ * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 });
+ * await root.prefill(tokens);
+ *
+ * const results = [];
+ * for (let i = 0; i < 5; i++) {
+ *   const branch = await root.fork();
+ *   branch.reseedSampler(1000 + i);
+ *   const generated = [];
+ *   for await (const { token } of branch) generated.push(token);
+ *   results.push({ branch, tokens: generated, ppl: branch.perplexity });
+ * }
+ *
+ * const best = results.reduce((a, b) => a.ppl < b.ppl ? a : b);
+ * for (const r of results) { if (r !== best) await r.branch.prune(); }
+ * ```
+ *
+ * @category Branching
+ */
+export class Branch {
+  private _ctx: SessionContext;
+  private _handle: number;
+  private _disposed: boolean;
+
+  constructor(ctx: SessionContext, handle: number) {
+    this._ctx = ctx;
+    this._handle = handle;
+    this._disposed = false;
+  }
+
+  /**
+   * Create a root branch at the given position
+   *
+   * The branch takes ownership of the sequence and creates its own sampler
+   * chain from the provided params. Call prefill() to decode prompt tokens
+   * and capture the logit distribution before forking.
+   *
+   * @param ctx - SessionContext to create branch on
+   * @param position - Starting position (typically prompt token count)
+   * @param params - Sampling parameters (temperature, topP, etc.)
+   * @param nBatch - Per-branch batch size override (defaults to context nBatch).
+   *   Controls chunk size for prefill(). Has no effect on
+   *   single-token commit() which uses a zero-allocation fast path.
+   * @param grammar - GBNF grammar string for constrained generation.
+   *   When provided, sample() returns only grammar-valid tokens. The grammar state
+   *   is cloned on fork(), so sibling branches can diverge independently.
+   * @returns New Branch instance
+   */
+  static create(
+    ctx: SessionContext,
+    position: number,
+    params?: SamplingParams,
+    nBatch?: number,
+    grammar?: string
+  ): Branch {
+    const handle = ctx._branchCreate(position, params, nBatch, grammar);
+    return new Branch(ctx, handle);
+  }
+
+  /**
+   * Fork this branch to a new sequence
+   *
+   * The child shares the parent's KV prefix in memory (metadata-only under unified KV, no KV buffer copy).
+   * Logits, sampler state, and perplexity tracker are cloned so the child
+   * can diverge independently. Fork from any branch — root or intermediate —
+   * to build arbitrarily deep trees.
+   *
+   * Call reseedSampler() on each child for stochastic diversity.
+   *
+   * @returns New forked Branch
+   */
+  async fork(): Promise<Branch> {
+    this._ensureNotDisposed();
+    const newHandle = this._ctx._branchFork(this._handle);
+    return new Branch(this._ctx, newHandle);
+  }
+
+  /**
+   * Get a copy of this branch's captured logits snapshot.
+   *
+   * Returns n_vocab floats — the raw logit distribution from the last
+   * prefill() or commit() call.
+   *
+   * Returns an independent copy of the branch's internal snapshot.
+   * The returned Float32Array is safe to hold across async boundaries
+   * and is not affected by subsequent decode operations.
+   *
+   * @returns Independent copy of the logits snapshot (n_vocab elements)
+   * @throws If no logits have been captured yet
+   */
+  getLogits(): Float32Array {
+    this._ensureNotDisposed();
+    return this._ctx._branchGetLogits(this._handle);
+  }
+
+  /**
+   * Bulk-decode tokens into the branch's KV cache and capture logits.
+   *
+   * `tokens.length` is the total count to process; the branch's `nBatch`
+   * (set at `Branch.create`) controls how many are sent per `llama_decode`
+   * call. E.g. 500 tokens with `nBatch=64` → 8 calls (7×64 + 1×52).
+   *
+   * Advances `position` by `tokens.length`. Stores final logits into the
+   * branch's internal snapshot — the next `produce()`/`sample()` reads
+   * from it.
+   *
+   * Does NOT accept tokens into the repeat-penalty window — for external
+   * tokens (user input between turns), not model-generated tokens.
+   * For model output, use `commit()` which does accept + decode.
+   *
+   * The primary way to feed tokens into a branch's KV cache.
+   *
+   * @param tokens - Token IDs to decode
+   */
+  async prefill(tokens: number[]): Promise<void> {
+    this._ensureNotDisposed();
+    await this._ctx._branchPrefill(this._handle, tokens);
+  }
+
+  /**
+   * Sample next token from branch's logits snapshot
+   *
+   * Applies the branch's full sampler chain (top-k, top-p, temperature,
+   * repeat/presence penalties) to the captured logits.
+   *
+   * @returns Sampled token ID
+   */
+  sample(): number {
+    this._ensureNotDisposed();
+    return this._ctx._branchSample(this._handle);
+  }
+
+  /**
+   * Record token in the sampler's repeat/presence penalty window
+   *
+   * @param token - Token to accept
+   */
+  accept(token: number): void {
+    this._ensureNotDisposed();
+    this._ctx._branchAccept(this._handle, token);
+  }
+
+  /**
+   * Discard this branch — remove its divergent KV entries and free the handle
+   *
+   * Only removes KV entries divergent from the shared prefix; sibling branches
+   * are unaffected. The disposed flag is set synchronously — any call to
+   * produce(), commit(), etc. after prune() will throw immediately, even
+   * before the returned promise resolves.
+   *
+   * RESTRICT mode: throws if children exist. Use {@link pruneSubtree} to
+   * cascade-delete an entire subtree.
+   */
+  async prune(): Promise<void> {
+    if (this._disposed) return;
+    this._ctx._branchPrune(this._handle);
+    this._disposed = true;
+  }
+
+  /**
+   * Discard this branch and all its descendants — CASCADE delete
+   *
+   * Iterative post-order traversal: prunes children first, then this branch.
+   * Use when tearing down an entire subtree (e.g. abandoned search path).
+   * Sets disposed synchronously, like {@link prune}.
+   */
+  async pruneSubtree(): Promise<void> {
+    if (this._disposed) return;
+    this._ctx._branchPruneSubtree(this._handle);
+    this._disposed = true;
+  }
+
+  /**
+   * Reseed the sampler's PRNG for diversity after fork()
+   *
+   * CRITICAL for parallel generation: Without reseeding, all forked branches
+   * produce identical outputs because fork() clones the parent's sampler state.
+   *
+   * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged.
+   *
+   * @param seed - New seed for the PRNG
+   */
+  reseedSampler(seed: number): void {
+    this._ensureNotDisposed();
+    this._ctx._branchSamplerChainReseed(this._handle, seed);
+  }
+
+  /**
+   * Apply dynamic logit adjustments for this branch only
+   *
+   * Unlike `logit_bias` in sampling params (which is cloned on fork), steer biases
+   * are NOT inherited by child branches. Each branch manages its own steer state
+   * independently. This makes steer ideal for path-dependent constraints.
+   *
+   * **Use cases:**
+   * - **tsampler**: Block tokens that would create repeated N-grams based on
+   *   this branch's specific generation history
+   * - **Diverse beam search**: Penalize tokens already chosen by sibling beams
+   *   to encourage output diversity across the beam
+   * - **Dynamic constraints**: Apply token restrictions that change per-step
+   *
+   * **Sampling order:** Grammar → Logit Bias → Steer → Sampler Chain
+   *
+   * @param biases - Array of token adjustments. Use `-Infinity` to completely
+   *   block a token, positive values to boost probability, negative to reduce.
+   *
+   * @example Block tokens for N-gram deduplication (tsampler pattern)
+   * ```ts
+   * // Compute which tokens would create repeated 4-grams
+   * const blocked = computeNgramBlocks(generatedTokens, 4);
+   *
+   * // Block those tokens for this sample only
+   * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
+   *
+   * const { token } = await branch.produce();  // Blocked tokens won't be sampled
+   * await branch.commit(token);
+   *
+   * // Clear for next iteration (recompute based on new history)
+   * branch.clearSteer();
+   * ```
+   *
+   * @example Diverse beam search
+   * ```ts
+   * // Each beam penalizes tokens chosen by siblings this step
+   * for (const beam of beams) {
+   *   // Collect tokens chosen by other beams
+   *   const siblingTokens = beams
+   *     .filter(b => b !== beam && b.lastToken !== undefined)
+   *     .map(b => b.lastToken);
+   *
+   *   // Penalize sibling choices to encourage diversity
+   *   beam.branch.steer(siblingTokens.map(t => ({ token: t, bias: -2.0 })));
+   *
+   *   const { token } = await beam.branch.produce();
+   *   await beam.branch.commit(token);
+   *   beam.lastToken = token;
+   *   beam.branch.clearSteer();
+   * }
+   * ```
+   *
+   * @example Boost specific tokens
+   * ```ts
+   * // Boost "yes" and "no" tokens for a yes/no question
+   * branch.steer([
+   *   { token: yesTokenId, bias: 5.0 },
+   *   { token: noTokenId, bias: 5.0 }
+   * ]);
+   * ```
+   */
+  steer(biases: Array<{ token: number; bias: number }>): void {
+    this._ensureNotDisposed();
+    this._ctx._branchSteer(this._handle, biases);
+  }
+
+  /**
+   * Clear all steer biases from this branch
+   *
+   * Removes any dynamic logit adjustments set by `steer()`. Call this after
+   * each generation step if your steer constraints are computed per-step
+   * (e.g., N-gram blocking where the blocked set changes as text grows).
+   *
+   * @example Per-step steer pattern
+   * ```ts
+   * for (let i = 0; i < maxTokens; i++) {
+   *   // Compute constraints based on current state
+   *   const blocked = computeConstraints(generatedTokens);
+   *   branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
+   *
+   *   const { token, isStop } = await branch.produce();
+   *   if (isStop) break;
+   *
+   *   await branch.commit(token);
+   *   branch.clearSteer();  // Reset for next iteration
+   *   generatedTokens.push(token);
+   * }
+   * ```
+   */
+  clearSteer(): void {
+    this._ensureNotDisposed();
+    this._ctx._branchClearSteer(this._handle);
+  }
+
+  /**
+   * Replace the sampler chain with new parameters (memoized)
+   *
+   * If the new params match the current chain's params, this is a no-op.
+   * Otherwise the old chain is freed and a new one is created. Use for
+   * Entropy-Driven Temperature (EDT) and other adaptive sampling strategies
+   * that adjust parameters per-step.
+   *
+   * @param params - New sampling parameters
+   *
+   * @example Entropy-Driven Temperature
+   * ```typescript
+   * const entropy = branch.modelEntropy('nats');
+   * branch.setSamplerParams({ temperature: edtTemperature(entropy) });
+   * const { token } = await branch.produce();
+   * await branch.commit(token);
+   * ```
+   */
+  setSamplerParams(params: SamplingParams): void {
+    this._ensureNotDisposed();
+    this._ctx._branchSetSamplerParams(this._handle, params);
+  }
+
+  /**
+   * Replace or remove the grammar constraint
+   *
+   * Pass a GBNF grammar string to constrain generation. Pass empty string
+   * or undefined to remove the constraint. The grammar state is cloned on
+   * fork(), so sibling branches can diverge independently after hot-swap.
+   *
+   * @param grammarStr - GBNF grammar string, or empty/undefined to remove
+   *
+   * @example Hot-swap grammar mid-generation
+   * ```typescript
+   * // Start unconstrained, then switch to JSON after detecting tool call
+   * branch.setGrammar(jsonGrammar);
+   * const { token } = await branch.produce();
+   * ```
+   */
+  setGrammar(grammarStr?: string): void {
+    this._ensureNotDisposed();
+    this._ctx._branchSetGrammar(this._handle, grammarStr || '');
+  }
+
+  /**
+   * Sample next token without advancing state (async)
+   *
+   * Async contract: local branches resolve immediately; cloud branches
+   * may perform an HTTP round-trip. Use {@link produceSync} when you know
+   * the branch is local and want zero-overhead sampling.
+   */
+  async produce(): Promise<Produced> {
+    return this.produceSync();
+  }
+
+  /**
+   * Sample next token without advancing state (sync)
+   *
+   * Same as {@link produce} but synchronous. Use when you know the branch
+   * is local and want to avoid the microtick overhead of a promise.
+   */
+  produceSync(): Produced {
+    this._ensureNotDisposed();
+    const token = this.sample();
+    return {
+      token,
+      text: this._ctx.tokenToText(token),
+      isStop: this._ctx.isStopToken(token),
+    };
+  }
+
+  /**
+   * Accept and decode — update branch state, then write token to KV
+   *
+   * Accepts the token into the sampler penalty window (for correct PPL
+   * measurement), then decodes (writing to KV cache via AsyncWorker on
+   * the libuv thread pool) and captures the resulting logits for the next
+   * produce() call. Accept-first ordering with rollback: if decode throws,
+   * sampler/grammar/metrics are restored from clones.
+   *
+   * @param token - Token to commit (from produce())
+   */
+  async commit(token: number): Promise<void> {
+    this._ensureNotDisposed();
+    await this._ctx._storeCommit([this._handle], [token]);
+  }
+
+  // ===== METRICS =====
+
+  /**
+   * Compute entropy of the branch's logits distribution
+   *
+   * Measures model uncertainty from the branch's captured logits snapshot:
+   * - Low entropy: Model is confident (peaked distribution)
+   * - High entropy: Model is uncertain (flat distribution)
+   *
+   * Operates directly on `state->logits_snapshot` — no JS round-trip.
+   *
+   * @param base - Logarithm base: "nats" (default) or "bits"
+   * @returns Entropy value in specified base
+   *
+   * COST: O(n_vocab) - must sum over all token probabilities
+   */
+  modelEntropy(base: 'nats' | 'bits' = 'nats'): number {
+    this._ensureNotDisposed();
+    return this._ctx._branchModelEntropy(this._handle, base);
+  }
+
+  /**
+   * Compute surprisal (negative log-likelihood) for a specific token
+   *
+   * Measures how "surprising" the model finds the given token from
+   * the branch's captured logits snapshot:
+   * - Low surprisal: Model expected this token (high probability)
+   * - High surprisal: Model didn't expect this token (low probability)
+   *
+   * Operates directly on `state->logits_snapshot` — no JS round-trip.
+   *
+   * @param token - Token ID to compute surprisal for
+   * @param base - Logarithm base: "nats" (default) or "bits"
+   * @returns Surprisal value in specified base
+   *
+   * COST: O(n_vocab) - softmax normalization required
+   */
+  modelSurprisal(token: number, base: 'nats' | 'bits' = 'nats'): number {
+    this._ensureNotDisposed();
+    return this._ctx._branchModelSurprisal(this._handle, token, base);
+  }
+
+  /**
+   * Sampling-level perplexity (from filtered distribution)
+   *
+   * Returns perplexity from the distribution actually sampled from
+   * (after top-k/p/temp/penalties). Useful for policy priors and
+   * monitoring sampler chain impact.
+   *
+   * Compare with {@link perplexity} which is model-level (raw logits).
+   */
+  get samplingPerplexity(): number {
+    this._ensureNotDisposed();
+    return this._ctx._branchGetSamplingPerplexity(this._handle);
+  }
+
+  /**
+   * Set static logit biases on this branch
+   *
+   * Unlike {@link steer} (which is NOT inherited on fork), logit biases
+   * ARE cloned when forking. Use for persistent constraints that should
+   * propagate to child branches.
+   *
+   * Applied during sample() in order: Grammar -> Logit Bias -> Steer -> Sampler Chain
+   *
+   * @param biases - Array of token adjustments. Use `-Infinity` to block,
+   *   positive to boost, negative to reduce.
+   */
+  setLogitBias(biases: Array<{ token: number; bias: number }>): void {
+    this._ensureNotDisposed();
+    this._ctx._branchSetLogitBias(this._handle, biases);
+  }
+
+  /**
+   * Clear all static logit biases from this branch
+   */
+  clearLogitBias(): void {
+    this._ensureNotDisposed();
+    this._ctx._branchClearLogitBias(this._handle);
+  }
+
+  // ===== ACCESSORS =====
+
+  /** Branch's current position (number of tokens decoded) */
+  get position(): number {
+    this._ensureNotDisposed();
+    return this._ctx._branchGetPosition(this._handle);
+  }
+
+  /** Branch's perplexity (exp of mean surprisal) */
+  get perplexity(): number {
+    this._ensureNotDisposed();
+    return this._ctx._branchGetPerplexity(this._handle);
+  }
+
+  /** Internal handle (for debugging) */
+  get handle(): number {
+    return this._handle;
+  }
+
+  /** Whether this branch has been disposed */
+  get disposed(): boolean {
+    return this._disposed;
+  }
+
+  /** Parent branch handle, or null if root */
+  get parent(): number | null {
+    this._ensureNotDisposed();
+    const h = this._ctx._branchParent(this._handle);
+    return h === 0 ? null : h;
+  }
+
+  /** Child branch handles */
+  get children(): number[] {
+    this._ensureNotDisposed();
+    return this._ctx._branchChildren(this._handle);
+  }
+
+  /** True if this branch has no children */
+  get isLeaf(): boolean {
+    this._ensureNotDisposed();
+    return this._ctx._branchIsLeaf(this._handle);
+  }
+
+  /** True if this branch holds a KV lease */
+  get isActive(): boolean {
+    this._ensureNotDisposed();
+    return this._ctx._branchIsActive(this._handle);
+  }
+
+  // ===== ASYNC ITERATION =====
+
+  /**
+   * Async iterator — generate tokens until EOG
+   *
+   * Commit-before-yield semantics: every yielded token is already written
+   * to KV and accepted into the sampler. Breaking out of the loop is clean —
+   * no orphaned uncommitted tokens, perplexity reflects all yielded tokens.
+   *
+   * For inspect-before-commit (speculative decoding, tree search), use
+   * the {@link produce}/{@link commit} protocol directly.
+   *
+   * @example Generate to completion
+   * ```typescript
+   * for await (const { token, text } of branch) {
+   *   process.stdout.write(text);
+   * }
+   * ```
+   *
+   * @example Generate with consumer-side bound
+   * ```typescript
+   * const tokens = [];
+   * for await (const { token } of branch) {
+   *   tokens.push(token);
+   *   if (tokens.length >= limit) break;
+   * }
+   * ```
+   */
+  async *[Symbol.asyncIterator](): AsyncIterableIterator<{ token: number; text: string }> {
+    while (!this._disposed) {
+      const { token, text, isStop } = await this.produce();
+      if (isStop) return;
+      await this.commit(token);
+      yield { token, text };
+    }
+  }
+
+  // ===== INTERNAL =====
+
+  private _ensureNotDisposed(): void {
+    if (this._disposed) {
+      throw new Error('Branch has been disposed');
+    }
+  }
+}
diff --git a/src/BranchStore.ts b/src/BranchStore.ts
new file mode 100644
index 0000000..c4813b9
--- /dev/null
+++ b/src/BranchStore.ts
@@ -0,0 +1,155 @@
+import type { Branch } from './Branch';
+import type { SessionContext } from './types';
+
+/**
+ * High-throughput multi-branch decode operations
+ *
+ * The naive approach to N-branch generation is N sequential llama_decode()
+ * calls — each paying full GPU kernel launch overhead, memory barrier, and
+ * PCIe round-trip. BranchStore eliminates this by packing all branches into
+ * a single llama_batch and dispatching once: O(1) GPU round-trips regardless
+ * of branch count. The GPU parallelizes across sequences within the batch,
+ * so N branches approach the wall-time cost of 1.
+ *
+ * Two operations, two packing strategies:
+ *
+ * **commit()** — Generation step. Each branch contributes exactly 1 token.
+ * Packs N tokens into a single batch via `decode_each` (one row per sequence,
+ * all at their respective positions). Single `llama_decode()` call. Logits
+ * captured per-branch at batch index `i`. O(N) total work, O(1) GPU
+ * dispatches, O(1) amortized dispatch overhead per branch. Accept-first
+ * ordering with rollback: accepts each token into its branch's repeat-penalty
+ * window before decode, restores from clones if decode throws.
+ *
+ * **prefill()** — Bulk token injection. Each branch contributes a
+ * variable-length token array. Uses a two-pass bin-packing algorithm:
+ *
+ * - *Pass 1 (planning)*: Greedy first-fit packs items into chunks ≤ nBatch.
+ *   Items larger than nBatch get a dedicated chunk and fall through to
+ *   decode_many's internal auto-chunking (ceil(nTokens / nBatch) calls).
+ * - *Pass 2 (dispatch)*: Normal chunks dispatch via `decode_scatter` (one
+ *   `llama_decode` per chunk). Logits are indexed by flattened cursor
+ *   position: for item k in a chunk, logits live at `cursor + nTokens[k] - 1`.
+ *
+ * For T total tokens across N branches with batch capacity B:
+ * - Best case (T ≤ B): 1 GPU dispatch, all branches in one batch.
+ * - Otherwise: ceil(T / B) dispatches at best; items ≤ B are never split, so chunks may be under-full.
+ * - Amortized per-token GPU overhead: O(1/B) — vanishes as batch fills.
+ *
+ * Does NOT accept tokens into the sampler penalty window — use for
+ * external/replayed tokens where repeat-penalty tracking is unwanted.
+ * For model-generated tokens, use {@link commit} instead.
+ *
+ * Both methods take `[branch, token(s)]` tuples — the branch-to-token
+ * binding is structural, not positional. After either call, each branch's
+ * logits snapshot is updated with the output distribution from its decoded
+ * token(s), ready for the next `produce()`/`sample()` call.
+ *
+ * @example 32-branch generation step — one GPU dispatch
+ * ```typescript
+ * const store = new BranchStore(ctx);
+ * const entries = await Promise.all(branches.map(async b => [b, (await b.produce()).token] as [Branch, number]));
+ * await store.commit(entries);  // 32 tokens, 1 llama_decode()
+ * ```
+ *
+ * @example Best-of-N with batched commit
+ * ```typescript
+ * const store = new BranchStore(ctx);
+ * const branches = [];
+ * for (const _ of [1, 2, 3]) branches.push(await root.fork());
+ *
+ * for (let step = 0; step < 50; step++) {
+ *   const produced = await Promise.all(branches.map(async b => [b, await b.produce()] as const));
+ *   const live = produced.filter(([, p]) => !p.isStop);
+ *   if (!live.length) break;
+ *   await store.commit(live.map(([b, p]) => [b, p.token]));
+ * }
+ * ```
+ *
+ * @example Asymmetric prefill — variable-length injections, auto-chunked
+ * ```typescript
+ * await store.prefill([
+ *   [branchA, systemPromptTokens],   // 200 tokens
+ *   [branchB, shortQueryTokens],     //  12 tokens
+ *   [branchC, longDocumentTokens],   // 800 tokens
+ * ]);
+ * // Bin-packed into at least ceil(1012 / nBatch) GPU dispatches
+ * ```
+ *
+ * @category Branching
+ */
+export class BranchStore {
+  private _ctx: SessionContext;
+
+  constructor(ctx: SessionContext) {
+    this._ctx = ctx;
+  }
+
+  /**
+   * Commit one model-generated token per branch in a single batched step
+   *
+   * Every `[branch, token]` tuple pairs exactly one token with one branch.
+   * This wrapper only validates the branches and splits the tuples into
+   * parallel handle/token arrays for the native `_storeCommit`, which
+   * accepts each token into its branch's repeat-penalty window (for
+   * correct PPL measurement), decodes all N tokens in one llama_decode()
+   * call via decode_each and captures logits per branch. Accept comes
+   * first, with rollback: if decode throws, sampler/grammar/metrics are
+   * restored from clones taken before the accept.
+   *
+   * @param entries - Array of `[branch, token]` tuples (branches must not be disposed)
+   * @throws If any branch is disposed (checked before anything is dispatched)
+   */
+  async commit(entries: [Branch, number][]): Promise<void> {
+    const handles = entries.map(([branch]) => {
+      if (branch.disposed) throw new Error('BranchStore.commit: branch is disposed');
+      return branch.handle;
+    });
+    const tokens = entries.map(([, token]) => token);
+    await this._ctx._storeCommit(handles, tokens);
+  }
+
+  /**
+   * Inject a variable-length token run into each branch in one batched call
+   *
+   * Every `[branch, tokens]` tuple pairs one token array with one branch,
+   * and the arrays may differ in length — the native decode_scatter path
+   * handles variable-length runs and auto-chunks them to fit nBatch. This
+   * wrapper only validates the branches and forwards parallel
+   * handle/token-array lists to `_storePrefill`.
+   *
+   * Does NOT call accept_token — use for external/replayed tokens where
+   * repeat-penalty tracking is unwanted. For model-generated tokens,
+   * use {@link commit} instead.
+   *
+   * @param entries - Array of `[branch, tokens]` tuples (branches must not be disposed)
+   * @throws If any branch is disposed (checked before anything is dispatched)
+   */
+  async prefill(entries: [Branch, number[]][]): Promise<void> {
+    const handles = entries.map(([branch]) => {
+      if (branch.disposed) throw new Error('BranchStore.prefill: branch is disposed');
+      return branch.handle;
+    });
+    const tokenArrays = entries.map(([, tokens]) => tokens);
+    await this._ctx._storePrefill(handles, tokenArrays);
+  }
+
+  /**
+   * Keep only `winner` — evict every other lease and free its slot
+   *
+   * Nuclear operation: native `kv::seq_keep` on the winner's seq_id strips all other sequences
+   * from KV in one pass; loser slots are freed and the winner's topology reset (no parent, no children).
+   *
+   * @param winner - The branch to keep (must not be disposed, must hold a lease)
+   * @throws If winner is disposed or has no lease
+   */
+  async retainOnly(winner: Branch): Promise<void> {
+    if (winner.disposed) throw new Error('BranchStore.retainOnly: winner is disposed');
+    this._ctx._storeRetainOnly(winner.handle);
+  }
+
+  /** Number of seq_id leases currently available, as reported by the native store */
+  get available(): number {
+    return this._ctx._storeAvailable();
+  }
+}
diff --git a/src/Session.ts b/src/Session.ts
new file mode 100644
index 0000000..068b3c7
--- /dev/null
+++ b/src/Session.ts
@@ -0,0 +1,115 @@
+import type { Branch } from './Branch';
+import type { BranchStore } from './BranchStore';
+import type { SessionContext } from './types';
+
+/**
+ * Session - Trunk lifecycle + conversation delta helpers
+ *
+ * Owns the current "trunk" branch and provides promote() to crown a winner,
+ * plus delta helpers that centralize the sep + formatChat + tokenize + prefill
+ * pattern for injecting new turns into an ongoing conversation.
+ *
+ * Session does NOT own the SessionContext or BranchStore — the consumer
+ * creates those and passes them in. dispose() prunes trunk only.
+ *
+ * @example
+ * ```typescript
+ * const session = new Session({ ctx, store });
+ * session.trunk = initialBranch;
+ *
+ * // After verification, promote the best attempt
+ * await session.promote(bestAttempt.branch);
+ *
+ * // Inject a user turn and generate
+ * await session.prefillUser('What about X?');
+ * for await (const { text } of session.trunk) {
+ *   process.stdout.write(text);
+ * }
+ *
+ * // Cleanup
+ * await session.dispose();
+ * ctx.dispose();
+ * ```
+ *
+ * @category Branching
+ */
+export class Session {
+  private _ctx: SessionContext;
+  private _store: BranchStore;
+  private _trunk: Branch | null = null;
+
+  constructor({ ctx, store }: { ctx: SessionContext; store: BranchStore }) {
+    this._ctx = ctx;
+    this._store = store;
+  }
+
+  /** Current trunk branch (null until one is assigned, and again after dispose) */
+  get trunk(): Branch | null {
+    return this._trunk;
+  }
+
+  /** Assign initial trunk (no promote) */
+  set trunk(branch: Branch | null) {
+    this._trunk = branch;
+  }
+
+  /**
+   * Promote a winner to trunk — store.retainOnly(winner), then reassign.
+   * Safe even if winner is the only branch (resets topology, no-op on KV).
+   */
+  async promote(winner: Branch): Promise<void> {
+    await this._store.retainOnly(winner);
+    this._trunk = winner;
+  }
+
+  /** Prune trunk (if not already disposed) and clear it — consumer owns ctx and other resources */
+  async dispose(): Promise<void> {
+    if (this._trunk && !this._trunk.disposed) await this._trunk.prune();
+    this._trunk = null;
+  }
+
+  /**
+   * Prefill a user turn into trunk
+   * Centralizes: sep + formatChat([system:'', user:content]) + tokenize(false) + prefill
+   *
+   * @param content - User message content
+   * @param opts - Optional tools JSON string
+   * @throws Error if no trunk is assigned when the prefill is issued
+   */
+  async prefillUser(content: string, opts: { tools?: string } = {}): Promise<void> {
+    const sep = this._ctx.getTurnSeparator();
+    const fmtOpts = opts.tools ? { tools: opts.tools } : {};
+    const { prompt } = await this._ctx.formatChat(
+      JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]),
+      fmtOpts
+    );
+    const delta = await this._ctx.tokenize(prompt, false);
+    await this._requireTrunk('prefillUser').prefill([...sep, ...delta]);
+  }
+
+  /**
+   * Prefill a tool result turn into trunk
+   * Centralizes: sep + formatChat([system:'', tool:result]) + tokenize(false) + prefill
+   *
+   * @param resultStr - JSON-stringified tool result
+   * @param callId - Tool call ID
+   * @throws Error if no trunk is assigned when the prefill is issued
+   */
+  async prefillToolResult(resultStr: string, callId: string): Promise<void> {
+    const sep = this._ctx.getTurnSeparator();
+    const { prompt } = await this._ctx.formatChat(
+      JSON.stringify([
+        { role: 'system', content: '' },
+        { role: 'tool', content: resultStr, tool_call_id: callId },
+      ])
+    );
+    const delta = await this._ctx.tokenize(prompt, false);
+    await this._requireTrunk('prefillToolResult').prefill([...sep, ...delta]);
+  }
+
+  /** Return the trunk, or throw a descriptive Error naming the calling method */
+  private _requireTrunk(caller: string): Branch {
+    if (!this._trunk) throw new Error(`Session.${caller}: trunk is not set`);
+    return this._trunk;
+  }
+}
diff --git a/src/index.ts b/src/index.ts
new file mode 100644
index 0000000..100b5fb
--- /dev/null
+++ b/src/index.ts
@@ -0,0 +1,279 @@
+/**
+ * liblloyal-node - Thin N-API wrapper over liblloyal
+ *
+ * Exposes raw llama.cpp inference primitives for Node.js.
+ *
+ * @example
+ * ```js
+ * const { createContext, Branch } = require('@lloyal-labs/lloyal.node');
+ *
+ * const ctx = await createContext({
+ *   modelPath: './model.gguf',
+ *   nCtx: 2048,
+ *   nThreads: 4
+ * });
+ *
+ * // Tokenize
+ * const tokens = await ctx.tokenize("Hello world");
+ *
+ * // Generate via Branch API
+ * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
+ * await branch.prefill(tokens);
+ * for await (const { text } of branch) {
+ *   process.stdout.write(text);
+ * }
+ * await branch.prune();
+ *
+ * // Cleanup
+ * ctx.dispose();
+ * ```
+ */
+
+import type {
+  ContextOptions,
+  GpuVariant,
+  LoadOptions,
+  NativeBinding,
+  SessionContext,
+} from './types';
+
+import { Branch } from './Branch';
+import { BranchStore } from './BranchStore';
+import { Session } from './Session';
+import { forkAgent, runAgents } from './Agent';
+
+/**
+ * Platform package naming: @lloyal-labs/lloyal.node-{platform}-{arch}[-{gpu}]
+ *
+ * The `-{gpu}` suffix is omitted for a missing variant and for 'default', 'cpu' and 'metal'.
+ */
+const getPlatformPackageName = (variant?: string): string => {
+  const base = `@lloyal-labs/lloyal.node-${process.platform}-${process.arch}`;
+  const unsuffixed = !variant || ['default', 'cpu', 'metal'].includes(variant);
+  return unsuffixed ? base : `${base}-${variant}`;
+};
+
+/**
+ * Try to load a platform package, return null on failure (a throwing `require`,
+ * or a module without a `createContext` function). When `verbose`, the reason is
+ * reported via console.warn; non-Error throwables are stringified for the message.
+ */
+const tryLoadPackage = (packageName: string, verbose = false): NativeBinding | null => {
+  try {
+    // eslint-disable-next-line @typescript-eslint/no-var-requires
+    const mod = require(packageName) as NativeBinding;
+    if (mod && typeof mod.createContext === 'function') return mod;
+    if (verbose) {
+      console.warn(`[lloyal.node] ${packageName} loaded but missing createContext export`);
+    }
+  } catch (e) {
+    if (verbose) {
+      const reason = e instanceof Error ? e.message : String(e);
+      console.warn(`[lloyal.node] Failed to load ${packageName}: ${reason}`);
+    }
+  }
+  return null;
+};
+
+/**
+ * Load native binary for a specific GPU variant
+ *
+ * lloyal.node ships as a family of platform-specific npm packages, each
+ * containing a prebuilt native addon:
+ * `@lloyal-labs/lloyal.node-{platform}-{arch}[-{gpu}]`
+ * (e.g., `darwin-arm64`, `linux-x64-cuda`, `win32-x64-vulkan`).
+ *
+ * `loadBinary()` resolves the correct package at runtime with a prioritized
+ * fallback chain:
+ *
+ * 1. Requested GPU variant package (if `variant` or `LLOYAL_GPU` env var set)
+ * 2. Local development build (`build/Release/lloyal.node`)
+ * 3. Default CPU platform package
+ *
+ * Most callers should use {@link createContext} directly — it calls
+ * `loadBinary()` internally. Use this function when you need to:
+ * - Pre-check whether a GPU variant is available before creating contexts
+ * - Share one loaded binary across multiple context creations
+ * - Inspect or test the binary loading logic in isolation
+ *
+ * **Environment variables:**
+ * - `LLOYAL_LOCAL=1` — Force local build only; throws if not found
+ *   (use during development to test local C++ changes)
+ * - `LLOYAL_GPU=cuda|vulkan` — Request GPU variant (equivalent to `variant` param)
+ * - `LLOYAL_NO_FALLBACK=1` — Disable silent CPU fallback; throws if GPU
+ *   variant fails (use in CI to catch missing runtime libraries)
+ *
+ * @param variant GPU variant: 'cuda', 'vulkan', or undefined for CPU
+ * @returns Native binary module with createContext method
+ * @throws Error if no binary available for the current platform
+ *
+ * @example
+ * ```typescript
+ * // Load default (CPU) binary
+ * const binary = loadBinary();
+ *
+ * // Load CUDA binary (falls back to CPU if unavailable)
+ * const binary = loadBinary('cuda');
+ *
+ * // Create context from loaded binary
+ * const ctx = await binary.createContext({ modelPath: './model.gguf' });
+ * ```
+ *
+ * @category Core
+ */
+export const loadBinary = (variant?: GpuVariant): NativeBinding => {
+  const resolvedVariant = variant ?? process.env.LLOYAL_GPU;
+  const noFallback = process.env.LLOYAL_NO_FALLBACK === '1';
+  const useLocal = process.env.LLOYAL_LOCAL === '1';
+  const tried: string[] = []; // locations probed so far, reported if everything fails
+
+  // 0. Use local build if explicitly requested (no fallback)
+  if (useLocal) {
+    try {
+      return require('../build/Release/lloyal.node') as NativeBinding;
+    } catch {
+      throw new Error(
+        '[lloyal.node] LLOYAL_LOCAL=1 but local build not found. ' +
+        'Run `npm run build` first.'
+      );
+    }
+  }
+
+  // 1. Try requested variant (if specified)
+  if (resolvedVariant && resolvedVariant !== 'default') {
+    const pkgName = getPlatformPackageName(resolvedVariant);
+    tried.push(pkgName);
+    const binary = tryLoadPackage(pkgName, true);
+    if (binary) return binary;
+    if (noFallback) {
+      throw new Error(
+        `[lloyal.node] GPU variant "${resolvedVariant}" failed to load. ` +
+        `Package: ${pkgName}. Check that runtime libraries are available.`
+      );
+    }
+    console.warn(`[lloyal.node] GPU variant "${resolvedVariant}" unavailable, falling back to CPU`);
+  }
+
+  // 2. Try local build (always fresher than installed packages during development)
+  tried.push('local build (build/Release/lloyal.node)');
+  try {
+    return require('../build/Release/lloyal.node') as NativeBinding;
+  } catch {
+    // ignore — no local build
+  }
+
+  // 3. Try default platform package (CPU); 'cpu'/'metal' variants already probed this name
+  const defaultPkg = getPlatformPackageName();
+  if (!tried.includes(defaultPkg)) tried.push(defaultPkg);
+  const binary = tryLoadPackage(defaultPkg, true);
+  if (binary) return binary;
+  const where = `${process.platform}-${process.arch}`;
+  throw new Error(`No lloyal.node binary found for ${where}. Tried: ${tried.join(', ')}`);
+};
+
+// Default binary: loaded on the first getBinary() call, then reused from this cache.
+// The variant for that first load comes from the LLOYAL_GPU environment variable.
+let _binary: NativeBinding | null = null;
+const getBinary = (): NativeBinding => {
+  if (_binary) return _binary;
+  _binary = loadBinary(process.env.LLOYAL_GPU as GpuVariant | undefined);
+  return _binary;
+};
+
+/**
+ * Create a new inference context
+ *
+ * Entry point for all inference. Resolves the correct native binary (see
+ * {@link loadBinary} for the platform/GPU fallback chain), loads the model
+ * via a reference-counted registry (multiple contexts can share one model's
+ * weight tensors in memory), and allocates a `llama_context` with its own
+ * KV cache and compute scratch buffers.
+ *
+ * **What gets allocated:**
+ * - KV cache: `nCtx * 2 * nLayers * dHead` bytes per KV type (fp16 default).
+ *   For a 7B model with `nCtx: 4096`, expect ~1-2 GB of KV memory.
+ * - Compute scratch: temporary buffers for the forward pass, sized to `nBatch`.
+ *
+ * **Model sharing:** If two contexts use the same `modelPath`, the model
+ * weights are loaded once and shared. Only the KV cache and compute buffers
+ * are per-context. This makes multi-context setups (e.g., one context per
+ * conversation) memory-efficient.
+ *
+ * @param options Context creation options
+ * @param loadOptions Optional binary loading options (GPU variant selection)
+ * @returns Promise resolving to SessionContext instance
+ *
+ * @example Basic usage
+ * ```typescript
+ * const ctx = await createContext({
+ *   modelPath: './model.gguf',
+ *   nCtx: 2048,
+ *   nThreads: 4
+ * });
+ *
+ * try {
+ *   const tokens = await ctx.tokenize("Hello");
+ *   const branch = Branch.create(ctx, 0, { temperature: 0.7 });
+ *   await branch.prefill(tokens);
+ *   for await (const { text } of branch) process.stdout.write(text);
+ * } finally {
+ *   ctx.dispose();
+ * }
+ * ```
+ *
+ * @example Multi-branch context (tree search, best-of-N)
+ * ```typescript
+ * const ctx = await createContext({
+ *   modelPath: './model.gguf',
+ *   nCtx: 8192,
+ *   nBatch: 512,     // Bin-packing capacity for BranchStore.prefill
+ *   nSeqMax: 33,     // 32 branches + 1 root sequence
+ * });
+ * ```
+ *
+ * @example With GPU variant selection
+ * ```typescript
+ * const ctx = await createContext(
+ *   { modelPath: './model.gguf', nCtx: 4096 },
+ *   { gpuVariant: 'cuda' }
+ * );
+ * ```
+ *
+ * @category Core
+ */
+export const createContext = async (
+  options: ContextOptions,
+  loadOptions?: LoadOptions
+): Promise<SessionContext> => {
+  // An explicit gpuVariant wins over LLOYAL_GPU; with neither set, reuse the cached default binary.
+  const requested = loadOptions?.gpuVariant || process.env.LLOYAL_GPU;
+  return (requested ? loadBinary(requested as GpuVariant) : getBinary()).createContext(options);
+};
+
+export { Branch, BranchStore, Session, forkAgent, runAgents };
+export { PoolingType, ChatFormat, ReasoningFormat, GrammarTriggerType } from './types';
+export type {
+  GpuVariant,
+  KvCacheType,
+  LoadOptions,
+  ContextOptions,
+  FormatChatOptions,
+  GrammarTrigger,
+  FormattedChatResult,
+  ParseChatOutputOptions,
+  ParsedToolCall,
+  ParseChatOutputResult,
+  PenaltyParams,
+  MirostatParams,
+  DryParams,
+  XtcParams,
+  AdvancedSamplingParams,
+  SamplingParams,
+  SessionContext,
+  Produced,
+  AgentTask,
+  AgentState,
+  RunAgentsOptions,
+  RunAgentsResult,
+  NativeBinding,
+} from './types';
diff --git a/lib/index.d.ts b/src/types.ts
similarity index 60%
rename from lib/index.d.ts
rename to src/types.ts
index b10d48e..b9a0c7e 100644
--- a/lib/index.d.ts
+++ b/src/types.ts
@@ -16,6 +16,9 @@
  * Parallel and tree-structured generation with batched GPU dispatch.
  */
 
+import type { Branch } from './Branch';
+import type { BranchStore } from './BranchStore';
+
 /**
  * GPU variant for binary loading
  *
@@ -90,6 +93,71 @@ export enum PoolingType {
   RANK = 4,
 }
 
+/**
+ * Chat format detected by the template engine
+ *
+ * Identifies how the model formats tool calls, reasoning blocks, and content.
+ * Returned by {@link SessionContext.formatChat | formatChat()} in
+ * {@link FormattedChatResult.format} and consumed by
+ * {@link SessionContext.parseChatOutput | parseChatOutput()}.
+ *
+ * You generally don't need to inspect these values directly --
+ * just pass them through from the formatChat result to parseChatOutput.
+ *
+ * Only commonly-used values are listed, so a returned format may have no named
+ * member here. The full set matches llama.cpp's `common_chat_format` enum (30+ formats).
+ *
+ * @category Chat
+ */
+export enum ChatFormat {
+  /** Plain content, no special formatting */
+  CONTENT_ONLY = 0,
+  /** Generic tool call format */
+  GENERIC = 1,
+}
+
+/**
+ * Reasoning/thinking block format
+ *
+ * Selects how `<think>` blocks are handled, both when formatting a chat and when parsing output.
+ *
+ * @see {@link FormatChatOptions.reasoningFormat} for input-side usage
+ * @see {@link ParseChatOutputOptions.reasoningFormat} for output-side usage
+ *
+ * @category Chat
+ */
+export enum ReasoningFormat {
+  /** No reasoning extraction (default) */
+  NONE = 0,
+  /** Auto-detect reasoning format from model template */
+  AUTO = 1,
+  /** DeepSeek legacy format (`<think>...</think>` in content) */
+  DEEPSEEK_LEGACY = 2,
+  /** DeepSeek format (structured reasoning extraction) */
+  DEEPSEEK = 3,
+}
+
+/**
+ * Grammar trigger type
+ *
+ * Selects the kind of match that activates a lazy grammar during generation.
+ *
+ * @see {@link GrammarTrigger}
+ * @see {@link FormattedChatResult.grammarTriggers}
+ *
+ * @category Chat
+ */
+export enum GrammarTriggerType {
+  /** Trigger on a specific token ID */
+  TOKEN = 0,
+  /** Trigger on a word boundary match */
+  WORD = 1,
+  /** Trigger on a regex pattern match */
+  PATTERN = 2,
+  /** Trigger on a full-string regex pattern match (contrast with PATTERN) */
+  PATTERN_FULL = 3,
+}
+
 /**
  * Configuration for context creation
  *
@@ -177,71 +245,6 @@ export interface ContextOptions {
   typeV?: KvCacheType;
 }
 
-/**
- * Chat format detected by the template engine
- *
- * Identifies how the model formats tool calls, reasoning blocks, and content.
- * Returned by {@link SessionContext.formatChat | formatChat()} in
- * {@link FormattedChatResult.format} and consumed by
- * {@link SessionContext.parseChatOutput | parseChatOutput()}.
- *
- * You generally don't need to inspect these values directly --
- * just pass them through from the formatChat result to parseChatOutput.
- *
- * Only commonly-used values are listed. The full set matches llama.cpp's
- * `common_chat_format` enum (30+ formats).
- *
- * @category Chat
- */
-export enum ChatFormat {
-  /** Plain content, no special formatting */
-  CONTENT_ONLY = 0,
-  /** Generic tool call format */
-  GENERIC = 1,
-}
-
-/**
- * Reasoning/thinking block format
- *
- * Controls how `<think>` blocks are handled during formatting and parsing.
- *
- * @see {@link FormatChatOptions.reasoningFormat} for input-side usage
- * @see {@link ParseChatOutputOptions.reasoningFormat} for output-side usage
- *
- * @category Chat
- */
-export enum ReasoningFormat {
-  /** No reasoning extraction (default) */
-  NONE = 0,
-  /** Auto-detect reasoning format from model template */
-  AUTO = 1,
-  /** DeepSeek legacy format (`<think>...</think>` in content) */
-  DEEPSEEK_LEGACY = 2,
-  /** DeepSeek format (structured reasoning extraction) */
-  DEEPSEEK = 3,
-}
-
-/**
- * Grammar trigger type
- *
- * Determines how lazy grammar activation is triggered during generation.
- *
- * @see {@link GrammarTrigger}
- * @see {@link FormattedChatResult.grammarTriggers}
- *
- * @category Chat
- */
-export enum GrammarTriggerType {
-  /** Trigger on a specific token ID */
-  TOKEN = 0,
-  /** Trigger on a word boundary match */
-  WORD = 1,
-  /** Trigger on a regex pattern match */
-  PATTERN = 2,
-  /** Trigger on a full-string regex pattern match */
-  PATTERN_FULL = 3,
-}
-
 /**
  * Options for chat template formatting
  *
@@ -824,7 +827,7 @@ export interface SessionContext {
    * - Forgetting specific messages
    * - Preparing for injection of new context
    *
-   * ⚠️ CRITICAL: Call BEFORE next decode(), not after!
+   * CRITICAL: Call BEFORE next decode(), not after!
    * The model needs to know about the removal before processing new tokens.
    *
    * Cost: ~1-5ms depending on range
@@ -884,7 +887,6 @@ export interface SessionContext {
    * Use when starting a completely new conversation.
    *
    * Cost: ~1ms
-   *
    */
   kvCacheClear(): Promise<void>;
 
@@ -902,8 +904,8 @@ export interface SessionContext {
    * **Why not naive eviction?** Selective eviction (`kvCacheRemove`) preserves
    * original position IDs, which grow without bound. Across 5 architectures,
    * naive eviction produces PPL spanning 3 orders of magnitude — ranging from
-   * 1.15× baseline (Llama, lucky config) to 198× (Phi, sinks present).
-   * Under Blink KV reconstruction, all 5 converge to 3–16% of baseline.
+   * 1.15x baseline (Llama, lucky config) to 198x (Phi, sinks present).
+   * Under Blink KV reconstruction, all 5 converge to 3-16% of baseline.
    *
    * **Sinks are optional.** Under reconstruction, the 0+N (sinkless) config
    * matches 4+N (with sinks) within <2% across all tested architectures.
@@ -921,7 +923,7 @@ export interface SessionContext {
    * @param sinks First N tokens from conversation start (typically 4, or empty).
    *   Must be the same tokens every reseed — reusing different tokens degrades
    *   any attention-sink patterns the model may have learned for early positions.
-   * @param tail Recent M tokens to preserve (typically 252–1020)
+   * @param tail Recent M tokens to preserve (typically 252-1020)
    * @returns Promise that resolves when reconstruction completes.
    *   Next decode continues at position `sinks.length + tail.length`.
    *
@@ -960,7 +962,7 @@ export interface SessionContext {
    * physical KV entries for the shared prefix; only tokens decoded after
    * the fork point allocate new storage. This is what makes tree-structured
    * generation (best-of-N, beam search, speculative decoding) memory-efficient:
-   * N branches sharing a 1000-token prefix cost ~1000 KV entries, not N×1000.
+   * N branches sharing a 1000-token prefix cost ~1000 KV entries, not N*1000.
    *
    * The higher-level {@link Branch.fork} wraps this and additionally clones
    * the sampler chain, grammar state, logits snapshot, and perplexity tracker.
@@ -1050,7 +1052,7 @@ export interface SessionContext {
   /**
    * Format messages using model's chat template
    *
-   * Converts [{role, content}] → formatted prompt string with full format awareness.
+   * Converts [{role, content}] -> formatted prompt string with full format awareness.
    * Uses model's built-in template (ChatML, Llama, Mistral, etc.).
    *
    * The returned `format` and `reasoningFormat` fields should be passed to
@@ -1333,209 +1335,93 @@ export interface SessionContext {
 
   // ===== BRANCH API (internal, wrapped by Branch class) =====
 
-  /** @internal Create a new branch for parallel generation */
-  _branchCreate(position: number, params?: SamplingParams, nBatch?: number): number;
+  /** @internal */
+  _branchCreate(position: number, params?: SamplingParams, nBatch?: number, grammar?: string): number;
 
-  /** @internal Fork a branch to a new sequence */
+  /** @internal */
   _branchFork(handle: number): number;
 
-  /** @internal Decode multiple tokens in n_batch-sized chunks and capture logits */
+  /** @internal */
   _branchPrefill(handle: number, tokens: number[]): Promise<void>;
 
-  /** @internal Sample next token from branch's logits snapshot */
+  /** @internal */
   _branchSample(handle: number): number;
 
-  /** @internal Accept token (update sampler state for penalties) */
+  /** @internal */
   _branchAccept(handle: number, token: number): void;
 
-  /** @internal Get branch's current position */
+  /** @internal */
   _branchGetPosition(handle: number): number;
 
-  /** @internal Get branch's perplexity */
+  /** @internal */
   _branchGetPerplexity(handle: number): number;
 
-  /** @internal Get copy of branch's logits snapshot */
+  /** @internal */
   _branchGetLogits(handle: number): Float32Array;
 
-  /** @internal Prune branch (remove KV cache entries and free handle) — RESTRICT: throws if children */
+  /** @internal */
   _branchPrune(handle: number): void;
 
-  /** @internal Prune branch and all descendants — CASCADE */
+  /** @internal */
   _branchPruneSubtree(handle: number): void;
 
-  /** @internal Get parent branch handle (0 = INVALID_HANDLE if root) */
+  /** @internal */
   _branchParent(handle: number): number;
 
-  /** @internal Get child branch handles */
+  /** @internal */
   _branchChildren(handle: number): number[];
 
-  /** @internal Check if branch has no children */
+  /** @internal */
   _branchIsLeaf(handle: number): boolean;
 
-  /** @internal Check if branch holds a KV lease */
+  /** @internal */
   _branchIsActive(handle: number): boolean;
 
-  /** @internal Reseed branch sampler PRNG for diversity after fork */
+  /** @internal */
   _branchSamplerChainReseed(handle: number, seed: number): void;
 
-  /** @internal Set dynamic logit biases for a branch */
+  /** @internal */
   _branchSteer(handle: number, biases: Array<{ token: number; bias: number }>): void;
 
-  /** @internal Clear all dynamic logit biases from a branch */
+  /** @internal */
   _branchClearSteer(handle: number): void;
 
-  /** @internal Replace sampler chain with new parameters (memoized) */
+  /** @internal */
   _branchSetSamplerParams(handle: number, params: SamplingParams): void;
 
-  /** @internal Replace or remove grammar constraint */
+  /** @internal */
   _branchSetGrammar(handle: number, grammarStr: string): void;
 
-  /** @internal Compute entropy from branch's logits snapshot */
+  /** @internal */
   _branchModelEntropy(handle: number, base?: string): number;
 
-  /** @internal Compute surprisal from branch's logits snapshot */
+  /** @internal */
   _branchModelSurprisal(handle: number, token: number, base?: string): number;
 
-  /** @internal Get sampling-level perplexity */
+  /** @internal */
   _branchGetSamplingPerplexity(handle: number): number;
 
-  /** @internal Set static logit biases on a branch */
+  /** @internal */
   _branchSetLogitBias(handle: number, biases: Array<{ token: number; bias: number }>): void;
 
-  /** @internal Clear all static logit biases from a branch */
+  /** @internal */
   _branchClearLogitBias(handle: number): void;
 
   // ===== STORE API (internal, wrapped by BranchStore) =====
 
-  /** @internal Batched accept + decode_each + capture for N branches */
+  /** @internal */
   _storeCommit(handles: number[], tokens: number[]): Promise<void>;
 
-  /** @internal Batched decode_scatter + capture for N branches with variable token counts */
+  /** @internal */
   _storePrefill(handles: number[], tokenArrays: number[][]): Promise<void>;
 
-  /** @internal Retain winner branch, evict all others */
+  /** @internal */
   _storeRetainOnly(handle: number): void;
 
-  /** @internal Get number of available seq_id leases */
+  /** @internal */
   _storeAvailable(): number;
 }
 
-/**
- * Create a new inference context
- *
- * Entry point for all inference. Resolves the correct native binary (see
- * {@link loadBinary} for the platform/GPU fallback chain), loads the model
- * via a reference-counted registry (multiple contexts can share one model's
- * weight tensors in memory), and allocates a `llama_context` with its own
- * KV cache and compute scratch buffers.
- *
- * **What gets allocated:**
- * - KV cache: `nCtx * 2 * nLayers * dHead` bytes per KV type (fp16 default).
- *   For a 7B model with `nCtx: 4096`, expect ~1-2 GB of KV memory.
- * - Compute scratch: temporary buffers for the forward pass, sized to `nBatch`.
- *
- * **Model sharing:** If two contexts use the same `modelPath`, the model
- * weights are loaded once and shared. Only the KV cache and compute buffers
- * are per-context. This makes multi-context setups (e.g., one context per
- * conversation) memory-efficient.
- *
- * @param options Context creation options
- * @param loadOptions Optional binary loading options (GPU variant selection)
- * @returns Promise resolving to SessionContext instance
- *
- * @example Basic usage
- * ```typescript
- * const ctx = await createContext({
- *   modelPath: './model.gguf',
- *   nCtx: 2048,
- *   nThreads: 4
- * });
- *
- * try {
- *   const tokens = await ctx.tokenize("Hello");
- *   const branch = Branch.create(ctx, 0, { temperature: 0.7 });
- *   await branch.prefill(tokens);
- *   for await (const { text } of branch) process.stdout.write(text);
- * } finally {
- *   ctx.dispose();
- * }
- * ```
- *
- * @example Multi-branch context (tree search, best-of-N)
- * ```typescript
- * const ctx = await createContext({
- *   modelPath: './model.gguf',
- *   nCtx: 8192,
- *   nBatch: 512,     // Bin-packing capacity for BranchStore.prefill
- *   nSeqMax: 33,     // 32 branches + 1 root sequence
- * });
- * ```
- *
- * @example With GPU variant selection
- * ```typescript
- * const ctx = await createContext(
- *   { modelPath: './model.gguf', nCtx: 4096 },
- *   { gpuVariant: 'cuda' }
- * );
- * ```
- *
- * @category Core
- */
-export function createContext(
-  options: ContextOptions,
-  loadOptions?: LoadOptions
-): Promise<SessionContext>;
-
-/**
- * Load native binary for a specific GPU variant
- *
- * lloyal.node ships as a family of platform-specific npm packages, each
- * containing a prebuilt native addon:
- * `@lloyal-labs/lloyal.node-{platform}-{arch}[-{gpu}]`
- * (e.g., `darwin-arm64`, `linux-x64-cuda`, `win32-x64-vulkan`).
- *
- * `loadBinary()` resolves the correct package at runtime with a prioritized
- * fallback chain:
- *
- * 1. Requested GPU variant package (if `variant` or `LLOYAL_GPU` env var set)
- * 2. Local development build (`build/Release/lloyal.node`)
- * 3. Default CPU platform package
- *
- * Most callers should use {@link createContext} directly — it calls
- * `loadBinary()` internally. Use this function when you need to:
- * - Pre-check whether a GPU variant is available before creating contexts
- * - Share one loaded binary across multiple context creations
- * - Inspect or test the binary loading logic in isolation
- *
- * **Environment variables:**
- * - `LLOYAL_LOCAL=1` — Force local build only; throws if not found
- *   (use during development to test local C++ changes)
- * - `LLOYAL_GPU=cuda|vulkan` — Request GPU variant (equivalent to `variant` param)
- * - `LLOYAL_NO_FALLBACK=1` — Disable silent CPU fallback; throws if GPU
- *   variant fails (use in CI to catch missing runtime libraries)
- *
- * @param variant GPU variant: 'cuda', 'vulkan', or undefined for CPU
- * @returns Native binary module with createContext method
- * @throws Error if no binary available for the current platform
- *
- * @example
- * ```typescript
- * // Load default (CPU) binary
- * const binary = loadBinary();
- *
- * // Load CUDA binary (falls back to CPU if unavailable)
- * const binary = loadBinary('cuda');
- *
- * // Create context from loaded binary
- * const ctx = await binary.createContext({ modelPath: './model.gguf' });
- * ```
- *
- * @category Core
- */
-export function loadBinary(variant?: GpuVariant): {
-  createContext(options: ContextOptions): Promise<SessionContext>;
-};
-
 /**
  * Result from Branch.produce()
  *
@@ -1550,560 +1436,6 @@ export interface Produced {
   isStop: boolean;
 }
 
-/**
- * Forkable inference handle for covalent generation
- *
- * A Branch owns everything needed for independent generation: a KV cache
- * sequence, sampler chain, logits snapshot, and perplexity tracker.
- *
- * Forking is cheap — the KV prefix is shared in memory (metadata-only operation under unified KV —
- * no KV tensor buffers are copied), so sibling branches read from the same physical KV entries.
- * Only tokens decoded after the fork point are exclusive to each branch.
- *
- * Branches form trees, not just flat lists. Fork from root for best-of-N,
- * fork from children for tree search/beam search, fork from a draft for speculative
- * decoding.
- *
- * The produce/commit protocol separates sampling from state advancement:
- * produce() samples without writing to KV, letting you inspect the result
- * before deciding to commit().
- *
- * @example Best-of-N with perplexity selection
- * ```typescript
- * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 });
- * await root.prefill(tokens);
- *
- * const results = [];
- * for (let i = 0; i < 5; i++) {
- *   const branch = await root.fork();
- *   branch.reseedSampler(1000 + i);
- *   const tokens = [];
- *   for await (const { token } of branch) tokens.push(token);
- *   results.push({ branch, tokens, ppl: branch.perplexity });
- * }
- *
- * const best = results.reduce((a, b) => a.ppl < b.ppl ? a : b);
- * for (const r of results) { if (r !== best) await r.branch.prune(); }
- * ```
- *
- * @category Branching
- */
-export class Branch {
-  /**
-   * Create a root branch at the given position
-   *
-   * The branch takes ownership of the sequence and creates its own sampler
-   * chain from the provided params. Call prefill() to decode prompt tokens
-   * and capture the logit distribution before forking.
-   *
-   * @param ctx SessionContext to create branch on
-   * @param position Starting position (typically prompt token count)
-   * @param params Sampling parameters (temperature, topP, etc.)
-   * @param nBatch Per-branch batch size override (defaults to context nBatch)
-   * @param grammar GBNF grammar string for constrained generation. When provided,
-   *   sample() returns only grammar-valid tokens. The grammar state is cloned on
-   *   fork(), so sibling branches can diverge independently.
-   */
-  static create(
-    ctx: SessionContext,
-    position: number,
-    params?: SamplingParams,
-    nBatch?: number,
-    grammar?: string
-  ): Branch;
-
-  /**
-   * Fork this branch to a new sequence
-   *
-   * The child shares the parent's KV prefix in memory (metadata-only under unified KV, no KV buffer copy).
-   * Logits, sampler state, and perplexity tracker are cloned so the child
-   * can diverge independently. Fork from any branch — root or intermediate —
-   * to build arbitrarily deep trees.
-   *
-   */
-  fork(): Promise<Branch>;
-
-  /**
-   * Get a copy of this branch's captured logits snapshot.
-   *
-   * Returns n_vocab floats — the raw logit distribution from the last
-   * prefill() or commit() call.
-   *
-   * Returns an independent copy of the branch's internal snapshot.
-   * The returned Float32Array is safe to hold across async boundaries
-   * and is not affected by subsequent decode operations.
-   *
-   * @returns Independent copy of the logits snapshot (n_vocab elements)
-   * @throws If no logits have been captured yet
-   */
-  getLogits(): Float32Array;
-
-  /**
-   * Bulk-decode tokens into the branch's KV cache and capture logits.
-   *
-   * `tokens.length` is the total count to process; the branch's `nBatch`
-   * (set at `Branch.create`) controls how many are sent per `llama_decode`
-   * call. E.g. 500 tokens with `nBatch=64` → 8 calls (7×64 + 1×52).
-   *
-   * Advances `position` by `tokens.length`. Stores final logits into the
-   * branch's internal snapshot — the next `produce()`/`sample()` reads
-   * from it.
-   *
-   * Does NOT accept tokens into the repeat-penalty window — for external
-   * tokens (user input between turns), not model-generated tokens.
-   * For model output, use `commit()` which does accept + decode.
-   *
-   * The primary way to feed tokens into a branch's KV cache.
-   *
-   * @param tokens - Token IDs to decode
-   */
-  prefill(tokens: number[]): Promise<void>;
-
-  /** Sample next token from branch's frozen logits snapshot */
-  sample(): number;
-
-  /** Accept token for repeat-penalty tracking */
-  accept(token: number): void;
-
-  /**
-   * Discard this branch — remove its divergent KV entries and free the handle
-   *
-   * Only removes KV entries divergent from the shared prefix; sibling branches
-   * are unaffected. The disposed flag is set synchronously — any call to
-   * produce(), commit(), etc. after prune() will throw immediately, even
-   * before the returned promise resolves.
-   *
-   * RESTRICT mode: throws if children exist. Use {@link pruneSubtree} to
-   * cascade-delete an entire subtree.
-   */
-  prune(): Promise<void>;
-
-  /**
-   * Discard this branch and all its descendants — CASCADE delete
-   *
-   * Iterative post-order traversal: prunes children first, then this branch.
-   * Use when tearing down an entire subtree (e.g. abandoned search path).
-   * Sets disposed synchronously, like {@link prune}.
-   */
-  pruneSubtree(): Promise<void>;
-
-  /**
-   * Reseed the sampler's PRNG for diversity after fork()
-   *
-   * CRITICAL for parallel generation: Without reseeding, all forked branches
-   * produce identical outputs because they share the same PRNG state.
-   *
-   * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged.
-   *
-   * @param seed - New seed for the PRNG
-   */
-  reseedSampler(seed: number): void;
-
-  /**
-   * Apply dynamic logit adjustments for this branch only
-   *
-   * Unlike `logit_bias` in sampling params (which is cloned on fork), steer biases
-   * are NOT inherited by child branches. Each branch manages its own steer state
-   * independently. This makes steer ideal for path-dependent constraints.
-   *
-   * **Use cases:**
-   * - **tsampler**: Block tokens that would create repeated N-grams based on
-   *   this branch's specific generation history
-   * - **Diverse beam search**: Penalize tokens already chosen by sibling beams
-   *   to encourage output diversity across the beam
-   * - **Dynamic constraints**: Apply token restrictions that change per-step
-   *
-   * **Sampling order:** Grammar → Logit Bias → Steer → Sampler Chain
-   *
-   * @param biases - Array of token adjustments. Use `-Infinity` to completely
-   *   block a token, positive values to boost probability, negative to reduce.
-   *
-   * @example Block tokens for N-gram deduplication (tsampler pattern)
-   * ```ts
-   * // Compute which tokens would create repeated 4-grams
-   * const blocked = computeNgramBlocks(generatedTokens, n=4);
-   *
-   * // Block those tokens for this sample only
-   * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
-   *
-   * const { token } = await branch.produce();  // Blocked tokens won't be sampled
-   * await branch.commit(token);
-   *
-   * // Clear for next iteration (recompute based on new history)
-   * branch.clearSteer();
-   * ```
-   *
-   * @example Diverse beam search
-   * ```ts
-   * // Each beam penalizes tokens chosen by siblings this step
-   * for (const beam of beams) {
-   *   // Collect tokens chosen by other beams
-   *   const siblingTokens = beams
-   *     .filter(b => b !== beam && b.lastToken !== undefined)
-   *     .map(b => b.lastToken);
-   *
-   *   // Penalize sibling choices to encourage diversity
-   *   beam.branch.steer(siblingTokens.map(t => ({ token: t, bias: -2.0 })));
-   *
-   *   const { token } = await beam.branch.produce();
-   *   await beam.branch.commit(token);
-   *   beam.lastToken = token;
-   *   beam.branch.clearSteer();
-   * }
-   * ```
-   *
-   * @example Boost specific tokens
-   * ```ts
-   * // Boost "yes" and "no" tokens for a yes/no question
-   * branch.steer([
-   *   { token: yesTokenId, bias: 5.0 },
-   *   { token: noTokenId, bias: 5.0 }
-   * ]);
-   * ```
-   */
-  steer(biases: Array<{ token: number; bias: number }>): void;
-
-  /**
-   * Clear all steer biases from this branch
-   *
-   * Removes any dynamic logit adjustments set by `steer()`. Call this after
-   * each generation step if your steer constraints are computed per-step
-   * (e.g., N-gram blocking where the blocked set changes as text grows).
-   *
-   * @example Per-step steer pattern
-   * ```ts
-   * for (let i = 0; i < maxTokens; i++) {
-   *   // Compute constraints based on current state
-   *   const blocked = computeConstraints(generatedTokens);
-   *   branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
-   *
-   *   const { token, isStop } = await branch.produce();
-   *   if (isStop) break;
-   *
-   *   await branch.commit(token);
-   *   branch.clearSteer();  // Reset for next iteration
-   *   generatedTokens.push(token);
-   * }
-   * ```
-   */
-  clearSteer(): void;
-
-  /**
-   * Compute entropy of the branch's logits distribution
-   *
-   * Measures model uncertainty from the branch's captured logits snapshot:
-   * - Low entropy: Model is confident (peaked distribution)
-   * - High entropy: Model is uncertain (flat distribution)
-   *
-   * Operates directly on `state->logits_snapshot` — no JS round-trip.
-   *
-   * @param base - Logarithm base: "nats" (default) or "bits"
-   * @returns Entropy value in specified base
-   *
-   * COST: O(n_vocab) - must sum over all token probabilities
-   */
-  modelEntropy(base?: 'nats' | 'bits'): number;
-
-  /**
-   * Compute surprisal (negative log-likelihood) for a specific token
-   *
-   * Measures how "surprising" the model finds the given token from
-   * the branch's captured logits snapshot:
-   * - Low surprisal: Model expected this token (high probability)
-   * - High surprisal: Model didn't expect this token (low probability)
-   *
-   * Operates directly on `state->logits_snapshot` — no JS round-trip.
-   *
-   * @param token - Token ID to compute surprisal for
-   * @param base - Logarithm base: "nats" (default) or "bits"
-   * @returns Surprisal value in specified base
-   *
-   * COST: O(n_vocab) - softmax normalization required
-   */
-  modelSurprisal(token: number, base?: 'nats' | 'bits'): number;
-
-  /**
-   * Sampling-level perplexity (from filtered distribution)
-   *
-   * Returns perplexity from the distribution actually sampled from
-   * (after top-k/p/temp/penalties). Useful for policy priors and
-   * monitoring sampler chain impact.
-   *
-   * Compare with {@link perplexity} which is model-level (raw logits).
-   */
-  readonly samplingPerplexity: number;
-
-  /**
-   * Set static logit biases on this branch
-   *
-   * Unlike {@link steer} (which is NOT inherited on fork), logit biases
-   * ARE cloned when forking. Use for persistent constraints that should
-   * propagate to child branches.
-   *
-   * Applied during sample() in order: Grammar -> Logit Bias -> Steer -> Sampler Chain
-   *
-   * @param biases - Array of token adjustments. Use `-Infinity` to block,
-   *   positive to boost, negative to reduce.
-   */
-  setLogitBias(biases: Array<{ token: number; bias: number }>): void;
-
-  /**
-   * Clear all static logit biases from this branch
-   */
-  clearLogitBias(): void;
-
-  /**
-   * Replace the sampler chain with new parameters (memoized)
-   *
-   * If the new params match the current chain's params, this is a no-op.
-   * Otherwise the old chain is freed and a new one is created. Use for
-   * Entropy-Driven Temperature (EDT) and other adaptive sampling strategies
-   * that adjust parameters per-step.
-   *
-   * @param params - New sampling parameters
-   *
-   * @example Entropy-Driven Temperature
-   * ```typescript
-   * const entropy = branch.modelEntropy('nats');
-   * branch.setSamplerParams({ temperature: edtTemperature(entropy) });
-   * const { token } = await branch.produce();
-   * await branch.commit(token);
-   * ```
-   */
-  setSamplerParams(params: SamplingParams): void;
-
-  /**
-   * Replace or remove the grammar constraint
-   *
-   * Pass a GBNF grammar string to constrain generation. Pass empty string
-   * or undefined to remove the constraint. The grammar state is cloned on
-   * fork(), so sibling branches can diverge independently after hot-swap.
-   *
-   * @param grammarStr - GBNF grammar string, or empty/undefined to remove
-   *
-   * @example Hot-swap grammar mid-generation
-   * ```typescript
-   * // Start unconstrained, then switch to JSON after detecting tool call
-   * branch.setGrammar(jsonGrammar);
-   * const { token } = await branch.produce();
-   * ```
-   */
-  setGrammar(grammarStr?: string): void;
-
-  /**
-   * Sample next token without advancing state (async)
-   *
-   * Async contract: local branches resolve immediately; cloud branches
-   * may perform an HTTP round-trip. Use {@link produceSync} when you know
-   * the branch is local and want zero-overhead sampling.
-   */
-  produce(): Promise<Produced>;
-
-  /**
-   * Sample next token without advancing state (sync)
-   *
-   * Same as {@link produce} but synchronous. Use when you know the branch
-   * is local and want to avoid the microtick overhead of a promise.
-   */
-  produceSync(): Produced;
-
-  /**
-   * Accept and decode — update branch state, then write token to KV
-   *
-   * Accepts the token into the sampler penalty window (for correct PPL
-   * measurement), then decodes (writing to KV cache via AsyncWorker on
-   * the libuv thread pool) and captures the resulting logits for the next
-   * produce() call. Accept-first ordering with rollback: if decode throws,
-   * sampler/grammar/metrics are restored from clones.
-   *
-   * @param token Token to commit (from produce())
-   */
-  commit(token: number): Promise<void>;
-
-  /** Branch's current position */
-  readonly position: number;
-
-  /** Branch's perplexity */
-  readonly perplexity: number;
-
-  /** Internal handle (for debugging) */
-  readonly handle: number;
-
-  /** Whether this branch has been disposed */
-  readonly disposed: boolean;
-
-  /** Parent branch handle, or null if root */
-  readonly parent: number | null;
-
-  /** Child branch handles */
-  readonly children: number[];
-
-  /** True if this branch has no children */
-  readonly isLeaf: boolean;
-
-  /** True if this branch holds a KV lease */
-  readonly isActive: boolean;
-
-  /**
-   * Async iterator — generate tokens until EOG
-   *
-   * Commit-before-yield semantics: every yielded token is already written
-   * to KV and accepted into the sampler. Breaking out of the loop is clean —
-   * no orphaned uncommitted tokens, perplexity reflects all yielded tokens.
-   *
-   * For inspect-before-commit (speculative decoding, tree search), use
-   * the {@link produce}/{@link commit} protocol directly.
-   *
-   * @example Generate to completion
-   * ```typescript
-   * for await (const { token, text } of branch) {
-   *   process.stdout.write(text);
-   * }
-   * ```
-   *
-   * @example Generate with consumer-side bound
-   * ```typescript
-   * const tokens = [];
-   * for await (const { token } of branch) {
-   *   tokens.push(token);
-   *   if (tokens.length >= limit) break;
-   * }
-   * ```
-   */
-  [Symbol.asyncIterator](): AsyncIterableIterator<{ token: number; text: string }>;
-}
-
-/**
- * High-throughput multi-branch decode operations
- *
- * The naive approach to N-branch generation is N sequential llama_decode()
- * calls — each paying full GPU kernel launch overhead, memory barrier, and
- * PCIe round-trip. BranchStore eliminates this by packing all branches into
- * a single llama_batch and dispatching once: O(1) GPU round-trips regardless
- * of branch count. The GPU parallelizes across sequences within the batch,
- * so N branches approach the wall-time cost of 1.
- *
- * Two operations, two packing strategies:
- *
- * **commit()** — Generation step. Each branch contributes exactly 1 token.
- * Packs N tokens into a single batch via `decode_each` (one row per sequence,
- * all at their respective positions). Single `llama_decode()` call. Logits
- * captured per-branch at batch index `i`. O(N) total work, O(1) GPU
- * dispatches, O(1) amortized dispatch overhead per branch. Accept-first
- * ordering with rollback: accepts each token into its branch's repeat-penalty
- * window before decode, restores from clones if decode throws.
- *
- * **prefill()** — Bulk token injection. Each branch contributes a
- * variable-length token array. Uses a two-pass bin-packing algorithm:
- *
- * - *Pass 1 (planning)*: Greedy first-fit packs items into chunks ≤ nBatch.
- *   Items larger than nBatch get a dedicated chunk and fall through to
- *   decode_many's internal auto-chunking (ceil(nTokens / nBatch) calls).
- * - *Pass 2 (dispatch)*: Normal chunks dispatch via `decode_scatter` (one
- *   `llama_decode` per chunk). Logits are indexed by flattened cursor
- *   position: for item k in a chunk, logits live at `cursor + nTokens[k] - 1`.
- *
- * For T total tokens across N branches with batch capacity B:
- * - Best case (T ≤ B): 1 GPU dispatch, all branches in one batch.
- * - Worst case: ceil(T / B) dispatches. Each dispatch is fully packed.
- * - Amortized per-token GPU overhead: O(1/B) — vanishes as batch fills.
- *
- * Does NOT accept tokens into the sampler penalty window — use for
- * external/replayed tokens where repeat-penalty tracking is unwanted.
- * For model-generated tokens, use {@link commit} instead.
- *
- * Both methods take `[branch, token(s)]` tuples — the branch-to-token
- * binding is structural, not positional. After either call, each branch's
- * logits snapshot is updated with the output distribution from its decoded
- * token(s), ready for the next `produce()`/`sample()` call.
- *
- * @example 32-branch generation step — one GPU dispatch
- * ```typescript
- * const store = new BranchStore(ctx);
- * const entries = await Promise.all(branches.map(async b => [b, (await b.produce()).token] as [Branch, number]));
- * await store.commit(entries);  // 32 tokens, 1 llama_decode()
- * ```
- *
- * @example Best-of-N with batched commit
- * ```typescript
- * const store = new BranchStore(ctx);
- * const branches = [];
- * for (const _ of [1, 2, 3]) branches.push(await root.fork());
- *
- * for (let step = 0; step < 50; step++) {
- *   const produced = await Promise.all(branches.map(async b => [b, await b.produce()] as const));
- *   const live = produced.filter(([, p]) => !p.isStop);
- *   if (!live.length) break;
- *   await store.commit(live.map(([b, p]) => [b, p.token]));
- * }
- * ```
- *
- * @example Asymmetric prefill — variable-length injections, auto-chunked
- * ```typescript
- * await store.prefill([
- *   [branchA, systemPromptTokens],   // 200 tokens
- *   [branchB, shortQueryTokens],     //  12 tokens
- *   [branchC, longDocumentTokens],   // 800 tokens
- * ]);
- * // Bin-packed into ceil(1012 / nBatch) GPU dispatches
- * ```
- *
- * @category Branching
- */
-export class BranchStore {
-  constructor(ctx: SessionContext);
-
-  /**
-   * Batched single-token commit for model-generated tokens
-   *
-   * Each tuple `[branch, token]` binds one token to one branch.
-   * Accepts each token into its branch's repeat-penalty window (for correct
-   * PPL measurement), then decodes all N tokens in a single llama_decode()
-   * call via decode_each and captures logits per-branch. Accept-first
-   * ordering with rollback: if decode throws, sampler/grammar/metrics are
-   * restored from clones taken before the accept.
-   *
-   * @param entries - Array of `[branch, token]` tuples (branches must not be disposed)
-   * @throws If any branch is disposed
-   */
-  commit(entries: [Branch, number][]): Promise<void>;
-
-  /**
-   * Batched variable-length prefill for external tokens
-   *
-   * Each tuple `[branch, tokens]` binds a token array to one branch.
-   * Each branch can receive a different number of tokens — decode_scatter
-   * handles variable-length runs and auto-chunks to fit nBatch.
-   *
-   * Does NOT call accept_token — use for external/replayed tokens where
-   * repeat-penalty tracking is unwanted. For model-generated tokens,
-   * use {@link commit} instead.
-   *
-   * @param entries - Array of `[branch, tokens]` tuples (branches must not be disposed)
-   * @throws If any branch is disposed
-   */
-  prefill(entries: [Branch, number[]][]): Promise<void>;
-
-  /**
-   * Retain only the winner branch — evict all other leases and free their slots.
-   *
-   * Nuclear operation: calls `kv::seq_keep` on the winner's seq_id (stripping all
-   * other sequences from KV cache in a single pass), then frees all loser slots
-   * and rebuilds the vacancy list. The winner's topology is reset (no parent, no children).
-   *
-   * @param winner - The branch to keep (must not be disposed, must hold a lease)
-   * @throws If winner is disposed or has no lease
-   */
-  retainOnly(winner: Branch): Promise<void>;
-
-  /** Number of available seq_id leases */
-  readonly available: number;
-}
-
-// ================================================================
-// Agent primitives
-// ================================================================
-
 /**
  * Task description for forkAgent
  *
@@ -2199,126 +1531,10 @@ export interface RunAgentsResult {
 }
 
 /**
- * Fork an agent from a parent branch with its own system prompt + task
- *
- * Always prepends getTurnSeparator() for a clean structural break before
- * the agent's system prompt. Returns AgentState ready for store.prefill().
- *
- * @param parent - Branch to fork from
- * @param task - Agent task description
- * @param ctx - SessionContext for formatting and tokenization
- * @returns AgentState with branch and suffixTokens
- *
- * @example
- * ```typescript
- * const agent = await forkAgent(trunk, {
- *   systemPrompt: 'You are a research assistant.',
- *   content: 'What is X?',
- *   tools: toolsJson,
- *   seed: Date.now(),
- * }, ctx);
- * await store.prefill([[agent.branch, agent.suffixTokens]]);
- * ```
- *
- * @category Branching
- */
-export function forkAgent(
-  parent: Branch,
-  task: AgentTask,
-  ctx: SessionContext
-): Promise<AgentState>;
-
-/**
- * Run agents in a batched three-phase tick loop
- *
- * Preserves the mechanical execution wins from BranchStore:
- * shared-prefix KV, batched decode, fire-and-forget tools, idle yield.
- *
- * @param agents - Array of AgentState (from forkAgent or manual construction)
- * @param opts - Configuration including store, ctx, executeTool, and callbacks
- * @returns Aggregate statistics
- *
- * @example
- * ```typescript
- * const result = await runAgents(agents, {
- *   store, ctx,
- *   executeTool: (name, args) => myToolDispatch(name, args),
- *   maxTurns: 6,
- *   onToolCall(ai, name, args) { console.log(`Agent ${ai}: ${name}`); },
- * });
- * ```
- *
- * @category Branching
- */
-export function runAgents(
-  agents: AgentState[],
-  opts: RunAgentsOptions
-): Promise<RunAgentsResult>;
-
-/**
- * Session - Trunk lifecycle + conversation delta helpers
- *
- * Owns the current "trunk" branch and provides promote() to crown a winner,
- * plus delta helpers that centralize the sep + formatChat + tokenize + prefill
- * pattern for injecting new turns into an ongoing conversation.
- *
- * Session does NOT own the SessionContext or BranchStore — the consumer
- * creates those and passes them in. dispose() prunes trunk only.
- *
- * @example
- * ```typescript
- * const session = new Session({ ctx, store });
- * session.trunk = initialBranch;
+ * Native binding interface — what loadBinary() returns
  *
- * // After verification, promote the best attempt
- * await session.promote(bestAttempt.branch);
- *
- * // Inject a user turn and generate
- * await session.prefillUser('What about X?');
- * for await (const { text } of session.trunk) {
- *   process.stdout.write(text);
- * }
- *
- * // Cleanup
- * await session.dispose();
- * ctx.dispose();
- * ```
- *
- * @category Branching
+ * @category Core
  */
-export class Session {
-  constructor(opts: { ctx: SessionContext; store: BranchStore });
-
-  /** Current trunk branch (or null before assignment) */
-  get trunk(): Branch | null;
-
-  /** Assign initial trunk without promote */
-  set trunk(branch: Branch | null);
-
-  /**
-   * Promote a winner to trunk — retainOnly + reassign
-   *
-   * Calls store.retainOnly(winner), then sets trunk = winner.
-   * Safe even if winner is the only branch.
-   */
-  promote(winner: Branch): Promise<void>;
-
-  /**
-   * Dispose trunk only — consumer owns ctx and other resources
-   */
-  dispose(): Promise<void>;
-
-  /**
-   * Prefill a user turn into trunk
-   *
-   * Centralizes: sep + formatChat + tokenize(false) + prefill
-   */
-  prefillUser(content: string, opts?: { tools?: string }): Promise<void>;
-
-  /**
-   * Prefill a tool result turn into trunk
-   *
-   * Centralizes: sep + formatChat + tokenize(false) + prefill
-   */
-  prefillToolResult(resultStr: string, callId: string): Promise<void>;
+export interface NativeBinding {
+  createContext(options: ContextOptions): Promise<SessionContext>;
 }
diff --git a/test/examples.js b/test/examples.js
deleted file mode 100644
index dedce3c..0000000
--- a/test/examples.js
+++ /dev/null
@@ -1,463 +0,0 @@
-/**
- * Examples Integration Test
- *
- * Runs examples with --jsonl flag and validates structured output.
- * Each example emits JSONL events that we parse and assert on.
- *
- * Usage:
- *   node test/examples.js           # Run all examples
- *   node test/examples.js entropy   # Run specific example
- *
- * Environment variables:
- *   LLAMA_TEST_MODEL  - Path to chat/instruct model (default: SmolLM2)
- *   EMBED_MODEL_PATH - Path to embedding model (default: nomic-embed)
- */
-
-const { spawn } = require('child_process');
-const path = require('path');
-const fs = require('fs');
-
-// Model paths - use env var or default (resolve to absolute path)
-const MODEL_PATH = process.env.LLAMA_TEST_MODEL
-  ? path.resolve(process.env.LLAMA_TEST_MODEL)
-  : path.join(__dirname, '../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf');
-
-// Embedding model (separate from chat model, resolve to absolute path)
-const EMBED_MODEL_PATH = process.env.EMBED_MODEL_PATH
-  ? path.resolve(process.env.EMBED_MODEL_PATH)
-  : path.join(__dirname, '../liblloyal/tests/fixtures/nomic-embed-text-v1.5.Q4_K_M.gguf');
-
-// Qwen3 model for deep-research (tool-calling, instruct model)
-const QWEN3_PATH = process.env.QWEN3_MODEL
-  ? path.resolve(process.env.QWEN3_MODEL)
-  : path.join(__dirname, '../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf');
-
-// Qwen3 reranker for deep-research semantic search
-const RERANKER_PATH = process.env.RERANKER_MODEL
-  ? path.resolve(process.env.RERANKER_MODEL)
-  : path.join(__dirname, '../models/qwen3-reranker-0.6b-q4_k_m.gguf');
-
-
-if (!fs.existsSync(MODEL_PATH)) {
-  console.error('❌ Test model not found!');
-  console.error(`   Expected: ${MODEL_PATH}`);
-  console.error('   Run: npm run download-models');
-  process.exit(1);
-}
-
-/**
- * Run an example with --jsonl and collect events
- */
-function runExample(scriptPath, timeout = 600000, extraArgs = [], modelPathOverride = null) {
-  return new Promise((resolve, reject) => {
-    const events = [];
-    let stderr = '';
-
-    const modelArg = modelPathOverride || MODEL_PATH;
-
-    const child = spawn('node', [scriptPath, modelArg, '--jsonl', ...extraArgs], {
-      cwd: path.dirname(scriptPath),
-      stdio: ['ignore', 'pipe', 'pipe'],
-    });
-
-    child.stdout.on('data', (data) => {
-      const lines = data.toString().split('\n');
-      for (const line of lines) {
-        if (line.startsWith('{')) {
-          try {
-            const event = JSON.parse(line);
-            events.push(event);
-          } catch {
-            // Ignore malformed JSON
-          }
-        }
-      }
-    });
-
-    child.stderr.on('data', (data) => {
-      stderr += data.toString();
-    });
-
-    const timeoutId = setTimeout(() => {
-      child.kill('SIGTERM');
-      reject(new Error('TIMEOUT'));
-    }, timeout);
-
-    child.on('close', (code) => {
-      clearTimeout(timeoutId);
-      if (code === 0) {
-        resolve(events);
-      } else {
-        reject(new Error(`Exit code ${code}\n${stderr.slice(-500)}`));
-      }
-    });
-
-    child.on('error', (err) => {
-      clearTimeout(timeoutId);
-      reject(err);
-    });
-  });
-}
-
-/**
- * Assert helper
- */
-function assert(condition, message) {
-  if (!condition) {
-    throw new Error(`Assertion failed: ${message}`);
-  }
-}
-
-/**
- * Example test definitions
- */
-const EXAMPLES = {
-  entropy: {
-    path: 'entropy/entropy.mjs',
-    timeout: 120000,
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-      assert(start.model, 'start should have model');
-
-      const comparisons = events.filter(e => e.event === 'comparison');
-      assert(comparisons.length === 3, `should have 3 comparisons, got ${comparisons.length}`);
-
-      for (const c of comparisons) {
-        assert(c.fixed && c.edt, 'comparison should have fixed and edt results');
-        assert(c.fixed.tokenCount > 0, 'fixed should generate tokens');
-        assert(c.edt.tokenCount > 0, 'edt should generate tokens');
-        assert(typeof c.edt.avgTemp === 'number', 'edt should have avgTemp');
-      }
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.comparisons === 3, 'should complete 3 comparisons');
-    },
-  },
-
-  speculative: {
-    path: 'speculative/speculative.mjs',
-    timeout: 120000,
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-      assert(start.draftCount > 0, 'should have draftCount');
-
-      const iterations = events.filter(e => e.event === 'iteration');
-      assert(iterations.length > 0, 'should have iterations');
-
-      for (const iter of iterations) {
-        assert(iter.drafted > 0, 'iteration should have drafted tokens');
-        assert(iter.accepted >= 0, 'iteration should have accepted count');
-      }
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.outputTokens > 0, 'should generate tokens');
-      assert(complete.acceptRate >= 0 && complete.acceptRate <= 1, 'acceptRate should be 0-1');
-    },
-  },
-
-  grammar: {
-    path: 'grammar/grammar.mjs',
-    timeout: 120000,
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-
-      const branchPoint = events.find(e => e.event === 'branch_point');
-      assert(branchPoint, 'should have branch_point event');
-      assert(branchPoint.prefix.includes('"city"'), 'should branch at city field');
-
-      const branchCompletes = events.filter(e => e.event === 'branch_complete');
-      assert(branchCompletes.length === 3, 'should complete 3 branches');
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.validJsonCount > 0, 'should produce valid JSON');
-    },
-  },
-
-  'best-of-n': {
-    path: 'best-of-n/best-of-n.mjs',
-    timeout: 180000,
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-      assert(start.n === 5, 'should have n=5 candidates');
-
-      const baseline = events.find(e => e.event === 'baseline');
-      assert(baseline, 'should have baseline');
-      assert(baseline.ppl > 0, 'baseline should have positive ppl');
-
-      const candidates = events.filter(e => e.event === 'candidate');
-      assert(candidates.length === 5, 'should have 5 candidates');
-
-      for (const c of candidates) {
-        assert(c.ppl > 1 && c.ppl < 1000, `candidate ppl should be in (1, 1000), got ${c.ppl}`);
-        assert(c.tokenCount > 0, 'candidate should have tokens');
-      }
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.bestPpl > 0, 'should have bestPpl');
-    },
-  },
-
-  streaming: {
-    path: 'streaming/streaming.mjs',
-    timeout: 120000,
-    extraArgs: ['--max-tokens=500'],
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-
-      const tokens = events.filter(e => e.event === 'token');
-      assert(tokens.length > 50, 'should generate tokens');
-
-      for (const t of tokens.slice(0, 10)) {
-        assert(typeof t.surprisal === 'number', 'token should have surprisal');
-      }
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.generatedTokens > 0, 'should generate tokens');
-      assert(complete.finalPpl > 0, 'should have finalPpl');
-    },
-  },
-
-  'streaming-tsampler': {
-    path: 'streaming/streaming-tsampler.mjs',
-    timeout: 120000,
-    extraArgs: ['--max-tokens=500'],
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-      assert(start.ngramSize > 0, 'should have ngramSize');
-
-      const tokens = events.filter(e => e.event === 'token');
-      assert(tokens.length > 0, 'should generate tokens');
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.generatedTokens > 0, 'should generate tokens');
-      assert(typeof complete.blockedCount === 'number', 'should track blocked count');
-      assert(complete.uniqueNgrams > 0, 'should track unique ngrams');
-    },
-  },
-
-  'streaming-summary': {
-    path: 'streaming/streaming-summary.mjs',
-    timeout: 180000,
-    extraArgs: ['--max-tokens=500'],
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-      assert(start.summaryMode === 'self', 'should default to self-summary mode');
-
-      const tokens = events.filter(e => e.event === 'token');
-      assert(tokens.length > 50, 'should generate tokens');
-
-      for (const t of tokens.slice(0, 10)) {
-        assert(t.source === 'main', 'token should have source=main');
-        assert(typeof t.surprisal === 'number', 'token should have surprisal');
-      }
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.generatedTokens > 0, 'should generate tokens');
-      assert(complete.finalPpl > 0, 'should have finalPpl');
-    },
-  },
-
-  embed: {
-    path: 'embed/embed.mjs',
-    timeout: 60000,
-    modelPath: EMBED_MODEL_PATH,
-    skip: !fs.existsSync(EMBED_MODEL_PATH),
-    skipReason: 'nomic-embed-text model not found',
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-      assert(start.embeddingDim > 0, 'should have embedding dimension');
-      assert(start.hasPooling === true, 'should have pooling enabled');
-
-      const embeddings = events.filter(e => e.event === 'embedding');
-      assert(embeddings.length === 4, 'should embed 4 texts');
-
-      for (const e of embeddings) {
-        assert(e.dimension > 0, 'embedding should have dimension');
-        assert(e.elapsed >= 0, 'embedding should have elapsed time');
-      }
-
-      const similarities = events.filter(e => e.event === 'similarity');
-      assert(similarities.length === 6, 'should have 6 similarity pairs (4 choose 2)');
-
-      for (const s of similarities) {
-        assert(s.similarity >= -1 && s.similarity <= 1, 'similarity should be in [-1, 1]');
-      }
-
-      const search = events.find(e => e.event === 'search');
-      assert(search, 'should have search event');
-      assert(search.results.length === 4, 'search should rank all texts');
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-    },
-  },
-
-  'deep-research': {
-    path: 'deep-research/deep-research.mjs',
-    timeout: 300000,
-    modelPath: QWEN3_PATH,
-    extraArgs: [
-      '--reranker', RERANKER_PATH,
-      '--corpus', process.env.DEEP_RESEARCH_CORPUS || '',
-      '--query', process.env.DEEP_RESEARCH_QUERY || '',
-    ],
-    skip: !fs.existsSync(QWEN3_PATH) || !fs.existsSync(RERANKER_PATH)
-      || !process.env.DEEP_RESEARCH_CORPUS || !process.env.DEEP_RESEARCH_QUERY,
-    skipReason: 'Requires QWEN3_MODEL, RERANKER_MODEL, DEEP_RESEARCH_CORPUS, and DEEP_RESEARCH_QUERY env vars',
-    validate(events) {
-      const start = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-      assert(start.agentCount === 3, 'should have 3 agents');
-      assert(start.chunks > 0, 'should have corpus chunks');
-
-      const plan = events.find(e => e.event === 'plan');
-      assert(plan, 'should have plan event');
-      assert(plan.questions.length >= 2, 'should plan at least 2 sub-questions');
-
-      const researchStart = events.find(e => e.event === 'research_start');
-      assert(researchStart, 'should have research_start event');
-      assert(researchStart.sharedPrefixTokens > 0, 'should have shared prefix');
-
-      const toolCalls = events.filter(e => e.event === 'tool_call');
-      assert(toolCalls.length > 0, 'should make at least one tool call');
-
-      const agentsDone = events.filter(e => e.event === 'agent_done');
-      assert(agentsDone.length === 3, 'all 3 agents should finish');
-      for (const a of agentsDone) {
-        assert(a.tokenCount > 0, `agent ${a.index} should generate tokens`);
-      }
-
-      const complete = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.totalToolCalls > 0, 'should have tool calls');
-      assert(complete.wallTimeMs > 0, 'should have wall time');
-      assert(complete.converged !== undefined, 'should have convergence result');
-    },
-  },
-};
-
-async function runTest(name, config) {
-  const fullPath = path.join(__dirname, '../examples', config.path);
-
-  if (config.skip) {
-    console.log(`⏭️  ${name}: SKIPPED`);
-    console.log(`   Reason: ${config.skipReason}`);
-    return { name, skipped: true, skipReason: config.skipReason };
-  }
-
-  console.log(`\n📜 ${name}:`);
-  const startTime = Date.now();
-
-  try {
-    const modelPathToUse = config.modelPath || MODEL_PATH;
-    const extraArgs = config.extraArgs || [];
-
-    const events = await runExample(fullPath, config.timeout, extraArgs, modelPathToUse);
-    config.validate(events);
-
-    const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
-
-    console.log(`   ✅ PASSED (${elapsed}s)`);
-    console.log(`   Events: ${events.length} total`);
-
-    // Show key metrics from complete event if present
-    const complete = events.find(e => e.event === 'complete');
-    if (complete) {
-      const metrics = [];
-      if (complete.generatedTokens) metrics.push(`tokens: ${complete.generatedTokens}`);
-      if (complete.outputTokens) metrics.push(`tokens: ${complete.outputTokens}`);
-      if (complete.finalPpl) metrics.push(`ppl: ${complete.finalPpl.toFixed(2)}`);
-      if (complete.reseeds !== undefined) metrics.push(`reseeds: ${complete.reseeds}`);
-      if (complete.acceptRate !== undefined) metrics.push(`accept: ${(complete.acceptRate * 100).toFixed(0)}%`);
-      if (complete.validJsonCount !== undefined) metrics.push(`valid: ${complete.validJsonCount}/${complete.branchCount}`);
-      if (complete.bestPpl) metrics.push(`bestPpl: ${complete.bestPpl.toFixed(2)}`);
-      if (complete.embeddings) metrics.push(`embeddings: ${complete.embeddings}`);
-      if (metrics.length > 0) {
-        console.log(`   Metrics: ${metrics.join(', ')}`);
-      }
-    }
-
-    return {
-      name,
-      passed: true,
-      elapsed: parseFloat(elapsed),
-      eventCount: events.length,
-      metrics: complete || {}
-    };
-
-  } catch (err) {
-    const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
-    console.log(`   ❌ FAILED (${elapsed}s)`);
-    console.log(`   Error: ${err.message}`);
-    return { name, passed: false, elapsed: parseFloat(elapsed), error: err.message };
-  }
-}
-
-async function main() {
-  const filterName = process.argv[2];
-
-  console.log('=== Examples Integration Test ===');
-  console.log(`Model: ${path.basename(MODEL_PATH)}`);
-
-  const toRun = filterName
-    ? { [filterName]: EXAMPLES[filterName] }
-    : EXAMPLES;
-
-  if (filterName && !EXAMPLES[filterName]) {
-    console.error(`Unknown example: ${filterName}`);
-    console.error(`Available: ${Object.keys(EXAMPLES).join(', ')}`);
-    process.exit(1);
-  }
-
-  const results = [];
-
-  for (const [name, config] of Object.entries(toRun)) {
-    const result = await runTest(name, config);
-    results.push(result);
-  }
-
-  // Summary
-  console.log('\n' + '═'.repeat(60));
-  console.log('EXAMPLES TEST SUMMARY');
-  console.log('═'.repeat(60));
-  console.log(`Model: ${path.basename(MODEL_PATH)}`);
-  console.log();
-
-  const passed = results.filter(r => r.passed).length;
-  const failed = results.filter(r => !r.passed && !r.skipped).length;
-  const skipped = results.filter(r => r.skipped).length;
-  const totalTime = results.reduce((sum, r) => sum + (r.elapsed || 0), 0).toFixed(1);
-
-  console.log('Results:');
-  for (const r of results) {
-    const status = r.skipped ? '⏭️ ' : (r.passed ? '✅' : '❌');
-    const time = r.elapsed ? ` (${r.elapsed}s)` : '';
-    const detail = r.skipped ? ` - ${r.skipReason}` : (r.error ? ` - ${r.error.slice(0, 50)}` : '');
-    console.log(`  ${status} ${r.name}${time}${detail}`);
-  }
-
-  console.log();
-  console.log(`Total: ${passed} passed, ${failed} failed, ${skipped} skipped in ${totalTime}s`);
-
-  process.exit(failed > 0 ? 1 : 0);
-}
-
-main().catch((err) => {
-  console.error('Fatal:', err);
-  process.exit(1);
-});
diff --git a/test/examples.ts b/test/examples.ts
new file mode 100644
index 0000000..57a8997
--- /dev/null
+++ b/test/examples.ts
@@ -0,0 +1,339 @@
+/**
+ * Examples Integration Test
+ *
+ * Runs examples with --jsonl flag and validates structured output.
+ * Each example emits JSONL events that we parse and assert on.
+ *
+ * Usage:
+ *   npx tsx test/examples.ts           # Run all examples
+ *   npx tsx test/examples.ts entropy   # Run specific example
+ *
+ * Environment variables (deep-research also reads QWEN3_MODEL, RERANKER_MODEL, DEEP_RESEARCH_CORPUS, DEEP_RESEARCH_QUERY):
+ *   LLAMA_TEST_MODEL  - Path to chat/instruct model (default: SmolLM2)
+ *   EMBED_MODEL_PATH  - Path to embedding model (default: nomic-embed)
+ */
+
+import { spawn, ChildProcess } from 'node:child_process';
+import * as path from 'node:path';
+import * as fs from 'node:fs';
+
+interface ExampleEvent { // one JSON object parsed from an example's stdout
+  event: string; // event name the validators match on, e.g. 'start' or 'complete'
+  [key: string]: any; // eslint-disable-line @typescript-eslint/no-explicit-any -- dynamic JSONL fields
+}
+
+interface ExampleConfig { // one entry of the EXAMPLES table
+  path: string; // script path, joined onto ../examples by runTest
+  timeout: number; // milliseconds before runExample kills the child and rejects with 'TIMEOUT'
+  modelPath?: string; // model passed to the script; runTest falls back to MODEL_PATH
+  extraArgs?: string[]; // appended after '--jsonl' on the child's command line
+  skip?: boolean; // when true, runTest reports the example as skipped without spawning it
+  skipReason?: string; // printed next to the SKIPPED line and in the summary
+  validate: (events: ExampleEvent[]) => void; // throws when the collected events are not as expected
+}
+
+interface TestResult { // what runTest returns for one example
+  name: string;
+  passed?: boolean; // left unset for skipped examples
+  skipped?: boolean;
+  skipReason?: string;
+  elapsed?: number; // seconds, rounded to one decimal
+  eventCount?: number;
+  metrics?: Record<string, unknown>; // the 'complete' event, or {} when there was none
+  error?: string; // message of the error that failed the example
+}
+
+const MODEL_PATH: string = process.env.LLAMA_TEST_MODEL // default model for every example whose config sets no modelPath
+  ? path.resolve(process.env.LLAMA_TEST_MODEL)
+  : path.join(__dirname, '../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf');
+
+const EMBED_MODEL_PATH: string = process.env.EMBED_MODEL_PATH // used by the embed example only
+  ? path.resolve(process.env.EMBED_MODEL_PATH)
+  : path.join(__dirname, '../liblloyal/tests/fixtures/nomic-embed-text-v1.5.Q4_K_M.gguf');
+
+const QWEN3_PATH: string = process.env.QWEN3_MODEL // generative model for the deep-research example
+  ? path.resolve(process.env.QWEN3_MODEL)
+  : path.join(__dirname, '../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf');
+
+const RERANKER_PATH: string = process.env.RERANKER_MODEL // passed to deep-research via --reranker
+  ? path.resolve(process.env.RERANKER_MODEL)
+  : path.join(__dirname, '../models/qwen3-reranker-0.6b-q4_k_m.gguf');
+
+
+if (!fs.existsSync(MODEL_PATH)) { // runs at module load: exits before any example when the default model is missing
+  console.error('❌ Test model not found!');
+  console.error(`   Expected: ${MODEL_PATH}`);
+  console.error('   Run: npm run download-models');
+  process.exit(1);
+}
+
+/** Runs `npx tsx <script> <model> --jsonl ...extraArgs`; resolves with the parsed JSONL events on exit code 0, rejects on timeout, spawn error or non-zero exit. */
+function runExample(scriptPath: string, timeout: number = 600000, extraArgs: string[] = [], modelPathOverride: string | null = null): Promise<ExampleEvent[]> {
+  return new Promise<ExampleEvent[]>((resolve, reject) => {
+    const events: ExampleEvent[] = [];
+    let stderr = '';
+    let pending = ''; // trailing partial stdout line, completed by the next 'data' chunk
+    const modelArg: string = modelPathOverride || MODEL_PATH;
+    const child: ChildProcess = spawn('npx', ['tsx', scriptPath, modelArg, '--jsonl', ...extraArgs], {
+      cwd: path.dirname(scriptPath),
+      stdio: ['ignore', 'pipe', 'pipe'],
+    });
+
+    // Only lines that start like a JSON object are treated as events; everything else is skipped.
+    const parseLine = (line: string): void => {
+      if (!line.startsWith('{')) return;
+      try {
+        events.push(JSON.parse(line) as ExampleEvent);
+      } catch {
+        // Ignore malformed JSON
+      }
+    };
+
+    child.stdout!.setEncoding('utf8'); // keeps multi-byte characters intact across chunk boundaries
+    child.stdout!.on('data', (data: string) => {
+      const lines: string[] = (pending + data).split('\n');
+      pending = lines.pop() ?? ''; // the last piece has no newline yet, so it may be incomplete
+      lines.forEach(parseLine);
+    });
+    child.stderr!.on('data', (data: Buffer) => {
+      stderr += data.toString();
+    });
+
+    const timeoutId: NodeJS.Timeout = setTimeout(() => {
+      child.kill('SIGTERM');
+      reject(new Error('TIMEOUT'));
+    }, timeout);
+
+    child.on('close', (code: number | null) => {
+      clearTimeout(timeoutId);
+      parseLine(pending); // flush a final line that was not newline-terminated
+      if (code === 0) resolve(events);
+      else reject(new Error(`Exit code ${code}\n${stderr.slice(-500)}`)); // keep only the stderr tail
+    });
+
+    child.on('error', (err: Error) => {
+      clearTimeout(timeoutId);
+      reject(err);
+    });
+  });
+}
+
+/** Throws `Assertion failed: <message>` when `condition` is falsy; also narrows `condition` for the compiler. */
+function assert(condition: unknown, message: string): asserts condition {
+  if (condition) return;
+  throw new Error(`Assertion failed: ${message}`);
+}
+
+const EXAMPLES: Record<string, ExampleConfig> = {
+  entropy: { // checks the JSONL output of examples/entropy/entropy.ts
+    path: 'entropy/entropy.ts',
+    timeout: 120000, // ms
+    validate(events: ExampleEvent[]): void {
+      const start: ExampleEvent | undefined = events.find(e => e.event === 'start');
+      assert(start, 'should have start event');
+      assert(start.model, 'start should have model');
+
+      const comparisons: ExampleEvent[] = events.filter(e => e.event === 'comparison');
+      assert(comparisons.length === 3, `should have 3 comparisons, got ${comparisons.length}`);
+
+      for (const c of comparisons) { // each comparison must carry both a fixed and an edt result
+        assert(c.fixed && c.edt, 'comparison should have fixed and edt results');
+        assert(c.fixed.tokenCount > 0, 'fixed should generate tokens');
+        assert(c.edt.tokenCount > 0, 'edt should generate tokens');
+        assert(typeof c.edt.avgTemp === 'number', 'edt should have avgTemp');
+      }
+
+      const complete: ExampleEvent | undefined = events.find(e => e.event === 'complete');
+      assert(complete, 'should have complete event');
+      assert(complete.comparisons === 3, 'should complete 3 comparisons'); // must agree with the event count above
+    },
+  },
+
+  embed: { // checks the JSONL output of examples/embed/embed.ts
+    path: 'embed/embed.ts',
+    timeout: 60000, // ms
+    modelPath: EMBED_MODEL_PATH, // replaces the default MODEL_PATH for this example
+    skip: !fs.existsSync(EMBED_MODEL_PATH), // skipped, not failed, when the embedding model file is absent
+    skipReason: 'nomic-embed-text model not found',
+    validate(events: ExampleEvent[]): void {
+      const start: ExampleEvent | undefined = events.find(e => e.event === 'start');
+      assert(start, 'should have start event');
+      assert(start.embeddingDim > 0, 'should have embedding dimension');
+      assert(start.hasPooling === true, 'should have pooling enabled');
+
+      const embeddings: ExampleEvent[] = events.filter(e => e.event === 'embedding');
+      assert(embeddings.length === 4, 'should embed 4 texts');
+
+      for (const e of embeddings) {
+        assert(e.dimension > 0, 'embedding should have dimension');
+        assert(e.elapsed >= 0, 'embedding should have elapsed time');
+      }
+
+      const similarities: ExampleEvent[] = events.filter(e => e.event === 'similarity');
+      assert(similarities.length === 6, 'should have 6 similarity pairs (4 choose 2)');
+
+      for (const s of similarities) {
+        assert(s.similarity >= -1 && s.similarity <= 1, 'similarity should be in [-1, 1]');
+      }
+
+      const search: ExampleEvent | undefined = events.find(e => e.event === 'search');
+      assert(search, 'should have search event');
+      assert(search.results.length === 4, 'search should rank all texts'); // one result per embedded text
+
+      const complete: ExampleEvent | undefined = events.find(e => e.event === 'complete');
+      assert(complete, 'should have complete event');
+    },
+  },
+
+  'deep-research': { // checks the JSONL output of examples/deep-research/deep-research.ts
+    path: 'deep-research/deep-research.ts',
+    timeout: 300000, // ms
+    modelPath: QWEN3_PATH, // replaces the default MODEL_PATH for this example
+    extraArgs: [
+      '--reranker', RERANKER_PATH,
+      '--corpus', process.env.DEEP_RESEARCH_CORPUS || '', // '' only satisfies string[]; the entry is skipped when unset
+      '--query', process.env.DEEP_RESEARCH_QUERY || '',
+    ],
+    skip: !fs.existsSync(QWEN3_PATH) || !fs.existsSync(RERANKER_PATH) // both model files must exist...
+      || !process.env.DEEP_RESEARCH_CORPUS || !process.env.DEEP_RESEARCH_QUERY, // ...and both env vars must be set
+    skipReason: 'Requires QWEN3_MODEL, RERANKER_MODEL, DEEP_RESEARCH_CORPUS, and DEEP_RESEARCH_QUERY env vars',
+    validate(events: ExampleEvent[]): void {
+      const start: ExampleEvent | undefined = events.find(e => e.event === 'start');
+      assert(start, 'should have start event');
+      assert(start.agentCount === 3, 'should have 3 agents');
+      assert(start.chunks > 0, 'should have corpus chunks');
+
+      const plan: ExampleEvent | undefined = events.find(e => e.event === 'plan');
+      assert(plan, 'should have plan event');
+      assert(plan.questions.length >= 2, 'should plan at least 2 sub-questions');
+
+      const researchStart: ExampleEvent | undefined = events.find(e => e.event === 'research_start');
+      assert(researchStart, 'should have research_start event');
+      assert(researchStart.sharedPrefixTokens > 0, 'should have shared prefix');
+
+      const toolCalls: ExampleEvent[] = events.filter(e => e.event === 'tool_call');
+      assert(toolCalls.length > 0, 'should make at least one tool call');
+
+      const agentsDone: ExampleEvent[] = events.filter(e => e.event === 'agent_done');
+      assert(agentsDone.length === 3, 'all 3 agents should finish'); // same count as start.agentCount above
+      for (const a of agentsDone) {
+        assert(a.tokenCount > 0, `agent ${a.index} should generate tokens`);
+      }
+
+      const complete: ExampleEvent | undefined = events.find(e => e.event === 'complete');
+      assert(complete, 'should have complete event');
+      assert(complete.totalToolCalls > 0, 'should have tool calls');
+      assert(complete.wallTimeMs > 0, 'should have wall time');
+      assert(complete.converged !== undefined, 'should have convergence result'); // any defined value passes
+    },
+  },
+};
+
+/** Runs one example and validates its events; returns a skipped, passed or failed TestResult instead of throwing. */
+async function runTest(name: string, config: ExampleConfig): Promise<TestResult> {
+  if (config.skip) {
+    console.log(`⏭️  ${name}: SKIPPED`);
+    console.log(`   Reason: ${config.skipReason}`);
+    return { name, skipped: true, skipReason: config.skipReason };
+  }
+
+  const fullPath: string = path.join(__dirname, '../examples', config.path);
+  console.log(`\n📜 ${name}:`);
+  const startTime: number = Date.now();
+  const elapsedSeconds = (): string => ((Date.now() - startTime) / 1000).toFixed(1);
+
+  try {
+    const modelPathToUse: string = config.modelPath || MODEL_PATH;
+    const extraArgs: string[] = config.extraArgs || [];
+
+    const events: ExampleEvent[] = await runExample(fullPath, config.timeout, extraArgs, modelPathToUse);
+    config.validate(events); // throws on the first failed assertion
+
+    const elapsed: string = elapsedSeconds();
+    console.log(`   ✅ PASSED (${elapsed}s)`);
+    console.log(`   Events: ${events.length} total`);
+
+    // Print whichever key metrics the example's 'complete' event carries
+    const complete: ExampleEvent | undefined = events.find(e => e.event === 'complete');
+    if (complete) {
+      const metrics: string[] = [];
+      if (complete.generatedTokens) metrics.push(`tokens: ${complete.generatedTokens}`);
+      if (complete.outputTokens) metrics.push(`tokens: ${complete.outputTokens}`);
+      if (complete.finalPpl) metrics.push(`ppl: ${complete.finalPpl.toFixed(2)}`);
+      if (complete.reseeds !== undefined) metrics.push(`reseeds: ${complete.reseeds}`);
+      if (complete.acceptRate !== undefined) metrics.push(`accept: ${(complete.acceptRate * 100).toFixed(0)}%`);
+      if (complete.validJsonCount !== undefined) metrics.push(`valid: ${complete.validJsonCount}/${complete.branchCount}`);
+      if (complete.bestPpl) metrics.push(`bestPpl: ${complete.bestPpl.toFixed(2)}`);
+      if (complete.embeddings) metrics.push(`embeddings: ${complete.embeddings}`);
+      if (metrics.length > 0) {
+        console.log(`   Metrics: ${metrics.join(', ')}`);
+      }
+    }
+
+    return {
+      name,
+      passed: true,
+      elapsed: parseFloat(elapsed),
+      eventCount: events.length,
+      metrics: complete || {}
+    };
+  } catch (err: unknown) {
+    const message: string = err instanceof Error ? err.message : String(err); // a non-Error throwable would otherwise print "undefined"
+    const elapsed: string = elapsedSeconds();
+    console.log(`   ❌ FAILED (${elapsed}s)`);
+    console.log(`   Error: ${message}`);
+    return { name, passed: false, elapsed: parseFloat(elapsed), error: message };
+  }
+}
+
+/** CLI entry point: runs every example (or only the one named in argv[2]), prints a summary, and exits 1 if any example failed. */
+async function main(): Promise<void> {
+  const filterName: string | undefined = process.argv[2];
+
+  console.log('=== Examples Integration Test ===');
+  console.log(`Model: ${path.basename(MODEL_PATH)}`);
+
+  // Own-property check before indexing: inherited keys such as "constructor" must not pass as example names
+  if (filterName && !Object.prototype.hasOwnProperty.call(EXAMPLES, filterName)) {
+    console.error(`Unknown example: ${filterName}`);
+    console.error(`Available: ${Object.keys(EXAMPLES).join(', ')}`);
+    process.exit(1);
+  }
+
+  const toRun: Record<string, ExampleConfig> = filterName
+    ? { [filterName]: EXAMPLES[filterName] }
+    : EXAMPLES;
+
+  const results: TestResult[] = [];
+  for (const [name, config] of Object.entries(toRun)) { // awaited one at a time, in table order
+    results.push(await runTest(name, config));
+  }
+
+  console.log('\n' + '═'.repeat(60));
+  console.log('EXAMPLES TEST SUMMARY');
+  console.log('═'.repeat(60));
+  console.log(`Model: ${path.basename(MODEL_PATH)}`);
+  console.log();
+
+  const passed: number = results.filter(r => r.passed).length;
+  const failed: number = results.filter(r => !r.passed && !r.skipped).length; // skipped results never count as failures
+  const skipped: number = results.filter(r => r.skipped).length;
+  const totalTime: string = results.reduce((sum: number, r: TestResult) => sum + (r.elapsed || 0), 0).toFixed(1);
+
+  console.log('Results:');
+  for (const r of results) {
+    const status: string = r.skipped ? '⏭️ ' : (r.passed ? '✅' : '❌');
+    const time: string = r.elapsed ? ` (${r.elapsed}s)` : '';
+    const detail: string = r.skipped ? ` - ${r.skipReason}` : (r.error ? ` - ${r.error.slice(0, 50)}` : ''); // error text capped at 50 chars
+    console.log(`  ${status} ${r.name}${time}${detail}`);
+  }
+
+  console.log();
+  console.log(`Total: ${passed} passed, ${failed} failed, ${skipped} skipped in ${totalTime}s`);
+
+  process.exit(failed > 0 ? 1 : 0);
+}
+
+main().catch((err: unknown) => { // last-resort handler: anything that rejects out of main() is logged and exits 1
+  console.error('Fatal:', err);
+  process.exit(1);
+});
diff --git a/test/integration.js b/test/integration.ts
similarity index 76%
rename from test/integration.js
rename to test/integration.ts
index b063058..5bd4a6a 100644
--- a/test/integration.js
+++ b/test/integration.ts
@@ -12,18 +12,20 @@
  *   LLAMA_EMBED_MODEL=models/nomic-embed-text-v1.5.Q4_K_M.gguf npm run test:integration
  */
 
-const path = require('path');
-const fs = require('fs');
+import * as path from 'node:path';
+import * as fs from 'node:fs';
+import { loadBinary, Branch, BranchStore } from '../dist/index.js';
+import type { SessionContext, NativeBinding, FormattedChatResult, Produced } from '../dist/index.js';
 
-const MODEL_PATH = process.env.LLAMA_TEST_MODEL
+const MODEL_PATH: string = process.env.LLAMA_TEST_MODEL
   ? path.resolve(process.env.LLAMA_TEST_MODEL)
   : path.join(__dirname, '../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf');
-const EMBED_MODEL_PATH = process.env.LLAMA_EMBED_MODEL ||
+const EMBED_MODEL_PATH: string | null = process.env.LLAMA_EMBED_MODEL ||
   (fs.existsSync(path.join(__dirname, '../models/nomic-embed-text-v1.5.Q4_K_M.gguf'))
     ? path.join(__dirname, '../models/nomic-embed-text-v1.5.Q4_K_M.gguf')
     : null);
 
-const CTX_SIZE = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
+const CTX_SIZE: number = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
 
 if (!fs.existsSync(MODEL_PATH)) {
   console.error('Test model not found:', MODEL_PATH);
@@ -34,29 +36,28 @@ console.log('=== lloyal.node Integration Tests ===\n');
 console.log(`Model: ${path.basename(MODEL_PATH)}`);
 console.log(`Size: ${(fs.statSync(MODEL_PATH).size / 1024 / 1024).toFixed(1)} MB\n`);
 
-const { loadBinary, Branch, BranchStore } = require('..');
-let addon;
+let addon: NativeBinding;
 try {
-  addon = require('../build/Release/lloyal.node');
+  addon = require('../build/Release/lloyal.node') as NativeBinding;
 } catch {
   addon = loadBinary();
 }
 
 // Test tracking
-let passed = 0;
-let failed = 0;
+let passed: number = 0;
+let failed: number = 0;
 
-function ok(msg) {
+function ok(msg: string): void {
   passed++;
   console.log(`  [PASS] ${msg}`);
 }
 
-function fail(msg) {
+function fail(msg: string): void {
   failed++;
   console.log(`  [FAIL] ${msg}`);
 }
 
-function assert(condition, msg) {
+function assert(condition: boolean, msg: string): void {
   if (condition) {
     ok(msg);
   } else {
@@ -69,33 +70,33 @@ function assert(condition, msg) {
 // CORE API TESTS
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testCoreAPI(ctx) {
+async function testCoreAPI(ctx: SessionContext): Promise<void> {
   console.log('\n--- Core API ---');
 
   // createContext validated by caller
 
   // tokenize / detokenize
-  const text = "Hello world";
-  const tokens = await ctx.tokenize(text);
+  const text: string = "Hello world";
+  const tokens: number[] = await ctx.tokenize(text);
   assert(tokens.length > 0, `tokenize("${text}") → ${tokens.length} tokens`);
 
-  const reconstructed = await ctx.detokenize(tokens);
+  const reconstructed: string = await ctx.detokenize(tokens);
   assert(typeof reconstructed === 'string', `detokenize() → "${reconstructed}"`);
 
   // tokenToText
-  const tokenText = ctx.tokenToText(tokens[0]);
+  const tokenText: string = ctx.tokenToText(tokens[0]);
   assert(typeof tokenText === 'string', `tokenToText(${tokens[0]}) → "${tokenText}"`);
 
   // Branch-based prefill + getLogits
   const branch = Branch.create(ctx, 0, { temperature: 0 });
   await branch.prefill(tokens);
 
-  const branchLogits = branch.getLogits();
+  const branchLogits: Float32Array = branch.getLogits();
   assert(branchLogits instanceof Float32Array, `branch.getLogits() → Float32Array(${branchLogits.length})`);
   assert(branchLogits.length === ctx.vocabSize, `branchLogits.length === vocabSize (${ctx.vocabSize})`);
 
   // Validate logits are not garbage
-  let hasNonZero = false, hasNaN = false;
+  let hasNonZero: boolean = false, hasNaN: boolean = false;
   for (let i = 0; i < branchLogits.length; i++) {
     if (branchLogits[i] !== 0.0) hasNonZero = true;
     if (isNaN(branchLogits[i])) hasNaN = true;
@@ -103,15 +104,15 @@ async function testCoreAPI(ctx) {
   assert(hasNonZero && !hasNaN, 'branch logits valid (non-zero, no NaN)');
 
   // branch.modelEntropy
-  const entropy = branch.modelEntropy('nats');
+  const entropy: number = branch.modelEntropy('nats');
   assert(isFinite(entropy) && entropy >= 0, `branch.modelEntropy() → ${entropy.toFixed(4)} nats`);
 
   // Branch greedy sampling (temperature: 0)
-  const greedy = branch.sample();
+  const greedy: number = branch.sample();
   assert(greedy >= 0 && greedy < ctx.vocabSize, `branch.sample() greedy → ${greedy}`);
 
   // isStopToken - EOS should be a stop token
-  const eos = ctx.getEogToken();
+  const eos: number = ctx.getEogToken();
   assert(ctx.isStopToken(eos), `isStopToken(EOS=${eos}) → true`);
 
   await branch.prune();
@@ -121,20 +122,20 @@ async function testCoreAPI(ctx) {
 // KV CACHE TESTS
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testKVCache(ctx) {
+async function testKVCache(ctx: SessionContext): Promise<void> {
   console.log('\n--- KV Cache ---');
 
   await ctx.kvCacheClear();
-  const tokens = await ctx.tokenize("Test prompt");
+  const tokens: number[] = await ctx.tokenize("Test prompt");
   const branch = Branch.create(ctx, 0, { temperature: 0 });
   await branch.prefill(tokens);
 
-  const sizeBefore = ctx.kvCacheSize();
+  const sizeBefore: number = ctx.kvCacheSize();
   assert(sizeBefore >= 0, `kvCacheSize() after prefill → ${sizeBefore}`);
 
   await branch.prune();
   await ctx.kvCacheClear();
-  const sizeAfter = ctx.kvCacheSize();
+  const sizeAfter: number = ctx.kvCacheSize();
   assert(sizeAfter === -1, `kvCacheClear() → size=${sizeAfter} (empty)`);
 }
 
@@ -142,10 +143,10 @@ async function testKVCache(ctx) {
 // MULTI-SEQUENCE TESTS
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testMultiSequence() {
+async function testMultiSequence(): Promise<void> {
   console.log('\n--- Multi-Sequence KV ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -154,12 +155,12 @@ async function testMultiSequence() {
 
   try {
     // Use a branch to prefill tokens (populates KV on its seq_id)
-    const tokens = await ctx.tokenize("The quick brown fox");
+    const tokens: number[] = await ctx.tokenize("The quick brown fox");
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(tokens);
 
     // Branch allocates a seq_id — check its KV is populated
-    const branchPos = branch.position;
+    const branchPos: number = branch.position;
     assert(branchPos === tokens.length, `branch position → ${branchPos}`);
 
     // Fork creates a new sequence with copied KV
@@ -167,7 +168,7 @@ async function testMultiSequence() {
     assert(forked.position === branchPos, `forked position matches parent → ${forked.position}`);
 
     // Raw KV seq ops still work for advanced use
-    const seq1Before = ctx.kvSeqPosMax(3);  // unused seq_id
+    const seq1Before: number = ctx.kvSeqPosMax(3);  // unused seq_id
     assert(seq1Before === -1, `kvSeqPosMax(unused) → ${seq1Before} (empty)`);
 
     await forked.prune();
@@ -181,10 +182,10 @@ async function testMultiSequence() {
 // GRAMMAR TESTS
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testGrammar() {
+async function testGrammar(): Promise<void> {
   console.log('\n--- Grammar Sampling ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -192,15 +193,15 @@ async function testGrammar() {
   });
 
   try {
-    const grammar = `root ::= "{" ws "}" ws
+    const grammar: string = `root ::= "{" ws "}" ws
 ws ::= [ \\t\\n]*`;
 
     // Branch API with grammar
-    const prompt = await ctx.tokenize("Output: ");
+    const prompt: number[] = await ctx.tokenize("Output: ");
     const branch = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar);
     await branch.prefill(prompt);
 
-    const output = [];
+    const output: string[] = [];
     for (let i = 0; i < 10; i++) {
       const { token, text, isStop } = await branch.produce();
       if (isStop) break;
@@ -208,12 +209,12 @@ ws ::= [ \\t\\n]*`;
       output.push(text);
     }
 
-    const result = output.join('');
+    const result: string = output.join('');
     assert(/^\{\s*\}\s*$/.test(result), `Branch+grammar → "${result}"`);
 
     // Grammar is cloned on fork — independent parser states
     await ctx.kvCacheClear();
-    const prompt2 = await ctx.tokenize("Output: ");
+    const prompt2: number[] = await ctx.tokenize("Output: ");
     const root = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar);
     await root.prefill(prompt2);
 
@@ -221,15 +222,15 @@ ws ::= [ \\t\\n]*`;
     const childB = await root.fork();
 
     // Both children should produce grammar-valid output independently
-    const outA = [], outB = [];
+    const outA: string[] = [], outB: string[] = [];
     for (let i = 0; i < 10; i++) {
-      const pA = await childA.produce();
+      const pA: Produced = await childA.produce();
       if (!pA.isStop) { await childA.commit(pA.token); outA.push(pA.text); }
-      const pB = await childB.produce();
+      const pB: Produced = await childB.produce();
       if (!pB.isStop) { await childB.commit(pB.token); outB.push(pB.text); }
     }
 
-    const resultA = outA.join(''), resultB = outB.join('');
+    const resultA: string = outA.join(''), resultB: string = outB.join('');
     assert(/^\{\s*\}\s*$/.test(resultA), `Fork A grammar → "${resultA}"`);
     assert(/^\{\s*\}\s*$/.test(resultB), `Fork B grammar → "${resultB}"`);
 
@@ -246,20 +247,20 @@ ws ::= [ \\t\\n]*`;
 // METRICS API TESTS
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testMetrics(ctx) {
+async function testMetrics(ctx: SessionContext): Promise<void> {
   console.log('\n--- Metrics API ---');
 
   await ctx.kvCacheClear();
-  const tokens = await ctx.tokenize("Hello");
+  const tokens: number[] = await ctx.tokenize("Hello");
   const branch = Branch.create(ctx, 0, { temperature: 0 });
   await branch.prefill(tokens);
 
   // branch.modelSurprisal
-  const token1 = branch.sample();
-  const surprisal = branch.modelSurprisal(token1, "nats");
+  const token1: number = branch.sample();
+  const surprisal: number = branch.modelSurprisal(token1, "nats");
   assert(surprisal >= 0, `branch.modelSurprisal() → ${surprisal.toFixed(2)} nats`);
 
-  const surprisalBits = branch.modelSurprisal(token1, "bits");
+  const surprisalBits: number = branch.modelSurprisal(token1, "bits");
   assert(Math.abs(surprisalBits - surprisal / Math.log(2)) < 0.01, 'bits = nats / ln(2)');
 
   // Branch perplexity — built-in, accumulates through commit()
@@ -267,7 +268,7 @@ async function testMetrics(ctx) {
   const { token: token2 } = await branch.produce();
   await branch.commit(token2);
 
-  const ppl = branch.perplexity;
+  const ppl: number = branch.perplexity;
   assert(isFinite(ppl) && ppl >= 1.0, `branch.perplexity → ${ppl.toFixed(2)}`);
 
   await branch.prune();
@@ -277,10 +278,10 @@ async function testMetrics(ctx) {
 // BRANCH PREFILL TESTS
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testBranchPrefill() {
+async function testBranchPrefill(): Promise<void> {
   console.log('\n--- Branch.prefill Multi-Turn ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nBatch: 512,
@@ -288,21 +289,21 @@ async function testBranchPrefill() {
   });
 
   try {
-    const GEN_TOKENS = 5;
-    const turns = [
+    const GEN_TOKENS: number = 5;
+    const turns: string[] = [
       "What is the capital of France?",
       " Tell me more.",
       " What about transportation?"
     ];
 
-    const messages = [{ role: 'user', content: turns[0] }];
+    const messages: Array<{ role: string; content: string }> = [{ role: 'user', content: turns[0] }];
     const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-    const promptToks = await ctx.tokenize(prompt);
+    const promptToks: number[] = await ctx.tokenize(prompt);
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(promptToks);
 
     // Turn 1
-    const gen1 = [];
+    const gen1: number[] = [];
     for (let i = 0; i < GEN_TOKENS; i++) {
       const { token, isStop } = await branch.produce();
       if (isStop) break;
@@ -312,11 +313,11 @@ async function testBranchPrefill() {
     assert(gen1.length > 0, `Turn 1: generated ${gen1.length} tokens`);
 
     // Track assistant response
-    const assistantText1 = await ctx.detokenize(gen1);
+    const assistantText1: string = await ctx.detokenize(gen1);
     messages.push({ role: 'assistant', content: assistantText1 });
 
     // Warm continuation: format only new message + turn separator
-    const sep = ctx.getTurnSeparator();
+    const sep: number[] = ctx.getTurnSeparator();
 
     // Turn 2-3: prefill using format-only-new pattern + generate
     for (let t = 1; t < turns.length; t++) {
@@ -325,15 +326,15 @@ async function testBranchPrefill() {
         { role: 'system', content: '' },
         { role: 'user', content: turns[t] }
       ]));
-      const delta = await ctx.tokenize(prompt, false);
-      const prefillToks = [...sep, ...delta];
+      const delta: number[] = await ctx.tokenize(prompt, false);
+      const prefillToks: number[] = [...sep, ...delta];
 
-      const posBefore = branch.position;
+      const posBefore: number = branch.position;
       await branch.prefill(prefillToks);
       assert(branch.position === posBefore + prefillToks.length,
         `Turn ${t + 1}: prefill ${prefillToks.length} tokens → pos=${branch.position}`);
 
-      const gen = [];
+      const gen: number[] = [];
       for (let i = 0; i < GEN_TOKENS; i++) {
         const { token, isStop } = await branch.produce();
         if (isStop) break;
@@ -343,7 +344,7 @@ async function testBranchPrefill() {
       assert(gen.length > 0, `Turn ${t + 1}: generated ${gen.length} tokens`);
 
       // Track assistant response
-      const assistantText = await ctx.detokenize(gen);
+      const assistantText: string = await ctx.detokenize(gen);
       messages.push({ role: 'assistant', content: assistantText });
     }
 
@@ -358,10 +359,10 @@ async function testBranchPrefill() {
 // Mirrors liblloyal C++ test: chat_in_integration_test.cpp
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testWarmMultiTurnRecall() {
+async function testWarmMultiTurnRecall(): Promise<void> {
   console.log('\n--- Warm Multi-Turn Recall ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nBatch: 512,
@@ -369,11 +370,11 @@ async function testWarmMultiTurnRecall() {
   });
 
   try {
-    const sep = ctx.getTurnSeparator();
+    const sep: number[] = ctx.getTurnSeparator();
 
     // Helper: generate until EOG (matches C++ test pattern)
-    async function generate(branch) {
-      const gen = [];
+    async function generate(branch: InstanceType<typeof Branch>): Promise<string> {
+      const gen: number[] = [];
       for (;;) {
         const { token, isStop } = await branch.produce();
         if (isStop) break;
@@ -384,25 +385,25 @@ async function testWarmMultiTurnRecall() {
     }
 
     // Helper: warm continuation — sep + format([{system,""},{user,msg}])
-    async function warmTurn(branch, userContent) {
+    async function warmTurn(branch: InstanceType<typeof Branch>, userContent: string): Promise<string> {
       const { prompt } = await ctx.formatChat(JSON.stringify([
         { role: 'system', content: '' },
         { role: 'user', content: userContent }
       ]), {});
-      const delta = await ctx.tokenize(prompt, false);
+      const delta: number[] = await ctx.tokenize(prompt, false);
       await branch.prefill([...sep, ...delta]);
       return generate(branch);
     }
 
     // Turn 1 (COLD): introduce name
-    const msgs1 = [{ role: 'user', content: 'Hi, my name is Lloyal' }];
+    const msgs1: Array<{ role: string; content: string }> = [{ role: 'user', content: 'Hi, my name is Lloyal' }];
     const { prompt, format, reasoningFormat } = await ctx.formatChat(JSON.stringify(msgs1), {});
-    const promptToks = await ctx.tokenize(prompt);
+    const promptToks: number[] = await ctx.tokenize(prompt);
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(promptToks);
 
     // Helper: parse output and check content (not reasoning) for a term
-    function checkRecall(rawText, term) {
+    function checkRecall(rawText: string, term: string): boolean {
       const { content } = ctx.parseChatOutput(rawText, format, {
         reasoningFormat,
         isPartial: false,
@@ -411,25 +412,25 @@ async function testWarmMultiTurnRecall() {
       return (content || '').toLowerCase().includes(term.toLowerCase());
     }
 
-    const turn1 = await generate(branch);
+    const turn1: string = await generate(branch);
     console.log(`  Turn 1: "${turn1.trim()}"`);
     assert(turn1.length > 0, 'Turn 1: generated response');
 
     // Turn 2 (WARM): introduce favourite food
-    const turn2 = await warmTurn(branch, 'My favourite food is pizza');
+    const turn2: string = await warmTurn(branch, 'My favourite food is pizza');
     console.log(`  Turn 2: "${turn2.trim()}"`);
     assert(turn2.length > 0, 'Turn 2: generated response');
 
     // Turn 3 (WARM): recall name
-    const turn3 = await warmTurn(branch, 'Do you remember my name?');
+    const turn3: string = await warmTurn(branch, 'Do you remember my name?');
     console.log(`  Turn 3 (name recall): "${turn3.trim()}"`);
-    const nameRecalled = checkRecall(turn3, 'lloyal');
+    const nameRecalled: boolean = checkRecall(turn3, 'lloyal');
     assert(nameRecalled, `Name recall: ${nameRecalled ? 'found "Lloyal"' : 'MISSING "Lloyal" in: ' + turn3.trim()}`);
 
     // Turn 4 (WARM): recall food
-    const turn4 = await warmTurn(branch, 'Do you remember my favourite food?');
+    const turn4: string = await warmTurn(branch, 'Do you remember my favourite food?');
     console.log(`  Turn 4 (food recall): "${turn4.trim()}"`);
-    const foodRecalled = checkRecall(turn4, 'pizza');
+    const foodRecalled: boolean = checkRecall(turn4, 'pizza');
     assert(foodRecalled, `Food recall: ${foodRecalled ? 'found "pizza"' : 'MISSING "pizza" in: ' + turn4.trim()}`);
 
     await branch.prune();
@@ -442,7 +443,7 @@ async function testWarmMultiTurnRecall() {
 // WARM CONTINUATION SEMANTIC RECALL - Proves context survives delta-only prefill
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testWarmSemanticRecall() {
+async function testWarmSemanticRecall(): Promise<void> {
   if (!EMBED_MODEL_PATH) {
     console.log('\n--- Warm Semantic Recall (SKIPPED - no LLAMA_EMBED_MODEL) ---');
     return;
@@ -450,11 +451,11 @@ async function testWarmSemanticRecall() {
 
   console.log('\n--- Warm Semantic Recall ---');
 
-  const GEN_TOKENS = 40;
+  const GEN_TOKENS: number = 40;
 
   // Helper: cosine similarity
-  function cosine(a, b) {
-    let dot = 0, na = 0, nb = 0;
+  function cosine(a: Float32Array, b: Float32Array): number {
+    let dot: number = 0, na: number = 0, nb: number = 0;
     for (let i = 0; i < a.length; i++) {
       dot += a[i] * b[i];
       na += a[i] * a[i];
@@ -464,9 +465,9 @@ async function testWarmSemanticRecall() {
   }
 
   // Phase 1: Generate multi-turn conversation via warm continuation
-  let recallText;
+  let recallText: string;
   {
-    const ctx = await addon.createContext({
+    const ctx: SessionContext = await addon.createContext({
       modelPath: MODEL_PATH,
       nCtx: CTX_SIZE,
       nBatch: 512,
@@ -474,28 +475,28 @@ async function testWarmSemanticRecall() {
     });
 
     try {
-      const sep = ctx.getTurnSeparator();
-      let branch;
-      const messages = [];
+      const sep: number[] = ctx.getTurnSeparator();
+      let branch: InstanceType<typeof Branch>;
+      const messages: Array<{ role: string; content: string }> = [];
 
       // Helper: format-only-new warm continuation
-      async function warmTurn(userContent) {
+      async function warmTurn(userContent: string): Promise<string> {
         messages.push({ role: 'user', content: userContent });
         const { prompt } = await ctx.formatChat(JSON.stringify([
           { role: 'system', content: '' },
           { role: 'user', content: userContent }
         ]));
-        const delta = await ctx.tokenize(prompt, false);
+        const delta: number[] = await ctx.tokenize(prompt, false);
         await branch.prefill([...sep, ...delta]);
 
-        const gen = [];
+        const gen: number[] = [];
         for (let i = 0; i < GEN_TOKENS; i++) {
           const { token, isStop } = await branch.produce();
           if (isStop) break;
           await branch.commit(token);
           gen.push(token);
         }
-        const text = await ctx.detokenize(gen);
+        const text: string = await ctx.detokenize(gen);
         messages.push({ role: 'assistant', content: text });
         return text;
       }
@@ -503,19 +504,19 @@ async function testWarmSemanticRecall() {
       // Turn 1: Plant a specific, recallable fact
       messages.push({ role: 'user', content: 'Remember this: my dog is named Max.' });
       const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-      const promptToks = await ctx.tokenize(prompt);
+      const promptToks: number[] = await ctx.tokenize(prompt);
       branch = Branch.create(ctx, 0, { temperature: 0 });
       await branch.prefill(promptToks);
 
       // Generate turn 1 response
-      const gen = [];
+      const gen: number[] = [];
       for (let i = 0; i < GEN_TOKENS; i++) {
         const { token, isStop } = await branch.produce();
         if (isStop) break;
         await branch.commit(token);
         gen.push(token);
       }
-      const turn1Response = await ctx.detokenize(gen);
+      const turn1Response: string = await ctx.detokenize(gen);
       messages.push({ role: 'assistant', content: turn1Response });
 
       // Turn 2: Distractor
@@ -535,7 +536,7 @@ async function testWarmSemanticRecall() {
 
   // Phase 2: Score via embedding similarity (chat model fully released)
   {
-    const embedCtx = await addon.createContext({
+    const embedCtx: SessionContext = await addon.createContext({
       modelPath: EMBED_MODEL_PATH,
       nCtx: 512,
       nBatch: 512,
@@ -545,8 +546,8 @@ async function testWarmSemanticRecall() {
     });
 
     try {
-      async function embed(text) {
-        const tokens = await embedCtx.tokenize(text);
+      async function embed(text: string): Promise<Float32Array> {
+        const tokens: number[] = await embedCtx.tokenize(text);
         await embedCtx.kvCacheClear();
         await embedCtx.encode(tokens);
         return embedCtx.getEmbeddings(true);
@@ -554,12 +555,12 @@ async function testWarmSemanticRecall() {
 
       console.log(`  Recall response: "${recallText.trim()}"`);
 
-      const embResponse = await embed(recallText);
-      const embCorrect = await embed('The dog is named Max.');
-      const embWrong = await embed('Red, blue, and green are three colors.');
+      const embResponse: Float32Array = await embed(recallText);
+      const embCorrect: Float32Array = await embed('The dog is named Max.');
+      const embWrong: Float32Array = await embed('Red, blue, and green are three colors.');
 
-      const simCorrect = cosine(embResponse, embCorrect);
-      const simWrong = cosine(embResponse, embWrong);
+      const simCorrect: number = cosine(embResponse, embCorrect);
+      const simWrong: number = cosine(embResponse, embWrong);
 
       assert(simCorrect > simWrong,
         `Semantic recall: correct=${simCorrect.toFixed(3)} > wrong=${simWrong.toFixed(3)}`);
@@ -573,10 +574,10 @@ async function testWarmSemanticRecall() {
 // BRANCH STEER TESTS - Dynamic per-sample logit manipulation
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testBranchSteer() {
+async function testBranchSteer(): Promise<void> {
   console.log('\n--- Branch.steer ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -584,25 +585,25 @@ async function testBranchSteer() {
   });
 
   try {
-    const tokens = await ctx.tokenize("The quick brown");
+    const tokens: number[] = await ctx.tokenize("The quick brown");
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(tokens);
 
     // Get the greedy token (what would be sampled without steer)
-    const greedyToken = branch.sample();
+    const greedyToken: number = branch.sample();
     assert(greedyToken >= 0, `Greedy sample → ${greedyToken}`);
 
     // Block the greedy token with steer
     branch.steer([{ token: greedyToken, bias: -Infinity }]);
 
     // Sample again - should get a different token
-    const steeredToken = branch.sample();
+    const steeredToken: number = branch.sample();
     assert(steeredToken !== greedyToken,
       `steer() blocks greedy: ${greedyToken} → ${steeredToken}`);
 
     // Clear steer - should get greedy token again
     branch.clearSteer();
-    const afterClear = branch.sample();
+    const afterClear: number = branch.sample();
     assert(afterClear === greedyToken,
       `clearSteer() restores greedy: ${afterClear} === ${greedyToken}`);
 
@@ -611,49 +612,49 @@ async function testBranchSteer() {
       { token: greedyToken, bias: -Infinity },
       { token: steeredToken, bias: -Infinity },
     ]);
-    const doubleBlocked = branch.sample();
+    const doubleBlocked: number = branch.sample();
     assert(doubleBlocked !== greedyToken && doubleBlocked !== steeredToken,
       `Multiple blocks: ${doubleBlocked} ≠ {${greedyToken}, ${steeredToken}}`);
 
     // Test boost (positive bias)
     branch.clearSteer();
     branch.steer([{ token: 42, bias: 100.0 }]);  // Massive boost to token 42
-    const boosted = branch.sample();
+    const boosted: number = branch.sample();
     assert(boosted === 42, `Boost token 42 → ${boosted}`);
 
     await branch.prune();
     ok('steer()/clearSteer() work correctly');
 
     // Test fork invariant: steer is NOT cloned on fork
-    const tokens2 = await ctx.tokenize("Hello world");
+    const tokens2: number[] = await ctx.tokenize("Hello world");
     const parent = Branch.create(ctx, 0, { temperature: 0 });
     await parent.prefill(tokens2);
 
-    const parentGreedy = parent.sample();
+    const parentGreedy: number = parent.sample();
 
     // Apply steer to parent - block the greedy token
     parent.steer([{ token: parentGreedy, bias: -Infinity }]);
-    const parentSteered = parent.sample();
+    const parentSteered: number = parent.sample();
     assert(parentSteered !== parentGreedy, `Parent steered: ${parentSteered} ≠ ${parentGreedy}`);
 
     // Fork from parent - child should NOT inherit steer
     const child = await parent.fork();
-    const childSample = child.sample();
+    const childSample: number = child.sample();
     assert(childSample === parentGreedy,
       `Fork does NOT inherit steer: child=${childSample} === greedy=${parentGreedy}`);
 
     // Verify parent still has steer active
-    const parentStillSteered = parent.sample();
+    const parentStillSteered: number = parent.sample();
     assert(parentStillSteered === parentSteered,
       `Parent retains steer after fork: ${parentStillSteered} === ${parentSteered}`);
 
     // Apply different steer to child - should not affect parent
     child.steer([{ token: 99, bias: 100.0 }]);
-    const childBoosted = child.sample();
+    const childBoosted: number = child.sample();
     assert(childBoosted === 99, `Child can set own steer: ${childBoosted} === 99`);
 
     // Parent should be unaffected by child's steer
-    const parentUnaffected = parent.sample();
+    const parentUnaffected: number = parent.sample();
     assert(parentUnaffected === parentSteered,
       `Parent unaffected by child steer: ${parentUnaffected} === ${parentSteered}`);
 
@@ -669,14 +670,14 @@ async function testBranchSteer() {
 // NBATCH ABLATION - Chunk size must not affect output
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testNBatchAblation() {
+async function testNBatchAblation(): Promise<void> {
   console.log('\n--- nBatch Ablation ---');
 
-  const nBatchValues = [32, 64, 128, 512];
-  const results = {};
+  const nBatchValues: number[] = [32, 64, 128, 512];
+  const results: Record<number, string> = {};
 
   for (const nBatch of nBatchValues) {
-    const ctx = await addon.createContext({
+    const ctx: SessionContext = await addon.createContext({
       modelPath: MODEL_PATH,
       nCtx: CTX_SIZE,
       nBatch,
@@ -684,16 +685,16 @@ async function testNBatchAblation() {
     });
 
     try {
-      const messages = [{ role: 'user', content: "Hello, how are you today?" }];
+      const messages: Array<{ role: string; content: string }> = [{ role: 'user', content: "Hello, how are you today?" }];
       const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-      const promptToks = await ctx.tokenize(prompt);
+      const promptToks: number[] = await ctx.tokenize(prompt);
       const branch = Branch.create(ctx, 0, { temperature: 0 }, nBatch);
       await branch.prefill(promptToks);
 
-      const followUp = await ctx.tokenize(" What else?");
+      const followUp: number[] = await ctx.tokenize(" What else?");
       await branch.prefill(followUp);
 
-      const gen = [];
+      const gen: number[] = [];
       for (let i = 0; i < 5; i++) {
         const { token, isStop } = await branch.produce();
         if (isStop) break;
@@ -708,8 +709,8 @@ async function testNBatchAblation() {
     }
   }
 
-  const ref = results[nBatchValues[0]];
-  let allMatch = true;
+  const ref: string = results[nBatchValues[0]];
+  let allMatch: boolean = true;
   for (const nb of nBatchValues) {
     if (results[nb] !== ref) allMatch = false;
   }
@@ -721,37 +722,37 @@ async function testNBatchAblation() {
 // TOKENIZER BEHAVIOR TESTS
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testTokenizer(ctx) {
+async function testTokenizer(ctx: SessionContext): Promise<void> {
   console.log('\n--- Tokenizer ---');
 
   // getEogToken
-  const eog = ctx.getEogToken();
+  const eog: number = ctx.getEogToken();
   assert(Number.isInteger(eog), `getEogToken() → ${eog}`);
   assert(ctx.isStopToken(eog), `EOS ${eog} is stop token`);
 
-  const eogText = ctx.tokenToText(eog);
+  const eogText: string = ctx.tokenToText(eog);
   assert(eogText.length > 0, `EOS text: "${eogText}"`);
 
   // tokenize with addSpecial
-  const withSpecial = await ctx.tokenize('Hello world', true);
-  const noSpecial = await ctx.tokenize('Hello world', false);
+  const withSpecial: number[] = await ctx.tokenize('Hello world', true);
+  const noSpecial: number[] = await ctx.tokenize('Hello world', false);
 
   assert(noSpecial.length <= withSpecial.length,
     `addSpecial=false (${noSpecial.length}) <= addSpecial=true (${withSpecial.length})`);
 
   // getTurnSeparator
-  const sep = ctx.getTurnSeparator();
+  const sep: number[] = ctx.getTurnSeparator();
   assert(Array.isArray(sep) && sep.length > 0, `getTurnSeparator() → [${sep.join(',')}]`);
 
-  const hasStop = sep.some(t => ctx.isStopToken(t));
+  const hasStop: boolean = sep.some((t: number) => ctx.isStopToken(t));
   assert(hasStop, 'Separator contains stop token');
 
-  const sepText = sep.map(t => ctx.tokenToText(t)).join('');
+  const sepText: string = sep.map((t: number) => ctx.tokenToText(t)).join('');
   ok(`Separator text: ${JSON.stringify(sepText)}`);
 
   // Caching
-  const sep2 = ctx.getTurnSeparator();
-  assert(sep.length === sep2.length && sep.every((t, i) => t === sep2[i]),
+  const sep2: number[] = ctx.getTurnSeparator();
+  assert(sep.length === sep2.length && sep.every((t: number, i: number) => t === sep2[i]),
     'getTurnSeparator() cached');
 }
 
@@ -759,25 +760,25 @@ async function testTokenizer(ctx) {
 // DETERMINISM TEST - Same prompt must produce identical output
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testDeterminism() {
+async function testDeterminism(): Promise<void> {
   console.log('\n--- Determinism ---');
 
-  async function generate(prompt) {
-    const ctx = await addon.createContext({
+  async function generate(prompt: string): Promise<string> {
+    const ctx: SessionContext = await addon.createContext({
       modelPath: MODEL_PATH,
       nCtx: CTX_SIZE,
       nThreads: 4
     });
 
     try {
-      const messages = [{ role: 'user', content: prompt }];
+      const messages: Array<{ role: string; content: string }> = [{ role: 'user', content: prompt }];
       const { prompt: formatted } = await ctx.formatChat(JSON.stringify(messages));
-      const tokens = await ctx.tokenize(formatted);
+      const tokens: number[] = await ctx.tokenize(formatted);
 
       const branch = Branch.create(ctx, 0, { temperature: 0 });
       await branch.prefill(tokens);
 
-      const gen = [];
+      const gen: number[] = [];
       for (let i = 0; i < 20; i++) {
         const { token, isStop } = await branch.produce();
         if (isStop) break;
@@ -791,9 +792,9 @@ async function testDeterminism() {
     }
   }
 
-  const prompt = "Count from 1 to 5.";
-  const run1 = await generate(prompt);
-  const run2 = await generate(prompt);
+  const prompt: string = "Count from 1 to 5.";
+  const run1: string = await generate(prompt);
+  const run2: string = await generate(prompt);
 
   assert(run1 === run2, `Deterministic: run1 === run2 (${run1.split(',').length} tokens)`);
 }
@@ -802,7 +803,7 @@ async function testDeterminism() {
 // EMBEDDING TESTS (optional)
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testEmbeddings() {
+async function testEmbeddings(): Promise<void> {
   if (!EMBED_MODEL_PATH) {
     console.log('\n--- Embeddings (SKIPPED - no LLAMA_EMBED_MODEL) ---');
     return;
@@ -811,7 +812,7 @@ async function testEmbeddings() {
   console.log('\n--- Embeddings ---');
   console.log(`  Model: ${path.basename(EMBED_MODEL_PATH)}`);
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: EMBED_MODEL_PATH,
     nCtx: 512,
     nBatch: 512,
@@ -823,28 +824,28 @@ async function testEmbeddings() {
   try {
     assert(ctx.hasPooling(), 'hasPooling() → true');
 
-    const dim = ctx.getEmbeddingDimension();
+    const dim: number = ctx.getEmbeddingDimension();
     assert(dim > 0, `getEmbeddingDimension() → ${dim}`);
 
-    async function embed(text) {
-      const tokens = await ctx.tokenize(text);
+    async function embed(text: string): Promise<Float32Array> {
+      const tokens: number[] = await ctx.tokenize(text);
       await ctx.kvCacheClear();
       await ctx.encode(tokens);
       return ctx.getEmbeddings(true);
     }
 
-    const emb1 = await embed("Hello world");
+    const emb1: Float32Array = await embed("Hello world");
     assert(emb1.length === dim, `embed("Hello world") → Float32Array(${emb1.length})`);
 
     // L2 norm should be ~1.0
-    let norm = 0;
+    let norm: number = 0;
     for (let i = 0; i < emb1.length; i++) norm += emb1[i] * emb1[i];
     norm = Math.sqrt(norm);
     assert(Math.abs(norm - 1.0) < 0.01, `L2 normalized: norm=${norm.toFixed(4)}`);
 
     // Cosine similarity
-    function cosine(a, b) {
-      let dot = 0, na = 0, nb = 0;
+    function cosine(a: Float32Array, b: Float32Array): number {
+      let dot: number = 0, na: number = 0, nb: number = 0;
       for (let i = 0; i < a.length; i++) {
         dot += a[i] * b[i];
         na += a[i] * a[i];
@@ -853,16 +854,16 @@ async function testEmbeddings() {
       return dot / (Math.sqrt(na) * Math.sqrt(nb));
     }
 
-    const emb1Copy = await embed("Hello world");
-    const simIdentical = cosine(emb1, emb1Copy);
+    const emb1Copy: Float32Array = await embed("Hello world");
+    const simIdentical: number = cosine(emb1, emb1Copy);
     assert(simIdentical > 0.99, `Identical texts similarity: ${simIdentical.toFixed(4)}`);
 
-    const embSimilar = await embed("The cat sat on the mat");
-    const embDifferent = await embed("Stock prices rose sharply");
-    const embCat = await embed("A cat rested on the rug");
+    const embSimilar: Float32Array = await embed("The cat sat on the mat");
+    const embDifferent: Float32Array = await embed("Stock prices rose sharply");
+    const embCat: Float32Array = await embed("A cat rested on the rug");
 
-    const simSimilar = cosine(embSimilar, embCat);
-    const simDifferent = cosine(embSimilar, embDifferent);
+    const simSimilar: number = cosine(embSimilar, embCat);
+    const simDifferent: number = cosine(embSimilar, embDifferent);
     assert(simSimilar > simDifferent,
       `Semantic: similar=${simSimilar.toFixed(3)} > different=${simDifferent.toFixed(3)}`);
   } finally {
@@ -874,31 +875,31 @@ async function testEmbeddings() {
 // BRANCH PREFILL + GET LOGITS (replaces testDecodeAndCapture)
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testBranchPrefillAndLogits() {
+async function testBranchPrefillAndLogits(): Promise<void> {
   console.log('\n--- Branch prefill + getLogits ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4
   });
 
   try {
-    const tokens = await ctx.tokenize("Hello");
+    const tokens: number[] = await ctx.tokenize("Hello");
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(tokens);
 
-    const logits = branch.getLogits();
-    let valid = false;
+    const logits: Float32Array = branch.getLogits();
+    let valid: boolean = false;
     for (let i = 0; i < logits.length; i++) {
       if (logits[i] !== 0 && !isNaN(logits[i])) valid = true;
     }
     assert(valid, `branch.prefill() + getLogits() → valid logits`);
 
     // Branch logits are an independent copy
-    const orig = logits[0];
+    const orig: number = logits[0];
     logits[0] = -999;
-    const logits2 = branch.getLogits();
+    const logits2: Float32Array = branch.getLogits();
     assert(logits2[0] !== -999, 'branch.getLogits() returns independent copy');
 
     await branch.prune();
@@ -911,12 +912,12 @@ async function testBranchPrefillAndLogits() {
 // MAIN
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testChatInOut(ctx) {
+async function testChatInOut(ctx: SessionContext): Promise<void> {
   console.log('\n── chat_in / chat_out ──');
 
   // formatChat with empty options object (new signature)
-  const messages = [{ role: 'user', content: 'Hello' }];
-  const result = await ctx.formatChat(JSON.stringify(messages), {});
+  const messages: Array<{ role: string; content: string }> = [{ role: 'user', content: 'Hello' }];
+  const result: FormattedChatResult = await ctx.formatChat(JSON.stringify(messages), {});
   assert(result.prompt.includes('Hello'), 'formatChat with options: prompt contains Hello');
   assert(typeof result.format === 'number', 'formatChat returns format as number');
   assert(typeof result.grammar === 'string', 'formatChat returns grammar as string');
@@ -928,12 +929,12 @@ async function testChatInOut(ctx) {
   ok('formatChat with options returns extended result');
 
   // Backward compat: string second argument still works
-  const backCompat = await ctx.formatChat(JSON.stringify(messages));
+  const backCompat: FormattedChatResult = await ctx.formatChat(JSON.stringify(messages));
   assert(backCompat.prompt.includes('Hello'), 'formatChat backward compat works');
   ok('formatChat backward compat (no second arg)');
 
   // formatChat with tools
-  const tools = [{
+  const tools: Array<{ type: string; function: { name: string; description: string; parameters: object } }> = [{
     type: 'function',
     function: {
       name: 'get_weather',
@@ -941,7 +942,7 @@ async function testChatInOut(ctx) {
       parameters: { type: 'object', properties: { location: { type: 'string' } } }
     }
   }];
-  const toolResult = await ctx.formatChat(JSON.stringify(messages), {
+  const toolResult: FormattedChatResult = await ctx.formatChat(JSON.stringify(messages), {
     tools: JSON.stringify(tools),
     toolChoice: 'auto'
   });
@@ -975,10 +976,10 @@ async function testChatInOut(ctx) {
 // wrapper surface and real-world workflows.
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testBranchStore() {
+async function testBranchStore(): Promise<void> {
   console.log('\n--- BranchStore ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nBatch: 512,
@@ -987,7 +988,7 @@ async function testBranchStore() {
   });
 
   try {
-    const promptToks = await ctx.tokenize("The quick brown fox jumps over the lazy");
+    const promptToks: number[] = await ctx.tokenize("The quick brown fox jumps over the lazy");
     const store = new BranchStore(ctx);
 
     // ── Test A: Best-of-N generation ──
@@ -998,24 +999,26 @@ async function testBranchStore() {
     {
       const root = Branch.create(ctx, 0, { temperature: 0.8 });
       await root.prefill(promptToks);
-      const branches = [root, await root.fork(), await root.fork()];
+      const branches: InstanceType<typeof Branch>[] = [root, await root.fork(), await root.fork()];
       branches[1].reseedSampler(42);
       branches[2].reseedSampler(99);
 
       for (let step = 0; step < 10; step++) {
-        const produced = await Promise.all(branches.map(async b => [b, await b.produce()]));
+        const produced: Array<[InstanceType<typeof Branch>, Produced]> = await Promise.all(
+          branches.map(async (b): Promise<[InstanceType<typeof Branch>, Produced]> => [b, await b.produce()])
+        );
         const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
-        await store.commit(live.map(([b, p]) => [b, p.token]));
+        await store.commit(live.map(([b, p]) => [b, p.token] as [InstanceType<typeof Branch>, number]));
       }
 
-      const ppls = branches.map(b => b.perplexity);
-      console.log(`  best-of-N perplexities: [${ppls.map(p => p.toFixed(2)).join(', ')}]`);
-      assert(ppls.every(p => isFinite(p) && p >= 1.0),
-        `best-of-N: all perplexities valid [${ppls.map(p => p.toFixed(2))}]`);
+      const ppls: number[] = branches.map((b) => b.perplexity);
+      console.log(`  best-of-N perplexities: [${ppls.map((p) => p.toFixed(2)).join(', ')}]`);
+      assert(ppls.every((p) => isFinite(p) && p >= 1.0),
+        `best-of-N: all perplexities valid [${ppls.map((p) => p.toFixed(2))}]`);
 
-      const best = ppls.reduce((a, b) => Math.min(a, b));
-      const worst = ppls.reduce((a, b) => Math.max(a, b));
+      const best: number = ppls.reduce((a, b) => Math.min(a, b));
+      const worst: number = ppls.reduce((a, b) => Math.max(a, b));
       console.log(`  [PASS] best-of-N: best=${best.toFixed(2)}, worst=${worst.toFixed(2)}`);
 
       await root.pruneSubtree();
@@ -1031,15 +1034,15 @@ async function testBranchStore() {
       const b2 = await b1.fork();
 
       // Phase 1: Rehydrate from "saved" histories
-      const history1 = await ctx.tokenize(" dog. The weather is nice today and I want to go", false);
-      const history2 = await ctx.tokenize(" cat. Let me explain how quantum entanglement works in", false);
+      const history1: number[] = await ctx.tokenize(" dog. The weather is nice today and I want to go", false);
+      const history2: number[] = await ctx.tokenize(" cat. Let me explain how quantum entanglement works in", false);
       await store.prefill([[b1, history1], [b2, history2]]);
 
       // Branches should be at different-length positions? No — same length coincidentally.
       // But logits must differ (different KV contents)
-      const logitsAfterPrefill1 = b1.getLogits();
-      const logitsAfterPrefill2 = b2.getLogits();
-      let prefillDiffer = false;
+      const logitsAfterPrefill1: Float32Array = b1.getLogits();
+      const logitsAfterPrefill2: Float32Array = b2.getLogits();
+      let prefillDiffer: boolean = false;
       for (let i = 0; i < logitsAfterPrefill1.length; i++) {
         if (logitsAfterPrefill1[i] !== logitsAfterPrefill2[i]) { prefillDiffer = true; break; }
       }
@@ -1047,19 +1050,19 @@ async function testBranchStore() {
         `rehydrate: different histories → different logits after prefill`);
 
       // Phase 2: Generate continuations
-      const gen1 = [], gen2 = [];
+      const gen1: number[] = [], gen2: number[] = [];
       for (let i = 0; i < 5; i++) {
-        const produced = [[b1, await b1.produce()], [b2, await b2.produce()]];
+        const produced: Array<[InstanceType<typeof Branch>, Produced]> = [[b1, await b1.produce()], [b2, await b2.produce()]];
         const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
-        await store.commit(live.map(([b, p]) => [b, p.token]));
+        await store.commit(live.map(([b, p]) => [b, p.token] as [InstanceType<typeof Branch>, number]));
         for (const [b, p] of live) {
           (b === b1 ? gen1 : gen2).push(p.token);
         }
       }
 
-      const text1 = await ctx.detokenize(gen1);
-      const text2 = await ctx.detokenize(gen2);
+      const text1: string = await ctx.detokenize(gen1);
+      const text2: string = await ctx.detokenize(gen2);
       console.log(`  rehydrate "weather" → "${text1}"`);
       console.log(`  rehydrate "quantum" → "${text2}"`);
 
@@ -1078,22 +1081,22 @@ async function testBranchStore() {
       const b1 = Branch.create(ctx, 0, { temperature: 0 });
       await b1.prefill(promptToks);
 
-      const logits = b1.getLogits();
+      const logits: Float32Array = b1.getLogits();
       assert(logits instanceof Float32Array,
         `getLogits: returns Float32Array`);
       assert(logits.length === ctx.vocabSize,
         `getLogits: length=${logits.length} === vocabSize=${ctx.vocabSize}`);
 
       // branch.modelEntropy — proves the logits snapshot is a valid distribution
-      const entropyFromBranch = b1.modelEntropy("nats");
+      const entropyFromBranch: number = b1.modelEntropy("nats");
       assert(isFinite(entropyFromBranch) && entropyFromBranch > 0,
         `branch.modelEntropy: ${entropyFromBranch.toFixed(4)} nats`);
 
       // After store.commit, logits change — branch reflects new state
-      const p = await b1.produce();
+      const p: Produced = await b1.produce();
       assert(!p.isStop, `modelEntropy: produce() should not hit EOG on first token`);
       await store.commit([[b1, p.token]]);
-      const entropyAfter = b1.modelEntropy("nats");
+      const entropyAfter: number = b1.modelEntropy("nats");
       assert(isFinite(entropyAfter),
         `modelEntropy after commit: entropy=${entropyAfter.toFixed(4)} nats`);
 
@@ -1109,10 +1112,10 @@ async function testBranchStore() {
       await b1.prefill(promptToks);
       const b2 = await b1.fork();
 
-      const output = [];
+      const output: string[] = [];
       for (let i = 0; i < 5; i++) {
         // Inspect with produce() — does NOT advance state
-        const p1 = await b1.produce(), p2 = await b2.produce();
+        const p1: Produced = await b1.produce(), p2: Produced = await b2.produce();
 
         // Can inspect text and isStop before committing
         assert(typeof p1.text === 'string' && typeof p2.text === 'string',
@@ -1143,27 +1146,27 @@ async function testBranchStore() {
 
       // Step 1-3: single-branch commit (decode::one path)
       for (let i = 0; i < 3; i++) {
-        const produced = [[b1, await b1.produce()], [b2, await b2.produce()]];
+        const produced: Array<[InstanceType<typeof Branch>, Produced]> = [[b1, await b1.produce()], [b2, await b2.produce()]];
         const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
         for (const [b, p] of live) await b.commit(p.token);
       }
-      const posAfterSingle = b1.position;
+      const posAfterSingle: number = b1.position;
 
       // Step 4-6: batched commit (decode::each path)
       for (let i = 0; i < 3; i++) {
-        const produced = [[b1, await b1.produce()], [b2, await b2.produce()]];
+        const produced: Array<[InstanceType<typeof Branch>, Produced]> = [[b1, await b1.produce()], [b2, await b2.produce()]];
         const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
-        await store.commit(live.map(([b, p]) => [b, p.token]));
+        await store.commit(live.map(([b, p]) => [b, p.token] as [InstanceType<typeof Branch>, number]));
       }
-      const posAfterBatched = b1.position;
+      const posAfterBatched: number = b1.position;
       assert(posAfterBatched === posAfterSingle + 3,
         `mixed ops: position correct after single→batched (${posAfterSingle}→${posAfterBatched})`);
 
       // Step 7-9: back to single-branch commit
       for (let i = 0; i < 3; i++) {
-        const produced = [[b1, await b1.produce()], [b2, await b2.produce()]];
+        const produced: Array<[InstanceType<typeof Branch>, Produced]> = [[b1, await b1.produce()], [b2, await b2.produce()]];
         const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
         for (const [b, p] of live) await b.commit(p.token);
@@ -1185,9 +1188,9 @@ async function testBranchStore() {
       await b1.prefill(promptToks);
       const b2 = await b1.fork();
 
-      const eog = ctx.getEogToken();
-      const gen1 = [], gen2 = [];
-      const stopped = [false, false];
+      const eog: number = ctx.getEogToken();
+      const gen1: number[] = [], gen2: number[] = [];
+      const stopped: [boolean, boolean] = [false, false];
 
       for (let step = 0; step < 8; step++) {
         // At step 3, force b1 to hit EOG
@@ -1195,9 +1198,9 @@ async function testBranchStore() {
           b1.steer([{ token: eog, bias: 100.0 }]);
         }
 
-        const pairs = [
-          ...(!stopped[0] ? [[b1, await b1.produce()]] : []),
-          ...(!stopped[1] ? [[b2, await b2.produce()]] : []),
+        const pairs: Array<[InstanceType<typeof Branch>, Produced]> = [
+          ...(!stopped[0] ? [[b1, await b1.produce()] as [InstanceType<typeof Branch>, Produced]] : []),
+          ...(!stopped[1] ? [[b2, await b2.produce()] as [InstanceType<typeof Branch>, Produced]] : []),
         ];
 
         const live = pairs.filter(([, p]) => !p.isStop);
@@ -1210,7 +1213,7 @@ async function testBranchStore() {
         }
 
         if (!live.length) break;
-        await store.commit(live.map(([b, p]) => [b, p.token]));
+        await store.commit(live.map(([b, p]) => [b, p.token] as [InstanceType<typeof Branch>, number]));
 
         for (const [b, p] of live) {
           (b === b1 ? gen1 : gen2).push(p.token);
@@ -1225,7 +1228,7 @@ async function testBranchStore() {
       assert(gen2.length > gen1.length,
         `independent EOG: b2 continued past b1's EOG (b1=${gen1.length}, b2=${gen2.length})`);
 
-      const text2 = await ctx.detokenize(gen2);
+      const text2: string = await ctx.detokenize(gen2);
       console.log(`  independent EOG: b1 stopped at step 3, b2 continued → "${text2}"`);
 
       // b2's position should reflect all its tokens, not be truncated by b1's stop
@@ -1243,19 +1246,19 @@ async function testBranchStore() {
 // PPL SANITY — commit() must produce sane perplexity (not millions)
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testPplSanity() {
+async function testPplSanity(): Promise<void> {
   console.log('\n--- PPL Sanity ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4
   });
 
   try {
-    const messages = [{ role: 'user', content: 'Tell me about the weather.' }];
+    const messages: Array<{ role: string; content: string }> = [{ role: 'user', content: 'Tell me about the weather.' }];
     const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-    const promptToks = await ctx.tokenize(prompt);
+    const promptToks: number[] = await ctx.tokenize(prompt);
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(promptToks);
 
@@ -1265,7 +1268,7 @@ async function testPplSanity() {
       await branch.commit(token);
     }
 
-    const ppl = branch.perplexity;
+    const ppl: number = branch.perplexity;
     console.log(`  perplexity after 10 commits: ${ppl.toFixed(2)}`);
     assert(isFinite(ppl) && ppl >= 1.0 && ppl < 1000,
       `PPL sanity: ${ppl.toFixed(2)} is in [1, 1000)`);
@@ -1280,14 +1283,14 @@ async function testPplSanity() {
 // COMMIT ROLLBACK — decode failure must restore sampler/grammar/metrics
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testCommitRollback() {
+async function testCommitRollback(): Promise<void> {
   console.log('\n--- Commit Rollback ---');
 
   // Tiny KV (nCtx=32) with many branches (nSeqMax=8). Each branch consumes
   // 1 KV cell per commit. With 8 branches and ~5 shared prefix cells, the
   // 32-cell budget exhausts after ~3 commits per branch. decode_each returns
   // non-zero (find_slot fails) → StoreCommitWorker throws → rollback fires.
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: 32,
     nBatch: 512,
@@ -1296,10 +1299,10 @@ async function testCommitRollback() {
   });
 
   try {
-    const promptToks = await ctx.tokenize("Hi");
+    const promptToks: number[] = await ctx.tokenize("Hi");
     const root = Branch.create(ctx, 0, { temperature: 1.0 });
     await root.prefill(promptToks);
-    const branches = [root];
+    const branches: InstanceType<typeof Branch>[] = [root];
     for (let i = 1; i < 8; i++) {
       const b = await root.fork();
       b.reseedSampler(1000 + i); // Divergent tokens → separate KV cells
@@ -1311,29 +1314,31 @@ async function testCommitRollback() {
     // Commit until decode fails from KV exhaustion
     // nCtx may be clamped to a model minimum (e.g. 256), so we need enough
     // rounds for 8 branches to exhaust ~256 cells: 256/8 = 32 rounds
-    let successfulRounds = 0;
-    let failedRound = false;
+    let successfulRounds: number = 0;
+    let failedRound: boolean = false;
     for (let round = 0; round < 50; round++) {
-      const produced = await Promise.all(branches.map(async b => [b, await b.produce()]));
+      const produced: Array<[InstanceType<typeof Branch>, Produced]> = await Promise.all(
+        branches.map(async (b): Promise<[InstanceType<typeof Branch>, Produced]> => [b, await b.produce()])
+      );
       const live = produced.filter(([, p]) => !p.isStop);
       if (!live.length) break;
 
       // Snapshot PPL before this round
-      const pplsBefore = live.map(([b]) => b.perplexity);
+      const pplsBefore: number[] = live.map(([b]) => b.perplexity);
 
       try {
-        await store.commit(live.map(([b, p]) => [b, p.token]));
+        await store.commit(live.map(([b, p]) => [b, p.token] as [InstanceType<typeof Branch>, number]));
         successfulRounds++;
       } catch {
         // Decode failed — verify PPL restored
-        const pplsAfter = live.map(([b]) => b.perplexity);
-        const allRestored = pplsBefore.every((p, i) => p === pplsAfter[i]);
+        const pplsAfter: number[] = live.map(([b]) => b.perplexity);
+        const allRestored: boolean = pplsBefore.every((p, i) => p === pplsAfter[i]);
         assert(allRestored,
           `rollback: all PPLs restored after decode failure at round ${round}`);
 
         // Branches still usable for single commits (1 token fits)
         const [b0, p0] = live[0];
-        const posBefore = b0.position;
+        const posBefore: number = b0.position;
         try {
           await b0.commit(p0.token);
           assert(b0.position === posBefore + 1,
@@ -1361,10 +1366,10 @@ async function testCommitRollback() {
 // ASYNC REJECTION — Worker failures must reject, branch state un-advanced
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testAsyncRejection() {
+async function testAsyncRejection(): Promise<void> {
   console.log('\n--- Async Rejection ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -1372,7 +1377,7 @@ async function testAsyncRejection() {
   });
 
   try {
-    const tokens = await ctx.tokenize("Hello world");
+    const tokens: number[] = await ctx.tokenize("Hello world");
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(tokens);
 
@@ -1380,56 +1385,56 @@ async function testAsyncRejection() {
     const { token, isStop } = await branch.produce();
     assert(!isStop, 'rejection: initial produce succeeds');
     await branch.commit(token);
-    const posAfterCommit = branch.position;
+    const posAfterCommit: number = branch.position;
 
     // Prune the branch — frees native resources
     await branch.prune();
     assert(branch.disposed, 'rejection: branch is disposed after prune');
 
     // commit() on disposed branch — _ensureNotDisposed should throw synchronously
-    let threwOnCommit = false;
+    let threwOnCommit: boolean = false;
     try {
       await branch.commit(token);
-    } catch (e) {
+    } catch (err) {
       threwOnCommit = true;
-      assert(e.message.includes('disposed'), `rejection: commit error says "disposed": "${e.message}"`);
+      assert((err as Error).message.includes('disposed'), `rejection: commit error says "disposed": "${(err as Error).message}"`);
     }
     assert(threwOnCommit, 'rejection: commit on disposed branch throws');
 
     // produce() on disposed branch — async version rejects
-    let threwOnProduce = false;
+    let threwOnProduce: boolean = false;
     try {
       await branch.produce();
-    } catch (e) {
+    } catch (err) {
       threwOnProduce = true;
     }
     assert(threwOnProduce, 'rejection: produce on disposed branch rejects');
 
     // produceSync() on disposed branch — throws synchronously
-    let threwOnProduceSync = false;
+    let threwOnProduceSync: boolean = false;
     try {
       branch.produceSync();
-    } catch (e) {
+    } catch (err) {
       threwOnProduceSync = true;
     }
     assert(threwOnProduceSync, 'rejection: produceSync on disposed branch throws');
 
     // fork() on disposed branch
-    let threwOnFork = false;
+    let threwOnFork: boolean = false;
     try {
       await branch.fork();
-    } catch (e) {
+    } catch (err) {
       threwOnFork = true;
     }
     assert(threwOnFork, 'rejection: fork on disposed branch throws');
 
     // Native AsyncWorker rejection: call _branchPrefill with invalid handle (0)
-    let nativeRejected = false;
+    let nativeRejected: boolean = false;
     try {
       await ctx._branchPrefill(0, [token]);
-    } catch (e) {
+    } catch (err) {
       nativeRejected = true;
-      assert(e instanceof Error, `rejection: native rejection is Error: ${e.constructor.name}`);
+      assert(err instanceof Error, `rejection: native rejection is Error: ${(err as Error).constructor.name}`);
     }
     assert(nativeRejected, 'rejection: invalid handle to AsyncWorker rejects promise');
   } finally {
@@ -1441,10 +1446,10 @@ async function testAsyncRejection() {
 // EMPTY INPUT EDGE CASES — Batch workers with empty arrays resolve cleanly
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testEmptyInputEdgeCases() {
+async function testEmptyInputEdgeCases(): Promise<void> {
   console.log('\n--- Empty Input Edge Cases ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -1452,12 +1457,12 @@ async function testEmptyInputEdgeCases() {
   });
 
   try {
-    const tokens = await ctx.tokenize("Hello world");
+    const tokens: number[] = await ctx.tokenize("Hello world");
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(tokens);
     const store = new BranchStore(ctx);
 
-    const posBefore = branch.position;
+    const posBefore: number = branch.position;
 
     // store.commit([]) — empty batch
     await store.commit([]);
@@ -1490,10 +1495,10 @@ async function testEmptyInputEdgeCases() {
 // JSON SCHEMA TO GRAMMAR — AsyncWorker with zero prior coverage
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testJsonSchemaToGrammar() {
+async function testJsonSchemaToGrammar(): Promise<void> {
   console.log('\n--- jsonSchemaToGrammar ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4
@@ -1510,17 +1515,17 @@ async function testJsonSchemaToGrammar() {
     };
 
     // Happy path: valid schema → GBNF string
-    const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
+    const grammar: string = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
     assert(typeof grammar === 'string' && grammar.length > 0,
       `jsonSchemaToGrammar: returned ${grammar.length}-char grammar`);
     assert(grammar.includes('root'), 'jsonSchemaToGrammar: grammar contains "root" rule');
 
     // Use the grammar with Branch.create to prove it's valid GBNF
-    const prompt = await ctx.tokenize("Output JSON: ");
+    const prompt: number[] = await ctx.tokenize("Output JSON: ");
     const branch = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar);
     await branch.prefill(prompt);
 
-    const output = [];
+    const output: string[] = [];
     for (let i = 0; i < 50; i++) {
       const { token, text, isStop } = await branch.produce();
       if (isStop) break;
@@ -1528,8 +1533,8 @@ async function testJsonSchemaToGrammar() {
       output.push(text);
     }
 
-    const result = output.join('');
-    let parsed;
+    const result: string = output.join('');
+    let parsed: { name: string; age: number } | undefined;
     try {
       parsed = JSON.parse(result);
     } catch {
@@ -1547,12 +1552,12 @@ async function testJsonSchemaToGrammar() {
     await branch.prune();
 
     // Error path: invalid JSON → promise rejects
-    let rejected = false;
+    let rejected: boolean = false;
     try {
       await ctx.jsonSchemaToGrammar('not valid json {{{');
-    } catch (e) {
+    } catch (err) {
       rejected = true;
-      assert(e instanceof Error, `jsonSchemaToGrammar: rejection is Error: ${e.constructor.name}`);
+      assert(err instanceof Error, `jsonSchemaToGrammar: rejection is Error: ${(err as Error).constructor.name}`);
     }
     assert(rejected, 'jsonSchemaToGrammar: invalid JSON rejects');
   } finally {
@@ -1564,10 +1569,10 @@ async function testJsonSchemaToGrammar() {
 // DISPOSED-DURING-ASYNC — _disposed set synchronously prevents use-after-prune
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testDisposedDuringAsync() {
+async function testDisposedDuringAsync(): Promise<void> {
   console.log('\n--- Disposed During Async ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -1575,7 +1580,7 @@ async function testDisposedDuringAsync() {
   });
 
   try {
-    const tokens = await ctx.tokenize("Test prompt");
+    const tokens: number[] = await ctx.tokenize("Test prompt");
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(tokens);
 
@@ -1584,13 +1589,13 @@ async function testDisposedDuringAsync() {
     await branch.commit(token);
 
     // Call prune() — DO NOT await yet
-    const prunePromise = branch.prune();
+    const prunePromise: Promise<void> = branch.prune();
 
     // Immediately (before microtask resolves) check disposed
     assert(branch.disposed, 'disposed-during: _disposed is true synchronously after prune() call');
 
     // produceSync() should throw synchronously
-    let threwProduce = false;
+    let threwProduce: boolean = false;
     try {
       branch.produceSync();
     } catch {
@@ -1599,7 +1604,7 @@ async function testDisposedDuringAsync() {
     assert(threwProduce, 'disposed-during: produceSync() throws before prune promise resolves');
 
     // commit() should throw synchronously (the _ensureNotDisposed guard)
-    let threwCommit = false;
+    let threwCommit: boolean = false;
     try {
       await branch.commit(token);
     } catch {
@@ -1623,10 +1628,10 @@ async function testDisposedDuringAsync() {
 // ASYNC ITERATOR — Branch as async iterable
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testAsyncIterator() {
+async function testAsyncIterator(): Promise<void> {
   console.log('\n--- Async Iterator ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -1634,13 +1639,13 @@ async function testAsyncIterator() {
   });
 
   try {
-    const prompt = await ctx.tokenize("The quick brown fox");
+    const prompt: number[] = await ctx.tokenize("The quick brown fox");
 
     // Generate to EOG via for-await
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(prompt);
 
-    const tokens = [];
+    const tokens: number[] = [];
     for await (const { token, text } of branch) {
       assert(typeof token === 'number' && typeof text === 'string',
         `iterator: yields {token, text} (token=${token})`);
@@ -1663,7 +1668,7 @@ async function testAsyncIterator() {
     await ctx.kvCacheClear();
     const branchManual = Branch.create(ctx, 0, { temperature: 0 });
     await branchManual.prefill(prompt);
-    const manualTokens = [];
+    const manualTokens: number[] = [];
     for (let i = 0; i < 10; i++) {
       const { token, isStop } = await branchManual.produce();
       if (isStop) break;
@@ -1672,7 +1677,7 @@ async function testAsyncIterator() {
     }
 
     assert(tokens.length === manualTokens.length &&
-      tokens.every((t, i) => t === manualTokens[i]),
+      tokens.every((t: number, i: number) => t === manualTokens[i]),
       'iterator: output matches manual produce/commit (deterministic)');
 
     await branchManual.prune();
@@ -1685,27 +1690,27 @@ async function testAsyncIterator() {
 // HOT-SWAP TESTS (setSamplerParams / setGrammar)
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testSetSamplerParams() {
+async function testSetSamplerParams(): Promise<void> {
   console.log('\n--- setSamplerParams ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
   });
 
   try {
-    const prompt = await ctx.tokenize("The capital of France is");
+    const prompt: number[] = await ctx.tokenize("The capital of France is");
 
     // Greedy baseline
     const greedy = Branch.create(ctx, 0, { temperature: 0, topK: 0, topP: 1.0, minP: 0 });
     await greedy.prefill(prompt);
-    const greedyTok = greedy.sample();
+    const greedyTok: number = greedy.sample();
     assert(greedyTok >= 0, `setSamplerParams: greedy token valid (${greedyTok})`);
 
     // Switch to stochastic — at high temp, should eventually diverge
     greedy.setSamplerParams({ temperature: 1.5, seed: 42, topK: 0, topP: 1.0, minP: 0 });
-    let diverged = false;
+    let diverged: boolean = false;
     for (let i = 0; i < 20; i++) {
       if (greedy.sample() !== greedyTok) { diverged = true; break; }
     }
@@ -1713,8 +1718,8 @@ async function testSetSamplerParams() {
 
     // Switch back to greedy — should be deterministic again
     greedy.setSamplerParams({ temperature: 0, topK: 0, topP: 1.0, minP: 0 });
-    const tok2 = greedy.sample();
-    const tok3 = greedy.sample();
+    const tok2: number = greedy.sample();
+    const tok3: number = greedy.sample();
     assert(tok2 === tok3, `setSamplerParams: greedy restored (${tok2} === ${tok3})`);
 
     await greedy.prune();
@@ -1732,10 +1737,10 @@ async function testSetSamplerParams() {
   }
 }
 
-async function testSetGrammar() {
+async function testSetGrammar(): Promise<void> {
   console.log('\n--- setGrammar ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -1743,23 +1748,23 @@ async function testSetGrammar() {
   });
 
   try {
-    const grammar = `root ::= "{" ws "}" ws
+    const grammar: string = `root ::= "{" ws "}" ws
 ws ::= [ \\t\\n]*`;
 
     // Hot-swap: create without grammar, then add one
-    const prompt = await ctx.tokenize("Output: ");
+    const prompt: number[] = await ctx.tokenize("Output: ");
     const branch = Branch.create(ctx, 0, { temperature: 0 });
     await branch.prefill(prompt);
 
     branch.setGrammar(grammar);
-    const output = [];
+    const output: string[] = [];
     for (let i = 0; i < 10; i++) {
       const { token, text, isStop } = await branch.produce();
       if (isStop) break;
       await branch.commit(token);
       output.push(text);
     }
-    const result = output.join('');
+    const result: string = output.join('');
     assert(/^\{\s*\}\s*$/.test(result), `setGrammar: hot-swap constrains → "${result}"`);
 
     // Remove grammar
@@ -1777,14 +1782,14 @@ ws ::= [ \\t\\n]*`;
     root.setGrammar(grammar);
 
     const child = await root.fork();
-    const childOut = [];
+    const childOut: string[] = [];
     for (let i = 0; i < 10; i++) {
-      const p = await child.produce();
+      const p: Produced = await child.produce();
       if (p.isStop) break;
       await child.commit(p.token);
       childOut.push(p.text);
     }
-    const childResult = childOut.join('');
+    const childResult: string = childOut.join('');
     assert(/^\{\s*\}\s*$/.test(childResult), `setGrammar: fork inherits grammar → "${childResult}"`);
 
     await child.prune();
@@ -1798,10 +1803,10 @@ ws ::= [ \\t\\n]*`;
 // BRANCH METRICS & LOGIT BIAS
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testBranchMetrics() {
+async function testBranchMetrics(): Promise<void> {
   console.log('\n--- Branch Metrics & Logit Bias ---');
 
-  const ctx = await addon.createContext({
+  const ctx: SessionContext = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
     nThreads: 4,
@@ -1809,30 +1814,30 @@ async function testBranchMetrics() {
   });
 
   try {
-    const tokens = await ctx.tokenize("The capital of France is");
+    const tokens: number[] = await ctx.tokenize("The capital of France is");
     const branch = Branch.create(ctx, 0, { temperature: 0.8, seed: 42 });
     await branch.prefill(tokens);
 
     // branch.modelEntropy
-    const entropy = branch.modelEntropy('nats');
+    const entropy: number = branch.modelEntropy('nats');
     assert(isFinite(entropy) && entropy >= 0, `branch.modelEntropy('nats') → ${entropy.toFixed(4)}`);
 
-    const entropyBits = branch.modelEntropy('bits');
+    const entropyBits: number = branch.modelEntropy('bits');
     assert(Math.abs(entropyBits - entropy / Math.log(2)) < 0.01,
       `branch.modelEntropy('bits') consistent with nats`);
 
     // branch.modelSurprisal
-    const token = branch.sample();
-    const surprisal = branch.modelSurprisal(token, 'nats');
+    const token: number = branch.sample();
+    const surprisal: number = branch.modelSurprisal(token, 'nats');
     assert(isFinite(surprisal) && surprisal >= 0,
       `branch.modelSurprisal(${token}, 'nats') → ${surprisal.toFixed(4)}`);
 
-    const surprisalBits = branch.modelSurprisal(token, 'bits');
+    const surprisalBits: number = branch.modelSurprisal(token, 'bits');
     assert(Math.abs(surprisalBits - surprisal / Math.log(2)) < 0.01,
       `branch.modelSurprisal bits consistent with nats`);
 
     // branch.samplingPerplexity — before any commits, must be Infinity
-    const pplBefore = branch.samplingPerplexity;
+    const pplBefore: number = branch.samplingPerplexity;
     assert(pplBefore === Infinity,
       `branch.samplingPerplexity before commit should be Infinity, got ${pplBefore}`);
 
@@ -1841,27 +1846,27 @@ async function testBranchMetrics() {
     const { token: t2 } = await branch.produce();
     await branch.commit(t2);
 
-    const pplAfter = branch.samplingPerplexity;
+    const pplAfter: number = branch.samplingPerplexity;
     assert(isFinite(pplAfter) && pplAfter >= 1.0,
       `branch.samplingPerplexity after commits → ${pplAfter.toFixed(4)}`);
 
     // setLogitBias — get greedy baseline, ban it, verify it changes
     const baseline = Branch.create(ctx, 0, { temperature: 0 });
     await baseline.prefill(tokens);
-    const bannedToken = baseline.sample();
+    const bannedToken: number = baseline.sample();
     await baseline.prune();
 
     const greedy = Branch.create(ctx, 0, { temperature: 0 });
     await greedy.prefill(tokens);
     greedy.setLogitBias([{ token: bannedToken, bias: -Infinity }]);
-    const alternative = greedy.sample();
+    const alternative: number = greedy.sample();
     assert(alternative !== bannedToken,
       `setLogitBias: banned token ${bannedToken} not sampled (got ${alternative})`);
 
     // clearLogitBias — after clearing, the greedy baseline token should come back
     const greedy2 = Branch.create(ctx, 0, { temperature: 0 });
     await greedy2.prefill(tokens);
-    const greedyToken = greedy2.sample();
+    const greedyToken: number = greedy2.sample();
     assert(greedyToken === bannedToken,
       `clearLogitBias: greedy token ${greedyToken} === baseline ${bannedToken}`);
 
@@ -1870,7 +1875,7 @@ async function testBranchMetrics() {
     await parent.prefill(tokens);
     parent.setLogitBias([{ token: bannedToken, bias: -Infinity }]);
     const child = await parent.fork();
-    const childToken = child.sample();
+    const childToken: number = child.sample();
     assert(childToken !== bannedToken,
       `setLogitBias cloned on fork: child doesn't sample banned token`);
 
@@ -1887,8 +1892,8 @@ async function testBranchMetrics() {
 // MAIN
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function main() {
-  let mainCtx = null;
+async function main(): Promise<void> {
+  let mainCtx: SessionContext | null = null;
 
   try {
     // Create main context for reusable tests
@@ -1942,8 +1947,8 @@ async function main() {
       process.exit(1);
     }
   } catch (err) {
-    console.error('\nFatal error:', err.message);
-    console.error(err.stack);
+    console.error('\nFatal error:', (err as Error).message);
+    console.error((err as Error).stack);
     process.exit(1);
   } finally {
     if (mainCtx) mainCtx.dispose();
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..40c7ec9
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,18 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "CommonJS",
+    "moduleResolution": "Node",
+    "lib": ["ES2022"],
+    "outDir": "dist",
+    "rootDir": "src",
+    "declaration": true,
+    "declarationMap": true,
+    "sourceMap": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "types": ["node"]
+  },
+  "include": ["src/**/*.ts"],
+  "exclude": ["node_modules", "dist", "build"]
+}
diff --git a/typedoc.json b/typedoc.json
index a638334..fb2354e 100644
--- a/typedoc.json
+++ b/typedoc.json
@@ -1,7 +1,7 @@
 {
   "$schema": "https://typedoc.org/schema.json",
   "plugin": ["typedoc-rhineai-theme"],
-  "entryPoints": ["lib/index.d.ts"],
+  "entryPoints": ["dist/index.d.ts"],
   "out": "docs/api",
   "name": "lloyal.node API Reference",
   "includeVersion": true,

From c1bd8afeec50579dea426e5a6110ff0d1e7d6bf8 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Mon, 23 Feb 2026 04:54:17 +1100
Subject: [PATCH 04/17] feat(agents): decoupled deep-research from monolith

---
 CMakeLists.txt                            |   18 +
 examples/deep-research/deep-research.ts   | 1009 ++++-----------------
 examples/deep-research/display.ts         |   22 +
 examples/deep-research/reranker.ts        |   88 ++
 examples/deep-research/resources/files.ts |   44 +
 examples/deep-research/resources/types.ts |   10 +
 examples/deep-research/tasks/eval.md      |    3 +
 examples/deep-research/tasks/eval.ts      |   54 ++
 examples/deep-research/tasks/plan.md      |    1 +
 examples/deep-research/tasks/plan.ts      |   57 ++
 examples/deep-research/tasks/research.md  |    1 +
 examples/deep-research/tasks/research.ts  |  126 +++
 examples/deep-research/tasks/verify.md    |    5 +
 examples/deep-research/tasks/verify.ts    |   91 ++
 examples/deep-research/tools/index.ts     |   28 +
 examples/deep-research/tools/read-file.ts |   39 +
 examples/deep-research/tools/report.ts    |   22 +
 examples/deep-research/tools/search.ts    |   23 +
 examples/deep-research/tools/types.ts     |   23 +
 src/Util.cpp                              |  193 ++++
 src/Util.hpp                              |   15 +
 src/binding.cpp                           |    4 +
 22 files changed, 1065 insertions(+), 811 deletions(-)
 create mode 100644 examples/deep-research/display.ts
 create mode 100644 examples/deep-research/reranker.ts
 create mode 100644 examples/deep-research/resources/files.ts
 create mode 100644 examples/deep-research/resources/types.ts
 create mode 100644 examples/deep-research/tasks/eval.md
 create mode 100644 examples/deep-research/tasks/eval.ts
 create mode 100644 examples/deep-research/tasks/plan.md
 create mode 100644 examples/deep-research/tasks/plan.ts
 create mode 100644 examples/deep-research/tasks/research.md
 create mode 100644 examples/deep-research/tasks/research.ts
 create mode 100644 examples/deep-research/tasks/verify.md
 create mode 100644 examples/deep-research/tasks/verify.ts
 create mode 100644 examples/deep-research/tools/index.ts
 create mode 100644 examples/deep-research/tools/read-file.ts
 create mode 100644 examples/deep-research/tools/report.ts
 create mode 100644 examples/deep-research/tools/search.ts
 create mode 100644 examples/deep-research/tools/types.ts
 create mode 100644 src/Util.cpp
 create mode 100644 src/Util.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 170a818..7f1a294 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,6 +116,21 @@ endif()
 # This also sets up the llama/llama.h include structure automatically
 add_subdirectory(${LIBLLOYAL_DIR} liblloyal)
 
+# =============================================================================
+# md4c (Markdown parser for structure extraction)
+# =============================================================================
+
+include(FetchContent)
+FetchContent_Declare(
+  md4c
+  GIT_REPOSITORY https://github.com/mity/md4c
+  GIT_TAG        release-0.5.2
+)
+set(BUILD_MD2HTML_EXECUTABLE OFF CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(md4c)
+FetchContent_GetProperties(md4c)
+set(MD4C_INCLUDE_DIR "${md4c_SOURCE_DIR}/src")
+
 # =============================================================================
 # Addon Sources
 # =============================================================================
@@ -124,6 +139,7 @@ set(ADDON_SOURCES
     src/binding.cpp
     src/BackendManager.cpp
     src/SessionContext.cpp
+    src/Util.cpp
 )
 
 # =============================================================================
@@ -136,6 +152,7 @@ add_library(${PROJECT_NAME} MODULE ${ADDON_SOURCES} ${CMAKE_JS_SRC})
 target_include_directories(${PROJECT_NAME} PRIVATE
     ${CMAKE_JS_INC}
     ${NODE_ADDON_API_DIR}
+    ${MD4C_INCLUDE_DIR}
     src
 )
 
@@ -147,6 +164,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE
 target_link_libraries(${PROJECT_NAME} PRIVATE
     liblloyal::liblloyal
     common
+    md4c
     ${CMAKE_JS_LIB}
 )
 
diff --git a/examples/deep-research/deep-research.ts b/examples/deep-research/deep-research.ts
index 8cb1f91..ecf6394 100644
--- a/examples/deep-research/deep-research.ts
+++ b/examples/deep-research/deep-research.ts
@@ -2,47 +2,32 @@
 /**
  * Deep Research with Tool-Calling Agents via BranchStore
  *
- * Demonstrates three fork patterns in a multi-agent research pipeline:
+ * Demonstrates composable fork patterns in a multi-agent research pipeline:
  *
- * 1. PLAN:     Branch.create() + grammar — single constrained generation
- * 2. RESEARCH: fork() + prefill() divergent suffixes — content-based divergence
- *              from shared prefix, with tool-calling agentic loop
- * 3. VERIFY:   fork() + reseed() same prompt — stochastic divergence for
- *              convergence checking, then model-as-judge eval fork
+ * - PLAN:     Branch.create() + grammar — constrained single generation
+ * - RESEARCH: fork() + prefill() divergent suffixes — parallel tool-calling agents
+ * - VERIFY:   fork() + reseed() — stochastic divergence for convergence checking
+ * - EVAL:     Branch.create() + grammar — model-as-judge
  *
- * Search uses a Qwen3-Reranker-0.6B cross-encoder for semantic relevance
- * scoring over a local corpus of markdown files. Both models (generative +
- * reranker) are loaded simultaneously — Qwen3 family shares vocabulary.
- *
- * The key performance insight: BranchStore.commit() packs N branches into
- * ONE llama_decode() call. N agents generate in lockstep with O(1) GPU
- * dispatches per step, regardless of branch count.
+ * Cold run composes: plan → research → verify → eval
+ * Warm follow-up composes: research(parent: trunk) → session.prefillUser → generate
  *
  * Usage:
- *   node deep-research.ts <model-path> --corpus <path> --query <text> [options]
- *
- * Required:
- *   <model-path>     Path to generative model (e.g. Qwen3-4B-Instruct)
- *   --corpus  path   Directory of .md files (or single .md file) to research
- *   --query   text   Research question
- *
- * Options:
- *   --reranker path  Reranker model path (default: qwen3-reranker-0.6b)
- *   --jsonl          JSONL output for testing
- *   --verbose        Show native llama.cpp logs
- *
- * Example:
- *   node deep-research.ts ./models/Qwen3-4B.gguf \
- *     --corpus ~/docs --query "How does the auth system work?"
+ *   node deep-research.ts [model-path] --corpus <path> [--query <text>] [options]
  */
 
 import * as fs from 'node:fs';
 import * as path from 'node:path';
 import * as readline from 'node:readline';
-import {
-  createContext, Branch, BranchStore, Session, forkAgent, runAgents,
-} from '../../dist/index.js';
-import type { SessionContext, AgentState } from '../../dist/index.js';
+import { createContext, BranchStore, Session } from '../../dist/index.js';
+import { c, log, emit, setJsonlMode, pad, fmtSize } from './display.js';
+import { loadResources, chunkResources } from './resources/files.js';
+import { createReranker } from './reranker.js';
+import { createTools } from './tools/index.js';
+import { plan } from './tasks/plan.js';
+import { research } from './tasks/research.js';
+import { verify } from './tasks/verify.js';
+import { evaluate } from './tasks/eval.js';
 
 // ================================================================
 // CLI ARGS
@@ -74,345 +59,41 @@ const flagIndices = new Set(
 
 const rerankModelPath = argVal('--reranker') || DEFAULT_RERANKER;
 const corpusDir = argVal('--corpus');
-const QUERY = argVal('--query');
+const initialQuery = argVal('--query');
 const modelPath = args.find((a, i) =>
   !a.startsWith('--') && !flagIndices.has(i)
 ) || DEFAULT_MODEL;
 
-if (!corpusDir || !QUERY) {
-  const missing = [
-    !corpusDir && '--corpus',
-    !QUERY && '--query',
-  ].filter(Boolean);
+if (!corpusDir) {
   process.stdout.write(
-    `Usage: node deep-research.ts [model-path] --corpus <path> --query <text> [--reranker <path>]\n` +
-    `Missing: ${missing.join(', ')}\n`
+    `Usage: node deep-research.ts [model-path] --corpus <path> [--query <text>] [--reranker <path>]\n` +
+    `Missing: --corpus\n`
   );
   process.exit(1);
 }
 
-// ================================================================
-// Suppress native llama.cpp logs (C-level stderr) for clean output.
-// The native binary hasn't loaded yet (lazy on first createContext),
-// so redirecting fd 2 here catches all ggml/llama init logs.
-// Use --verbose to see them.
-// ================================================================
+if (jsonlMode) setJsonlMode(true);
+
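+// Suppress native llama.cpp logs (C-level stderr): close fd 2 and reopen it on the null device before any context is created. Skipped with --verbose or in JSONL mode.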
 if (!verbose && !jsonlMode) {
   try {
     fs.closeSync(2);
     fs.openSync(process.platform === 'win32' ? '\\\\.\\NUL' : '/dev/null', 'w');
-  } catch { /* non-fatal — logs will show */ }
+  } catch { /* non-fatal */ }
 }
 
-// ================================================================
-// DISPLAY — ANSI formatting for terminal output
-// ================================================================
-
-const isTTY = process.stdout.isTTY;
-const c = isTTY ? {
-  bold: '\x1b[1m', dim: '\x1b[2m', reset: '\x1b[0m',
-  green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
-} : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' };
-
-const log = (...a: unknown[]): void => { if (!jsonlMode) console.log(...a); };
-
-function emit(event: string, data: Record<string, unknown>): void {
-  if (jsonlMode) console.log(JSON.stringify({ event, ...data }));
-}
-
-// ================================================================
-// CONSTANTS
-// ================================================================
-
 const AGENT_COUNT = 3;
 const VERIFY_COUNT = 3;
 const MAX_TOOL_TURNS = 6;
 
-// ================================================================
-// CORPUS — load and chunk at ## boundaries
-// ================================================================
-
-interface CorpusFile { name: string; content: string }
-interface Chunk { file: string; heading: string; text: string; tokens: number[] }
-interface SubChunk { heading: string; text: string }
-
-function loadCorpus(): CorpusFile[] {
-  if (!fs.existsSync(corpusDir!)) {
-    process.stdout.write(`Error: corpus not found: ${corpusDir}\n`);
-    process.exit(1);
-  }
-  const stat = fs.statSync(corpusDir!);
-  if (stat.isFile()) {
-    return [{ name: path.basename(corpusDir!), content: fs.readFileSync(corpusDir!, 'utf8') }];
-  }
-  const files = fs.readdirSync(corpusDir!).filter((f) => f.endsWith('.md'));
-  if (!files.length) {
-    process.stdout.write(`Error: no .md files in: ${corpusDir}\n`);
-    process.exit(1);
-  }
-  return files.map((f) => ({
-    name: f,
-    content: fs.readFileSync(path.join(corpusDir!, f), 'utf8'),
-  }));
-}
-
-// Max chars per chunk — conservative estimate at ~3 chars/token for code-heavy
-// content, leaving room for reranker template overhead (~130 tokens).
-// With reranker nCtx=8192: budget ≈ 8000 tokens × 3 = 24000 chars.
-const CHUNK_CHAR_LIMIT = 24000;
-
-function chunkCorpus(files: CorpusFile[]): Chunk[] {
-  const out: Chunk[] = [];
-  for (const file of files) {
-    for (const section of file.content.split(/(?=^## )/m)) {
-      const heading = (section.match(/^##?\s+(.+)/m) || [, file.name])[1]!;
-      const trimmed = section.trim();
-      if (trimmed.length <= CHUNK_CHAR_LIMIT) {
-        out.push({ file: file.name, heading, text: trimmed, tokens: [] });
-        continue;
-      }
-      // Sub-split oversized sections: ### → paragraph → hard truncate
-      for (const sub of subChunk(trimmed, heading)) {
-        out.push({ file: file.name, heading: sub.heading, text: sub.text, tokens: [] });
-      }
-    }
-  }
-  return out;
-}
-
-function subChunk(text: string, parentHeading: string): SubChunk[] {
-  // Try splitting at ### boundaries first
-  const subSections = text.split(/(?=^### )/m);
-  if (subSections.length > 1) {
-    const results: SubChunk[] = [];
-    for (const sub of subSections) {
-      const subHeading = (sub.match(/^###?\s+(.+)/m) || [, parentHeading])[1]!;
-      const trimmed = sub.trim();
-      if (trimmed.length <= CHUNK_CHAR_LIMIT) {
-        results.push({ heading: subHeading, text: trimmed });
-      } else {
-        // Still too large — fall through to paragraph splitting
-        results.push(...splitByParagraph(trimmed, subHeading));
-      }
-    }
-    return results;
-  }
-  // No ### headings — split by paragraphs
-  return splitByParagraph(text, parentHeading);
-}
-
-function splitByParagraph(text: string, heading: string): SubChunk[] {
-  const paragraphs = text.split(/\n\n+/);
-  const results: SubChunk[] = [];
-  let current = '';
-  let partIndex = 0;
-
-  for (const para of paragraphs) {
-    if (current.length + para.length + 2 > CHUNK_CHAR_LIMIT && current.length > 0) {
-      results.push({ heading: `${heading} (${++partIndex})`, text: current.trim() });
-      current = '';
-    }
-    // Single paragraph exceeds limit — hard truncate
-    if (para.length > CHUNK_CHAR_LIMIT) {
-      if (current.length > 0) {
-        results.push({ heading: `${heading} (${++partIndex})`, text: current.trim() });
-        current = '';
-      }
-      results.push({ heading: `${heading} (${++partIndex})`, text: para.slice(0, CHUNK_CHAR_LIMIT) });
-      continue;
-    }
-    current += (current ? '\n\n' : '') + para;
-  }
-  if (current.trim()) {
-    results.push({ heading: `${heading} (${partIndex > 0 ? ++partIndex : ''})`.replace(/ \(\)$/, ''), text: current.trim() });
-  }
-  return results;
-}
-
-const corpus = loadCorpus();
-const chunks: Chunk[] = chunkCorpus(corpus);
-
-// ================================================================
-// RERANKER — Qwen3-Reranker cross-encoder scoring via Branch API
-// ================================================================
-
-// Prompt template from Qwen3-Reranker model card: system (yes/no judge) +
-// user (<Instruct> + <Query> + <Document>) + empty think block prefix.
-const RERANK_PREFIX =
-  '<|im_start|>system\n' +
-  'Judge whether the Document meets the requirements based on the Query ' +
-  'and the Instruct provided. Note that the answer can only be "yes" or "no".' +
-  '<|im_end|>\n<|im_start|>user\n' +
-  '<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n\n' +
-  '<Query>: ';
-const RERANK_MID = '\n\n<Document>: ';
-const RERANK_SUFFIX = '<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n';
-
-let rerankCtx: SessionContext | null = null;
-let yesId = 0;
-let noId = 0;
-
-// Pre-tokenized template segments — populated after reranker loads.
-let rerankPrefixTokens: number[] | null = null; // RERANK_PREFIX (with BOS)
-let rerankMidTokens: number[] | null = null;    // RERANK_MID
-let rerankSuffixTokens: number[] | null = null; // RERANK_SUFFIX
-
-function rerankScore(logits: Float32Array): number {
-  const max = Math.max(logits[yesId], logits[noId]);
-  const yesExp = Math.exp(logits[yesId] - max);
-  const noExp = Math.exp(logits[noId] - max);
-  return yesExp / (yesExp + noExp);
-}
-
-// ================================================================
-// TOOLS — reranker-backed search + snippet extraction
-// ================================================================
-
-interface ScoredChunk { file: string; heading: string; score: number }
-
-async function toolSearch(query: string): Promise<ScoredChunk[]> {
-  const queryTokens = await rerankCtx!.tokenize(query, false);
-  const scored: ScoredChunk[] = [];
-  for (const chunk of chunks) {
-    // Pre-tokenized segments — no string concat, no per-chunk tokenize().
-    // Boundary safety: all joints are at special tokens or newlines,
-    // which are explicit token boundaries in Qwen3's BPE vocabulary.
-    const tokens = [
-      ...rerankPrefixTokens!, ...queryTokens,
-      ...rerankMidTokens!, ...chunk.tokens,
-      ...rerankSuffixTokens!,
-    ];
-    // Fresh branch per chunk — position must start at 0 each time.
-    const branch = Branch.create(rerankCtx!, 0, { temperature: 0 });
-    await branch.prefill(tokens);
-    const score = rerankScore(branch.getLogits());
-    await branch.prune();
-    scored.push({ file: chunk.file, heading: chunk.heading, score: Math.round(score * 1000) / 1000 });
-  }
-  return scored.sort((a, b) => b.score - a.score).slice(0, 5);
-}
-
-interface ReadFileResult {
-  file: string;
-  content?: string;
-  snippets?: string[];
-  error?: string;
-}
-
-function toolReadFile(filename: string, query: string): ReadFileResult | { error: string } {
-  const file = corpus.find((f) => f.name === filename);
-  if (!file) {
-    return { error: `File not found: ${filename}. Available: ${corpus.map((f) => f.name).join(', ')}` };
-  }
-  if (!query) return { file: file.name, content: file.content.slice(0, 800) };
-  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);
-  const lines = file.content.split('\n');
-  const snippets: string[] = [];
-  const seen = new Set<number>();
-  for (let i = 0; i < lines.length; i++) {
-    if (!terms.some((t) => lines[i].toLowerCase().includes(t))) continue;
-    const start = Math.max(0, i - 1);
-    const end = Math.min(lines.length, i + 4);
-    if (seen.has(start)) continue;
-    seen.add(start);
-    snippets.push(lines.slice(start, end).join('\n'));
-    if (snippets.length >= 3) break;
-  }
-  return snippets.length > 0
-    ? { file: file.name, snippets }
-    : { file: file.name, snippets: ['No matches for: ' + query] };
-}
-
-async function executeTool(name: string, toolArgs: Record<string, unknown>): Promise<unknown> {
-  switch (name) {
-    case 'search':
-      return toolSearch((toolArgs.query as string) || '');
-    case 'read_file':
-      return toolReadFile(
-        (toolArgs.filename as string) || (toolArgs.path as string) || '',
-        (toolArgs.query as string) || ''
-      );
-    case 'report':
-      return { acknowledged: true };
-    default:
-      return { error: `Unknown tool: ${name}` };
-  }
-}
-
-const TOOLS = [
-  {
-    type: 'function',
-    function: {
-      name: 'search',
-      description: 'Search the knowledge base for relevant content. Returns sections ranked by semantic relevance.',
-      parameters: {
-        type: 'object',
-        properties: { query: { type: 'string', description: 'Search query' } },
-        required: ['query'],
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'read_file',
-      description: 'Extract relevant snippets from a specific file. Use query to target specific content.',
-      parameters: {
-        type: 'object',
-        properties: {
-          filename: { type: 'string', description: 'Filename from search results (e.g. "api-security.md")' },
-          query: { type: 'string', description: 'What to extract from the file' },
-        },
-        required: ['filename'],
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'report',
-      description: 'Submit your final research findings. Call this when you have gathered enough information to answer the question.',
-      parameters: {
-        type: 'object',
-        properties: { findings: { type: 'string', description: 'Your research findings and answer' } },
-        required: ['findings'],
-      },
-    },
-  },
-];
-
-const TOOLS_JSON = JSON.stringify(TOOLS);
-
-const AGENT_SYSTEM_PROMPT =
-  'You are a research assistant with access to a knowledge base. ' +
-  'Use the search and read_file tools to find information, then call report with your findings. ' +
-  'Be thorough: search first, read relevant files, then report. ' +
-  'Available files: ' + corpus.map((f) => f.name).join(', ');
-
-// ================================================================
-// HELPERS
-// ================================================================
-
-const sec = (a: number, b: number): string => ((b - a) / 1000).toFixed(1);
-const pad = (s: unknown, n: number): string => String(s).padStart(n);
-const fmtSize = (bytes: number): string => bytes > 1e9
-  ? (bytes / 1e9).toFixed(1) + ' GB'
-  : (bytes / 1e6).toFixed(0) + ' MB';
-
 // ================================================================
 // MAIN
 // ================================================================
 
-interface Attempt {
-  branch: InstanceType<typeof Branch>;
-  output: string;
-  done: boolean;
-  tokenCount: number;
-  ppl: number;
-}
-
 async function main(): Promise<void> {
-  const t0 = performance.now();
+  // Resources
+  const resources = loadResources(corpusDir!);
+  const chunks = chunkResources(resources);
 
   const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, '');
   const rerankName = path.basename(rerankModelPath).replace(/-q\w+\.gguf$/i, '');
@@ -423,437 +104,221 @@ async function main(): Promise<void> {
   log(`${c.bold}  Deep Research${c.reset} ${c.dim}— BranchStore Tool-Calling Agents${c.reset}`);
   log();
 
-  emit('start', {
-    model: path.basename(modelPath),
-    reranker: path.basename(rerankModelPath),
-    query: QUERY!,
-    agentCount: AGENT_COUNT,
-    verifyCount: VERIFY_COUNT,
-    chunks: chunks.length,
-  });
-
   log(`  ${c.green}●${c.reset} Loading ${c.bold}${modelName}${c.reset} ${c.dim}(${modelSize}, KV: Q4_0)${c.reset}`);
 
-  // Load generative model
   const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '16384', 10);
   const ctx = await createContext({
-    modelPath,
-    nCtx,
+    modelPath, nCtx,
     nSeqMax: AGENT_COUNT + 1,
-    typeK: 'q4_0',
-    typeV: 'q4_0',
+    typeK: 'q4_0', typeV: 'q4_0',
   });
 
   log(`  ${c.green}●${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${rerankSize}, reranker)${c.reset}`);
 
-  // Load reranker (small — ~300 MB alongside the 4B generative model)
-  rerankCtx = await createContext({
-    modelPath: rerankModelPath,
-    nCtx: 8192,
-    nSeqMax: AGENT_COUNT,
-  });
-
-  // Pre-tokenize reranker template segments + chunk texts.
-  // Done once — saves N_chunks × tokenize() calls per search.
-  [yesId] = await rerankCtx.tokenize('yes', false);
-  [noId] = await rerankCtx.tokenize('no', false);
-  rerankPrefixTokens = await rerankCtx.tokenize(RERANK_PREFIX, true);
-  rerankMidTokens = await rerankCtx.tokenize(RERANK_MID, false);
-  rerankSuffixTokens = await rerankCtx.tokenize(RERANK_SUFFIX, false);
-  for (const chunk of chunks) {
-    chunk.tokens = await rerankCtx.tokenize(chunk.text, false);
-  }
+  const reranker = await createReranker(rerankModelPath, { nSeqMax: AGENT_COUNT });
+  await reranker.tokenizeChunks(chunks);
 
-  const corpusIsFile = corpus.length === 1 && fs.statSync(corpusDir!).isFile();
+  const corpusIsFile = resources.length === 1 && fs.statSync(corpusDir!).isFile();
   const corpusLabel = corpusIsFile
     ? path.basename(corpusDir!)
-    : `${path.basename(corpusDir!)}/ — ${corpus.length} files`;
+    : `${path.basename(corpusDir!)}/ — ${resources.length} files`;
   log(`  ${c.dim}  Corpus: ${corpusLabel} → ${chunks.length} chunks${c.reset}`);
 
+  const { toolsJson, executeTool } = createTools({ resources, chunks, reranker });
   const store = new BranchStore(ctx);
+  const session = new Session({ ctx, store });
 
-  log();
-  log(`  ${c.dim}Query${c.reset}`);
-  log(`  ${c.bold}${QUERY}${c.reset}`);
-
-  // ================================================================
-  // PHASE 1: PLAN — Branch.create() + grammar
-  // ================================================================
-  const tPlan = performance.now();
-
-  const planSchema = {
-    type: 'object',
-    properties: {
-      questions: {
-        type: 'array',
-        items: { type: 'string' },
-        minItems: 2,
-        maxItems: AGENT_COUNT,
-      },
-    },
-    required: ['questions'],
+  // Tool call display — shared by the cold and warm paths: emits a `tool_call` event, then logs a one-line summary
+  const onToolCall = (ai: number, toolName: string, argsStr: string): void => {
+    emit('tool_call', { agentIndex: ai, toolName, arguments: argsStr });
+    let toolArgs: Record<string, unknown> = {}; // model-generated args: tolerate invalid JSON and non-object payloads (e.g. `null`)
+    try { const parsed: unknown = JSON.parse(argsStr); if (parsed && typeof parsed === 'object') toolArgs = parsed as Record<string, unknown>; } catch { /* keep {} */ }
+    const argSummary = toolName === 'search'
+      ? `"${toolArgs.query || ''}"`
+      : toolName === 'report' ? ''
+      : `${toolArgs.filename}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : '');
+    log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+  };
+  const onToolResult = (ai: number, toolName: string, resultStr: string): void => { // tool-result display: emits a `tool_result` event, then logs one line
+    emit('tool_result', {
+      agentIndex: ai, toolName,
+      result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr, // event payload is capped at 200 chars plus '...'
+    });
+    log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`); // reports the full, untruncated string length (UTF-16 code units, labelled `b`)
   };
-  const planGrammar = await ctx.jsonSchemaToGrammar(JSON.stringify(planSchema));
-
-  const planMessages = [
-    { role: 'system', content: 'You break research queries into sub-questions. Output JSON only.' },
-    { role: 'user', content: `Break this into ${AGENT_COUNT} independent sub-questions for parallel research: "${QUERY}"` },
-  ];
-  const { prompt: planPrompt } = await ctx.formatChat(JSON.stringify(planMessages));
-  const planTokens = await ctx.tokenize(planPrompt);
-
-  const lead = Branch.create(ctx, 0, { temperature: 0.3 }, undefined, planGrammar);
-  await lead.prefill(planTokens);
-
-  let planOutput = '';
-  let planTokenCount = 0;
-  for await (const { text } of lead) {
-    planOutput += text;
-    planTokenCount++;
-  }
-  await lead.prune();
-
-  let questions: string[];
-  try {
-    const plan = JSON.parse(planOutput);
-    questions = plan.questions.slice(0, AGENT_COUNT);
-    if (!questions.length) throw new Error('empty questions');
-  } catch {
-    questions = Array.from({ length: AGENT_COUNT }, (_, i) => `${QUERY} (aspect ${i + 1})`);
-  }
-
-  emit('plan', { questions, planTokens: planTokenCount });
 
   // ================================================================
-  // PHASE 2: RESEARCH — fork() + prefill() divergent suffixes + tools
+  // handleQuery — the orchestrator
+  //
+  // No trunk yet (session.trunk unset) → cold: plan → research → verify → eval, then promote the best branch
+  // Trunk exists → warm: research(parent: trunk) → prefillUser → generate
   // ================================================================
-  const tResearch = performance.now();
 
-  log();
-  log(`  ${c.green}●${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${planTokenCount} tok · ${sec(tPlan, tResearch)}s${c.reset}`);
-  questions.forEach((q, i) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
-
-  // Shared prefix: system prompt + tool definitions, NO assistant prompt
-  const sharedMessages = [{ role: 'system', content: AGENT_SYSTEM_PROMPT }];
-  const sharedFmt = await ctx.formatChat(
-    JSON.stringify(sharedMessages),
-    { tools: TOOLS_JSON, addGenerationPrompt: false }
-  );
-  const sharedTokens = await ctx.tokenize(sharedFmt.prompt);
-
-  // Root branch — prefill shared prefix once
-  const agentRoot = Branch.create(ctx, 0, { temperature: 0.5 });
-  await agentRoot.prefill(sharedTokens);
-
-  // Fork N agents, compute divergent suffixes via token slicing
-  const agents: AgentState[] = [];
-  for (const q of questions) {
-    const branch = await agentRoot.fork();
-
-    const fullMessages = [
-      { role: 'system', content: AGENT_SYSTEM_PROMPT },
-      { role: 'user', content: q },
-    ];
-    const fmt = await ctx.formatChat(JSON.stringify(fullMessages), { tools: TOOLS_JSON });
-    const fullTokens = await ctx.tokenize(fmt.prompt);
-    const suffixTokens = fullTokens.slice(sharedTokens.length);
-
-    agents.push({
-      branch,
-      suffixTokens,
-      fmt: {
-        format: fmt.format,
-        reasoningFormat: fmt.reasoningFormat,
-        thinkingForcedOpen: fmt.thinkingForcedOpen,
-        parser: fmt.parser,
-      },
-      rawOutput: '',
-      done: false,
-      tokenCount: 0,
-      toolCallCount: 0,
-      turns: 0,
-      findings: null,
-    });
-  }
-  // agentRoot pruned after agents are done (can't prune parent with live children)
+  async function handleQuery(query: string): Promise<void> {
+    if (!session.trunk) {
+      // ─── cold: plan → research → verify → eval ─────────
+      const t0 = performance.now();
 
-  // Batched prefill — only the unique suffixes
-  await store.prefill(agents.map((w) => [w.branch, w.suffixTokens]));
+      emit('start', {
+        model: path.basename(modelPath), reranker: path.basename(rerankModelPath),
+        query, agentCount: AGENT_COUNT, verifyCount: VERIFY_COUNT, chunks: chunks.length,
+      });
 
-  emit('research_start', {
-    agentCount: agents.length,
-    sharedPrefixTokens: sharedTokens.length,
-  });
+      log();
+      log(`  ${c.dim}Query${c.reset}`);
+      log(`  ${c.bold}${query}${c.reset}`);
 
-  log();
-  log(`  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${agents.length} agents · shared prefix ${sharedTokens.length} tok${c.reset}`);
-
-  // Reranker mutex — serializes llama_decode calls on rerankCtx.
-  // Fire-and-forget tool dispatch means multiple agents can dispatch search
-  // concurrently; _branchPrefill runs on the libuv thread pool, so concurrent
-  // calls race llama_decode on the same llama_context. BranchStore serializes
-  // via batched decode (one llama_decode per commit/prefill), but individual
-  // Branch.prefill calls on rerankCtx bypass that.
-  let rerankLock = Promise.resolve();
-  function withRerankLock<T>(fn: () => Promise<T>): Promise<T> {
-    const prev = rerankLock;
-    let release: () => void;
-    rerankLock = new Promise((r) => { release = r; });
-    return prev.then(fn).finally(release!);
-  }
+      // ─── query → questions ────────────────────────────
+      let t = performance.now();
 
-  const executeToolLocked = (name: string, args: Record<string, unknown>): Promise<unknown> =>
-    name === 'search'
-      ? withRerankLock(() => executeTool(name, args))
-      : executeTool(name, args);
-
-  const { totalTokens: totalAgentTokens, totalToolCalls, steps: researchSteps, counters } =
-    await runAgents(agents, {
-      store, ctx,
-      executeTool: executeToolLocked,
-      maxTurns: MAX_TOOL_TURNS,
-      onToolCall(ai: number, toolName: string, args: string) {
-        emit('tool_call', { agentIndex: ai, toolName, arguments: args });
-        let toolArgs: Record<string, string>;
-        try { toolArgs = JSON.parse(args); } catch { toolArgs = {}; }
-        const argSummary = toolName === 'search'
-          ? `"${toolArgs.query || ''}"`
-          : toolName === 'report' ? ''
-          : toolArgs.filename + (toolArgs.query ? `, "${toolArgs.query}"` : '');
-        log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
-      },
-      onToolResult(ai: number, toolName: string, resultStr: string) {
-        emit('tool_result', {
-          agentIndex: ai, toolName,
-          result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr,
-        });
-        log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`);
-      },
-    });
+      const { questions, tokenCount: planTokens } = await plan(ctx, {
+        query, agentCount: AGENT_COUNT,
+      });
 
-  for (let i = 0; i < agents.length; i++) {
-    const w = agents[i];
-    const isLast = i === agents.length - 1;
-    const branchChar = isLast ? '└' : '├';
-
-    emit('agent_done', {
-      index: i,
-      question: questions[i],
-      findings: (w.findings || '').slice(0, 500),
-      toolCalls: w.toolCallCount,
-      turns: w.turns,
-      tokenCount: w.tokenCount,
-    });
+      emit('plan', { questions, planTokens });
+      const planMs = performance.now() - t;
+      log(`\n  ${c.green}●${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${planTokens} tok · ${(planMs / 1000).toFixed(1)}s${c.reset}`);
+      questions.forEach((q, i) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
 
-    log(`    ${c.dim}${branchChar}${c.reset} ${c.yellow}${i}${c.reset} ${c.green}done${c.reset} ${c.dim}${w.tokenCount} tok · ${w.toolCallCount} tools${c.reset}`);
+      // ─── questions → findings ─────────────────────────
+      t = performance.now();
+      log(`\n  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${questions.length} agents${c.reset}`);
 
-    await w.branch.prune();
-  }
-  await agentRoot.prune();
+      const researchResult = await research(ctx, store, {
+        questions, toolsJson, executeTool,
+        maxTurns: MAX_TOOL_TURNS, onToolCall, onToolResult,
+      });
 
-  // ================================================================
-  // PHASE 3: VERIFY — fork() + reseed() + eval fork
-  // ================================================================
-  const tVerify = performance.now();
+      const researchMs = performance.now() - t;
+      researchResult.agents.forEach((a, i) => {
+        const tree = i === researchResult.agents.length - 1 ? '└' : '├';
+        emit('agent_done', { index: i, question: questions[i], findings: (a.findings || '').slice(0, 500), toolCalls: a.toolCallCount, tokenCount: a.tokenCount });
+        log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${i}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok · ${a.toolCallCount} tools${c.reset}`);
+      });
+      log(`    ${c.dim}${researchResult.totalTokens} tok · ${researchResult.totalToolCalls} tools · ${(researchMs / 1000).toFixed(1)}s${c.reset}`);
 
-  log(`    ${c.dim}${totalAgentTokens} tok · ${totalToolCalls} tools · ${sec(tResearch, tVerify)}s${c.reset}`);
+      // ─── findings → attempts ──────────────────────────
+      t = performance.now();
 
-  const findingsText = agents
-    .map((w, i) => `Q: ${questions[i]}\nA: ${(w.findings || '').trim()}`)
-    .join('\n\n');
+      const findingsText = researchResult.agents
+        .map((a, i) => `Q: ${questions[i]}\nA: ${(a.findings || '').trim()}`)
+        .join('\n\n');
 
-  const synthMessages = [
-    { role: 'system', content: 'Synthesize the research findings into a coherent, concise summary.' },
-    { role: 'user', content: `Research findings:\n\n${findingsText}\n\nSynthesize these into a brief summary answering: "${QUERY}"` },
-  ];
-  const { prompt: synthPrompt } = await ctx.formatChat(JSON.stringify(synthMessages));
-  const synthTokens = await ctx.tokenize(synthPrompt);
+      log(`\n  ${c.green}●${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${VERIFY_COUNT} attempts${c.reset}`);
 
-  const synthRoot = Branch.create(ctx, 0, { temperature: 0.7 });
-  await synthRoot.prefill(synthTokens);
+      const verifyResult = await verify(ctx, store, {
+        findings: findingsText, query, count: VERIFY_COUNT,
+      });
 
-  emit('verify_start', {
-    attemptCount: VERIFY_COUNT,
-    prefixTokens: synthTokens.length,
-  });
+      const verifyMs = performance.now() - t;
+      verifyResult.attempts.forEach((a, i) => {
+        const tree = i === verifyResult.attempts.length - 1 ? '└' : '├';
+        emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl });
+        log(`    ${c.dim}${tree} ${a.tokenCount} tok · ppl ${a.ppl.toFixed(2)}${c.reset}`);
+      });
+      log(`    ${c.dim}${verifyResult.totalTokens} tok · ${(verifyMs / 1000).toFixed(1)}s${c.reset}`);
 
-  log();
-  log(`  ${c.green}●${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${VERIFY_COUNT} attempts · shared prefix ${synthTokens.length} tok${c.reset}`);
+      // ─── attempts → convergence ───────────────────────
+      t = performance.now();
 
-  const attempts: Attempt[] = [];
-  for (let i = 0; i < VERIFY_COUNT; i++) {
-    const branch = await synthRoot.fork();
-    branch.reseedSampler(2000 + i);
-    attempts.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity });
-  }
-  // synthRoot pruned after attempts are done (can't prune parent with live children)
-
-  let verifySteps = 0;
-  for (;;) {
-    const entries: [InstanceType<typeof Branch>, number][] = [];
-    for (const a of attempts) {
-      if (a.done) continue;
-      const { token, text, isStop } = a.branch.produceSync();
-      if (isStop) {
-        const p = a.branch.perplexity;
-        a.ppl = Number.isFinite(p) ? p : Infinity;
-        a.done = true;
-        continue;
-      }
-      entries.push([a.branch, token]);
-      a.output += text;
-      a.tokenCount++;
-    }
-    if (entries.length === 0) break;
-    await store.commit(entries);
-    verifySteps++;
-  }
-
-  const totalVerifyTokens = attempts.reduce((s, a) => s + a.tokenCount, 0);
-  for (let i = 0; i < attempts.length; i++) {
-    const isLast = i === attempts.length - 1;
-    const branchChar = isLast ? '└' : '├';
+      const { converged, tokenCount: evalTokens } = await evaluate(ctx, {
+        attempts: verifyResult.attempts,
+      });
 
-    emit('attempt_done', {
-      index: i,
-      output: attempts[i].output.trim().slice(0, 500),
-      tokenCount: attempts[i].tokenCount,
-      ppl: attempts[i].ppl,
-    });
+      const evalMs = performance.now() - t;
+      emit('convergence', { converged, evalTokens });
+      const verdict = converged === true ? `${c.green}yes${c.reset}` : converged === false ? `${c.red}no${c.reset}` : `${c.yellow}unknown${c.reset}`;
+      log(`\n  ${c.green}●${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${evalTokens} tok · ${(evalMs / 1000).toFixed(1)}s${c.reset}`);
+      log(`    Converged: ${verdict}`);
+
+      // ─── result ───────────────────────────────────────
+      const tEnd = performance.now();
+      const totalTokens = planTokens + researchResult.totalTokens + verifyResult.totalTokens + evalTokens;
+
+      log(`\n  ${c.dim}${'─'.repeat(58)}${c.reset}\n`);
+      const prose = verifyResult.bestOutput.trim()
+        .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`)
+        .split('\n').map((l) => `  ${l}`).join('\n');
+      log(prose);
+
+      emit('complete', {
+        planTokens, agentTokens: researchResult.totalTokens,
+        researchSteps: researchResult.steps,
+        verifyTokens: verifyResult.totalTokens, verifySteps: verifyResult.steps,
+        evalTokens, converged,
+        totalToolCalls: researchResult.totalToolCalls,
+        prefixTokens: verifyResult.prefixLength,
+        sharedPrefixTokens: researchResult.sharedPrefixLength,
+        agentCount: questions.length, attemptCount: verifyResult.attempts.length,
+        wallTimeMs: Math.round(tEnd - t0),
+        planMs: Math.round(planMs), researchMs: Math.round(researchMs),
+        verifyMs: Math.round(verifyMs), evalMs: Math.round(evalMs),
+        ...researchResult.counters,
+      });
 
-    log(`    ${c.dim}${branchChar} ${attempts[i].tokenCount} tok · ppl ${attempts[i].ppl.toFixed(2)}${c.reset}`);
-  }
+      log(`\n  ${c.dim}${'━'.repeat(58)}${c.reset}`);
+      log(`  ${c.dim}Plan       ${pad(planTokens, 5)} tok${' '.repeat(30)}${pad((planMs / 1000).toFixed(1), 6)}s${c.reset}`);
+      log(`  ${c.dim}Research   ${pad(researchResult.totalTokens, 5)} tok  (${researchResult.agents.map((a) => a.tokenCount).join(' + ')})  ${pad(researchResult.totalToolCalls, 2)} tools  ${pad((researchMs / 1000).toFixed(1), 6)}s${c.reset}`);
+      log(`  ${c.dim}Verify     ${pad(verifyResult.totalTokens, 5)} tok  (${verifyResult.attempts.map((a) => a.tokenCount).join(' + ')})${' '.repeat(11)}${pad((verifyMs / 1000).toFixed(1), 6)}s${c.reset}`);
+      log(`  ${c.dim}Eval       ${pad(evalTokens, 5)} tok  converged: ${(converged === true ? 'yes' : converged === false ? 'no' : 'unknown').padEnd(14)}${pad((evalMs / 1000).toFixed(1), 6)}s${c.reset}`); // a non-boolean verdict prints "unknown" (same three states as the Eval verdict line) instead of "no"
+      const kvSaved = researchResult.sharedPrefixLength * (questions.length - 1) + verifyResult.prefixLength * (verifyResult.attempts.length - 1);
+      log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
+      log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok  ${c.dim}${questions.length} agents · ${researchResult.totalToolCalls} tools${c.reset}         ${c.bold}${pad(((tEnd - t0) / 1000).toFixed(1), 6)}s${c.reset}`);
+      log(`  ${c.dim}KV shared    ${researchResult.sharedPrefixLength} × ${questions.length - 1} + ${verifyResult.prefixLength} × ${verifyResult.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved${c.reset}`);
+      log();
+
+      await session.promote(verifyResult.bestBranch);
+    } else {
+      // ─── warm: research → findings → grounded response ─
+      log(`  ${c.dim}  researching...${c.reset}`);
 
-  // Pick lowest perplexity synthesis (most coherent) — same as best-of-n.mjs
-  // Selected before pruning so we can keep the best branch alive for follow-up.
-  const bestAttempt = attempts.reduce((a, b) => a.ppl <= b.ppl ? a : b);
-
-  for (const a of attempts) { if (a !== bestAttempt) await a.branch.prune(); }
-  // synthRoot stays alive until interactive loop ends — forked children share
-  // physical KV entries with the parent via seq_id tags.
-
-  // Eval fork — model-as-judge
-  const tEval = performance.now();
-
-  log(`    ${c.dim}${totalVerifyTokens} tok · ${sec(tVerify, tEval)}s${c.reset}`);
-
-  const responsesText = attempts
-    .map((a, i) => `Response ${i + 1}: ${a.output.trim()}`)
-    .join('\n\n');
-
-  const evalMessages = [
-    {
-      role: 'system',
-      content: 'You are a consistency checker. Compare the responses and determine if they convey the same core meaning. Output JSON only.',
-    },
-    {
-      role: 'user',
-      content: `Do these responses agree on the key points?\n\n${responsesText}`,
-    },
-  ];
-
-  const evalSchema = {
-    type: 'object',
-    properties: { converged: { type: 'boolean' } },
-    required: ['converged'],
-  };
-  const evalGrammar = await ctx.jsonSchemaToGrammar(JSON.stringify(evalSchema));
+      const followUp = await research(ctx, store, {
+        questions: Array(AGENT_COUNT).fill(query),
+        parent: session.trunk!,
+        seed: Date.now(),
+        toolsJson, executeTool,
+        maxTurns: MAX_TOOL_TURNS, onToolCall, onToolResult,
+      });
 
-  const { prompt: evalPrompt } = await ctx.formatChat(JSON.stringify(evalMessages));
-  const evalTokens = await ctx.tokenize(evalPrompt);
+      log(`  ${c.dim}  ${followUp.totalToolCalls} tools · ${followUp.totalTokens} tok${c.reset}`);
 
-  const evalBranch = Branch.create(ctx, 0, { temperature: 0 }, undefined, evalGrammar);
-  await evalBranch.prefill(evalTokens);
+      const agentFindings = followUp.agents
+        .map((a, i) => a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
+        .filter(Boolean)
+        .join('\n\n');
 
-  let evalOutput = '';
-  let evalTokenCount = 0;
-  for await (const { text } of evalBranch) {
-    evalOutput += text;
-    evalTokenCount++;
-  }
-  await evalBranch.prune();
+      const groundedContent = agentFindings
+        ? `Research findings:\n${agentFindings}\n\nUser question: ${query}\n\nAnswer based on the research findings above.`
+        : query;
+      await session.prefillUser(groundedContent);
 
-  let converged: boolean | null;
-  try {
-    converged = JSON.parse(evalOutput).converged;
-  } catch {
-    converged = null;
+      process.stdout.write(`  ${c.dim}<${c.reset} `);
+      for await (const { text } of session.trunk!) {
+        process.stdout.write(text);
+      }
+      console.log('\n');
+    }
   }
 
-  emit('convergence', { evalOutput, evalTokens: evalTokenCount, converged });
-
   // ================================================================
-  // COMPLETE
+  // REPL — single input loop drives both cold and warm paths
   // ================================================================
-  const tEnd = performance.now();
-
-  const verdict = converged === true ? `${c.green}yes${c.reset}` : converged === false ? `${c.red}no${c.reset}` : `${c.yellow}unknown${c.reset}`;
-  log();
-  log(`  ${c.green}●${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${evalTokenCount} tok · ${sec(tEval, tEnd)}s${c.reset}`);
-  log(`    Converged: ${verdict}`);
-
-  log();
-  log(`  ${c.dim}${'─'.repeat(58)}${c.reset}`);
-  log();
-  const prose = bestAttempt.output.trim()
-    .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`)
-    .split('\n').map((l) => `  ${l}`).join('\n');
-  log(prose);
-  log();
-
-  const totalTokens = planTokenCount + totalAgentTokens + totalVerifyTokens + evalTokenCount;
-
-  emit('complete', {
-    planTokens: planTokenCount,
-    agentTokens: totalAgentTokens,
-    researchSteps,
-    verifyTokens: totalVerifyTokens,
-    verifySteps,
-    evalTokens: evalTokenCount,
-    converged,
-    totalToolCalls,
-    prefixTokens: synthTokens.length,
-    sharedPrefixTokens: sharedTokens.length,
-    agentCount: questions.length,
-    attemptCount: attempts.length,
-    wallTimeMs: Math.round(tEnd - t0),
-    planMs: Math.round(tResearch - tPlan),
-    researchMs: Math.round(tVerify - tResearch),
-    verifyMs: Math.round(tEval - tVerify),
-    evalMs: Math.round(tEnd - tEval),
-    ...counters,
-  });
-
-  log();
-  log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
-  log(`  ${c.dim}Plan       ${pad(planTokenCount, 5)} tok${' '.repeat(30)}${pad(sec(tPlan, tResearch), 6)}s${c.reset}`);
-  log(`  ${c.dim}Research   ${pad(totalAgentTokens, 5)} tok  (${agents.map((w) => w.tokenCount).join(' + ')})  ${pad(totalToolCalls, 2)} tools  ${pad(sec(tResearch, tVerify), 6)}s${c.reset}`);
-  log(`  ${c.dim}Verify     ${pad(totalVerifyTokens, 5)} tok  (${attempts.map((a) => a.tokenCount).join(' + ')})${' '.repeat(11)}${pad(sec(tVerify, tEval), 6)}s${c.reset}`);
-  log(`  ${c.dim}Eval       ${pad(evalTokenCount, 5)} tok  converged: ${converged ? 'yes' : 'no'}${' '.repeat(11)}${pad(sec(tEval, tEnd), 6)}s${c.reset}`);
-  const kvSaved = sharedTokens.length * (agents.length - 1) + synthTokens.length * (attempts.length - 1);
-  log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
-  log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok  ${c.dim}${agents.length} agents · ${totalToolCalls} tools${c.reset}         ${c.bold}${pad(sec(t0, tEnd), 6)}s${c.reset}`);
-  log(`  ${c.dim}KV shared    ${sharedTokens.length} × ${agents.length - 1} + ${synthTokens.length} × ${attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved${c.reset}`);
-  log();
 
-  if (jsonlMode) {
-    await bestAttempt.branch.prune();
-    await synthRoot.prune();
-    rerankCtx!.dispose();
+  // --query with --jsonl: run cold pipeline, emit results, exit
+  if (jsonlMode && initialQuery) {
+    await handleQuery(initialQuery);
+    await session.dispose();
+    reranker.dispose();
     ctx.dispose();
     return;
   }
 
-  // ================================================================
-  // INTERACTIVE — readline follow-up loop with agent-swarm research
-  // ================================================================
-
-  // Session manages trunk lifecycle — promote crowns winner, freeing
-  // AGENT_COUNT seq_ids for follow-up research agents.
-  const session = new Session({ ctx, store });
-  await session.promote(bestAttempt.branch);
+  // --query without --jsonl: run it as the first turn, then continue into the REPL
+  if (initialQuery) {
+    await handleQuery(initialQuery);
+  }
 
-  log(`  ${c.dim}Ask a follow-up question or /quit to exit${c.reset}`);
+  log(`  ${c.dim}${session.trunk ? 'Ask a follow-up question' : 'Enter your research question'} or /quit to exit${c.reset}`);
   log();
 
   await new Promise<void>((resolve) => {
@@ -867,7 +332,7 @@ async function main(): Promise<void> {
       exiting = true;
       rl.close();
       await session.dispose();
-      rerankCtx!.dispose();
+      reranker.dispose();
       ctx.dispose();
       resolve();
     }
@@ -879,87 +344,14 @@ async function main(): Promise<void> {
 
     async function handleInput(input: string): Promise<void> {
       try {
-      const trimmed = input.trim();
-      if (!trimmed || trimmed === '/quit') {
-        await exit();
-        return;
-      }
-
-      generating = true;
-
-      // Fork AGENT_COUNT research agents from the conversation trunk.
-      // Each agent inherits full conversation KV (back-references resolve
-      // naturally), gets reseeded for search diversity.
-      log(`  ${c.dim}  researching...${c.reset}`);
-
-      const followUpAgents: AgentState[] = [];
-      for (let i = 0; i < AGENT_COUNT; i++) {
-        const agent = await forkAgent(session.trunk!, {
-          systemPrompt: AGENT_SYSTEM_PROMPT,
-          content: trimmed,
-          tools: TOOLS_JSON,
-          seed: Date.now() + i,
-        }, ctx);
-        followUpAgents.push(agent);
-      }
+        const trimmed = input.trim();
+        if (!trimmed || trimmed === '/quit') { await exit(); return; }
 
-      // Batch prefill all agents' divergent suffixes
-      await store.prefill(followUpAgents.map((a) => [a.branch, a.suffixTokens]));
-
-      // Run parallel research with batched decode
-      const swarmResult = await runAgents(followUpAgents, {
-        store, ctx,
-        executeTool: executeToolLocked,
-        maxTurns: MAX_TOOL_TURNS,
-        onToolCall(ai: number, toolName: string, args: string) {
-          emit('tool_call', { agentIndex: ai, toolName, arguments: args });
-          let toolArgs: Record<string, string>;
-          try { toolArgs = JSON.parse(args); } catch { toolArgs = {}; }
-          const argSummary = toolName === 'search'
-            ? `"${toolArgs.query || ''}"`
-            : toolName === 'report' ? ''
-            : toolArgs.filename + (toolArgs.query ? `, "${toolArgs.query}"` : '');
-          log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
-        },
-        onToolResult(ai: number, toolName: string, resultStr: string) {
-          emit('tool_result', { agentIndex: ai, toolName,
-            result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr });
-          log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`);
-        },
-      });
-
-      log(`  ${c.dim}  ${swarmResult.totalToolCalls} tools · ${swarmResult.totalTokens} tok${c.reset}`);
-
-      // Collect findings from all agents
-      const agentFindings = followUpAgents
-        .map((a, i) => a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
-        .filter(Boolean)
-        .join('\n\n');
-
-      // Prune all agent branches — their findings are captured
-      for (const a of followUpAgents) await a.branch.prune();
-
-      // Format findings + question as user turn, prefill into trunk via Session
-      const groundedContent = agentFindings
-        ? `Research findings:\n${agentFindings}\n\nUser question: ${trimmed}\n\nAnswer based on the research findings above.`
-        : trimmed;
-
-      await session.prefillUser(groundedContent);
-
-      // Generate grounded response
-      process.stdout.write(`  ${c.dim}<${c.reset} `);
-      for await (const { text } of session.trunk!) {
-        process.stdout.write(text);
-      }
-      console.log('\n');
-
-      generating = false;
+        generating = true;
+        await handleQuery(trimmed);
+        generating = false;
 
-      if (eofWhileGenerating) {
-        await exit();
-      } else {
-        ask();
-      }
+        if (eofWhileGenerating) { await exit(); } else { ask(); }
       } catch (err) {
         log(`  ${c.red}Error: ${(err as Error).message}${c.reset}`);
         generating = false;
@@ -968,18 +360,13 @@ async function main(): Promise<void> {
     }
 
     rl.on('close', () => {
-      if (generating) {
-        eofWhileGenerating = true;
-      } else {
-        exit();
-      }
+      if (generating) { eofWhileGenerating = true; } else { exit(); }
     });
     ask();
   });
 }
 
 main().catch((err: unknown) => {
-  // stderr is redirected in quiet mode — use stdout for errors
   process.stdout.write(`Error: ${(err as Error).message}\n${(err as Error).stack}\n`);
   process.exit(1);
 });
diff --git a/examples/deep-research/display.ts b/examples/deep-research/display.ts
new file mode 100644
index 0000000..54e967a
--- /dev/null
+++ b/examples/deep-research/display.ts
@@ -0,0 +1,22 @@
+let _jsonlMode = false; // module-level flag: when true, log() is silenced and emit() writes JSON lines
+
+export function setJsonlMode(on: boolean): void { _jsonlMode = on; } // switches between human-readable and JSONL output
+
+const isTTY = process.stdout.isTTY; // read once at module load
+
+export const c = isTTY ? { // ANSI escape sequences when stdout is a TTY
+  bold: '\x1b[1m', dim: '\x1b[2m', reset: '\x1b[0m',
+  green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
+} : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' }; // same keys with empty strings otherwise
+
+export const log = (...a: unknown[]): void => { if (!_jsonlMode) console.log(...a); }; // console.log wrapper; no-op in JSONL mode
+
+export function emit(event: string, data: Record<string, unknown>): void { // writes one JSON object per line; no-op outside JSONL mode
+  if (_jsonlMode) console.log(JSON.stringify({ event, ...data })); // `data` is spread last, so a `data.event` key would override `event`
+}
+
+export const sec = (a: number, b: number): string => ((b - a) / 1000).toFixed(1); // (b - a) / 1000 to one decimal, i.e. seconds when a and b are millisecond timestamps
+export const pad = (s: unknown, n: number): string => String(s).padStart(n); // left-pads String(s) with spaces to width n
+export const fmtSize = (bytes: number): string => bytes > 1e9 // decimal units: above 1e9 bytes → GB with one decimal
+  ? (bytes / 1e9).toFixed(1) + ' GB'
+  : (bytes / 1e6).toFixed(0) + ' MB'; // otherwise rounded to whole MB
diff --git a/examples/deep-research/reranker.ts b/examples/deep-research/reranker.ts
new file mode 100644
index 0000000..367c112
--- /dev/null
+++ b/examples/deep-research/reranker.ts
@@ -0,0 +1,88 @@
+import { createContext, Branch } from '../../dist/index.js';
+import type { Chunk } from './resources/types.js';
+import type { Reranker, ScoredChunk } from './tools/types.js';
+
+const RERANK_PREFIX =
+  '<|im_start|>system\n' +
+  'Judge whether the Document meets the requirements based on the Query ' +
+  'and the Instruct provided. Note that the answer can only be "yes" or "no".' +
+  '<|im_end|>\n<|im_start|>user\n' +
+  '<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n\n' +
+  '<Query>: ';
+const RERANK_MID = '\n\n<Document>: ';
+const RERANK_SUFFIX = '<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n';
+
+/** Loads the reranker model and returns a scorer rating chunks by P("yes") from the yes/no logits. */
+export async function createReranker(
+  modelPath: string,
+  opts?: { nSeqMax?: number },
+): Promise<Reranker> {
+  const nCtx = 16384; // context size; also bounds the per-document token budget in score()
+  const ctx = await createContext({ modelPath, nCtx, nSeqMax: opts?.nSeqMax ?? 3 });
+
+  const [yesId] = await ctx.tokenize('yes', false);
+  const [noId] = await ctx.tokenize('no', false);
+  const prefixTokens = await ctx.tokenize(RERANK_PREFIX, true);
+  const midTokens = await ctx.tokenize(RERANK_MID, false);
+  const suffixTokens = await ctx.tokenize(RERANK_SUFFIX, false);
+
+  // Two-way softmax over the "yes"/"no" logits, shifted by their max for numerical stability.
+  function rerankScore(logits: Float32Array): number {
+    const max = Math.max(logits[yesId], logits[noId]);
+    const yesExp = Math.exp(logits[yesId] - max);
+    const noExp = Math.exp(logits[noId] - max);
+    return yesExp / (yesExp + noExp);
+  }
+
+  // Serialize access — concurrent Branch.prefill on the same llama_context
+  // races llama_decode. BranchStore serializes via batched decode, but
+  // individual Branch.prefill calls on the reranker bypass that.
+  let lock = Promise.resolve();
+
+  return {
+    // Scores every chunk against the query; resolves to the five best, highest score first.
+    async score(query: string, chunks: Chunk[]): Promise<ScoredChunk[]> {
+      const prev = lock;
+      let release!: () => void;
+      lock = new Promise<void>((r) => { release = r; });
+      await prev;
+      try {
+        const queryTokens = await ctx.tokenize(query, false);
+        // Clamp at 0: a negative budget would make slice(0, budget) trim from the end, not empty the doc.
+        const budget = Math.max(0, nCtx - prefixTokens.length - queryTokens.length
+                     - midTokens.length - suffixTokens.length);
+        const scored: ScoredChunk[] = [];
+        for (const chunk of chunks) {
+          const docTokens = chunk.tokens.length > budget ? chunk.tokens.slice(0, budget) : chunk.tokens;
+          const tokens = [...prefixTokens, ...queryTokens, ...midTokens, ...docTokens, ...suffixTokens];
+          const branch = Branch.create(ctx, 0, { temperature: 0 });
+          try {
+            await branch.prefill(tokens);
+            const score = rerankScore(branch.getLogits());
+            scored.push({
+              file: chunk.resource, heading: chunk.heading,
+              score: Math.round(score * 1000) / 1000,
+              startLine: chunk.startLine, endLine: chunk.endLine,
+            });
+          } finally {
+            await branch.prune(); // release the branch even when prefill/getLogits throws
+          }
+        }
+        return scored.sort((a, b) => b.score - a.score).slice(0, 5);
+      } finally {
+        release();
+      }
+    },
+
+    // Fills chunk.tokens in place using the reranker's tokenizer.
+    async tokenizeChunks(chunks: Chunk[]): Promise<void> {
+      for (const chunk of chunks) {
+        chunk.tokens = await ctx.tokenize(chunk.text, false);
+      }
+    },
+
+    dispose(): void {
+      ctx.dispose();
+    },
+  };
+}
diff --git a/examples/deep-research/resources/files.ts b/examples/deep-research/resources/files.ts
new file mode 100644
index 0000000..7200c26
--- /dev/null
+++ b/examples/deep-research/resources/files.ts
@@ -0,0 +1,44 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { loadBinary } from '../../../dist/index.js';
+import type { Resource, Chunk } from './types.js';
+
+interface Section { heading: string; level: number; startLine: number; endLine: number }
+const { parseMarkdown } = loadBinary() as unknown as { parseMarkdown(text: string): Section[] };
+
+export function loadResources(dir: string): Resource[] { // `dir` is a directory of .md files or a single file
+  if (!fs.existsSync(dir)) {
+    process.stdout.write(`Error: corpus not found: ${dir}\n`);
+    process.exit(1); // fatal: terminates the process rather than throwing
+  }
+  const stat = fs.statSync(dir);
+  if (stat.isFile()) { // single file: loaded as-is, without the .md extension filter
+    return [{ name: path.basename(dir), content: fs.readFileSync(dir, 'utf8') }];
+  }
+  const files = fs.readdirSync(dir).filter((f) => f.endsWith('.md')); // top-level entries only (no recursion)
+  if (!files.length) {
+    process.stdout.write(`Error: no .md files in: ${dir}\n`);
+    process.exit(1);
+  }
+  return files.map((f) => ({
+    name: f,
+    content: fs.readFileSync(path.join(dir, f), 'utf8'),
+  }));
+}
+
+export function chunkResources(resources: Resource[]): Chunk[] { // one Chunk per non-empty section from parseMarkdown
+  const out: Chunk[] = [];
+  for (const res of resources) {
+    const sections = parseMarkdown(res.content);
+    const lines = res.content.split('\n');
+    for (const sec of sections) {
+      const text = lines.slice(sec.startLine - 1, sec.endLine).join('\n').trim(); // startLine..endLine: 1-indexed, inclusive
+      if (!text) continue; // whitespace-only sections produce no chunk
+      out.push({
+        resource: res.name, heading: sec.heading || res.name, text, tokens: [], // tokens start empty; headingless sections use the file name
+        startLine: sec.startLine, endLine: sec.endLine,
+      });
+    }
+  }
+  return out;
+}
diff --git a/examples/deep-research/resources/types.ts b/examples/deep-research/resources/types.ts
new file mode 100644
index 0000000..17242b1
--- /dev/null
+++ b/examples/deep-research/resources/types.ts
@@ -0,0 +1,10 @@
+export interface Resource { name: string; content: string }
+
+export interface Chunk {
+  resource: string;
+  heading: string;
+  text: string;
+  tokens: number[];
+  startLine: number;
+  endLine: number;
+}
diff --git a/examples/deep-research/tasks/eval.md b/examples/deep-research/tasks/eval.md
new file mode 100644
index 0000000..c408002
--- /dev/null
+++ b/examples/deep-research/tasks/eval.md
@@ -0,0 +1,3 @@
+Do these responses agree on the key points?
+
+{{responses}}
\ No newline at end of file
diff --git a/examples/deep-research/tasks/eval.ts b/examples/deep-research/tasks/eval.ts
new file mode 100644
index 0000000..9eb3079
--- /dev/null
+++ b/examples/deep-research/tasks/eval.ts
@@ -0,0 +1,54 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { Branch } from '../../../dist/index.js';
+import type { SessionContext } from '../../../dist/index.js';
+
+const EVAL_PROMPT = fs.readFileSync(path.resolve(__dirname, 'eval.md'), 'utf8');
+
+/**
+ * Asks the model (grammar-constrained JSON, temperature 0) whether the attempts agree; `converged` is null unless the output is JSON with a boolean `converged`.
+ */
+export async function evaluate(ctx: SessionContext, opts: {
+  attempts: { output: string }[];
+}): Promise<{ converged: boolean | null; tokenCount: number }> {
+  const responsesText = opts.attempts
+    .map((a, i) => `Response ${i + 1}: ${a.output.trim()}`)
+    .join('\n\n');
+
+  // Function replacer: a string replacement would expand "$&", "$'" or "$$" found in model output.
+  const userContent = EVAL_PROMPT.replace('{{responses}}', () => responsesText);
+
+  const messages = [
+    { role: 'system', content: 'You are a consistency checker. Compare the responses and determine if they convey the same core meaning. Output JSON only.' },
+    { role: 'user', content: userContent },
+  ];
+
+  const evalSchema = { type: 'object', properties: { converged: { type: 'boolean' } }, required: ['converged'] };
+  const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(evalSchema));
+
+  const { prompt } = await ctx.formatChat(JSON.stringify(messages));
+  const tokens = await ctx.tokenize(prompt);
+
+  const branch = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar);
+  let output = '';
+  let tokenCount = 0;
+  try {
+    await branch.prefill(tokens);
+    for await (const { text } of branch) {
+      output += text;
+      tokenCount++;
+    }
+  } finally {
+    await branch.prune(); // release the branch even when prefill or generation throws
+  }
+
+  let converged: boolean | null = null;
+  try {
+    const flag = (JSON.parse(output) as { converged?: unknown } | null)?.converged;
+    if (typeof flag === 'boolean') converged = flag;
+  } catch {
+    // Unparseable output: keep converged as null.
+  }
+
+  return { converged, tokenCount };
+}
diff --git a/examples/deep-research/tasks/plan.md b/examples/deep-research/tasks/plan.md
new file mode 100644
index 0000000..8f94bd7
--- /dev/null
+++ b/examples/deep-research/tasks/plan.md
@@ -0,0 +1 @@
+Break this into {{count}} independent sub-questions for parallel research: "{{query}}"
\ No newline at end of file
diff --git a/examples/deep-research/tasks/plan.ts b/examples/deep-research/tasks/plan.ts
new file mode 100644
index 0000000..8f6e4f8
--- /dev/null
+++ b/examples/deep-research/tasks/plan.ts
@@ -0,0 +1,57 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { Branch } from '../../../dist/index.js';
+import type { SessionContext } from '../../../dist/index.js';
+
+const PLAN_PROMPT = fs.readFileSync(path.resolve(__dirname, 'plan.md'), 'utf8');
+
+export async function plan(ctx: SessionContext, opts: {
+  query: string;
+  agentCount: number;
+}): Promise<{ questions: string[]; tokenCount: number }> {
+  const schema = {
+    type: 'object',
+    properties: {
+      questions: {
+        type: 'array',
+        items: { type: 'string' },
+        minItems: Math.min(2, opts.agentCount), // never above maxItems when agentCount < 2
+        maxItems: opts.agentCount,
+      },
+    },
+    required: ['questions'],
+  };
+  const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema)); // constrains output to { "questions": [...] }
+
+  const userContent = PLAN_PROMPT
+    .replace('{{count}}', String(opts.agentCount))
+    .replace('{{query}}', () => opts.query); // function replacer: "$&"/"$$" in the query stay literal
+
+  const messages = [
+    { role: 'system', content: 'You break research queries into sub-questions. Output JSON only.' },
+    { role: 'user', content: userContent },
+  ];
+  const { prompt } = await ctx.formatChat(JSON.stringify(messages));
+  const tokens = await ctx.tokenize(prompt);
+
+  const lead = Branch.create(ctx, 0, { temperature: 0.3 }, undefined, grammar);
+  await lead.prefill(tokens);
+
+  let output = '';
+  let tokenCount = 0;
+  try {
+    for await (const { text } of lead) { output += text; tokenCount++; }
+  } finally {
+    await lead.prune(); // release the branch even if generation throws
+  }
+
+  let questions: string[];
+  try {
+    questions = JSON.parse(output).questions.slice(0, opts.agentCount);
+    if (!questions.length) throw new Error('empty questions');
+  } catch { // unparseable or empty plan: fall back to one generic question per agent
+    questions = Array.from({ length: opts.agentCount }, (_, i) => `${opts.query} (aspect ${i + 1})`);
+  }
+
+  return { questions, tokenCount };
+}
diff --git a/examples/deep-research/tasks/research.md b/examples/deep-research/tasks/research.md
new file mode 100644
index 0000000..659fd41
--- /dev/null
+++ b/examples/deep-research/tasks/research.md
@@ -0,0 +1 @@
+You are a research assistant with access to a knowledge base. Use the search and read_file tools to find information, then call report with your findings. Be thorough: search first, read relevant files, then report.
\ No newline at end of file
diff --git a/examples/deep-research/tasks/research.ts b/examples/deep-research/tasks/research.ts
new file mode 100644
index 0000000..1807e7f
--- /dev/null
+++ b/examples/deep-research/tasks/research.ts
@@ -0,0 +1,126 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { Branch, BranchStore, forkAgent, runAgents } from '../../../dist/index.js';
+import type { SessionContext, AgentState } from '../../../dist/index.js';
+import type { ExecuteToolFn } from '../tools/types.js';
+
+const DEFAULT_SYSTEM_PROMPT = fs.readFileSync(path.resolve(__dirname, 'research.md'), 'utf8');
+
+export { DEFAULT_SYSTEM_PROMPT as RESEARCH_SYSTEM_PROMPT };
+
+export interface AgentResult {
+  findings: string | null;
+  toolCallCount: number;
+  tokenCount: number;
+}
+
+export interface ResearchResult {
+  agents: AgentResult[];
+  totalTokens: number;
+  totalToolCalls: number;
+  steps: number;
+  counters: Record<string, number>;
+  sharedPrefixLength: number;
+}
+
+export async function research(ctx: SessionContext, store: BranchStore, opts: {
+  questions: string[];
+  parent?: InstanceType<typeof Branch>;
+  seed?: number;
+  systemPrompt?: string;
+  toolsJson: string;
+  executeTool: ExecuteToolFn;
+  maxTurns?: number;
+  onToolCall?: (agentIndex: number, toolName: string, args: string) => void;
+  onToolResult?: (agentIndex: number, toolName: string, resultStr: string) => void;
+}): Promise<ResearchResult> {
+  const systemPrompt = opts.systemPrompt ?? DEFAULT_SYSTEM_PROMPT;
+
+  let agents: AgentState[];
+  let sharedPrefixLength: number;
+  let root: InstanceType<typeof Branch> | null;
+
+  if (opts.parent) {
+    // Warm: fork from conversation trunk — each agent inherits full KV,
+    // gets a fresh system prompt + question injected as suffix.
+    // Diversity via reseeded sampler, not divergent content.
+    agents = await Promise.all(
+      opts.questions.map((q, i) =>
+        forkAgent(opts.parent!, {
+          systemPrompt,
+          content: q,
+          tools: opts.toolsJson,
+          seed: opts.seed != null ? opts.seed + i : undefined,
+        }, ctx)
+      )
+    );
+    sharedPrefixLength = 0;
+    root = null;
+  } else {
+    // Cold: shared-prefix optimization — one root with system prompt,
+    // fork N agents with divergent user-question suffixes.
+    const sharedMessages = [{ role: 'system', content: systemPrompt }];
+    const sharedFmt = await ctx.formatChat(
+      JSON.stringify(sharedMessages),
+      { tools: opts.toolsJson, addGenerationPrompt: false },
+    );
+    const sharedTokens = await ctx.tokenize(sharedFmt.prompt);
+
+    root = Branch.create(ctx, 0, { temperature: 0.5 });
+    await root.prefill(sharedTokens);
+
+    agents = [];
+    for (const q of opts.questions) {
+      const branch = await root.fork();
+      const fullMessages = [
+        { role: 'system', content: systemPrompt },
+        { role: 'user', content: q },
+      ];
+      const fmt = await ctx.formatChat(JSON.stringify(fullMessages), { tools: opts.toolsJson });
+      const fullTokens = await ctx.tokenize(fmt.prompt);
+      const suffixTokens = fullTokens.slice(sharedTokens.length);
+
+      agents.push({
+        branch,
+        suffixTokens,
+        fmt: {
+          format: fmt.format,
+          reasoningFormat: fmt.reasoningFormat,
+          thinkingForcedOpen: fmt.thinkingForcedOpen,
+          parser: fmt.parser,
+        },
+        rawOutput: '',
+        done: false,
+        tokenCount: 0,
+        toolCallCount: 0,
+        turns: 0,
+        findings: null,
+      });
+    }
+    sharedPrefixLength = sharedTokens.length;
+  }
+
+  // Common path: batch prefill + agentic loop; branches are pruned even when a step throws
+  try {
+    await store.prefill(agents.map((a) => [a.branch, a.suffixTokens]));
+    const result = await runAgents(agents, {
+      store, ctx,
+      executeTool: opts.executeTool,
+      maxTurns: opts.maxTurns ?? 6,
+      onToolCall: opts.onToolCall,
+      onToolResult: opts.onToolResult,
+    });
+    return {
+      agents: agents.map((a) => ({
+        findings: a.findings,
+        toolCallCount: a.toolCallCount,
+        tokenCount: a.tokenCount,
+      })),
+      ...result,
+      sharedPrefixLength,
+    };
+  } finally {
+    for (const a of agents) await a.branch.prune();
+    if (root) await root.prune();
+  }
+}
diff --git a/examples/deep-research/tasks/verify.md b/examples/deep-research/tasks/verify.md
new file mode 100644
index 0000000..27e0c97
--- /dev/null
+++ b/examples/deep-research/tasks/verify.md
@@ -0,0 +1,5 @@
+Research findings:
+
+{{findings}}
+
+Synthesize these into a brief summary answering: "{{query}}"
\ No newline at end of file
diff --git a/examples/deep-research/tasks/verify.ts b/examples/deep-research/tasks/verify.ts
new file mode 100644
index 0000000..4beb9a9
--- /dev/null
+++ b/examples/deep-research/tasks/verify.ts
@@ -0,0 +1,91 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { Branch, BranchStore } from '../../../dist/index.js';
+import type { SessionContext } from '../../../dist/index.js';
+
+const VERIFY_PROMPT = fs.readFileSync(path.resolve(__dirname, 'verify.md'), 'utf8');
+
+export interface Attempt {
+  output: string;
+  tokenCount: number;
+  ppl: number;
+}
+
+export interface VerifyResult {
+  attempts: Attempt[];
+  bestOutput: string;
+  bestBranch: InstanceType<typeof Branch>;
+  totalTokens: number;
+  steps: number;
+  prefixLength: number;
+}
+
+export async function verify(ctx: SessionContext, store: BranchStore, opts: {
+  findings: string;
+  query: string;
+  count: number;
+}): Promise<VerifyResult> {
+  const userContent = VERIFY_PROMPT // function replacers keep "$&"/"$$" in the inserted text literal
+    .replace('{{query}}', () => opts.query) // query first: findings are inserted last, so no later pass rescans them
+    .replace('{{findings}}', () => opts.findings);
+
+  const messages = [
+    { role: 'system', content: 'Synthesize the research findings into a coherent, concise summary.' },
+    { role: 'user', content: userContent },
+  ];
+  const { prompt } = await ctx.formatChat(JSON.stringify(messages));
+  const synthTokens = await ctx.tokenize(prompt);
+
+  const synthRoot = Branch.create(ctx, 0, { temperature: 0.7 });
+  await synthRoot.prefill(synthTokens);
+
+  // Fork N branches with reseeded samplers for stochastic divergence
+  const live: { branch: InstanceType<typeof Branch>; output: string; done: boolean; tokenCount: number; ppl: number }[] = [];
+  for (let i = 0; i < opts.count; i++) {
+    const branch = await synthRoot.fork();
+    branch.reseedSampler(2000 + i);
+    live.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity });
+  }
+
+  // BranchStore batched decode — produceSync/commit loop
+  let steps = 0;
+  for (;;) {
+    const entries: [InstanceType<typeof Branch>, number][] = [];
+    for (const a of live) {
+      if (a.done) continue;
+      const { token, text, isStop } = a.branch.produceSync();
+      if (isStop) {
+        const p = a.branch.perplexity;
+        a.ppl = Number.isFinite(p) ? p : Infinity;
+        a.done = true;
+        continue;
+      }
+      entries.push([a.branch, token]);
+      a.output += text;
+      a.tokenCount++;
+    }
+    if (entries.length === 0) break;
+    await store.commit(entries);
+    steps++;
+  }
+
+  // Pick lowest perplexity (most coherent)
+  const bestIdx = live.reduce((bi, a, i) => a.ppl <= live[bi].ppl ? i : bi, 0);
+
+  // Prune non-best attempts; synthRoot stays alive (bestBranch is its child)
+  // — caller's retainOnly will clean up synthRoot when promoting bestBranch
+  for (let i = 0; i < live.length; i++) {
+    if (i !== bestIdx) await live[i].branch.prune();
+  }
+
+  const totalTokens = live.reduce((s, a) => s + a.tokenCount, 0);
+
+  return {
+    attempts: live.map((a) => ({ output: a.output, tokenCount: a.tokenCount, ppl: a.ppl })),
+    bestOutput: live[bestIdx].output,
+    bestBranch: live[bestIdx].branch,
+    totalTokens,
+    steps,
+    prefixLength: synthTokens.length,
+  };
+}
diff --git a/examples/deep-research/tools/index.ts b/examples/deep-research/tools/index.ts
new file mode 100644
index 0000000..bd4c829
--- /dev/null
+++ b/examples/deep-research/tools/index.ts
@@ -0,0 +1,28 @@
+import type { Resource, Chunk } from '../resources/types.js';
+import type { Reranker, Tool, ExecuteToolFn } from './types.js';
+import { createSearchTool } from './search.js';
+import { createReadFileTool } from './read-file.js';
+import { createReportTool } from './report.js';
+
+export function createTools(opts: {
+  resources: Resource[];
+  chunks: Chunk[];
+  reranker: Reranker;
+}): { tools: Tool[]; toolsJson: string; executeTool: ExecuteToolFn } { // builds the search / read_file / report toolset
+  const tools = [
+    createSearchTool(opts.chunks, opts.reranker),
+    createReadFileTool(opts.resources),
+    createReportTool(),
+  ];
+
+  const toolsJson = JSON.stringify(tools.map((t) => t.schema)); // schemas only, serialized as one JSON array
+  const toolMap = new Map(tools.map((t) => [t.name, t]));
+
+  const executeTool: ExecuteToolFn = async (name, args) => { // dispatches a call to the tool registered under `name`
+    const tool = toolMap.get(name);
+    if (!tool) return { error: `Unknown tool: ${name}` }; // unknown names yield an error object, not a throw
+    return tool.execute(args);
+  };
+
+  return { tools, toolsJson, executeTool };
+}
diff --git a/examples/deep-research/tools/read-file.ts b/examples/deep-research/tools/read-file.ts
new file mode 100644
index 0000000..327e54e
--- /dev/null
+++ b/examples/deep-research/tools/read-file.ts
@@ -0,0 +1,39 @@
+import type { Resource } from '../resources/types.js';
+import type { Tool } from './types.js';
+
+export function createReadFileTool(resources: Resource[]): Tool { // read_file tool over the in-memory resources
+  return {
+    name: 'read_file',
+    schema: {
+      type: 'function',
+      function: {
+        name: 'read_file',
+        description: 'Read content from a file at specific line ranges. Use startLine/endLine from search results.',
+        parameters: {
+          type: 'object',
+          properties: {
+            filename: {
+              type: 'string',
+              description: 'Filename from search results',
+              enum: resources.map((r) => r.name),
+            },
+            startLine: { type: 'number', description: 'Start line (1-indexed, from search results)' },
+            endLine: { type: 'number', description: 'End line (1-indexed, from search results)' },
+          },
+          required: ['filename'],
+        },
+      },
+    },
+    async execute(args) {
+      const filename = (args.filename as string) || (args.path as string) || ''; // a `path` argument is accepted as an alias
+      const file = resources.find((r) => r.name === filename);
+      if (!file) {
+        return { error: `File not found: ${filename}. Available: ${resources.map((r) => r.name).join(', ')}` };
+      }
+      const lines = file.content.split('\n');
+      const s = Math.max(0, ((args.startLine as number) ?? 1) - 1); // 0-based inclusive start
+      const e = Math.min(lines.length, (args.endLine as number) ?? s + 100); // no endLine: 100-line window from startLine
+      return { file: file.name, content: lines.slice(s, e).join('\n') };
+    },
+  };
+}
diff --git a/examples/deep-research/tools/report.ts b/examples/deep-research/tools/report.ts
new file mode 100644
index 0000000..253a820
--- /dev/null
+++ b/examples/deep-research/tools/report.ts
@@ -0,0 +1,22 @@
+import type { Tool } from './types.js';
+
+export function createReportTool(): Tool { // `report` tool: schema for submitting final findings
+  return {
+    name: 'report',
+    schema: {
+      type: 'function',
+      function: {
+        name: 'report',
+        description: 'Submit your final research findings. Call this when you have gathered enough information to answer the question.',
+        parameters: {
+          type: 'object',
+          properties: { findings: { type: 'string', description: 'Your research findings and answer' } },
+          required: ['findings'],
+        },
+      },
+    },
+    async execute() { // takes no arguments here; only an acknowledgement is returned
+      return { acknowledged: true };
+    },
+  };
+}
diff --git a/examples/deep-research/tools/search.ts b/examples/deep-research/tools/search.ts
new file mode 100644
index 0000000..fef2fe7
--- /dev/null
+++ b/examples/deep-research/tools/search.ts
@@ -0,0 +1,23 @@
+import type { Chunk } from '../resources/types.js';
+import type { Reranker, Tool } from './types.js';
+
+export function createSearchTool(chunks: Chunk[], reranker: Reranker): Tool { // `search` tool: scores `chunks` with the reranker
+  return {
+    name: 'search',
+    schema: {
+      type: 'function',
+      function: {
+        name: 'search',
+        description: 'Search the knowledge base. Returns sections ranked by relevance with line ranges for read_file.',
+        parameters: {
+          type: 'object',
+          properties: { query: { type: 'string', description: 'Search query' } },
+          required: ['query'],
+        },
+      },
+    },
+    async execute(args) {
+      return reranker.score((args.query as string) || '', chunks); // a missing query is scored as the empty string
+    },
+  };
+}
diff --git a/examples/deep-research/tools/types.ts b/examples/deep-research/tools/types.ts
new file mode 100644
index 0000000..ff52c8d
--- /dev/null
+++ b/examples/deep-research/tools/types.ts
@@ -0,0 +1,23 @@
+import type { Chunk } from '../resources/types.js';
+
+export interface ScoredChunk {
+  file: string;
+  heading: string;
+  score: number;
+  startLine: number;
+  endLine: number;
+}
+
+export interface Reranker {
+  score(query: string, chunks: Chunk[]): Promise<ScoredChunk[]>;
+  tokenizeChunks(chunks: Chunk[]): Promise<void>;
+  dispose(): void;
+}
+
+export interface Tool {
+  name: string;
+  schema: object;
+  execute: (args: Record<string, unknown>) => Promise<unknown>;
+}
+
+export type ExecuteToolFn = (name: string, args: Record<string, unknown>) => Promise<unknown>;
diff --git a/src/Util.cpp b/src/Util.cpp
new file mode 100644
index 0000000..da5539c
--- /dev/null
+++ b/src/Util.cpp
@@ -0,0 +1,193 @@
+#include "Util.hpp"
+#include <md4c.h>
+#include <algorithm>
+#include <climits>
+#include <string>
+#include <vector>
+
+namespace liblloyal_node {
+
+struct Section {
+  std::string heading;
+  unsigned level = 0;
+  int start_line = 0;
+  int end_line = 0;
+};
+
+struct ParseState {  // state handed to the md4c callbacks through `userdata`
+  const char* input;
+  size_t input_size;
+  std::vector<size_t> line_starts;  // byte offset at which each line begins (filled by build_line_table)
+
+  int depth = 0;  // block nesting level, maintained by on_enter_block / on_leave_block
+  bool in_heading = false;
+  unsigned heading_level = 0;
+  std::string heading_text;
+
+  // Byte offset of the first text seen in the current top-level block
+  size_t block_first_offset = SIZE_MAX;
+
+  std::vector<Section> sections;
+
+  void build_line_table() {
+    line_starts.push_back(0);
+    for (size_t i = 0; i < input_size; i++) {
+      if (input[i] == '\n') {
+        line_starts.push_back(i + 1);
+      }
+    }
+  }
+
+  // Binary search: find the 1-indexed line number containing the given byte offset
+  int line_at(size_t offset) const {
+    auto it = std::upper_bound(line_starts.begin(), line_starts.end(), offset);
+    return static_cast<int>(it - line_starts.begin());
+  }
+
+  // Line number of the last content line in the input
+  int last_line() const {
+    if (input_size == 0) return 0;
+    size_t last = input_size - 1;
+    if (input[last] == '\n' && last > 0) last--;  // step back over one trailing newline before resolving the line
+    return line_at(last);
+  }
+};
+
+// md4c callbacks — static functions with C-compatible signatures
+
+static int on_enter_block(MD_BLOCKTYPE type, void* detail, void* userdata) {  // registered as parser.enter_block
+  auto* s = static_cast<ParseState*>(userdata);
+  s->depth++;
+
+  // depth==2 means direct child of MD_BLOCK_DOC (top-level block)
+  if (s->depth == 2) {
+    s->block_first_offset = SIZE_MAX;  // no text seen yet for this block
+
+    if (type == MD_BLOCK_H) {
+      s->in_heading = true;
+      s->heading_level = static_cast<MD_BLOCK_H_DETAIL*>(detail)->level;
+      s->heading_text.clear();  // start collecting this heading's text from scratch
+    }
+  }
+  return 0;
+}
+
+static int on_leave_block(MD_BLOCKTYPE type, void* /* detail */, void* userdata) {
+  auto* s = static_cast<ParseState*>(userdata);
+
+  if (s->depth == 2 && type == MD_BLOCK_H) {
+    // A heading with no text never set block_first_offset, so it has no line and opens no section.
+    if (s->block_first_offset != SIZE_MAX) {
+      int heading_line = s->line_at(s->block_first_offset);
+      // Close previous section
+      if (!s->sections.empty()) s->sections.back().end_line = heading_line - 1;
+      // Start new section at this heading
+      Section sec;
+      sec.heading = s->heading_text;
+      sec.level = s->heading_level;
+      sec.start_line = heading_line;
+      s->sections.push_back(sec);
+    }
+
+    // Always leave heading mode, even for an empty heading, so later text is not appended as heading text.
+    s->in_heading = false;
+  }
+
+  s->depth--;
+  return 0;
+}
+
+static int on_enter_span(MD_SPANTYPE /* type */, void* /* detail */, void* /* userdata */) {
+  return 0;
+}
+
+static int on_leave_span(MD_SPANTYPE /* type */, void* /* detail */, void* /* userdata */) {
+  return 0;
+}
+
+static int on_text(MD_TEXTTYPE /* type */, const MD_CHAR* text, MD_SIZE size, void* userdata) {
+  auto* s = static_cast<ParseState*>(userdata);
+  // md4c can pass text that is not part of the document buffer (e.g. the stand-in for a NUL
+  // character or the "\n" of a line break); only in-buffer pointers yield a valid byte offset.
+  const bool in_input = text >= s->input && text < s->input + s->input_size;
+
+  // Track first text offset for the current top-level block
+  if (s->depth >= 2 && s->block_first_offset == SIZE_MAX && in_input) {
+    s->block_first_offset = static_cast<size_t>(text - s->input);
+  }
+
+  // Accumulate heading text
+  if (s->in_heading) s->heading_text.append(text, size);
+  return 0;
+}
+
+// N-API entry points
+
+void Util::Init(Napi::Env env, Napi::Object exports) {
+  exports.Set("parseMarkdown", Napi::Function::New(env, ParseMarkdown));
+}
+
+Napi::Value Util::ParseMarkdown(const Napi::CallbackInfo& info) {  // JS: parseMarkdown(text) -> [{ heading, level, startLine, endLine }]
+  Napi::Env env = info.Env();
+
+  if (info.Length() < 1 || !info[0].IsString()) {
+    Napi::TypeError::New(env, "parseMarkdown expects a string argument")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  std::string input = info[0].As<Napi::String>().Utf8Value();
+
+  // Empty input → empty result
+  if (input.empty()) {
+    return Napi::Array::New(env, 0);
+  }
+
+  ParseState state;
+  state.input = input.c_str();
+  state.input_size = input.size();
+  state.build_line_table();
+
+  // Preamble: content before the first heading
+  Section preamble;
+  preamble.start_line = 1;
+  state.sections.push_back(preamble);
+
+  MD_PARSER parser = {};
+  parser.abi_version = 0;
+  parser.flags = MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH;
+  parser.enter_block = on_enter_block;
+  parser.leave_block = on_leave_block;
+  parser.enter_span = on_enter_span;
+  parser.leave_span = on_leave_span;
+  parser.text = on_text;
+
+  md_parse(input.c_str(), static_cast<MD_SIZE>(input.size()), &parser, &state);  // NOTE(review): return value is ignored — confirm partial results are acceptable on failure
+
+  // Close last section
+  if (!state.sections.empty()) {
+    state.sections.back().end_line = state.last_line();
+  }
+
+  // Remove empty sections (startLine > endLine)
+  state.sections.erase(
+      std::remove_if(state.sections.begin(), state.sections.end(),
+                     [](const Section& sec) { return sec.start_line > sec.end_line; }),
+      state.sections.end());
+
+  // Build N-API result
+  Napi::Array result = Napi::Array::New(env, state.sections.size());
+  for (size_t i = 0; i < state.sections.size(); i++) {
+    const auto& sec = state.sections[i];
+    Napi::Object obj = Napi::Object::New(env);
+    obj.Set("heading", sec.heading);
+    obj.Set("level", static_cast<double>(sec.level));  // numeric fields are passed to JS as doubles
+    obj.Set("startLine", static_cast<double>(sec.start_line));
+    obj.Set("endLine", static_cast<double>(sec.end_line));
+    result.Set(static_cast<uint32_t>(i), obj);
+  }
+
+  return result;
+}
+
+} // namespace liblloyal_node
diff --git a/src/Util.hpp b/src/Util.hpp
new file mode 100644
index 0000000..2ebbc5c
--- /dev/null
+++ b/src/Util.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <napi.h>
+
+namespace liblloyal_node {
+
+class Util {
+public:
+  static void Init(Napi::Env env, Napi::Object exports);
+
+private:
+  static Napi::Value ParseMarkdown(const Napi::CallbackInfo& info);
+};
+
+} // namespace liblloyal_node
diff --git a/src/binding.cpp b/src/binding.cpp
index 447a07a..02a32da 100644
--- a/src/binding.cpp
+++ b/src/binding.cpp
@@ -1,5 +1,6 @@
 #include <napi.h>
 #include "SessionContext.hpp"
+#include "Util.hpp"
 #include <llama/llama.h>
 
 /**
@@ -24,6 +25,9 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) {
   // Export factory function
   exports.Set("createContext", Napi::Function::New(env, CreateContext));
 
+  // Export utility functions (parseMarkdown, etc.)
+  Util::Init(env, exports);
+
   return exports;
 }
 

From 3bfe0997c02755d02c56e2e37df8d773f614d3ab Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Mon, 23 Feb 2026 05:31:46 +1100
Subject: [PATCH 05/17] feat(agents): add plan to warm continuation

---
 examples/deep-research/deep-research.ts | 20 ++++++++++++++++----
 examples/deep-research/tasks/plan.ts    | 18 +++++++++++++++---
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/examples/deep-research/deep-research.ts b/examples/deep-research/deep-research.ts
index ecf6394..85e7cd9 100644
--- a/examples/deep-research/deep-research.ts
+++ b/examples/deep-research/deep-research.ts
@@ -269,18 +269,30 @@ async function main(): Promise<void> {
 
       await session.promote(verifyResult.bestBranch);
     } else {
-      // ─── warm: research → findings → grounded response ─
-      log(`  ${c.dim}  researching...${c.reset}`);
+      // ─── warm: plan → research → findings → grounded response ─
+      const { questions, tokenCount: planTokens } = await plan(ctx, {
+        query, agentCount: AGENT_COUNT,
+        parent: session.trunk!,
+      });
+
+      log(`\n  ${c.green}●${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${planTokens} tok${c.reset}`);
+      questions.forEach((q, i) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
+
+      log(`\n  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${questions.length} agents${c.reset}`);
 
       const followUp = await research(ctx, store, {
-        questions: Array(AGENT_COUNT).fill(query),
+        questions,
         parent: session.trunk!,
         seed: Date.now(),
         toolsJson, executeTool,
         maxTurns: MAX_TOOL_TURNS, onToolCall, onToolResult,
       });
 
-      log(`  ${c.dim}  ${followUp.totalToolCalls} tools · ${followUp.totalTokens} tok${c.reset}`);
+      followUp.agents.forEach((a, i) => {
+        const tree = i === followUp.agents.length - 1 ? '└' : '├';
+        log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${i}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok · ${a.toolCallCount} tools${c.reset}`);
+      });
+      log(`    ${c.dim}${followUp.totalToolCalls} tools · ${followUp.totalTokens} tok${c.reset}`);
 
       const agentFindings = followUp.agents
         .map((a, i) => a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
diff --git a/examples/deep-research/tasks/plan.ts b/examples/deep-research/tasks/plan.ts
index 8f6e4f8..feb7949 100644
--- a/examples/deep-research/tasks/plan.ts
+++ b/examples/deep-research/tasks/plan.ts
@@ -8,6 +8,7 @@ const PLAN_PROMPT = fs.readFileSync(path.resolve(__dirname, 'plan.md'), 'utf8');
 export async function plan(ctx: SessionContext, opts: {
   query: string;
   agentCount: number;
+  parent?: InstanceType<typeof Branch>;
 }): Promise<{ questions: string[]; tokenCount: number }> {
   const schema = {
     type: 'object',
@@ -32,10 +33,21 @@ export async function plan(ctx: SessionContext, opts: {
     { role: 'user', content: userContent },
   ];
   const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-  const tokens = await ctx.tokenize(prompt);
 
-  const lead = Branch.create(ctx, 0, { temperature: 0.3 }, undefined, grammar);
-  await lead.prefill(tokens);
+  let lead: InstanceType<typeof Branch>;
+  if (opts.parent) {
+    // Warm: fork from trunk — planner inherits conversation KV
+    lead = await opts.parent.fork();
+    lead.setGrammar(grammar);
+    const sep = ctx.getTurnSeparator();
+    const delta = await ctx.tokenize(prompt, false);
+    await lead.prefill([...sep, ...delta]);
+  } else {
+    // Cold: fresh branch at position 0
+    const tokens = await ctx.tokenize(prompt);
+    lead = Branch.create(ctx, 0, { temperature: 0.3 }, undefined, grammar);
+    await lead.prefill(tokens);
+  }
 
   let output = '';
   let tokenCount = 0;

From 95897d95ebad04d6a14ec75f9a460d9ba51d99cc Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Mon, 23 Feb 2026 10:05:48 +1100
Subject: [PATCH 06/17] feat(agents): add stable agentId using branch handle

---
 examples/deep-research/deep-research.ts  | 13 +++--
 examples/deep-research/tasks/research.ts |  1 +
 src/Agent.ts                             | 49 ++++++++--------
 src/Session.ts                           | 71 +++++++++++++++++-------
 src/index.ts                             |  4 +-
 src/types.ts                             | 15 +++--
 6 files changed, 93 insertions(+), 60 deletions(-)

diff --git a/examples/deep-research/deep-research.ts b/examples/deep-research/deep-research.ts
index 85e7cd9..43636e0 100644
--- a/examples/deep-research/deep-research.ts
+++ b/examples/deep-research/deep-research.ts
@@ -129,22 +129,23 @@ async function main(): Promise<void> {
   const session = new Session({ ctx, store });
 
   // Tool call display — shared across cold + warm paths
-  const onToolCall = (ai: number, toolName: string, argsStr: string): void => {
-    emit('tool_call', { agentIndex: ai, toolName, arguments: argsStr });
+  // agentId = branch handle — stable identifier, useful for C++/KV-level debugging
+  const onToolCall = (agentId: number, toolName: string, argsStr: string): void => {
+    emit('tool_call', { agentId, toolName, arguments: argsStr });
     let toolArgs: Record<string, string>;
     try { toolArgs = JSON.parse(argsStr); } catch { toolArgs = {}; }
     const argSummary = toolName === 'search'
       ? `"${toolArgs.query || ''}"`
       : toolName === 'report' ? ''
       : `${toolArgs.filename}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : '');
-    log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+    log(`    ${c.dim}├${c.reset} ${c.yellow}${agentId}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
   };
-  const onToolResult = (ai: number, toolName: string, resultStr: string): void => {
+  const onToolResult = (agentId: number, toolName: string, resultStr: string): void => {
     emit('tool_result', {
-      agentIndex: ai, toolName,
+      agentId, toolName,
       result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr,
     });
-    log(`    ${c.dim}├${c.reset} ${c.yellow}${ai}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`);
+    log(`    ${c.dim}├${c.reset} ${c.yellow}${agentId}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`);
   };
 
   // ================================================================
diff --git a/examples/deep-research/tasks/research.ts b/examples/deep-research/tasks/research.ts
index 1807e7f..6bd6e7e 100644
--- a/examples/deep-research/tasks/research.ts
+++ b/examples/deep-research/tasks/research.ts
@@ -81,6 +81,7 @@ export async function research(ctx: SessionContext, store: BranchStore, opts: {
       const suffixTokens = fullTokens.slice(sharedTokens.length);
 
       agents.push({
+        agentId: branch.handle,
         branch,
         suffixTokens,
         fmt: {
diff --git a/src/Agent.ts b/src/Agent.ts
index 76097bd..d903c61 100644
--- a/src/Agent.ts
+++ b/src/Agent.ts
@@ -7,6 +7,7 @@ import type {
   RunAgentsResult,
   SessionContext,
 } from './types';
+import { buildToolResultDelta } from './Session';
 
 /**
  * Fork an agent from a parent branch with its own system prompt + task
@@ -48,6 +49,7 @@ export async function forkAgent(
   const suffixTokens = [...sep, ...await ctx.tokenize(fmt.prompt, false)];
   if (task.seed != null) branch.reseedSampler(task.seed);
   return {
+    agentId: branch.handle,
     branch,
     suffixTokens,
     fmt: {
@@ -81,7 +83,7 @@ export async function forkAgent(
  *   store, ctx,
  *   executeTool: (name, args) => myToolDispatch(name, args),
  *   maxTurns: 6,
- *   onToolCall(ai, name, args) { console.log(`Agent ${ai}: ${name}`); },
+ *   onToolCall(agentId, name, args) { console.log(`Agent ${agentId}: ${name}`); },
  * });
  * ```
  *
@@ -92,7 +94,6 @@ export async function runAgents(
   opts: RunAgentsOptions
 ): Promise<RunAgentsResult> {
   const { store, ctx, executeTool, maxTurns = 6, onToolCall, onToolResult, onReport } = opts;
-  const sep = ctx.getTurnSeparator();
 
   let steps = 0;
   let totalToolCalls = 0;
@@ -104,12 +105,13 @@ export async function runAgents(
     idleTicks: 0,
   };
 
+  // Keyed by agentId (= branch handle) — stable across reordering
   const pendingTools = new Map<number, {
-    promise: Promise<{ ai: number; prefillTokens: number[] | null }>;
+    promise: Promise<{ agentId: number; prefillTokens: number[] | null }>;
     name: string;
   }>();
 
-  function dispatchTool(ai: number, w: AgentState, tc: ParsedToolCall): void {
+  function dispatchTool(w: AgentState, tc: ParsedToolCall): void {
     let toolArgs: Record<string, unknown>;
     try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
     const callId = tc.id || `call_${w.toolCallCount}`;
@@ -118,40 +120,36 @@ export async function runAgents(
     totalToolCalls++;
     w.turns++;
 
-    if (onToolCall) onToolCall(ai, tc.name, tc.arguments);
+    if (onToolCall) onToolCall(w.agentId, tc.name, tc.arguments);
 
     const promise = (async () => {
       try {
         const result = await executeTool(tc.name, toolArgs);
         const resultStr = JSON.stringify(result);
 
-        if (onToolResult) onToolResult(ai, tc.name, resultStr);
+        if (onToolResult) onToolResult(w.agentId, tc.name, resultStr);
 
-        const { prompt } = await ctx.formatChat(
-          JSON.stringify([
-            { role: 'system', content: '' },
-            { role: 'tool', content: resultStr, tool_call_id: callId },
-          ])
-        );
-        const delta = await ctx.tokenize(prompt, false);
-        return { ai, prefillTokens: [...sep, ...delta] as number[] | null };
+        const prefillTokens = await buildToolResultDelta(ctx, resultStr, callId);
+        return { agentId: w.agentId, prefillTokens: prefillTokens as number[] | null };
       } catch (err) {
         w.done = true;
         w.findings = `Tool error: ${(err as Error).message}`;
-        return { ai, prefillTokens: null };
+        return { agentId: w.agentId, prefillTokens: null };
       }
     })();
 
-    pendingTools.set(ai, { promise, name: tc.name });
+    pendingTools.set(w.agentId, { promise, name: tc.name });
     counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingTools.size);
   }
 
+  // Build agentId → index lookup for SETTLE phase
+  const agentById = new Map(agents.map((w) => [w.agentId, w]));
+
   for (;;) {
     // -- Phase 1: PRODUCE -- sample from active agents
     const entries: [Branch, number][] = [];
-    for (let ai = 0; ai < agents.length; ai++) {
-      const w = agents[ai];
-      if (w.done || pendingTools.has(ai)) continue;
+    for (const w of agents) {
+      if (w.done || pendingTools.has(w.agentId)) continue;
 
       const { token, text, isStop } = w.branch.produceSync();
       if (isStop) {
@@ -173,13 +171,13 @@ export async function runAgents(
           w.done = true;
           w.toolCallCount++;
           totalToolCalls++;
-          if (onToolCall) onToolCall(ai, 'report', tc.arguments);
-          if (onReport) onReport(ai, w.findings!);
+          if (onToolCall) onToolCall(w.agentId, 'report', tc.arguments);
+          if (onReport) onReport(w.agentId, w.findings!);
           continue;
         }
 
         // Fire-and-forget — dispatch tool without blocking the decode loop
-        dispatchTool(ai, w, tc);
+        dispatchTool(w, tc);
         w.rawOutput = '';
         continue;
       }
@@ -197,12 +195,13 @@ export async function runAgents(
 
     // -- Phase 3: SETTLE -- non-blocking check for resolved tools
     const prefillPairs: [Branch, number[]][] = [];
-    for (const [ai, info] of pendingTools) {
+    for (const [id, info] of pendingTools) {
       const result = await Promise.race([info.promise, Promise.resolve(null)]);
       if (result !== null) {
-        pendingTools.delete(ai);
+        pendingTools.delete(id);
         if (result.prefillTokens) {
-          prefillPairs.push([agents[result.ai].branch, result.prefillTokens]);
+          const w = agentById.get(result.agentId)!;
+          prefillPairs.push([w.branch, result.prefillTokens]);
         }
       }
     }
diff --git a/src/Session.ts b/src/Session.ts
index 068b3c7..63fea5e 100644
--- a/src/Session.ts
+++ b/src/Session.ts
@@ -2,6 +2,52 @@ import type { Branch } from './Branch';
 import type { BranchStore } from './BranchStore';
 import type { SessionContext } from './types';
 
+/**
+ * Build token delta for a user turn (sep + formatChat + tokenize)
+ *
+ * Usable with any branch — not tied to Session's trunk. This is the
+ * canonical way to build a user-turn delta for warm prefill.
+ *
+ * @category Branching
+ */
+export async function buildUserDelta(
+  ctx: SessionContext,
+  content: string,
+  opts: { tools?: string } = {}
+): Promise<number[]> {
+  const sep = ctx.getTurnSeparator();
+  const fmtOpts = opts.tools ? { tools: opts.tools } : {};
+  const { prompt } = await ctx.formatChat(
+    JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]),
+    fmtOpts
+  );
+  const delta = await ctx.tokenize(prompt, false);
+  return [...sep, ...delta];
+}
+
+/**
+ * Build token delta for a tool result turn (sep + formatChat + tokenize)
+ *
+ * Usable with any branch — not tied to Session's trunk.
+ *
+ * @category Branching
+ */
+export async function buildToolResultDelta(
+  ctx: SessionContext,
+  resultStr: string,
+  callId: string
+): Promise<number[]> {
+  const sep = ctx.getTurnSeparator();
+  const { prompt } = await ctx.formatChat(
+    JSON.stringify([
+      { role: 'system', content: '' },
+      { role: 'tool', content: resultStr, tool_call_id: callId },
+    ])
+  );
+  const delta = await ctx.tokenize(prompt, false);
+  return [...sep, ...delta];
+}
+
 /**
  * Session - Trunk lifecycle + conversation delta helpers
  *
@@ -77,39 +123,22 @@ export class Session {
   /**
    * Prefill a user turn into trunk
    *
-   * Centralizes: sep + formatChat([system:'', user:content]) + tokenize(false) + prefill
-   *
    * @param content - User message content
    * @param opts - Optional tools JSON string
    */
   async prefillUser(content: string, opts: { tools?: string } = {}): Promise<void> {
-    const sep = this._ctx.getTurnSeparator();
-    const fmtOpts = opts.tools ? { tools: opts.tools } : {};
-    const { prompt } = await this._ctx.formatChat(
-      JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]),
-      fmtOpts
-    );
-    const delta = await this._ctx.tokenize(prompt, false);
-    await this._trunk!.prefill([...sep, ...delta]);
+    const tokens = await buildUserDelta(this._ctx, content, opts);
+    await this._trunk!.prefill(tokens);
   }
 
   /**
    * Prefill a tool result turn into trunk
    *
-   * Centralizes: sep + formatChat([system:'', tool:result]) + tokenize(false) + prefill
-   *
    * @param resultStr - JSON-stringified tool result
    * @param callId - Tool call ID
    */
   async prefillToolResult(resultStr: string, callId: string): Promise<void> {
-    const sep = this._ctx.getTurnSeparator();
-    const { prompt } = await this._ctx.formatChat(
-      JSON.stringify([
-        { role: 'system', content: '' },
-        { role: 'tool', content: resultStr, tool_call_id: callId },
-      ])
-    );
-    const delta = await this._ctx.tokenize(prompt, false);
-    await this._trunk!.prefill([...sep, ...delta]);
+    const tokens = await buildToolResultDelta(this._ctx, resultStr, callId);
+    await this._trunk!.prefill(tokens);
   }
 }
diff --git a/src/index.ts b/src/index.ts
index 100b5fb..ddb43de 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -39,7 +39,7 @@ import type {
 
 import { Branch } from './Branch';
 import { BranchStore } from './BranchStore';
-import { Session } from './Session';
+import { Session, buildUserDelta, buildToolResultDelta } from './Session';
 import { forkAgent, runAgents } from './Agent';
 
 /**
@@ -250,7 +250,7 @@ export const createContext = async (
   return binary.createContext(options);
 };
 
-export { Branch, BranchStore, Session, forkAgent, runAgents };
+export { Branch, BranchStore, Session, buildUserDelta, buildToolResultDelta, forkAgent, runAgents };
 export { PoolingType, ChatFormat, ReasoningFormat, GrammarTriggerType } from './types';
 export type {
   GpuVariant,
diff --git a/src/types.ts b/src/types.ts
index b9a0c7e..aa6a4a6 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1461,6 +1461,9 @@ export interface AgentTask {
  * @category Branching
  */
 export interface AgentState {
+  /** Stable identifier — the branch handle. Survives reordering, useful for
+   *  keying pending-tool maps and correlating with C++ / KV-level diagnostics. */
+  agentId: number;
   /** The agent's branch */
   branch: Branch;
   /** Tokens to prefill before the loop starts */
@@ -1500,12 +1503,12 @@ export interface RunAgentsOptions {
   executeTool: (name: string, args: Record<string, unknown>) => Promise<unknown>;
   /** Maximum tool-call turns per agent (default: 6) */
   maxTurns?: number;
-  /** Called when an agent dispatches a tool call */
-  onToolCall?: (agentIndex: number, toolName: string, args: string) => void;
-  /** Called when a tool result returns */
-  onToolResult?: (agentIndex: number, toolName: string, resultStr: string) => void;
-  /** Called when an agent submits a report */
-  onReport?: (agentIndex: number, findings: string) => void;
+  /** Called when an agent dispatches a tool call (agentId = branch handle) */
+  onToolCall?: (agentId: number, toolName: string, args: string) => void;
+  /** Called when a tool result returns (agentId = branch handle) */
+  onToolResult?: (agentId: number, toolName: string, resultStr: string) => void;
+  /** Called when an agent submits a report (agentId = branch handle) */
+  onReport?: (agentId: number, findings: string) => void;
 }
 
 /**

From 6dd616bda08b8cd56ad94b8ed7547065a706b68f Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Mon, 23 Feb 2026 22:41:32 +1100
Subject: [PATCH 07/17] feat(agents): migrate re-ranker off chatml to
 formatChat()

---
 examples/deep-research/reranker.ts | 40 +++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/examples/deep-research/reranker.ts b/examples/deep-research/reranker.ts
index 367c112..525bb1a 100644
--- a/examples/deep-research/reranker.ts
+++ b/examples/deep-research/reranker.ts
@@ -2,31 +2,44 @@ import { createContext, Branch } from '../../dist/index.js';
 import type { Chunk } from './resources/types.js';
 import type { Reranker, ScoredChunk } from './tools/types.js';
 
-const RERANK_PREFIX =
-  '<|im_start|>system\n' +
+const SYSTEM_PROMPT =
   'Judge whether the Document meets the requirements based on the Query ' +
-  'and the Instruct provided. Note that the answer can only be "yes" or "no".' +
-  '<|im_end|>\n<|im_start|>user\n' +
+  'and the Instruct provided. Note that the answer can only be "yes" or "no".';
+
+const USER_PREFIX =
   '<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n\n' +
   '<Query>: ';
-const RERANK_MID = '\n\n<Document>: ';
-const RERANK_SUFFIX = '<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n';
 
 export async function createReranker(
   modelPath: string,
-  opts?: { nSeqMax?: number },
+  opts?: { nSeqMax?: number; nCtx?: number },
 ): Promise<Reranker> {
+  const nCtx = opts?.nCtx ?? 16384;
   const ctx = await createContext({
     modelPath,
-    nCtx: 16384,
+    nCtx,
     nSeqMax: opts?.nSeqMax ?? 3,
   });
 
   const [yesId] = await ctx.tokenize('yes', false);
   const [noId] = await ctx.tokenize('no', false);
-  const prefixTokens = await ctx.tokenize(RERANK_PREFIX, true);
-  const midTokens = await ctx.tokenize(RERANK_MID, false);
-  const suffixTokens = await ctx.tokenize(RERANK_SUFFIX, false);
+
+  // Probe the chat template once to extract prefix/mid/suffix, then
+  // pre-tokenize segments. Per-chunk scoring concatenates token arrays
+  // synchronously — no per-chunk formatChat calls needed.
+  const SENTINEL_Q = '\x00QUERY\x00';
+  const SENTINEL_D = '\x00DOC\x00';
+  const probe = await ctx.formatChat(JSON.stringify([
+    { role: 'system', content: SYSTEM_PROMPT },
+    { role: 'user', content: `${USER_PREFIX}${SENTINEL_Q}\n\n<Document>: ${SENTINEL_D}` },
+  ]), { addGenerationPrompt: true, enableThinking: false });
+
+  const p = probe.prompt;
+  const qi = p.indexOf(SENTINEL_Q);
+  const di = p.indexOf(SENTINEL_D);
+  const prefixTokens = await ctx.tokenize(p.slice(0, qi), true);
+  const midTokens = await ctx.tokenize(p.slice(qi + SENTINEL_Q.length, di), false);
+  const suffixTokens = await ctx.tokenize(p.slice(di + SENTINEL_D.length), false);
 
   function rerankScore(logits: Float32Array): number {
     const max = Math.max(logits[yesId], logits[noId]);
@@ -36,8 +49,7 @@ export async function createReranker(
   }
 
   // Serialize access — concurrent Branch.prefill on the same llama_context
-  // races llama_decode. BranchStore serializes via batched decode, but
-  // individual Branch.prefill calls on the reranker bypass that.
+  // races llama_decode.
   let lock = Promise.resolve();
 
   return {
@@ -48,7 +60,7 @@ export async function createReranker(
       await prev;
       try {
         const queryTokens = await ctx.tokenize(query, false);
-        const budget = 16384 - prefixTokens.length - queryTokens.length
+        const budget = nCtx - prefixTokens.length - queryTokens.length
                      - midTokens.length - suffixTokens.length;
         const scored: ScoredChunk[] = [];
         for (const chunk of chunks) {

From a7c29ec774cb2e86e66a9f7e84088e38072b8710 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Fri, 27 Feb 2026 23:20:45 +1100
Subject: [PATCH 08/17] feat(agents): first class rerank support

---
 src/Agent.ts           |  77 ++++++++++--
 src/Branch.ts          |  43 ++++++-
 src/Rerank.ts          | 268 +++++++++++++++++++++++++++++++++++++++++
 src/SessionContext.cpp | 137 +++++++++++++++++++++
 src/SessionContext.hpp |   5 +
 src/index.ts           |   6 +-
 src/types.ts           |  66 +++++++++-
 test/integration.ts    | 227 +++++++++++++++++++++++++++++++++-
 8 files changed, 813 insertions(+), 16 deletions(-)
 create mode 100644 src/Rerank.ts

diff --git a/src/Agent.ts b/src/Agent.ts
index d903c61..fde7486 100644
--- a/src/Agent.ts
+++ b/src/Agent.ts
@@ -1,11 +1,12 @@
 import type { Branch } from './Branch';
-import type {
-  AgentState,
-  AgentTask,
-  ParsedToolCall,
-  RunAgentsOptions,
-  RunAgentsResult,
-  SessionContext,
+import {
+  GrammarTriggerType,
+  type AgentState,
+  type AgentTask,
+  type ParsedToolCall,
+  type RunAgentsOptions,
+  type RunAgentsResult,
+  type SessionContext,
 } from './types';
 import { buildToolResultDelta } from './Session';
 
@@ -57,6 +58,9 @@ export async function forkAgent(
       reasoningFormat: fmt.reasoningFormat,
       thinkingForcedOpen: fmt.thinkingForcedOpen,
       parser: fmt.parser,
+      grammar: fmt.grammar,
+      grammarLazy: fmt.grammarLazy,
+      grammarTriggers: fmt.grammarTriggers,
     },
     rawOutput: '',
     done: false,
@@ -93,7 +97,7 @@ export async function runAgents(
   agents: AgentState[],
   opts: RunAgentsOptions
 ): Promise<RunAgentsResult> {
-  const { store, ctx, executeTool, maxTurns = 6, onToolCall, onToolResult, onReport } = opts;
+  const { store, ctx, executeTool, maxTurns = 100, onProduce, onToolCall, onToolResult, onToolProgress, onReport } = opts;
 
   let steps = 0;
   let totalToolCalls = 0;
@@ -122,9 +126,13 @@ export async function runAgents(
 
     if (onToolCall) onToolCall(w.agentId, tc.name, tc.arguments);
 
+    const toolContext = onToolProgress ? {
+      onProgress: (p: { filled: number; total: number }) => onToolProgress(w.agentId, tc.name, p),
+    } : undefined;
+
     const promise = (async () => {
       try {
-        const result = await executeTool(tc.name, toolArgs);
+        const result = await executeTool(tc.name, toolArgs, toolContext);
         const resultStr = JSON.stringify(result);
 
         if (onToolResult) onToolResult(w.agentId, tc.name, resultStr);
@@ -145,6 +153,30 @@ export async function runAgents(
   // Build agentId → index lookup for SETTLE phase
   const agentById = new Map(agents.map((w) => [w.agentId, w]));
 
+  // Lazy grammar: unconstrained until trigger fires, then grammar-constrained.
+  // Prevents Qwen3 from generating JSON tool calls instead of expected XML.
+  //
+  // Upstream triggers include tool_start (e.g. "<tool_call>\n<function="),
+  // which fires AFTER the model has already committed to XML — useless when
+  // the model diverges to JSON. Truncate WORD triggers to scope_start only
+  // (e.g. "<tool_call>\n") so the grammar activates at the divergence point
+  // and forces the correct format.
+  const applyLazyGrammar = (w: AgentState): void => {
+    if (w.fmt.grammar && w.fmt.grammarLazy && w.fmt.grammarTriggers.length > 0) {
+      const triggers = w.fmt.grammarTriggers.map(t => {
+        if (t.type === GrammarTriggerType.WORD) {
+          const nlIdx = t.value.indexOf('\n');
+          if (nlIdx >= 0 && nlIdx < t.value.length - 1) {
+            return { ...t, value: t.value.slice(0, nlIdx + 1) };
+          }
+        }
+        return t;
+      });
+      w.branch.setGrammarLazy(w.fmt.grammar, triggers);
+    }
+  };
+  for (const w of agents) applyLazyGrammar(w);
+
   for (;;) {
     // -- Phase 1: PRODUCE -- sample from active agents
     const entries: [Branch, number][] = [];
@@ -162,11 +194,28 @@ export async function runAgents(
         const tc = parsed.toolCalls[0];
         if (!tc || w.turns >= maxTurns) {
           w.done = true;
-          if (!w.findings && parsed.content) w.findings = parsed.content;
+          // Accept content as findings only if agent did actual research
+          if (!w.findings && w.toolCallCount > 0 && parsed.content) {
+            w.findings = parsed.content;
+            if (onReport) onReport(w.agentId, w.findings);
+          }
           continue;
         }
 
         if (tc.name === 'report') {
+          if (w.toolCallCount === 0) {
+            // Reject report without prior research — force the agent to use tools first
+            const callId = tc.id || `call_${w.toolCallCount}`;
+            const errorMsg = 'You must search or read the corpus before reporting. Use search, grep, or read_file first.';
+            w.turns++;
+            const promise = (async () => {
+              const prefillTokens = await buildToolResultDelta(ctx, JSON.stringify({ error: errorMsg }), callId);
+              return { agentId: w.agentId, prefillTokens: prefillTokens as number[] | null };
+            })();
+            pendingTools.set(w.agentId, { promise, name: tc.name });
+            w.rawOutput = '';
+            continue;
+          }
           try { w.findings = JSON.parse(tc.arguments).findings; } catch { w.findings = tc.arguments; }
           w.done = true;
           w.toolCallCount++;
@@ -185,6 +234,7 @@ export async function runAgents(
       entries.push([w.branch, token]);
       w.rawOutput += text;
       w.tokenCount++;
+      if (onProduce) onProduce(w.agentId, text, w.tokenCount);
     }
 
     // -- Phase 2: COMMIT -- batch-decode produced tokens
@@ -210,6 +260,13 @@ export async function runAgents(
       await store.prefill(prefillPairs);
       counters.warmPrefillCalls++;
       counters.warmPrefillBranches += prefillPairs.length;
+
+      // Reset lazy grammar — previous grammar consumed the tool call and is
+      // now in a terminal state. Fresh grammar awaits the next trigger.
+      for (const [branch] of prefillPairs) {
+        const w = agents.find(a => a.branch === branch);
+        if (w) applyLazyGrammar(w);
+      }
     }
 
     // -- Termination + idle yield
diff --git a/src/Branch.ts b/src/Branch.ts
index d444395..6991cf2 100644
--- a/src/Branch.ts
+++ b/src/Branch.ts
@@ -1,4 +1,5 @@
-import type { SessionContext, SamplingParams, Produced } from './types';
+import type { SessionContext, SamplingParams, Produced, GrammarTrigger } from './types';
+import { GrammarTriggerType } from './types';
 
 /**
  * Forkable inference handle for covalent generation
@@ -345,6 +346,46 @@ export class Branch {
     this._ctx._branchSetGrammar(this._handle, grammarStr || '');
   }
 
+  /**
+   * Set lazy grammar — unconstrained until trigger, then grammar-constrained
+   *
+   * Generation runs freely until a trigger pattern or token fires, at which
+   * point the grammar activates and constrains subsequent tokens. Used for
+   * tool-call generation: model writes freely until `<tool_call>`, then
+   * grammar forces valid XML structure.
+   *
+   * The grammar state is cloned on fork(), so sibling branches can diverge
+   * independently. Call again after a tool result prefill to reset.
+   *
+   * @param grammar - GBNF grammar string
+   * @param triggers - Trigger conditions from formatChat().grammarTriggers
+   */
+  setGrammarLazy(grammar: string, triggers: GrammarTrigger[]): void {
+    this._ensureNotDisposed();
+    const escapeRegex = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+    const patterns: string[] = [];
+    const tokens: number[] = [];
+    for (const t of triggers) {
+      switch (t.type) {
+        case GrammarTriggerType.WORD:
+          patterns.push(escapeRegex(t.value));
+          break;
+        case GrammarTriggerType.PATTERN:
+          patterns.push(t.value);
+          break;
+        case GrammarTriggerType.PATTERN_FULL: {
+          const p = t.value;
+          patterns.push((p[0] !== '^' ? '^' : '') + p + (p[p.length - 1] !== '$' ? '$' : ''));
+          break;
+        }
+        case GrammarTriggerType.TOKEN:
+          tokens.push(t.token);
+          break;
+      }
+    }
+    this._ctx._branchSetGrammarLazy(this._handle, grammar, patterns, tokens);
+  }
+
   /**
    * Sample next token without advancing state (async)
    *
diff --git a/src/Rerank.ts b/src/Rerank.ts
new file mode 100644
index 0000000..0771fef
--- /dev/null
+++ b/src/Rerank.ts
@@ -0,0 +1,268 @@
+import { createContext } from './index.js';
+import type { SessionContext, RerankOptions, RerankResult, RerankProgress } from './types';
+
+const SYSTEM_PROMPT =
+  'Judge whether the Document meets the requirements based on the Query ' +
+  'and the Instruct provided. Note that the answer can only be "yes" or "no".';
+
+const USER_PREFIX =
+  '<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n\n' +
+  '<Query>: ';
+
+interface ScoringRequest {
+  tokenArrays: number[][];
+  cursor: number;
+  scores: number[];
+  filled: number;
+  topK: number | undefined;
+  total: number;
+  push: (progress: RerankProgress) => void;
+  finish: () => void;
+  error: (err: Error) => void;
+}
+
+/** Simple async channel — _drain pushes, consumer pulls via for-await */
+function channel<T>(): {
+  push: (value: T) => void;
+  finish: () => void;
+  error: (err: Error) => void;
+  iterable: AsyncIterable<T>;
+} {
+  const buffer: T[] = [];
+  let done = false;
+  let err: Error | null = null;
+  let notify: (() => void) | null = null;
+
+  const wait = () => new Promise<void>((r) => { notify = r; });
+
+  return {
+    push(value: T) {
+      buffer.push(value);
+      notify?.();
+      notify = null;
+    },
+    finish() {
+      done = true;
+      notify?.();
+      notify = null;
+    },
+    error(e: Error) {
+      err = e;
+      notify?.();
+      notify = null;
+    },
+    iterable: {
+      [Symbol.asyncIterator](): AsyncIterator<T> {
+        return {
+          async next(): Promise<IteratorResult<T>> {
+            while (buffer.length === 0 && !done && !err) await wait();
+            if (err) throw err;
+            if (buffer.length > 0) return { value: buffer.shift()!, done: false };
+            return { value: undefined as unknown as T, done: true };
+          },
+        };
+      },
+    },
+  };
+}
+
+export class Rerank {
+  private _ctx: SessionContext;
+  private _nSeqMax: number;
+  private _nCtx: number;
+  private _yesId: number;
+  private _noId: number;
+  private _prefixTokens: number[];
+  private _midTokens: number[];
+  private _suffixTokens: number[];
+  private _pending: ScoringRequest[] = [];
+  private _draining = false;
+  private _disposed = false;
+
+  private constructor(
+    ctx: SessionContext,
+    nSeqMax: number,
+    nCtx: number,
+    yesId: number,
+    noId: number,
+    prefixTokens: number[],
+    midTokens: number[],
+    suffixTokens: number[],
+  ) {
+    this._ctx = ctx;
+    this._nSeqMax = nSeqMax;
+    this._nCtx = nCtx;
+    this._yesId = yesId;
+    this._noId = noId;
+    this._prefixTokens = prefixTokens;
+    this._midTokens = midTokens;
+    this._suffixTokens = suffixTokens;
+  }
+
+  /** Load the model, then pre-tokenize the chat-template text around the query and document slots. */
+  static async create(options: RerankOptions): Promise<Rerank> {
+    const nSeqMax = options.nSeqMax ?? 8;
+    const nCtx = options.nCtx ?? 4096;
+    const ctx = await createContext({
+      modelPath: options.modelPath, nCtx, nSeqMax,
+      typeK: options.typeK ?? 'q4_0', typeV: options.typeV ?? 'q4_0',
+    });
+    try {
+      const [yesId] = await ctx.tokenize('yes', false);
+      const [noId] = await ctx.tokenize('no', false);
+      // Format once with sentinel placeholders, then cut the prompt at the sentinels.
+      const SENTINEL_Q = '\x00QUERY\x00';
+      const SENTINEL_D = '\x00DOC\x00';
+      const { prompt: p } = await ctx.formatChat(JSON.stringify([
+        { role: 'system', content: SYSTEM_PROMPT },
+        { role: 'user', content: `${USER_PREFIX}${SENTINEL_Q}\n\n<Document>: ${SENTINEL_D}` },
+      ]), { addGenerationPrompt: true, enableThinking: false });
+      const qi = p.indexOf(SENTINEL_Q);
+      const di = p.indexOf(SENTINEL_D);
+      if (qi < 0 || di < qi) throw new Error('Rerank: chat template did not preserve probe sentinels');
+      const prefixTokens = await ctx.tokenize(p.slice(0, qi), true);
+      const midTokens = await ctx.tokenize(p.slice(qi + SENTINEL_Q.length, di), false);
+      const suffixTokens = await ctx.tokenize(p.slice(di + SENTINEL_D.length), false);
+      return new Rerank(ctx, nSeqMax, nCtx, yesId, noId, prefixTokens, midTokens, suffixTokens);
+    } catch (err) {
+      ctx.dispose(); // creation failed — release the context instead of leaking it
+      throw err;
+    }
+  }
+
+  /** Stream scoring progress for pre-tokenized `documents` against `query`; failures reject through the iterable. */
+  score(query: string, documents: number[][], topK?: number): AsyncIterable<RerankProgress> {
+    if (this._disposed) throw new Error('Rerank disposed');
+    const ch = channel<RerankProgress>();
+    // Arrow IIFE keeps `this`; tokenization is async, so later errors are routed into the channel.
+    void (async () => {
+      try {
+        const queryTokens = await this._ctx.tokenize(query, false);
+        if (this._disposed) throw new Error('Rerank disposed'); // disposed while tokenizing
+        const shared = [...this._prefixTokens, ...queryTokens, ...this._midTokens];
+        // Clamp at 0: a negative end index makes slice() drop tail tokens instead of truncating.
+        const perSeq = Math.floor(this._nCtx / this._nSeqMax);
+        const maxDoc = Math.max(0, perSeq - shared.length - this._suffixTokens.length);
+        const tokenArrays = documents.map((doc) => {
+          const trimmed = doc.length > maxDoc ? doc.slice(0, maxDoc) : doc;
+          return [...shared, ...trimmed, ...this._suffixTokens];
+        });
+
+        this._enqueue(tokenArrays, topK, ch.push, ch.finish, ch.error);
+      } catch (err) {
+        ch.error(err instanceof Error ? err : new Error(String(err)));
+      }
+    })();
+    return ch.iterable;
+  }
+
+  async tokenize(text: string): Promise<number[]> { // tokenizes with the reranker's own context; output can be passed to score()
+    return this._ctx.tokenize(text, false); // second argument is false here — presumably "no special tokens"; TODO confirm
+  }
+
+  dispose(): void {
+    if (this._disposed) return; // idempotent: never dispose the underlying context twice
+    this._disposed = true;
+    const err = new Error('Rerank disposed');
+    for (const req of this._pending.splice(0)) req.error(err); // empty the queue, then fail every outstanding request
+    this._ctx.dispose();
+  }
+
+  // ── Queue internals ──────────────────────────────────────────
+
+  private _sortResults(scores: number[], topK: number | undefined): RerankResult[] {
+    const sorted: RerankResult[] = []; // dense list: only documents scored so far, rounded to 3 decimals
+    scores.forEach((score, index) => { sorted.push({ score: Math.round(score * 1000) / 1000, index }); }); // forEach skips unfilled (hole) slots
+    sorted.sort((a, b) => b.score - a.score); // descending relevance
+    return topK != null ? sorted.slice(0, topK) : sorted;
+  }
+
+  private _enqueue(
+    tokenArrays: number[][],
+    topK: number | undefined,
+    push: (progress: RerankProgress) => void,
+    finish: () => void,
+    error: (err: Error) => void,
+  ): void {
+    // Nothing to score: complete immediately. _drain() never visits a request without prompts, so it would never finish.
+    if (tokenArrays.length === 0) { push({ filled: 0, total: 0, results: [] }); finish(); return; }
+    this._pending.push({
+      tokenArrays, cursor: 0,
+      scores: new Array(tokenArrays.length), filled: 0,
+      topK, total: tokenArrays.length,
+      push, finish, error,
+    });
+    this._drain();
+  }
+
+  private _fillGroup(): { reqIdx: number; promptIdx: number; tokens: number[] }[] {
+    const batch: { reqIdx: number; promptIdx: number; tokens: number[] }[] = [];
+    const capacity = this._nSeqMax;
+    // Round-robin: each pass takes at most one prompt per pending request; stop when full or nothing is left.
+    for (let progressed = true; progressed && batch.length < capacity; ) {
+      progressed = false;
+      for (let reqIdx = 0; reqIdx < this._pending.length && batch.length < capacity; reqIdx++) {
+        const request = this._pending[reqIdx];
+        if (request.cursor >= request.tokenArrays.length) continue;
+        const promptIdx = request.cursor++;
+        batch.push({ reqIdx, promptIdx, tokens: request.tokenArrays[promptIdx] });
+        progressed = true;
+      }
+    }
+    return batch;
+  }
+
+  private async _drain(): Promise<void> {
+    if (this._draining) return;
+    this._draining = true;
+    // Single consumer loop: requests enqueued while a group is in flight are picked up on the next pass.
+    try {
+      while (this._pending.length > 0) {
+        const group = this._fillGroup();
+        if (group.length === 0) break;
+        // One native call per group; if it fails, every pending request is failed and the queue is cleared.
+        let logits: Float32Array[];
+        try {
+          logits = await this._ctx._scoreGroup(group.map((g) => g.tokens));
+        } catch (err) {
+          const error = err instanceof Error ? err : new Error(String(err));
+          for (const req of this._pending) req.error(error);
+          this._pending.length = 0;
+          return;
+        }
+        if (this._disposed) return; // dispose() ran during the await: requests were already failed and cleared, so reqIdx entries are stale
+        // Track which requests got new scores this group
+        const touched = new Set<number>();
+        for (let i = 0; i < group.length; i++) {
+          const req = this._pending[group[i].reqIdx];
+          req.scores[group[i].promptIdx] = this._rerankScore(logits[i]);
+          req.filled++;
+          touched.add(group[i].reqIdx);
+        }
+
+        // Push progress for each request that advanced, finish completed ones
+        for (let r = this._pending.length - 1; r >= 0; r--) { // iterate backwards so splice() does not shift unvisited indices
+          const req = this._pending[r];
+          if (!touched.has(r)) continue;
+
+          const results = this._sortResults(req.scores, req.topK);
+          req.push({ filled: req.filled, total: req.total, results });
+
+          if (req.filled === req.total) {
+            req.finish();
+            this._pending.splice(r, 1);
+          }
+        }
+      }
+    } finally {
+      this._draining = false;
+    }
+  }
+
+  private _rerankScore(logits: Float32Array): number {
+    const yes = logits[this._yesId], no = logits[this._noId];
+    const peak = Math.max(yes, no); // shift by the larger logit so exp() cannot overflow
+    const yesWeight = Math.exp(yes - peak), noWeight = Math.exp(no - peak);
+    return yesWeight / (yesWeight + noWeight); // two-way softmax: probability mass on 'yes'
+  }
+}
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index afc74bc..46177e3 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -9,9 +9,11 @@
 #include <lloyal/chat_out.hpp>
 #include <lloyal/grammar.hpp>
 #include <lloyal/kv.hpp>
+#include <lloyal/logits.hpp>
 #include <lloyal/embedding.hpp>
 #include <lloyal/metrics.hpp>
 #include <cmath>
+#include <cstring>
 #include <iostream>
 
 namespace liblloyal_node {
@@ -707,6 +709,68 @@ class StorePrefillWorker : public Napi::AsyncWorker {
   std::vector<std::vector<llama_token>> _tokenStorage;
 };
 
+/**
+ * AsyncWorker for batch logit scoring (process_chunks): fills one row of n_vocab floats per input sequence.
+ * Owns token storage and logit output buffers; resolves its promise with an array of Float32Array.
+ */
+class ScoreGroupWorker : public Napi::AsyncWorker {
+public:
+  ScoreGroupWorker(Napi::Env env,
+                   llama_context* ctx,
+                   llama_model* model,
+                   int32_t nSeqMax,
+                   std::vector<std::vector<llama_token>> tokenStorage)
+    : AsyncWorker(env), _deferred(env), _ctx(ctx), _model(model),
+      _nSeqMax(nSeqMax), _tokenStorage(std::move(tokenStorage)) {}
+
+  void Execute() override {  // AsyncWorker runs this off the JS thread; only C++ state is touched here
+    try {
+      if (static_cast<int32_t>(_tokenStorage.size()) > _nSeqMax) {  // reject oversized groups before touching the context
+        SetError("_scoreGroup: input size " + std::to_string(_tokenStorage.size()) +
+                 " exceeds n_seq_max " + std::to_string(_nSeqMax));
+        return;
+      }
+
+      int32_t n_vocab = lloyal::tokenizer::vocab_size(_model);
+      size_t n = _tokenStorage.size();
+
+      _logitsStorage.resize(n);
+      std::vector<std::span<const llama_token>> spans(n);
+      std::vector<float*> outputs(n);
+      for (size_t i = 0; i < n; ++i) {
+        _logitsStorage[i].resize(n_vocab);  // one full-vocabulary row per sequence
+        spans[i] = _tokenStorage[i];
+        outputs[i] = _logitsStorage[i].data();
+      }
+
+      lloyal::logits::process_chunks(_ctx, spans, outputs, n_vocab);
+    } catch (const std::exception& e) { SetError(e.what()); }  // surfaces as a promise rejection via OnError
+  }
+
+  void OnOK() override {  // copies each logits row into a freshly allocated Float32Array
+    Napi::Env env = Env();
+    Napi::Array result = Napi::Array::New(env, _logitsStorage.size());
+    for (size_t i = 0; i < _logitsStorage.size(); ++i) {
+      auto buf = Napi::Float32Array::New(env, _logitsStorage[i].size());
+      std::memcpy(buf.Data(), _logitsStorage[i].data(),
+                  _logitsStorage[i].size() * sizeof(float));
+      result.Set(static_cast<uint32_t>(i), buf);
+    }
+    _deferred.Resolve(result);
+  }
+
+  void OnError(const Napi::Error& err) override { _deferred.Reject(err.Value()); }
+  Napi::Promise GetPromise() { return _deferred.Promise(); }
+
+private:
+  Napi::Promise::Deferred _deferred;
+  llama_context* _ctx;  // raw pointer, not owned; NOTE(review): nothing here keeps it alive while the worker is queued — confirm against dispose()
+  llama_model* _model;  // raw pointer, not owned; only used to query the vocabulary size
+  int32_t _nSeqMax;
+  std::vector<std::vector<llama_token>> _tokenStorage;
+  std::vector<std::vector<float>> _logitsStorage;
+};
+
 /**
  * AsyncWorker for JSON schema → GBNF grammar conversion
  * Pure CPU, no shared state — cleanest worker
@@ -796,6 +860,7 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     InstanceMethod("_branchClearSteer", &SessionContext::_branchClearSteer),
     InstanceMethod("_branchSetSamplerParams", &SessionContext::_branchSetSamplerParams),
     InstanceMethod("_branchSetGrammar", &SessionContext::_branchSetGrammar),
+    InstanceMethod("_branchSetGrammarLazy", &SessionContext::_branchSetGrammarLazy),
     InstanceMethod("_branchModelEntropy", &SessionContext::_branchModelEntropy),
     InstanceMethod("_branchModelSurprisal", &SessionContext::_branchModelSurprisal),
     InstanceMethod("_branchGetSamplingPerplexity", &SessionContext::_branchGetSamplingPerplexity),
@@ -808,6 +873,9 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     InstanceMethod("_storeRetainOnly", &SessionContext::_storeRetainOnly),
     InstanceMethod("_storeAvailable", &SessionContext::_storeAvailable),
 
+    // ===== SCORING API =====
+    InstanceMethod("_scoreGroup", &SessionContext::_scoreGroup),
+
     // ===== PROPERTIES =====
     InstanceAccessor("vocabSize", &SessionContext::getVocabSize, nullptr),
     InstanceAccessor("memorySize", &SessionContext::getMemorySize, nullptr)
@@ -1479,6 +1547,42 @@ Napi::Value SessionContext::kvCacheReadFile(const Napi::CallbackInfo& info) {
   return worker->GetPromise();
 }
 
+// ===== SCORING API =====
+
+Napi::Value SessionContext::_scoreGroup(const Napi::CallbackInfo& info) {  // JS: _scoreGroup(tokenArrays: number[][]) -> Promise<Float32Array[]>
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  if (info.Length() < 1 || !info[0].IsArray()) {
+    throw Napi::Error::New(env, "_scoreGroup requires (tokenArrays: number[][])");
+  }
+
+  Napi::Array jsTokenArrays = info[0].As<Napi::Array>();
+  uint32_t n = jsTokenArrays.Length();
+
+  if (n == 0) {  // empty input: resolve immediately with an empty array, no worker queued
+    auto deferred = Napi::Promise::Deferred::New(env);
+    deferred.Resolve(Napi::Array::New(env, 0));
+    return deferred.Promise();
+  }
+
+  std::vector<std::vector<llama_token>> tokenStorage(n);  // copied out of JS so the worker owns its input
+  for (uint32_t i = 0; i < n; i++) {
+    Napi::Array jsArr = jsTokenArrays.Get(i).As<Napi::Array>();  // NOTE(review): inner elements are not checked with IsArray()/IsNumber() first
+    uint32_t len = jsArr.Length();
+    tokenStorage[i].resize(len);
+    for (uint32_t j = 0; j < len; j++) {
+      tokenStorage[i][j] = static_cast<llama_token>(
+        jsArr.Get(j).As<Napi::Number>().Int32Value());
+    }
+  }
+
+  int32_t nSeqMax = static_cast<int32_t>(llama_n_seq_max(_context));  // the n > nSeqMax check itself happens in ScoreGroupWorker::Execute
+  auto* worker = new ScoreGroupWorker(env, _context, _model.get(), nSeqMax, std::move(tokenStorage));
+  worker->Queue();
+  return worker->GetPromise();
+}
+
 // ===== FACTORY FUNCTION =====
 
 Napi::Value CreateContext(const Napi::CallbackInfo& info) {
@@ -1961,6 +2065,39 @@ Napi::Value SessionContext::_branchSetGrammar(const Napi::CallbackInfo& info) {
   return env.Undefined();
 }
 
+Napi::Value SessionContext::_branchSetGrammarLazy(const Napi::CallbackInfo& info) {  // JS: (handle, grammarStr, triggerPatterns: string[], triggerTokens: number[]) -> undefined
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  if (info.Length() < 4) {  // NOTE(review): only the argument count is validated here, not the argument types
+    throw Napi::Error::New(env, "_branchSetGrammarLazy requires (handle, grammarStr, triggerPatterns, triggerTokens)");
+  }
+
+  auto handle = static_cast<lloyal::branch::BranchHandle>(info[0].As<Napi::Number>().Uint32Value());
+  std::string grammar_str = info[1].As<Napi::String>().Utf8Value();
+
+  // Extract trigger patterns (string[])
+  std::vector<std::string> trigger_patterns;
+  Napi::Array pArr = info[2].As<Napi::Array>();
+  for (uint32_t i = 0; i < pArr.Length(); i++) {
+    trigger_patterns.push_back(pArr.Get(i).As<Napi::String>().Utf8Value());
+  }
+
+  // Extract trigger tokens (number[])
+  std::vector<llama_token> trigger_tokens;
+  Napi::Array tArr = info[3].As<Napi::Array>();
+  for (uint32_t i = 0; i < tArr.Length(); i++) {
+    trigger_tokens.push_back(static_cast<llama_token>(tArr.Get(i).As<Napi::Number>().Int32Value()));
+  }
+
+  lloyal::branch::set_grammar_lazy(  // presumably arms the grammar only after a trigger pattern/token appears — confirm in liblloyal
+    handle, _model.get(), grammar_str.c_str(),
+    trigger_patterns, trigger_tokens, _branchStore
+  );
+
+  return env.Undefined();
+}
+
 // ===== BRANCH METRICS & LOGIT BIAS =====
 
 Napi::Value SessionContext::_branchModelEntropy(const Napi::CallbackInfo& info) {
diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp
index 55eb35a..6e4209a 100644
--- a/src/SessionContext.hpp
+++ b/src/SessionContext.hpp
@@ -249,6 +249,7 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   Napi::Value _branchClearSteer(const Napi::CallbackInfo& info);
   Napi::Value _branchSetSamplerParams(const Napi::CallbackInfo& info);
   Napi::Value _branchSetGrammar(const Napi::CallbackInfo& info);
+  Napi::Value _branchSetGrammarLazy(const Napi::CallbackInfo& info);
   Napi::Value _branchModelEntropy(const Napi::CallbackInfo& info);
   Napi::Value _branchModelSurprisal(const Napi::CallbackInfo& info);
   Napi::Value _branchGetSamplingPerplexity(const Napi::CallbackInfo& info);
@@ -262,6 +263,10 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   Napi::Value _storeRetainOnly(const Napi::CallbackInfo& info);
   Napi::Value _storeAvailable(const Napi::CallbackInfo& info);
 
+  // ===== SCORING API =====
+
+  Napi::Value _scoreGroup(const Napi::CallbackInfo& info);
+
 private:
   // ===== INTERNAL STATE =====
 
diff --git a/src/index.ts b/src/index.ts
index ddb43de..cb7642c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -41,6 +41,7 @@ import { Branch } from './Branch';
 import { BranchStore } from './BranchStore';
 import { Session, buildUserDelta, buildToolResultDelta } from './Session';
 import { forkAgent, runAgents } from './Agent';
+import { Rerank } from './Rerank';
 
 /**
  * Platform package naming: @lloyal-labs/lloyal.node-{platform}-{arch}[-{gpu}]
@@ -250,7 +251,7 @@ export const createContext = async (
   return binary.createContext(options);
 };
 
-export { Branch, BranchStore, Session, buildUserDelta, buildToolResultDelta, forkAgent, runAgents };
+export { Branch, BranchStore, Session, buildUserDelta, buildToolResultDelta, forkAgent, runAgents, Rerank };
 export { PoolingType, ChatFormat, ReasoningFormat, GrammarTriggerType } from './types';
 export type {
   GpuVariant,
@@ -275,5 +276,8 @@ export type {
   AgentState,
   RunAgentsOptions,
   RunAgentsResult,
+  RerankOptions,
+  RerankResult,
+  RerankProgress,
   NativeBinding,
 } from './types';
diff --git a/src/types.ts b/src/types.ts
index aa6a4a6..3eb2211 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1392,6 +1392,9 @@ export interface SessionContext {
   /** @internal */
   _branchSetGrammar(handle: number, grammarStr: string): void;
 
+  /** @internal */
+  _branchSetGrammarLazy(handle: number, grammar: string, patterns: string[], tokens: number[]): void;
+
   /** @internal */
   _branchModelEntropy(handle: number, base?: string): number;
 
@@ -1420,6 +1423,11 @@ export interface SessionContext {
 
   /** @internal */
   _storeAvailable(): number;
+
+  // ===== SCORING API =====
+
+  /** @internal — processes ≤ n_seq_max prompts in a single group */
+  _scoreGroup(tokenArrays: number[][]): Promise<Float32Array[]>;
 }
 
 /**
@@ -1468,12 +1476,15 @@ export interface AgentState {
   branch: Branch;
   /** Tokens to prefill before the loop starts */
   suffixTokens: number[];
-  /** Format metadata for parseChatOutput */
+  /** Format metadata for parseChatOutput + grammar constraints */
   fmt: {
     format: ChatFormat;
     reasoningFormat: ReasoningFormat;
     thinkingForcedOpen: boolean;
     parser: string;
+    grammar: string;
+    grammarLazy: boolean;
+    grammarTriggers: GrammarTrigger[];
   };
   /** Accumulated raw output text */
   rawOutput: string;
@@ -1485,7 +1496,7 @@ export interface AgentState {
   toolCallCount: number;
   /** Number of tool-call turns completed */
   turns: number;
-  /** Final findings (set by report tool or fallback content) */
+  /** Final findings (set by report tool, or extracted from content if agent researched but didn't report) */
   findings: string | null;
 }
 
@@ -1500,13 +1511,21 @@ export interface RunAgentsOptions {
   /** SessionContext for parseChatOutput, formatChat, tokenize */
   ctx: SessionContext;
   /** Tool executor — consumer wraps with locks as needed */
-  executeTool: (name: string, args: Record<string, unknown>) => Promise<unknown>;
+  executeTool: (
+    name: string,
+    args: Record<string, unknown>,
+    context?: { onProgress?: (p: { filled: number; total: number }) => void },
+  ) => Promise<unknown>;
   /** Maximum tool-call turns per agent (default: 6) */
   maxTurns?: number;
+  /** Called when an agent produces a token (agentId = branch handle) */
+  onProduce?: (agentId: number, text: string, tokenCount: number) => void;
   /** Called when an agent dispatches a tool call (agentId = branch handle) */
   onToolCall?: (agentId: number, toolName: string, args: string) => void;
   /** Called when a tool result returns (agentId = branch handle) */
   onToolResult?: (agentId: number, toolName: string, resultStr: string) => void;
+  /** Called during tool execution with intermediate progress */
+  onToolProgress?: (agentId: number, toolName: string, progress: { filled: number; total: number }) => void;
   /** Called when an agent submits a report (agentId = branch handle) */
   onReport?: (agentId: number, findings: string) => void;
 }
@@ -1533,6 +1552,47 @@ export interface RunAgentsResult {
   };
 }
 
+/**
+ * Options for Rerank context creation (consumed by Rerank.create())
+ * @category Core
+ */
+export interface RerankOptions {
+  /** Path to reranker .gguf model */
+  modelPath: string;
+  /** Max prompts per GPU dispatch (default: 8) */
+  nSeqMax?: number;
+  /** Context window size in tokens (default: 4096); score() budgets floor(nCtx / nSeqMax) tokens per prompt */
+  nCtx?: number;
+  /** KV cache key quantization (default: 'q4_0') */
+  typeK?: KvCacheType;
+  /** KV cache value quantization (default: 'q4_0') */
+  typeV?: KvCacheType;
+}
+
+/**
+ * A single rerank result — score for one document
+ * @category Core
+ */
+export interface RerankResult {
+  /** Relevance probability (0–1), rounded to 3 decimal places */
+  score: number;
+  /** Index of the document in the `documents` array passed to score() */
+  index: number;
+}
+
+/**
+ * Progress yielded by Rerank.score() after each scoring group completes
+ * @category Core
+ */
+export interface RerankProgress {
+  /** Number of documents scored so far */
+  filled: number;
+  /** Total documents to score */
+  total: number;
+  /** Results sorted by descending score, truncated to topK when given — partial until filled === total */
+  results: RerankResult[];
+}
+
 /**
  * Native binding interface — what loadBinary() returns
  *
diff --git a/test/integration.ts b/test/integration.ts
index 5bd4a6a..42b042c 100644
--- a/test/integration.ts
+++ b/test/integration.ts
@@ -10,11 +10,14 @@
  *
  * Optional embedding tests:
  *   LLAMA_EMBED_MODEL=models/nomic-embed-text-v1.5.Q4_K_M.gguf npm run test:integration
+ *
+ * Optional rerank tests:
+ *   LLAMA_RERANK_MODEL=models/qwen3-reranker-0.6b-q4_k_m.gguf npm run test:integration
  */
 
 import * as path from 'node:path';
 import * as fs from 'node:fs';
-import { loadBinary, Branch, BranchStore } from '../dist/index.js';
+import { loadBinary, Branch, BranchStore, Rerank } from '../dist/index.js';
 import type { SessionContext, NativeBinding, FormattedChatResult, Produced } from '../dist/index.js';
 
 const MODEL_PATH: string = process.env.LLAMA_TEST_MODEL
@@ -24,6 +27,10 @@ const EMBED_MODEL_PATH: string | null = process.env.LLAMA_EMBED_MODEL ||
   (fs.existsSync(path.join(__dirname, '../models/nomic-embed-text-v1.5.Q4_K_M.gguf'))
     ? path.join(__dirname, '../models/nomic-embed-text-v1.5.Q4_K_M.gguf')
     : null);
+const RERANK_MODEL_PATH: string | null = process.env.LLAMA_RERANK_MODEL ||
+  (fs.existsSync(path.join(__dirname, '../models/qwen3-reranker-0.6b-q4_k_m.gguf'))
+    ? path.join(__dirname, '../models/qwen3-reranker-0.6b-q4_k_m.gguf')
+    : null);
 
 const CTX_SIZE: number = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
 
@@ -1888,6 +1895,221 @@ async function testBranchMetrics(): Promise<void> {
   }
 }
 
+// ═══════════════════════════════════════════════════════════════════════════
+// RERANK TESTS (optional)
+// ═══════════════════════════════════════════════════════════════════════════
+
+async function testRerank(): Promise<void> { // end-to-end check of Rerank.create/score/tokenize on a four-document corpus
+  if (!RERANK_MODEL_PATH) {
+    console.log('\n--- Rerank (SKIPPED - no LLAMA_RERANK_MODEL) ---');
+    return;
+  }
+
+  console.log('\n--- Rerank ---');
+  console.log(`  Model: ${path.basename(RERANK_MODEL_PATH)}`);
+
+  const rerank = await Rerank.create({ modelPath: RERANK_MODEL_PATH });
+
+  try {
+    // Tokenize documents (score() takes token arrays, not strings)
+    const query = 'What is the capital of France?';
+    const docs = [
+      'Berlin is the capital of Germany and its largest city.',
+      'Paris is the capital and most populous city of France.',
+      'The Amazon rainforest produces about 20% of the world\'s oxygen.',
+      'France is a country in Western Europe, with its capital being Paris.',
+    ];
+    const tokenized: number[][] = await Promise.all(docs.map(d => rerank.tokenize(d)));
+
+    // Score all documents — drain async iterable to final progress
+    let results!: { score: number; index: number }[];
+    let progressCount = 0;
+    for await (const p of rerank.score(query, tokenized)) {
+      progressCount++;
+      results = p.results;
+    }
+    assert(progressCount > 0, `rerank: received progress updates (got ${progressCount})`);
+
+    // All results returned (no topK)
+    assert(results.length === docs.length,
+      `rerank: returns all ${docs.length} results when no topK`);
+
+    // Scores are valid probabilities (0-1) and sorted descending
+    for (let i = 0; i < results.length; i++) {
+      assert(results[i].score >= 0 && results[i].score <= 1,
+        `rerank: score[${i}] = ${results[i].score} is in [0, 1]`);
+      assert(Number.isInteger(results[i].index) && results[i].index >= 0 && results[i].index < docs.length,
+        `rerank: index[${i}] = ${results[i].index} is valid`);
+      if (i > 0) {
+        assert(results[i].score <= results[i - 1].score,
+          `rerank: sorted descending (${results[i - 1].score} >= ${results[i].score})`);
+      }
+    }
+
+    // Semantic: Paris docs (index 1, 3) should rank above Amazon doc (index 2)
+    const topIndices = results.slice(0, 2).map(r => r.index);
+    assert(topIndices.includes(1) || topIndices.includes(3),
+      `rerank: a Paris doc in top 2 (top indices: [${topIndices}])`);
+
+    const amazonRank = results.findIndex(r => r.index === 2);
+    assert(amazonRank >= 2,
+      `rerank: Amazon doc not in top 2 (rank: ${amazonRank})`);
+    ok(`rerank: semantic ordering correct (top: [${topIndices}], amazon rank: ${amazonRank})`);
+
+    // topK parameter
+    let top2!: { score: number; index: number }[];
+    for await (const p of rerank.score(query, tokenized, 2)) { top2 = p.results; }
+    assert(top2.length === 2, `rerank: topK=2 returns 2 results`);
+    assert(top2[0].score === results[0].score && top2[0].index === results[0].index,
+      `rerank: topK=2 matches top of full results`);
+
+    // tokenize() produces consistent output
+    const tokens1 = await rerank.tokenize('hello');
+    const tokens2 = await rerank.tokenize('hello');
+    assert(tokens1.length === tokens2.length && tokens1.every((t, i) => t === tokens2[i]),
+      `rerank: tokenize() is deterministic`);
+  } finally {
+    rerank.dispose();
+  }
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// LARGE CORPUS RERANK — >n_seq_max documents, split into groups by the Rerank queue
+// ═══════════════════════════════════════════════════════════════════════════
+
+async function testRerankLargeCorpus(): Promise<void> { // scores more documents than n_seq_max in one score() call and checks incremental progress
+  if (!RERANK_MODEL_PATH) {
+    console.log('\n--- Rerank Large Corpus (SKIPPED - no LLAMA_RERANK_MODEL) ---');
+    return;
+  }
+
+  console.log('\n--- Rerank Large Corpus ---');
+  console.log(`  Model: ${path.basename(RERANK_MODEL_PATH)}`);
+
+  // n_seq_max=8 so 20 documents requires 3 groups (8+8+4)
+  const rerank = await Rerank.create({ modelPath: RERANK_MODEL_PATH, nSeqMax: 8 });
+
+  try {
+    const query = 'What is the capital of France?';
+    const relevantDoc = 'Paris is the capital and most populous city of France.';
+
+    // Build 20 documents: 1 relevant + 19 irrelevant
+    const docTexts: string[] = [
+      relevantDoc,
+      'The Amazon rainforest produces about 20% of the world\'s oxygen.',
+      'Berlin is the capital of Germany and its largest city.',
+      'The Great Wall of China is over 13,000 miles long.',
+      'Tokyo is the most populous metropolitan area in the world.',
+      'The Sahara Desert is the largest hot desert in the world.',
+      'Mount Everest is the highest mountain above sea level.',
+      'The Pacific Ocean is the largest and deepest ocean.',
+      'Antarctica is the coldest continent on Earth.',
+      'The Nile is traditionally considered the longest river.',
+      'Australia is both a country and a continent.',
+      'The human body contains approximately 206 bones.',
+      'Jupiter is the largest planet in our solar system.',
+      'The speed of light is approximately 299,792 kilometers per second.',
+      'DNA was first identified by Friedrich Miescher in 1869.',
+      'The International Space Station orbits Earth every 90 minutes.',
+      'Honey never spoils due to its low moisture content.',
+      'Venice is built on more than 100 small islands.',
+      'The deepest point in the ocean is the Mariana Trench.',
+      'Photosynthesis converts carbon dioxide and water into glucose.',
+    ];
+
+    const tokenized: number[][] = await Promise.all(docTexts.map(d => rerank.tokenize(d)));
+    assert(tokenized.length === 20, `large corpus: 20 documents tokenized`);
+
+    // Single score() call — drain iterable, verify incremental progress
+    let results!: { score: number; index: number }[];
+    let progressCount = 0;
+    for await (const p of rerank.score(query, tokenized)) {
+      progressCount++;
+      assert(p.total === 20, `large corpus: total is 20 (got ${p.total})`);
+      assert(p.filled <= p.total, `large corpus: filled ${p.filled} <= total ${p.total}`);
+      results = p.results;
+    }
+    assert(progressCount >= 3, `large corpus: ≥3 progress updates for 20 docs / n_seq_max=8 (got ${progressCount})`);
+
+    assert(results.length === 20, `large corpus: all 20 results returned`);
+
+    // Scores sorted descending
+    for (let i = 1; i < results.length; i++) {
+      assert(results[i].score <= results[i - 1].score,
+        `large corpus: sorted descending at index ${i}`);
+    }
+
+    // Relevant doc (index 0) should rank in top 3
+    const relevantRank = results.findIndex(r => r.index === 0);
+    assert(relevantRank < 3,
+      `large corpus: relevant doc ranks ${relevantRank} (expected < 3)`);
+
+    // topK across group boundary
+    let top5!: { score: number; index: number }[];
+    for await (const p of rerank.score(query, tokenized, 5)) { top5 = p.results; }
+    assert(top5.length === 5, `large corpus: topK=5 returns 5 results`);
+    assert(top5[0].score === results[0].score && top5[0].index === results[0].index,
+      `large corpus: topK=5 top result matches full ranking`);
+
+    ok(`large corpus: 20 docs with n_seq_max=8 → relevant doc at rank ${relevantRank}`);
+  } finally {
+    rerank.dispose();
+  }
+}
+
+async function testRerankConcurrent(): Promise<void> { // two overlapping score() calls on one Rerank must each receive their own complete ranking
+  if (!RERANK_MODEL_PATH) {
+    console.log('\n--- Rerank Concurrent (SKIPPED - no LLAMA_RERANK_MODEL) ---');
+    return;
+  }
+
+  console.log('\n--- Rerank Concurrent ---');
+
+  const rerank = await Rerank.create({ modelPath: RERANK_MODEL_PATH, nSeqMax: 4 }); // 2 x 6 prompts with nSeqMax=4, so groups mix both requests
+
+  try {
+    const docs = [
+      'Paris is the capital of France.',
+      'Machine learning is a branch of artificial intelligence.',
+      'The sun is a star at the center of the solar system.',
+      'Deep learning uses neural networks with many layers.',
+      'London is the capital of the United Kingdom.',
+      'Gradient descent is an optimization algorithm.',
+    ];
+    const tokenized: number[][] = await Promise.all(docs.map(d => rerank.tokenize(d)));
+
+    // Drain helper — collects final results from score() async iterable
+    async function drain(iter: AsyncIterable<{ results: { score: number; index: number }[] }>)
+        : Promise<{ score: number; index: number }[]> {
+      let last!: { score: number; index: number }[];
+      for await (const p of iter) last = p.results;
+      return last;
+    }
+
+    // Fire both score calls concurrently — exercises the queue's round-robin
+    const [r1, r2] = await Promise.all([
+      drain(rerank.score('What is the capital of France?', tokenized)),
+      drain(rerank.score('What is machine learning?', tokenized)),
+    ]);
+
+    assert(r1.length === docs.length, 'concurrent: caller 1 gets all results');
+    assert(r2.length === docs.length, 'concurrent: caller 2 gets all results');
+
+    // Paris doc (index 0) should rank high for query 1
+    assert(r1[0].index === 0 || r1[1].index === 0,
+      `concurrent: Paris doc in top 2 for query 1 (got [${r1.slice(0, 2).map(r => r.index)}])`);
+
+    // ML docs (index 1 or 3 or 5) should rank high for query 2
+    const top2q2 = r2.slice(0, 2).map(r => r.index);
+    assert(top2q2.includes(1) || top2q2.includes(3) || top2q2.includes(5),
+      `concurrent: ML doc in top 2 for query 2 (got [${top2q2}])`);
+
+    ok(`concurrent: two callers scored ${docs.length} docs each with independent results`);
+  } finally {
+    rerank.dispose();
+  }
+}
+
 // ═══════════════════════════════════════════════════════════════════════════
 // MAIN
 // ═══════════════════════════════════════════════════════════════════════════
@@ -1932,6 +2154,9 @@ async function main(): Promise<void> {
     await testSetSamplerParams();
     await testSetGrammar();
     await testBranchMetrics();
+    await testRerank();
+    await testRerankLargeCorpus();
+    await testRerankConcurrent();
     await testEmbeddings();
 
     // Summary

From 453c24c3f25433aaa682af28ebe71a6b53d3186b Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Sat, 28 Feb 2026 01:34:14 +1100
Subject: [PATCH 09/17] feat(agents): deep research example

---
 examples/deep-research/deep-research.ts   | 308 +++++++++++++++-------
 examples/deep-research/display.ts         |  20 +-
 examples/deep-research/reranker.ts        | 115 +++-----
 examples/deep-research/resources/files.ts |  29 ++
 examples/deep-research/tasks/research.md  |   2 +-
 examples/deep-research/tasks/research.ts  |  15 +-
 examples/deep-research/tools/grep.ts      |  67 +++++
 examples/deep-research/tools/index.ts     |   6 +-
 examples/deep-research/tools/search.ts    |  11 +-
 examples/deep-research/tools/types.ts     |  19 +-
 liblloyal                                 |   2 +-
 11 files changed, 401 insertions(+), 193 deletions(-)
 create mode 100644 examples/deep-research/tools/grep.ts

diff --git a/examples/deep-research/deep-research.ts b/examples/deep-research/deep-research.ts
index 43636e0..715567e 100644
--- a/examples/deep-research/deep-research.ts
+++ b/examples/deep-research/deep-research.ts
@@ -20,7 +20,7 @@ import * as fs from 'node:fs';
 import * as path from 'node:path';
 import * as readline from 'node:readline';
 import { createContext, BranchStore, Session } from '../../dist/index.js';
-import { c, log, emit, setJsonlMode, pad, fmtSize } from './display.js';
+import { c, log, emit, setJsonlMode, status, statusClear, pad, fmtSize } from './display.js';
 import { loadResources, chunkResources } from './resources/files.js';
 import { createReranker } from './reranker.js';
 import { createTools } from './tools/index.js';
@@ -109,13 +109,13 @@ async function main(): Promise<void> {
   const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '16384', 10);
   const ctx = await createContext({
     modelPath, nCtx,
-    nSeqMax: AGENT_COUNT + 1,
+    nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) + 1,
     typeK: 'q4_0', typeV: 'q4_0',
   });
 
   log(`  ${c.green}●${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${rerankSize}, reranker)${c.reset}`);
 
-  const reranker = await createReranker(rerankModelPath, { nSeqMax: AGENT_COUNT });
+  const reranker = await createReranker(rerankModelPath, { nSeqMax: 8, nCtx: 4096 });
   await reranker.tokenizeChunks(chunks);
 
   const corpusIsFile = resources.length === 1 && fs.statSync(corpusDir!).isFile();
@@ -128,24 +128,127 @@ async function main(): Promise<void> {
   const store = new BranchStore(ctx);
   const session = new Session({ ctx, store });
 
-  // Tool call display — shared across cold + warm paths
-  // agentId = branch handle — stable identifier, useful for C++/KV-level debugging
+  // ── Agent labels + status line ──────────────────────────
+  const agentLabel = new Map<number, string>();  // agentId → short display label ("A0", "A1", …), assigned in order of first appearance
+  let nextLabel = 0;  // numeric suffix for the next agentId that has no label yet
+  function label(agentId: number): string {  // returns the agent's label, minting and caching one on first use
+    let l = agentLabel.get(agentId);
+    if (!l) { l = `A${nextLabel++}`; agentLabel.set(agentId, l); }
+    return l;
+  }
+  const agentText = new Map<number, string>();    // accumulated raw text per agent
+  function resetLabels(): void { nextLabel = 0; agentLabel.clear(); agentStatus.clear(); agentText.clear(); }  // wipes labels, statuses and buffered text so numbering restarts at A0; agentStatus is declared just below, so this must not run before that line has executed
+
+  const agentStatus = new Map<number, { state: string; tokenCount: number; detail: string }>();  // per-agent status: state is 'gen', 'done' or a tool name; detail is '' or "filled/total" progress text
+
+  function renderStatus(): void {  // redraws the single rewritable status line from agentStatus / agentText
+    const active = [...agentStatus.entries()].filter(([, s]) => s.state !== 'done');
+    if (active.length === 0) return;  // every agent is done (or none started) — leave the terminal untouched
+
+    const generating = active.filter(([, s]) => s.state === 'gen');
+
+    // Single agent generating → stream text on status line (rewritable — clears
+    // when tool call fires or agent finishes)
+    if (generating.length === 1 && active.length === 1) {
+      const [id] = generating[0];
+      const raw = (agentText.get(id) ?? '').replace(/\n/g, ' ').trimStart();  // flatten newlines so the text stays on one row
+      const cols = process.stdout.columns || 80;  // falls back to 80 when the width is unavailable
+      const maxLen = cols - 12;  // "    ◆ A0 " prefix ≈ 9 visible chars + margin
+      const text = raw.length > maxLen ? raw.slice(raw.length - maxLen) : raw;  // keep the tail so the newest text stays visible
+      status(`    ${c.dim}\u25c6${c.reset} ${c.yellow}${label(id)}${c.reset} ${text}`);
+      return;
+    }
+
+    // Multi-agent: compact counters
+    const parts = active.map(([id, s]) => {
+      const lbl = `${c.yellow}${label(id)}${c.reset}`;
+      if (s.state === 'gen') return `${lbl}: ${s.tokenCount} tok`;  // generating agents show their running token count
+      const detail = s.detail ? ` ${s.detail}` : '';
+      return `${lbl}: ${c.cyan}${s.state}${c.reset}${detail}`;  // any other state is a tool name, optionally followed by progress detail
+    });
+    status(`    ${c.dim}\u25c6${c.reset} ${parts.join('  ')}`);
+  }
+
+  // ── Callbacks — shared across cold + warm paths ────────
+  const onProduce = (agentId: number, text: string, tokenCount: number): void => {
+    agentText.set(agentId, (agentText.get(agentId) ?? '') + text);  // append the new text to this agent's buffer
+    agentStatus.set(agentId, { state: 'gen', tokenCount, detail: '' });  // mark the agent as generating, with the token count passed in
+    renderStatus();  // refresh the status line after every update
+  };
+
+  const onToolProgress = (agentId: number, toolName: string, p: { filled: number; total: number }): void => {
+    agentStatus.set(agentId, { state: toolName, tokenCount: 0, detail: `${p.filled}/${p.total}` });  // state becomes the tool name; detail carries "filled/total" progress
+    renderStatus();  // refresh the status line with the new progress
+  };
+
   const onToolCall = (agentId: number, toolName: string, argsStr: string): void => {
+    agentText.delete(agentId);  // this generation led to a parsed tool call — clear
+    agentStatus.set(agentId, { state: toolName, tokenCount: 0, detail: '' });  // status now shows the tool name; no renderStatus() here — the log() below clears the status line
     emit('tool_call', { agentId, toolName, arguments: argsStr });
     let toolArgs: Record<string, string>;
     try { toolArgs = JSON.parse(argsStr); } catch { toolArgs = {}; }
     const argSummary = toolName === 'search'
       ? `"${toolArgs.query || ''}"`
+      : toolName === 'grep'  // NOTE(review): if argsStr parses to a non-object (e.g. "null"), the property reads in this expression throw — confirm callers always pass a JSON object
+      ? `/${toolArgs.pattern || ''}/`  // grep: show the pattern between slashes
       : toolName === 'report' ? ''
       : `${toolArgs.filename}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : '');
-    log(`    ${c.dim}├${c.reset} ${c.yellow}${agentId}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+    log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(agentId)}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);  // uses the short agent label rather than the raw id
   };
+
   const onToolResult = (agentId: number, toolName: string, resultStr: string): void => {
     emit('tool_result', {
       agentId, toolName,
       result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr,
     });
-    log(`    ${c.dim}├${c.reset} ${c.yellow}${agentId}${c.reset} ${c.dim}← ${toolName} ${resultStr.length}b${c.reset}`);
+    let preview = '';  // optional " · …" summary appended to the log line; stays empty when the payload has an unexpected shape
+    if (toolName === 'read_file') {
+      try {
+        const firstLine = (JSON.parse(resultStr) as { content: string }).content.split('\n').find(l => l.trim());  // first non-blank line of the file content
+        if (firstLine) preview = ` · ${firstLine.trim().slice(0, 60)}${firstLine.trim().length > 60 ? '\u2026' : ''}`;
+      } catch { /* non-fatal */ }
+    } else if (toolName === 'search') {
+      try {
+        const top = (JSON.parse(resultStr) as { heading: string }[])[0];  // first entry of the result array
+        if (top?.heading) preview = ` · ${top.heading}`;
+      } catch { /* non-fatal */ }
+    } else if (toolName === 'grep') {
+      try {
+        const r = JSON.parse(resultStr) as { totalMatches?: number; matchingLines?: number } | null;  // grep error results ({ error }) carry neither count
+        if (typeof r?.totalMatches === 'number' && typeof r.matchingLines === 'number') preview = ` · ${r.totalMatches} matches in ${r.matchingLines} lines`;  // skip the preview instead of printing "undefined matches"
+      } catch { /* non-fatal */ }
+    }
+    log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(agentId)}${c.reset} ${c.dim}\u2190 ${toolName} ${resultStr.length}b${preview}${c.reset}`);
+  };
+
+  const onReport = (agentId: number, findings: string): void => {  // prints an agent's findings as a word-wrapped, tree-indented block
+    agentStatus.set(agentId, { state: 'done', tokenCount: 0, detail: '' });  // 'done' agents are excluded from the status line
+    const cols = process.stdout.columns || 80;  // falls back to 80 when the width is unavailable
+    const lbl = `${c.yellow}${label(agentId)}${c.reset}`;
+    const prefix = `    ${c.dim}\u2502${c.reset}   `;
+    // visible width: "    │   " = 8 chars
+    const wrap = cols - 8;  // maximum visible characters of findings text per row
+
+    log(`    ${c.dim}\u2502${c.reset}`);
+    log(`    ${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`);
+
+    // Word-wrap findings, preserve paragraph breaks
+    for (const para of findings.split('\n')) {
+      if (!para.trim()) { log(prefix); continue; }  // blank input line → blank prefixed row
+      const words = para.split(/\s+/);
+      let line = '';  // row being assembled
+      for (const word of words) {
+        if (line && line.length + 1 + word.length > wrap) {  // adding " word" would overflow: flush the row first
+          log(`${prefix}${c.dim}${line}${c.reset}`);
+          line = word;  // a single word longer than wrap is not split; it gets a row of its own
+        } else {
+          line = line ? `${line} ${word}` : word;
+        }
+      }
+      if (line) log(`${prefix}${c.dim}${line}${c.reset}`);  // flush the last partial row of the paragraph
+    }
+
+    log(`    ${c.dim}\u2502${c.reset}`);
   };
 
   // ================================================================
@@ -156,62 +259,78 @@ async function main(): Promise<void> {
   // ================================================================
 
   async function handleQuery(query: string): Promise<void> {
-    if (!session.trunk) {
-      // ─── cold: plan → research → verify → eval ─────────
-      const t0 = performance.now();
+    const t0 = performance.now();
+    const warm = !!session.trunk;
 
+    if (!warm) {
       emit('start', {
         model: path.basename(modelPath), reranker: path.basename(rerankModelPath),
         query, agentCount: AGENT_COUNT, verifyCount: VERIFY_COUNT, chunks: chunks.length,
       });
-
       log();
       log(`  ${c.dim}Query${c.reset}`);
       log(`  ${c.bold}${query}${c.reset}`);
+    }
 
-      // ─── query → questions ────────────────────────────
-      let t = performance.now();
-
-      const { questions, tokenCount: planTokens } = await plan(ctx, {
-        query, agentCount: AGENT_COUNT,
-      });
-
-      emit('plan', { questions, planTokens });
-      const planMs = performance.now() - t;
-      log(`\n  ${c.green}●${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${planTokens} tok · ${(planMs / 1000).toFixed(1)}s${c.reset}`);
-      questions.forEach((q, i) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
-
-      // ─── questions → findings ─────────────────────────
-      t = performance.now();
-      log(`\n  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${questions.length} agents${c.reset}`);
-
-      const researchResult = await research(ctx, store, {
-        questions, toolsJson, executeTool,
-        maxTurns: MAX_TOOL_TURNS, onToolCall, onToolResult,
-      });
-
-      const researchMs = performance.now() - t;
-      researchResult.agents.forEach((a, i) => {
-        const tree = i === researchResult.agents.length - 1 ? '└' : '├';
-        emit('agent_done', { index: i, question: questions[i], findings: (a.findings || '').slice(0, 500), toolCalls: a.toolCallCount, tokenCount: a.tokenCount });
-        log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${i}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok · ${a.toolCallCount} tools${c.reset}`);
-      });
-      log(`    ${c.dim}${researchResult.totalTokens} tok · ${researchResult.totalToolCalls} tools · ${(researchMs / 1000).toFixed(1)}s${c.reset}`);
-
-      // ─── findings → attempts ──────────────────────────
+    // ─── plan ─────────────────────────────────────────────
+    let t = performance.now();
+    const { questions, tokenCount: planTokens } = await plan(ctx, {
+      query, agentCount: AGENT_COUNT,
+      ...(warm && { parent: session.trunk! }),
+    });
+    const planMs = performance.now() - t;
+
+    if (!warm) emit('plan', { questions, planTokens });
+    log(`\n  ${c.green}●${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${planTokens} tok · ${(planMs / 1000).toFixed(1)}s${c.reset}`);
+    questions.forEach((q, i) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
+
+    // ─── research ─────────────────────────────────────────
+    t = performance.now();
+    log(`\n  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${questions.length} agents${c.reset}`);
+
+    resetLabels();
+    const researchResult = await research(ctx, store, {
+      questions, toolsJson, executeTool,
+      maxTurns: MAX_TOOL_TURNS,
+      onProduce, onToolCall, onToolResult, onToolProgress, onReport,
+      ...(warm && { parent: session.trunk!, seed: Date.now() }),
+    });
+    statusClear();
+    const researchMs = performance.now() - t;
+
+    researchResult.agents.forEach((a, i) => {
+      const tree = i === researchResult.agents.length - 1 ? '└' : '├';
+      emit('agent_done', { index: i, question: questions[i], findings: (a.findings || '').slice(0, 500), toolCalls: a.toolCallCount, tokenCount: a.tokenCount });
+      // Show remaining accumulated text — unparsed tool calls, reasoning, etc.
+      // (agentText is cleared by onToolCall on successful parse, so only failed-parse text remains)
+      const raw = (agentText.get(a.agentId) ?? '').replace(/\n/g, ' ').trim();
+      if (raw) log(`    ${c.dim}├${c.reset} ${c.yellow}${label(a.agentId)}${c.reset} ${c.dim}▸ ${raw.slice(0, 120)}${raw.length > 120 ? '…' : ''}${c.reset}`);
+      log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${label(a.agentId)}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok · ${a.toolCallCount} tools${c.reset}`);
+    });
+    log(`    ${c.dim}${researchResult.totalTokens} tok · ${researchResult.totalToolCalls} tools · ${(researchMs / 1000).toFixed(1)}s${c.reset}`);
+
+    // ─── post-research: verify+eval (cold) or generate (warm) ─
+    const phases: { label: string; tokens: number; detail: string; timeMs: number }[] = [
+      { label: 'Plan', tokens: planTokens, detail: '', timeMs: planMs },
+      {
+        label: 'Research', tokens: researchResult.totalTokens,
+        detail: `(${researchResult.agents.map(a => a.tokenCount).join(' + ')})  ${pad(researchResult.totalToolCalls, 2)} tools`,
+        timeMs: researchMs,
+      },
+    ];
+    let kvLine: string | null = null;
+
+    if (!warm) {
+      // ── verify ──────────────────────────────────────────
       t = performance.now();
-
       const findingsText = researchResult.agents
         .map((a, i) => `Q: ${questions[i]}\nA: ${(a.findings || '').trim()}`)
         .join('\n\n');
 
       log(`\n  ${c.green}●${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${VERIFY_COUNT} attempts${c.reset}`);
-
-      const verifyResult = await verify(ctx, store, {
-        findings: findingsText, query, count: VERIFY_COUNT,
-      });
-
+      const verifyResult = await verify(ctx, store, { findings: findingsText, query, count: VERIFY_COUNT });
       const verifyMs = performance.now() - t;
+
       verifyResult.attempts.forEach((a, i) => {
         const tree = i === verifyResult.attempts.length - 1 ? '└' : '├';
         emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl });
@@ -219,29 +338,32 @@ async function main(): Promise<void> {
       });
       log(`    ${c.dim}${verifyResult.totalTokens} tok · ${(verifyMs / 1000).toFixed(1)}s${c.reset}`);
 
-      // ─── attempts → convergence ───────────────────────
+      // ── eval ────────────────────────────────────────────
       t = performance.now();
-
-      const { converged, tokenCount: evalTokens } = await evaluate(ctx, {
-        attempts: verifyResult.attempts,
-      });
-
+      const { converged, tokenCount: evalTokens } = await evaluate(ctx, { attempts: verifyResult.attempts });
       const evalMs = performance.now() - t;
+
       emit('convergence', { converged, evalTokens });
       const verdict = converged === true ? `${c.green}yes${c.reset}` : converged === false ? `${c.red}no${c.reset}` : `${c.yellow}unknown${c.reset}`;
       log(`\n  ${c.green}●${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${evalTokens} tok · ${(evalMs / 1000).toFixed(1)}s${c.reset}`);
       log(`    Converged: ${verdict}`);
 
-      // ─── result ───────────────────────────────────────
-      const tEnd = performance.now();
-      const totalTokens = planTokens + researchResult.totalTokens + verifyResult.totalTokens + evalTokens;
-
+      // ── answer ──────────────────────────────────────────
       log(`\n  ${c.dim}${'─'.repeat(58)}${c.reset}\n`);
       const prose = verifyResult.bestOutput.trim()
         .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`)
         .split('\n').map((l) => `  ${l}`).join('\n');
       log(prose);
 
+      phases.push(  // stats-table rows for the two phases that only run on the cold path
+        { label: 'Verify', tokens: verifyResult.totalTokens, detail: `(${verifyResult.attempts.map(a => a.tokenCount).join(' + ')})`, timeMs: verifyMs },
+        { label: 'Eval', tokens: evalTokens, detail: `converged: ${converged === true ? 'yes' : converged === false ? 'no' : 'unknown'}`, timeMs: evalMs },  // tri-state, same mapping as the verdict line logged after evaluate(); an undetermined result is no longer reported as "no"
+      );
+
+      const kvSaved = researchResult.sharedPrefixLength * (questions.length - 1)
+        + verifyResult.prefixLength * (verifyResult.attempts.length - 1);
+      kvLine = `  ${c.dim}KV shared    ${researchResult.sharedPrefixLength} × ${questions.length - 1} + ${verifyResult.prefixLength} × ${verifyResult.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved${c.reset}`;
+
       emit('complete', {
         planTokens, agentTokens: researchResult.totalTokens,
         researchSteps: researchResult.steps,
@@ -251,66 +373,56 @@ async function main(): Promise<void> {
         prefixTokens: verifyResult.prefixLength,
         sharedPrefixTokens: researchResult.sharedPrefixLength,
         agentCount: questions.length, attemptCount: verifyResult.attempts.length,
-        wallTimeMs: Math.round(tEnd - t0),
+        wallTimeMs: Math.round(performance.now() - t0),
         planMs: Math.round(planMs), researchMs: Math.round(researchMs),
         verifyMs: Math.round(verifyMs), evalMs: Math.round(evalMs),
         ...researchResult.counters,
       });
 
-      log(`\n  ${c.dim}${'━'.repeat(58)}${c.reset}`);
-      log(`  ${c.dim}Plan       ${pad(planTokens, 5)} tok${' '.repeat(30)}${pad((planMs / 1000).toFixed(1), 6)}s${c.reset}`);
-      log(`  ${c.dim}Research   ${pad(researchResult.totalTokens, 5)} tok  (${researchResult.agents.map((a) => a.tokenCount).join(' + ')})  ${pad(researchResult.totalToolCalls, 2)} tools  ${pad((researchMs / 1000).toFixed(1), 6)}s${c.reset}`);
-      log(`  ${c.dim}Verify     ${pad(verifyResult.totalTokens, 5)} tok  (${verifyResult.attempts.map((a) => a.tokenCount).join(' + ')})${' '.repeat(11)}${pad((verifyMs / 1000).toFixed(1), 6)}s${c.reset}`);
-      log(`  ${c.dim}Eval       ${pad(evalTokens, 5)} tok  converged: ${converged ? 'yes' : 'no'}${' '.repeat(11)}${pad((evalMs / 1000).toFixed(1), 6)}s${c.reset}`);
-      const kvSaved = researchResult.sharedPrefixLength * (questions.length - 1) + verifyResult.prefixLength * (verifyResult.attempts.length - 1);
-      log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
-      log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok  ${c.dim}${questions.length} agents · ${researchResult.totalToolCalls} tools${c.reset}         ${c.bold}${pad(((tEnd - t0) / 1000).toFixed(1), 6)}s${c.reset}`);
-      log(`  ${c.dim}KV shared    ${researchResult.sharedPrefixLength} × ${questions.length - 1} + ${verifyResult.prefixLength} × ${verifyResult.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved${c.reset}`);
-      log();
-
       await session.promote(verifyResult.bestBranch);
     } else {
-      // ─── warm: plan → research → findings → grounded response ─
-      const { questions, tokenCount: planTokens } = await plan(ctx, {
-        query, agentCount: AGENT_COUNT,
-        parent: session.trunk!,
-      });
-
-      log(`\n  ${c.green}●${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${planTokens} tok${c.reset}`);
-      questions.forEach((q, i) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
-
-      log(`\n  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${questions.length} agents${c.reset}`);
-
-      const followUp = await research(ctx, store, {
-        questions,
-        parent: session.trunk!,
-        seed: Date.now(),
-        toolsJson, executeTool,
-        maxTurns: MAX_TOOL_TURNS, onToolCall, onToolResult,
-      });
-
-      followUp.agents.forEach((a, i) => {
-        const tree = i === followUp.agents.length - 1 ? '└' : '├';
-        log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${i}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok · ${a.toolCallCount} tools${c.reset}`);
-      });
-      log(`    ${c.dim}${followUp.totalToolCalls} tools · ${followUp.totalTokens} tok${c.reset}`);
-
-      const agentFindings = followUp.agents
+      // ── grounded response ───────────────────────────────
+      const agentFindings = researchResult.agents
         .map((a, i) => a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
         .filter(Boolean)
         .join('\n\n');
 
-      const groundedContent = agentFindings
+      await session.prefillUser(agentFindings
         ? `Research findings:\n${agentFindings}\n\nUser question: ${query}\n\nAnswer based on the research findings above.`
-        : query;
-      await session.prefillUser(groundedContent);
+        : query);
 
+      t = performance.now();
+      let responseTokens = 0;
       process.stdout.write(`  ${c.dim}<${c.reset} `);
       for await (const { text } of session.trunk!) {
         process.stdout.write(text);
+        responseTokens++;
       }
       console.log('\n');
+
+      phases.push({ label: 'Response', tokens: responseTokens, detail: '', timeMs: performance.now() - t });
+    }
+
+    // ─── stats table ──────────────────────────────────────
+    const tEnd = performance.now();
+    const totalTokens = phases.reduce((s, p) => s + p.tokens, 0);
+
+    log(`\n  ${c.dim}${'━'.repeat(58)}${c.reset}`);
+    for (const p of phases) {
+      const left = `${p.label.padEnd(10)} ${pad(p.tokens, 5)} tok`;
+      const detail = p.detail ? `  ${p.detail}` : '';
+      const right = `${pad((p.timeMs / 1000).toFixed(1), 6)}s`;
+      log(`  ${c.dim}${left}${detail}${' '.repeat(Math.max(1, 58 - left.length - detail.length - right.length))}${right}${c.reset}`);
     }
+    log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
+    log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok  ${c.dim}${questions.length} agents · ${researchResult.totalToolCalls} tools${c.reset}         ${c.bold}${pad(((tEnd - t0) / 1000).toFixed(1), 6)}s${c.reset}`);
+    if (kvLine) log(kvLine);
+    const trunkPos = session.trunk ? session.trunk.position : 0;
+    const ctxPct = Math.round(100 * trunkPos / nCtx);
+    const ctxStr = `ctx: ${ctxPct}% (${trunkPos.toLocaleString()}/${nCtx.toLocaleString()})`;
+    log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
+    log(`  ${c.dim}${' '.repeat(58 - ctxStr.length)}${ctxStr}${c.reset}`);
+    log();
   }
 
   // ================================================================
diff --git a/examples/deep-research/display.ts b/examples/deep-research/display.ts
index 54e967a..241108b 100644
--- a/examples/deep-research/display.ts
+++ b/examples/deep-research/display.ts
@@ -9,7 +9,25 @@ export const c = isTTY ? {
   green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
 } : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' };
 
-export const log = (...a: unknown[]): void => { if (!_jsonlMode) console.log(...a); };
+let _statusText = '';  // text currently drawn on the status line; '' means nothing is drawn
+
+export function status(text: string): void {  // draws or overwrites the status line in place, without a trailing newline
+  if (_jsonlMode || !isTTY) return;  // no status line in JSONL mode or when output is not a TTY
+  _statusText = text;
+  process.stdout.write('\r\x1b[K' + text);  // carriage return + ANSI "erase to end of line", then the new text
+}
+
+export function statusClear(): void {  // erases the status line if one is currently drawn
+  if (!_statusText) return;  // nothing drawn (always the case in JSONL / non-TTY mode, where status() never records text)
+  _statusText = '';
+  process.stdout.write('\r\x1b[K');  // return to column 0 and erase the row
+}
+
+export const log = (...a: unknown[]): void => {  // console.log wrapper that is silent in JSONL mode
+  if (_jsonlMode) return;
+  statusClear();  // remove any status line first so the logged text starts on a clean row
+  console.log(...a);
+};
 
 export function emit(event: string, data: Record<string, unknown>): void {
   if (_jsonlMode) console.log(JSON.stringify({ event, ...data }));
diff --git a/examples/deep-research/reranker.ts b/examples/deep-research/reranker.ts
index 525bb1a..3762bc9 100644
--- a/examples/deep-research/reranker.ts
+++ b/examples/deep-research/reranker.ts
@@ -1,100 +1,49 @@
-import { createContext, Branch } from '../../dist/index.js';
+import { Rerank } from '../../dist/index.js';
 import type { Chunk } from './resources/types.js';
-import type { Reranker, ScoredChunk } from './tools/types.js';
-
-const SYSTEM_PROMPT =
-  'Judge whether the Document meets the requirements based on the Query ' +
-  'and the Instruct provided. Note that the answer can only be "yes" or "no".';
-
-const USER_PREFIX =
-  '<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n\n' +
-  '<Query>: ';
+import type { Reranker, ScoredResult } from './tools/types.js';
 
 export async function createReranker(
   modelPath: string,
   opts?: { nSeqMax?: number; nCtx?: number },
 ): Promise<Reranker> {
-  const nCtx = opts?.nCtx ?? 16384;
-  const ctx = await createContext({
-    modelPath,
-    nCtx,
-    nSeqMax: opts?.nSeqMax ?? 3,
-  });
-
-  const [yesId] = await ctx.tokenize('yes', false);
-  const [noId] = await ctx.tokenize('no', false);
-
-  // Probe the chat template once to extract prefix/mid/suffix, then
-  // pre-tokenize segments. Per-chunk scoring concatenates token arrays
-  // synchronously — no per-chunk formatChat calls needed.
-  const SENTINEL_Q = '\x00QUERY\x00';
-  const SENTINEL_D = '\x00DOC\x00';
-  const probe = await ctx.formatChat(JSON.stringify([
-    { role: 'system', content: SYSTEM_PROMPT },
-    { role: 'user', content: `${USER_PREFIX}${SENTINEL_Q}\n\n<Document>: ${SENTINEL_D}` },
-  ]), { addGenerationPrompt: true, enableThinking: false });
-
-  const p = probe.prompt;
-  const qi = p.indexOf(SENTINEL_Q);
-  const di = p.indexOf(SENTINEL_D);
-  const prefixTokens = await ctx.tokenize(p.slice(0, qi), true);
-  const midTokens = await ctx.tokenize(p.slice(qi + SENTINEL_Q.length, di), false);
-  const suffixTokens = await ctx.tokenize(p.slice(di + SENTINEL_D.length), false);
-
-  function rerankScore(logits: Float32Array): number {
-    const max = Math.max(logits[yesId], logits[noId]);
-    const yesExp = Math.exp(logits[yesId] - max);
-    const noExp = Math.exp(logits[noId] - max);
-    return yesExp / (yesExp + noExp);
-  }
-
-  // Serialize access — concurrent Branch.prefill on the same llama_context
-  // races llama_decode.
-  let lock = Promise.resolve();
+  const rerank = await Rerank.create({ modelPath, ...opts });
 
   return {
-    async score(query: string, chunks: Chunk[]): Promise<ScoredChunk[]> {
-      const prev = lock;
-      let release!: () => void;
-      lock = new Promise<void>((r) => { release = r; });
-      await prev;
-      try {
-        const queryTokens = await ctx.tokenize(query, false);
-        const budget = nCtx - prefixTokens.length - queryTokens.length
-                     - midTokens.length - suffixTokens.length;
-        const scored: ScoredChunk[] = [];
-        for (const chunk of chunks) {
-          const docTokens = chunk.tokens.length > budget
-            ? chunk.tokens.slice(0, budget) : chunk.tokens;
-          const tokens = [
-            ...prefixTokens, ...queryTokens,
-            ...midTokens, ...docTokens,
-            ...suffixTokens,
-          ];
-          const branch = Branch.create(ctx, 0, { temperature: 0 });
-          await branch.prefill(tokens);
-          const score = rerankScore(branch.getLogits());
-          await branch.prune();
-          scored.push({
-            file: chunk.resource, heading: chunk.heading,
-            score: Math.round(score * 1000) / 1000,
-            startLine: chunk.startLine, endLine: chunk.endLine,
-          });
-        }
-        return scored.sort((a, b) => b.score - a.score).slice(0, 5);
-      } finally {
-        release();
-      }
+    /**
+     * Stream reranker progress for `query` over `chunks`.
+     * Wraps `rerank.score()` and replaces each result's chunk index with that
+     * chunk's file, heading, score and line range. Written as an async
+     * generator so that a consumer which stops early (break or throw inside
+     * `for await`) has its `return()` forwarded to the underlying iterator;
+     * an iterator exposing only `next()` would leave the inner stream open.
+     * The returned object is single-use: iterate it once.
+     */
+    score(query: string, chunks: Chunk[]): AsyncIterable<ScoredResult> {
+      // rerank.score() is still invoked at call time, not on first iteration.
+      const inner = rerank.score(query, chunks.map(c => c.tokens), 5);
+      return (async function* (): AsyncGenerator<ScoredResult> {
+        for await (const batch of inner) {
+          yield {
+            filled: batch.filled,
+            total: batch.total,
+            results: batch.results.map(r => ({
+              file: chunks[r.index].resource,
+              heading: chunks[r.index].heading,
+              score: r.score,
+              startLine: chunks[r.index].startLine,
+              endLine: chunks[r.index].endLine,
+            })),
+          };
+        }
+      })();
     },
 
     async tokenizeChunks(chunks: Chunk[]): Promise<void> {
       for (const chunk of chunks) {
-        chunk.tokens = await ctx.tokenize(chunk.text, false);
+        chunk.tokens = await rerank.tokenize(chunk.text);  // one chunk at a time; the tokens are stored on the chunk itself (mutates the input)
       }
     },
 
-    dispose(): void {
-      ctx.dispose();
-    },
+    dispose() { rerank.dispose(); },  // forwards to the wrapped Rerank instance's dispose()
   };
 }
diff --git a/examples/deep-research/resources/files.ts b/examples/deep-research/resources/files.ts
index 7200c26..d41cadc 100644
--- a/examples/deep-research/resources/files.ts
+++ b/examples/deep-research/resources/files.ts
@@ -26,10 +26,39 @@ export function loadResources(dir: string): Resource[] {
   }));
 }
 
+/** Split plain text into chunks on blank-line paragraph boundaries; each chunk gets 1-based inclusive start/end lines and an empty token list. */
+function chunkByParagraph(res: Resource): Chunk[] {
+  const lines = res.content.split('\n');
+  const chunks: Chunk[] = [];
+  let start = 0;  // 0-based index of the first line of the paragraph being collected
+  for (let i = 0; i <= lines.length; i++) {  // runs one step past the end so the final paragraph is flushed
+    const blank = i === lines.length || !lines[i].trim();  // end of input counts as a blank line
+    if (blank && i > start) {  // a paragraph of at least one line ends just before index i
+      const text = lines.slice(start, i).join('\n').trim();
+      if (text) {
+        chunks.push({
+          resource: res.name,
+          heading: text.slice(0, 60).replace(/\n/g, ' ') + (text.length > 60 ? '…' : ''),  // first 60 chars, newlines flattened, ellipsis when truncated
+          text, tokens: [],
+          startLine: start + 1,  // convert 0-based index to 1-based line number
+          endLine: i,  // 0-based index i is the blank line, so the last paragraph line is line i (1-based)
+        });
+      }
+    }
+    if (blank) start = i + 1;  // next paragraph can start right after this blank line
+  }
+  return chunks;
+}
+
 export function chunkResources(resources: Resource[]): Chunk[] {
   const out: Chunk[] = [];
   for (const res of resources) {
     const sections = parseMarkdown(res.content);
+    // Single section covering the whole file = no headings found → paragraph split
+    if (sections.length <= 1 && res.content.split('\n').length > 10) {
+      out.push(...chunkByParagraph(res));
+      continue;
+    }
     const lines = res.content.split('\n');
     for (const sec of sections) {
       const text = lines.slice(sec.startLine - 1, sec.endLine).join('\n').trim();
diff --git a/examples/deep-research/tasks/research.md b/examples/deep-research/tasks/research.md
index 659fd41..b1318da 100644
--- a/examples/deep-research/tasks/research.md
+++ b/examples/deep-research/tasks/research.md
@@ -1 +1 @@
-You are a research assistant with access to a knowledge base. Use the search and read_file tools to find information, then call report with your findings. Be thorough: search first, read relevant files, then report.
\ No newline at end of file
+You are a research assistant with access to a knowledge base. You have these tools: search (semantic relevance ranking), grep (regex pattern matching), read_file (read specific line ranges), and report (submit findings). Use search for topical queries, grep for exact patterns, and read_file to inspect context around results. Call report with your findings when done.
\ No newline at end of file
diff --git a/examples/deep-research/tasks/research.ts b/examples/deep-research/tasks/research.ts
index 6bd6e7e..f5319d8 100644
--- a/examples/deep-research/tasks/research.ts
+++ b/examples/deep-research/tasks/research.ts
@@ -9,6 +9,7 @@ const DEFAULT_SYSTEM_PROMPT = fs.readFileSync(path.resolve(__dirname, 'research.
 export { DEFAULT_SYSTEM_PROMPT as RESEARCH_SYSTEM_PROMPT };
 
 export interface AgentResult {
+  agentId: number;
   findings: string | null;
   toolCallCount: number;
   tokenCount: number;
@@ -31,8 +32,11 @@ export async function research(ctx: SessionContext, store: BranchStore, opts: {
   toolsJson: string;
   executeTool: ExecuteToolFn;
   maxTurns?: number;
-  onToolCall?: (agentIndex: number, toolName: string, args: string) => void;
-  onToolResult?: (agentIndex: number, toolName: string, resultStr: string) => void;
+  onProduce?: (agentId: number, text: string, tokenCount: number) => void;
+  onToolCall?: (agentId: number, toolName: string, args: string) => void;
+  onToolResult?: (agentId: number, toolName: string, resultStr: string) => void;
+  onToolProgress?: (agentId: number, toolName: string, progress: { filled: number; total: number }) => void;
+  onReport?: (agentId: number, findings: string) => void;
 }): Promise<ResearchResult> {
   const systemPrompt = opts.systemPrompt ?? DEFAULT_SYSTEM_PROMPT;
 
@@ -89,6 +93,9 @@ export async function research(ctx: SessionContext, store: BranchStore, opts: {
           reasoningFormat: fmt.reasoningFormat,
           thinkingForcedOpen: fmt.thinkingForcedOpen,
           parser: fmt.parser,
+          grammar: fmt.grammar,
+          grammarLazy: fmt.grammarLazy,
+          grammarTriggers: fmt.grammarTriggers,
         },
         rawOutput: '',
         done: false,
@@ -108,8 +115,11 @@ export async function research(ctx: SessionContext, store: BranchStore, opts: {
     store, ctx,
     executeTool: opts.executeTool,
     maxTurns: opts.maxTurns ?? 6,
+    onProduce: opts.onProduce,
     onToolCall: opts.onToolCall,
     onToolResult: opts.onToolResult,
+    onToolProgress: opts.onToolProgress,
+    onReport: opts.onReport,
   });
 
   for (const a of agents) await a.branch.prune();
@@ -117,6 +127,7 @@ export async function research(ctx: SessionContext, store: BranchStore, opts: {
 
   return {
     agents: agents.map((a) => ({
+      agentId: a.agentId,
       findings: a.findings,
       toolCallCount: a.toolCallCount,
       tokenCount: a.tokenCount,
diff --git a/examples/deep-research/tools/grep.ts b/examples/deep-research/tools/grep.ts
new file mode 100644
index 0000000..72cfd68
--- /dev/null
+++ b/examples/deep-research/tools/grep.ts
@@ -0,0 +1,67 @@
+import type { Resource } from '../resources/types.js';
+import type { Tool } from './types.js';
+
+export function createGrepTool(resources: Resource[]): Tool {
+  return {
+    name: 'grep',
+    schema: {
+      type: 'function',
+      function: {
+        name: 'grep',
+        description: 'Search the entire corpus for a regex pattern. Returns every matching line with line numbers and total match count. Complements search() which ranks by relevance — grep scans exhaustively.',
+        parameters: {
+          type: 'object',
+          properties: {
+            pattern: { type: 'string', description: 'Regex pattern (e.g. "\\bshor\\b" for whole-word, "hidden_secret" for literal)' },
+            ignoreCase: { type: 'boolean', description: 'Case-insensitive matching (default: true)' },
+          },
+          required: ['pattern'],
+        },
+      },
+    },
+    async execute(args) {
+      const pattern = (args.pattern as string)?.trim();
+      if (!pattern) return { error: 'pattern must not be empty' };
+      const flags = (args.ignoreCase === false) ? 'g' : 'gi';
+      let re: RegExp;
+      try { re = new RegExp(pattern, flags); }
+      catch { return { error: `Invalid regex: ${pattern}` }; }
+
+      const matches: { file: string; line: number; text: string }[] = [];
+      let totalMatches = 0;
+
+      for (const res of resources) {
+        const lines = res.content.split('\n');
+        for (let i = 0; i < lines.length; i++) {
+          const hits = lines[i].match(re);
+          if (hits) {
+            totalMatches += hits.length;
+            const raw = lines[i].trim();
+            let text: string;
+            if (raw.length <= 200) {
+              text = raw;
+            } else {
+              // Truncate around first match so the matched term is always visible
+              const idx = raw.search(re);
+              const start = Math.max(0, idx - 40);
+              const end = Math.min(raw.length, start + 200);
+              text = (start > 0 ? '…' : '') + raw.slice(start, end) + (end < raw.length ? '…' : '');
+            }
+            matches.push({ file: res.name, line: i + 1, text });
+          }
+        }
+      }
+
+      if (totalMatches === 0) {
+        return {
+          totalMatches: 0, matchingLines: 0, matches: [],
+          note: 'Zero matches does NOT mean the topic is absent — only that this exact pattern was not found. Try search() for semantic matching or a broader/simpler regex.',
+        };
+      }
+
+      const limit = 50;
+      const truncated = matches.length > limit;
+      return { totalMatches, matchingLines: matches.length, truncated, matches: matches.slice(0, limit) };
+    },
+  };
+}
diff --git a/examples/deep-research/tools/index.ts b/examples/deep-research/tools/index.ts
index bd4c829..5d09ec3 100644
--- a/examples/deep-research/tools/index.ts
+++ b/examples/deep-research/tools/index.ts
@@ -2,6 +2,7 @@ import type { Resource, Chunk } from '../resources/types.js';
 import type { Reranker, Tool, ExecuteToolFn } from './types.js';
 import { createSearchTool } from './search.js';
 import { createReadFileTool } from './read-file.js';
+import { createGrepTool } from './grep.js';
 import { createReportTool } from './report.js';
 
 export function createTools(opts: {
@@ -12,16 +13,17 @@ export function createTools(opts: {
   const tools = [
     createSearchTool(opts.chunks, opts.reranker),
     createReadFileTool(opts.resources),
+    createGrepTool(opts.resources),
     createReportTool(),
   ];
 
   const toolsJson = JSON.stringify(tools.map((t) => t.schema));
   const toolMap = new Map(tools.map((t) => [t.name, t]));
 
-  const executeTool: ExecuteToolFn = async (name, args) => {
+  const executeTool: ExecuteToolFn = async (name, args, context?) => {
     const tool = toolMap.get(name);
     if (!tool) return { error: `Unknown tool: ${name}` };
-    return tool.execute(args);
+    return tool.execute(args, context);
   };
 
   return { tools, toolsJson, executeTool };
diff --git a/examples/deep-research/tools/search.ts b/examples/deep-research/tools/search.ts
index fef2fe7..d45788d 100644
--- a/examples/deep-research/tools/search.ts
+++ b/examples/deep-research/tools/search.ts
@@ -16,8 +16,15 @@ export function createSearchTool(chunks: Chunk[], reranker: Reranker): Tool {
         },
       },
     },
-    async execute(args) {
-      return reranker.score((args.query as string) || '', chunks);
+    async execute(args, context?) {
+      const query = (args.query as string)?.trim();
+      if (!query) return { error: 'query must not be empty' };
+      let last;
+      for await (const { results, filled, total } of reranker.score(query, chunks)) {
+        if (context?.onProgress) context.onProgress({ filled, total });
+        last = results;
+      }
+      return last;
     },
   };
 }
diff --git a/examples/deep-research/tools/types.ts b/examples/deep-research/tools/types.ts
index ff52c8d..3ce04dd 100644
--- a/examples/deep-research/tools/types.ts
+++ b/examples/deep-research/tools/types.ts
@@ -8,8 +8,14 @@ export interface ScoredChunk {
   endLine: number;
 }
 
+export interface ScoredResult {
+  results: ScoredChunk[];
+  filled: number;
+  total: number;
+}
+
 export interface Reranker {
-  score(query: string, chunks: Chunk[]): Promise<ScoredChunk[]>;
+  score(query: string, chunks: Chunk[]): AsyncIterable<ScoredResult>;
   tokenizeChunks(chunks: Chunk[]): Promise<void>;
   dispose(): void;
 }
@@ -17,7 +23,14 @@ export interface Reranker {
 export interface Tool {
   name: string;
   schema: object;
-  execute: (args: Record<string, unknown>) => Promise<unknown>;
+  execute: (
+    args: Record<string, unknown>,
+    context?: { onProgress?: (p: { filled: number; total: number }) => void },
+  ) => Promise<unknown>;
 }
 
-export type ExecuteToolFn = (name: string, args: Record<string, unknown>) => Promise<unknown>;
+export type ExecuteToolFn = (
+  name: string,
+  args: Record<string, unknown>,
+  context?: { onProgress?: (p: { filled: number; total: number }) => void },
+) => Promise<unknown>;
diff --git a/liblloyal b/liblloyal
index 388e255..757f595 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit 388e255adad2eda1a4e18c8e25345404fee39573
+Subproject commit 757f595a5e2e952ab0da01f888cdc00c6a757551

From 15dca224c3d449a47a7404c7786b2a4bb99d515b Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Sat, 28 Feb 2026 21:49:38 +1100
Subject: [PATCH 10/17] feat(agents): pilot

---
 examples/deep-research/deep-research.ts   | 497 ----------------------
 examples/deep-research/display.ts         | 293 ++++++++++++-
 examples/deep-research/harness.ts         | 369 ++++++++++++++++
 examples/deep-research/main.ts            | 160 +++++++
 examples/deep-research/tasks/eval.md      |   4 +-
 examples/deep-research/tasks/eval.ts      |  54 ---
 examples/deep-research/tasks/plan.md      |   4 +-
 examples/deep-research/tasks/plan.ts      |  69 ---
 examples/deep-research/tasks/research.ts  | 138 ------
 examples/deep-research/tasks/verify.md    |   4 +-
 examples/deep-research/tasks/verify.ts    |  91 ----
 examples/deep-research/tools/grep.ts      | 112 ++---
 examples/deep-research/tools/index.ts     |  35 +-
 examples/deep-research/tools/read-file.ts |  70 +--
 examples/deep-research/tools/report.ts    |  22 -
 examples/deep-research/tools/search.ts    |  56 +--
 examples/deep-research/tools/types.ts     |  15 -
 package-lock.json                         | 166 +++++++-
 package.json                              |   1 +
 src/Agent.ts                              | 288 -------------
 src/Branch.ts                             |   7 +
 src/Session.ts                            |  47 +-
 src/agents/Tool.ts                        |  76 ++++
 src/agents/agent-pool.ts                  | 442 +++++++++++++++++++
 src/agents/context.ts                     |  37 ++
 src/agents/deltas.ts                      |  63 +++
 src/agents/diverge.ts                     | 133 ++++++
 src/agents/generate.ts                    |  59 +++
 src/agents/index.ts                       |  31 ++
 src/agents/init.ts                        |  78 ++++
 src/agents/run-agents.ts                  |  45 ++
 src/agents/shared-root.ts                 |  82 ++++
 src/agents/toolkit.ts                     |  45 ++
 src/agents/types.ts                       | 312 ++++++++++++++
 src/index.ts                              |  45 +-
 src/types.ts                              | 109 +----
 36 files changed, 2569 insertions(+), 1490 deletions(-)
 delete mode 100644 examples/deep-research/deep-research.ts
 create mode 100644 examples/deep-research/harness.ts
 create mode 100644 examples/deep-research/main.ts
 delete mode 100644 examples/deep-research/tasks/eval.ts
 delete mode 100644 examples/deep-research/tasks/plan.ts
 delete mode 100644 examples/deep-research/tasks/research.ts
 delete mode 100644 examples/deep-research/tasks/verify.ts
 delete mode 100644 examples/deep-research/tools/report.ts
 delete mode 100644 src/Agent.ts
 create mode 100644 src/agents/Tool.ts
 create mode 100644 src/agents/agent-pool.ts
 create mode 100644 src/agents/context.ts
 create mode 100644 src/agents/deltas.ts
 create mode 100644 src/agents/diverge.ts
 create mode 100644 src/agents/generate.ts
 create mode 100644 src/agents/index.ts
 create mode 100644 src/agents/init.ts
 create mode 100644 src/agents/run-agents.ts
 create mode 100644 src/agents/shared-root.ts
 create mode 100644 src/agents/toolkit.ts
 create mode 100644 src/agents/types.ts

diff --git a/examples/deep-research/deep-research.ts b/examples/deep-research/deep-research.ts
deleted file mode 100644
index 715567e..0000000
--- a/examples/deep-research/deep-research.ts
+++ /dev/null
@@ -1,497 +0,0 @@
-#!/usr/bin/env node
-/**
- * Deep Research with Tool-Calling Agents via BranchStore
- *
- * Demonstrates composable fork patterns in a multi-agent research pipeline:
- *
- * - PLAN:     Branch.create() + grammar — constrained single generation
- * - RESEARCH: fork() + prefill() divergent suffixes — parallel tool-calling agents
- * - VERIFY:   fork() + reseed() — stochastic divergence for convergence checking
- * - EVAL:     Branch.create() + grammar — model-as-judge
- *
- * Cold run composes: plan → research → verify → eval
- * Warm follow-up composes: research(parent: trunk) → session.prefillUser → generate
- *
- * Usage:
- *   node deep-research.ts [model-path] --corpus <path> [--query <text>] [options]
- */
-
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import * as readline from 'node:readline';
-import { createContext, BranchStore, Session } from '../../dist/index.js';
-import { c, log, emit, setJsonlMode, status, statusClear, pad, fmtSize } from './display.js';
-import { loadResources, chunkResources } from './resources/files.js';
-import { createReranker } from './reranker.js';
-import { createTools } from './tools/index.js';
-import { plan } from './tasks/plan.js';
-import { research } from './tasks/research.js';
-import { verify } from './tasks/verify.js';
-import { evaluate } from './tasks/eval.js';
-
-// ================================================================
-// CLI ARGS
-// ================================================================
-
-const DEFAULT_MODEL = path.resolve(
-  __dirname,
-  '../../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf'
-);
-const DEFAULT_RERANKER = path.resolve(
-  __dirname,
-  '../../models/qwen3-reranker-0.6b-q4_k_m.gguf'
-);
-
-const args = process.argv.slice(2);
-const jsonlMode = args.includes('--jsonl');
-const verbose = args.includes('--verbose');
-
-function argVal(flag: string): string | null {
-  const i = args.indexOf(flag);
-  return i !== -1 ? args[i + 1] : null;
-}
-const flagIndices = new Set(
-  ['--reranker', '--corpus', '--query'].flatMap((f) => {
-    const i = args.indexOf(f);
-    return i !== -1 ? [i, i + 1] : [];
-  })
-);
-
-const rerankModelPath = argVal('--reranker') || DEFAULT_RERANKER;
-const corpusDir = argVal('--corpus');
-const initialQuery = argVal('--query');
-const modelPath = args.find((a, i) =>
-  !a.startsWith('--') && !flagIndices.has(i)
-) || DEFAULT_MODEL;
-
-if (!corpusDir) {
-  process.stdout.write(
-    `Usage: node deep-research.ts [model-path] --corpus <path> [--query <text>] [--reranker <path>]\n` +
-    `Missing: --corpus\n`
-  );
-  process.exit(1);
-}
-
-if (jsonlMode) setJsonlMode(true);
-
-// Suppress native llama.cpp logs for clean output
-if (!verbose && !jsonlMode) {
-  try {
-    fs.closeSync(2);
-    fs.openSync(process.platform === 'win32' ? '\\\\.\\NUL' : '/dev/null', 'w');
-  } catch { /* non-fatal */ }
-}
-
-const AGENT_COUNT = 3;
-const VERIFY_COUNT = 3;
-const MAX_TOOL_TURNS = 6;
-
-// ================================================================
-// MAIN
-// ================================================================
-
-async function main(): Promise<void> {
-  // Resources
-  const resources = loadResources(corpusDir!);
-  const chunks = chunkResources(resources);
-
-  const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, '');
-  const rerankName = path.basename(rerankModelPath).replace(/-q\w+\.gguf$/i, '');
-  const modelSize = fmtSize(fs.statSync(modelPath).size);
-  const rerankSize = fmtSize(fs.statSync(rerankModelPath).size);
-
-  log();
-  log(`${c.bold}  Deep Research${c.reset} ${c.dim}— BranchStore Tool-Calling Agents${c.reset}`);
-  log();
-
-  log(`  ${c.green}●${c.reset} Loading ${c.bold}${modelName}${c.reset} ${c.dim}(${modelSize}, KV: Q4_0)${c.reset}`);
-
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '16384', 10);
-  const ctx = await createContext({
-    modelPath, nCtx,
-    nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) + 1,
-    typeK: 'q4_0', typeV: 'q4_0',
-  });
-
-  log(`  ${c.green}●${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${rerankSize}, reranker)${c.reset}`);
-
-  const reranker = await createReranker(rerankModelPath, { nSeqMax: 8, nCtx: 4096 });
-  await reranker.tokenizeChunks(chunks);
-
-  const corpusIsFile = resources.length === 1 && fs.statSync(corpusDir!).isFile();
-  const corpusLabel = corpusIsFile
-    ? path.basename(corpusDir!)
-    : `${path.basename(corpusDir!)}/ — ${resources.length} files`;
-  log(`  ${c.dim}  Corpus: ${corpusLabel} → ${chunks.length} chunks${c.reset}`);
-
-  const { toolsJson, executeTool } = createTools({ resources, chunks, reranker });
-  const store = new BranchStore(ctx);
-  const session = new Session({ ctx, store });
-
-  // ── Agent labels + status line ──────────────────────────
-  const agentLabel = new Map<number, string>();
-  let nextLabel = 0;
-  function label(agentId: number): string {
-    let l = agentLabel.get(agentId);
-    if (!l) { l = `A${nextLabel++}`; agentLabel.set(agentId, l); }
-    return l;
-  }
-  const agentText = new Map<number, string>();    // accumulated raw text per agent
-  function resetLabels(): void { nextLabel = 0; agentLabel.clear(); agentStatus.clear(); agentText.clear(); }
-
-  const agentStatus = new Map<number, { state: string; tokenCount: number; detail: string }>();
-
-  function renderStatus(): void {
-    const active = [...agentStatus.entries()].filter(([, s]) => s.state !== 'done');
-    if (active.length === 0) return;
-
-    const generating = active.filter(([, s]) => s.state === 'gen');
-
-    // Single agent generating → stream text on status line (rewritable — clears
-    // when tool call fires or agent finishes)
-    if (generating.length === 1 && active.length === 1) {
-      const [id] = generating[0];
-      const raw = (agentText.get(id) ?? '').replace(/\n/g, ' ').trimStart();
-      const cols = process.stdout.columns || 80;
-      const maxLen = cols - 12;  // "    ◆ A0 " prefix ≈ 9 visible chars + margin
-      const text = raw.length > maxLen ? raw.slice(raw.length - maxLen) : raw;
-      status(`    ${c.dim}\u25c6${c.reset} ${c.yellow}${label(id)}${c.reset} ${text}`);
-      return;
-    }
-
-    // Multi-agent: compact counters
-    const parts = active.map(([id, s]) => {
-      const lbl = `${c.yellow}${label(id)}${c.reset}`;
-      if (s.state === 'gen') return `${lbl}: ${s.tokenCount} tok`;
-      const detail = s.detail ? ` ${s.detail}` : '';
-      return `${lbl}: ${c.cyan}${s.state}${c.reset}${detail}`;
-    });
-    status(`    ${c.dim}\u25c6${c.reset} ${parts.join('  ')}`);
-  }
-
-  // ── Callbacks — shared across cold + warm paths ────────
-  const onProduce = (agentId: number, text: string, tokenCount: number): void => {
-    agentText.set(agentId, (agentText.get(agentId) ?? '') + text);
-    agentStatus.set(agentId, { state: 'gen', tokenCount, detail: '' });
-    renderStatus();
-  };
-
-  const onToolProgress = (agentId: number, toolName: string, p: { filled: number; total: number }): void => {
-    agentStatus.set(agentId, { state: toolName, tokenCount: 0, detail: `${p.filled}/${p.total}` });
-    renderStatus();
-  };
-
-  const onToolCall = (agentId: number, toolName: string, argsStr: string): void => {
-    agentText.delete(agentId);  // this generation led to a parsed tool call — clear
-    agentStatus.set(agentId, { state: toolName, tokenCount: 0, detail: '' });
-    emit('tool_call', { agentId, toolName, arguments: argsStr });
-    let toolArgs: Record<string, string>;
-    try { toolArgs = JSON.parse(argsStr); } catch { toolArgs = {}; }
-    const argSummary = toolName === 'search'
-      ? `"${toolArgs.query || ''}"`
-      : toolName === 'grep'
-      ? `/${toolArgs.pattern || ''}/`
-      : toolName === 'report' ? ''
-      : `${toolArgs.filename}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : '');
-    log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(agentId)}${c.reset} ${c.cyan}${toolName}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
-  };
-
-  const onToolResult = (agentId: number, toolName: string, resultStr: string): void => {
-    emit('tool_result', {
-      agentId, toolName,
-      result: resultStr.length > 200 ? resultStr.slice(0, 200) + '...' : resultStr,
-    });
-    let preview = '';
-    if (toolName === 'read_file') {
-      try {
-        const firstLine = (JSON.parse(resultStr) as { content: string }).content.split('\n').find(l => l.trim());
-        if (firstLine) preview = ` · ${firstLine.trim().slice(0, 60)}${firstLine.trim().length > 60 ? '\u2026' : ''}`;
-      } catch { /* non-fatal */ }
-    } else if (toolName === 'search') {
-      try {
-        const top = (JSON.parse(resultStr) as { heading: string }[])[0];
-        if (top?.heading) preview = ` · ${top.heading}`;
-      } catch { /* non-fatal */ }
-    } else if (toolName === 'grep') {
-      try {
-        const r = JSON.parse(resultStr) as { totalMatches: number; matchingLines: number };
-        preview = ` · ${r.totalMatches} matches in ${r.matchingLines} lines`;
-      } catch { /* non-fatal */ }
-    }
-    log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(agentId)}${c.reset} ${c.dim}\u2190 ${toolName} ${resultStr.length}b${preview}${c.reset}`);
-  };
-
-  const onReport = (agentId: number, findings: string): void => {
-    agentStatus.set(agentId, { state: 'done', tokenCount: 0, detail: '' });
-    const cols = process.stdout.columns || 80;
-    const lbl = `${c.yellow}${label(agentId)}${c.reset}`;
-    const prefix = `    ${c.dim}\u2502${c.reset}   `;
-    // visible width: "    │   " = 8 chars
-    const wrap = cols - 8;
-
-    log(`    ${c.dim}\u2502${c.reset}`);
-    log(`    ${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`);
-
-    // Word-wrap findings, preserve paragraph breaks
-    for (const para of findings.split('\n')) {
-      if (!para.trim()) { log(prefix); continue; }
-      const words = para.split(/\s+/);
-      let line = '';
-      for (const word of words) {
-        if (line && line.length + 1 + word.length > wrap) {
-          log(`${prefix}${c.dim}${line}${c.reset}`);
-          line = word;
-        } else {
-          line = line ? `${line} ${word}` : word;
-        }
-      }
-      if (line) log(`${prefix}${c.dim}${line}${c.reset}`);
-    }
-
-    log(`    ${c.dim}\u2502${c.reset}`);
-  };
-
-  // ================================================================
-  // handleQuery — the orchestrator
-  //
-  // No session yet → cold: plan → research → verify → eval
-  // Session exists → warm: research(parent: trunk) → prefillUser → generate
-  // ================================================================
-
-  async function handleQuery(query: string): Promise<void> {
-    const t0 = performance.now();
-    const warm = !!session.trunk;
-
-    if (!warm) {
-      emit('start', {
-        model: path.basename(modelPath), reranker: path.basename(rerankModelPath),
-        query, agentCount: AGENT_COUNT, verifyCount: VERIFY_COUNT, chunks: chunks.length,
-      });
-      log();
-      log(`  ${c.dim}Query${c.reset}`);
-      log(`  ${c.bold}${query}${c.reset}`);
-    }
-
-    // ─── plan ─────────────────────────────────────────────
-    let t = performance.now();
-    const { questions, tokenCount: planTokens } = await plan(ctx, {
-      query, agentCount: AGENT_COUNT,
-      ...(warm && { parent: session.trunk! }),
-    });
-    const planMs = performance.now() - t;
-
-    if (!warm) emit('plan', { questions, planTokens });
-    log(`\n  ${c.green}●${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${planTokens} tok · ${(planMs / 1000).toFixed(1)}s${c.reset}`);
-    questions.forEach((q, i) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
-
-    // ─── research ─────────────────────────────────────────
-    t = performance.now();
-    log(`\n  ${c.green}●${c.reset} ${c.bold}Research${c.reset} ${c.dim}${questions.length} agents${c.reset}`);
-
-    resetLabels();
-    const researchResult = await research(ctx, store, {
-      questions, toolsJson, executeTool,
-      maxTurns: MAX_TOOL_TURNS,
-      onProduce, onToolCall, onToolResult, onToolProgress, onReport,
-      ...(warm && { parent: session.trunk!, seed: Date.now() }),
-    });
-    statusClear();
-    const researchMs = performance.now() - t;
-
-    researchResult.agents.forEach((a, i) => {
-      const tree = i === researchResult.agents.length - 1 ? '└' : '├';
-      emit('agent_done', { index: i, question: questions[i], findings: (a.findings || '').slice(0, 500), toolCalls: a.toolCallCount, tokenCount: a.tokenCount });
-      // Show remaining accumulated text — unparsed tool calls, reasoning, etc.
-      // (agentText is cleared by onToolCall on successful parse, so only failed-parse text remains)
-      const raw = (agentText.get(a.agentId) ?? '').replace(/\n/g, ' ').trim();
-      if (raw) log(`    ${c.dim}├${c.reset} ${c.yellow}${label(a.agentId)}${c.reset} ${c.dim}▸ ${raw.slice(0, 120)}${raw.length > 120 ? '…' : ''}${c.reset}`);
-      log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${label(a.agentId)}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok · ${a.toolCallCount} tools${c.reset}`);
-    });
-    log(`    ${c.dim}${researchResult.totalTokens} tok · ${researchResult.totalToolCalls} tools · ${(researchMs / 1000).toFixed(1)}s${c.reset}`);
-
-    // ─── post-research: verify+eval (cold) or generate (warm) ─
-    const phases: { label: string; tokens: number; detail: string; timeMs: number }[] = [
-      { label: 'Plan', tokens: planTokens, detail: '', timeMs: planMs },
-      {
-        label: 'Research', tokens: researchResult.totalTokens,
-        detail: `(${researchResult.agents.map(a => a.tokenCount).join(' + ')})  ${pad(researchResult.totalToolCalls, 2)} tools`,
-        timeMs: researchMs,
-      },
-    ];
-    let kvLine: string | null = null;
-
-    if (!warm) {
-      // ── verify ──────────────────────────────────────────
-      t = performance.now();
-      const findingsText = researchResult.agents
-        .map((a, i) => `Q: ${questions[i]}\nA: ${(a.findings || '').trim()}`)
-        .join('\n\n');
-
-      log(`\n  ${c.green}●${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${VERIFY_COUNT} attempts${c.reset}`);
-      const verifyResult = await verify(ctx, store, { findings: findingsText, query, count: VERIFY_COUNT });
-      const verifyMs = performance.now() - t;
-
-      verifyResult.attempts.forEach((a, i) => {
-        const tree = i === verifyResult.attempts.length - 1 ? '└' : '├';
-        emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl });
-        log(`    ${c.dim}${tree} ${a.tokenCount} tok · ppl ${a.ppl.toFixed(2)}${c.reset}`);
-      });
-      log(`    ${c.dim}${verifyResult.totalTokens} tok · ${(verifyMs / 1000).toFixed(1)}s${c.reset}`);
-
-      // ── eval ────────────────────────────────────────────
-      t = performance.now();
-      const { converged, tokenCount: evalTokens } = await evaluate(ctx, { attempts: verifyResult.attempts });
-      const evalMs = performance.now() - t;
-
-      emit('convergence', { converged, evalTokens });
-      const verdict = converged === true ? `${c.green}yes${c.reset}` : converged === false ? `${c.red}no${c.reset}` : `${c.yellow}unknown${c.reset}`;
-      log(`\n  ${c.green}●${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${evalTokens} tok · ${(evalMs / 1000).toFixed(1)}s${c.reset}`);
-      log(`    Converged: ${verdict}`);
-
-      // ── answer ──────────────────────────────────────────
-      log(`\n  ${c.dim}${'─'.repeat(58)}${c.reset}\n`);
-      const prose = verifyResult.bestOutput.trim()
-        .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`)
-        .split('\n').map((l) => `  ${l}`).join('\n');
-      log(prose);
-
-      phases.push(
-        { label: 'Verify', tokens: verifyResult.totalTokens, detail: `(${verifyResult.attempts.map(a => a.tokenCount).join(' + ')})`, timeMs: verifyMs },
-        { label: 'Eval', tokens: evalTokens, detail: `converged: ${converged ? 'yes' : 'no'}`, timeMs: evalMs },
-      );
-
-      const kvSaved = researchResult.sharedPrefixLength * (questions.length - 1)
-        + verifyResult.prefixLength * (verifyResult.attempts.length - 1);
-      kvLine = `  ${c.dim}KV shared    ${researchResult.sharedPrefixLength} × ${questions.length - 1} + ${verifyResult.prefixLength} × ${verifyResult.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved${c.reset}`;
-
-      emit('complete', {
-        planTokens, agentTokens: researchResult.totalTokens,
-        researchSteps: researchResult.steps,
-        verifyTokens: verifyResult.totalTokens, verifySteps: verifyResult.steps,
-        evalTokens, converged,
-        totalToolCalls: researchResult.totalToolCalls,
-        prefixTokens: verifyResult.prefixLength,
-        sharedPrefixTokens: researchResult.sharedPrefixLength,
-        agentCount: questions.length, attemptCount: verifyResult.attempts.length,
-        wallTimeMs: Math.round(performance.now() - t0),
-        planMs: Math.round(planMs), researchMs: Math.round(researchMs),
-        verifyMs: Math.round(verifyMs), evalMs: Math.round(evalMs),
-        ...researchResult.counters,
-      });
-
-      await session.promote(verifyResult.bestBranch);
-    } else {
-      // ── grounded response ───────────────────────────────
-      const agentFindings = researchResult.agents
-        .map((a, i) => a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
-        .filter(Boolean)
-        .join('\n\n');
-
-      await session.prefillUser(agentFindings
-        ? `Research findings:\n${agentFindings}\n\nUser question: ${query}\n\nAnswer based on the research findings above.`
-        : query);
-
-      t = performance.now();
-      let responseTokens = 0;
-      process.stdout.write(`  ${c.dim}<${c.reset} `);
-      for await (const { text } of session.trunk!) {
-        process.stdout.write(text);
-        responseTokens++;
-      }
-      console.log('\n');
-
-      phases.push({ label: 'Response', tokens: responseTokens, detail: '', timeMs: performance.now() - t });
-    }
-
-    // ─── stats table ──────────────────────────────────────
-    const tEnd = performance.now();
-    const totalTokens = phases.reduce((s, p) => s + p.tokens, 0);
-
-    log(`\n  ${c.dim}${'━'.repeat(58)}${c.reset}`);
-    for (const p of phases) {
-      const left = `${p.label.padEnd(10)} ${pad(p.tokens, 5)} tok`;
-      const detail = p.detail ? `  ${p.detail}` : '';
-      const right = `${pad((p.timeMs / 1000).toFixed(1), 6)}s`;
-      log(`  ${c.dim}${left}${detail}${' '.repeat(Math.max(1, 58 - left.length - detail.length - right.length))}${right}${c.reset}`);
-    }
-    log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
-    log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok  ${c.dim}${questions.length} agents · ${researchResult.totalToolCalls} tools${c.reset}         ${c.bold}${pad(((tEnd - t0) / 1000).toFixed(1), 6)}s${c.reset}`);
-    if (kvLine) log(kvLine);
-    const trunkPos = session.trunk ? session.trunk.position : 0;
-    const ctxPct = Math.round(100 * trunkPos / nCtx);
-    const ctxStr = `ctx: ${ctxPct}% (${trunkPos.toLocaleString()}/${nCtx.toLocaleString()})`;
-    log(`  ${c.dim}${'━'.repeat(58)}${c.reset}`);
-    log(`  ${c.dim}${' '.repeat(58 - ctxStr.length)}${ctxStr}${c.reset}`);
-    log();
-  }
-
-  // ================================================================
-  // REPL — single input loop drives both cold and warm paths
-  // ================================================================
-
-  // --query with --jsonl: run cold pipeline, emit results, exit
-  if (jsonlMode && initialQuery) {
-    await handleQuery(initialQuery);
-    await session.dispose();
-    reranker.dispose();
-    ctx.dispose();
-    return;
-  }
-
-  // --query provided interactively: use as first input
-  if (initialQuery) {
-    await handleQuery(initialQuery);
-  }
-
-  log(`  ${c.dim}${session.trunk ? 'Ask a follow-up question' : 'Enter your research question'} or /quit to exit${c.reset}`);
-  log();
-
-  await new Promise<void>((resolve) => {
-    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
-    let exiting = false;
-    let generating = false;
-    let eofWhileGenerating = false;
-
-    async function exit(): Promise<void> {
-      if (exiting) return;
-      exiting = true;
-      rl.close();
-      await session.dispose();
-      reranker.dispose();
-      ctx.dispose();
-      resolve();
-    }
-
-    const ask = (): void => {
-      if (exiting) return;
-      rl.question(`  ${c.dim}>${c.reset} `, handleInput);
-    };
-
-    async function handleInput(input: string): Promise<void> {
-      try {
-        const trimmed = input.trim();
-        if (!trimmed || trimmed === '/quit') { await exit(); return; }
-
-        generating = true;
-        await handleQuery(trimmed);
-        generating = false;
-
-        if (eofWhileGenerating) { await exit(); } else { ask(); }
-      } catch (err) {
-        log(`  ${c.red}Error: ${(err as Error).message}${c.reset}`);
-        generating = false;
-        ask();
-      }
-    }
-
-    rl.on('close', () => {
-      if (generating) { eofWhileGenerating = true; } else { exit(); }
-    });
-    ask();
-  });
-}
-
-main().catch((err: unknown) => {
-  process.stdout.write(`Error: ${(err as Error).message}\n${(err as Error).stack}\n`);
-  process.exit(1);
-});
diff --git a/examples/deep-research/display.ts b/examples/deep-research/display.ts
index 241108b..a28ed43 100644
--- a/examples/deep-research/display.ts
+++ b/examples/deep-research/display.ts
@@ -1,3 +1,11 @@
+import * as fs from 'node:fs';
+import { each } from 'effection';
+import type { Operation, Signal } from 'effection';
+import type { HarnessEvent, PhaseStats } from './harness.js';
+import type { AgentPoolResult } from '../../dist/agents/index.js';
+
+// ── Mode + color ─────────────────────────────────────────────────
+
 let _jsonlMode = false;
 
 export function setJsonlMode(on: boolean): void { _jsonlMode = on; }
@@ -9,6 +17,8 @@ export const c = isTTY ? {
   green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
 } : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' };
 
+// ── Primitives ───────────────────────────────────────────────────
+
 let _statusText = '';
 
 export function status(text: string): void {
@@ -29,7 +39,7 @@ export const log = (...a: unknown[]): void => {
   console.log(...a);
 };
 
-export function emit(event: string, data: Record<string, unknown>): void {
+function emit(event: string, data: Record<string, unknown>): void {
   if (_jsonlMode) console.log(JSON.stringify({ event, ...data }));
 }
 
@@ -38,3 +48,284 @@ export const pad = (s: unknown, n: number): string => String(s).padStart(n);
 export const fmtSize = (bytes: number): string => bytes > 1e9
   ? (bytes / 1e9).toFixed(1) + ' GB'
   : (bytes / 1e6).toFixed(0) + ' MB';
+
+// ── Display subscriber ──────────────────────────────────────────
+// Spawned once in main.ts. Handles both AgentEvent (agent-level,
+// from useAgentPool) and PhaseEvent (harness-level).
+
+export interface DisplayOptions {
+  model: string;
+  reranker: string;
+  agentCount: number;
+  verifyCount: number;
+  chunkCount: number;
+}
+
+export function* displaySubscriber(
+  events: Signal<HarnessEvent, void>,
+  opts: DisplayOptions,
+): Operation<void> {
+  // Agent label tracking — scoped to subscriber lifetime
+  const agentLabel = new Map<number, string>();
+  let nextLabel = 0;
+  const agentText = new Map<number, string>();
+  const agentStatus = new Map<number, { state: string; tokenCount: number; detail: string }>();
+
+  function label(agentId: number): string {
+    let l = agentLabel.get(agentId);
+    if (!l) { l = `A${nextLabel++}`; agentLabel.set(agentId, l); }
+    return l;
+  }
+
+  function resetLabels(): void {
+    nextLabel = 0; agentLabel.clear(); agentStatus.clear(); agentText.clear();
+  }
+
+  function renderStatus(): void {
+    const active = [...agentStatus.entries()].filter(([, s]) => s.state !== 'done');
+    if (active.length === 0) return;
+
+    const generating = active.filter(([, s]) => s.state === 'gen');
+
+    if (generating.length === 1 && active.length === 1) {
+      const [id] = generating[0];
+      const raw = (agentText.get(id) ?? '').replace(/\n/g, ' ').trimStart();
+      const cols = process.stdout.columns || 80;
+      const maxLen = cols - 12;
+      const text = raw.length > maxLen ? raw.slice(raw.length - maxLen) : raw;
+      status(`    ${c.dim}\u25c6${c.reset} ${c.yellow}${label(id)}${c.reset} ${text}`);
+      return;
+    }
+
+    const parts = active.map(([id, s]) => {
+      const lbl = `${c.yellow}${label(id)}${c.reset}`;
+      if (s.state === 'gen') return `${lbl}: ${s.tokenCount} tok`;
+      const detail = s.detail ? ` ${s.detail}` : '';
+      return `${lbl}: ${c.cyan}${s.state}${c.reset}${detail}`;
+    });
+    status(`    ${c.dim}\u25c6${c.reset} ${parts.join('  ')}`);
+  }
+
+  function renderStats(phases: PhaseStats[], kvLine?: string, ctxPct?: number, ctxPos?: number, ctxTotal?: number): void {
+    const totalTokens = phases.reduce((s, p) => s + p.tokens, 0);
+    const totalMs = phases.reduce((s, p) => s + p.timeMs, 0);
+    // Per-phase rows, a totals row, then optional KV / context lines, all laid out against a 58-column rule.
+    log(`\n  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
+    for (const p of phases) {
+      const left = `${p.label.padEnd(10)} ${pad(p.tokens, 5)} tok`;
+      const detail = p.detail ? `  ${p.detail}` : '';
+      const right = p.timeMs > 0 ? `${pad((p.timeMs / 1000).toFixed(1), 6)}s` : '';
+      log(`  ${c.dim}${left}${detail}${' '.repeat(Math.max(1, 58 - left.length - detail.length - right.length))}${right}${c.reset}`);
+    }
+    log(`  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
+    log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok         ${c.bold}${pad((totalMs / 1000).toFixed(1), 6)}s${c.reset}`);
+    if (kvLine) log(`  ${c.dim}${kvLine}${c.reset}`);
+    if (ctxPct != null && ctxPos != null && ctxTotal != null) {
+      const ctxStr = `ctx: ${ctxPct}% (${ctxPos.toLocaleString()}/${ctxTotal.toLocaleString()})`;
+      log(`  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
+      log(`  ${c.dim}${' '.repeat(Math.max(0, 58 - ctxStr.length))}${ctxStr}${c.reset}`);  // clamp: repeat() throws on a negative count
+    }
+    log();
+  }
+
+  // ── Trace persistence ────────────────────────────────────────
+  // Per-token trace data lives on AgentResult.trace (populated by
+  // useAgentPool when trace: true). We just write it to disk here.
+  let traceQuery = '';
+
+  function flushTrace(pool: AgentPoolResult): void {
+    if (!pool.agents.some(a => a.trace?.length)) return;
+    const filename = `trace-${Date.now()}.json`;
+    fs.writeFileSync(filename, JSON.stringify({
+      query: traceQuery,
+      timestamp: new Date().toISOString(),
+      agents: pool.agents.map(a => ({
+        agentId: a.agentId, label: label(a.agentId),
+        ppl: a.ppl, samplingPpl: a.samplingPpl,
+        tokenCount: a.tokenCount, toolCallCount: a.toolCallCount,
+        findings: a.findings, trace: a.trace ?? [],
+      })),
+    }, null, 2));
+    log(`  ${c.dim}Trace written to ${filename}${c.reset}`);
+  }
+
+  for (const ev of yield* each(events)) {
+    switch (ev.type) {
+      // ── Agent-level events (from useAgentPool) ──────────
+      case 'agent:produce': {
+        agentText.set(ev.agentId, (agentText.get(ev.agentId) ?? '') + ev.text);
+        agentStatus.set(ev.agentId, { state: 'gen', tokenCount: ev.tokenCount, detail: '' });
+        renderStatus();
+        break;
+      }
+      case 'agent:tool_call': {
+        agentText.delete(ev.agentId);
+        agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: '' });
+        emit('tool_call', { agentId: ev.agentId, toolName: ev.tool, arguments: ev.args });
+        let toolArgs: Record<string, string>;
+        try { toolArgs = JSON.parse(ev.args) ?? {}; } catch { toolArgs = {}; }  // invalid JSON or a JSON `null` degrades to "no args"
+        const argSummary = ev.tool === 'search'
+          ? `"${toolArgs.query || ''}"`
+          : ev.tool === 'grep'
+          ? `/${toolArgs.pattern || ''}/`
+          : ev.tool === 'report' ? ''
+          : `${toolArgs.filename ?? ''}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : '');
+        log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(ev.agentId)}${c.reset} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+        break;
+      }
+      case 'agent:tool_result': {
+        emit('tool_result', {
+          agentId: ev.agentId, toolName: ev.tool,
+          result: ev.result.length > 200 ? ev.result.slice(0, 200) + '...' : ev.result,
+        });
+        let preview = '';
+        if (ev.tool === 'read_file') {
+          try {
+            const firstLine = (JSON.parse(ev.result) as { content: string }).content.split('\n').find((l: string) => l.trim());
+            if (firstLine) preview = ` \u00b7 ${firstLine.trim().slice(0, 60)}${firstLine.trim().length > 60 ? '\u2026' : ''}`;
+          } catch { /* non-fatal */ }
+        } else if (ev.tool === 'search') {
+          try {
+            const top = (JSON.parse(ev.result) as { heading: string }[])[0];
+            if (top?.heading) preview = ` \u00b7 ${top.heading}`;
+          } catch { /* non-fatal */ }
+        } else if (ev.tool === 'grep') {
+          try {
+            const r = JSON.parse(ev.result) as { totalMatches: number; matchingLines: number };
+            preview = ` \u00b7 ${r.totalMatches} matches in ${r.matchingLines} lines`;
+          } catch { /* non-fatal */ }
+        }
+        log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(ev.agentId)}${c.reset} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`);
+        break;
+      }
+      case 'agent:tool_progress': {
+        agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: `${ev.filled}/${ev.total}` });
+        renderStatus();
+        break;
+      }
+      case 'agent:report': {
+        agentStatus.set(ev.agentId, { state: 'done', tokenCount: 0, detail: '' });
+        const cols = process.stdout.columns || 80;
+        const lbl = `${c.yellow}${label(ev.agentId)}${c.reset}`;
+        const prefix = `    ${c.dim}\u2502${c.reset}   `;
+        const wrap = cols - 8;
+
+        log(`    ${c.dim}\u2502${c.reset}`);
+        log(`    ${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`);
+
+        for (const para of ev.findings.split('\n')) {
+          if (!para.trim()) { log(prefix); continue; }
+          const words = para.split(/\s+/);
+          let line = '';
+          for (const word of words) {
+            if (line && line.length + 1 + word.length > wrap) {
+              log(`${prefix}${c.dim}${line}${c.reset}`);
+              line = word;
+            } else {
+              line = line ? `${line} ${word}` : word;
+            }
+          }
+          if (line) log(`${prefix}${c.dim}${line}${c.reset}`);
+        }
+        log(`    ${c.dim}\u2502${c.reset}`);
+        break;
+      }
+      case 'agent:done': break;
+
+      // ── Phase events (from harness) ─────────────────────
+      case 'query': {
+        traceQuery = ev.query;
+        if (!ev.warm) {
+          emit('start', {
+            model: opts.model, reranker: opts.reranker, query: ev.query,
+            agentCount: opts.agentCount, verifyCount: opts.verifyCount, chunks: opts.chunkCount,
+          });
+          log();
+          log(`  ${c.dim}Query${c.reset}`);
+          log(`  ${c.bold}${ev.query}${c.reset}`);
+        }
+        break;
+      }
+      case 'plan': {
+        emit('plan', { questions: ev.questions, planTokens: ev.tokenCount });
+        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+        ev.questions.forEach((q: string, i: number) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
+        break;
+      }
+      case 'research:start': {
+        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Research${c.reset} ${c.dim}${ev.agentCount} agents${c.reset}`);
+        resetLabels();
+        break;
+      }
+      case 'research:done': {
+        statusClear();
+        ev.pool.agents.forEach((a, i) => {
+          const tree = i === ev.pool.agents.length - 1 ? '\u2514' : '\u251c';
+          emit('agent_done', {
+            index: i, findings: (a.findings || '').slice(0, 500),
+            toolCalls: a.toolCallCount, tokenCount: a.tokenCount,
+            ppl: a.ppl, samplingPpl: a.samplingPpl,
+          });
+          const raw = (agentText.get(a.agentId) ?? '').replace(/\n/g, ' ').trim();
+          if (raw) log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(a.agentId)}${c.reset} ${c.dim}\u25b8 ${raw.slice(0, 120)}${raw.length > 120 ? '\u2026' : ''}${c.reset}`);
+          const pplStr = Number.isFinite(a.ppl) ? ` \u00b7 ppl ${a.ppl.toFixed(2)}` : '';
+          log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${label(a.agentId)}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok \u00b7 ${a.toolCallCount} tools${pplStr}${c.reset}`);
+        });
+        log(`    ${c.dim}${ev.pool.totalTokens} tok \u00b7 ${ev.pool.totalToolCalls} tools \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+        flushTrace(ev.pool);
+        break;
+      }
+      case 'verify:start': {
+        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${ev.count} attempts${c.reset}`);
+        break;
+      }
+      case 'verify:done': {
+        ev.result.attempts.forEach((a, i) => {
+          const tree = i === ev.result.attempts.length - 1 ? '\u2514' : '\u251c';
+          emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl });
+          log(`    ${c.dim}${tree} ${a.tokenCount} tok \u00b7 ppl ${a.ppl.toFixed(2)}${c.reset}`);
+        });
+        log(`    ${c.dim}${ev.result.totalTokens} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+        break;
+      }
+      case 'eval:done': {
+        emit('convergence', { converged: ev.converged, evalTokens: ev.tokenCount });
+        const verdict = ev.converged === true ? `${c.green}yes${c.reset}`
+          : ev.converged === false ? `${c.red}no${c.reset}`
+          : `${c.yellow}unknown${c.reset}`;
+        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+        log(`    Converged: ${verdict}`);
+        break;
+      }
+      case 'answer': {
+        log(`\n  ${c.dim}${'\u2500'.repeat(58)}${c.reset}\n`);
+        const prose = ev.text.trim()
+          .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`)
+          .split('\n').map((l: string) => `  ${l}`).join('\n');
+        log(prose);
+        break;
+      }
+      case 'response:start': {
+        process.stdout.write(`  ${c.dim}<${c.reset} `);
+        break;
+      }
+      case 'response:text': {
+        process.stdout.write(ev.text);
+        break;
+      }
+      case 'response:done': {
+        console.log('\n');
+        break;
+      }
+      case 'stats': {
+        renderStats(ev.phases, ev.kvLine, ev.ctxPct, ev.ctxPos, ev.ctxTotal);
+        break;
+      }
+      case 'complete': {
+        emit('complete', ev.data);
+        break;
+      }
+    }
+    yield* each.next();
+  }
+}
diff --git a/examples/deep-research/harness.ts b/examples/deep-research/harness.ts
new file mode 100644
index 0000000..bab8016
--- /dev/null
+++ b/examples/deep-research/harness.ts
@@ -0,0 +1,369 @@
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import { call } from 'effection';
+import type { Operation, Signal } from 'effection';
+import { Branch, Session } from '../../dist/index.js';
+import type { SessionContext } from '../../dist/index.js';
+import {
+  Ctx,
+  generate, runAgents, diverge, withSharedRoot,
+} from '../../dist/agents/index.js';
+import type { Tool, AgentPoolResult, DivergeResult, AgentEvent } from '../../dist/agents/index.js';
+
+/** Load tasks/<name>.md. System prompt is above the first `---` line, user content below; without one the whole file is the system prompt. CRLF is normalized. */
+function loadTask(name: string): { system: string; user: string } {
+  const raw = fs.readFileSync(path.resolve(__dirname, `tasks/${name}.md`), 'utf8').replace(/\r\n/g, '\n').trim();
+  const sep = raw.indexOf('\n---\n');
+  if (sep === -1) return { system: raw, user: '' };
+  return { system: raw.slice(0, sep).trim(), user: raw.slice(sep + 5).trim() };
+}
+
+const PLAN = loadTask('plan');
+const RESEARCH = loadTask('research');
+const VERIFY = loadTask('verify');
+const EVAL = loadTask('eval');
+
+// ── Harness events ───────────────────────────────────────────────
+// Phase-level events sent by the harness. Display subscribes to these
+// alongside AgentEvent (agent-level events from useAgentPool).
+
+export interface PhaseStats {
+  label: string;
+  tokens: number;
+  detail: string;
+  timeMs: number;
+}
+
+export type PhaseEvent =
+  | { type: 'query'; query: string; warm: boolean }
+  | { type: 'plan'; questions: string[]; tokenCount: number; timeMs: number }
+  | { type: 'research:start'; agentCount: number }
+  | { type: 'research:done'; pool: AgentPoolResult; sharedPrefixLength: number; timeMs: number }
+  | { type: 'verify:start'; count: number }
+  | { type: 'verify:done'; result: DivergeResult; timeMs: number }
+  | { type: 'eval:done'; converged: boolean | null; tokenCount: number; timeMs: number }
+  | { type: 'answer'; text: string }
+  | { type: 'response:start' }
+  | { type: 'response:text'; text: string }
+  | { type: 'response:done'; tokenCount: number; timeMs: number }
+  | { type: 'stats'; phases: PhaseStats[]; kvLine?: string; ctxPct: number; ctxPos: number; ctxTotal: number }
+  | { type: 'complete'; data: Record<string, unknown> };
+
+export type HarnessEvent = AgentEvent | PhaseEvent;
+
+// ── Options ──────────────────────────────────────────────────────
+
+export interface HarnessOptions {
+  session: Session;
+  toolMap: Map<string, Tool>;
+  toolsJson: string;
+  agentCount: number;
+  verifyCount: number;
+  maxTurns: number;
+  nCtx: number;
+  trace: boolean;
+  events: Signal<HarnessEvent, void>;
+}
+
+// ── Plan ─────────────────────────────────────────────────────────
+/** Ask the model for up to `agentCount` sub-questions as grammar-constrained JSON; forks `parent` when given, else generates on a fresh branch. */
+function* planPhase(
+  query: string,
+  agentCount: number,
+  parent?: Branch,
+): Operation<{ questions: string[]; tokenCount: number }> {
+  const ctx: SessionContext = yield* Ctx.expect();
+
+  const schema = {
+    type: 'object',
+    properties: {
+      questions: {
+        type: 'array',
+        items: { type: 'string' },
+        minItems: Math.min(2, agentCount),  // keep minItems <= maxItems when agentCount < 2
+        maxItems: agentCount,
+      },
+    },
+    required: ['questions'],
+  };
+  const grammar: string = yield* call(() => ctx.jsonSchemaToGrammar(JSON.stringify(schema)));
+  // Function replacers: a plain string replacement would expand `$&`, `$1`, `$$` sequences found in the query.
+  const userContent = PLAN.user
+    .replace('{{count}}', () => String(agentCount))
+    .replace('{{query}}', () => query);
+
+  const messages = [
+    { role: 'system', content: PLAN.system },
+    { role: 'user', content: userContent },
+  ];
+  const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages)));
+
+  let output: string;
+  let tokenCount: number;
+
+  if (parent) {
+    // Warm: fork from trunk — planner inherits conversation KV
+    const lead: Branch = yield* call(() => parent.fork());
+    try {
+      lead.setGrammar(grammar);
+      const sep = ctx.getTurnSeparator();
+      const delta: number[] = yield* call(() => ctx.tokenize(prompt, false));
+      yield* call(() => lead.prefill([...sep, ...delta]));
+
+      ({ output, tokenCount } = yield* call(async () => {
+        let o = '';
+        let tc = 0;
+        for await (const { text } of lead) { o += text; tc++; }
+        return { output: o, tokenCount: tc };
+      }));
+    } finally {
+      if (!lead.disposed) yield* call(() => lead.prune());
+    }
+  } else {
+    // Cold: fresh branch via generate()
+    const result = yield* generate({ prompt, grammar, params: { temperature: 0.3 } });
+    output = result.output;
+    tokenCount = result.tokenCount;
+  }
+
+  let questions: string[];
+  try {
+    questions = JSON.parse(output).questions.slice(0, agentCount);
+    if (!questions.length) throw new Error('empty');
+  } catch {
+    questions = Array.from({ length: agentCount }, (_, i) => `${query} (aspect ${i + 1})`);
+  }
+
+  return { questions, tokenCount };
+}
+
+// ── Verify ───────────────────────────────────────────────────────
+/** Fill the VERIFY template with the findings and query, then run `opts.count` divergent attempts on the same prompt via diverge(). */
+function* verifyPhase(opts: {
+  findings: string;
+  query: string;
+  count: number;
+}): Operation<DivergeResult> {
+  const ctx: SessionContext = yield* Ctx.expect();
+
+  // Single pass + function replacer: `$` sequences and placeholder-like text inside the inserted values stay literal.
+  const userContent = VERIFY.user.replace(/\{\{(findings|query)\}\}/g,
+    (_m: string, key: 'findings' | 'query') => opts[key]);
+
+  const messages = [
+    { role: 'system', content: VERIFY.system },
+    { role: 'user', content: userContent },
+  ];
+  const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages)));
+
+  return yield* diverge({
+    prompt,
+    attempts: opts.count,
+    params: { temperature: 0.7 },
+  });
+}
+
+// ── Eval ─────────────────────────────────────────────────────────
+/** Ask the model (temperature 0, JSON grammar) whether the attempts agree; `converged` is null when the output cannot be parsed. */
+function* evalPhase(
+  attempts: { output: string }[],
+): Operation<{ converged: boolean | null; tokenCount: number }> {
+  const ctx: SessionContext = yield* Ctx.expect();
+
+  const responsesText = attempts
+    .map((a, i) => `Response ${i + 1}: ${a.output.trim()}`)
+    .join('\n\n');
+  // Function replacer: model output may contain `$&` / `$1` / `$$`, which a string replacement would expand.
+  const userContent = EVAL.user.replace('{{responses}}', () => responsesText);
+
+  const messages = [
+    { role: 'system', content: EVAL.system },
+    { role: 'user', content: userContent },
+  ];
+
+  const evalSchema = {
+    type: 'object',
+    properties: { converged: { type: 'boolean' } },
+    required: ['converged'],
+  };
+  const grammar: string = yield* call(() => ctx.jsonSchemaToGrammar(JSON.stringify(evalSchema)));
+  const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages)));
+
+  const result = yield* generate({
+    prompt,
+    grammar,
+    params: { temperature: 0 },
+    parse: (output: string) => {
+      try { return JSON.parse(output).converged as boolean; }
+      catch { return null; }
+    },
+  });
+
+  return { converged: result.parsed as boolean | null, tokenCount: result.tokenCount };
+}
+
+// ── handleQuery — the orchestrator ───────────────────────────────
+// Composes phases, sends HarnessEvent for display, touches no log().
+// Cold (no session.trunk): plan → research → verify → eval → answer → promote. Warm: plan → research → streamed reply from the trunk.
+export function* handleQuery(query: string, opts: HarnessOptions): Operation<void> {
+  const { session, toolMap, toolsJson, agentCount, verifyCount, maxTurns, nCtx, trace, events } = opts;
+  const warm = !!session.trunk;
+  const t0 = performance.now();
+
+  events.send({ type: 'query', query, warm });
+
+  // ── Plan
+  let t = performance.now();
+  const { questions, tokenCount: planTokens } = yield* planPhase(
+    query, agentCount, warm ? session.trunk! : undefined,
+  );
+  const planMs = performance.now() - t;
+  events.send({ type: 'plan', questions, tokenCount: planTokens, timeMs: planMs });
+
+  // ── Research
+  events.send({ type: 'research:start', agentCount: questions.length });
+  t = performance.now();
+
+  let pool: AgentPoolResult;
+  let sharedPrefixLength: number;
+
+  const agentTasks = (parent: Branch, seed?: number) => questions.map((q, i) => ({
+    systemPrompt: RESEARCH.system,
+    content: q,
+    tools: toolsJson,
+    parent,
+    seed: seed != null ? seed + i : undefined,
+  }));
+
+  if (!warm) {
+    // Cold: withSharedRoot handles root create → prefill → cleanup
+    const { result, prefixLen } = yield* withSharedRoot(
+      { systemPrompt: RESEARCH.system, tools: toolsJson },
+      function*(root, prefixLen) {
+        const result = yield* runAgents({ tasks: agentTasks(root), tools: toolMap, maxTurns, trace });
+        return { result, prefixLen };
+      },
+    );
+    pool = result;
+    sharedPrefixLength = prefixLen;
+  } else {
+    // Warm: fork from conversation trunk
+    pool = yield* runAgents({
+      tasks: agentTasks(session.trunk!, Date.now()),
+      tools: toolMap,
+      maxTurns, trace,
+    });
+    sharedPrefixLength = 0;
+  }
+
+  const researchMs = performance.now() - t;
+  events.send({ type: 'research:done', pool, sharedPrefixLength, timeMs: researchMs });
+
+  // ── Post-research diverges based on cold/warm
+  const phases: PhaseStats[] = [
+    { label: 'Plan', tokens: planTokens, detail: '', timeMs: planMs },
+    {
+      label: 'Research', tokens: pool.totalTokens,
+      detail: `(${pool.agents.map(a => a.tokenCount).join(' + ')})  ${pool.totalToolCalls} tools`,
+      timeMs: researchMs,
+    },
+  ];
+
+  if (!warm) {
+    // ── Verify
+    const findingsText = pool.agents
+      .map((a, i) => `Q: ${questions[i]}\nA: ${(a.findings || '').trim()}`)
+      .join('\n\n');
+
+    events.send({ type: 'verify:start', count: verifyCount });
+    t = performance.now();
+    const verifyResult = yield* verifyPhase({ findings: findingsText, query, count: verifyCount });
+    const verifyMs = performance.now() - t;
+    events.send({ type: 'verify:done', result: verifyResult, timeMs: verifyMs });
+
+    // ── Eval
+    t = performance.now();
+    const { converged, tokenCount: evalTokens } = yield* evalPhase(verifyResult.attempts);
+    const evalMs = performance.now() - t;
+    events.send({ type: 'eval:done', converged, tokenCount: evalTokens, timeMs: evalMs });
+
+    // ── Answer
+    events.send({ type: 'answer', text: verifyResult.bestOutput });
+
+    phases.push(
+      {
+        label: 'Verify', tokens: verifyResult.totalTokens,
+        detail: `(${verifyResult.attempts.map(a => a.tokenCount).join(' + ')})`,
+        timeMs: verifyMs,
+      },
+      { label: 'Eval', tokens: evalTokens, detail: `converged: ${converged == null ? 'unknown' : converged ? 'yes' : 'no'}`, timeMs: evalMs },
+    );
+
+    yield* call(() => session.promote(verifyResult.best));
+
+    const kvSaved = sharedPrefixLength * (questions.length - 1)
+      + verifyResult.prefixLength * (verifyResult.attempts.length - 1);
+
+    events.send({
+      type: 'stats', phases,
+      kvLine: `KV shared    ${sharedPrefixLength} \u00d7 ${questions.length - 1} + ${verifyResult.prefixLength} \u00d7 ${verifyResult.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved`,
+      ctxPct: Math.round(100 * (session.trunk?.position ?? 0) / nCtx),
+      ctxPos: session.trunk?.position ?? 0,
+      ctxTotal: nCtx,
+    });
+
+    events.send({
+      type: 'complete',
+      data: {
+        planTokens,
+        agentTokens: pool.totalTokens, researchSteps: pool.steps,
+        agentPpl: pool.agents.map(a => a.ppl),
+        verifyTokens: verifyResult.totalTokens, verifySteps: verifyResult.steps,
+        evalTokens, converged,
+        totalToolCalls: pool.totalToolCalls,
+        prefixTokens: verifyResult.prefixLength,
+        sharedPrefixTokens: sharedPrefixLength,
+        agentCount: questions.length, attemptCount: verifyResult.attempts.length,
+        wallTimeMs: Math.round(performance.now() - t0),
+        planMs: Math.round(planMs), researchMs: Math.round(researchMs),
+        verifyMs: Math.round(verifyMs), evalMs: Math.round(evalMs),
+        ...pool.counters,
+      },
+    });
+
+  } else {
+    // ── Grounded response from trunk
+    const agentFindings = pool.agents
+      .map((a: { findings: string | null }, i: number) =>
+        a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
+      .filter(Boolean)
+      .join('\n\n');
+
+    yield* call(() => session.prefillUser(agentFindings
+      ? `Research findings:\n${agentFindings}\n\nUser question: ${query}\n\nAnswer based on the research findings above.`
+      : query));
+
+    events.send({ type: 'response:start' });
+    t = performance.now();
+    let responseTokens = 0;
+    const trunk = session.trunk!;
+    for (;;) {
+      const { token, text, isStop } = trunk.produceSync();
+      if (isStop) break;
+      yield* call(() => trunk.commit(token));
+      responseTokens++;
+      events.send({ type: 'response:text', text } as HarnessEvent);
+    }
+    const responseMs = performance.now() - t;
+    events.send({ type: 'response:done', tokenCount: responseTokens, timeMs: responseMs });
+
+    phases.push({ label: 'Response', tokens: responseTokens, detail: '', timeMs: responseMs });
+
+    events.send({
+      type: 'stats', phases,
+      ctxPct: Math.round(100 * (session.trunk?.position ?? 0) / nCtx),
+      ctxPos: session.trunk?.position ?? 0,
+      ctxTotal: nCtx,
+    });
+  }
+}
diff --git a/examples/deep-research/main.ts b/examples/deep-research/main.ts
new file mode 100644
index 0000000..5350ca3
--- /dev/null
+++ b/examples/deep-research/main.ts
@@ -0,0 +1,160 @@
+#!/usr/bin/env node
+/**
+ * Deep Research — CLI entry point
+ *
+ * Wiring only: setup, display subscriber, signal-based REPL.
+ * Orchestration lives in harness.ts. Rendering lives in display.ts.
+ *
+ * Usage:
+ *   npx tsx examples/deep-research/main.ts [model-path] --corpus <path> [--query <text>] [options]
+ */
+
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import * as readline from 'node:readline';
+import { main, ensure, createSignal, spawn, each, call, action } from 'effection';
+import { createContext } from '../../dist/index.js';
+import type { SessionContext } from '../../dist/index.js';
+import { initAgents } from '../../dist/agents/index.js';
+import { c, log, setJsonlMode, fmtSize } from './display.js';
+import { displaySubscriber } from './display.js';
+import { loadResources, chunkResources } from './resources/files.js';
+import { createReranker } from './reranker.js';
+import { createTools } from './tools/index.js';
+import { handleQuery } from './harness.js';
+import type { HarnessEvent, HarnessOptions } from './harness.js';
+
+// ── CLI args ─────────────────────────────────────────────────────
+
+const DEFAULT_MODEL = path.resolve(__dirname, '../../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf');
+const DEFAULT_RERANKER = path.resolve(__dirname, '../../models/qwen3-reranker-0.6b-q4_k_m.gguf');
+
+const args = process.argv.slice(2);
+const jsonlMode = args.includes('--jsonl');
+const verbose = args.includes('--verbose');
+const trace = args.includes('--trace');
+
+function argVal(flag: string): string | null {  // value following `flag` in argv, or null when the flag or its value is absent
+  const i = args.indexOf(flag);
+  return i !== -1 ? (args[i + 1] ?? null) : null;  // a flag given as the last argument has no value
+}
+const flagIndices = new Set(
+  ['--reranker', '--corpus', '--query'].flatMap((f) => {
+    const i = args.indexOf(f);
+    return i !== -1 ? [i, i + 1] : [];
+  }),
+);
+
+const rerankModelPath = argVal('--reranker') || DEFAULT_RERANKER;
+const corpusDir = argVal('--corpus');
+const initialQuery = argVal('--query');
+const modelPath = args.find((a, i) => !a.startsWith('--') && !flagIndices.has(i)) || DEFAULT_MODEL;
+
+if (!corpusDir) {
+  process.stdout.write(
+    `Usage: npx tsx examples/deep-research/main.ts [model-path] --corpus <path> [--query <text>] [--reranker <path>]\nMissing: --corpus\n`,
+  );
+  process.exit(1);
+}
+
+if (jsonlMode) setJsonlMode(true);
+if (!verbose && !jsonlMode) {
+  try { fs.closeSync(2); fs.openSync(process.platform === 'win32' ? '\\\\.\\NUL' : '/dev/null', 'w'); } catch { /* non-fatal */ }
+}
+
+const AGENT_COUNT = 3;
+const VERIFY_COUNT = 3;
+const MAX_TOOL_TURNS = 6;
+
+// ── Main ─────────────────────────────────────────────────────────
+
+main(function*() {
+  const resources = loadResources(corpusDir!);
+  const chunks = chunkResources(resources);
+
+  const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, '');
+  const rerankName = path.basename(rerankModelPath).replace(/-q\w+\.gguf$/i, '');
+
+  log();
+  log(`${c.bold}  Deep Research${c.reset} ${c.dim}\u2014 Structured Concurrency Runtime${c.reset}`);
+  log();
+  log(`  ${c.green}\u25cf${c.reset} Loading ${c.bold}${modelName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(modelPath).size)}, KV: Q4_0)${c.reset}`);
+
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '16384', 10);
+  const ctx: SessionContext = yield* call(() => createContext({
+    modelPath, nCtx,
+    nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) + 1,
+    typeK: 'q4_0', typeV: 'q4_0',
+  }));
+
+  log(`  ${c.green}\u25cf${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(rerankModelPath).size)}, reranker)${c.reset}`);
+
+  const reranker = yield* call(() => createReranker(rerankModelPath, { nSeqMax: 8, nCtx: 4096 }));
+  yield* ensure(() => { reranker.dispose(); });
+  yield* call(() => reranker.tokenizeChunks(chunks));
+
+  const corpusIsFile = resources.length === 1 && fs.statSync(corpusDir!).isFile();
+  const corpusLabel = corpusIsFile
+    ? path.basename(corpusDir!)
+    : `${path.basename(corpusDir!)}/ \u2014 ${resources.length} files`;
+  log(`  ${c.dim}  Corpus: ${corpusLabel} \u2192 ${chunks.length} chunks${c.reset}`);
+
+  const { toolMap, toolsJson } = createTools({ resources, chunks, reranker });
+  const { session, events } = yield* initAgents<HarnessEvent>(ctx);
+
+  // Display subscriber — all rendering lives here
+  yield* spawn(function*() {
+    yield* displaySubscriber(events, {
+      model: path.basename(modelPath),
+      reranker: path.basename(rerankModelPath),
+      agentCount: AGENT_COUNT,
+      verifyCount: VERIFY_COUNT,
+      chunkCount: chunks.length,
+    });
+  });
+
+  const harnessOpts: HarnessOptions = {
+    session, toolMap, toolsJson, events,
+    agentCount: AGENT_COUNT, verifyCount: VERIFY_COUNT,
+    maxTurns: MAX_TOOL_TURNS, nCtx, trace,
+  };
+
+  // Initial query
+  if (initialQuery) {
+    yield* handleQuery(initialQuery, harnessOpts);
+    if (jsonlMode) return;  // scope exit triggers initAgents + ensure cleanup
+  }
+
+  // REPL — signal bridges readline into Effection scope
+  log(`  ${c.dim}${session.trunk ? 'Ask a follow-up question' : 'Enter your research question'} or /quit to exit${c.reset}`);
+  log();
+
+  const inputSignal = createSignal<string, void>();
+  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
+  rl.setPrompt(`  ${c.dim}>${c.reset} `);
+
+  yield* spawn(function*() {
+    yield* action<void>((resolve) => {
+      rl.on('line', (line: string) => inputSignal.send(line.trim()));
+      rl.on('close', () => { inputSignal.close(); resolve(); });
+      return () => rl.close();
+    });
+  });
+
+  rl.prompt();
+  for (const input of yield* each(inputSignal)) {
+    if (!input || input === '/quit') break;
+    try {
+      yield* handleQuery(input, harnessOpts);
+    } catch (err) {
+      log(`  ${c.red}Error: ${(err as Error).message}${c.reset}`);
+    }
+    yield* each.next();
+    try { rl.prompt(); } catch { break; }
+  }
+
+  // scope exit triggers initAgents + ensure cleanup
+}).catch((err: unknown) => {
+  process.stdout.write(`Error: ${(err as Error).message}\n${(err as Error).stack}\n`);
+  process.exit(1);
+});
diff --git a/examples/deep-research/tasks/eval.md b/examples/deep-research/tasks/eval.md
index c408002..d555374 100644
--- a/examples/deep-research/tasks/eval.md
+++ b/examples/deep-research/tasks/eval.md
@@ -1,3 +1,5 @@
+You are a consistency checker. Compare the responses and determine if they convey the same core meaning. Output JSON only.
+---
 Do these responses agree on the key points?
 
-{{responses}}
\ No newline at end of file
+{{responses}}
diff --git a/examples/deep-research/tasks/eval.ts b/examples/deep-research/tasks/eval.ts
deleted file mode 100644
index 9eb3079..0000000
--- a/examples/deep-research/tasks/eval.ts
+++ /dev/null
@@ -1,54 +0,0 @@
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import { Branch } from '../../../dist/index.js';
-import type { SessionContext } from '../../../dist/index.js';
-
-const EVAL_PROMPT = fs.readFileSync(path.resolve(__dirname, 'eval.md'), 'utf8');
-
-export async function evaluate(ctx: SessionContext, opts: {
-  attempts: { output: string }[];
-}): Promise<{ converged: boolean | null; tokenCount: number }> {
-  const responsesText = opts.attempts
-    .map((a, i) => `Response ${i + 1}: ${a.output.trim()}`)
-    .join('\n\n');
-
-  const userContent = EVAL_PROMPT.replace('{{responses}}', responsesText);
-
-  const messages = [
-    {
-      role: 'system',
-      content: 'You are a consistency checker. Compare the responses and determine if they convey the same core meaning. Output JSON only.',
-    },
-    { role: 'user', content: userContent },
-  ];
-
-  const evalSchema = {
-    type: 'object',
-    properties: { converged: { type: 'boolean' } },
-    required: ['converged'],
-  };
-  const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(evalSchema));
-
-  const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-  const tokens = await ctx.tokenize(prompt);
-
-  const branch = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar);
-  await branch.prefill(tokens);
-
-  let output = '';
-  let tokenCount = 0;
-  for await (const { text } of branch) {
-    output += text;
-    tokenCount++;
-  }
-  await branch.prune();
-
-  let converged: boolean | null;
-  try {
-    converged = JSON.parse(output).converged;
-  } catch {
-    converged = null;
-  }
-
-  return { converged, tokenCount };
-}
diff --git a/examples/deep-research/tasks/plan.md b/examples/deep-research/tasks/plan.md
index 8f94bd7..05bba9a 100644
--- a/examples/deep-research/tasks/plan.md
+++ b/examples/deep-research/tasks/plan.md
@@ -1 +1,3 @@
-Break this into {{count}} independent sub-questions for parallel research: "{{query}}"
\ No newline at end of file
+You break research queries into sub-questions. Output JSON only.
+---
+Break this into {{count}} independent sub-questions for parallel research: "{{query}}"
diff --git a/examples/deep-research/tasks/plan.ts b/examples/deep-research/tasks/plan.ts
deleted file mode 100644
index feb7949..0000000
--- a/examples/deep-research/tasks/plan.ts
+++ /dev/null
@@ -1,69 +0,0 @@
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import { Branch } from '../../../dist/index.js';
-import type { SessionContext } from '../../../dist/index.js';
-
-const PLAN_PROMPT = fs.readFileSync(path.resolve(__dirname, 'plan.md'), 'utf8');
-
-export async function plan(ctx: SessionContext, opts: {
-  query: string;
-  agentCount: number;
-  parent?: InstanceType<typeof Branch>;
-}): Promise<{ questions: string[]; tokenCount: number }> {
-  const schema = {
-    type: 'object',
-    properties: {
-      questions: {
-        type: 'array',
-        items: { type: 'string' },
-        minItems: 2,
-        maxItems: opts.agentCount,
-      },
-    },
-    required: ['questions'],
-  };
-  const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
-
-  const userContent = PLAN_PROMPT
-    .replace('{{count}}', String(opts.agentCount))
-    .replace('{{query}}', opts.query);
-
-  const messages = [
-    { role: 'system', content: 'You break research queries into sub-questions. Output JSON only.' },
-    { role: 'user', content: userContent },
-  ];
-  const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-
-  let lead: InstanceType<typeof Branch>;
-  if (opts.parent) {
-    // Warm: fork from trunk — planner inherits conversation KV
-    lead = await opts.parent.fork();
-    lead.setGrammar(grammar);
-    const sep = ctx.getTurnSeparator();
-    const delta = await ctx.tokenize(prompt, false);
-    await lead.prefill([...sep, ...delta]);
-  } else {
-    // Cold: fresh branch at position 0
-    const tokens = await ctx.tokenize(prompt);
-    lead = Branch.create(ctx, 0, { temperature: 0.3 }, undefined, grammar);
-    await lead.prefill(tokens);
-  }
-
-  let output = '';
-  let tokenCount = 0;
-  for await (const { text } of lead) {
-    output += text;
-    tokenCount++;
-  }
-  await lead.prune();
-
-  let questions: string[];
-  try {
-    questions = JSON.parse(output).questions.slice(0, opts.agentCount);
-    if (!questions.length) throw new Error('empty questions');
-  } catch {
-    questions = Array.from({ length: opts.agentCount }, (_, i) => `${opts.query} (aspect ${i + 1})`);
-  }
-
-  return { questions, tokenCount };
-}
diff --git a/examples/deep-research/tasks/research.ts b/examples/deep-research/tasks/research.ts
deleted file mode 100644
index f5319d8..0000000
--- a/examples/deep-research/tasks/research.ts
+++ /dev/null
@@ -1,138 +0,0 @@
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import { Branch, BranchStore, forkAgent, runAgents } from '../../../dist/index.js';
-import type { SessionContext, AgentState } from '../../../dist/index.js';
-import type { ExecuteToolFn } from '../tools/types.js';
-
-const DEFAULT_SYSTEM_PROMPT = fs.readFileSync(path.resolve(__dirname, 'research.md'), 'utf8');
-
-export { DEFAULT_SYSTEM_PROMPT as RESEARCH_SYSTEM_PROMPT };
-
-export interface AgentResult {
-  agentId: number;
-  findings: string | null;
-  toolCallCount: number;
-  tokenCount: number;
-}
-
-export interface ResearchResult {
-  agents: AgentResult[];
-  totalTokens: number;
-  totalToolCalls: number;
-  steps: number;
-  counters: Record<string, number>;
-  sharedPrefixLength: number;
-}
-
-export async function research(ctx: SessionContext, store: BranchStore, opts: {
-  questions: string[];
-  parent?: InstanceType<typeof Branch>;
-  seed?: number;
-  systemPrompt?: string;
-  toolsJson: string;
-  executeTool: ExecuteToolFn;
-  maxTurns?: number;
-  onProduce?: (agentId: number, text: string, tokenCount: number) => void;
-  onToolCall?: (agentId: number, toolName: string, args: string) => void;
-  onToolResult?: (agentId: number, toolName: string, resultStr: string) => void;
-  onToolProgress?: (agentId: number, toolName: string, progress: { filled: number; total: number }) => void;
-  onReport?: (agentId: number, findings: string) => void;
-}): Promise<ResearchResult> {
-  const systemPrompt = opts.systemPrompt ?? DEFAULT_SYSTEM_PROMPT;
-
-  let agents: AgentState[];
-  let sharedPrefixLength: number;
-  let root: InstanceType<typeof Branch> | null;
-
-  if (opts.parent) {
-    // Warm: fork from conversation trunk — each agent inherits full KV,
-    // gets a fresh system prompt + question injected as suffix.
-    // Diversity via reseeded sampler, not divergent content.
-    agents = await Promise.all(
-      opts.questions.map((q, i) =>
-        forkAgent(opts.parent!, {
-          systemPrompt,
-          content: q,
-          tools: opts.toolsJson,
-          seed: opts.seed != null ? opts.seed + i : undefined,
-        }, ctx)
-      )
-    );
-    sharedPrefixLength = 0;
-    root = null;
-  } else {
-    // Cold: shared-prefix optimization — one root with system prompt,
-    // fork N agents with divergent user-question suffixes.
-    const sharedMessages = [{ role: 'system', content: systemPrompt }];
-    const sharedFmt = await ctx.formatChat(
-      JSON.stringify(sharedMessages),
-      { tools: opts.toolsJson, addGenerationPrompt: false },
-    );
-    const sharedTokens = await ctx.tokenize(sharedFmt.prompt);
-
-    root = Branch.create(ctx, 0, { temperature: 0.5 });
-    await root.prefill(sharedTokens);
-
-    agents = [];
-    for (const q of opts.questions) {
-      const branch = await root.fork();
-      const fullMessages = [
-        { role: 'system', content: systemPrompt },
-        { role: 'user', content: q },
-      ];
-      const fmt = await ctx.formatChat(JSON.stringify(fullMessages), { tools: opts.toolsJson });
-      const fullTokens = await ctx.tokenize(fmt.prompt);
-      const suffixTokens = fullTokens.slice(sharedTokens.length);
-
-      agents.push({
-        agentId: branch.handle,
-        branch,
-        suffixTokens,
-        fmt: {
-          format: fmt.format,
-          reasoningFormat: fmt.reasoningFormat,
-          thinkingForcedOpen: fmt.thinkingForcedOpen,
-          parser: fmt.parser,
-          grammar: fmt.grammar,
-          grammarLazy: fmt.grammarLazy,
-          grammarTriggers: fmt.grammarTriggers,
-        },
-        rawOutput: '',
-        done: false,
-        tokenCount: 0,
-        toolCallCount: 0,
-        turns: 0,
-        findings: null,
-      });
-    }
-    sharedPrefixLength = sharedTokens.length;
-  }
-
-  // Common path: batch prefill + agentic loop + prune
-  await store.prefill(agents.map((a) => [a.branch, a.suffixTokens]));
-
-  const result = await runAgents(agents, {
-    store, ctx,
-    executeTool: opts.executeTool,
-    maxTurns: opts.maxTurns ?? 6,
-    onProduce: opts.onProduce,
-    onToolCall: opts.onToolCall,
-    onToolResult: opts.onToolResult,
-    onToolProgress: opts.onToolProgress,
-    onReport: opts.onReport,
-  });
-
-  for (const a of agents) await a.branch.prune();
-  if (root) await root.prune();
-
-  return {
-    agents: agents.map((a) => ({
-      agentId: a.agentId,
-      findings: a.findings,
-      toolCallCount: a.toolCallCount,
-      tokenCount: a.tokenCount,
-    })),
-    ...result,
-    sharedPrefixLength,
-  };
-}
diff --git a/examples/deep-research/tasks/verify.md b/examples/deep-research/tasks/verify.md
index 27e0c97..0713358 100644
--- a/examples/deep-research/tasks/verify.md
+++ b/examples/deep-research/tasks/verify.md
@@ -1,5 +1,7 @@
+Synthesize the research findings into a coherent, concise summary.
+---
 Research findings:
 
 {{findings}}
 
-Synthesize these into a brief summary answering: "{{query}}"
\ No newline at end of file
+Synthesize these into a brief summary answering: "{{query}}"
diff --git a/examples/deep-research/tasks/verify.ts b/examples/deep-research/tasks/verify.ts
deleted file mode 100644
index 4beb9a9..0000000
--- a/examples/deep-research/tasks/verify.ts
+++ /dev/null
@@ -1,91 +0,0 @@
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import { Branch, BranchStore } from '../../../dist/index.js';
-import type { SessionContext } from '../../../dist/index.js';
-
-const VERIFY_PROMPT = fs.readFileSync(path.resolve(__dirname, 'verify.md'), 'utf8');
-
-export interface Attempt {
-  output: string;
-  tokenCount: number;
-  ppl: number;
-}
-
-export interface VerifyResult {
-  attempts: Attempt[];
-  bestOutput: string;
-  bestBranch: InstanceType<typeof Branch>;
-  totalTokens: number;
-  steps: number;
-  prefixLength: number;
-}
-
-export async function verify(ctx: SessionContext, store: BranchStore, opts: {
-  findings: string;
-  query: string;
-  count: number;
-}): Promise<VerifyResult> {
-  const userContent = VERIFY_PROMPT
-    .replace('{{findings}}', opts.findings)
-    .replace('{{query}}', opts.query);
-
-  const messages = [
-    { role: 'system', content: 'Synthesize the research findings into a coherent, concise summary.' },
-    { role: 'user', content: userContent },
-  ];
-  const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-  const synthTokens = await ctx.tokenize(prompt);
-
-  const synthRoot = Branch.create(ctx, 0, { temperature: 0.7 });
-  await synthRoot.prefill(synthTokens);
-
-  // Fork N branches with reseeded samplers for stochastic divergence
-  const live: { branch: InstanceType<typeof Branch>; output: string; done: boolean; tokenCount: number; ppl: number }[] = [];
-  for (let i = 0; i < opts.count; i++) {
-    const branch = await synthRoot.fork();
-    branch.reseedSampler(2000 + i);
-    live.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity });
-  }
-
-  // BranchStore batched decode — produceSync/commit loop
-  let steps = 0;
-  for (;;) {
-    const entries: [InstanceType<typeof Branch>, number][] = [];
-    for (const a of live) {
-      if (a.done) continue;
-      const { token, text, isStop } = a.branch.produceSync();
-      if (isStop) {
-        const p = a.branch.perplexity;
-        a.ppl = Number.isFinite(p) ? p : Infinity;
-        a.done = true;
-        continue;
-      }
-      entries.push([a.branch, token]);
-      a.output += text;
-      a.tokenCount++;
-    }
-    if (entries.length === 0) break;
-    await store.commit(entries);
-    steps++;
-  }
-
-  // Pick lowest perplexity (most coherent)
-  const bestIdx = live.reduce((bi, a, i) => a.ppl <= live[bi].ppl ? i : bi, 0);
-
-  // Prune non-best attempts; synthRoot stays alive (bestBranch is its child)
-  // — caller's retainOnly will clean up synthRoot when promoting bestBranch
-  for (let i = 0; i < live.length; i++) {
-    if (i !== bestIdx) await live[i].branch.prune();
-  }
-
-  const totalTokens = live.reduce((s, a) => s + a.tokenCount, 0);
-
-  return {
-    attempts: live.map((a) => ({ output: a.output, tokenCount: a.tokenCount, ppl: a.ppl })),
-    bestOutput: live[bestIdx].output,
-    bestBranch: live[bestIdx].branch,
-    totalTokens,
-    steps,
-    prefixLength: synthTokens.length,
-  };
-}
diff --git a/examples/deep-research/tools/grep.ts b/examples/deep-research/tools/grep.ts
index 72cfd68..a0313fe 100644
--- a/examples/deep-research/tools/grep.ts
+++ b/examples/deep-research/tools/grep.ts
@@ -1,67 +1,67 @@
+import { Tool } from '../../../dist/agents/index.js';
+import type { JsonSchema } from '../../../dist/agents/index.js';
 import type { Resource } from '../resources/types.js';
-import type { Tool } from './types.js';
 
-export function createGrepTool(resources: Resource[]): Tool {
-  return {
-    name: 'grep',
-    schema: {
-      type: 'function',
-      function: {
-        name: 'grep',
-        description: 'Search the entire corpus for a regex pattern. Returns every matching line with line numbers and total match count. Complements search() which ranks by relevance — grep scans exhaustively.',
-        parameters: {
-          type: 'object',
-          properties: {
-            pattern: { type: 'string', description: 'Regex pattern (e.g. "\\bshor\\b" for whole-word, "hidden_secret" for literal)' },
-            ignoreCase: { type: 'boolean', description: 'Case-insensitive matching (default: true)' },
-          },
-          required: ['pattern'],
-        },
-      },
+export class GrepTool extends Tool<{ pattern: string; ignoreCase?: boolean }> {
+  readonly name = 'grep';
+  readonly description = 'Search the entire corpus for a regex pattern. Returns every matching line with line numbers and total match count. Complements search() which ranks by relevance — grep scans exhaustively.';
+  readonly parameters: JsonSchema = {
+    type: 'object',
+    properties: {
+      pattern: { type: 'string', description: 'Regex pattern (e.g. "\\bshor\\b" for whole-word, "hidden_secret" for literal)' },
+      ignoreCase: { type: 'boolean', description: 'Case-insensitive matching (default: true)' },
     },
-    async execute(args) {
-      const pattern = (args.pattern as string)?.trim();
-      if (!pattern) return { error: 'pattern must not be empty' };
-      const flags = (args.ignoreCase === false) ? 'g' : 'gi';
-      let re: RegExp;
-      try { re = new RegExp(pattern, flags); }
-      catch { return { error: `Invalid regex: ${pattern}` }; }
+    required: ['pattern'],
+  };
+
+  private _resources: Resource[];
+
+  constructor(resources: Resource[]) {
+    super();
+    this._resources = resources;
+  }
+
+  async execute(args: { pattern: string; ignoreCase?: boolean }): Promise<unknown> {
+    const pattern = args.pattern?.trim();
+    if (!pattern) return { error: 'pattern must not be empty' };
+    const flags = (args.ignoreCase === false) ? 'g' : 'gi';
+    let re: RegExp;
+    try { re = new RegExp(pattern, flags); }
+    catch { return { error: `Invalid regex: ${pattern}` }; }
 
-      const matches: { file: string; line: number; text: string }[] = [];
-      let totalMatches = 0;
+    const matches: { file: string; line: number; text: string }[] = [];
+    let totalMatches = 0;
 
-      for (const res of resources) {
-        const lines = res.content.split('\n');
-        for (let i = 0; i < lines.length; i++) {
-          const hits = lines[i].match(re);
-          if (hits) {
-            totalMatches += hits.length;
-            const raw = lines[i].trim();
-            let text: string;
-            if (raw.length <= 200) {
-              text = raw;
-            } else {
-              // Truncate around first match so the matched term is always visible
-              const idx = raw.search(re);
-              const start = Math.max(0, idx - 40);
-              const end = Math.min(raw.length, start + 200);
-              text = (start > 0 ? '…' : '') + raw.slice(start, end) + (end < raw.length ? '…' : '');
-            }
-            matches.push({ file: res.name, line: i + 1, text });
+    for (const res of this._resources) {
+      const lines = res.content.split('\n');
+      for (let i = 0; i < lines.length; i++) {
+        const hits = lines[i].match(re);
+        if (hits) {
+          totalMatches += hits.length;
+          const raw = lines[i].trim();
+          let text: string;
+          if (raw.length <= 200) {
+            text = raw;
+          } else {
+            const idx = raw.search(re);
+            const start = Math.max(0, idx - 40);
+            const end = Math.min(raw.length, start + 200);
+            text = (start > 0 ? '\u2026' : '') + raw.slice(start, end) + (end < raw.length ? '\u2026' : '');
           }
+          matches.push({ file: res.name, line: i + 1, text });
         }
       }
+    }
 
-      if (totalMatches === 0) {
-        return {
-          totalMatches: 0, matchingLines: 0, matches: [],
-          note: 'Zero matches does NOT mean the topic is absent — only that this exact pattern was not found. Try search() for semantic matching or a broader/simpler regex.',
-        };
-      }
+    if (totalMatches === 0) {
+      return {
+        totalMatches: 0, matchingLines: 0, matches: [],
+        note: 'Zero matches does NOT mean the topic is absent \u2014 only that this exact pattern was not found. Try search() for semantic matching or a broader/simpler regex.',
+      };
+    }
 
-      const limit = 50;
-      const truncated = matches.length > limit;
-      return { totalMatches, matchingLines: matches.length, truncated, matches: matches.slice(0, limit) };
-    },
-  };
+    const limit = 50;
+    const truncated = matches.length > limit;
+    return { totalMatches, matchingLines: matches.length, truncated, matches: matches.slice(0, limit) };
+  }
 }
diff --git a/examples/deep-research/tools/index.ts b/examples/deep-research/tools/index.ts
index 5d09ec3..276730e 100644
--- a/examples/deep-research/tools/index.ts
+++ b/examples/deep-research/tools/index.ts
@@ -1,30 +1,19 @@
+import { createToolkit } from '../../../dist/agents/index.js';
+import type { Toolkit } from '../../../dist/agents/index.js';
 import type { Resource, Chunk } from '../resources/types.js';
-import type { Reranker, Tool, ExecuteToolFn } from './types.js';
-import { createSearchTool } from './search.js';
-import { createReadFileTool } from './read-file.js';
-import { createGrepTool } from './grep.js';
-import { createReportTool } from './report.js';
+import type { Reranker } from './types.js';
+import { SearchTool } from './search.js';
+import { ReadFileTool } from './read-file.js';
+import { GrepTool } from './grep.js';
 
 export function createTools(opts: {
   resources: Resource[];
   chunks: Chunk[];
   reranker: Reranker;
-}): { tools: Tool[]; toolsJson: string; executeTool: ExecuteToolFn } {
-  const tools = [
-    createSearchTool(opts.chunks, opts.reranker),
-    createReadFileTool(opts.resources),
-    createGrepTool(opts.resources),
-    createReportTool(),
-  ];
-
-  const toolsJson = JSON.stringify(tools.map((t) => t.schema));
-  const toolMap = new Map(tools.map((t) => [t.name, t]));
-
-  const executeTool: ExecuteToolFn = async (name, args, context?) => {
-    const tool = toolMap.get(name);
-    if (!tool) return { error: `Unknown tool: ${name}` };
-    return tool.execute(args, context);
-  };
-
-  return { tools, toolsJson, executeTool };
+}): Toolkit {
+  return createToolkit([
+    new SearchTool(opts.chunks, opts.reranker),
+    new ReadFileTool(opts.resources),
+    new GrepTool(opts.resources),
+  ]);
 }
diff --git a/examples/deep-research/tools/read-file.ts b/examples/deep-research/tools/read-file.ts
index 327e54e..3a2851b 100644
--- a/examples/deep-research/tools/read-file.ts
+++ b/examples/deep-research/tools/read-file.ts
@@ -1,39 +1,41 @@
+import { Tool } from '../../../dist/agents/index.js';
+import type { JsonSchema } from '../../../dist/agents/index.js';
 import type { Resource } from '../resources/types.js';
-import type { Tool } from './types.js';
 
-export function createReadFileTool(resources: Resource[]): Tool {
-  return {
-    name: 'read_file',
-    schema: {
-      type: 'function',
-      function: {
-        name: 'read_file',
-        description: 'Read content from a file at specific line ranges. Use startLine/endLine from search results.',
-        parameters: {
-          type: 'object',
-          properties: {
-            filename: {
-              type: 'string',
-              description: 'Filename from search results',
-              enum: resources.map((r) => r.name),
-            },
-            startLine: { type: 'number', description: 'Start line (1-indexed, from search results)' },
-            endLine: { type: 'number', description: 'End line (1-indexed, from search results)' },
-          },
-          required: ['filename'],
+export class ReadFileTool extends Tool<{ filename: string; startLine?: number; endLine?: number }> {
+  readonly name = 'read_file';
+  readonly description = 'Read content from a file at specific line ranges. Use startLine/endLine from search results.';
+  readonly parameters: JsonSchema;
+
+  private _resources: Resource[];
+
+  constructor(resources: Resource[]) {
+    super();
+    this._resources = resources;
+    this.parameters = {
+      type: 'object',
+      properties: {
+        filename: {
+          type: 'string',
+          description: 'Filename from search results',
+          enum: resources.map(r => r.name),
         },
+        startLine: { type: 'number', description: 'Start line (1-indexed, from search results)' },
+        endLine: { type: 'number', description: 'End line (1-indexed, from search results)' },
       },
-    },
-    async execute(args) {
-      const filename = (args.filename as string) || (args.path as string) || '';
-      const file = resources.find((r) => r.name === filename);
-      if (!file) {
-        return { error: `File not found: ${filename}. Available: ${resources.map((r) => r.name).join(', ')}` };
-      }
-      const lines = file.content.split('\n');
-      const s = Math.max(0, ((args.startLine as number) ?? 1) - 1);
-      const e = Math.min(lines.length, (args.endLine as number) ?? Math.min(100, lines.length));
-      return { file: file.name, content: lines.slice(s, e).join('\n') };
-    },
-  };
+      required: ['filename'],
+    };
+  }
+
+  async execute(args: { filename: string; startLine?: number; endLine?: number } & Record<string, unknown>): Promise<unknown> {
+    const filename = args.filename || (args.path as string) || '';
+    const file = this._resources.find(r => r.name === filename);
+    if (!file) {
+      return { error: `File not found: ${filename}. Available: ${this._resources.map(r => r.name).join(', ')}` };
+    }
+    const lines = file.content.split('\n');
+    const s = Math.max(0, (args.startLine ?? 1) - 1);
+    const e = Math.min(lines.length, args.endLine ?? Math.min(100, lines.length));
+    return { file: file.name, content: lines.slice(s, e).join('\n') };
+  }
 }
diff --git a/examples/deep-research/tools/report.ts b/examples/deep-research/tools/report.ts
deleted file mode 100644
index 253a820..0000000
--- a/examples/deep-research/tools/report.ts
+++ /dev/null
@@ -1,22 +0,0 @@
-import type { Tool } from './types.js';
-
-export function createReportTool(): Tool {
-  return {
-    name: 'report',
-    schema: {
-      type: 'function',
-      function: {
-        name: 'report',
-        description: 'Submit your final research findings. Call this when you have gathered enough information to answer the question.',
-        parameters: {
-          type: 'object',
-          properties: { findings: { type: 'string', description: 'Your research findings and answer' } },
-          required: ['findings'],
-        },
-      },
-    },
-    async execute() {
-      return { acknowledged: true };
-    },
-  };
-}
diff --git a/examples/deep-research/tools/search.ts b/examples/deep-research/tools/search.ts
index d45788d..df20630 100644
--- a/examples/deep-research/tools/search.ts
+++ b/examples/deep-research/tools/search.ts
@@ -1,30 +1,34 @@
+import { Tool } from '../../../dist/agents/index.js';
+import type { JsonSchema, ToolContext } from '../../../dist/agents/index.js';
 import type { Chunk } from '../resources/types.js';
-import type { Reranker, Tool } from './types.js';
+import type { Reranker } from './types.js';
 
-export function createSearchTool(chunks: Chunk[], reranker: Reranker): Tool {
-  return {
-    name: 'search',
-    schema: {
-      type: 'function',
-      function: {
-        name: 'search',
-        description: 'Search the knowledge base. Returns sections ranked by relevance with line ranges for read_file.',
-        parameters: {
-          type: 'object',
-          properties: { query: { type: 'string', description: 'Search query' } },
-          required: ['query'],
-        },
-      },
-    },
-    async execute(args, context?) {
-      const query = (args.query as string)?.trim();
-      if (!query) return { error: 'query must not be empty' };
-      let last;
-      for await (const { results, filled, total } of reranker.score(query, chunks)) {
-        if (context?.onProgress) context.onProgress({ filled, total });
-        last = results;
-      }
-      return last;
-    },
+export class SearchTool extends Tool<{ query: string }> {
+  readonly name = 'search';
+  readonly description = 'Search the knowledge base. Returns sections ranked by relevance with line ranges for read_file.';
+  readonly parameters: JsonSchema = {
+    type: 'object',
+    properties: { query: { type: 'string', description: 'Search query' } },
+    required: ['query'],
   };
+
+  private _chunks: Chunk[];
+  private _reranker: Reranker;
+
+  constructor(chunks: Chunk[], reranker: Reranker) {
+    super();
+    this._chunks = chunks;
+    this._reranker = reranker;
+  }
+
+  async execute(args: { query: string }, context?: ToolContext): Promise<unknown> {
+    const query = args.query?.trim();
+    if (!query) return { error: 'query must not be empty' };
+    let last;
+    for await (const { results, filled, total } of this._reranker.score(query, this._chunks)) {
+      if (context?.onProgress) context.onProgress({ filled, total });
+      last = results;
+    }
+    return last;
+  }
 }
diff --git a/examples/deep-research/tools/types.ts b/examples/deep-research/tools/types.ts
index 3ce04dd..4a9fc8d 100644
--- a/examples/deep-research/tools/types.ts
+++ b/examples/deep-research/tools/types.ts
@@ -19,18 +19,3 @@ export interface Reranker {
   tokenizeChunks(chunks: Chunk[]): Promise<void>;
   dispose(): void;
 }
-
-export interface Tool {
-  name: string;
-  schema: object;
-  execute: (
-    args: Record<string, unknown>,
-    context?: { onProgress?: (p: { filled: number; total: number }) => void },
-  ) => Promise<unknown>;
-}
-
-export type ExecuteToolFn = (
-  name: string,
-  args: Record<string, unknown>,
-  context?: { onProgress?: (p: { filled: number; total: number }) => void },
-) => Promise<unknown>;
diff --git a/package-lock.json b/package-lock.json
index 94ddd88..e618748 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -10,6 +10,7 @@
       "license": "Apache-2.0",
       "dependencies": {
         "@lloyal-labs/tsampler": "^0.2.0",
+        "effection": "^4.0.2",
         "node-addon-api": "^8.5.0"
       },
       "devDependencies": {
@@ -528,43 +529,173 @@
       }
     },
     "node_modules/@lloyal-labs/lloyal.node-darwin-arm64": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-arm64/-/lloyal.node-darwin-arm64-1.6.0.tgz",
+      "integrity": "sha512-T8Xt2ZSyY7yLQQgVLQZhR4Wb61LuEEnZdSF7+C0wu9BbB/DMyum2Ix6lDsufGf/oXOLiSrwVbNUvIplfE6u7YQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-darwin-x64": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-x64/-/lloyal.node-darwin-x64-1.6.0.tgz",
+      "integrity": "sha512-AlHhmFFoU8J1BNsqGc0leok0R+Ot4jzm3d1O/atPAi8EMFmFoSI6/af9iJcKy1//+goiPGlA1sl3BH+d7o/syw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64/-/lloyal.node-linux-arm64-1.6.0.tgz",
+      "integrity": "sha512-cQJpiy061atIRRYErbnP6UjFo6owxa2dGEIGQ4u/DcwDaGX/cQMnT/PQCxHMcjGNVb8M6eDwL0Qc07SwCHikMg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64-cuda": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64-cuda/-/lloyal.node-linux-arm64-cuda-1.6.0.tgz",
+      "integrity": "sha512-K0qIdYBOWctBOwrXmTmcgoSRhDLWdBsCZTMLiYTIp8wVDF6SEBmVBNG4DNTqCaM9482RiKoJcKLnNwleCCkiLw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64-vulkan": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64-vulkan/-/lloyal.node-linux-arm64-vulkan-1.6.0.tgz",
+      "integrity": "sha512-7+3f8gUa3e8j/DatoZqrllcUkJgTHcjWzIOlP1t5443ed9pKlZWonMx/1hPyF51rIMvozodR7QVCtY12hYSOfg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64/-/lloyal.node-linux-x64-1.6.0.tgz",
+      "integrity": "sha512-fr7h/rcpDCehveoJMskohq3mGT4moU5NqFKlaXOZEADC1PXNjjmTktcznU4SRAFGYjBhlAF3uDr4JM7CQBNY8Q==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64-cuda": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64-cuda/-/lloyal.node-linux-x64-cuda-1.6.0.tgz",
+      "integrity": "sha512-ddD4NtOGUzSFSYfBZLs42Y6mEplsMrQr6UXYbz1HBHUTQ867lJodF+nxsos03lcIAuAWxRCLFAUz2Nk8KgRrQA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64-vulkan": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64-vulkan/-/lloyal.node-linux-x64-vulkan-1.6.0.tgz",
+      "integrity": "sha512-F0t83fJnNJl9LZB+5kwXhvqLV1ZtXrFLNp8c/4JySA1lDvnZuGt8AyGbfzqMO938X5y86yiHafNsbGQgdT7OpA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-arm64": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-arm64/-/lloyal.node-win32-arm64-1.6.0.tgz",
+      "integrity": "sha512-BHrSLnMlYnJ1YloRziLL2VqiMswXPtLFaEKmsq4EyVRBYTcs8rGxxobptM3Kf5FrDnjL5PYSxPnwpSy90tQk0Q==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-arm64-vulkan": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-arm64-vulkan/-/lloyal.node-win32-arm64-vulkan-1.6.0.tgz",
+      "integrity": "sha512-o4gBVTXCYLF/gPWfsoTXwSL/uyu/b54sk+nMDyAue5TwBPos1vfYPjm02wNIEv9K17NbuG8Cz05cDU+xSn34OQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64/-/lloyal.node-win32-x64-1.6.0.tgz",
+      "integrity": "sha512-PifF8Iy1IOfJIPa32ppI2ODaKk0x0Cmvkgg2nExLuG+DhxYc7KeTAJQ5es605ll9hTUOXrQVIPSZORBHYzUEbg==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64-cuda": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64-cuda/-/lloyal.node-win32-x64-cuda-1.6.0.tgz",
+      "integrity": "sha512-OUIc4G1tkxJp3N5VUhSanmZsPAcffg0JbUyhlNVV9v4EilQ7wxy3s0fN0gQfDl0XEaUXTc2zujuibdj3htLG9Q==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64-vulkan": {
-      "optional": true
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64-vulkan/-/lloyal.node-win32-x64-vulkan-1.6.0.tgz",
+      "integrity": "sha512-q39Kh+SYGna/8DDQ8ncEHAMRym57IerLrLQ876+gO79HjdgdGfayw1H1yclmIqWczYzQuY/qPFxWTPoF3Pmdvw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/tsampler": {
       "version": "0.2.0",
@@ -919,6 +1050,15 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/effection": {
+      "version": "4.0.2",
+      "resolved": "https://registry.npmjs.org/effection/-/effection-4.0.2.tgz",
+      "integrity": "sha512-O8WMGP10nPuJDwbNGILcaCNWS+CvDYjcdsUSD79nWZ+WtUQ8h1MEV7JJwCSZCSeKx8+TdEaZ/8r6qPTR2o/o8w==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 16"
+      }
+    },
     "node_modules/emoji-regex": {
       "version": "9.2.2",
       "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-9.2.2.tgz",
diff --git a/package.json b/package.json
index 6dcedfc..ce36291 100644
--- a/package.json
+++ b/package.json
@@ -46,6 +46,7 @@
   "homepage": "https://github.com/lloyal-ai/lloyal.node#readme",
   "dependencies": {
     "@lloyal-labs/tsampler": "^0.2.0",
+    "effection": "^4.0.2",
     "node-addon-api": "^8.5.0"
   },
   "devDependencies": {
diff --git a/src/Agent.ts b/src/Agent.ts
deleted file mode 100644
index fde7486..0000000
--- a/src/Agent.ts
+++ /dev/null
@@ -1,288 +0,0 @@
-import type { Branch } from './Branch';
-import {
-  GrammarTriggerType,
-  type AgentState,
-  type AgentTask,
-  type ParsedToolCall,
-  type RunAgentsOptions,
-  type RunAgentsResult,
-  type SessionContext,
-} from './types';
-import { buildToolResultDelta } from './Session';
-
-/**
- * Fork an agent from a parent branch with its own system prompt + task
- *
- * Always prepends getTurnSeparator() for a clean structural break before
- * the agent's system prompt. Returns AgentState ready for store.prefill().
- *
- * @param parent - Branch to fork from
- * @param task - Agent task description
- * @param ctx - SessionContext for formatting and tokenization
- * @returns AgentState with branch and suffixTokens
- *
- * @example
- * ```typescript
- * const agent = await forkAgent(trunk, {
- *   systemPrompt: 'You are a research assistant.',
- *   content: 'What is X?',
- *   tools: toolsJson,
- *   seed: Date.now(),
- * }, ctx);
- * await store.prefill([[agent.branch, agent.suffixTokens]]);
- * ```
- *
- * @category Branching
- */
-export async function forkAgent(
-  parent: Branch,
-  task: AgentTask,
-  ctx: SessionContext
-): Promise<AgentState> {
-  const branch = await parent.fork();
-  const messages = [
-    { role: 'system', content: task.systemPrompt },
-    { role: 'user', content: task.content },
-  ];
-  const fmtOpts = task.tools ? { tools: task.tools } : {};
-  const fmt = await ctx.formatChat(JSON.stringify(messages), fmtOpts);
-  const sep = ctx.getTurnSeparator();
-  const suffixTokens = [...sep, ...await ctx.tokenize(fmt.prompt, false)];
-  if (task.seed != null) branch.reseedSampler(task.seed);
-  return {
-    agentId: branch.handle,
-    branch,
-    suffixTokens,
-    fmt: {
-      format: fmt.format,
-      reasoningFormat: fmt.reasoningFormat,
-      thinkingForcedOpen: fmt.thinkingForcedOpen,
-      parser: fmt.parser,
-      grammar: fmt.grammar,
-      grammarLazy: fmt.grammarLazy,
-      grammarTriggers: fmt.grammarTriggers,
-    },
-    rawOutput: '',
-    done: false,
-    tokenCount: 0,
-    toolCallCount: 0,
-    turns: 0,
-    findings: null,
-  };
-}
-
-/**
- * Run agents in a batched three-phase tick loop
- *
- * Preserves the mechanical execution wins from BranchStore:
- * shared-prefix KV, batched decode, fire-and-forget tools, idle yield.
- *
- * @param agents - Array of AgentState (from forkAgent or manual construction)
- * @param opts - Configuration including store, ctx, executeTool, and callbacks
- * @returns Aggregate statistics
- *
- * @example
- * ```typescript
- * const result = await runAgents(agents, {
- *   store, ctx,
- *   executeTool: (name, args) => myToolDispatch(name, args),
- *   maxTurns: 6,
- *   onToolCall(agentId, name, args) { console.log(`Agent ${agentId}: ${name}`); },
- * });
- * ```
- *
- * @category Branching
- */
-export async function runAgents(
-  agents: AgentState[],
-  opts: RunAgentsOptions
-): Promise<RunAgentsResult> {
-  const { store, ctx, executeTool, maxTurns = 100, onProduce, onToolCall, onToolResult, onToolProgress, onReport } = opts;
-
-  let steps = 0;
-  let totalToolCalls = 0;
-  const counters = {
-    warmPrefillCalls: 0,
-    warmPrefillBranches: 0,
-    stalledTicks: 0,
-    maxConcurrentTools: 0,
-    idleTicks: 0,
-  };
-
-  // Keyed by agentId (= branch handle) — stable across reordering
-  const pendingTools = new Map<number, {
-    promise: Promise<{ agentId: number; prefillTokens: number[] | null }>;
-    name: string;
-  }>();
-
-  function dispatchTool(w: AgentState, tc: ParsedToolCall): void {
-    let toolArgs: Record<string, unknown>;
-    try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
-    const callId = tc.id || `call_${w.toolCallCount}`;
-
-    w.toolCallCount++;
-    totalToolCalls++;
-    w.turns++;
-
-    if (onToolCall) onToolCall(w.agentId, tc.name, tc.arguments);
-
-    const toolContext = onToolProgress ? {
-      onProgress: (p: { filled: number; total: number }) => onToolProgress(w.agentId, tc.name, p),
-    } : undefined;
-
-    const promise = (async () => {
-      try {
-        const result = await executeTool(tc.name, toolArgs, toolContext);
-        const resultStr = JSON.stringify(result);
-
-        if (onToolResult) onToolResult(w.agentId, tc.name, resultStr);
-
-        const prefillTokens = await buildToolResultDelta(ctx, resultStr, callId);
-        return { agentId: w.agentId, prefillTokens: prefillTokens as number[] | null };
-      } catch (err) {
-        w.done = true;
-        w.findings = `Tool error: ${(err as Error).message}`;
-        return { agentId: w.agentId, prefillTokens: null };
-      }
-    })();
-
-    pendingTools.set(w.agentId, { promise, name: tc.name });
-    counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingTools.size);
-  }
-
-  // Build agentId → index lookup for SETTLE phase
-  const agentById = new Map(agents.map((w) => [w.agentId, w]));
-
-  // Lazy grammar: unconstrained until trigger fires, then grammar-constrained.
-  // Prevents Qwen3 from generating JSON tool calls instead of expected XML.
-  //
-  // Upstream triggers include tool_start (e.g. "<tool_call>\n<function="),
-  // which fires AFTER the model has already committed to XML — useless when
-  // the model diverges to JSON. Truncate WORD triggers to scope_start only
-  // (e.g. "<tool_call>\n") so the grammar activates at the divergence point
-  // and forces the correct format.
-  const applyLazyGrammar = (w: AgentState): void => {
-    if (w.fmt.grammar && w.fmt.grammarLazy && w.fmt.grammarTriggers.length > 0) {
-      const triggers = w.fmt.grammarTriggers.map(t => {
-        if (t.type === GrammarTriggerType.WORD) {
-          const nlIdx = t.value.indexOf('\n');
-          if (nlIdx >= 0 && nlIdx < t.value.length - 1) {
-            return { ...t, value: t.value.slice(0, nlIdx + 1) };
-          }
-        }
-        return t;
-      });
-      w.branch.setGrammarLazy(w.fmt.grammar, triggers);
-    }
-  };
-  for (const w of agents) applyLazyGrammar(w);
-
-  for (;;) {
-    // -- Phase 1: PRODUCE -- sample from active agents
-    const entries: [Branch, number][] = [];
-    for (const w of agents) {
-      if (w.done || pendingTools.has(w.agentId)) continue;
-
-      const { token, text, isStop } = w.branch.produceSync();
-      if (isStop) {
-        const parsed = ctx.parseChatOutput(w.rawOutput, w.fmt.format, {
-          reasoningFormat: w.fmt.reasoningFormat,
-          thinkingForcedOpen: w.fmt.thinkingForcedOpen,
-          parser: w.fmt.parser,
-        });
-
-        const tc = parsed.toolCalls[0];
-        if (!tc || w.turns >= maxTurns) {
-          w.done = true;
-          // Accept content as findings only if agent did actual research
-          if (!w.findings && w.toolCallCount > 0 && parsed.content) {
-            w.findings = parsed.content;
-            if (onReport) onReport(w.agentId, w.findings);
-          }
-          continue;
-        }
-
-        if (tc.name === 'report') {
-          if (w.toolCallCount === 0) {
-            // Reject report without prior research — force the agent to use tools first
-            const callId = tc.id || `call_${w.toolCallCount}`;
-            const errorMsg = 'You must search or read the corpus before reporting. Use search, grep, or read_file first.';
-            w.turns++;
-            const promise = (async () => {
-              const prefillTokens = await buildToolResultDelta(ctx, JSON.stringify({ error: errorMsg }), callId);
-              return { agentId: w.agentId, prefillTokens: prefillTokens as number[] | null };
-            })();
-            pendingTools.set(w.agentId, { promise, name: tc.name });
-            w.rawOutput = '';
-            continue;
-          }
-          try { w.findings = JSON.parse(tc.arguments).findings; } catch { w.findings = tc.arguments; }
-          w.done = true;
-          w.toolCallCount++;
-          totalToolCalls++;
-          if (onToolCall) onToolCall(w.agentId, 'report', tc.arguments);
-          if (onReport) onReport(w.agentId, w.findings!);
-          continue;
-        }
-
-        // Fire-and-forget — dispatch tool without blocking the decode loop
-        dispatchTool(w, tc);
-        w.rawOutput = '';
-        continue;
-      }
-
-      entries.push([w.branch, token]);
-      w.rawOutput += text;
-      w.tokenCount++;
-      if (onProduce) onProduce(w.agentId, text, w.tokenCount);
-    }
-
-    // -- Phase 2: COMMIT -- batch-decode produced tokens
-    if (entries.length > 0) {
-      await store.commit(entries);
-      steps++;
-    }
-
-    // -- Phase 3: SETTLE -- non-blocking check for resolved tools
-    const prefillPairs: [Branch, number[]][] = [];
-    for (const [id, info] of pendingTools) {
-      const result = await Promise.race([info.promise, Promise.resolve(null)]);
-      if (result !== null) {
-        pendingTools.delete(id);
-        if (result.prefillTokens) {
-          const w = agentById.get(result.agentId)!;
-          prefillPairs.push([w.branch, result.prefillTokens]);
-        }
-      }
-    }
-
-    if (prefillPairs.length > 0) {
-      await store.prefill(prefillPairs);
-      counters.warmPrefillCalls++;
-      counters.warmPrefillBranches += prefillPairs.length;
-
-      // Reset lazy grammar — previous grammar consumed the tool call and is
-      // now in a terminal state. Fresh grammar awaits the next trigger.
-      for (const [branch] of prefillPairs) {
-        const w = agents.find(a => a.branch === branch);
-        if (w) applyLazyGrammar(w);
-      }
-    }
-
-    // -- Termination + idle yield
-    const allDone = agents.every((w) => w.done) && pendingTools.size === 0;
-    if (allDone) break;
-
-    if (entries.length === 0 && pendingTools.size > 0) {
-      counters.stalledTicks++;
-      if (prefillPairs.length === 0) {
-        // Nothing produced, nothing settled — yield until a tool resolves
-        await Promise.race([...pendingTools.values()].map((i) => i.promise));
-        counters.idleTicks++;
-      }
-    }
-  }
-
-  const totalTokens = agents.reduce((s, w) => s + w.tokenCount, 0);
-  return { totalTokens, totalToolCalls, steps, counters };
-}
diff --git a/src/Branch.ts b/src/Branch.ts
index 6991cf2..be5e286 100644
--- a/src/Branch.ts
+++ b/src/Branch.ts
@@ -175,6 +175,13 @@ export class Branch {
    */
   async prune(): Promise<void> {
     if (this._disposed) return;
+    const liveChildren = this.children;
+    const childCount = liveChildren.length;
+    if (childCount !== 0) { // a branch that still has children is never pruned here
+      const listing = liveChildren.join(', ');
+      const reason = `Branch.prune(): branch ${this._handle} has ${childCount} active child(ren) [${listing}].`;
+      throw new Error(`${reason} Prune children first or use pruneSubtree().`);
+    }
     this._ctx._branchPrune(this._handle);
     this._disposed = true;
   }
diff --git a/src/Session.ts b/src/Session.ts
index 63fea5e..a03d907 100644
--- a/src/Session.ts
+++ b/src/Session.ts
@@ -1,52 +1,7 @@
 import type { Branch } from './Branch';
 import type { BranchStore } from './BranchStore';
 import type { SessionContext } from './types';
-
-/**
- * Build token delta for a user turn (sep + formatChat + tokenize)
- *
- * Usable with any branch — not tied to Session's trunk. This is the
- * canonical way to build a user-turn delta for warm prefill.
- *
- * @category Branching
- */
-export async function buildUserDelta(
-  ctx: SessionContext,
-  content: string,
-  opts: { tools?: string } = {}
-): Promise<number[]> {
-  const sep = ctx.getTurnSeparator();
-  const fmtOpts = opts.tools ? { tools: opts.tools } : {};
-  const { prompt } = await ctx.formatChat(
-    JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]),
-    fmtOpts
-  );
-  const delta = await ctx.tokenize(prompt, false);
-  return [...sep, ...delta];
-}
-
-/**
- * Build token delta for a tool result turn (sep + formatChat + tokenize)
- *
- * Usable with any branch — not tied to Session's trunk.
- *
- * @category Branching
- */
-export async function buildToolResultDelta(
-  ctx: SessionContext,
-  resultStr: string,
-  callId: string
-): Promise<number[]> {
-  const sep = ctx.getTurnSeparator();
-  const { prompt } = await ctx.formatChat(
-    JSON.stringify([
-      { role: 'system', content: '' },
-      { role: 'tool', content: resultStr, tool_call_id: callId },
-    ])
-  );
-  const delta = await ctx.tokenize(prompt, false);
-  return [...sep, ...delta];
-}
+import { buildUserDelta, buildToolResultDelta } from './agents/deltas';
 
 /**
  * Session - Trunk lifecycle + conversation delta helpers
diff --git a/src/agents/Tool.ts b/src/agents/Tool.ts
new file mode 100644
index 0000000..20c34d8
--- /dev/null
+++ b/src/agents/Tool.ts
@@ -0,0 +1,76 @@
+import type { JsonSchema, ToolSchema, ToolContext } from './types';
+
+/**
+ * Abstract base class for tools usable by agents in the runtime
+ *
+ * Subclass to define tools that agents can invoke during generation.
+ * Implement `name`, `description`, `parameters`, and `execute()`. The
+ * {@link schema} getter auto-generates the OpenAI-compatible function
+ * schema expected by `formatChat()`.
+ *
+ * Pass tool instances to {@link createToolkit} to build the `toolMap`
+ * and `toolsJson` pair consumed by {@link useAgentPool} and
+ * {@link runAgents}.
+ *
+ * @example Search tool
+ * ```typescript
+ * class SearchTool extends Tool<{ query: string; topK?: number }> {
+ *   readonly name = 'search';
+ *   readonly description = 'Search the corpus for relevant passages';
+ *   readonly parameters: JsonSchema = {
+ *     type: 'object',
+ *     properties: {
+ *       query: { type: 'string', description: 'Search query' },
+ *       topK: { type: 'number', description: 'Number of results' },
+ *     },
+ *     required: ['query'],
+ *   };
+ *
+ *   async execute(args: { query: string; topK?: number }, ctx?: ToolContext) {
+ *     const results = await this.reranker.rank(args.query, args.topK ?? 5);
+ *     return { results };
+ *   }
+ * }
+ * ```
+ *
+ * @category Agents
+ */
+export abstract class Tool<TArgs = Record<string, unknown>> {
+  /** Identifier the model uses to address this tool in a tool call */
+  abstract readonly name: string;
+  /** Plain-language summary of the tool that is shown to the model */
+  abstract readonly description: string;
+  /** JSON Schema for the argument object this tool accepts */
+  abstract readonly parameters: JsonSchema;
+
+  /**
+   * Run the tool against already-parsed arguments
+   *
+   * The agent pool invokes this when the model emits a tool call whose
+   * name matches this tool. Whatever is returned gets JSON-serialized
+   * and prefilled into the calling agent's context as the tool result.
+   *
+   * @param args - Arguments parsed from the model's tool call
+   * @param context - Execution context carrying the progress callback
+   * @returns Result value (JSON-serialized by the caller)
+   */
+  abstract execute(args: TArgs, context?: ToolContext): Promise<unknown>;
+
+  /**
+   * Function-tool schema in the OpenAI-compatible shape
+   *
+   * Assembled on each access from `name`, `description`, and
+   * `parameters`. {@link createToolkit} uses it to build the JSON
+   * string handed to `formatChat()`.
+   */
+  get schema(): ToolSchema {
+    const { name, description, parameters } = this;
+    const built: ToolSchema = {
+      type: 'function',
+      function: { name, description, parameters },
+    };
+    return built;
+  }
+}
diff --git a/src/agents/agent-pool.ts b/src/agents/agent-pool.ts
new file mode 100644
index 0000000..e382da3
--- /dev/null
+++ b/src/agents/agent-pool.ts
@@ -0,0 +1,442 @@
+import { resource, call, action, useScope } from 'effection';
+import type { Operation, Scope } from 'effection';
+import type { Branch } from '../Branch';
+import { GrammarTriggerType, type GrammarTrigger, type ParsedToolCall, type SessionContext } from '../types';
+import type { BranchStore } from '../BranchStore';
+import { Ctx, Store, Events } from './context';
+import { buildToolResultDelta } from './deltas';
+import type { Tool } from './Tool';
+import type {
+  TraceToken,
+  AgentTaskSpec,
+  AgentPoolOptions,
+  AgentPoolResult,
+  AgentResult,
+  AgentEvent,
+} from './types';
+import type { Signal } from 'effection';
+
+// ── Internal agent state machine ───────────────────────────────
+// generating → awaiting_tool → generating  (tool result prefilled)
+// generating → done                         (stop + no tool call, or report)
+// awaiting_tool → done                      (tool error)
+
+type AgentInternalState = 'generating' | 'awaiting_tool' | 'done';
+
+interface AgentInternal { // per-agent bookkeeping record built by setupAgent()
+  id: number;           // = branch.handle
+  branch: Branch; // fork of the task's parent branch
+  state: AgentInternalState; // starts as 'generating'; dispatchTool() switches it to 'awaiting_tool'
+  fmt: { // fields copied from the formatChat() result at setup time
+    format: number;
+    reasoningFormat: number;
+    thinkingForcedOpen: boolean;
+    parser: string;
+    grammar: string; // lazy grammar is installed only when grammar, grammarLazy and at least one trigger are set
+    grammarLazy: boolean;
+    grammarTriggers: GrammarTrigger[];
+  };
+  rawOutput: string; // starts as ''; NOTE(review): presumably the text generated in the current turn — confirm in the tick loop
+  tokenCount: number; // starts at 0
+  toolCallCount: number; // incremented by dispatchTool(); also used for the fallback call id `call_N`
+  turns: number; // incremented on every dispatched tool call
+  findings: string | null; // null at setup; a failed tool stores "Tool error: …" here
+  traceBuffer: TraceToken[]; // starts empty; NOTE(review): presumably filled only when tracing is enabled — confirm
+}
+
+interface SettledTool { // one completed tool call, queued in settledBuffer by dispatchTool()
+  agentId: number; // id of the agent that issued the call
+  prefillTokens: number[]; // tokens from buildToolResultDelta() for the JSON-serialized result
+  toolName: string; // name taken from the parsed tool call
+}
+
+// Schema of the built-in `report` tool — setupAgent() adds it (through ensureReportTool) whenever a task supplies tools.
+// useAgentPool() intercepts report calls (never dispatched to execute()).
+const REPORT_SCHEMA = {
+  type: 'function' as const,
+  function: {
+    name: 'report',
+    description: 'Submit your final research findings. Call this when you have gathered enough information to answer the question.',
+    parameters: {
+      type: 'object',
+      properties: { findings: { type: 'string', description: 'Your research findings and answer' } },
+      required: ['findings'], // the single argument is mandatory
+    },
+  },
+};
+
+/** Append the built-in `report` schema to a JSON array of tool schemas unless one named `report` exists; returns the JSON to use. */
+function ensureReportTool(toolsJson: string): string {
+  const schemas: unknown = JSON.parse(toolsJson); // malformed JSON still throws SyntaxError
+  if (!Array.isArray(schemas)) throw new TypeError('ensureReportTool: tools must be a JSON array of tool schemas'); // previously an opaque "some is not a function"
+  const hasReport = schemas.some((s: { function?: { name?: unknown } } | null) => s?.function?.name === 'report'); // null entries no longer crash the lookup
+  return hasReport ? toolsJson : JSON.stringify([...schemas, REPORT_SCHEMA]); // input string is returned untouched when report is already declared
+}
+
+/**
+ * Fork an agent from a parent branch with its own system prompt and task.
+ *
+ * Formats and tokenizes the agent's messages first and forks last, so a
+ * failure while building the prompt cannot strand a forked branch. The
+ * `report` schema is injected when the task has tools; a seed reseeds the sampler.
+ */
+async function setupAgent(
+  parent: Branch,
+  task: AgentTaskSpec,
+  ctx: SessionContext,
+): Promise<{ agent: AgentInternal; suffixTokens: number[] }> {
+  const messages = [
+    { role: 'system', content: task.systemPrompt },
+    { role: 'user', content: task.content },
+  ];
+  const tools = task.tools ? ensureReportTool(task.tools) : undefined; // may throw on malformed tools JSON
+  const fmtOpts = tools ? { tools } : {};
+  const fmt = await ctx.formatChat(JSON.stringify(messages), fmtOpts);
+  const sep = ctx.getTurnSeparator();
+  const suffixTokens = [...sep, ...await ctx.tokenize(fmt.prompt, false)]; // turn separator precedes the formatted prompt
+  const branch = await parent.fork(); // forked only after prompt construction succeeded, so an error above leaves no orphaned child
+  if (task.seed != null) branch.reseedSampler(task.seed);
+
+  return {
+    agent: {
+      id: branch.handle,
+      branch,
+      state: 'generating',
+      fmt: {
+        format: fmt.format,
+        reasoningFormat: fmt.reasoningFormat,
+        thinkingForcedOpen: fmt.thinkingForcedOpen,
+        parser: fmt.parser,
+        grammar: fmt.grammar,
+        grammarLazy: fmt.grammarLazy,
+        grammarTriggers: fmt.grammarTriggers,
+      },
+      rawOutput: '',
+      tokenCount: 0,
+      toolCallCount: 0,
+      turns: 0,
+      findings: null,
+      traceBuffer: [],
+    },
+    suffixTokens,
+  };
+}
+
+/**
+ * Concurrent agent generation loop as an Effection resource
+ *
+ * Runs N agents in parallel using a three-phase tick loop over shared
+ * {@link BranchStore} infrastructure. Each agent forks from a parent
+ * branch, generates tokens, invokes tools, and reports findings.
+ *
+ * **Three-phase tick loop:**
+ * 1. **PRODUCE** — sample all active agents via `produceSync()` (no async gap)
+ * 2. **COMMIT** — single GPU call via `store.commit()` for all produced tokens
+ * 3. **SETTLE** — drain settled tool results, batch prefill, reset grammars
+ *
+ * Tool dispatch uses `scope.run()` for eager start — tool executions run as
+ * children of the agent pool scope and are cancelled if the scope exits.
+ *
+ * **Resource semantics:** `provide()` suspends after all agents complete,
+ * keeping branches alive so the caller can fork from them (e.g. for
+ * verification). Branches are pruned in the finally block when the
+ * scope exits.
+ *
+ * For automatic branch cleanup on return, use {@link runAgents} instead.
+ *
+ * @param opts - Pool configuration: tasks, tools, sampling params, max turns
+ * @returns Agent pool result with per-agent findings and aggregate statistics
+ *
+ * @example Shared root with agent pool
+ * ```typescript
+ * const pool = yield* withSharedRoot(
+ *   { systemPrompt: RESEARCH_PROMPT, tools: toolsJson },
+ *   function*(root) {
+ *     return yield* useAgentPool({
+ *       tasks: questions.map(q => ({
+ *         systemPrompt: RESEARCH_PROMPT,
+ *         content: q,
+ *         tools: toolsJson,
+ *         parent: root,
+ *       })),
+ *       tools: toolMap,
+ *       maxTurns: 6,
+ *     });
+ *   },
+ * );
+ * ```
+ *
+ * @category Agents
+ */
+export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult> {
+  return resource(function*(provide) {
+    const ctx: SessionContext = yield* Ctx.expect();
+    const store: BranchStore = yield* Store.expect();
+    const events: Signal<AgentEvent, void> = yield* Events.expect();
+    const scope: Scope = yield* useScope();
+    const { tasks, tools, maxTurns = 100, trace = false } = opts;
+
+    // ── Setup: fork branches, collect suffix tokens ──────────
+    const agents: AgentInternal[] = [];
+    const prefillSetup: [Branch, number[]][] = [];
+
+    for (const task of tasks) {
+      // Each task must supply its own parent branch (this is what allows tree topologies); there is no shared-root fallback
+      const parent = task.parent;
+      if (!parent) throw new Error('useAgentPool: each task must have a parent branch');
+
+      const { agent, suffixTokens } = yield* call(() => setupAgent(parent, task, ctx));
+      agents.push(agent);
+      prefillSetup.push([agent.branch, suffixTokens]);
+    }
+
+    // Batch prefill all agent suffixes
+    yield* call(() => store.prefill(prefillSetup));
+
+    // ── Lazy grammar setup ───────────────────────────────────
+    const applyLazyGrammar = (a: AgentInternal): void => {
+      if (a.fmt.grammar && a.fmt.grammarLazy && a.fmt.grammarTriggers.length > 0) {
+        const triggers = a.fmt.grammarTriggers.map(t => {
+          if (t.type === GrammarTriggerType.WORD) {
+            const nlIdx = t.value.indexOf('\n');
+            if (nlIdx >= 0 && nlIdx < t.value.length - 1) {
+              return { ...t, value: t.value.slice(0, nlIdx + 1) };
+            }
+          }
+          return t;
+        });
+        a.branch.setGrammarLazy(a.fmt.grammar, triggers);
+      }
+    };
+    for (const a of agents) applyLazyGrammar(a);
+
+    // ── Tool dispatch coordination ───────────────────────────
+    // Plain JS buffer: spawned tool tasks push synchronously on completion.
+    // SETTLE drains with splice(0). Safe because generators are synchronous
+    // between yields — spawns can only push at yield points (during COMMIT's
+    // yield* call()), and SETTLE runs after COMMIT in the same tick.
+    const settledBuffer: SettledTool[] = [];
+    const agentById = new Map(agents.map(a => [a.id, a]));
+
+    // Track pending tool count for idle detection
+    let pendingToolCount = 0;
+
+    // Resolve function for idle wake — set when all agents stall
+    let wakeIdle: (() => void) | null = null;
+
+    let steps = 0;
+    let totalToolCalls = 0;
+    const counters = {
+      warmPrefillCalls: 0,
+      warmPrefillBranches: 0,
+      stalledTicks: 0,
+      maxConcurrentTools: 0,
+      idleTicks: 0,
+    };
+
+    // Start `tc` for `agent` without blocking the decode loop: bump counters, mark the agent
+    // 'awaiting_tool', and run the tool in a child task that queues its result in settledBuffer.
+    function dispatchTool(agent: AgentInternal, tc: ParsedToolCall): void {
+      let toolArgs: Record<string, unknown>;
+      try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
+      const callId = tc.id || `call_${agent.toolCallCount}`;
+
+      agent.toolCallCount++;
+      totalToolCalls++;
+      agent.turns++;
+      agent.state = 'awaiting_tool';
+      events.send({ type: 'agent:tool_call', agentId: agent.id, tool: tc.name, args: tc.arguments });
+      const tool = tools.get(tc.name);
+      pendingToolCount++;
+      counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingToolCount);
+
+      // scope.run() — eager start, child of agent pool scope, cancelled if scope exits.
+      // spawn() is lazy (Operation), but we're in a plain function — scope.run() is eager.
+      scope.run(function*() {
+        try {
+          const toolContext = {
+            onProgress: (p: { filled: number; total: number }) => {
+              events.send({ type: 'agent:tool_progress', agentId: agent.id, tool: tc.name, filled: p.filled, total: p.total });
+            },
+          };
+          const result: unknown = yield* call(() =>
+            tool ? tool.execute(toolArgs, toolContext) : Promise.resolve({ error: `Unknown tool: ${tc.name}` })
+          );
+          const resultStr = JSON.stringify(result) ?? 'null'; // stringify(undefined) is undefined
+          events.send({ type: 'agent:tool_result', agentId: agent.id, tool: tc.name, result: resultStr });
+          const prefillTokens: number[] = yield* call(() => buildToolResultDelta(ctx, resultStr, callId));
+          settledBuffer.push({ agentId: agent.id, prefillTokens, toolName: tc.name });
+        } catch (err) {
+          // A failed tool ends this agent; announce it like the other terminal paths do.
+          agent.state = 'done';
+          agent.findings = `Tool error: ${err instanceof Error ? err.message : String(err)}`;
+          events.send({ type: 'agent:done', agentId: agent.id });
+        } finally {
+          pendingToolCount--;
+          if (wakeIdle) { wakeIdle(); wakeIdle = null; }
+        }
+      });
+    }
+
+    // ── Three-phase tick loop ────────────────────────────────
+    for (;;) {
+      // -- Phase 1: PRODUCE -- sample from active agents
+      const entries: [Branch, number][] = [];
+      for (const a of agents) {
+        if (a.state !== 'generating') continue;
+
+        const { token, text, isStop } = a.branch.produceSync();
+        if (isStop) {
+          const parsed = ctx.parseChatOutput(a.rawOutput, a.fmt.format, {
+            reasoningFormat: a.fmt.reasoningFormat,
+            thinkingForcedOpen: a.fmt.thinkingForcedOpen,
+            parser: a.fmt.parser,
+          });
+
+          const tc = parsed.toolCalls[0];
+          if (!tc || a.turns >= maxTurns) {
+            a.state = 'done';
+            if (!a.findings && a.toolCallCount > 0 && parsed.content) {
+              a.findings = parsed.content;
+              events.send({ type: 'agent:report', agentId: a.id, findings: a.findings });
+            }
+            events.send({ type: 'agent:done', agentId: a.id });
+            continue;
+          }
+
+          // Report tool special case — reject if no prior research
+          if (tc.name === 'report') {
+            if (a.toolCallCount === 0) {
+              const callId = tc.id || `call_${a.toolCallCount}`;
+              const errorMsg = 'You must search or read the corpus before reporting. Use search, grep, or read_file first.';
+              a.turns++;
+              a.state = 'awaiting_tool';
+              pendingToolCount++;
+              scope.run(function*() {
+                try {
+                  const prefillTokens: number[] = yield* call(() =>
+                    buildToolResultDelta(ctx, JSON.stringify({ error: errorMsg }), callId)
+                  );
+                  settledBuffer.push({ agentId: a.id, prefillTokens, toolName: tc.name });
+                } finally {
+                  pendingToolCount--;
+                  if (wakeIdle) { wakeIdle(); wakeIdle = null; }
+                }
+              });
+              a.rawOutput = '';
+              continue;
+            }
+            try { a.findings = JSON.parse(tc.arguments).findings; } catch { a.findings = tc.arguments; }
+            a.state = 'done';
+            a.toolCallCount++;
+            totalToolCalls++;
+            events.send({ type: 'agent:tool_call', agentId: a.id, tool: 'report', args: tc.arguments });
+            events.send({ type: 'agent:report', agentId: a.id, findings: a.findings! });
+            events.send({ type: 'agent:done', agentId: a.id });
+            continue;
+          }
+
+          // Fire-and-forget — dispatch tool without blocking the decode loop
+          dispatchTool(a, tc);
+          a.rawOutput = '';
+          continue;
+        }
+
+        entries.push([a.branch, token]);
+        a.rawOutput += text;
+        a.tokenCount++;
+        if (trace) {
+          const entropy = a.branch.modelEntropy();
+          const surprisal = a.branch.modelSurprisal(token);
+          a.traceBuffer.push({ text, entropy, surprisal });
+          events.send({
+            type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount,
+            entropy, surprisal,
+          });
+        } else {
+          events.send({ type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount });
+        }
+      }
+
+      // -- Phase 2: COMMIT -- batch-decode produced tokens
+      if (entries.length > 0) {
+        yield* call(() => store.commit(entries));
+        steps++;
+      }
+
+      // -- Phase 3: SETTLE -- drain settled tool buffer, batch prefill
+      const settled = settledBuffer.splice(0);
+      if (settled.length > 0) {
+        const prefillPairs: [Branch, number[]][] = [];
+        const settledAgents: AgentInternal[] = [];
+
+        for (const item of settled) {
+          const a = agentById.get(item.agentId);
+          if (!a || a.state === 'done') continue;
+          prefillPairs.push([a.branch, item.prefillTokens]);
+          settledAgents.push(a);
+        }
+
+        if (prefillPairs.length > 0) {
+          yield* call(() => store.prefill(prefillPairs));
+          counters.warmPrefillCalls++;
+          counters.warmPrefillBranches += prefillPairs.length;
+
+          // Only NOW transition state + reset grammar
+          for (const a of settledAgents) {
+            a.state = 'generating';
+            a.rawOutput = '';
+            applyLazyGrammar(a);
+          }
+        }
+      }
+
+      // -- Termination + idle yield
+      const allDone = agents.every(a => a.state === 'done') && pendingToolCount === 0;
+      if (allDone) break;
+
+      if (entries.length === 0 && pendingToolCount > 0) {
+        counters.stalledTicks++;
+        if (settled.length === 0) {
+          // Nothing produced, nothing settled — yield until a tool resolves
+          yield* action<void>((resolve) => {
+            wakeIdle = resolve;
+            return () => { wakeIdle = null; };
+          });
+          counters.idleTicks++;
+        }
+      }
+    }
+
+    // ── Provide result — suspends, branches stay alive ───────
+    const result: AgentPoolResult = {
+      agents: agents.map(a => ({
+        agentId: a.id,
+        branch: a.branch,
+        findings: a.findings,
+        toolCallCount: a.toolCallCount,
+        tokenCount: a.tokenCount,
+        ppl: a.branch.perplexity,
+        samplingPpl: a.branch.samplingPerplexity,
+        trace: trace ? a.traceBuffer : undefined,
+      })),
+      totalTokens: agents.reduce((s, a) => s + a.tokenCount, 0),
+      totalToolCalls,
+      steps,
+      counters,
+    };
+
+    try {
+      yield* provide(result);
+    } finally {
+      // Structured cleanup: prune all agent branches when scope exits.
+      // Must be in finally — provide() suspends via yield* suspend(),
+      // and halting jumps to finally blocks, skipping non-finally code.
+      for (const a of agents) {
+        if (!a.branch.disposed) {
+          try { yield* call(() => a.branch.prune()); } catch { /* branch may already be pruned */ }
+        }
+      }
+    }
+  });
+}
diff --git a/src/agents/context.ts b/src/agents/context.ts
new file mode 100644
index 0000000..68eb4c9
--- /dev/null
+++ b/src/agents/context.ts
@@ -0,0 +1,37 @@
+import { createContext } from 'effection';
+import type { SessionContext } from '../types';
+import type { BranchStore } from '../BranchStore';
+import type { Signal } from 'effection';
+import type { AgentEvent } from './types';
+
+/**
+ * Effection context holding the active {@link SessionContext}
+ *
+ * Set by {@link initAgents} in the caller's scope. All agent operations
+ * (`generate`, `diverge`, `useAgentPool`, `withSharedRoot`) read from this
+ * context via `yield* Ctx.expect()`.
+ *
+ * @category Agents
+ */
+export const Ctx = createContext<SessionContext>('lloyal.ctx');
+
+/**
+ * Effection context holding the active {@link BranchStore}
+ *
+ * Set by {@link initAgents}. Used by {@link diverge} and {@link useAgentPool}
+ * for batched commit/prefill across multiple branches.
+ *
+ * @category Agents
+ */
+export const Store = createContext<BranchStore>('lloyal.store');
+
+/**
+ * Effection context holding the agent event signal
+ *
+ * Set by {@link initAgents}; {@link useAgentPool} emits {@link AgentEvent}
+ * values through it. Typed to the base event: harnesses that add phase-level
+ * events get the wider signal type from the {@link initAgents} handle.
+ *
+ * @category Agents
+ */
+export const Events = createContext<Signal<AgentEvent, void>>('lloyal.events');
diff --git a/src/agents/deltas.ts b/src/agents/deltas.ts
new file mode 100644
index 0000000..ba9d055
--- /dev/null
+++ b/src/agents/deltas.ts
@@ -0,0 +1,63 @@
+import type { SessionContext } from '../types';
+
+/**
+ * Build a token delta for a user turn
+ *
+ * Composes `getTurnSeparator()` + `formatChat()` + `tokenize()` into a
+ * single token array suitable for `branch.prefill()`. Usable with any
+ * branch — not tied to {@link Session}'s trunk.
+ *
+ * This is the canonical way to build a user-turn delta for warm prefill
+ * in multi-turn conversations.
+ *
+ * @param ctx - Active session context
+ * @param content - User message content
+ * @param opts - Optional tools JSON for tool-aware formatting
+ * @returns Token array ready for `branch.prefill()`
+ *
+ * @category Agents
+ */
+export async function buildUserDelta(
+  ctx: SessionContext,
+  content: string,
+  opts: { tools?: string } = {}
+): Promise<number[]> {
+  const separator = ctx.getTurnSeparator();
+  // Forward tools only when supplied; otherwise format with empty options.
+  const formatOptions = opts.tools ? { tools: opts.tools } : {};
+  // An empty system message precedes the user turn in the formatted chat.
+  const messages = [{ role: 'system', content: '' }, { role: 'user', content }];
+  const formatted = await ctx.formatChat(JSON.stringify(messages), formatOptions);
+  const turnTokens = await ctx.tokenize(formatted.prompt, false);
+  return [...separator, ...turnTokens];
+}
+
+/**
+ * Build a token delta for a tool result turn
+ *
+ * Composes `getTurnSeparator()` + `formatChat()` + `tokenize()` into a
+ * single token array suitable for `branch.prefill()`. Used by
+ * {@link useAgentPool} to inject tool results back into agent context.
+ *
+ * @param ctx - Active session context
+ * @param resultStr - JSON-serialized tool result
+ * @param callId - Tool call identifier from the model's parsed output
+ * @returns Token array ready for `branch.prefill()`
+ *
+ * @category Agents
+ */
+export async function buildToolResultDelta(
+  ctx: SessionContext,
+  resultStr: string,
+  callId: string
+): Promise<number[]> {
+  const separator = ctx.getTurnSeparator();
+  // The chat handed to formatChat() is an empty system message followed by
+  // the tool turn; only the rendered prompt text is tokenized, and the turn
+  // separator tokens are placed in front of it.
+  const systemStub = { role: 'system', content: '' };
+  const toolTurn = { role: 'tool', content: resultStr, tool_call_id: callId };
+  const formatted = await ctx.formatChat(JSON.stringify([systemStub, toolTurn]));
+  const turnTokens = await ctx.tokenize(formatted.prompt, false);
+  return [...separator, ...turnTokens];
+}
diff --git a/src/agents/diverge.ts b/src/agents/diverge.ts
new file mode 100644
index 0000000..c383719
--- /dev/null
+++ b/src/agents/diverge.ts
@@ -0,0 +1,133 @@
+import { call } from 'effection';
+import type { Operation } from 'effection';
+import { Branch } from '../Branch';
+import { Ctx, Store } from './context';
+import type { DivergeOptions, DivergeResult, DivergeAttempt } from './types';
+
+/**
+ * Multi-branch perplexity selection as an Effection operation
+ *
+ * Forks N branches from a parent (or a fresh root), generates to EOG via
+ * batched {@link BranchStore.commit}, then selects the lowest-perplexity
+ * attempt. Loser branches are pruned; the caller receives the best branch
+ * still alive.
+ *
+ * When `opts.parent` is provided, the parent branch is NOT pruned — it's
+ * owned by the calling scope. Only the forked attempt branches (losers)
+ * are pruned. The caller owns the winning branch's lifecycle, typically
+ * via {@link Session.promote}.
+ *
+ * Failure cleanup: if generation throws or the enclosing scope is halted,
+ * every forked branch is pruned, along with a root this call created from
+ * `opts.prompt`. Cleanup lives in `finally` so that halting triggers it too.
+ *
+ * @param opts - Parent or prompt, attempt count (must be > 0), sampling params
+ * @returns Best branch, all attempt outputs, and aggregate statistics
+ * @throws Error when neither `opts.parent` nor `opts.prompt` is given, or
+ *   when `opts.attempts` is not a positive number
+ *
+ * @example Verify with perplexity selection
+ * ```typescript
+ * const verified = yield* diverge({
+ *   prompt: verifyPrompt,
+ *   attempts: 3,
+ *   params: { temperature: 0.7 },
+ * });
+ * // verified.best is the lowest-perplexity branch, still alive
+ * yield* call(() => session.promote(verified.best));
+ * ```
+ *
+ * @category Agents
+ */
+export function* diverge(opts: DivergeOptions): Operation<DivergeResult> {
+  const ctx = yield* Ctx.expect();
+  const store = yield* Store.expect();
+
+  // Validate before any branch exists, so a bad call allocates nothing.
+  if (!opts.parent && !opts.prompt) throw new Error('diverge() requires either opts.parent or opts.prompt');
+  if (!(opts.attempts > 0)) throw new Error('diverge() requires opts.attempts to be a positive number');
+
+  const live: { branch: Branch; output: string; done: boolean; tokenCount: number; ppl: number }[] = [];
+  let ownedRoot: Branch | null = null; // only set when this call creates the root
+  let completed = false;
+  try {
+    // If parent provided, fork from it. Otherwise create a fresh root.
+    let root: Branch;
+    let prefixLength: number;
+    if (opts.parent) {
+      root = opts.parent;
+      prefixLength = root.position;
+    } else {
+      const tokens: number[] = yield* call(() => ctx.tokenize(opts.prompt!));
+      const fresh = Branch.create(ctx, 0, opts.params ?? {});
+      ownedRoot = fresh; // tracked before prefill so a failed prefill still prunes it
+      yield* call(() => fresh.prefill(tokens));
+      root = fresh;
+      prefixLength = tokens.length;
+    }
+
+    for (let i = 0; i < opts.attempts; i++) {
+      const branch: Branch = yield* call(() => root.fork());
+      branch.reseedSampler(2000 + i);
+      live.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity });
+    }
+
+    // Batched generation — produceSync/commit loop
+    let steps = 0;
+    for (;;) {
+      const entries: [Branch, number][] = [];
+      for (const a of live) {
+        if (a.done) continue;
+        const { token, text, isStop } = a.branch.produceSync();
+        if (isStop) {
+          const p = a.branch.perplexity;
+          a.ppl = Number.isFinite(p) ? p : Infinity;
+          a.done = true;
+          continue;
+        }
+        entries.push([a.branch, token]);
+        a.output += text;
+        a.tokenCount++;
+      }
+      if (entries.length === 0) break;
+      yield* call(() => store.commit(entries));
+      steps++;
+    }
+
+    // Select by lowest perplexity (most coherent); ties go to the later attempt.
+    const bestIdx = live.reduce((bi, a, i) => a.ppl <= live[bi].ppl ? i : bi, 0);
+
+    // Prune losers — winner stays alive as caller's result.
+    for (let i = 0; i < live.length; i++) {
+      if (i !== bestIdx && !live[i].branch.disposed) {
+        yield* call(() => live[i].branch.prune());
+      }
+    }
+
+    const totalTokens = live.reduce((s, a) => s + a.tokenCount, 0);
+    const attempts: DivergeAttempt[] = live.map(({ branch, output, tokenCount, ppl }) => ({ branch, output, tokenCount, ppl }));
+
+    completed = true;
+    return {
+      best: live[bestIdx].branch,
+      bestOutput: live[bestIdx].output,
+      attempts,
+      totalTokens,
+      steps,
+      prefixLength,
+    };
+  } finally {
+    // finally (not catch): a halted scope skips catch blocks but still runs finally.
+    if (!completed) {
+      for (const a of live) {
+        if (!a.branch.disposed) {
+          try { yield* call(() => a.branch.prune()); } catch { /* already gone */ }
+        }
+      }
+      if (ownedRoot && !ownedRoot.disposed) {
+        const orphan = ownedRoot;
+        try { yield* call(() => orphan.prune()); } catch { /* children may remain */ }
+      }
+    }
+  }
+}
diff --git a/src/agents/generate.ts b/src/agents/generate.ts
new file mode 100644
index 0000000..4178e0a
--- /dev/null
+++ b/src/agents/generate.ts
@@ -0,0 +1,59 @@
+import { call } from 'effection';
+import type { Operation } from 'effection';
+import { Branch } from '../Branch';
+import { Ctx } from './context';
+import type { GenerateOptions, GenerateResult } from './types';
+
+/**
+ * Single-branch grammar-constrained generation as an Effection operation
+ *
+ * Creates a fresh branch at position 0, prefills the prompt, generates
+ * to EOG, and prunes the branch. Uses {@link Branch}'s async iterator
+ * — single-branch generation doesn't need batched commit.
+ *
+ * The branch is always cleaned up via try/finally, even on error or
+ * scope cancellation.
+ *
+ * @param opts - Generation options (prompt, grammar, params, parse)
+ * @returns Generated text, token count, and optionally parsed result
+ *
+ * @example Grammar-constrained JSON generation
+ * ```typescript
+ * const plan = yield* generate({
+ *   prompt: planPrompt,
+ *   grammar: planGrammar,
+ *   params: { temperature: 0.3 },
+ *   parse: output => JSON.parse(output),
+ * });
+ * console.log(plan.parsed); // typed result from parse()
+ * ```
+ *
+ * @category Agents
+ */
+export function* generate<T = unknown>(opts: GenerateOptions): Operation<GenerateResult<T>> {
+  const ctx = yield* Ctx.expect();
+  const branch = Branch.create(ctx, 0, opts.params ?? {}, undefined, opts.grammar);
+
+  // for-await is not available in a generator body, so the branch's async
+  // iterator is drained in an async helper that call() can await.
+  const drain = async (): Promise<{ output: string; tokenCount: number }> => {
+    let output = '';
+    let tokenCount = 0;
+    for await (const { text } of branch) {
+      output += text;
+      tokenCount += 1;
+    }
+    return { output, tokenCount };
+  };
+
+  try {
+    const promptTokens: number[] = yield* call(() => ctx.tokenize(opts.prompt));
+    yield* call(() => branch.prefill(promptTokens));
+    const { output, tokenCount } = yield* call(drain);
+
+    if (!opts.parse) return { output, tokenCount, parsed: undefined };
+    return { output, tokenCount, parsed: opts.parse(output) as T };
+  } finally {
+    if (!branch.disposed) yield* call(() => branch.prune());
+  }
+}
diff --git a/src/agents/index.ts b/src/agents/index.ts
new file mode 100644
index 0000000..a57d259
--- /dev/null
+++ b/src/agents/index.ts
@@ -0,0 +1,31 @@
+export { Ctx, Store, Events } from './context';
+export { Tool } from './Tool';
+export { buildUserDelta, buildToolResultDelta } from './deltas';
+export { generate } from './generate';
+export { diverge } from './diverge';
+export { useAgentPool } from './agent-pool';
+export { runAgents } from './run-agents';
+export { createToolkit } from './toolkit';
+export { initAgents } from './init';
+export { withSharedRoot } from './shared-root';
+
+export type { Toolkit } from './toolkit';
+export type { AgentHandle } from './init';
+export type { SharedRootOptions } from './shared-root';
+
+export type {
+  TraceToken,
+  JsonSchema,
+  ToolSchema,
+  ToolContext,
+  AgentTaskSpec,
+  AgentPoolOptions,
+  AgentResult,
+  AgentPoolResult,
+  GenerateOptions,
+  GenerateResult,
+  DivergeOptions,
+  DivergeAttempt,
+  DivergeResult,
+  AgentEvent,
+} from './types';
diff --git a/src/agents/init.ts b/src/agents/init.ts
new file mode 100644
index 0000000..9a5b8c6
--- /dev/null
+++ b/src/agents/init.ts
@@ -0,0 +1,78 @@
+import { ensure, createSignal, call } from 'effection';
+import type { Operation, Signal } from 'effection';
+import { BranchStore } from '../BranchStore';
+import { Session } from '../Session';
+import type { SessionContext } from '../types';
+import { Ctx, Store, Events } from './context';
+import type { AgentEvent } from './types';
+
+/**
+ * Handle returned by {@link initAgents} containing all agent resources
+ * @typeParam E - Event type carried by `events`; defaults to {@link AgentEvent}
+ * @category Agents
+ */
+export interface AgentHandle<E extends AgentEvent = AgentEvent> {
+  /** The session context (model, tokenizer, KV cache) */
+  ctx: SessionContext;
+  /** Branch store for batched commit/prefill across branches */
+  store: BranchStore;
+  /** Session managing conversation trunk and branch lifecycle */
+  session: Session;
+  /** Signal carrying events of type `E` (agent and harness events) */
+  events: Signal<E, void>;
+}
+
+/**
+ * Bootstrap the agent infrastructure and register structured cleanup
+ *
+ * Creates {@link BranchStore}, {@link Session}, and an event signal, then
+ * sets all three Effection contexts ({@link Ctx}, {@link Store},
+ * {@link Events}) in the caller's scope. On scope exit (Ctrl-C, error,
+ * normal completion) the `ensure()` hook disposes the session, then the
+ * context; the context is disposed even if session disposal throws.
+ *
+ * `ensure()` is used rather than `resource()` because a resource creates a
+ * child scope where `Ctx.set()` would be invisible to sibling operations.
+ *
+ * The caller creates the {@link SessionContext} (model path, nCtx, KV types
+ * are harness-specific decisions) and passes it in.
+ *
+ * @param ctx - Session context created via `createContext()`
+ * @returns Agent handle with session, store, and event signal
+ *
+ * @example Canonical bootstrap
+ * ```typescript
+ * main(function*() {
+ *   const ctx = yield* call(() => createContext({
+ *     modelPath, nCtx: 16384, nSeqMax: 4, typeK: 'q4_0', typeV: 'q4_0',
+ *   }));
+ *   const { session, events } = yield* initAgents(ctx);
+ *   // Ctx, Store, Events are now set — generate(), diverge(),
+ *   // useAgentPool() will find them automatically. Cleanup runs on scope exit.
+ * });
+ * ```
+ *
+ * @category Agents
+ */
+export function* initAgents<E extends AgentEvent = AgentEvent>(
+  ctx: SessionContext,
+): Operation<AgentHandle<E>> {
+  const store = new BranchStore(ctx);
+  const session = new Session({ ctx, store });
+  const events: Signal<E, void> = createSignal<E, void>();
+
+  yield* Ctx.set(ctx);
+  yield* Store.set(store);
+  // The Events context is typed to the base AgentEvent; callers keep the wider E via the handle.
+  yield* Events.set(events as unknown as Signal<AgentEvent, void>);
+
+  yield* ensure(function*() {
+    // Dispose the context even when session teardown throws, so it is never leaked.
+    try {
+      yield* call(() => session.dispose());
+    } finally {
+      ctx.dispose();
+    }
+  });
+  return { ctx, store, session, events };
+}
diff --git a/src/agents/run-agents.ts b/src/agents/run-agents.ts
new file mode 100644
index 0000000..b2c71dc
--- /dev/null
+++ b/src/agents/run-agents.ts
@@ -0,0 +1,45 @@
+import { scoped } from 'effection';
+import type { Operation } from 'effection';
+import { useAgentPool } from './agent-pool';
+import type { AgentPoolOptions, AgentPoolResult } from './types';
+
+/**
+ * Run an agent pool with automatic branch cleanup on return
+ *
+ * Wraps {@link useAgentPool} in `scoped()` — agent branches are pruned
+ * when the scope exits, before this operation returns. Use this when you
+ * don't need to fork from agent branches after the pool completes.
+ *
+ * For multi-level tree topology (forking from agent branches for
+ * verification or follow-up), use {@link useAgentPool} directly within
+ * your own scope management.
+ *
+ * @param opts - Pool configuration: tasks, tools, sampling params, max turns
+ * @returns Agent pool result (branches already pruned)
+ *
+ * @example Research agents with shared root
+ * ```typescript
+ * const pool = yield* withSharedRoot(
+ *   { systemPrompt: RESEARCH_PROMPT, tools: toolsJson },
+ *   function*(root, prefixLen) {
+ *     return yield* runAgents({
+ *       tasks: questions.map(q => ({
+ *         systemPrompt: RESEARCH_PROMPT,
+ *         content: q,
+ *         tools: toolsJson,
+ *         parent: root,
+ *       })),
+ *       tools: toolMap,
+ *       maxTurns: 6,
+ *     });
+ *   },
+ * );
+ * ```
+ *
+ * @category Agents
+ */
+export function* runAgents(opts: AgentPoolOptions): Operation<AgentPoolResult> {
+  // The pool runs inside its own scope, so that scope has exited by the time we return.
+  const result: AgentPoolResult = yield* scoped(function*() { return yield* useAgentPool(opts); });
+  return result;
+}
diff --git a/src/agents/shared-root.ts b/src/agents/shared-root.ts
new file mode 100644
index 0000000..a1e4987
--- /dev/null
+++ b/src/agents/shared-root.ts
@@ -0,0 +1,82 @@
+import { call } from 'effection';
+import type { Operation } from 'effection';
+import { Branch } from '../Branch';
+import type { SessionContext } from '../types';
+import { Ctx } from './context';
+import type { SamplingParams } from './types';
+
+/**
+ * Configuration for {@link withSharedRoot}
+ *
+ * @category Agents
+ */
+export interface SharedRootOptions {
+  /** System prompt to tokenize and prefill into the shared root */
+  systemPrompt: string;
+  /** JSON-serialized tool schemas for tool-aware prompt formatting */
+  tools?: string;
+  /** Sampling parameters for the root branch; `{ temperature: 0.5 }` when omitted */
+  params?: SamplingParams;
+}
+
+/**
+ * Scoped shared root branch with guaranteed cleanup
+ *
+ * Creates a root branch, prefills the system prompt, and passes it to
+ * the body function. The root is pruned via try/finally when the body
+ * returns or throws, regardless of whether children still exist.
+ *
+ * Use this for the cold-path pattern where multiple agents share a
+ * tokenized system prompt prefix. The `sharedPrefixLength` passed to
+ * the body enables KV savings calculation.
+ *
+ * @param opts - System prompt, tools, and sampling parameters
+ * @param body - Operation that receives the root branch and prefix length.
+ *   Typically calls {@link runAgents} or {@link useAgentPool} inside.
+ * @returns The body's return value
+ *
+ * @example Cold-path research with shared prefix
+ * ```typescript
+ * const { result, prefixLen } = yield* withSharedRoot(
+ *   { systemPrompt: RESEARCH_PROMPT, tools: toolsJson },
+ *   function*(root, prefixLen) {
+ *     const result = yield* runAgents({
+ *       tasks: questions.map(q => ({
+ *         systemPrompt: RESEARCH_PROMPT,
+ *         content: q,
+ *         tools: toolsJson,
+ *         parent: root,
+ *       })),
+ *       tools: toolMap,
+ *     });
+ *     return { result, prefixLen };
+ *   },
+ * );
+ * ```
+ *
+ * @category Agents
+ */
+export function* withSharedRoot<T>(
+  opts: SharedRootOptions,
+  body: (root: Branch, sharedPrefixLength: number) => Operation<T>,
+): Operation<T> {
+  const ctx: SessionContext = yield* Ctx.expect();
+
+  const messages = [{ role: 'system', content: opts.systemPrompt }];
+  const fmtOpts = opts.tools
+    ? { tools: opts.tools, addGenerationPrompt: false }
+    : { addGenerationPrompt: false };
+  const fmt = yield* call(() => ctx.formatChat(JSON.stringify(messages), fmtOpts));
+  const sharedTokens: number[] = yield* call(() => ctx.tokenize(fmt.prompt));
+
+  const root = Branch.create(ctx, 0, opts.params ?? { temperature: 0.5 });
+  try {
+    // Prefill inside the try so a failed or halted prefill still prunes the root.
+    yield* call(() => root.prefill(sharedTokens));
+    return yield* body(root, sharedTokens.length);
+  } finally {
+    if (!root.disposed) {
+      yield* call(() => root.prune());
+    }
+  }
+}
diff --git a/src/agents/toolkit.ts b/src/agents/toolkit.ts
new file mode 100644
index 0000000..f15faf1
--- /dev/null
+++ b/src/agents/toolkit.ts
@@ -0,0 +1,45 @@
+import type { Tool } from './Tool';
+
+/**
+ * Aggregated tool registry for agent pool consumption
+ *
+ * Contains the `toolMap` for dispatch and `toolsJson` for prompt
+ * formatting. Created by {@link createToolkit}.
+ *
+ * @category Agents
+ */
+export interface Toolkit {
+  /** Map from each tool's `name` to its instance, used by {@link useAgentPool} for tool dispatch */
+  toolMap: Map<string, Tool>;
+  /** JSON-serialized array of each tool's `schema`, passed to `formatChat()` via task specs */
+  toolsJson: string;
+}
+
+/**
+ * Aggregate an array of {@link Tool} instances into a toolkit
+ *
+ * Builds both the dispatch map and the JSON schema string from the
+ * tool array. Pass the result directly to {@link AgentPoolOptions}
+ * and {@link AgentTaskSpec}.
+ *
+ * @param tools - Tool instances to aggregate
+ * @returns Toolkit with `toolMap` and `toolsJson`
+ *
+ * @example
+ * ```typescript
+ * const { toolMap, toolsJson } = createToolkit([
+ *   new SearchTool(chunks, reranker),
+ *   new ReadFileTool(resources),
+ *   new GrepTool(resources),
+ * ]);
+ * // report tool schema is auto-injected by useAgentPool()
+ * ```
+ *
+ * @category Agents
+ */
+export function createToolkit(tools: Tool[]): Toolkit {
+  const toolMap = new Map<string, Tool>();
+  for (const tool of tools) toolMap.set(tool.name, tool); // a later duplicate name replaces an earlier one
+  const schemas = tools.map(tool => tool.schema);
+  return { toolMap, toolsJson: JSON.stringify(schemas) };
+}
diff --git a/src/agents/types.ts b/src/agents/types.ts
new file mode 100644
index 0000000..95ea828
--- /dev/null
+++ b/src/agents/types.ts
@@ -0,0 +1,312 @@
+import type { Branch } from '../Branch';
+import type { SessionContext } from '../types';
+
+// ── Tool base class types ──────────────────────────────────────
+
+/**
+ * JSON Schema definition for tool parameter validation
+ *
+ * Describes the shape of arguments a {@link Tool} accepts. Passed to the
+ * model via `formatChat()` so it can generate valid tool-call arguments.
+ *
+ * @category Agents
+ */
+export interface JsonSchema {
+  /** JSON Schema type (e.g. `"object"`, `"string"`, `"array"`) */
+  type: string;
+  /** Property definitions when `type` is `"object"` */
+  properties?: Record<string, unknown>;
+  /** Required property names when `type` is `"object"` */
+  required?: string[];
+  /** Additional schema constraints (minItems, enum, etc.) */
+  [key: string]: unknown;
+}
+
+/**
+ * OpenAI-compatible function tool schema
+ *
+ * The wrapper format expected by `formatChat()` when passing tools to the
+ * model. {@link Tool.schema} generates this automatically from the tool's
+ * `name`, `description`, and `parameters`.
+ *
+ * @category Agents
+ */
+export interface ToolSchema {
+  /** Always `"function"` for function-calling tools */
+  type: 'function';
+  /** Function definition containing name, description, and parameter schema */
+  function: {
+    /** Tool name — used as the function identifier in tool calls */
+    name: string;
+    /** Human-readable description shown to the model */
+    description: string;
+    /** JSON Schema describing the tool's arguments */
+    parameters: JsonSchema;
+  };
+}
+
+/**
+ * Execution context passed to {@link Tool.execute}
+ *
+ * Provides callbacks for reporting progress during long-running tool
+ * operations (e.g. reranker scoring chunks).
+ *
+ * @category Agents
+ */
+export interface ToolContext {
+  /** Progress callback for long-running operations */
+  onProgress?: (p: { filled: number; total: number }) => void;
+}
+
+// ── Trace types ───────────────────────────────────────────────
+
+/**
+ * Per-token trace entry captured when {@link AgentPoolOptions.trace} is true
+ *
+ * Each entry corresponds to one sampled token and the distribution state
+ * at the moment it was drawn. Available on {@link AgentResult.trace} after
+ * pool completion.
+ *
+ * @category Agents
+ */
+export interface TraceToken {
+  /** Decoded text for this token */
+  text: string;
+  /** Shannon entropy of the full vocabulary distribution (bits, base-2) */
+  entropy: number;
+  /** Surprisal of the chosen token: -log2(p) */
+  surprisal: number;
+}
+
+// ── Agent pool types ───────────────────────────────────────────
+
+/**
+ * Task specification for a single agent in {@link useAgentPool}
+ *
+ * Each task defines the agent's system prompt, user content, available
+ * tools, and parent branch to fork from. The parent branch determines
+ * the agent's KV prefix — fork from a shared root to amortize system
+ * prompt tokenization across agents.
+ *
+ * @category Agents
+ */
+export interface AgentTaskSpec {
+  /** System prompt defining the agent's role and behavior */
+  systemPrompt: string;
+  /** User message content — the agent's specific sub-question or task */
+  content: string;
+  /** JSON-serialized tool schemas (from {@link createToolkit}) */
+  tools?: string;
+  /** PRNG seed for sampler diversity — pass different seeds per agent */
+  seed?: number;
+  /** Parent branch to fork from (required by {@link useAgentPool}) */
+  parent?: Branch;
+}
+
+/**
+ * Sampling parameters for generation
+ *
+ * Controls the sampler chain applied during token generation. Passed to
+ * {@link Branch.create}, {@link generate}, {@link diverge}, and agent
+ * pool tasks.
+ *
+ * @category Agents
+ */
+export interface SamplingParams {
+  /** Temperature for softmax scaling (0 = greedy, higher = more random) */
+  temperature?: number;
+  /** Nucleus sampling threshold — cumulative probability cutoff */
+  topP?: number;
+  /** Top-K sampling — keep only the K most likely tokens */
+  topK?: number;
+  /** Minimum probability threshold relative to the most likely token */
+  minP?: number;
+  /** Additional sampler-specific parameters */
+  [key: string]: unknown;
+}
+
+/**
+ * Configuration for {@link useAgentPool} and {@link runAgents}
+ *
+ * @category Agents
+ */
+export interface AgentPoolOptions {
+  /** Agent task specifications — one per concurrent agent */
+  tasks: AgentTaskSpec[];
+  /** Tool registry mapping tool names to {@link Tool} instances */
+  tools: Map<string, import('./Tool').Tool>;
+  /** Sampling parameters applied to all agents */
+  params?: SamplingParams;
+  /** Maximum tool-call turns per agent before forced termination */
+  maxTurns?: number;
+  /** Enable per-token entropy/surprisal on `agent:produce` events */
+  trace?: boolean;
+}
+
+/**
+ * Result for a single completed agent
+ *
+ * @category Agents
+ */
+export interface AgentResult {
+  /** Stable agent identifier (branch handle at creation time) */
+  agentId: number;
+  /** The agent's branch — still alive when returned from {@link useAgentPool} */
+  branch: Branch;
+  /** Agent's research findings (from `report` tool or final output), or null */
+  findings: string | null;
+  /** Number of tool calls the agent made */
+  toolCallCount: number;
+  /** Total tokens generated by this agent */
+  tokenCount: number;
+  /** Model-level perplexity at completion (exp of mean NLL from raw logits) */
+  ppl: number;
+  /** Sampling-level perplexity at completion (from filtered distribution) */
+  samplingPpl: number;
+  /** Per-token trace data (present only when {@link AgentPoolOptions.trace} is true) */
+  trace?: TraceToken[];
+}
+
+/**
+ * Aggregate result from a completed agent pool run
+ *
+ * Returned by both {@link useAgentPool} and {@link runAgents}. Contains
+ * per-agent results plus aggregate statistics for display and telemetry.
+ *
+ * @category Agents
+ */
+export interface AgentPoolResult {
+  /** Per-agent results in task order */
+  agents: AgentResult[];
+  /** Sum of all agent token counts */
+  totalTokens: number;
+  /** Sum of all agent tool calls */
+  totalToolCalls: number;
+  /** Number of batched commit steps in the tick loop */
+  steps: number;
+  /** Internal performance counters for telemetry */
+  counters: {
+    /** Number of batch prefill calls for tool result injection */
+    warmPrefillCalls: number;
+    /** Total branches across all warm prefill batches */
+    warmPrefillBranches: number;
+    /** Ticks where no agent was generating (all awaiting tools) */
+    stalledTicks: number;
+    /** Peak concurrent tool executions */
+    maxConcurrentTools: number;
+    /** Ticks spent idle-waiting via action() */
+    idleTicks: number;
+  };
+}
+
+// ── Generate types ─────────────────────────────────────────────
+
+/**
+ * Options for single-branch {@link generate}
+ *
+ * @category Agents
+ */
+export interface GenerateOptions {
+  /** Pre-formatted prompt string (e.g. the `prompt` produced by `formatChat()`) */
+  prompt: string;
+  /** GBNF grammar string for constrained generation */
+  grammar?: string;
+  /** Sampling parameters */
+  params?: SamplingParams;
+  /** Optional parser applied to the raw output string */
+  parse?: (output: string) => unknown;
+}
+
+/**
+ * Result from single-branch {@link generate}
+ *
+ * @category Agents
+ */
+export interface GenerateResult<T = unknown> {
+  /** Raw generated text */
+  output: string;
+  /** Number of tokens generated */
+  tokenCount: number;
+  /** Parsed output (present only when `parse` was provided in options) */
+  parsed?: T;
+}
+
+// ── Diverge types ──────────────────────────────────────────────
+
+/**
+ * Options for multi-branch {@link diverge}
+ *
+ * Either `parent` or `prompt` must be provided. When `parent` is given,
+ * branches fork from it and no new root is created. When only `prompt`
+ * is given, a fresh root is created, prefilled, and cleaned up on error.
+ *
+ * @category Agents
+ */
+export interface DivergeOptions {
+  /** Pre-formatted prompt for creating a fresh root (mutually exclusive with parent) */
+  prompt?: string;
+  /** Number of parallel generation attempts */
+  attempts: number;
+  /** Parent branch to fork from (mutually exclusive with prompt) */
+  parent?: Branch;
+  /** Sampling parameters for all attempts */
+  params?: SamplingParams;
+}
+
+/**
+ * Single attempt result from {@link diverge}
+ *
+ * @category Agents
+ */
+export interface DivergeAttempt {
+  /** The attempt's branch (only the best branch survives after diverge) */
+  branch: Branch;
+  /** Generated text for this attempt */
+  output: string;
+  /** Number of tokens generated */
+  tokenCount: number;
+  /** Model perplexity — lower indicates more coherent generation */
+  ppl: number;
+}
+
+/**
+ * Aggregate result from {@link diverge}
+ *
+ * The `best` branch is still alive; all other attempt branches have been
+ * pruned. The caller owns cleanup — typically via {@link Session.promote}
+ * to make the best branch the new conversation trunk.
+ *
+ * @category Agents
+ */
+export interface DivergeResult {
+  /** Lowest-perplexity branch — still alive, caller owns cleanup */
+  best: Branch;
+  /** Text output from the best attempt */
+  bestOutput: string;
+  /** All attempts (losers already pruned, branches disposed) */
+  attempts: DivergeAttempt[];
+  /** Sum of all attempt token counts */
+  totalTokens: number;
+  /** Number of batched commit steps */
+  steps: number;
+  /** Shared prefix length in tokens (for KV savings calculation) */
+  prefixLength: number;
+}
+
+// ── Runtime events ─────────────────────────────────────────────
+
+/**
+ * Events emitted by the runtime during agent pool execution
+ *
+ * Subscribe to these via the `events` signal from {@link initAgents}.
+ * Harnesses can extend this union with phase-level events for display.
+ *
+ * @category Agents
+ */
+export type AgentEvent =
+  | { type: 'agent:produce'; agentId: number; text: string; tokenCount: number; entropy?: number; surprisal?: number }
+  | { type: 'agent:tool_call'; agentId: number; tool: string; args: string }
+  | { type: 'agent:tool_result'; agentId: number; tool: string; result: string }
+  | { type: 'agent:tool_progress'; agentId: number; tool: string; filled: number; total: number }
+  | { type: 'agent:report'; agentId: number; findings: string }
+  | { type: 'agent:done'; agentId: number };
diff --git a/src/index.ts b/src/index.ts
index cb7642c..b0787c5 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -39,8 +39,8 @@ import type {
 
 import { Branch } from './Branch';
 import { BranchStore } from './BranchStore';
-import { Session, buildUserDelta, buildToolResultDelta } from './Session';
-import { forkAgent, runAgents } from './Agent';
+import { Session } from './Session';
+import { buildUserDelta, buildToolResultDelta } from './agents/deltas';
 import { Rerank } from './Rerank';
 
 /**
@@ -251,7 +251,42 @@ export const createContext = async (
   return binary.createContext(options);
 };
 
-export { Branch, BranchStore, Session, buildUserDelta, buildToolResultDelta, forkAgent, runAgents, Rerank };
+// ── Layer 1: Substrate (unchanged) ──────────────────────────────
+export { Branch, BranchStore, Session, buildUserDelta, buildToolResultDelta, Rerank };
+
+// ── Layer 2: Agents (structured concurrency) ────────────────────
+export {
+  Ctx, Store, Events,
+  Tool,
+  useAgentPool,
+  runAgents,
+  generate,
+  diverge,
+  createToolkit,
+  initAgents,
+  withSharedRoot,
+} from './agents/index';
+
+export type {
+  Toolkit,
+  AgentHandle,
+  SharedRootOptions,
+  JsonSchema,
+  ToolSchema,
+  ToolContext,
+  AgentTaskSpec,
+  AgentPoolOptions,
+  AgentResult,
+  AgentPoolResult,
+  GenerateOptions,
+  GenerateResult,
+  DivergeOptions,
+  DivergeAttempt,
+  DivergeResult,
+  AgentEvent,
+} from './agents/index';
+
+// ── Enums + types from types.ts ─────────────────────────────────
 export { PoolingType, ChatFormat, ReasoningFormat, GrammarTriggerType } from './types';
 export type {
   GpuVariant,
@@ -272,10 +307,6 @@ export type {
   SamplingParams,
   SessionContext,
   Produced,
-  AgentTask,
-  AgentState,
-  RunAgentsOptions,
-  RunAgentsResult,
   RerankOptions,
   RerankResult,
   RerankProgress,
diff --git a/src/types.ts b/src/types.ts
index 3eb2211..1fa511d 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1444,113 +1444,8 @@ export interface Produced {
   isStop: boolean;
 }
 
-/**
- * Task description for forkAgent
- *
- * @category Branching
- */
-export interface AgentTask {
-  /** System prompt for the agent */
-  systemPrompt: string;
-  /** User content / question for the agent */
-  content: string;
-  /** JSON-stringified tool definitions (optional) */
-  tools?: string;
-  /** PRNG seed for sampler diversity (optional) */
-  seed?: number;
-}
-
-/**
- * State of a single agent in a runAgents loop
- *
- * Returned by forkAgent(). Also constructible manually for shared-prefix
- * patterns (Phase 2 agentRoot + slice).
- *
- * @category Branching
- */
-export interface AgentState {
-  /** Stable identifier — the branch handle. Survives reordering, useful for
-   *  keying pending-tool maps and correlating with C++ / KV-level diagnostics. */
-  agentId: number;
-  /** The agent's branch */
-  branch: Branch;
-  /** Tokens to prefill before the loop starts */
-  suffixTokens: number[];
-  /** Format metadata for parseChatOutput + grammar constraints */
-  fmt: {
-    format: ChatFormat;
-    reasoningFormat: ReasoningFormat;
-    thinkingForcedOpen: boolean;
-    parser: string;
-    grammar: string;
-    grammarLazy: boolean;
-    grammarTriggers: GrammarTrigger[];
-  };
-  /** Accumulated raw output text */
-  rawOutput: string;
-  /** Whether the agent has finished */
-  done: boolean;
-  /** Number of tokens generated */
-  tokenCount: number;
-  /** Number of tool calls made */
-  toolCallCount: number;
-  /** Number of tool-call turns completed */
-  turns: number;
-  /** Final findings (set by report tool, or extracted from content if agent researched but didn't report) */
-  findings: string | null;
-}
-
-/**
- * Options for runAgents
- *
- * @category Branching
- */
-export interface RunAgentsOptions {
-  /** BranchStore for commit/prefill */
-  store: BranchStore;
-  /** SessionContext for parseChatOutput, formatChat, tokenize */
-  ctx: SessionContext;
-  /** Tool executor — consumer wraps with locks as needed */
-  executeTool: (
-    name: string,
-    args: Record<string, unknown>,
-    context?: { onProgress?: (p: { filled: number; total: number }) => void },
-  ) => Promise<unknown>;
-  /** Maximum tool-call turns per agent (default: 6) */
-  maxTurns?: number;
-  /** Called when an agent produces a token (agentId = branch handle) */
-  onProduce?: (agentId: number, text: string, tokenCount: number) => void;
-  /** Called when an agent dispatches a tool call (agentId = branch handle) */
-  onToolCall?: (agentId: number, toolName: string, args: string) => void;
-  /** Called when a tool result returns (agentId = branch handle) */
-  onToolResult?: (agentId: number, toolName: string, resultStr: string) => void;
-  /** Called during tool execution with intermediate progress */
-  onToolProgress?: (agentId: number, toolName: string, progress: { filled: number; total: number }) => void;
-  /** Called when an agent submits a report (agentId = branch handle) */
-  onReport?: (agentId: number, findings: string) => void;
-}
-
-/**
- * Result from runAgents
- *
- * @category Branching
- */
-export interface RunAgentsResult {
-  /** Total tokens generated across all agents */
-  totalTokens: number;
-  /** Total tool calls across all agents */
-  totalToolCalls: number;
-  /** Number of batched decode steps */
-  steps: number;
-  /** Performance counters */
-  counters: {
-    warmPrefillCalls: number;
-    warmPrefillBranches: number;
-    stalledTicks: number;
-    maxConcurrentTools: number;
-    idleTicks: number;
-  };
-}
+// AgentTask, AgentState, RunAgentsOptions, RunAgentsResult removed —
+// superseded by src/agents/ (useAgentPool, AgentTaskSpec, AgentPoolResult)
 
 /**
  * Options for Rerank context creation

From 22b521605a2431907c78c8606d2b72c9ba2fc369 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Sun, 1 Mar 2026 20:51:39 +1100
Subject: [PATCH 11/17] feat(agents): composable workflow, SC cleanup fix,
 research prompt.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Decompose monolithic handleQuery and tui into composable operations and view handlers connected by a Channel. Operations (plan, research, verify, evaluate, respond, etc.) are plain Operation<T> generators that compose via yield*/spawn — no linear phase/step assumptions.

- Fix structured concurrency bug in useAgentPool where try/finally only wrapped provide(), not the tick loop — if the tick loop threw, agent branches were never pruned. Move try/finally to wrap the entire agent lifecycle from creation through provide.

- Rewrite research prompt to use unconditional sequential process instead of conditional logic ("if zero results, simplify"). The 4B model doesn't follow conditionals but reliably follows checklists. Simple keyword greps first, completeness-check grep last. Improves recall from 2-4/5 to 5/5 on the corpus benchmark.

- Add SC integration tests (test/agents.ts) verifying ensure lifecycle, branch cleanup on normal/error/tool-error paths via initAgents.
---
 examples/deep-research/display.ts         | 331 -----------------
 examples/deep-research/harness.ts         | 426 +++++++++++-----------
 examples/deep-research/main.ts            | 199 ++++++----
 examples/deep-research/reranker.ts        |  24 +-
 examples/deep-research/resources/files.ts |   4 +-
 examples/deep-research/tasks/research.md  |  13 +-
 examples/deep-research/tools/grep.ts      |   6 +-
 examples/deep-research/tools/index.ts     |  14 +-
 examples/deep-research/tools/read-file.ts |   6 +-
 examples/deep-research/tools/search.ts    |   8 +-
 examples/deep-research/tools/types.ts     |   2 +-
 examples/deep-research/tui.ts             | 417 +++++++++++++++++++++
 package.json                              |   1 +
 src/agents/agent-pool.ts                  |  63 ++--
 src/agents/context.ts                     |   9 +-
 src/agents/init.ts                        |  20 +-
 src/agents/types.ts                       |   2 +-
 test/agents.ts                            | 272 ++++++++++++++
 typedoc.json                              |   3 +-
 19 files changed, 1135 insertions(+), 685 deletions(-)
 delete mode 100644 examples/deep-research/display.ts
 create mode 100644 examples/deep-research/tui.ts
 create mode 100644 test/agents.ts

diff --git a/examples/deep-research/display.ts b/examples/deep-research/display.ts
deleted file mode 100644
index a28ed43..0000000
--- a/examples/deep-research/display.ts
+++ /dev/null
@@ -1,331 +0,0 @@
-import * as fs from 'node:fs';
-import { each } from 'effection';
-import type { Operation, Signal } from 'effection';
-import type { HarnessEvent, PhaseStats } from './harness.js';
-import type { AgentPoolResult } from '../../dist/agents/index.js';
-
-// ── Mode + color ─────────────────────────────────────────────────
-
-let _jsonlMode = false;
-
-export function setJsonlMode(on: boolean): void { _jsonlMode = on; }
-
-const isTTY = process.stdout.isTTY;
-
-export const c = isTTY ? {
-  bold: '\x1b[1m', dim: '\x1b[2m', reset: '\x1b[0m',
-  green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
-} : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' };
-
-// ── Primitives ───────────────────────────────────────────────────
-
-let _statusText = '';
-
-export function status(text: string): void {
-  if (_jsonlMode || !isTTY) return;
-  _statusText = text;
-  process.stdout.write('\r\x1b[K' + text);
-}
-
-export function statusClear(): void {
-  if (!_statusText) return;
-  _statusText = '';
-  process.stdout.write('\r\x1b[K');
-}
-
-export const log = (...a: unknown[]): void => {
-  if (_jsonlMode) return;
-  statusClear();
-  console.log(...a);
-};
-
-function emit(event: string, data: Record<string, unknown>): void {
-  if (_jsonlMode) console.log(JSON.stringify({ event, ...data }));
-}
-
-export const sec = (a: number, b: number): string => ((b - a) / 1000).toFixed(1);
-export const pad = (s: unknown, n: number): string => String(s).padStart(n);
-export const fmtSize = (bytes: number): string => bytes > 1e9
-  ? (bytes / 1e9).toFixed(1) + ' GB'
-  : (bytes / 1e6).toFixed(0) + ' MB';
-
-// ── Display subscriber ──────────────────────────────────────────
-// Spawned once in main.ts. Handles both AgentEvent (agent-level,
-// from useAgentPool) and PhaseEvent (harness-level).
-
-export interface DisplayOptions {
-  model: string;
-  reranker: string;
-  agentCount: number;
-  verifyCount: number;
-  chunkCount: number;
-}
-
-export function* displaySubscriber(
-  events: Signal<HarnessEvent, void>,
-  opts: DisplayOptions,
-): Operation<void> {
-  // Agent label tracking — scoped to subscriber lifetime
-  const agentLabel = new Map<number, string>();
-  let nextLabel = 0;
-  const agentText = new Map<number, string>();
-  const agentStatus = new Map<number, { state: string; tokenCount: number; detail: string }>();
-
-  function label(agentId: number): string {
-    let l = agentLabel.get(agentId);
-    if (!l) { l = `A${nextLabel++}`; agentLabel.set(agentId, l); }
-    return l;
-  }
-
-  function resetLabels(): void {
-    nextLabel = 0; agentLabel.clear(); agentStatus.clear(); agentText.clear();
-  }
-
-  function renderStatus(): void {
-    const active = [...agentStatus.entries()].filter(([, s]) => s.state !== 'done');
-    if (active.length === 0) return;
-
-    const generating = active.filter(([, s]) => s.state === 'gen');
-
-    if (generating.length === 1 && active.length === 1) {
-      const [id] = generating[0];
-      const raw = (agentText.get(id) ?? '').replace(/\n/g, ' ').trimStart();
-      const cols = process.stdout.columns || 80;
-      const maxLen = cols - 12;
-      const text = raw.length > maxLen ? raw.slice(raw.length - maxLen) : raw;
-      status(`    ${c.dim}\u25c6${c.reset} ${c.yellow}${label(id)}${c.reset} ${text}`);
-      return;
-    }
-
-    const parts = active.map(([id, s]) => {
-      const lbl = `${c.yellow}${label(id)}${c.reset}`;
-      if (s.state === 'gen') return `${lbl}: ${s.tokenCount} tok`;
-      const detail = s.detail ? ` ${s.detail}` : '';
-      return `${lbl}: ${c.cyan}${s.state}${c.reset}${detail}`;
-    });
-    status(`    ${c.dim}\u25c6${c.reset} ${parts.join('  ')}`);
-  }
-
-  function renderStats(phases: PhaseStats[], kvLine?: string, ctxPct?: number, ctxPos?: number, ctxTotal?: number): void {
-    const totalTokens = phases.reduce((s, p) => s + p.tokens, 0);
-    const totalMs = phases.reduce((s, p) => s + p.timeMs, 0);
-
-    log(`\n  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
-    for (const p of phases) {
-      const left = `${p.label.padEnd(10)} ${pad(p.tokens, 5)} tok`;
-      const detail = p.detail ? `  ${p.detail}` : '';
-      const right = p.timeMs > 0 ? `${pad((p.timeMs / 1000).toFixed(1), 6)}s` : '';
-      log(`  ${c.dim}${left}${detail}${' '.repeat(Math.max(1, 58 - left.length - detail.length - right.length))}${right}${c.reset}`);
-    }
-    log(`  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
-    log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok         ${c.bold}${pad((totalMs / 1000).toFixed(1), 6)}s${c.reset}`);
-    if (kvLine) log(`  ${c.dim}${kvLine}${c.reset}`);
-    if (ctxPct != null && ctxPos != null && ctxTotal != null) {
-      const ctxStr = `ctx: ${ctxPct}% (${ctxPos.toLocaleString()}/${ctxTotal.toLocaleString()})`;
-      log(`  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
-      log(`  ${c.dim}${' '.repeat(58 - ctxStr.length)}${ctxStr}${c.reset}`);
-    }
-    log();
-  }
-
-  // ── Trace persistence ────────────────────────────────────────
-  // Per-token trace data lives on AgentResult.trace (populated by
-  // useAgentPool when trace: true). We just write it to disk here.
-  let traceQuery = '';
-
-  function flushTrace(pool: AgentPoolResult): void {
-    if (!pool.agents.some(a => a.trace?.length)) return;
-    const filename = `trace-${Date.now()}.json`;
-    fs.writeFileSync(filename, JSON.stringify({
-      query: traceQuery,
-      timestamp: new Date().toISOString(),
-      agents: pool.agents.map(a => ({
-        agentId: a.agentId, label: label(a.agentId),
-        ppl: a.ppl, samplingPpl: a.samplingPpl,
-        tokenCount: a.tokenCount, toolCallCount: a.toolCallCount,
-        findings: a.findings, trace: a.trace ?? [],
-      })),
-    }, null, 2));
-    log(`  ${c.dim}Trace written to ${filename}${c.reset}`);
-  }
-
-  for (const ev of yield* each(events)) {
-    switch (ev.type) {
-      // ── Agent-level events (from useAgentPool) ──────────
-      case 'agent:produce': {
-        agentText.set(ev.agentId, (agentText.get(ev.agentId) ?? '') + ev.text);
-        agentStatus.set(ev.agentId, { state: 'gen', tokenCount: ev.tokenCount, detail: '' });
-        renderStatus();
-        break;
-      }
-      case 'agent:tool_call': {
-        agentText.delete(ev.agentId);
-        agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: '' });
-        emit('tool_call', { agentId: ev.agentId, toolName: ev.tool, arguments: ev.args });
-        let toolArgs: Record<string, string>;
-        try { toolArgs = JSON.parse(ev.args); } catch { toolArgs = {}; }
-        const argSummary = ev.tool === 'search'
-          ? `"${toolArgs.query || ''}"`
-          : ev.tool === 'grep'
-          ? `/${toolArgs.pattern || ''}/`
-          : ev.tool === 'report' ? ''
-          : `${toolArgs.filename}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : '');
-        log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(ev.agentId)}${c.reset} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
-        break;
-      }
-      case 'agent:tool_result': {
-        emit('tool_result', {
-          agentId: ev.agentId, toolName: ev.tool,
-          result: ev.result.length > 200 ? ev.result.slice(0, 200) + '...' : ev.result,
-        });
-        let preview = '';
-        if (ev.tool === 'read_file') {
-          try {
-            const firstLine = (JSON.parse(ev.result) as { content: string }).content.split('\n').find((l: string) => l.trim());
-            if (firstLine) preview = ` \u00b7 ${firstLine.trim().slice(0, 60)}${firstLine.trim().length > 60 ? '\u2026' : ''}`;
-          } catch { /* non-fatal */ }
-        } else if (ev.tool === 'search') {
-          try {
-            const top = (JSON.parse(ev.result) as { heading: string }[])[0];
-            if (top?.heading) preview = ` \u00b7 ${top.heading}`;
-          } catch { /* non-fatal */ }
-        } else if (ev.tool === 'grep') {
-          try {
-            const r = JSON.parse(ev.result) as { totalMatches: number; matchingLines: number };
-            preview = ` \u00b7 ${r.totalMatches} matches in ${r.matchingLines} lines`;
-          } catch { /* non-fatal */ }
-        }
-        log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(ev.agentId)}${c.reset} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`);
-        break;
-      }
-      case 'agent:tool_progress': {
-        agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: `${ev.filled}/${ev.total}` });
-        renderStatus();
-        break;
-      }
-      case 'agent:report': {
-        agentStatus.set(ev.agentId, { state: 'done', tokenCount: 0, detail: '' });
-        const cols = process.stdout.columns || 80;
-        const lbl = `${c.yellow}${label(ev.agentId)}${c.reset}`;
-        const prefix = `    ${c.dim}\u2502${c.reset}   `;
-        const wrap = cols - 8;
-
-        log(`    ${c.dim}\u2502${c.reset}`);
-        log(`    ${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`);
-
-        for (const para of ev.findings.split('\n')) {
-          if (!para.trim()) { log(prefix); continue; }
-          const words = para.split(/\s+/);
-          let line = '';
-          for (const word of words) {
-            if (line && line.length + 1 + word.length > wrap) {
-              log(`${prefix}${c.dim}${line}${c.reset}`);
-              line = word;
-            } else {
-              line = line ? `${line} ${word}` : word;
-            }
-          }
-          if (line) log(`${prefix}${c.dim}${line}${c.reset}`);
-        }
-        log(`    ${c.dim}\u2502${c.reset}`);
-        break;
-      }
-      case 'agent:done': break;
-
-      // ── Phase events (from harness) ─────────────────────
-      case 'query': {
-        traceQuery = ev.query;
-        if (!ev.warm) {
-          emit('start', {
-            model: opts.model, reranker: opts.reranker, query: ev.query,
-            agentCount: opts.agentCount, verifyCount: opts.verifyCount, chunks: opts.chunkCount,
-          });
-          log();
-          log(`  ${c.dim}Query${c.reset}`);
-          log(`  ${c.bold}${ev.query}${c.reset}`);
-        }
-        break;
-      }
-      case 'plan': {
-        emit('plan', { questions: ev.questions, planTokens: ev.tokenCount });
-        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
-        ev.questions.forEach((q: string, i: number) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
-        break;
-      }
-      case 'research:start': {
-        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Research${c.reset} ${c.dim}${ev.agentCount} agents${c.reset}`);
-        resetLabels();
-        break;
-      }
-      case 'research:done': {
-        statusClear();
-        ev.pool.agents.forEach((a, i) => {
-          const tree = i === ev.pool.agents.length - 1 ? '\u2514' : '\u251c';
-          emit('agent_done', {
-            index: i, findings: (a.findings || '').slice(0, 500),
-            toolCalls: a.toolCallCount, tokenCount: a.tokenCount,
-            ppl: a.ppl, samplingPpl: a.samplingPpl,
-          });
-          const raw = (agentText.get(a.agentId) ?? '').replace(/\n/g, ' ').trim();
-          if (raw) log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(a.agentId)}${c.reset} ${c.dim}\u25b8 ${raw.slice(0, 120)}${raw.length > 120 ? '\u2026' : ''}${c.reset}`);
-          const pplStr = Number.isFinite(a.ppl) ? ` \u00b7 ppl ${a.ppl.toFixed(2)}` : '';
-          log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${label(a.agentId)}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok \u00b7 ${a.toolCallCount} tools${pplStr}${c.reset}`);
-        });
-        log(`    ${c.dim}${ev.pool.totalTokens} tok \u00b7 ${ev.pool.totalToolCalls} tools \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
-        flushTrace(ev.pool);
-        break;
-      }
-      case 'verify:start': {
-        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${ev.count} attempts${c.reset}`);
-        break;
-      }
-      case 'verify:done': {
-        ev.result.attempts.forEach((a, i) => {
-          const tree = i === ev.result.attempts.length - 1 ? '\u2514' : '\u251c';
-          emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl });
-          log(`    ${c.dim}${tree} ${a.tokenCount} tok \u00b7 ppl ${a.ppl.toFixed(2)}${c.reset}`);
-        });
-        log(`    ${c.dim}${ev.result.totalTokens} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
-        break;
-      }
-      case 'eval:done': {
-        emit('convergence', { converged: ev.converged, evalTokens: ev.tokenCount });
-        const verdict = ev.converged === true ? `${c.green}yes${c.reset}`
-          : ev.converged === false ? `${c.red}no${c.reset}`
-          : `${c.yellow}unknown${c.reset}`;
-        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
-        log(`    Converged: ${verdict}`);
-        break;
-      }
-      case 'answer': {
-        log(`\n  ${c.dim}${'\u2500'.repeat(58)}${c.reset}\n`);
-        const prose = ev.text.trim()
-          .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`)
-          .split('\n').map((l: string) => `  ${l}`).join('\n');
-        log(prose);
-        break;
-      }
-      case 'response:start': {
-        process.stdout.write(`  ${c.dim}<${c.reset} `);
-        break;
-      }
-      case 'response:text': {
-        process.stdout.write(ev.text);
-        break;
-      }
-      case 'response:done': {
-        console.log('\n');
-        break;
-      }
-      case 'stats': {
-        renderStats(ev.phases, ev.kvLine, ev.ctxPct, ev.ctxPos, ev.ctxTotal);
-        break;
-      }
-      case 'complete': {
-        emit('complete', ev.data);
-        break;
-      }
-    }
-    yield* each.next();
-  }
-}
diff --git a/examples/deep-research/harness.ts b/examples/deep-research/harness.ts
index bab8016..7bc0e4b 100644
--- a/examples/deep-research/harness.ts
+++ b/examples/deep-research/harness.ts
@@ -1,14 +1,15 @@
 import * as fs from 'node:fs';
 import * as path from 'node:path';
 import { call } from 'effection';
-import type { Operation, Signal } from 'effection';
-import { Branch, Session } from '../../dist/index.js';
-import type { SessionContext } from '../../dist/index.js';
+import type { Operation, Channel } from 'effection';
+import { Branch, Session } from '../../dist';
+import type { SessionContext } from '../../dist';
 import {
   Ctx,
   generate, runAgents, diverge, withSharedRoot,
-} from '../../dist/agents/index.js';
-import type { Tool, AgentPoolResult, DivergeResult, AgentEvent } from '../../dist/agents/index.js';
+} from '../../dist/agents';
+import type { Tool, AgentPoolResult, DivergeResult } from '../../dist/agents';
+import type { WorkflowEvent, OpTiming } from './tui';
 
 /** Load a task prompt file. Convention: system prompt above `---`, user content below. */
 function loadTask(name: string): { system: string; user: string } {
@@ -23,37 +24,9 @@ const RESEARCH = loadTask('research');
 const VERIFY = loadTask('verify');
 const EVAL = loadTask('eval');
 
-// ── Harness events ───────────────────────────────────────────────
-// Phase-level events sent by the harness. Display subscribes to these
-// alongside AgentEvent (agent-level events from useAgentPool).
-
-export interface PhaseStats {
-  label: string;
-  tokens: number;
-  detail: string;
-  timeMs: number;
-}
-
-export type PhaseEvent =
-  | { type: 'query'; query: string; warm: boolean }
-  | { type: 'plan'; questions: string[]; tokenCount: number; timeMs: number }
-  | { type: 'research:start'; agentCount: number }
-  | { type: 'research:done'; pool: AgentPoolResult; sharedPrefixLength: number; timeMs: number }
-  | { type: 'verify:start'; count: number }
-  | { type: 'verify:done'; result: DivergeResult; timeMs: number }
-  | { type: 'eval:done'; converged: boolean | null; tokenCount: number; timeMs: number }
-  | { type: 'answer'; text: string }
-  | { type: 'response:start' }
-  | { type: 'response:text'; text: string }
-  | { type: 'response:done'; tokenCount: number; timeMs: number }
-  | { type: 'stats'; phases: PhaseStats[]; kvLine?: string; ctxPct: number; ctxPos: number; ctxTotal: number }
-  | { type: 'complete'; data: Record<string, unknown> };
-
-export type HarnessEvent = AgentEvent | PhaseEvent;
-
 // ── Options ──────────────────────────────────────────────────────
 
-export interface HarnessOptions {
+export interface WorkflowOpts {
   session: Session;
   toolMap: Map<string, Tool>;
   toolsJson: string;
@@ -62,17 +35,26 @@ export interface HarnessOptions {
   maxTurns: number;
   nCtx: number;
   trace: boolean;
-  events: Signal<HarnessEvent, void>;
+  events: Channel<WorkflowEvent, void>;
 }
 
-// ── Plan ─────────────────────────────────────────────────────────
+// ── Agent task builder ───────────────────────────────────────────
 
-function* planPhase(
-  query: string,
-  agentCount: number,
-  parent?: Branch,
-): Operation<{ questions: string[]; tokenCount: number }> {
+function agentTasks(questions: string[], toolsJson: string, parent: Branch, seed?: number) {
+  return questions.map((q, i) => ({
+    systemPrompt: RESEARCH.system,
+    content: q,
+    tools: toolsJson,
+    parent,
+    seed: seed != null ? seed + i : undefined,
+  }));
+}
+
+// ── Operations ───────────────────────────────────────────────────
+
+function* plan(query: string, opts: WorkflowOpts): Operation<{ questions: string[]; tokenCount: number; timeMs: number }> {
   const ctx: SessionContext = yield* Ctx.expect();
+  const t = performance.now();
 
   const schema = {
     type: 'object',
@@ -81,7 +63,7 @@ function* planPhase(
         type: 'array',
         items: { type: 'string' },
         minItems: 2,
-        maxItems: agentCount,
+        maxItems: opts.agentCount,
       },
     },
     required: ['questions'],
@@ -89,7 +71,7 @@ function* planPhase(
   const grammar: string = yield* call(() => ctx.jsonSchemaToGrammar(JSON.stringify(schema)));
 
   const userContent = PLAN.user
-    .replace('{{count}}', String(agentCount))
+    .replace('{{count}}', String(opts.agentCount))
     .replace('{{query}}', query);
 
   const messages = [
@@ -101,8 +83,8 @@ function* planPhase(
   let output: string;
   let tokenCount: number;
 
+  const parent = opts.session.trunk ?? undefined;
   if (parent) {
-    // Warm: fork from trunk — planner inherits conversation KV
     const lead: Branch = yield* call(() => parent.fork());
     try {
       lead.setGrammar(grammar);
@@ -117,10 +99,9 @@ function* planPhase(
         return { output: o, tokenCount: tc };
       }));
     } finally {
-      if (!lead.disposed) yield* call(() => lead.prune());
+      yield* call(() => lead.prune());
     }
   } else {
-    // Cold: fresh branch via generate()
     const result = yield* generate({ prompt, grammar, params: { temperature: 0.3 } });
     output = result.output;
     tokenCount = result.tokenCount;
@@ -128,27 +109,71 @@ function* planPhase(
 
   let questions: string[];
   try {
-    questions = JSON.parse(output).questions.slice(0, agentCount);
+    questions = JSON.parse(output).questions.slice(0, opts.agentCount);
     if (!questions.length) throw new Error('empty');
   } catch {
-    questions = Array.from({ length: agentCount }, (_, i) => `${query} (aspect ${i + 1})`);
+    questions = Array.from({ length: opts.agentCount }, (_, i) => `${query} (aspect ${i + 1})`);
   }
 
-  return { questions, tokenCount };
+  const timeMs = performance.now() - t;
+  yield* opts.events.send({ type: 'plan', questions, tokenCount, timeMs });
+  return { questions, tokenCount, timeMs };
+}
+
+function* research(
+  questions: string[],
+  opts: WorkflowOpts,
+): Operation<{ pool: AgentPoolResult; sharedPrefixLength: number; timeMs: number }> {
+  yield* opts.events.send({ type: 'research:start', agentCount: questions.length });
+  const t = performance.now();
+
+  const { result: pool, prefixLen: sharedPrefixLength } = yield* withSharedRoot(
+    { systemPrompt: RESEARCH.system, tools: opts.toolsJson },
+    function*(root, prefixLen) {
+      const result = yield* runAgents({
+        tasks: agentTasks(questions, opts.toolsJson, root),
+        tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
+      });
+      return { result, prefixLen };
+    },
+  );
+
+  const timeMs = performance.now() - t;
+  yield* opts.events.send({ type: 'research:done', pool, timeMs });
+  return { pool, sharedPrefixLength, timeMs };
 }
 
-// ── Verify ───────────────────────────────────────────────────────
+function* warmResearch(
+  questions: string[],
+  opts: WorkflowOpts,
+): Operation<{ pool: AgentPoolResult; timeMs: number }> {
+  yield* opts.events.send({ type: 'research:start', agentCount: questions.length });
+  const t = performance.now();
+
+  const pool = yield* runAgents({
+    tasks: agentTasks(questions, opts.toolsJson, opts.session.trunk!, Date.now()),
+    tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
+  });
+
+  const timeMs = performance.now() - t;
+  yield* opts.events.send({ type: 'research:done', pool, timeMs });
+  return { pool, timeMs };
+}
 
-function* verifyPhase(opts: {
-  findings: string;
-  query: string;
-  count: number;
-}): Operation<DivergeResult> {
+function* verify(
+  pool: AgentPoolResult,
+  questions: string[],
+  query: string,
+  opts: WorkflowOpts,
+): Operation<{ result: DivergeResult; timeMs: number }> {
   const ctx: SessionContext = yield* Ctx.expect();
+  const findingsText = pool.agents
+    .map((a, i) => `Q: ${questions[i]}\nA: ${(a.findings || '').trim()}`)
+    .join('\n\n');
 
   const userContent = VERIFY.user
-    .replace('{{findings}}', opts.findings)
-    .replace('{{query}}', opts.query);
+    .replace('{{findings}}', () => findingsText) // replacer fn: '$&', '$$', "$'" in agent output stay literal
+    .replace('{{query}}', () => query); // same for the user-typed query
 
   const messages = [
     { role: 'system', content: VERIFY.system },
@@ -156,21 +181,25 @@ function* verifyPhase(opts: {
   ];
   const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages)));
 
-  return yield* diverge({
+  yield* opts.events.send({ type: 'verify:start', count: opts.verifyCount });
+  const t = performance.now();
+  const result = yield* diverge({
     prompt,
-    attempts: opts.count,
+    attempts: opts.verifyCount,
     params: { temperature: 0.7 },
   });
+  const timeMs = performance.now() - t;
+  yield* opts.events.send({ type: 'verify:done', result, timeMs });
+  return { result, timeMs };
 }
 
-// ── Eval ─────────────────────────────────────────────────────────
-
-function* evalPhase(
-  attempts: { output: string }[],
-): Operation<{ converged: boolean | null; tokenCount: number }> {
+function* evaluate(
+  verifyResult: DivergeResult,
+  opts: WorkflowOpts,
+): Operation<{ converged: boolean | null; tokenCount: number; timeMs: number }> {
   const ctx: SessionContext = yield* Ctx.expect();
 
-  const responsesText = attempts
+  const responsesText = verifyResult.attempts
     .map((a, i) => `Response ${i + 1}: ${a.output.trim()}`)
     .join('\n\n');
 
@@ -189,6 +218,7 @@ function* evalPhase(
   const grammar: string = yield* call(() => ctx.jsonSchemaToGrammar(JSON.stringify(evalSchema)));
   const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages)));
 
+  const t = performance.now();
   const result = yield* generate({
     prompt,
     grammar,
@@ -198,172 +228,138 @@ function* evalPhase(
       catch { return null; }
     },
   });
+  const timeMs = performance.now() - t;
+  yield* opts.events.send({ type: 'eval:done', converged: result.parsed as boolean | null, tokenCount: result.tokenCount, timeMs });
+  return { converged: result.parsed as boolean | null, tokenCount: result.tokenCount, timeMs };
+}
 
-  return { converged: result.parsed as boolean | null, tokenCount: result.tokenCount };
+function* answer(verifyResult: DivergeResult, opts: WorkflowOpts): Operation<void> {
+  yield* opts.events.send({ type: 'answer', text: verifyResult.bestOutput });
 }
 
-// ── handleQuery — the orchestrator ───────────────────────────────
-// Composes phases, sends HarnessEvent for display, touches no log().
+function* promote(verifyResult: DivergeResult, opts: WorkflowOpts): Operation<void> {
+  yield* call(() => opts.session.promote(verifyResult.best));
+}
 
-export function* handleQuery(query: string, opts: HarnessOptions): Operation<void> {
-  const { session, toolMap, toolsJson, agentCount, verifyCount, maxTurns, nCtx, trace, events } = opts;
-  const warm = !!session.trunk;
-  const t0 = performance.now();
+function* respond(
+  pool: AgentPoolResult,
+  query: string,
+  opts: WorkflowOpts,
+): Operation<{ tokenCount: number; timeMs: number }> {
+  const agentFindings = pool.agents
+    .map((a: { findings: string | null }, i: number) =>
+      a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
+    .filter(Boolean)
+    .join('\n\n');
 
-  events.send({ type: 'query', query, warm });
+  yield* call(() => opts.session.prefillUser(agentFindings
+    ? `Research findings:\n${agentFindings}\n\nUser question: ${query}\n\nAnswer based on the research findings above.`
+    : query));
+
+  yield* opts.events.send({ type: 'response:start' });
+  const t = performance.now();
+  let tokenCount = 0;
+  const trunk = opts.session.trunk!;
+  for (;;) {
+    const { token, text, isStop } = trunk.produceSync();
+    if (isStop) break;
+    yield* call(() => trunk.commit(token));
+    tokenCount++;
+    yield* opts.events.send({ type: 'response:text', text });
+  }
+  const timeMs = performance.now() - t;
+  yield* opts.events.send({ type: 'response:done' });
+  return { tokenCount, timeMs };
+}
 
-  // ── Plan
-  let t = performance.now();
-  const { questions, tokenCount: planTokens } = yield* planPhase(
-    query, agentCount, warm ? session.trunk! : undefined,
-  );
-  const planMs = performance.now() - t;
-  events.send({ type: 'plan', questions, tokenCount: planTokens, timeMs: planMs });
+function* summarize(
+  timings: OpTiming[],
+  opts: WorkflowOpts,
+  extra?: { kvLine?: string },
+): Operation<void> {
+  yield* opts.events.send({
+    type: 'stats', timings,
+    kvLine: extra?.kvLine,
+    ctxPct: Math.round(100 * (opts.session.trunk?.position ?? 0) / opts.nCtx),
+    ctxPos: opts.session.trunk?.position ?? 0,
+    ctxTotal: opts.nCtx,
+  });
+}
 
-  // ── Research
-  events.send({ type: 'research:start', agentCount: questions.length });
-  t = performance.now();
+// ── Workflow compositions ────────────────────────────────────────
 
-  let pool: AgentPoolResult;
-  let sharedPrefixLength: number;
+function* coldQuery(query: string, opts: WorkflowOpts): Operation<void> {
+  const t0 = performance.now();
 
-  const agentTasks = (parent: Branch, seed?: number) => questions.map((q, i) => ({
-    systemPrompt: RESEARCH.system,
-    content: q,
-    tools: toolsJson,
-    parent,
-    seed: seed != null ? seed + i : undefined,
-  }));
+  const p = yield* plan(query, opts);
+  const r = yield* research(p.questions, opts);
+  const v = yield* verify(r.pool, p.questions, query, opts);
+  const e = yield* evaluate(v.result, opts);
+  yield* answer(v.result, opts);
+  yield* promote(v.result, opts);
 
-  if (!warm) {
-    // Cold: withSharedRoot handles root create → prefill → cleanup
-    const { result, prefixLen } = yield* withSharedRoot(
-      { systemPrompt: RESEARCH.system, tools: toolsJson },
-      function*(root, prefixLen) {
-        const result = yield* runAgents({ tasks: agentTasks(root), tools: toolMap, maxTurns, trace });
-        return { result, prefixLen };
-      },
-    );
-    pool = result;
-    sharedPrefixLength = prefixLen;
-  } else {
-    // Warm: fork from conversation trunk
-    pool = yield* runAgents({
-      tasks: agentTasks(session.trunk!, Date.now()),
-      tools: toolMap,
-      maxTurns, trace,
-    });
-    sharedPrefixLength = 0;
-  }
+  const timings: OpTiming[] = [
+    { label: 'Plan', tokens: p.tokenCount, detail: '', timeMs: p.timeMs },
+    {
+      label: 'Research', tokens: r.pool.totalTokens,
+      detail: `(${r.pool.agents.map(a => a.tokenCount).join(' + ')})  ${r.pool.totalToolCalls} tools`,
+      timeMs: r.timeMs,
+    },
+    {
+      label: 'Verify', tokens: v.result.totalTokens,
+      detail: `(${v.result.attempts.map(a => a.tokenCount).join(' + ')})`,
+      timeMs: v.timeMs,
+    },
+    { label: 'Eval', tokens: e.tokenCount, detail: `converged: ${e.converged == null ? 'unknown' : e.converged ? 'yes' : 'no'}`, timeMs: e.timeMs }, // null = verdict could not be parsed
+  ];
+
+  const kvSaved = r.sharedPrefixLength * (p.questions.length - 1)
+    + v.result.prefixLength * (v.result.attempts.length - 1);
+  const kvLine = `KV shared    ${r.sharedPrefixLength} \u00d7 ${p.questions.length - 1} + ${v.result.prefixLength} \u00d7 ${v.result.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved`;
+
+  yield* summarize(timings, opts, { kvLine });
+
+  yield* opts.events.send({
+    type: 'complete',
+    data: {
+      planTokens: p.tokenCount,
+      agentTokens: r.pool.totalTokens, researchSteps: r.pool.steps,
+      agentPpl: r.pool.agents.map(a => a.ppl),
+      verifyTokens: v.result.totalTokens, verifySteps: v.result.steps,
+      evalTokens: e.tokenCount, converged: e.converged,
+      totalToolCalls: r.pool.totalToolCalls,
+      prefixTokens: v.result.prefixLength,
+      sharedPrefixTokens: r.sharedPrefixLength,
+      agentCount: p.questions.length, attemptCount: v.result.attempts.length,
+      wallTimeMs: Math.round(performance.now() - t0),
+      planMs: Math.round(p.timeMs), researchMs: Math.round(r.timeMs),
+      verifyMs: Math.round(v.timeMs), evalMs: Math.round(e.timeMs),
+      ...r.pool.counters,
+    },
+  });
+}
 
-  const researchMs = performance.now() - t;
-  events.send({ type: 'research:done', pool, sharedPrefixLength, timeMs: researchMs });
+function* warmQuery(query: string, opts: WorkflowOpts): Operation<void> {
+  const p = yield* plan(query, opts);
+  const r = yield* warmResearch(p.questions, opts);
+  const resp = yield* respond(r.pool, query, opts);
 
-  // ── Post-research diverges based on cold/warm
-  const phases: PhaseStats[] = [
-    { label: 'Plan', tokens: planTokens, detail: '', timeMs: planMs },
+  const timings: OpTiming[] = [
+    { label: 'Plan', tokens: p.tokenCount, detail: '', timeMs: p.timeMs },
     {
-      label: 'Research', tokens: pool.totalTokens,
-      detail: `(${pool.agents.map(a => a.tokenCount).join(' + ')})  ${pool.totalToolCalls} tools`,
-      timeMs: researchMs,
+      label: 'Research', tokens: r.pool.totalTokens,
+      detail: `(${r.pool.agents.map(a => a.tokenCount).join(' + ')})  ${r.pool.totalToolCalls} tools`,
+      timeMs: r.timeMs,
     },
+    { label: 'Response', tokens: resp.tokenCount, detail: '', timeMs: resp.timeMs },
   ];
 
-  if (!warm) {
-    // ── Verify
-    const findingsText = pool.agents
-      .map((a, i) => `Q: ${questions[i]}\nA: ${(a.findings || '').trim()}`)
-      .join('\n\n');
-
-    events.send({ type: 'verify:start', count: verifyCount });
-    t = performance.now();
-    const verifyResult = yield* verifyPhase({ findings: findingsText, query, count: verifyCount });
-    const verifyMs = performance.now() - t;
-    events.send({ type: 'verify:done', result: verifyResult, timeMs: verifyMs });
-
-    // ── Eval
-    t = performance.now();
-    const { converged, tokenCount: evalTokens } = yield* evalPhase(verifyResult.attempts);
-    const evalMs = performance.now() - t;
-    events.send({ type: 'eval:done', converged, tokenCount: evalTokens, timeMs: evalMs });
-
-    // ── Answer
-    events.send({ type: 'answer', text: verifyResult.bestOutput });
-
-    phases.push(
-      {
-        label: 'Verify', tokens: verifyResult.totalTokens,
-        detail: `(${verifyResult.attempts.map(a => a.tokenCount).join(' + ')})`,
-        timeMs: verifyMs,
-      },
-      { label: 'Eval', tokens: evalTokens, detail: `converged: ${converged ? 'yes' : 'no'}`, timeMs: evalMs },
-    );
-
-    yield* call(() => session.promote(verifyResult.best));
-
-    const kvSaved = sharedPrefixLength * (questions.length - 1)
-      + verifyResult.prefixLength * (verifyResult.attempts.length - 1);
-
-    events.send({
-      type: 'stats', phases,
-      kvLine: `KV shared    ${sharedPrefixLength} \u00d7 ${questions.length - 1} + ${verifyResult.prefixLength} \u00d7 ${verifyResult.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved`,
-      ctxPct: Math.round(100 * (session.trunk?.position ?? 0) / nCtx),
-      ctxPos: session.trunk?.position ?? 0,
-      ctxTotal: nCtx,
-    });
-
-    events.send({
-      type: 'complete',
-      data: {
-        planTokens,
-        agentTokens: pool.totalTokens, researchSteps: pool.steps,
-        agentPpl: pool.agents.map(a => a.ppl),
-        verifyTokens: verifyResult.totalTokens, verifySteps: verifyResult.steps,
-        evalTokens, converged,
-        totalToolCalls: pool.totalToolCalls,
-        prefixTokens: verifyResult.prefixLength,
-        sharedPrefixTokens: sharedPrefixLength,
-        agentCount: questions.length, attemptCount: verifyResult.attempts.length,
-        wallTimeMs: Math.round(performance.now() - t0),
-        planMs: Math.round(planMs), researchMs: Math.round(researchMs),
-        verifyMs: Math.round(verifyMs), evalMs: Math.round(evalMs),
-        ...pool.counters,
-      },
-    });
-
-  } else {
-    // ── Grounded response from trunk
-    const agentFindings = pool.agents
-      .map((a: { findings: string | null }, i: number) =>
-        a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
-      .filter(Boolean)
-      .join('\n\n');
-
-    yield* call(() => session.prefillUser(agentFindings
-      ? `Research findings:\n${agentFindings}\n\nUser question: ${query}\n\nAnswer based on the research findings above.`
-      : query));
-
-    events.send({ type: 'response:start' });
-    t = performance.now();
-    let responseTokens = 0;
-    const trunk = session.trunk!;
-    for (;;) {
-      const { token, text, isStop } = trunk.produceSync();
-      if (isStop) break;
-      yield* call(() => trunk.commit(token));
-      responseTokens++;
-      events.send({ type: 'response:text', text } as HarnessEvent);
-    }
-    const responseMs = performance.now() - t;
-    events.send({ type: 'response:done', tokenCount: responseTokens, timeMs: responseMs });
+  yield* summarize(timings, opts);
+}
 
-    phases.push({ label: 'Response', tokens: responseTokens, detail: '', timeMs: responseMs });
+// ── Entry point ──────────────────────────────────────────────────
 
-    events.send({
-      type: 'stats', phases,
-      ctxPct: Math.round(100 * (session.trunk?.position ?? 0) / nCtx),
-      ctxPos: session.trunk?.position ?? 0,
-      ctxTotal: nCtx,
-    });
-  }
+export function* handleQuery(query: string, opts: WorkflowOpts): Operation<void> {
+  yield* opts.events.send({ type: 'query', query, warm: !!opts.session.trunk });
+  yield* (opts.session.trunk ? warmQuery : coldQuery)(query, opts);
 }
diff --git a/examples/deep-research/main.ts b/examples/deep-research/main.ts
index 5350ca3..d60210d 100644
--- a/examples/deep-research/main.ts
+++ b/examples/deep-research/main.ts
@@ -2,53 +2,69 @@
 /**
  * Deep Research — CLI entry point
  *
- * Wiring only: setup, display subscriber, signal-based REPL.
- * Orchestration lives in harness.ts. Rendering lives in display.ts.
+ * Wiring only: setup, TUI subscriber, REPL.
+ * Orchestration lives in harness.ts. Presentation lives in tui.ts.
  *
  * Usage:
  *   npx tsx examples/deep-research/main.ts [model-path] --corpus <path> [--query <text>] [options]
  */
 
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import * as readline from 'node:readline';
-import { main, ensure, createSignal, spawn, each, call, action } from 'effection';
-import { createContext } from '../../dist/index.js';
-import type { SessionContext } from '../../dist/index.js';
-import { initAgents } from '../../dist/agents/index.js';
-import { c, log, setJsonlMode, fmtSize } from './display.js';
-import { displaySubscriber } from './display.js';
-import { loadResources, chunkResources } from './resources/files.js';
-import { createReranker } from './reranker.js';
-import { createTools } from './tools/index.js';
-import { handleQuery } from './harness.js';
-import type { HarnessEvent, HarnessOptions } from './harness.js';
+import * as fs from "node:fs";
+import * as path from "node:path";
+import * as readline from "node:readline";
+import {
+  main,
+  ensure,
+  createSignal,
+  spawn,
+  each,
+  call,
+  action,
+} from "effection";
+import { createContext } from "../../dist";
+import type { SessionContext } from "../../dist";
+import { initAgents } from "../../dist/agents";
+import { c, log, setJsonlMode, fmtSize, createView } from "./tui";
+import type { WorkflowEvent } from "./tui";
+import { loadResources, chunkResources } from "./resources/files";
+import { createReranker } from "./reranker";
+import { createTools } from "./tools";
+import { handleQuery } from "./harness";
+import type { WorkflowOpts } from "./harness";
 
 // ── CLI args ─────────────────────────────────────────────────────
 
-const DEFAULT_MODEL = path.resolve(__dirname, '../../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf');
-const DEFAULT_RERANKER = path.resolve(__dirname, '../../models/qwen3-reranker-0.6b-q4_k_m.gguf');
+const DEFAULT_MODEL = path.resolve(
+  __dirname,
+  "../../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf",
+);
+const DEFAULT_RERANKER = path.resolve(
+  __dirname,
+  "../../models/qwen3-reranker-0.6b-q4_k_m.gguf",
+);
 
 const args = process.argv.slice(2);
-const jsonlMode = args.includes('--jsonl');
-const verbose = args.includes('--verbose');
-const trace = args.includes('--trace');
+const jsonlMode = args.includes("--jsonl");
+const verbose = args.includes("--verbose");
+const trace = args.includes("--trace");
 
 function argVal(flag: string): string | null {
   const i = args.indexOf(flag);
   return i !== -1 ? args[i + 1] : null;
 }
 const flagIndices = new Set(
-  ['--reranker', '--corpus', '--query'].flatMap((f) => {
+  ["--reranker", "--corpus", "--query"].flatMap((f) => {
     const i = args.indexOf(f);
     return i !== -1 ? [i, i + 1] : [];
   }),
 );
 
-const rerankModelPath = argVal('--reranker') || DEFAULT_RERANKER;
-const corpusDir = argVal('--corpus');
-const initialQuery = argVal('--query');
-const modelPath = args.find((a, i) => !a.startsWith('--') && !flagIndices.has(i)) || DEFAULT_MODEL;
+const rerankModelPath = argVal("--reranker") || DEFAULT_RERANKER;
+const corpusDir = argVal("--corpus");
+const initialQuery = argVal("--query");
+const modelPath =
+  args.find((a, i) => !a.startsWith("--") && !flagIndices.has(i)) ||
+  DEFAULT_MODEL;
 
 if (!corpusDir) {
   process.stdout.write(
@@ -59,7 +75,12 @@ if (!corpusDir) {
 
 if (jsonlMode) setJsonlMode(true);
 if (!verbose && !jsonlMode) {
-  try { fs.closeSync(2); fs.openSync(process.platform === 'win32' ? '\\\\.\\NUL' : '/dev/null', 'w'); } catch { /* non-fatal */ }
+  try {
+    fs.closeSync(2);
+    fs.openSync(process.platform === "win32" ? "\\\\.\\NUL" : "/dev/null", "w");
+  } catch {
+    /* non-fatal */
+  }
 }
 
 const AGENT_COUNT = 3;
@@ -68,93 +89,131 @@ const MAX_TOOL_TURNS = 6;
 
 // ── Main ─────────────────────────────────────────────────────────
 
-main(function*() {
+main(function* () {
   const resources = loadResources(corpusDir!);
   const chunks = chunkResources(resources);
 
-  const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, '');
-  const rerankName = path.basename(rerankModelPath).replace(/-q\w+\.gguf$/i, '');
+  const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, "");
+  const rerankName = path
+    .basename(rerankModelPath)
+    .replace(/-q\w+\.gguf$/i, "");
 
   log();
-  log(`${c.bold}  Deep Research${c.reset} ${c.dim}\u2014 Structured Concurrency Runtime${c.reset}`);
+  log(
+    `${c.bold}  Deep Research${c.reset} ${c.dim}\u2014 Structured Concurrency Runtime${c.reset}`,
+  );
   log();
-  log(`  ${c.green}\u25cf${c.reset} Loading ${c.bold}${modelName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(modelPath).size)}, KV: Q4_0)${c.reset}`);
+  log(
+    `  ${c.green}\u25cf${c.reset} Loading ${c.bold}${modelName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(modelPath).size)}, KV: Q4_0)${c.reset}`,
+  );
 
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '16384', 10);
-  const ctx: SessionContext = yield* call(() => createContext({
-    modelPath, nCtx,
-    nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) + 1,
-    typeK: 'q4_0', typeV: 'q4_0',
-  }));
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || "16384", 10);
+  const ctx: SessionContext = yield* call(() =>
+    createContext({
+      modelPath,
+      nCtx,
+      nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) + 1,
+      typeK: "q4_0",
+      typeV: "q4_0",
+    }),
+  );
 
-  log(`  ${c.green}\u25cf${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(rerankModelPath).size)}, reranker)${c.reset}`);
+  log(
+    `  ${c.green}\u25cf${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(rerankModelPath).size)}, reranker)${c.reset}`,
+  );
 
-  const reranker = yield* call(() => createReranker(rerankModelPath, { nSeqMax: 8, nCtx: 4096 }));
-  yield* ensure(() => { reranker.dispose(); });
+  const reranker = yield* call(() =>
+    createReranker(rerankModelPath, { nSeqMax: 8, nCtx: 4096 }),
+  );
+  yield* ensure(() => {
+    reranker.dispose();
+  });
   yield* call(() => reranker.tokenizeChunks(chunks));
 
-  const corpusIsFile = resources.length === 1 && fs.statSync(corpusDir!).isFile();
+  const corpusIsFile =
+    resources.length === 1 && fs.statSync(corpusDir!).isFile();
   const corpusLabel = corpusIsFile
     ? path.basename(corpusDir!)
     : `${path.basename(corpusDir!)}/ \u2014 ${resources.length} files`;
-  log(`  ${c.dim}  Corpus: ${corpusLabel} \u2192 ${chunks.length} chunks${c.reset}`);
+  log(
+    `  ${c.dim}  Corpus: ${corpusLabel} \u2192 ${chunks.length} chunks${c.reset}`,
+  );
 
   const { toolMap, toolsJson } = createTools({ resources, chunks, reranker });
-  const { session, events } = yield* initAgents<HarnessEvent>(ctx);
-
-  // Display subscriber — all rendering lives here
-  yield* spawn(function*() {
-    yield* displaySubscriber(events, {
-      model: path.basename(modelPath),
-      reranker: path.basename(rerankModelPath),
-      agentCount: AGENT_COUNT,
-      verifyCount: VERIFY_COUNT,
-      chunkCount: chunks.length,
-    });
+  const { session, events } = yield* initAgents<WorkflowEvent>(ctx);
+
+  // View subscriber — all presentation lives here
+  const view = createView({
+    model: path.basename(modelPath),
+    reranker: path.basename(rerankModelPath),
+    agentCount: AGENT_COUNT,
+    verifyCount: VERIFY_COUNT,
+    chunkCount: chunks.length,
+  });
+  yield* spawn(function* () {
+    yield* view.subscribe(events);
   });
 
-  const harnessOpts: HarnessOptions = {
-    session, toolMap, toolsJson, events,
-    agentCount: AGENT_COUNT, verifyCount: VERIFY_COUNT,
-    maxTurns: MAX_TOOL_TURNS, nCtx, trace,
+  const harnessOpts: WorkflowOpts = {
+    session,
+    toolMap,
+    toolsJson,
+    events,
+    agentCount: AGENT_COUNT,
+    verifyCount: VERIFY_COUNT,
+    maxTurns: MAX_TOOL_TURNS,
+    nCtx,
+    trace,
   };
 
   // Initial query
   if (initialQuery) {
     yield* handleQuery(initialQuery, harnessOpts);
-    if (jsonlMode) return;  // scope exit triggers initAgents + ensure cleanup
+    if (jsonlMode) return;
   }
 
-  // REPL — signal bridges readline into Effection scope
-  log(`  ${c.dim}${session.trunk ? 'Ask a follow-up question' : 'Enter your research question'} or /quit to exit${c.reset}`);
+  // REPL — Signal bridges readline into Effection scope
+  log(
+    `  ${c.dim}${session.trunk ? "Ask a follow-up question" : "Enter your research question"} or /quit to exit${c.reset}`,
+  );
   log();
 
   const inputSignal = createSignal<string, void>();
-  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
+  const rl = readline.createInterface({
+    input: process.stdin,
+    output: process.stdout,
+  });
   rl.setPrompt(`  ${c.dim}>${c.reset} `);
 
-  yield* spawn(function*() {
+  yield* spawn(function* () {
     yield* action<void>((resolve) => {
-      rl.on('line', (line: string) => inputSignal.send(line.trim()));
-      rl.on('close', () => { inputSignal.close(); resolve(); });
+      rl.on("line", (line: string) => inputSignal.send(line.trim()));
+      rl.on("close", () => {
+        inputSignal.close();
+        resolve();
+      });
       return () => rl.close();
     });
   });
 
   rl.prompt();
   for (const input of yield* each(inputSignal)) {
-    if (!input || input === '/quit') break;
+    if (!input || input === "/quit") break;
     try {
       yield* handleQuery(input, harnessOpts);
     } catch (err) {
       log(`  ${c.red}Error: ${(err as Error).message}${c.reset}`);
     }
     yield* each.next();
-    try { rl.prompt(); } catch { break; }
+    try {
+      rl.prompt();
+    } catch {
+      break;
+    }
   }
-
-  // scope exit triggers initAgents + ensure cleanup
 }).catch((err: unknown) => {
-  process.stdout.write(`Error: ${(err as Error).message}\n${(err as Error).stack}\n`);
+  process.stdout.write(
+    `Error: ${(err as Error).message}\n${(err as Error).stack}\n`,
+  );
   process.exit(1);
 });
diff --git a/examples/deep-research/reranker.ts b/examples/deep-research/reranker.ts
index 3762bc9..118e17e 100644
--- a/examples/deep-research/reranker.ts
+++ b/examples/deep-research/reranker.ts
@@ -1,6 +1,6 @@
-import { Rerank } from '../../dist/index.js';
-import type { Chunk } from './resources/types.js';
-import type { Reranker, ScoredResult } from './tools/types.js';
+import { Rerank } from "../../dist";
+import type { Chunk } from "./resources/types";
+import type { Reranker, ScoredResult } from "./tools/types";
 
 export async function createReranker(
   modelPath: string,
@@ -10,19 +10,27 @@ export async function createReranker(
 
   return {
     score(query: string, chunks: Chunk[]): AsyncIterable<ScoredResult> {
-      const inner = rerank.score(query, chunks.map(c => c.tokens), 5);
+      const inner = rerank.score(
+        query,
+        chunks.map((c) => c.tokens),
+        10,
+      );
       return {
         [Symbol.asyncIterator](): AsyncIterator<ScoredResult> {
           const it = inner[Symbol.asyncIterator]();
           return {
             async next(): Promise<IteratorResult<ScoredResult>> {
               const { value, done } = await it.next();
-              if (done) return { value: undefined as unknown as ScoredResult, done: true };
+              if (done)
+                return {
+                  value: undefined as unknown as ScoredResult,
+                  done: true,
+                };
               return {
                 value: {
                   filled: value.filled,
                   total: value.total,
-                  results: value.results.map(r => ({
+                  results: value.results.map((r) => ({
                     file: chunks[r.index].resource,
                     heading: chunks[r.index].heading,
                     score: r.score,
@@ -44,6 +52,8 @@ export async function createReranker(
       }
     },
 
-    dispose() { rerank.dispose(); },
+    dispose() {
+      rerank.dispose();
+    },
   };
 }
diff --git a/examples/deep-research/resources/files.ts b/examples/deep-research/resources/files.ts
index d41cadc..4004374 100644
--- a/examples/deep-research/resources/files.ts
+++ b/examples/deep-research/resources/files.ts
@@ -1,7 +1,7 @@
 import * as fs from 'node:fs';
 import * as path from 'node:path';
-import { loadBinary } from '../../../dist/index.js';
-import type { Resource, Chunk } from './types.js';
+import { loadBinary } from '../../../dist';
+import type { Resource, Chunk } from './types';
 
 interface Section { heading: string; level: number; startLine: number; endLine: number }
 const { parseMarkdown } = loadBinary() as unknown as { parseMarkdown(text: string): Section[] };
diff --git a/examples/deep-research/tasks/research.md b/examples/deep-research/tasks/research.md
index b1318da..60b25c2 100644
--- a/examples/deep-research/tasks/research.md
+++ b/examples/deep-research/tasks/research.md
@@ -1 +1,12 @@
-You are a research assistant with access to a knowledge base. You have these tools: search (semantic relevance ranking), grep (regex pattern matching), read_file (read specific line ranges), and report (submit findings). Use search for topical queries, grep for exact patterns, and read_file to inspect context around results. Call report with your findings when done.
\ No newline at end of file
+You are a research assistant analyzing a knowledge base. Your tools:
+- **grep**: regex pattern matching — use for precise, exhaustive retrieval
+- **search**: semantic relevance ranking — use to discover related content
+- **read_file**: read specific line ranges — use to verify and get context
+- **report**: submit your final findings with evidence
+
+Process — follow every step in order:
+1. Grep with short, simple patterns first. Use single keywords or two-word phrases — never combine multiple clauses with `.*`. Run multiple greps if needed.
+2. Use search to discover content that grep may miss (different phrasing, synonyms).
+3. Read every matching line with read_file to verify in context. Do not rely on grep/search summaries alone.
+4. Grep again with a different pattern targeting what you have NOT yet found. This is a completeness check, not confirmation of existing results.
+5. Report with line numbers and direct quotes as evidence. State what you found and what you checked.
diff --git a/examples/deep-research/tools/grep.ts b/examples/deep-research/tools/grep.ts
index a0313fe..bc3ae5f 100644
--- a/examples/deep-research/tools/grep.ts
+++ b/examples/deep-research/tools/grep.ts
@@ -1,6 +1,6 @@
-import { Tool } from '../../../dist/agents/index.js';
-import type { JsonSchema } from '../../../dist/agents/index.js';
-import type { Resource } from '../resources/types.js';
+import { Tool } from '../../../dist/agents';
+import type { JsonSchema } from '../../../dist/agents';
+import type { Resource } from '../resources/types';
 
 export class GrepTool extends Tool<{ pattern: string; ignoreCase?: boolean }> {
   readonly name = 'grep';
diff --git a/examples/deep-research/tools/index.ts b/examples/deep-research/tools/index.ts
index 276730e..9641ae7 100644
--- a/examples/deep-research/tools/index.ts
+++ b/examples/deep-research/tools/index.ts
@@ -1,10 +1,10 @@
-import { createToolkit } from '../../../dist/agents/index.js';
-import type { Toolkit } from '../../../dist/agents/index.js';
-import type { Resource, Chunk } from '../resources/types.js';
-import type { Reranker } from './types.js';
-import { SearchTool } from './search.js';
-import { ReadFileTool } from './read-file.js';
-import { GrepTool } from './grep.js';
+import { createToolkit } from '../../../dist/agents';
+import type { Toolkit } from '../../../dist/agents';
+import type { Resource, Chunk } from '../resources/types';
+import type { Reranker } from './types';
+import { SearchTool } from './search';
+import { ReadFileTool } from './read-file';
+import { GrepTool } from './grep';
 
 export function createTools(opts: {
   resources: Resource[];
diff --git a/examples/deep-research/tools/read-file.ts b/examples/deep-research/tools/read-file.ts
index 3a2851b..164a5c5 100644
--- a/examples/deep-research/tools/read-file.ts
+++ b/examples/deep-research/tools/read-file.ts
@@ -1,6 +1,6 @@
-import { Tool } from '../../../dist/agents/index.js';
-import type { JsonSchema } from '../../../dist/agents/index.js';
-import type { Resource } from '../resources/types.js';
+import { Tool } from '../../../dist/agents';
+import type { JsonSchema } from '../../../dist/agents';
+import type { Resource } from '../resources/types';
 
 export class ReadFileTool extends Tool<{ filename: string; startLine?: number; endLine?: number }> {
   readonly name = 'read_file';
diff --git a/examples/deep-research/tools/search.ts b/examples/deep-research/tools/search.ts
index df20630..034bc55 100644
--- a/examples/deep-research/tools/search.ts
+++ b/examples/deep-research/tools/search.ts
@@ -1,7 +1,7 @@
-import { Tool } from '../../../dist/agents/index.js';
-import type { JsonSchema, ToolContext } from '../../../dist/agents/index.js';
-import type { Chunk } from '../resources/types.js';
-import type { Reranker } from './types.js';
+import { Tool } from '../../../dist/agents';
+import type { JsonSchema, ToolContext } from '../../../dist/agents';
+import type { Chunk } from '../resources/types';
+import type { Reranker } from './types';
 
 export class SearchTool extends Tool<{ query: string }> {
   readonly name = 'search';
diff --git a/examples/deep-research/tools/types.ts b/examples/deep-research/tools/types.ts
index 4a9fc8d..3f0012a 100644
--- a/examples/deep-research/tools/types.ts
+++ b/examples/deep-research/tools/types.ts
@@ -1,4 +1,4 @@
-import type { Chunk } from '../resources/types.js';
+import type { Chunk } from '../resources/types';
 
 export interface ScoredChunk {
   file: string;
diff --git a/examples/deep-research/tui.ts b/examples/deep-research/tui.ts
new file mode 100644
index 0000000..71cec5d
--- /dev/null
+++ b/examples/deep-research/tui.ts
@@ -0,0 +1,417 @@
+import * as fs from 'node:fs';
+import { each } from 'effection';
+import type { Channel, Operation } from 'effection';
+import type { AgentEvent, AgentPoolResult, DivergeResult } from '../../dist/agents';
+
+// ── Event types ──────────────────────────────────────────────────
+
+export interface OpTiming { // one row of the timing table printed for a 'stats' event
+  label: string; // row name; left-aligned and padded to 10 columns when rendered
+  tokens: number; // token count for this row; summed into the Total line
+  detail: string; // free-form suffix shown after the token count; '' prints nothing
+  timeMs: number; // elapsed milliseconds; rendered in seconds, omitted unless > 0
+}
+
+export type StepEvent = // phase-level workflow events, discriminated on `type`
+  | { type: 'query'; query: string; warm: boolean } // warm=true skips the JSONL 'start' record and the query banner
+  | { type: 'plan'; questions: string[]; tokenCount: number; timeMs: number }
+  | { type: 'research:start'; agentCount: number } // also resets per-agent labels, text and status in the view
+  | { type: 'research:done'; pool: AgentPoolResult; timeMs: number }
+  | { type: 'verify:start'; count: number }
+  | { type: 'verify:done'; result: DivergeResult; timeMs: number }
+  | { type: 'eval:done'; converged: boolean | null; tokenCount: number; timeMs: number } // null is rendered as "unknown"
+  | { type: 'answer'; text: string } // **bold** spans are converted to ANSI bold when printed
+  | { type: 'response:start' }
+  | { type: 'response:text'; text: string } // one streamed chunk of response text
+  | { type: 'response:done' }
+  | { type: 'stats'; timings: OpTiming[]; kvLine?: string; ctxPct: number; ctxPos: number; ctxTotal: number }
+  | { type: 'complete'; data: Record<string, unknown> }; // data is forwarded unchanged into the JSONL 'complete' record
+
+export type WorkflowEvent = AgentEvent | StepEvent; // every event shape a ViewHandler may receive
+
+// ── Mode + color ─────────────────────────────────────────────────
+
+let _jsonlMode = false; // true: log() and status() are silenced and emit() prints JSON lines instead
+
+export function setJsonlMode(on: boolean): void { _jsonlMode = on; } // switches the module-wide output mode
+
+const isTTY = process.stdout.isTTY; // sampled once, when the module is loaded
+
+export const c = isTTY ? { // ANSI SGR escape codes; every entry is '' when stdout is not a TTY
+  bold: '\x1b[1m', dim: '\x1b[2m', reset: '\x1b[0m',
+  green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
+} : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' };
+
+// ── Primitives ───────────────────────────────────────────────────
+
+let _statusText = ''; // last text drawn by status(); '' means no status line is currently showing
+
+function status(text: string): void { // draws or overwrites the single in-place status line
+  if (_jsonlMode || !isTTY) return; // in-place redraw is only done on an interactive terminal
+  _statusText = text;
+  process.stdout.write('\r\x1b[K' + text); // carriage return + erase-to-end-of-line, then the text (no newline)
+}
+
+function statusClear(): void { // erases the in-place status line, if one is showing
+  if (!_statusText) return; // nothing was drawn, so there is nothing to erase
+  _statusText = '';
+  process.stdout.write('\r\x1b[K');
+}
+
+export const log = (...a: unknown[]): void => { // human-readable line output; a no-op in JSONL mode
+  if (_jsonlMode) return;
+  statusClear(); // remove the in-place status line so this output starts on a clean line
+  console.log(...a);
+};
+
+function emit(event: string, data: Record<string, unknown>): void { // JSONL counterpart of log(); a no-op outside JSONL mode
+  if (_jsonlMode) console.log(JSON.stringify({ event, ...data })); // one object per line; a key named `event` inside data would override the event name
+}
+
+export const fmtSize = (bytes: number): string => bytes > 1e9 // decimal units: above 1e9 bytes → GB (1 decimal), otherwise MB (no decimals)
+  ? (bytes / 1e9).toFixed(1) + ' GB'
+  : (bytes / 1e6).toFixed(0) + ' MB';
+
+const pad = (s: unknown, n: number): string => String(s).padStart(n); // right-aligns the stringified value within n columns
+
+// ── View state + handler type ────────────────────────────────────
+
+interface ViewState { // mutable state shared by the view handlers of one createView() instance
+  agentLabel: Map<number, string>; // agentId → display label ("A0", "A1", …)
+  nextLabel: number; // numeric suffix of the next label to hand out
+  agentText: Map<number, string>; // agentId → text accumulated from 'agent:produce'; dropped on each tool call
+  agentStatus: Map<number, { state: string; tokenCount: number; detail: string }>; // state is 'gen', 'done', or a tool name
+  traceQuery: string; // text of the most recent 'query' event; stored in the trace file
+}
+
+type ViewHandler = (ev: WorkflowEvent) => void; // synchronous renderer; each handler ignores event types it does not own
+
+function label(state: ViewState, agentId: number): string { // returns the agent's display label, assigning one on first use
+  let l = state.agentLabel.get(agentId);
+  if (!l) { l = `A${state.nextLabel++}`; state.agentLabel.set(agentId, l); } // labels are handed out in first-seen order
+  return l;
+}
+
+function resetLabels(state: ViewState): void { // clears all per-agent view state so labelling restarts at A0
+  state.nextLabel = 0;
+  state.agentLabel.clear();
+  state.agentStatus.clear();
+  state.agentText.clear(); // traceQuery is intentionally left as-is (not touched here)
+}
+
+function renderStatus(state: ViewState): void { // redraws the in-place status line from the current per-agent states
+  const active = [...state.agentStatus.entries()].filter(([, s]) => s.state !== 'done');
+  if (active.length === 0) return; // every agent is done: leave the status line untouched
+
+  const generating = active.filter(([, s]) => s.state === 'gen');
+  if (generating.length === 1 && active.length === 1) { // the sole active agent is generating: show a live tail of its text
+    const [id] = generating[0];
+    const raw = (state.agentText.get(id) ?? '').replace(/\n/g, ' ').trimStart();
+    const cols = process.stdout.columns || 80; // fall back to 80 when the terminal width is unavailable
+    const maxLen = cols - 12; // presumably reserves room for the indent, marker and label
+    const text = raw.length > maxLen ? raw.slice(raw.length - maxLen) : raw; // keep only the most recent characters
+    status(`    ${c.dim}\u25c6${c.reset} ${c.yellow}${label(state, id)}${c.reset} ${text}`);
+    return;
+  }
+
+  const parts = active.map(([id, s]) => { // otherwise: one compact entry per active agent
+    const lbl = `${c.yellow}${label(state, id)}${c.reset}`;
+    if (s.state === 'gen') return `${lbl}: ${s.tokenCount} tok`;
+    const detail = s.detail ? ` ${s.detail}` : '';
+    return `${lbl}: ${c.cyan}${s.state}${c.reset}${detail}`; // non-'gen' states are tool names; detail is "filled/total" progress
+  });
+  status(`    ${c.dim}\u25c6${c.reset} ${parts.join('  ')}`);
+}
+
+// ── View handlers ────────────────────────────────────────────────
+
+function queryHandler(state: ViewState, opts: ViewOpts): ViewHandler { // handles 'query': records the query and prints the banner
+  return (ev) => {
+    if (ev.type !== 'query') return;
+    state.traceQuery = ev.query; // remembered for the trace file written after research
+    if (!ev.warm) { // warm queries skip both the JSONL 'start' record and the banner
+      emit('start', {
+        model: opts.model, reranker: opts.reranker, query: ev.query,
+        agentCount: opts.agentCount, verifyCount: opts.verifyCount, chunks: opts.chunkCount,
+      });
+      log();
+      log(`  ${c.dim}Query${c.reset}`);
+      log(`  ${c.bold}${ev.query}${c.reset}`);
+    }
+  };
+}
+
+function planHandler(): ViewHandler { // handles 'plan': prints the header and the numbered question list
+  return (ev) => {
+    if (ev.type !== 'plan') return;
+    emit('plan', { questions: ev.questions, planTokens: ev.tokenCount }); // JSONL record; timeMs is not included
+    log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+    ev.questions.forEach((q: string, i: number) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`)); // 1-based numbering
+  };
+}
+
+function agentHandler(state: ViewState): ViewHandler { // renders per-agent events: token stream, tool calls/results, progress, reports
+  return (ev) => {
+    switch (ev.type) {
+      case 'agent:produce': { // new text from an agent: accumulate it and refresh the status line
+        state.agentText.set(ev.agentId, (state.agentText.get(ev.agentId) ?? '') + ev.text);
+        state.agentStatus.set(ev.agentId, { state: 'gen', tokenCount: ev.tokenCount, detail: '' });
+        renderStatus(state);
+        break;
+      }
+      case 'agent:tool_call': { // agent invoked a tool: log a one-line summary of the call
+        state.agentText.delete(ev.agentId); // text accumulated before the call is discarded
+        state.agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: '' });
+        emit('tool_call', { agentId: ev.agentId, toolName: ev.tool, arguments: ev.args });
+        let toolArgs: Record<string, unknown> = {}; // stays {} unless ev.args parses to a JSON object
+        try { const v: unknown = JSON.parse(ev.args); if (typeof v === 'object' && v !== null) toolArgs = v as Record<string, unknown>; } catch { /* malformed JSON: keep {} */ }
+        const argSummary = ev.tool === 'search' // valid-but-non-object JSON (e.g. "null") must not crash the view, hence the guard above
+          ? `"${toolArgs.query || ''}"`
+          : ev.tool === 'grep'
+          ? `/${toolArgs.pattern || ''}/`
+          : ev.tool === 'report' ? ''
+          : `${toolArgs.filename ?? ''}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : ''); // any other tool is summarised like read_file
+        log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+        break;
+      }
+      case 'agent:tool_result': { // tool finished: log its size plus a short, tool-specific preview
+        emit('tool_result', {
+          agentId: ev.agentId, toolName: ev.tool,
+          result: ev.result.length > 200 ? ev.result.slice(0, 200) + '...' : ev.result, // JSONL record carries at most 200 chars
+        });
+        let preview = ''; // best effort: any parse/shape failure below simply leaves it empty
+        if (ev.tool === 'read_file') {
+          try {
+            const firstLine = (JSON.parse(ev.result) as { content: string }).content.split('\n').find((l: string) => l.trim());
+            if (firstLine) preview = ` \u00b7 ${firstLine.trim().slice(0, 60)}${firstLine.trim().length > 60 ? '\u2026' : ''}`;
+          } catch { /* non-fatal */ }
+        } else if (ev.tool === 'search') {
+          try {
+            const top = (JSON.parse(ev.result) as { heading: string }[])[0];
+            if (top?.heading) preview = ` \u00b7 ${top.heading}`;
+          } catch { /* non-fatal */ }
+        } else if (ev.tool === 'grep') {
+          try {
+            const r = JSON.parse(ev.result) as { totalMatches?: number; matchingLines?: number };
+            if (typeof r.totalMatches === 'number') preview = ` \u00b7 ${r.totalMatches} matches in ${r.matchingLines} lines`; // skip results without counts
+          } catch { /* non-fatal */ }
+        }
+        log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`);
+        break;
+      }
+      case 'agent:tool_progress': { // progress tick from a running tool: shown as "filled/total" on the status line
+        state.agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: `${ev.filled}/${ev.total}` });
+        renderStatus(state);
+        break;
+      }
+      case 'agent:report': { // final findings: printed word-wrapped inside the tree gutter
+        state.agentStatus.set(ev.agentId, { state: 'done', tokenCount: 0, detail: '' });
+        const cols = process.stdout.columns || 80;
+        const lbl = `${c.yellow}${label(state, ev.agentId)}${c.reset}`;
+        const prefix = `    ${c.dim}\u2502${c.reset}   `;
+        const wrap = cols - 8; // the gutter prefix is 8 visible columns wide
+
+        log(`    ${c.dim}\u2502${c.reset}`);
+        log(`    ${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`);
+
+        for (const para of ev.findings.split('\n')) {
+          if (!para.trim()) { log(prefix); continue; } // blank lines in the findings are kept as empty gutter rows
+          const words = para.split(/\s+/);
+          let line = '';
+          for (const word of words) { // greedy wrap: flush the line when the next word would exceed the width
+            if (line && line.length + 1 + word.length > wrap) {
+              log(`${prefix}${c.dim}${line}${c.reset}`);
+              line = word;
+            } else {
+              line = line ? `${line} ${word}` : word;
+            }
+          }
+          if (line) log(`${prefix}${c.dim}${line}${c.reset}`);
+        }
+        log(`    ${c.dim}\u2502${c.reset}`);
+        break;
+      }
+      case 'agent:done': break; // nothing to render here; the per-agent summary is printed on 'research:done'
+    }
+  };
+}
+
+function researchSummaryHandler(state: ViewState): ViewHandler { // handles 'research:start' / 'research:done'
+  function flushTrace(pool: AgentPoolResult): void { // writes a JSON trace file when at least one agent carries a trace
+    if (!pool.agents.some(a => a.trace?.length)) return;
+    const filename = `trace-${Date.now()}.json`; // relative path, so it is resolved against the process working directory
+    fs.writeFileSync(filename, JSON.stringify({
+      query: state.traceQuery,
+      timestamp: new Date().toISOString(),
+      agents: pool.agents.map(a => ({
+        agentId: a.agentId, label: label(state, a.agentId),
+        ppl: a.ppl, samplingPpl: a.samplingPpl,
+        tokenCount: a.tokenCount, toolCallCount: a.toolCallCount,
+        findings: a.findings, trace: a.trace ?? [],
+      })),
+    }, null, 2));
+    log(`  ${c.dim}Trace written to ${filename}${c.reset}`);
+  }
+
+  return (ev) => {
+    switch (ev.type) {
+      case 'research:start': {
+        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Research${c.reset} ${c.dim}${ev.agentCount} agents${c.reset}`);
+        resetLabels(state); // each research phase starts labelling again from A0
+        break;
+      }
+      case 'research:done': {
+        statusClear();
+        ev.pool.agents.forEach((a, i) => {
+          const tree = i === ev.pool.agents.length - 1 ? '\u2514' : '\u251c'; // the last agent closes the tree
+          emit('agent_done', {
+            index: i, findings: (a.findings || '').slice(0, 500), // findings are capped at 500 chars in the JSONL record
+            toolCalls: a.toolCallCount, tokenCount: a.tokenCount,
+            ppl: a.ppl, samplingPpl: a.samplingPpl,
+          });
+          const raw = (state.agentText.get(a.agentId) ?? '').replace(/\n/g, ' ').trim(); // text produced since the agent's last tool call
+          if (raw) log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, a.agentId)}${c.reset} ${c.dim}\u25b8 ${raw.slice(0, 120)}${raw.length > 120 ? '\u2026' : ''}${c.reset}`);
+          const pplStr = Number.isFinite(a.ppl) ? ` \u00b7 ppl ${a.ppl.toFixed(2)}` : ''; // perplexity is shown only when it is a finite number
+          log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${label(state, a.agentId)}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok \u00b7 ${a.toolCallCount} tools${pplStr}${c.reset}`);
+        });
+        log(`    ${c.dim}${ev.pool.totalTokens} tok \u00b7 ${ev.pool.totalToolCalls} tools \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+        flushTrace(ev.pool);
+        break;
+      }
+    }
+  };
+}
+
+function verifyHandler(): ViewHandler { // handles 'verify:start' / 'verify:done'
+  return (ev) => {
+    switch (ev.type) {
+      case 'verify:start': {
+        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${ev.count} attempts${c.reset}`);
+        break;
+      }
+      case 'verify:done': {
+        ev.result.attempts.forEach((a, i) => {
+          const tree = i === ev.result.attempts.length - 1 ? '\u2514' : '\u251c'; // the last attempt closes the tree
+          emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl }); // output capped at 500 chars
+          log(`    ${c.dim}${tree} ${a.tokenCount} tok \u00b7 ppl ${a.ppl.toFixed(2)}${c.reset}`);
+        });
+        log(`    ${c.dim}${ev.result.totalTokens} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+        break;
+      }
+    }
+  };
+}
+
+function evalHandler(): ViewHandler { // handles 'eval:done': prints the convergence verdict
+  return (ev) => {
+    if (ev.type !== 'eval:done') return;
+    emit('convergence', { converged: ev.converged, evalTokens: ev.tokenCount });
+    const verdict = ev.converged === true ? `${c.green}yes${c.reset}` // tri-state: true → yes, false → no, null → unknown
+      : ev.converged === false ? `${c.red}no${c.reset}`
+      : `${c.yellow}unknown${c.reset}`;
+    log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+    log(`    Converged: ${verdict}`);
+  };
+}
+
+function answerHandler(): ViewHandler { // handles 'answer': prints the final answer under a horizontal rule
+  return (ev) => {
+    if (ev.type !== 'answer') return;
+    log(`\n  ${c.dim}${'\u2500'.repeat(58)}${c.reset}\n`);
+    const prose = ev.text.trim()
+      .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`) // markdown **bold** → ANSI bold (non-greedy, within one line)
+      .split('\n').map((l: string) => `  ${l}`).join('\n'); // indent every line by two spaces
+    log(prose);
+  };
+}
+
+function responseHandler(): ViewHandler { // streams 'response:*' events to the terminal
+  return (ev) => {
+    // Like log(), stay silent in JSONL mode: raw response text on stdout
+    // would corrupt the JSON-lines stream that emit() produces.
+    if (_jsonlMode) return;
+    switch (ev.type) {
+      case 'response:start':
+        statusClear(); // a leftover in-place status line would otherwise prefix the response
+        process.stdout.write(`  ${c.dim}<${c.reset} `);
+        break;
+      case 'response:text': process.stdout.write(ev.text); break; // chunks are appended as-is, without a newline
+      case 'response:done': console.log('\n'); break; // ends the response line and leaves one blank line
+    }
+  };
+}
+
+function statsHandler(): ViewHandler { // handles 'stats': prints the timing table, totals, KV line and context usage
+  return (ev) => {
+    if (ev.type !== 'stats') return;
+    const { timings, kvLine, ctxPct, ctxPos, ctxTotal } = ev;
+    const totalTokens = timings.reduce((s, p) => s + p.tokens, 0);
+    const totalMs = timings.reduce((s, p) => s + p.timeMs, 0);
+    // Everything below is laid out on a fixed 58-column width (the length of the rules).
+    log(`\n  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
+    for (const p of timings) {
+      const left = `${p.label.padEnd(10)} ${pad(p.tokens, 5)} tok`;
+      const detail = p.detail ? `  ${p.detail}` : '';
+      const right = p.timeMs > 0 ? `${pad((p.timeMs / 1000).toFixed(1), 6)}s` : ''; // rows without a positive time show no time column
+      log(`  ${c.dim}${left}${detail}${' '.repeat(Math.max(1, 58 - left.length - detail.length - right.length))}${right}${c.reset}`); // always at least one space before the time
+    }
+    log(`  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
+    log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok         ${c.bold}${pad((totalMs / 1000).toFixed(1), 6)}s${c.reset}`);
+    if (kvLine) log(`  ${c.dim}${kvLine}${c.reset}`);
+    if (ctxPct != null && ctxPos != null && ctxTotal != null) {
+      const ctxStr = `ctx: ${ctxPct}% (${ctxPos.toLocaleString()}/${ctxTotal.toLocaleString()})`;
+      log(`  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
+      log(`  ${c.dim}${' '.repeat(Math.max(0, 58 - ctxStr.length))}${ctxStr}${c.reset}`); // right-aligned; clamped because repeat() throws RangeError on a negative count
+    }
+    log();
+  };
+}
+
+function completeHandler(): ViewHandler { // handles 'complete': JSONL only, prints nothing in human-readable mode
+  return (ev) => {
+    if (ev.type !== 'complete') return;
+    emit('complete', ev.data); // ev.data is spread into the record unchanged
+  };
+}
+
+// ── createView — composable view factory ─────────────────────────
+
+export interface ViewOpts { // static run configuration; only echoed into the JSONL 'start' record
+  model: string;
+  reranker: string;
+  agentCount: number;
+  verifyCount: number;
+  chunkCount: number; // emitted under the key `chunks`
+}
+
+export function createView(opts: ViewOpts) { // builds a view: fresh ViewState plus the fixed list of handlers
+  const state: ViewState = {
+    agentLabel: new Map(),
+    nextLabel: 0,
+    agentText: new Map(),
+    agentStatus: new Map(),
+    traceQuery: '',
+  };
+
+  const handlers: ViewHandler[] = [ // every handler receives every event and ignores the types it does not own
+    queryHandler(state, opts),
+    planHandler(),
+    agentHandler(state),
+    researchSummaryHandler(state),
+    verifyHandler(),
+    evalHandler(),
+    answerHandler(),
+    responseHandler(),
+    statsHandler(),
+    completeHandler(),
+  ];
+
+  return {
+    *subscribe(events: Channel<WorkflowEvent, void>): Operation<void> { // consumes events for as long as the channel yields them
+      for (const ev of yield* each(events)) {
+        for (const h of handlers) h(ev); // synchronous fan-out, in list order
+        yield* each.next();
+      }
+    },
+  };
+}
diff --git a/package.json b/package.json
index ce36291..88583af 100644
--- a/package.json
+++ b/package.json
@@ -20,6 +20,7 @@
     "docs": "npx typedoc",
     "test": "npm run test:integration",
     "test:integration": "npx tsx test/integration.ts",
+    "test:agents": "npx tsx test/agents.ts",
     "test:examples": "npx tsx test/examples.ts",
     "sync:llama-cpp": "node scripts/sync-llama-cpp.js",
     "example": "npx tsx examples/chat/chat.ts"
diff --git a/src/agents/agent-pool.ts b/src/agents/agent-pool.ts
index e382da3..ac1762c 100644
--- a/src/agents/agent-pool.ts
+++ b/src/agents/agent-pool.ts
@@ -1,5 +1,5 @@
-import { resource, call, action, useScope } from 'effection';
-import type { Operation, Scope } from 'effection';
+import { resource, call, action, useScope, createSignal, spawn, each } from 'effection';
+import type { Operation, Scope, Channel } from 'effection';
 import type { Branch } from '../Branch';
 import { GrammarTriggerType, type GrammarTrigger, type ParsedToolCall, type SessionContext } from '../types';
 import type { BranchStore } from '../BranchStore';
@@ -14,7 +14,6 @@ import type {
   AgentResult,
   AgentEvent,
 } from './types';
-import type { Signal } from 'effection';
 
 // ── Internal agent state machine ───────────────────────────────
 // generating → awaiting_tool → generating  (tool result prefilled)
@@ -172,14 +171,31 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
   return resource(function*(provide) {
     const ctx: SessionContext = yield* Ctx.expect();
     const store: BranchStore = yield* Store.expect();
-    const events: Signal<AgentEvent, void> = yield* Events.expect();
+    const events: Channel<AgentEvent, void> = yield* Events.expect();
     const scope: Scope = yield* useScope();
+
+    // Bridge for onProgress callbacks — Signal is correct here (external callback).
+    // A spawned forwarder drains the bridge into the Channel with proper scope context.
+    const progressBridge = createSignal<AgentEvent, void>();
+    yield* spawn(function*() {
+      for (const ev of yield* each(progressBridge)) {
+        yield* events.send(ev);
+        yield* each.next();
+      }
+    });
     const { tasks, tools, maxTurns = 100, trace = false } = opts;
 
     // ── Setup: fork branches, collect suffix tokens ──────────
     const agents: AgentInternal[] = [];
     const prefillSetup: [Branch, number[]][] = [];
 
+    // try/finally wraps everything from agent creation through provide().
+    // Agent branches are plain Branch objects (not Effection resources) —
+    // their cleanup is manual. Placing it here guarantees any branch that
+    // makes it into agents[] is pruned on ANY exit path: normal completion,
+    // tick loop error, or scope cancellation.
+    try {
+
     for (const task of tasks) {
       // Per-task parent for tree topology, or first task's parent as shared root
       const parent = task.parent;
@@ -234,7 +250,7 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
       idleTicks: 0,
     };
 
-    function dispatchTool(agent: AgentInternal, tc: ParsedToolCall): void {
+    function* dispatchTool(agent: AgentInternal, tc: ParsedToolCall): Operation<void> {
       let toolArgs: Record<string, unknown>;
       try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
       const callId = tc.id || `call_${agent.toolCallCount}`;
@@ -244,19 +260,20 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
       agent.turns++;
       agent.state = 'awaiting_tool';
 
-      events.send({ type: 'agent:tool_call', agentId: agent.id, tool: tc.name, args: tc.arguments });
+      yield* events.send({ type: 'agent:tool_call', agentId: agent.id, tool: tc.name, args: tc.arguments });
 
       const tool = tools.get(tc.name);
       pendingToolCount++;
       counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingToolCount);
 
       // scope.run() — eager start, child of agent pool scope, cancelled if scope exits.
-      // spawn() is lazy (Operation), but we're in a plain function — scope.run() is eager.
+      // spawn() is lazy (Operation), but we're in a generator — scope.run() is eager.
       scope.run(function*() {
         try {
           const toolContext = {
             onProgress: (p: { filled: number; total: number }) => {
-              events.send({ type: 'agent:tool_progress', agentId: agent.id, tool: tc.name, filled: p.filled, total: p.total });
+              // Signal bridge — onProgress is an external callback, Signal.send() is correct here.
+              progressBridge.send({ type: 'agent:tool_progress', agentId: agent.id, tool: tc.name, filled: p.filled, total: p.total });
             },
           };
 
@@ -264,7 +281,7 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
             tool ? tool.execute(toolArgs, toolContext) : Promise.resolve({ error: `Unknown tool: ${tc.name}` })
           );
           const resultStr = JSON.stringify(result);
-          events.send({ type: 'agent:tool_result', agentId: agent.id, tool: tc.name, result: resultStr });
+          yield* events.send({ type: 'agent:tool_result', agentId: agent.id, tool: tc.name, result: resultStr });
 
           const prefillTokens: number[] = yield* call(() => buildToolResultDelta(ctx, resultStr, callId));
           settledBuffer.push({ agentId: agent.id, prefillTokens, toolName: tc.name });
@@ -298,9 +315,9 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
             a.state = 'done';
             if (!a.findings && a.toolCallCount > 0 && parsed.content) {
               a.findings = parsed.content;
-              events.send({ type: 'agent:report', agentId: a.id, findings: a.findings });
+              yield* events.send({ type: 'agent:report', agentId: a.id, findings: a.findings });
             }
-            events.send({ type: 'agent:done', agentId: a.id });
+            yield* events.send({ type: 'agent:done', agentId: a.id });
             continue;
           }
 
@@ -330,14 +347,14 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
             a.state = 'done';
             a.toolCallCount++;
             totalToolCalls++;
-            events.send({ type: 'agent:tool_call', agentId: a.id, tool: 'report', args: tc.arguments });
-            events.send({ type: 'agent:report', agentId: a.id, findings: a.findings! });
-            events.send({ type: 'agent:done', agentId: a.id });
+            yield* events.send({ type: 'agent:tool_call', agentId: a.id, tool: 'report', args: tc.arguments });
+            yield* events.send({ type: 'agent:report', agentId: a.id, findings: a.findings! });
+            yield* events.send({ type: 'agent:done', agentId: a.id });
             continue;
           }
 
           // Fire-and-forget — dispatch tool without blocking the decode loop
-          dispatchTool(a, tc);
+          yield* dispatchTool(a, tc);
           a.rawOutput = '';
           continue;
         }
@@ -349,12 +366,12 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
           const entropy = a.branch.modelEntropy();
           const surprisal = a.branch.modelSurprisal(token);
           a.traceBuffer.push({ text, entropy, surprisal });
-          events.send({
+          yield* events.send({
             type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount,
             entropy, surprisal,
           });
         } else {
-          events.send({ type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount });
+          yield* events.send({ type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount });
         }
       }
 
@@ -426,16 +443,14 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
       counters,
     };
 
-    try {
-      yield* provide(result);
+    yield* provide(result);
+
     } finally {
       // Structured cleanup: prune all agent branches when scope exits.
-      // Must be in finally — provide() suspends via yield* suspend(),
-      // and halting jumps to finally blocks, skipping non-finally code.
+      // Covers setup errors, tick loop errors, and normal scope teardown
+      // (provide() suspends via yield* suspend(), halting jumps to finally).
       for (const a of agents) {
-        if (!a.branch.disposed) {
-          try { yield* call(() => a.branch.prune()); } catch { /* branch may already be pruned */ }
-        }
+        yield* call(() => a.branch.prune());
       }
     }
   });
diff --git a/src/agents/context.ts b/src/agents/context.ts
index 68eb4c9..3fc593a 100644
--- a/src/agents/context.ts
+++ b/src/agents/context.ts
@@ -1,7 +1,7 @@
 import { createContext } from 'effection';
 import type { SessionContext } from '../types';
 import type { BranchStore } from '../BranchStore';
-import type { Signal } from 'effection';
+import type { Channel } from 'effection';
 import type { AgentEvent } from './types';
 
 /**
@@ -26,12 +26,11 @@ export const Ctx = createContext<SessionContext>('lloyal.ctx');
 export const Store = createContext<BranchStore>('lloyal.store');
 
 /**
- * Effection context holding the agent event signal
+ * Effection context holding the agent event channel
  *
  * Set by {@link initAgents}. {@link useAgentPool} emits {@link AgentEvent}
- * values through this signal. Harnesses can extend the event type with
- * phase-level events for display subscribers.
+ * values through this channel via `yield* channel.send()`.
  *
  * @category Agents
  */
-export const Events = createContext<Signal<AgentEvent, void>>('lloyal.events');
+export const Events = createContext<Channel<AgentEvent, void>>('lloyal.events');
diff --git a/src/agents/init.ts b/src/agents/init.ts
index 9a5b8c6..d7ebbd6 100644
--- a/src/agents/init.ts
+++ b/src/agents/init.ts
@@ -1,5 +1,5 @@
-import { ensure, createSignal, call } from 'effection';
-import type { Operation, Signal } from 'effection';
+import { ensure, createChannel, call } from 'effection';
+import type { Operation, Channel } from 'effection';
 import { BranchStore } from '../BranchStore';
 import { Session } from '../Session';
 import type { SessionContext } from '../types';
@@ -11,21 +11,21 @@ import type { AgentEvent } from './types';
  *
  * @category Agents
  */
-export interface AgentHandle<E extends AgentEvent = AgentEvent> {
+export interface AgentHandle<E = AgentEvent> {
   /** The session context (model, tokenizer, KV cache) */
   ctx: SessionContext;
   /** Branch store for batched commit/prefill across branches */
   store: BranchStore;
   /** Session managing conversation trunk and branch lifecycle */
   session: Session;
-  /** Signal for subscribing to agent and harness events */
-  events: Signal<E, void>;
+  /** Channel for subscribing to agent events */
+  events: Channel<E, void>;
 }
 
 /**
  * Bootstrap the agent infrastructure and register structured cleanup
  *
- * Creates {@link BranchStore}, {@link Session}, and an event signal, then
+ * Creates {@link BranchStore}, {@link Session}, and an event channel, then
  * sets all three Effection contexts ({@link Ctx}, {@link Store},
  * {@link Events}) in the caller's scope. Cleanup runs on scope exit
  * (Ctrl-C, error, normal completion) via `ensure()`.
@@ -39,7 +39,7 @@ export interface AgentHandle<E extends AgentEvent = AgentEvent> {
  * are harness-specific decisions) and passes it in.
  *
  * @param ctx - Session context created via `createContext()`
- * @returns Agent handle with session, store, and event signal
+ * @returns Agent handle with session, store, and event channel
  *
  * @example Canonical bootstrap
  * ```typescript
@@ -58,16 +58,16 @@ export interface AgentHandle<E extends AgentEvent = AgentEvent> {
  *
  * @category Agents
  */
-export function* initAgents<E extends AgentEvent = AgentEvent>(
+export function* initAgents<E = AgentEvent>(
   ctx: SessionContext,
 ): Operation<AgentHandle<E>> {
   const store = new BranchStore(ctx);
   const session = new Session({ ctx, store });
-  const events: Signal<E, void> = createSignal<E, void>();
+  const events: Channel<E, void> = createChannel<E, void>();
 
   yield* Ctx.set(ctx);
   yield* Store.set(store);
-  yield* Events.set(events as unknown as Signal<AgentEvent, void>);
+  yield* Events.set(events as unknown as Channel<AgentEvent, void>);
 
   yield* ensure(function*() {
     yield* call(() => session.dispose());
diff --git a/src/agents/types.ts b/src/agents/types.ts
index 95ea828..f629227 100644
--- a/src/agents/types.ts
+++ b/src/agents/types.ts
@@ -298,7 +298,7 @@ export interface DivergeResult {
 /**
  * Events emitted by the runtime during agent pool execution
  *
- * Subscribe to these via the `events` signal from {@link initAgents}.
+ * Subscribe to these via the `events` channel from {@link initAgents}.
  * Harnesses can extend this union with phase-level events for display.
  *
  * @category Agents
diff --git a/test/agents.ts b/test/agents.ts
new file mode 100644
index 0000000..cf52d5e
--- /dev/null
+++ b/test/agents.ts
@@ -0,0 +1,272 @@
+/**
+ * Structured concurrency tests for the agent system
+ *
+ * Verifies Effection v4 SC guarantees: branch cleanup on all exit paths,
+ * scope teardown ordering, ensure() lifecycle.
+ *
+ * Usage:
+ *   npm run test:agents
+ *   LLAMA_TEST_MODEL=models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf npm run test:agents
+ */
+
+import * as path from 'node:path';
+import * as fs from 'node:fs';
+import { run, call, spawn, ensure, each } from 'effection';
+import { loadBinary } from '../dist/index.js';
+import type { SessionContext, NativeBinding } from '../dist/index.js';
+import {
+  initAgents, runAgents, withSharedRoot, Tool,
+} from '../dist/agents/index.js';
+import type { AgentPoolResult, JsonSchema } from '../dist/agents/index.js';
+
+const MODEL_PATH: string = process.env.LLAMA_TEST_MODEL // env override wins; resolved to an absolute path
+  ? path.resolve(process.env.LLAMA_TEST_MODEL)
+  : path.join(__dirname, '../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf');
+
+const CTX_SIZE = 2048; // passed as nCtx by createTestContext()
+
+if (!fs.existsSync(MODEL_PATH)) { // bail out before any test runs when the model file is missing
+  console.error('Test model not found:', MODEL_PATH);
+  process.exit(1);
+}
+
+console.log('=== lloyal.node SC Agent Tests ===\n');
+console.log(`Model: ${path.basename(MODEL_PATH)}`);
+console.log(`Size: ${(fs.statSync(MODEL_PATH).size / 1024 / 1024).toFixed(1)} MB\n`);
+
+let addon: NativeBinding; // native binding used to create session contexts
+try {
+  addon = require('../build/Release/lloyal.node') as NativeBinding; // prefer the local Release build
+} catch {
+  addon = loadBinary(); // no local build available: fall back to loadBinary()
+}
+
+let passed = 0; // tallies mutated by ok() / fail(), reported by main_()
+let failed = 0;
+
+function ok(msg: string): void { // record one passing check and print it
+  passed++;
+  console.log(`  [PASS] ${msg}`);
+}
+
+function fail(msg: string): void { // record one failing check and print it; does not throw
+  failed++;
+  console.log(`  [FAIL] ${msg}`);
+}
+
+function assert(condition: boolean, msg: string): void { // records the outcome of one check under `msg`
+  if (condition) ok(msg);
+  else { fail(msg); throw new Error(msg); } // count the failure first, then throw so the calling test stops
+}
+
+// ── Test tools ────────────────────────────────────────────────────
+
+class ThrowingTool extends Tool<Record<string, unknown>> { // test double registered as 'explode'; execute() always fails
+  readonly name = 'explode';
+  readonly description = 'A tool that always throws';
+  readonly parameters: JsonSchema = {
+    type: 'object',
+    properties: { input: { type: 'string' } },
+  };
+  async execute(): Promise<unknown> {
+    throw new Error('intentional_tool_error'); // async method, so callers observe a rejected promise
+  }
+}
+
+// ── Helpers ────────────────────────────────────────────────────────
+
+async function createTestContext(): Promise<SessionContext> { // new context on MODEL_PATH via the loaded addon
+  return addon.createContext({
+    modelPath: MODEL_PATH,
+    nCtx: CTX_SIZE,
+    nThreads: 4,
+    nSeqMax: 4, // presumably the cap on concurrent sequences (root + agent branches) — TODO confirm
+    typeK: 'f16',
+    typeV: 'f16',
+  });
+}
+
+function makeTasks(parent: unknown, count: number) { // builds `count` task objects that all share `parent`
+  return Array.from({ length: count }, (_, i) => ({
+    systemPrompt: 'You are a test agent.',
+    content: `Test task ${i}`, // only the content differs between tasks
+    parent,
+  }));
+}
+
+/** Bootstrap agent infra via initAgents, then spawn a consumer that drains (discards) every event to prevent backpressure */
+function* setupTest(ctx: SessionContext) {
+  const { events } = yield* initAgents(ctx);
+  yield* spawn(function*() { // background task; each event is read and dropped
+    for (const _ev of yield* each(events)) {
+      yield* each.next(); // each() requires this call to advance to the next event
+    }
+  });
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// TEST 1: ensure() cleanup — runs on scope exit, normal or error
+// ═══════════════════════════════════════════════════════════════════
+
+async function testEnsureCleanup(): Promise<void> { // exercises ensure() directly; no model context or agents involved
+  console.log('\n--- ensure() cleanup: runs on normal exit and on error ---');
+
+  // Test A: ensure runs on normal exit
+  let cleanupRanNormal = false;
+  await run(function*() {
+    yield* ensure(() => { cleanupRanNormal = true; });
+  });
+  assert(cleanupRanNormal, 'ensure() ran on normal scope exit');
+
+  // Test B: ensure runs on error exit
+  let cleanupRanError = false;
+  try {
+    await run(function*() {
+      yield* ensure(() => { cleanupRanError = true; });
+      throw new Error('intentional_test_error');
+    });
+  } catch {
+    // expected: the operation throws intentional_test_error; only the flag below matters
+  }
+  assert(cleanupRanError, 'ensure() ran on error scope exit');
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// TEST 2: Normal lifecycle — branches pruned after runAgents returns
+// ═══════════════════════════════════════════════════════════════════
+
+async function testNormalLifecycle(): Promise<void> { // happy path: two tool-less agents under one shared root
+  console.log('\n--- Normal lifecycle: branches pruned after runAgents ---');
+
+  await run(function*() {
+    const ctx: SessionContext = yield* call(() => createTestContext());
+    yield* setupTest(ctx);
+
+    yield* withSharedRoot(
+      { systemPrompt: 'You are a test agent.' },
+      function*(root, prefixLen) {
+        assert(prefixLen > 0, `shared prefix has tokens (${prefixLen})`);
+
+        const pool: AgentPoolResult = yield* runAgents({
+          tasks: makeTasks(root, 2),
+          tools: new Map(), // empty tool registry
+          maxTurns: 1,
+        });
+
+        assert(pool.agents.length === 2, 'pool has 2 agents');
+        assert(root.children.length === 0, 'agent branches pruned before body returns'); // checked while still inside the body
+
+        return pool;
+      },
+    );
+
+    ok('withSharedRoot completed without error'); // reached only if nothing above threw
+  });
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// TEST 3: scoped() cleanup — runAgents prunes before returning
+// ═══════════════════════════════════════════════════════════════════
+
+async function testScopedCleanup(): Promise<void> { // compares root.children before and after runAgents
+  console.log('\n--- Scoped cleanup: runAgents prunes before returning to caller ---');
+
+  await run(function*() {
+    const ctx: SessionContext = yield* call(() => createTestContext());
+    yield* setupTest(ctx);
+
+    yield* withSharedRoot(
+      { systemPrompt: 'You are a test agent.' },
+      function*(root) {
+        const childCountBefore = root.children.length;
+        assert(childCountBefore === 0, 'root starts with no children');
+
+        const pool = yield* runAgents({
+          tasks: makeTasks(root, 2),
+          tools: new Map(), // empty tool registry
+          maxTurns: 1,
+        });
+
+        // Critical SC assertion: scoped() in runAgents must have torn
+        // down the pool scope and pruned agent branches BEFORE returning.
+        const childCountAfter = root.children.length;
+        assert(childCountAfter === 0, `scoped() pruned all children before returning (was ${childCountBefore}, now ${childCountAfter})`);
+
+        return pool;
+      },
+    );
+
+    ok('scoped() teardown ordering correct'); // reached only if nothing above threw
+  });
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// TEST 4: Tool error — branches pruned, error does not crash pool
+// ═══════════════════════════════════════════════════════════════════
+
+async function testToolErrorCleanup(): Promise<void> { // one agent, one always-throwing tool; the pool must still finish
+  console.log('\n--- Tool error: branches pruned, pool completes gracefully ---');
+
+  await run(function*() {
+    const ctx: SessionContext = yield* call(() => createTestContext());
+    yield* setupTest(ctx);
+    const failedBefore = failed; // snapshot: lets the catch below recognise assert() failures
+
+    try {
+      yield* withSharedRoot(
+        { systemPrompt: 'You are a test agent. Always call the explode tool.' },
+        function*(root) {
+          const toolMap = new Map<string, Tool>([['explode', new ThrowingTool()]]);
+          const toolsJson = JSON.stringify([{ // tool schema handed to the task, mirroring ThrowingTool
+            type: 'function',
+            function: {
+              name: 'explode',
+              description: 'A tool that always throws',
+              parameters: { type: 'object', properties: { input: { type: 'string' } } },
+            },
+          }]);
+
+          const pool = yield* runAgents({
+            tasks: [{
+              systemPrompt: 'You are a test agent. Call the explode tool immediately.',
+              content: 'Do it now.',
+              tools: toolsJson,
+              parent: root,
+            }],
+            tools: toolMap,
+            maxTurns: 2,
+          });
+
+          assert(root.children.length === 0, 'agent branches pruned after tool error');
+          assert(pool.agents.length === 1, 'pool has 1 agent');
+          return pool;
+        },
+      );
+
+      ok('withSharedRoot completed — tool error did not crash the pool');
+    } catch (err) {
+      // Tool errors should be handled internally (agent → done); assert() failures are already counted.
+      if (failed === failedBefore) fail(`unexpected error escaped pool: ${(err as Error).message}`);
+    }
+  });
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// RUNNER
+// ═══════════════════════════════════════════════════════════════════
+
+async function main_(): Promise<void> { // runs the four tests one after another, then prints the tally
+  await testEnsureCleanup();
+  await testNormalLifecycle();
+  await testScopedCleanup();
+  await testToolErrorCleanup();
+
+  console.log(`\n${'='.repeat(40)}`);
+  console.log(`Results: ${passed} passed, ${failed} failed`);
+  if (failed > 0) process.exit(1); // non-zero exit code when any check failed
+}
+
+main_().catch((err: unknown) => { // anything main_() rejects with (including a throw from assert()) lands here
+  console.error(`\nFatal: ${(err as Error).message}\n${(err as Error).stack}`);
+  process.exit(1);
+});
diff --git a/typedoc.json b/typedoc.json
index fb2354e..763d6ac 100644
--- a/typedoc.json
+++ b/typedoc.json
@@ -1,7 +1,7 @@
 {
   "$schema": "https://typedoc.org/schema.json",
   "plugin": ["typedoc-rhineai-theme"],
-  "entryPoints": ["dist/index.d.ts"],
+  "entryPoints": ["src/index.ts"],
   "out": "docs/api",
   "name": "lloyal.node API Reference",
   "includeVersion": true,
@@ -26,6 +26,7 @@
     "Sampling",
     "Chat",
     "Branching",
+    "Agents",
     "*"
   ],
   "sort": ["kind", "instance-first", "required-first", "alphabetical"],

From c9838744492dbc5efb9f40b5a764e063733d94b8 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Mon, 2 Mar 2026 14:33:39 +1100
Subject: [PATCH 12/17] feat(agents): fix gh workflows

---
 .github/workflows/release.yml | 8 +++++++-
 .github/workflows/tests.yml   | 6 ++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 608301a..0c3511b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -385,7 +385,13 @@ jobs:
         env:
           NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
 
-      # Publish main package
+      # Build TypeScript before publishing main package
+      - name: Install dependencies
+        run: npm install --ignore-scripts
+
+      - name: Build TypeScript
+        run: npm run build:ts
+
       - name: Sync package versions
         run: node scripts/sync-versions.js
 
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index cdad345..cab8485 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -132,6 +132,12 @@ jobs:
         with:
           node-version: 24
 
+      - name: Install dependencies
+        run: npm install --ignore-scripts
+
+      - name: Build TypeScript
+        run: npm run build:ts
+
       - name: Pack package
         run: npm pack
 

From 346f62db4e2a9e048fadb9986a1c123dbb94ef66 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Mon, 2 Mar 2026 20:39:48 +1100
Subject: [PATCH 13/17] feat(agents): graceful termination

---
 examples/deep-research/harness.ts             |  3 +
 examples/deep-research/main.ts                |  5 +-
 .../tasks/graceful-termination.md             |  1 +
 examples/deep-research/tui.ts                 | 19 +++++-
 src/agents/agent-pool.ts                      | 66 ++++++++++++++++++-
 src/agents/types.ts                           |  6 ++
 src/index.ts                                  |  3 +-
 src/types.ts                                  | 24 ++++---
 8 files changed, 106 insertions(+), 21 deletions(-)
 create mode 100644 examples/deep-research/tasks/graceful-termination.md

diff --git a/examples/deep-research/harness.ts b/examples/deep-research/harness.ts
index 7bc0e4b..3d34457 100644
--- a/examples/deep-research/harness.ts
+++ b/examples/deep-research/harness.ts
@@ -23,6 +23,7 @@ const PLAN = loadTask('plan');
 const RESEARCH = loadTask('research');
 const VERIFY = loadTask('verify');
 const EVAL = loadTask('eval');
+const GRACE = loadTask('graceful-termination');
 
 // ── Options ──────────────────────────────────────────────────────
 
@@ -133,6 +134,7 @@ function* research(
       const result = yield* runAgents({
         tasks: agentTasks(questions, opts.toolsJson, root),
         tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
+        nCtx: opts.nCtx, gracePrompt: GRACE.system,
       });
       return { result, prefixLen };
     },
@@ -153,6 +155,7 @@ function* warmResearch(
   const pool = yield* runAgents({
     tasks: agentTasks(questions, opts.toolsJson, opts.session.trunk!, Date.now()),
     tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
+    nCtx: opts.nCtx, gracePrompt: GRACE.system,
   });
 
   const timeMs = performance.now() - t;
diff --git a/examples/deep-research/main.ts b/examples/deep-research/main.ts
index d60210d..931a905 100644
--- a/examples/deep-research/main.ts
+++ b/examples/deep-research/main.ts
@@ -24,7 +24,7 @@ import {
 import { createContext } from "../../dist";
 import type { SessionContext } from "../../dist";
 import { initAgents } from "../../dist/agents";
-import { c, log, setJsonlMode, fmtSize, createView } from "./tui";
+import { c, log, setJsonlMode, setVerboseMode, fmtSize, createView } from "./tui";
 import type { WorkflowEvent } from "./tui";
 import { loadResources, chunkResources } from "./resources/files";
 import { createReranker } from "./reranker";
@@ -74,6 +74,7 @@ if (!corpusDir) {
 }
 
 if (jsonlMode) setJsonlMode(true);
+if (verbose) setVerboseMode(true);
 if (!verbose && !jsonlMode) {
   try {
     fs.closeSync(2);
@@ -85,7 +86,7 @@ if (!verbose && !jsonlMode) {
 
 const AGENT_COUNT = 3;
 const VERIFY_COUNT = 3;
-const MAX_TOOL_TURNS = 6;
+const MAX_TOOL_TURNS = 20;
 
 // ── Main ─────────────────────────────────────────────────────────
 
diff --git a/examples/deep-research/tasks/graceful-termination.md b/examples/deep-research/tasks/graceful-termination.md
new file mode 100644
index 0000000..91fcaf8
--- /dev/null
+++ b/examples/deep-research/tasks/graceful-termination.md
@@ -0,0 +1 @@
+STOP. You MUST call the report tool RIGHT NOW with a brief summary (under 200 words) of what you found. Do NOT call any other tool. Do NOT list every detail — summarize the key findings only.
\ No newline at end of file
diff --git a/examples/deep-research/tui.ts b/examples/deep-research/tui.ts
index 71cec5d..ddaf442 100644
--- a/examples/deep-research/tui.ts
+++ b/examples/deep-research/tui.ts
@@ -32,8 +32,10 @@ export type WorkflowEvent = AgentEvent | StepEvent;
 // ── Mode + color ─────────────────────────────────────────────────
 
 let _jsonlMode = false;
+let _verboseMode = false;
 
 export function setJsonlMode(on: boolean): void { _jsonlMode = on; }
+export function setVerboseMode(on: boolean): void { _verboseMode = on; }
 
 const isTTY = process.stdout.isTTY;
 
@@ -156,10 +158,21 @@ function agentHandler(state: ViewState): ViewHandler {
       case 'agent:produce': {
         state.agentText.set(ev.agentId, (state.agentText.get(ev.agentId) ?? '') + ev.text);
         state.agentStatus.set(ev.agentId, { state: 'gen', tokenCount: ev.tokenCount, detail: '' });
-        renderStatus(state);
+        if (_verboseMode) {
+          const lbl = label(state, ev.agentId);
+          // First token for this agent — print header
+          if (ev.tokenCount === 1) {
+            statusClear();
+            process.stdout.write(`\n    ${c.dim}───${c.reset} ${c.yellow}${lbl}${c.reset} ${c.dim}tokens${c.reset} ${c.dim}───${c.reset}\n    `);
+          }
+          process.stdout.write(ev.text);
+        } else {
+          renderStatus(state);
+        }
         break;
       }
       case 'agent:tool_call': {
+        if (_verboseMode) process.stdout.write('\n');
         state.agentText.delete(ev.agentId);
         state.agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: '' });
         emit('tool_call', { agentId: ev.agentId, toolName: ev.tool, arguments: ev.args });
@@ -231,7 +244,9 @@ function agentHandler(state: ViewState): ViewHandler {
         log(`    ${c.dim}\u2502${c.reset}`);
         break;
       }
-      case 'agent:done': break;
+      case 'agent:done':
+        if (_verboseMode) process.stdout.write('\n');
+        break;
     }
   };
 }
diff --git a/src/agents/agent-pool.ts b/src/agents/agent-pool.ts
index ac1762c..36d5eb3 100644
--- a/src/agents/agent-pool.ts
+++ b/src/agents/agent-pool.ts
@@ -1,7 +1,7 @@
 import { resource, call, action, useScope, createSignal, spawn, each } from 'effection';
 import type { Operation, Scope, Channel } from 'effection';
 import type { Branch } from '../Branch';
-import { GrammarTriggerType, type GrammarTrigger, type ParsedToolCall, type SessionContext } from '../types';
+import { CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, GrammarTriggerType, type GrammarTrigger, type ParsedToolCall, type SessionContext } from '../types';
 import type { BranchStore } from '../BranchStore';
 import { Ctx, Store, Events } from './context';
 import { buildToolResultDelta } from './deltas';
@@ -39,6 +39,7 @@ interface AgentInternal {
   tokenCount: number;
   toolCallCount: number;
   turns: number;
+  graceUsed: boolean;
   findings: string | null;
   traceBuffer: TraceToken[];
 }
@@ -92,6 +93,9 @@ async function setupAgent(
   const tools = task.tools ? ensureReportTool(task.tools) : undefined;
   const fmtOpts = tools ? { tools } : {};
   const fmt = await ctx.formatChat(JSON.stringify(messages), fmtOpts);
+  if (tools && (fmt.format === CHAT_FORMAT_CONTENT_ONLY || fmt.format === CHAT_FORMAT_GENERIC)) {
+    throw new Error('Model does not support tool calling. Please use a model with native tool support (e.g. Qwen3, Llama 3.x, Mistral).');
+  }
   const sep = ctx.getTurnSeparator();
   const suffixTokens = [...sep, ...await ctx.tokenize(fmt.prompt, false)];
   if (task.seed != null) branch.reseedSampler(task.seed);
@@ -114,6 +118,7 @@ async function setupAgent(
       tokenCount: 0,
       toolCallCount: 0,
       turns: 0,
+      graceUsed: false,
       findings: null,
       traceBuffer: [],
     },
@@ -183,7 +188,7 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
         yield* each.next();
       }
     });
-    const { tasks, tools, maxTurns = 100, trace = false } = opts;
+    const { tasks, tools, maxTurns = 100, nCtx = 0, gracePrompt, trace = false } = opts;
 
     // ── Setup: fork branches, collect suffix tokens ──────────
     const agents: AgentInternal[] = [];
@@ -295,13 +300,36 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
       });
     }
 
+    // Context pressure thresholds (in tokens)
+    const GRACE_RESERVE = 1024;    // room for grace prompt + report generation
+    const CRITICAL_RESERVE = 128;  // absolute minimum — hard stop to prevent crash
+
     // ── Three-phase tick loop ────────────────────────────────
     for (;;) {
       // -- Phase 1: PRODUCE -- sample from active agents
+
+      // Compute aggregate KV remaining once per tick.
+      // All branches (including done, not yet pruned) share one nCtx-sized KV cache.
+      // Shared prefix is counted once, approximated by min(position) — NOTE(review): may undercount divergent tails.
+      let kvRemaining = Infinity;
+      if (nCtx > 0) {
+        const positions = agents.map(a => a.branch.position);
+        const sharedPrefix = Math.min(...positions);
+        const totalKV = positions.reduce((s, p) => s + p, 0) - (positions.length - 1) * sharedPrefix;
+        kvRemaining = nCtx - totalKV;
+      }
+
       const entries: [Branch, number][] = [];
       for (const a of agents) {
         if (a.state !== 'generating') continue;
 
+        // Critical context pressure — hard stop before produceSync to prevent llama_decode crash
+        if (kvRemaining < CRITICAL_RESERVE) {
+          a.state = 'done';
+          yield* events.send({ type: 'agent:done', agentId: a.id });
+          continue;
+        }
+
         const { token, text, isStop } = a.branch.produceSync();
         if (isStop) {
           const parsed = ctx.parseChatOutput(a.rawOutput, a.fmt.format, {
@@ -311,7 +339,7 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
           });
 
           const tc = parsed.toolCalls[0];
-          if (!tc || a.turns >= maxTurns) {
+          if (!tc) {
             a.state = 'done';
             if (!a.findings && a.toolCallCount > 0 && parsed.content) {
               a.findings = parsed.content;
@@ -321,6 +349,38 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
             continue;
           }
 
+          // Grace turn: context pressure or maxTurns reached, agent wants to call a tool
+          // that isn't report. Inject gracePrompt so the agent can synthesize findings.
+          // If grace already used (or no gracePrompt configured), hard-cut.
+          const contextPressure = kvRemaining < GRACE_RESERVE;
+          const shouldGrace = (a.turns >= maxTurns || contextPressure) && tc.name !== 'report';
+
+          if (shouldGrace) {
+            if (a.graceUsed || !gracePrompt) {
+              a.state = 'done';
+              yield* events.send({ type: 'agent:done', agentId: a.id });
+              continue;
+            }
+            a.graceUsed = true;
+            const callId = tc.id || `call_${a.toolCallCount}`;
+            a.turns++;
+            a.state = 'awaiting_tool';
+            pendingToolCount++;
+            scope.run(function*() {
+              try {
+                const prefillTokens: number[] = yield* call(() =>
+                  buildToolResultDelta(ctx, JSON.stringify({ error: gracePrompt }), callId)
+                );
+                settledBuffer.push({ agentId: a.id, prefillTokens, toolName: tc.name });
+              } finally {
+                pendingToolCount--;
+                if (wakeIdle) { wakeIdle(); wakeIdle = null; }
+              }
+            });
+            a.rawOutput = '';
+            continue;
+          }
+
           // Report tool special case — reject if no prior research
           if (tc.name === 'report') {
             if (a.toolCallCount === 0) {
diff --git a/src/agents/types.ts b/src/agents/types.ts
index f629227..021c425 100644
--- a/src/agents/types.ts
+++ b/src/agents/types.ts
@@ -139,6 +139,12 @@ export interface AgentPoolOptions {
   params?: SamplingParams;
   /** Maximum tool-call turns per agent before forced termination */
   maxTurns?: number;
+  /** Context window size — enables context-pressure detection when set.
+   *  Agents are gracefully stopped when remaining capacity drops below threshold. */
+  nCtx?: number;
+  /** Message injected as a tool error when an agent must stop and report.
+   *  Triggered by context pressure or maxTurns. If omitted, agents are hard-cut. */
+  gracePrompt?: string;
   /** Enable per-token entropy/surprisal on `agent:produce` events */
   trace?: boolean;
 }
diff --git a/src/index.ts b/src/index.ts
index b0787c5..1ee3f6d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -287,7 +287,8 @@ export type {
 } from './agents/index';
 
 // ── Enums + types from types.ts ─────────────────────────────────
-export { PoolingType, ChatFormat, ReasoningFormat, GrammarTriggerType } from './types';
+export { PoolingType, CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, ReasoningFormat, GrammarTriggerType } from './types';
+export type { ChatFormat } from './types';
 export type {
   GpuVariant,
   KvCacheType,
diff --git a/src/types.ts b/src/types.ts
index 1fa511d..9d0268f 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -97,24 +97,22 @@ export enum PoolingType {
  * Chat format detected by the template engine
  *
  * Identifies how the model formats tool calls, reasoning blocks, and content.
- * Returned by {@link SessionContext.formatChat | formatChat()} in
- * {@link FormattedChatResult.format} and consumed by
+ * The value is an opaque identifier returned by
+ * {@link SessionContext.formatChat | formatChat()} and consumed by
  * {@link SessionContext.parseChatOutput | parseChatOutput()}.
  *
- * You generally don't need to inspect these values directly --
- * just pass them through from the formatChat result to parseChatOutput.
- *
- * Only commonly-used values are listed. The full set matches llama.cpp's
- * `common_chat_format` enum (30+ formats).
+ * Maps 1:1 to llama.cpp's `common_chat_format` enum (30+ values).
+ * Treat as an opaque number — pass it through; compare only against the `CHAT_FORMAT_*` constants.
  *
  * @category Chat
  */
-export enum ChatFormat {
-  /** Plain content, no special formatting */
-  CONTENT_ONLY = 0,
-  /** Generic tool call format */
-  GENERIC = 1,
-}
+export type ChatFormat = number;
+
+/** Model template has no tool/structured-output support. */
+export const CHAT_FORMAT_CONTENT_ONLY: ChatFormat = 0;
+
+/** llama.cpp's generic JSON fallback — imposes format the model wasn't trained on. */
+export const CHAT_FORMAT_GENERIC: ChatFormat = 1;
 
 /**
  * Reasoning/thinking block format

From c9efd9eae76e58386fca39c932b32f8c962c2733 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 5 Mar 2026 02:10:14 +1100
Subject: [PATCH 14/17] =?UTF-8?q?feat(agents):=20KV=20pressure=20refactor?=
 =?UTF-8?q?=20=E2=80=94=20softLimit/hardLimit,=20reporter=20sub-agents?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace graceThreshold/criticalThreshold with two-concept pressure model:
- softLimit: remaining KV floor for new work, enforced at SETTLE (reject tool results), PRODUCE (deny non-terminal tools), and INIT (drop agents)
- hardLimit: crash-prevention floor, kill before produceSync()

Fix reporter sub-agents trapped in infinite rejection loop:
- hasNonTerminalTools checked pool registry instead of agent's tools, blocking reporters from calling report() as first action
- Reporter pool now receives report-only tool registry
- Research softLimit=2048 reserves KV for reporter suffix prefill (~350 tokens per reporter from chat template + tools JSON)

Also: keep stderr alive with --trace flag, agreement scoring, report task prompt, TUI sub-agent rendering.
---
 examples/deep-research/agreement.ts           | 142 ++++++++
 examples/deep-research/harness.ts             |  75 ++++-
 examples/deep-research/main.ts                |   5 +-
 .../tasks/graceful-termination.md             |   1 -
 examples/deep-research/tasks/report.md        |   3 +
 examples/deep-research/tools/index.ts         |   4 +
 examples/deep-research/tools/report.ts        |  14 +
 examples/deep-research/tui.ts                 |  94 +++++-
 liblloyal                                     |   2 +-
 src/Branch.ts                                 |  57 +++-
 src/Session.ts                                |   4 +-
 src/SessionContext.cpp                        | 180 +++++++---
 src/SessionContext.hpp                        |  22 +-
 src/agents/agent-pool.ts                      | 311 +++++++++++-------
 src/agents/deltas.ts                          |  20 +-
 src/agents/diverge.ts                         | 140 ++++----
 src/agents/generate.ts                        |   4 +-
 src/agents/index.ts                           |   3 +-
 src/agents/shared-root.ts                     |   8 +-
 src/agents/toolkit.ts                         |   1 -
 src/agents/types.ts                           |  76 ++++-
 src/types.ts                                  |  44 +++
 22 files changed, 902 insertions(+), 308 deletions(-)
 create mode 100644 examples/deep-research/agreement.ts
 delete mode 100644 examples/deep-research/tasks/graceful-termination.md
 create mode 100644 examples/deep-research/tasks/report.md
 create mode 100644 examples/deep-research/tools/report.ts

diff --git a/examples/deep-research/agreement.ts b/examples/deep-research/agreement.ts
new file mode 100644
index 0000000..58380e1
--- /dev/null
+++ b/examples/deep-research/agreement.ts
@@ -0,0 +1,142 @@
+/**
+ * Per-section agreement analysis via bigram Jaccard similarity.
+ *
+ * Pure string math — no model calls. Used by the verify phase to quantify
+ * where N diverge attempts agree (confident) vs disagree (hallucination risk).
+ */
+
+export interface SectionAgreement { // agreement score for one matched section across attempts
+  label: string;       // section header or "¶1", "¶2", etc.
+  score: number;       // 0–1 average pairwise bigram Jaccard; 0 when fewer than two attempts contain the section
+}
+
+export interface AgreementResult { // value returned by computeAgreement
+  overall: number;                  // mean of section scores (0 when no sections; 1 when fewer than two outputs were given)
+  sections: SectionAgreement[];     // per-section breakdown
+}
+
+// ── Internals ─────────────────────────────────────────────────────
+
+interface Section { // one header- or paragraph-delimited piece of a single output
+  key: string;    // normalized header for matching, or positional index ("0", "1", …) for paragraphs
+  label: string;  // display label
+  body: string;   // section text (trimmed)
+}
+
+const HEADER_RE = /^#{1,4}\s+/m; // some line starts with 1–4 '#' followed by whitespace (markdown-style header)
+
+function normalizeKey(header: string): string { // canonical form of a header, used as the key for matching sections
+  return header.toLowerCase().replace(/[^\w\s]/g, '').trim(); // lowercase, drop chars that are neither \w nor whitespace, trim ends
+}
+
+/**
+ * Split markdown into sections keyed by ATX headers (#–####); headerless text,
+ * or text where no header parses, falls back to paragraph sections. Text before
+ * the first header is not attached to any section.
+ */
+function extractSections(text: string): Section[] {
+  if (!HEADER_RE.test(text)) return paragraphSections(text);
+
+  // [ \t]+ (not \s+) keeps the title on the header's own line: \s also matches
+  // "\n", so an empty heading ("##\n") used to swallow the next line as its title.
+  const parts = text.split(/^(#{1,4}[ \t]+.+)$/m).filter(Boolean);
+  const sections: Section[] = [];
+  for (let i = 0; i < parts.length; i++) {
+    const title = parts[i].match(/^#{1,4}[ \t]+(.+)$/)?.[1]?.trim();
+    if (title === undefined) continue;
+    // The chunk after a header is its body; consume it so it is not re-examined.
+    sections.push({ key: normalizeKey(title), label: title, body: (parts[++i] ?? '').trim() });
+  }
+  return sections.length ? sections : paragraphSections(text);
+}
+
+function paragraphSections(text: string): Section[] {
+  // Blank-line-separated paragraphs become positional sections keyed "0", "1", … (labels ¶1, ¶2, …).
+  const paragraphs = text.split(/\n{2,}/).map(chunk => chunk.trim()).filter(chunk => chunk !== '');
+  // Keys are assigned after empty paragraphs are dropped, so numbering has no gaps.
+  return paragraphs.map((body, index) => ({ key: `${index}`, label: `¶${index + 1}`, body }));
+}
+
+/**
+ * Adjacent word pairs of `text` (whitespace-split, case-sensitive). A lone word is kept as
+ * its own element — otherwise two different one-word bodies both map to ∅ and score 1.
+ */
+function wordBigrams(text: string): Set<string> {
+  const words = text.split(/\s+/).filter(Boolean);
+  return new Set(words.length < 2 ? words : words.slice(1).map((w, i) => `${words[i]} ${w}`));
+}
+
+/** Jaccard index |A∩B| / |A∪B| of two string sets; two empty sets count as identical (1). */
+function jaccard(a: Set<string>, b: Set<string>): number {
+  if (a.size + b.size === 0) return 1;
+  // Iterate the smaller set and probe the larger one.
+  const [probe, target] = a.size > b.size ? [b, a] : [a, b];
+  const shared = [...probe].filter(item => target.has(item)).length;
+  return shared / (a.size + b.size - shared);
+}
+
+/**
+ * Mean bigram-Jaccard similarity over every unordered pair of `texts`.
+ * Fewer than two texts leaves nothing to compare and is reported as 1.
+ */
+function averagePairwiseJaccard(texts: string[]): number {
+  const count = texts.length;
+  if (count < 2) return 1;
+  const sets = texts.map(t => wordBigrams(t));
+  let total = 0;
+  // Each pair (first, second) with first before second is scored exactly once.
+  sets.forEach((first, i) => sets.slice(i + 1).forEach(second => { total += jaccard(first, second); }));
+  return total / ((count * (count - 1)) / 2);
+}
+
+// ── Public API ────────────────────────────────────────────────────
+
+/**
+ * Score how closely N generated outputs agree, section by section.
+ *
+ * Outputs are split into sections; when any output has markdown headers the
+ * sections are matched by normalized header, otherwise by paragraph position.
+ * A section found (non-empty) in fewer than two outputs scores 0.
+ *
+ * @param outputs - Raw text of each attempt
+ * @returns Per-section scores and their mean; overall 1 with no sections if fewer than two outputs
+ */
+export function computeAgreement(outputs: string[]): AgreementResult {
+  if (outputs.length < 2) return { overall: 1, sections: [] };
+
+  const perOutput = outputs.map(text => extractSections(text));
+
+  // Score one aligned slot: candidates holds that slot's body from each output.
+  const scoreSlot = (candidates: Array<string | undefined>): number => {
+    const present = candidates.filter((b): b is string => b != null && b.length > 0);
+    return present.length >= 2 ? averagePairwiseJaccard(present) : 0;
+  };
+
+  const finish = (sections: SectionAgreement[]): AgreementResult => {
+    const total = sections.reduce((acc, s) => acc + s.score, 0);
+    return { overall: sections.length ? total / sections.length : 0, sections };
+  };
+
+  // Paragraph fallback keys start at "0"; any other leading key means real headers.
+  const headerMode = perOutput.some(secs => secs.length > 0 && secs[0].key !== '0');
+
+  if (!headerMode) {
+    // Positional matching for headerless content
+    const slots = Math.max(...perOutput.map(secs => secs.length));
+    return finish(Array.from({ length: slots }, (_, i) => ({
+      label: `¶${i + 1}`,
+      score: scoreSlot(perOutput.map(secs => secs[i]?.body)),
+    })));
+  }
+
+  // Union of section keys across outputs, remembering the first label seen for each.
+  const labels = new Map<string, string>();
+  for (const secs of perOutput) {
+    for (const s of secs) if (!labels.has(s.key)) labels.set(s.key, s.label);
+  }
+
+  return finish([...labels].map(([key, label]) => ({
+    label,
+    score: scoreSlot(perOutput.map(secs => secs.find(s => s.key === key)?.body)),
+  })));
+}
diff --git a/examples/deep-research/harness.ts b/examples/deep-research/harness.ts
index 3d34457..8b6f687 100644
--- a/examples/deep-research/harness.ts
+++ b/examples/deep-research/harness.ts
@@ -1,15 +1,17 @@
 import * as fs from 'node:fs';
 import * as path from 'node:path';
-import { call } from 'effection';
+import { call, scoped } from 'effection';
 import type { Operation, Channel } from 'effection';
 import { Branch, Session } from '../../dist';
 import type { SessionContext } from '../../dist';
 import {
   Ctx,
-  generate, runAgents, diverge, withSharedRoot,
+  generate, useAgentPool, runAgents, diverge, withSharedRoot,
 } from '../../dist/agents';
 import type { Tool, AgentPoolResult, DivergeResult } from '../../dist/agents';
 import type { WorkflowEvent, OpTiming } from './tui';
+import { computeAgreement } from './agreement';
+import { reportTool } from './tools';
 
 /** Load a task prompt file. Convention: system prompt above `---`, user content below. */
 function loadTask(name: string): { system: string; user: string } {
@@ -23,7 +25,7 @@ const PLAN = loadTask('plan');
 const RESEARCH = loadTask('research');
 const VERIFY = loadTask('verify');
 const EVAL = loadTask('eval');
-const GRACE = loadTask('graceful-termination');
+const REPORT = loadTask('report');
 
 // ── Options ──────────────────────────────────────────────────────
 
@@ -34,7 +36,6 @@ export interface WorkflowOpts {
   agentCount: number;
   verifyCount: number;
   maxTurns: number;
-  nCtx: number;
   trace: boolean;
   events: Channel<WorkflowEvent, void>;
 }
@@ -51,6 +52,38 @@ function agentTasks(questions: string[], toolsJson: string, parent: Branch, seed
   }));
 }
 
+const reportOnlyTools = JSON.stringify([reportTool.schema]); // tools JSON listing only the report tool; passed to reporter tasks
+
+/**
+ * Recovery pass for agents that were cut off before calling `report`.
+ *
+ * Every live agent without findings gets a reporter forked from its branch,
+ * restricted to the `report` tool; a reporter's findings are copied back onto
+ * the agent it was forked from. No-op when every agent already has findings.
+ */
+function* reportPass(
+  pool: AgentPoolResult,
+  opts: WorkflowOpts,
+): Operation<void> {
+  const live = pool.agents.filter(a => !a.branch.disposed);
+  const hardCut = live.filter(a => !a.findings);
+  if (!hardCut.length) return;
+
+  // Free KV from successful agents before spawning reporters
+  live.filter(a => a.findings).forEach(a => a.branch.pruneSync());
+
+  const tasks = hardCut.map(a => ({
+    systemPrompt: REPORT.system, content: REPORT.user, tools: reportOnlyTools, parent: a.branch,
+  }));
+  const { agents: reporters } = yield* runAgents({
+    tasks, tools: new Map([['report', reportTool]]), terminalTool: 'report',
+    trace: opts.trace, pressure: { softLimit: 200, hardLimit: 64 },
+  });
+
+  // Reporters are matched to agents by task index (presumably runAgents preserves task order — confirm).
+  hardCut.forEach((a, i) => { if (reporters[i]?.findings) a.findings = reporters[i].findings; });
+}
+
 // ── Operations ───────────────────────────────────────────────────
 
 function* plan(query: string, opts: WorkflowOpts): Operation<{ questions: string[]; tokenCount: number; timeMs: number }> {
@@ -131,12 +164,15 @@ function* research(
   const { result: pool, prefixLen: sharedPrefixLength } = yield* withSharedRoot(
     { systemPrompt: RESEARCH.system, tools: opts.toolsJson },
     function*(root, prefixLen) {
-      const result = yield* runAgents({
+      const pool = yield* useAgentPool({
         tasks: agentTasks(questions, opts.toolsJson, root),
         tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
-        nCtx: opts.nCtx, gracePrompt: GRACE.system,
+        terminalTool: 'report',
+        pressure: { softLimit: 2048 },
       });
-      return { result, prefixLen };
+
+      yield* reportPass(pool, opts);
+      return { result: pool, prefixLen };
     },
   );
 
@@ -152,10 +188,16 @@ function* warmResearch(
   yield* opts.events.send({ type: 'research:start', agentCount: questions.length });
   const t = performance.now();
 
-  const pool = yield* runAgents({
-    tasks: agentTasks(questions, opts.toolsJson, opts.session.trunk!, Date.now()),
-    tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
-    nCtx: opts.nCtx, gracePrompt: GRACE.system,
+  const pool = yield* scoped(function*() {
+    const pool = yield* useAgentPool({
+      tasks: agentTasks(questions, opts.toolsJson, opts.session.trunk!, Date.now()),
+      tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
+      terminalTool: 'report',
+      pressure: { softLimit: 1024 },
+    });
+
+    yield* reportPass(pool, opts);
+    return pool;
   });
 
   const timeMs = performance.now() - t;
@@ -192,6 +234,8 @@ function* verify(
     params: { temperature: 0.7 },
   });
   const timeMs = performance.now() - t;
+  const agreement = computeAgreement(result.attempts.map(a => a.output));
+  yield* opts.events.send({ type: 'verify:agreement', result: agreement });
   yield* opts.events.send({ type: 'verify:done', result, timeMs });
   return { result, timeMs };
 }
@@ -280,12 +324,15 @@ function* summarize(
   opts: WorkflowOpts,
   extra?: { kvLine?: string },
 ): Operation<void> {
+  const ctx: SessionContext = yield* Ctx.expect();
+  const p = ctx._storeKvPressure();
+  const ctxTotal = p.nCtx || 1;
   yield* opts.events.send({
     type: 'stats', timings,
     kvLine: extra?.kvLine,
-    ctxPct: Math.round(100 * (opts.session.trunk?.position ?? 0) / opts.nCtx),
-    ctxPos: opts.session.trunk?.position ?? 0,
-    ctxTotal: opts.nCtx,
+    ctxPct: Math.round(100 * p.cellsUsed / ctxTotal),
+    ctxPos: p.cellsUsed,
+    ctxTotal,
   });
 }
 
diff --git a/examples/deep-research/main.ts b/examples/deep-research/main.ts
index 931a905..edeaf10 100644
--- a/examples/deep-research/main.ts
+++ b/examples/deep-research/main.ts
@@ -75,7 +75,7 @@ if (!corpusDir) {
 
 if (jsonlMode) setJsonlMode(true);
 if (verbose) setVerboseMode(true);
-if (!verbose && !jsonlMode) {
+if (!verbose && !jsonlMode && !trace) {
   try {
     fs.closeSync(2);
     fs.openSync(process.platform === "win32" ? "\\\\.\\NUL" : "/dev/null", "w");
@@ -113,7 +113,7 @@ main(function* () {
     createContext({
       modelPath,
       nCtx,
-      nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) + 1,
+      nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) * 2 + 1,
       typeK: "q4_0",
       typeV: "q4_0",
     }),
@@ -163,7 +163,6 @@ main(function* () {
     agentCount: AGENT_COUNT,
     verifyCount: VERIFY_COUNT,
     maxTurns: MAX_TOOL_TURNS,
-    nCtx,
     trace,
   };
 
diff --git a/examples/deep-research/tasks/graceful-termination.md b/examples/deep-research/tasks/graceful-termination.md
deleted file mode 100644
index 91fcaf8..0000000
--- a/examples/deep-research/tasks/graceful-termination.md
+++ /dev/null
@@ -1 +0,0 @@
-STOP. You MUST call the report tool RIGHT NOW with a brief summary (under 200 words) of what you found. Do NOT call any other tool. Do NOT list every detail — summarize the key findings only.
\ No newline at end of file
diff --git a/examples/deep-research/tasks/report.md b/examples/deep-research/tasks/report.md
new file mode 100644
index 0000000..189a41b
--- /dev/null
+++ b/examples/deep-research/tasks/report.md
@@ -0,0 +1,3 @@
+You are a research reporter. Call the report tool with a concise summary (under 200 words) of the key findings from the research above. Focus on the most important discoveries and conclusions.
+---
+Report your findings.
diff --git a/examples/deep-research/tools/index.ts b/examples/deep-research/tools/index.ts
index 9641ae7..2145f44 100644
--- a/examples/deep-research/tools/index.ts
+++ b/examples/deep-research/tools/index.ts
@@ -5,6 +5,9 @@ import type { Reranker } from './types';
 import { SearchTool } from './search';
 import { ReadFileTool } from './read-file';
 import { GrepTool } from './grep';
+import { ReportTool } from './report';
+
+export const reportTool = new ReportTool();
 
 export function createTools(opts: {
   resources: Resource[];
@@ -15,5 +18,6 @@ export function createTools(opts: {
     new SearchTool(opts.chunks, opts.reranker),
     new ReadFileTool(opts.resources),
     new GrepTool(opts.resources),
+    reportTool,
   ]);
 }
diff --git a/examples/deep-research/tools/report.ts b/examples/deep-research/tools/report.ts
new file mode 100644
index 0000000..97f061a
--- /dev/null
+++ b/examples/deep-research/tools/report.ts
@@ -0,0 +1,14 @@
+import { Tool } from '../../../dist/agents';
+import type { JsonSchema } from '../../../dist/agents';
+
+export class ReportTool extends Tool<{ findings: string }> { // tool named 'report' taking a single required `findings` string
+  readonly name = 'report';
+  readonly description = 'Submit your final research findings. Call this when you have gathered enough information to answer the question.';
+  readonly parameters: JsonSchema = {
+    type: 'object',
+    properties: { findings: { type: 'string', description: 'Your research findings and answer' } },
+    required: ['findings'],
+  };
+  // execute() ignores its arguments and returns an empty object; NOTE(review): findings are presumably captured by the caller (terminalTool: 'report') — confirm.
+  async execute(): Promise<unknown> { return {}; }
+}
diff --git a/examples/deep-research/tui.ts b/examples/deep-research/tui.ts
index ddaf442..f720095 100644
--- a/examples/deep-research/tui.ts
+++ b/examples/deep-research/tui.ts
@@ -2,6 +2,7 @@ import * as fs from 'node:fs';
 import { each } from 'effection';
 import type { Channel, Operation } from 'effection';
 import type { AgentEvent, AgentPoolResult, DivergeResult } from '../../dist/agents';
+import type { AgreementResult } from './agreement';
 
 // ── Event types ──────────────────────────────────────────────────
 
@@ -19,6 +20,7 @@ export type StepEvent =
   | { type: 'research:done'; pool: AgentPoolResult; timeMs: number }
   | { type: 'verify:start'; count: number }
   | { type: 'verify:done'; result: DivergeResult; timeMs: number }
+  | { type: 'verify:agreement'; result: AgreementResult }
   | { type: 'eval:done'; converged: boolean | null; tokenCount: number; timeMs: number }
   | { type: 'answer'; text: string }
   | { type: 'response:start' }
@@ -83,11 +85,20 @@ interface ViewState {
   nextLabel: number;
   agentText: Map<number, string>;
   agentStatus: Map<number, { state: string; tokenCount: number; detail: string }>;
+  agentParent: Map<number, number>;  // childId → parentId (sub-agent tracking)
   traceQuery: string;
 }
 
 type ViewHandler = (ev: WorkflowEvent) => void;
 
+function isSubAgent(state: ViewState, agentId: number): boolean { // true when a parent was recorded for agentId in state.agentParent
+  return state.agentParent.has(agentId);
+}
+
+function parentLabel(state: ViewState, agentId: number): string { // display label of agentId's recorded parent
+  return label(state, state.agentParent.get(agentId)!); // `!` is unchecked: call only after isSubAgent(state, agentId) is true
+}
+
 function label(state: ViewState, agentId: number): string {
   let l = state.agentLabel.get(agentId);
   if (!l) { l = `A${state.nextLabel++}`; state.agentLabel.set(agentId, l); }
@@ -99,10 +110,12 @@ function resetLabels(state: ViewState): void {
   state.agentLabel.clear();
   state.agentStatus.clear();
   state.agentText.clear();
+  state.agentParent.clear();
 }
 
 function renderStatus(state: ViewState): void {
-  const active = [...state.agentStatus.entries()].filter(([, s]) => s.state !== 'done');
+  const active = [...state.agentStatus.entries()]
+    .filter(([id, s]) => s.state !== 'done' && !isSubAgent(state, id));
   if (active.length === 0) return;
 
   const generating = active.filter(([, s]) => s.state === 'gen');
@@ -155,12 +168,20 @@ function planHandler(): ViewHandler {
 function agentHandler(state: ViewState): ViewHandler {
   return (ev) => {
     switch (ev.type) {
+      case 'agent:spawn': {
+        // If parent is a known labeled agent, this is a sub-agent
+        if (state.agentLabel.has(ev.parentAgentId)) {
+          state.agentParent.set(ev.agentId, ev.parentAgentId);
+        }
+        break;
+      }
       case 'agent:produce': {
+        const sub = isSubAgent(state, ev.agentId);
         state.agentText.set(ev.agentId, (state.agentText.get(ev.agentId) ?? '') + ev.text);
         state.agentStatus.set(ev.agentId, { state: 'gen', tokenCount: ev.tokenCount, detail: '' });
+        if (sub) break;  // sub-agents: skip verbose/status output
         if (_verboseMode) {
           const lbl = label(state, ev.agentId);
-          // First token for this agent — print header
           if (ev.tokenCount === 1) {
             statusClear();
             process.stdout.write(`\n    ${c.dim}───${c.reset} ${c.yellow}${lbl}${c.reset} ${c.dim}tokens${c.reset} ${c.dim}───${c.reset}\n    `);
@@ -172,7 +193,8 @@ function agentHandler(state: ViewState): ViewHandler {
         break;
       }
       case 'agent:tool_call': {
-        if (_verboseMode) process.stdout.write('\n');
+        const sub = isSubAgent(state, ev.agentId);
+        if (_verboseMode && !sub) process.stdout.write('\n');
         state.agentText.delete(ev.agentId);
         state.agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: '' });
         emit('tool_call', { agentId: ev.agentId, toolName: ev.tool, arguments: ev.args });
@@ -184,7 +206,12 @@ function agentHandler(state: ViewState): ViewHandler {
           ? `/${toolArgs.pattern || ''}/`
           : ev.tool === 'report' ? ''
           : `${toolArgs.filename}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : '');
-        log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+        if (sub) {
+          const plbl = `${c.yellow}${parentLabel(state, ev.agentId)}${c.reset}`;
+          log(`    ${c.dim}\u2502${c.reset}  ${c.dim}\u2514${c.reset} ${plbl} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+        } else {
+          log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
+        }
         break;
       }
       case 'agent:tool_result': {
@@ -209,7 +236,12 @@ function agentHandler(state: ViewState): ViewHandler {
             preview = ` \u00b7 ${r.totalMatches} matches in ${r.matchingLines} lines`;
           } catch { /* non-fatal */ }
         }
-        log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`);
+        if (isSubAgent(state, ev.agentId)) {
+          const plbl = `${c.yellow}${parentLabel(state, ev.agentId)}${c.reset}`;
+          log(`    ${c.dim}\u2502${c.reset}  ${c.dim}\u2514${c.reset} ${plbl} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`);
+        } else {
+          log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`);
+        }
         break;
       }
       case 'agent:tool_progress': {
@@ -219,13 +251,16 @@ function agentHandler(state: ViewState): ViewHandler {
       }
       case 'agent:report': {
         state.agentStatus.set(ev.agentId, { state: 'done', tokenCount: 0, detail: '' });
+        const sub = isSubAgent(state, ev.agentId);
         const cols = process.stdout.columns || 80;
-        const lbl = `${c.yellow}${label(state, ev.agentId)}${c.reset}`;
-        const prefix = `    ${c.dim}\u2502${c.reset}   `;
-        const wrap = cols - 8;
+        const displayLabel = sub ? parentLabel(state, ev.agentId) : label(state, ev.agentId);
+        const lbl = `${c.yellow}${displayLabel}${c.reset}`;
+        const indent = sub ? `    ${c.dim}\u2502${c.reset}  ` : '    ';
+        const prefix = `${indent}${c.dim}\u2502${c.reset}   `;
+        const wrap = cols - (sub ? 11 : 8);
 
-        log(`    ${c.dim}\u2502${c.reset}`);
-        log(`    ${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`);
+        log(`${indent}${c.dim}\u2502${c.reset}`);
+        log(`${indent}${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`);
 
         for (const para of ev.findings.split('\n')) {
           if (!para.trim()) { log(prefix); continue; }
@@ -241,11 +276,11 @@ function agentHandler(state: ViewState): ViewHandler {
           }
           if (line) log(`${prefix}${c.dim}${line}${c.reset}`);
         }
-        log(`    ${c.dim}\u2502${c.reset}`);
+        log(`${indent}${c.dim}\u2502${c.reset}`);
         break;
       }
       case 'agent:done':
-        if (_verboseMode) process.stdout.write('\n');
+        if (_verboseMode && !isSubAgent(state, ev.agentId)) process.stdout.write('\n');
         break;
     }
   };
@@ -298,19 +333,51 @@ function researchSummaryHandler(state: ViewState): ViewHandler {
 }
 
 function verifyHandler(): ViewHandler {
+  let pendingAgreement: AgreementResult | null = null;
+
   return (ev) => {
     switch (ev.type) {
       case 'verify:start': {
         log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${ev.count} attempts${c.reset}`);
+        pendingAgreement = null;
+        break;
+      }
+      case 'verify:agreement': {
+        pendingAgreement = ev.result;
+        emit('verify_agreement', {
+          overall: ev.result.overall,
+          sections: ev.result.sections.map(s => ({ label: s.label, score: s.score })),
+        });
         break;
       }
       case 'verify:done': {
         ev.result.attempts.forEach((a, i) => {
-          const tree = i === ev.result.attempts.length - 1 ? '\u2514' : '\u251c';
+          const tree = i === ev.result.attempts.length - 1
+            ? (pendingAgreement ? '\u251c' : '\u2514')
+            : '\u251c';
           emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl });
           log(`    ${c.dim}${tree} ${a.tokenCount} tok \u00b7 ppl ${a.ppl.toFixed(2)}${c.reset}`);
         });
+        if (pendingAgreement && pendingAgreement.sections.length > 0) {
+          const pct = Math.round(pendingAgreement.overall * 100);
+          log(`    ${c.dim}\u251c${c.reset} Agreement: ${c.bold}${pct}%${c.reset}`);
+          const sorted = [...pendingAgreement.sections].sort((a, b) => b.score - a.score);
+          const show = sorted.slice(0, 5);
+          const maxLabelLen = Math.max(...show.map(s => s.label.length));
+          show.forEach((s, i) => {
+            const tree = i === show.length - 1 && sorted.length <= 5 ? '\u2514' : '\u251c';
+            const filled = Math.round(s.score * 10);
+            const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(10 - filled);
+            const sPct = pad(Math.round(s.score * 100), 3);
+            const label = `"${s.label}"`.padEnd(maxLabelLen + 2);
+            log(`    ${c.dim}${tree}${c.reset} ${c.dim}${label}${c.reset} ${sPct}%  ${bar}`);
+          });
+          if (sorted.length > 5) {
+            log(`    ${c.dim}\u2514 \u2026 ${sorted.length - 5} more${c.reset}`);
+          }
+        }
         log(`    ${c.dim}${ev.result.totalTokens} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
+        pendingAgreement = null;
         break;
       }
     }
@@ -405,6 +472,7 @@ export function createView(opts: ViewOpts) {
     nextLabel: 0,
     agentText: new Map(),
     agentStatus: new Map(),
+    agentParent: new Map(),
     traceQuery: '',
   };
 
diff --git a/liblloyal b/liblloyal
index 757f595..1d2bec3 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit 757f595a5e2e952ab0da01f888cdc00c6a757551
+Subproject commit 1d2bec35eece54a61780189e775b03e6b293eb24
diff --git a/src/Branch.ts b/src/Branch.ts
index be5e286..e44300e 100644
--- a/src/Branch.ts
+++ b/src/Branch.ts
@@ -80,7 +80,20 @@ export class Branch {
   }
 
   /**
-   * Fork this branch to a new sequence
+   * Fork this branch to a new sequence (async)
+   *
+   * Async contract: local branches resolve immediately; cloud branches
+   * may perform an HTTP round-trip. Use {@link forkSync} when you know
+   * the branch is local and want zero-overhead forking.
+   *
+   * @returns New forked Branch
+   */
+  async fork(): Promise<Branch> {
+    return this.forkSync();
+  }
+
+  /**
+   * Fork this branch to a new sequence (sync)
    *
    * The child shares the parent's KV prefix in memory (metadata-only under unified KV, no KV buffer copy).
    * Logits, sampler state, and perplexity tracker are cloned so the child
@@ -91,7 +104,7 @@ export class Branch {
    *
    * @returns New forked Branch
    */
-  async fork(): Promise<Branch> {
+  forkSync(): Branch {
     this._ensureNotDisposed();
     const newHandle = this._ctx._branchFork(this._handle);
     return new Branch(this._ctx, newHandle);
@@ -163,17 +176,30 @@ export class Branch {
   }
 
   /**
-   * Discard this branch — remove its divergent KV entries and free the handle
+   * Discard this branch (async)
    *
-   * Only removes KV entries divergent from the shared prefix; sibling branches
-   * are unaffected. The disposed flag is set synchronously — any call to
-   * produce(), commit(), etc. after prune() will throw immediately, even
-   * before the returned promise resolves.
+   * Async contract: local branches resolve immediately; cloud branches
+   * may perform an HTTP round-trip. Use {@link pruneSync} when you know
+   * the branch is local.
    *
    * RESTRICT mode: throws if children exist. Use {@link pruneSubtree} to
    * cascade-delete an entire subtree.
    */
   async prune(): Promise<void> {
+    this.pruneSync();
+  }
+
+  /**
+   * Discard this branch — remove its divergent KV entries and free the handle (sync)
+   *
+   * Only removes KV entries divergent from the shared prefix; sibling branches
+   * are unaffected. The disposed flag is set synchronously — any call to
+   * produce(), commit(), etc. after prune() will throw immediately.
+   *
+   * RESTRICT mode: throws if children exist. Use {@link pruneSubtreeSync} to
+   * cascade-delete an entire subtree.
+   */
+  pruneSync(): void {
     if (this._disposed) return;
     const kids = this.children;
     if (kids.length > 0) {
@@ -187,13 +213,24 @@ export class Branch {
   }
 
   /**
-   * Discard this branch and all its descendants — CASCADE delete
+   * Discard this branch and all its descendants (async)
+   *
+   * Async contract: local branches resolve immediately; cloud branches
+   * may perform an HTTP round-trip. Use {@link pruneSubtreeSync} when you know
+   * the branch is local.
+   */
+  async pruneSubtree(): Promise<void> {
+    this.pruneSubtreeSync();
+  }
+
+  /**
+   * Discard this branch and all its descendants — CASCADE delete (sync)
    *
    * Iterative post-order traversal: prunes children first, then this branch.
    * Use when tearing down an entire subtree (e.g. abandoned search path).
-   * Sets disposed synchronously, like {@link prune}.
+   * Sets disposed synchronously.
    */
-  async pruneSubtree(): Promise<void> {
+  pruneSubtreeSync(): void {
     if (this._disposed) return;
     this._ctx._branchPruneSubtree(this._handle);
     this._disposed = true;
diff --git a/src/Session.ts b/src/Session.ts
index a03d907..4ce87fb 100644
--- a/src/Session.ts
+++ b/src/Session.ts
@@ -82,7 +82,7 @@ export class Session {
    * @param opts - Optional tools JSON string
    */
   async prefillUser(content: string, opts: { tools?: string } = {}): Promise<void> {
-    const tokens = await buildUserDelta(this._ctx, content, opts);
+    const tokens = buildUserDelta(this._ctx, content, opts);
     await this._trunk!.prefill(tokens);
   }
 
@@ -93,7 +93,7 @@ export class Session {
    * @param callId - Tool call ID
    */
   async prefillToolResult(resultStr: string, callId: string): Promise<void> {
-    const tokens = await buildToolResultDelta(this._ctx, resultStr, callId);
+    const tokens = buildToolResultDelta(this._ctx, resultStr, callId);
     await this._trunk!.prefill(tokens);
   }
 }
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index 46177e3..bd6095e 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -473,6 +473,9 @@ class DetokenizeWorker : public Napi::AsyncWorker {
   std::string _result;
 };
 
+// Forward declaration — defined after parseFormatChatArgs
+static Napi::Object marshalFormatResult(Napi::Env env, const lloyal::chat_in::FormatResult& r);
+
 /**
  * AsyncWorker for formatChat operation
  */
@@ -500,45 +503,7 @@ class FormatChatWorker : public Napi::AsyncWorker {
   }
 
   void OnOK() override {
-    Napi::Env env = Env();
-
-    Napi::Object result = Napi::Object::New(env);
-    result.Set("prompt", Napi::String::New(env, _result.prompt));
-
-    // stopTokens (backward compat)
-    Napi::Array stopTokens = Napi::Array::New(env, _result.additional_stops.size());
-    for (size_t i = 0; i < _result.additional_stops.size(); i++) {
-      stopTokens[i] = Napi::String::New(env, _result.additional_stops[i]);
-    }
-    result.Set("stopTokens", stopTokens);
-
-    // Format awareness fields
-    result.Set("format", Napi::Number::New(env, static_cast<double>(_result.format)));
-    result.Set("grammar", Napi::String::New(env, _result.grammar));
-    result.Set("grammarLazy", Napi::Boolean::New(env, _result.grammar_lazy));
-    result.Set("thinkingForcedOpen", Napi::Boolean::New(env, _result.thinking_forced_open));
-    result.Set("reasoningFormat", Napi::Number::New(env, static_cast<double>(_result.reasoning_format)));
-    result.Set("parser", Napi::String::New(env, _result.parser));
-
-    // grammarTriggers: Array<{ type: number, value: string, token: number }>
-    Napi::Array triggers = Napi::Array::New(env, _result.grammar_triggers.size());
-    for (size_t i = 0; i < _result.grammar_triggers.size(); i++) {
-      Napi::Object trigger = Napi::Object::New(env);
-      trigger.Set("type", Napi::Number::New(env, static_cast<double>(_result.grammar_triggers[i].type)));
-      trigger.Set("value", Napi::String::New(env, _result.grammar_triggers[i].value));
-      trigger.Set("token", Napi::Number::New(env, static_cast<double>(_result.grammar_triggers[i].token)));
-      triggers[i] = trigger;
-    }
-    result.Set("grammarTriggers", triggers);
-
-    // preservedTokens: string[]
-    Napi::Array preserved = Napi::Array::New(env, _result.preserved_tokens.size());
-    for (size_t i = 0; i < _result.preserved_tokens.size(); i++) {
-      preserved[i] = Napi::String::New(env, _result.preserved_tokens[i]);
-    }
-    result.Set("preservedTokens", preserved);
-
-    _deferred.Resolve(result);
+    _deferred.Resolve(marshalFormatResult(Env(), _result));
   }
 
   void OnError(const Napi::Error& err) override {
@@ -808,6 +773,7 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
 
     // ===== PROMPT PREPARATION =====
     InstanceMethod("tokenize", &SessionContext::tokenize),
+    InstanceMethod("tokenizeSync", &SessionContext::tokenizeSync),
     InstanceMethod("detokenize", &SessionContext::detokenize),
 
     // ===== KV CACHE MANAGEMENT =====
@@ -827,8 +793,10 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
 
     // ===== HELPERS =====
     InstanceMethod("formatChat", &SessionContext::formatChat),
+    InstanceMethod("formatChatSync", &SessionContext::formatChatSync),
     InstanceMethod("parseChatOutput", &SessionContext::parseChatOutput),
     InstanceMethod("jsonSchemaToGrammar", &SessionContext::jsonSchemaToGrammar),
+    InstanceMethod("jsonSchemaToGrammarSync", &SessionContext::jsonSchemaToGrammarSync),
     InstanceMethod("validateChatTemplate", &SessionContext::validateChatTemplate),
 
     // ===== EMBEDDING EXTRACTION =====
@@ -872,6 +840,7 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     InstanceMethod("_storePrefill", &SessionContext::_storePrefill),
     InstanceMethod("_storeRetainOnly", &SessionContext::_storeRetainOnly),
     InstanceMethod("_storeAvailable", &SessionContext::_storeAvailable),
+    InstanceMethod("_storeKvPressure", &SessionContext::_storeKvPressure),
 
     // ===== SCORING API =====
     InstanceMethod("_scoreGroup", &SessionContext::_scoreGroup),
@@ -948,6 +917,38 @@ Napi::Value SessionContext::tokenize(const Napi::CallbackInfo& info) {
   return worker->GetPromise();
 }
 
+Napi::Value SessionContext::tokenizeSync(const Napi::CallbackInfo& info) {  // sync counterpart of tokenize(): returns the token array directly instead of a Promise
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  if (info.Length() < 1 || !info[0].IsString()) {
+    throw Napi::TypeError::New(env, "Expected (text: string[, addSpecial: boolean])");
+  }
+
+  std::string text = info[0].As<Napi::String>().Utf8Value();
+
+  bool addSpecial = true;  // only consulted when overridden below; this default never reaches the tokenizer
+  bool addSpecialOverridden = false;
+  if (info.Length() >= 2 && info[1].IsBoolean()) {  // a non-boolean second argument is ignored, not rejected
+    addSpecial = info[1].As<Napi::Boolean>().Value();
+    addSpecialOverridden = true;
+  }
+
+  std::vector<llama_token> result;
+  if (addSpecialOverridden) {  // explicit flag: use the vocab-level overload
+    const llama_vocab* vocab = llama_model_get_vocab(_model.get());
+    result = lloyal::tokenizer::tokenize(vocab, text, addSpecial, true);  // NOTE(review): 4th arg is hard-coded true, presumably parse_special; confirm against lloyal::tokenizer
+  } else {  // no flag given: special-token handling is left to the model-level overload
+    result = lloyal::tokenizer::tokenize(_model.get(), text);
+  }
+
+  Napi::Array jsTokens = Napi::Array::New(env, result.size());
+  for (size_t i = 0; i < result.size(); i++) {
+    jsTokens[i] = Napi::Number::New(env, static_cast<double>(result[i]));  // token ids are handed to JS as plain numbers
+  }
+  return jsTokens;
+}
+
 Napi::Value SessionContext::detokenize(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
@@ -1119,21 +1120,13 @@ Napi::Value SessionContext::getTurnSeparator(const Napi::CallbackInfo& info) {
   return result;
 }
 
-Napi::Value SessionContext::formatChat(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 1 || !info[0].IsString()) {
-    throw Napi::TypeError::New(env, "Expected (messagesJson: string[, options: object])");
-  }
-
+// Shared helper: parse JS args into FormatInputs
+static lloyal::chat_in::FormatInputs parseFormatChatArgs(const Napi::CallbackInfo& info) {
   lloyal::chat_in::FormatInputs inputs;
   inputs.messages_json = info[0].As<Napi::String>().Utf8Value();
 
-  // Second argument: options object (or string for backward compat)
   if (info.Length() >= 2) {
     if (info[1].IsString()) {
-      // Backward compat: formatChat(messagesJson, templateOverride)
       inputs.template_override = info[1].As<Napi::String>().Utf8Value();
     } else if (info[1].IsObject()) {
       Napi::Object opts = info[1].As<Napi::Object>();
@@ -1167,12 +1160,76 @@ Napi::Value SessionContext::formatChat(const Napi::CallbackInfo& info) {
       }
     }
   }
+  return inputs;
+}
+
+// Shared helper: marshal FormatResult → Napi::Object (used by formatChatSync and by the async worker's resolve path)
+static Napi::Object marshalFormatResult(Napi::Env env, const lloyal::chat_in::FormatResult& r) {  // NOTE(review): the async worker calls this earlier in the file; confirm a forward declaration precedes that use
+  Napi::Object result = Napi::Object::New(env);
+  result.Set("prompt", Napi::String::New(env, r.prompt));
+
+  Napi::Array stopTokens = Napi::Array::New(env, r.additional_stops.size());  // additional_stops is exposed to JS under the key "stopTokens"
+  for (size_t i = 0; i < r.additional_stops.size(); i++) {
+    stopTokens[i] = Napi::String::New(env, r.additional_stops[i]);
+  }
+  result.Set("stopTokens", stopTokens);
+
+  result.Set("format", Napi::Number::New(env, static_cast<double>(r.format)));  // exposed as a plain number
+  result.Set("grammar", Napi::String::New(env, r.grammar));
+  result.Set("grammarLazy", Napi::Boolean::New(env, r.grammar_lazy));
+  result.Set("thinkingForcedOpen", Napi::Boolean::New(env, r.thinking_forced_open));
+  result.Set("reasoningFormat", Napi::Number::New(env, static_cast<double>(r.reasoning_format)));  // exposed as a plain number
+  result.Set("parser", Napi::String::New(env, r.parser));
+
+  Napi::Array triggers = Napi::Array::New(env, r.grammar_triggers.size());  // grammarTriggers: array of { type, value, token }
+  for (size_t i = 0; i < r.grammar_triggers.size(); i++) {
+    Napi::Object trigger = Napi::Object::New(env);
+    trigger.Set("type", Napi::Number::New(env, static_cast<double>(r.grammar_triggers[i].type)));
+    trigger.Set("value", Napi::String::New(env, r.grammar_triggers[i].value));
+    trigger.Set("token", Napi::Number::New(env, static_cast<double>(r.grammar_triggers[i].token)));
+    triggers[i] = trigger;
+  }
+  result.Set("grammarTriggers", triggers);
+
+  Napi::Array preserved = Napi::Array::New(env, r.preserved_tokens.size());  // preservedTokens: string[]
+  for (size_t i = 0; i < r.preserved_tokens.size(); i++) {
+    preserved[i] = Napi::String::New(env, r.preserved_tokens[i]);
+  }
+  result.Set("preservedTokens", preserved);
+
+  return result;
+}
+
+Napi::Value SessionContext::formatChat(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  if (info.Length() < 1 || !info[0].IsString()) {
+    throw Napi::TypeError::New(env, "Expected (messagesJson: string[, options: object])");
+  }
 
+  auto inputs = parseFormatChatArgs(info);
   auto* worker = new FormatChatWorker(env, _model, inputs);
   worker->Queue();
   return worker->GetPromise();
 }
 
+Napi::Value SessionContext::formatChatSync(const Napi::CallbackInfo& info) {  // sync counterpart of formatChat(): formats inline instead of queueing a FormatChatWorker
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  if (info.Length() < 1 || !info[0].IsString()) {
+    throw Napi::TypeError::New(env, "Expected (messagesJson: string[, options: object])");
+  }
+
+  auto inputs = parseFormatChatArgs(info);  // same argument parsing as the async formatChat()
+  lloyal::chat_in::FormatResult result = lloyal::chat_in::format(_model.get(), inputs);  // NOTE(review): nothing here catches exceptions from format(); confirm how a failure surfaces to JS
+  if (result.prompt.empty()) {  // an empty prompt is the only failure signal checked here
+    throw Napi::Error::New(env, "Chat template formatting failed");
+  }
+  return marshalFormatResult(env, result);
+}
+
 Napi::Value SessionContext::kvCacheSize(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
@@ -1303,6 +1360,19 @@ Napi::Value SessionContext::jsonSchemaToGrammar(const Napi::CallbackInfo& info)
   return worker->GetPromise();
 }
 
+Napi::Value SessionContext::jsonSchemaToGrammarSync(const Napi::CallbackInfo& info) {  // sync counterpart of jsonSchemaToGrammar(): returns the grammar string directly, no Promise
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  if (info.Length() < 1 || !info[0].IsString()) {
+    throw Napi::TypeError::New(env, "Expected (schemaJson: string)");
+  }
+
+  std::string schemaJson = info[0].As<Napi::String>().Utf8Value();  // passed through as-is; no JSON validation happens in this wrapper
+  std::string result = lloyal::grammar::from_json_schema(schemaJson);  // NOTE(review): nothing here catches exceptions from from_json_schema; confirm how a malformed schema surfaces to JS
+  return Napi::String::New(env, result);
+}
+
 Napi::Value SessionContext::validateChatTemplate(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
@@ -2404,4 +2474,16 @@ Napi::Value SessionContext::_storeAvailable(const Napi::CallbackInfo& info) {
   return Napi::Number::New(env, static_cast<double>(_branchStore.available()));
 }
 
+Napi::Value SessionContext::_storeKvPressure(const Napi::CallbackInfo& info) {  // exposes BranchStore::kv_pressure() to JS as { nCtx, cellsUsed, remaining }
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  auto p = _branchStore.kv_pressure();  // single read; all three fields below come from this one snapshot
+  auto obj = Napi::Object::New(env);
+  obj.Set("nCtx", Napi::Number::New(env, static_cast<double>(p.n_ctx)));
+  obj.Set("cellsUsed", Napi::Number::New(env, static_cast<double>(p.cells_used)));
+  obj.Set("remaining", Napi::Number::New(env, static_cast<double>(p.remaining)));  // passed through unchanged; not recomputed or clamped here
+  return obj;
+}
+
 } // namespace liblloyal_node
diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp
index 6e4209a..9039406 100644
--- a/src/SessionContext.hpp
+++ b/src/SessionContext.hpp
@@ -79,12 +79,19 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   // ===== CORE PRIMITIVES =====
 
   /**
-   * Tokenize text to token IDs
+   * Tokenize text to token IDs (async — dispatches to libuv thread pool)
    * Args: text (string)
    * Returns: Promise<number[]>
    */
   Napi::Value tokenize(const Napi::CallbackInfo& info);
 
+  /**
+   * Tokenize text to token IDs (sync — inline on main thread)
+   * Args: text (string[, addSpecial: boolean])
+   * Returns: number[]
+   */
+  Napi::Value tokenizeSync(const Napi::CallbackInfo& info);
+
   /**
    * Detokenize tokens to text
    * Args: tokens (number[])
@@ -119,11 +126,19 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   Napi::Value getTurnSeparator(const Napi::CallbackInfo& info);
 
   /**
-   * Format messages using model's chat template
+   * Format messages using model's chat template (async — dispatches to libuv thread pool)
    * Args: messagesJson (string), templateOverride (optional string)
    * Returns: Promise<{ prompt: string, stopTokens: string[] }>
    */
   Napi::Value formatChat(const Napi::CallbackInfo& info);
+
+  /**
+   * Format messages using model's chat template (sync — inline on main thread)
+   * Args: messagesJson (string), options? (object — or a templateOverride string, as in formatChat)
+   * Returns: { prompt: string, stopTokens: string[], ... }
+   */
+  Napi::Value formatChatSync(const Napi::CallbackInfo& info);
+
   Napi::Value parseChatOutput(const Napi::CallbackInfo& info);
 
   /**
@@ -194,9 +209,9 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   Napi::Value kvCacheReadFile(const Napi::CallbackInfo& info);
 
   // ===== HELPERS =====
-  // Utility functions (not yet implemented)
 
   Napi::Value jsonSchemaToGrammar(const Napi::CallbackInfo& info);
+  Napi::Value jsonSchemaToGrammarSync(const Napi::CallbackInfo& info);
   Napi::Value validateChatTemplate(const Napi::CallbackInfo& info);
 
   // ===== EMBEDDING EXTRACTION =====
@@ -262,6 +277,7 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   Napi::Value _storePrefill(const Napi::CallbackInfo& info);
   Napi::Value _storeRetainOnly(const Napi::CallbackInfo& info);
   Napi::Value _storeAvailable(const Napi::CallbackInfo& info);
+  Napi::Value _storeKvPressure(const Napi::CallbackInfo& info);
 
   // ===== SCORING API =====
 
diff --git a/src/agents/agent-pool.ts b/src/agents/agent-pool.ts
index 36d5eb3..7ff9316 100644
--- a/src/agents/agent-pool.ts
+++ b/src/agents/agent-pool.ts
@@ -1,17 +1,16 @@
-import { resource, call, action, useScope, createSignal, spawn, each } from 'effection';
+import { resource, call, action, ensure, useScope, createSignal, spawn, each } from 'effection';
 import type { Operation, Scope, Channel } from 'effection';
 import type { Branch } from '../Branch';
 import { CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, GrammarTriggerType, type GrammarTrigger, type ParsedToolCall, type SessionContext } from '../types';
 import type { BranchStore } from '../BranchStore';
 import { Ctx, Store, Events } from './context';
 import { buildToolResultDelta } from './deltas';
-import type { Tool } from './Tool';
 import type {
   TraceToken,
+  PressureThresholds,
   AgentTaskSpec,
   AgentPoolOptions,
   AgentPoolResult,
-  AgentResult,
   AgentEvent,
 } from './types';
 
@@ -24,6 +23,7 @@ type AgentInternalState = 'generating' | 'awaiting_tool' | 'done';
 
 interface AgentInternal {
   id: number;           // = branch.handle
+  parentId: number;     // = parent.handle
   branch: Branch;
   state: AgentInternalState;
   fmt: {
@@ -39,7 +39,6 @@ interface AgentInternal {
   tokenCount: number;
   toolCallCount: number;
   turns: number;
-  graceUsed: boolean;
   findings: string | null;
   traceBuffer: TraceToken[];
 }
@@ -50,59 +49,112 @@ interface SettledTool {
   toolName: string;
 }
 
-// Report tool schema — auto-injected into agent tools by setupAgent().
-// useAgentPool() intercepts report calls (never dispatched to execute()).
-const REPORT_SCHEMA = {
-  type: 'function' as const,
-  function: {
-    name: 'report',
-    description: 'Submit your final research findings. Call this when you have gathered enough information to answer the question.',
-    parameters: {
-      type: 'object',
-      properties: { findings: { type: 'string', description: 'Your research findings and answer' } },
-      required: ['findings'],
-    },
-  },
-};
-
-/** Inject report tool schema if tools are present and report isn't already defined. */
-function ensureReportTool(toolsJson: string): string {
-  const schemas = JSON.parse(toolsJson) as { type: string; function: { name: string } }[];
-  if (schemas.some(s => s.function?.name === 'report')) return toolsJson;
-  schemas.push(REPORT_SCHEMA);
-  return JSON.stringify(schemas);
+/**
+ * Immutable KV budget snapshot for one tick of the agent loop
+ *
+ * Created from `SessionContext._storeKvPressure()` which returns
+ * `{ nCtx, cellsUsed, remaining }` where `remaining = nCtx - cellsUsed`.
+ * `cellsUsed` is a monotonic counter in `BranchStore` — it increments on
+ * every `decode_each` / `decode_scatter` but does **not** decrement on
+ * individual branch prune (only resets on bulk ops like `retainOnly` and
+ * `drain`). This means `remaining` is a conservative lower bound that
+ * becomes increasingly pessimistic as branches are pruned mid-run.
+ *
+ * Two thresholds partition `remaining` into three zones:
+ *
+ * ```
+ * ┌──────────────────────────────────────────────────────┐
+ * │                    nCtx                              │
+ * │  ┌──────────┬───────────────────┬──────────────────┐ │
+ * │  │cellsUsed │    headroom > 0   │    softLimit     │ │
+ * │  │ (in use) │   (new work OK)   │   (reserved)     │ │
+ * │  └──────────┴───────────────────┴──────────────────┘ │
+ * │              ◄──────────── remaining ─────────────►  │
+ * │              ◄──── headroom ───►                     │
+ * │  headroom = remaining - softLimit                    │
+ * │  critical = remaining < hardLimit                    │
+ * └──────────────────────────────────────────────────────┘
+ * ```
+ *
+ * - **headroom > 0** — room for new work (tool results, generation)
+ * - **headroom < 0** — over budget: PRODUCE hard-cuts non-terminal tool calls (terminal
+ *   tools still pass). SETTLE rejects any tool result larger than the current headroom.
+ * - **critical** — remaining below hardLimit. Agents killed before
+ *   `produceSync()` to prevent llama_decode crashes.
+ *
+ * @category Agents
+ */
+export class ContextPressure {
+  /** Default softLimit: 1024 tokens reserved for downstream work */
+  static readonly DEFAULT_SOFT_LIMIT = 1024;
+  /** Default hardLimit: 128 tokens crash-prevention floor */
+  static readonly DEFAULT_HARD_LIMIT = 128;
+
+  /**
+   * KV slots remaining (`nCtx - cellsUsed`).
+   * Infinity when nCtx ≤ 0 (no context limit).
+   * Conservative: may undercount actual free space when branches have been
+   * pruned, since `cellsUsed` is monotonic.
+   */
+  readonly remaining: number;
+  /** Remaining KV floor — tokens reserved for downstream work */
+  readonly softLimit: number;
+  /** Crash-prevention floor — agents killed when remaining drops below */
+  readonly hardLimit: number;
+
+  constructor(ctx: SessionContext, opts?: PressureThresholds) {
+    const p = ctx._storeKvPressure(); // single native read; this instance is never refreshed afterwards
+    this.remaining = p.nCtx <= 0 ? Infinity : p.remaining;
+    this.softLimit = opts?.softLimit ?? ContextPressure.DEFAULT_SOFT_LIMIT; // thresholds are taken as given, with no range validation
+    this.hardLimit = opts?.hardLimit ?? ContextPressure.DEFAULT_HARD_LIMIT;
+  }
+
+  /**
+   * Tokens available for new work: `remaining - softLimit`.
+   * Positive means room to accept tool results or continue generating.
+   * Negative means over budget — SETTLE rejects, PRODUCE hard-cuts.
+   */
+  get headroom(): number { return this.remaining - this.softLimit; }
+
+  /** `remaining < hardLimit` — agent must not call `produceSync()`. */
+  get critical(): boolean { return this.remaining < this.hardLimit; }
+
+  /** Can `tokenCount` tokens fit while keeping `remaining` at or above softLimit? */
+  canFit(tokenCount: number): boolean { return tokenCount <= this.headroom; }
 }
 
 /**
  * Fork an agent from a parent branch with its own system prompt and task.
  *
- * Formats the agent's messages via `formatChat()`, tokenizes the suffix,
- * and optionally reseeds the sampler for stochastic diversity. When the
- * task has tools, the `report` tool schema is auto-injected if absent.
+ * Generator — uses sync native calls so Effection sees everything.
+ * On scope exit (error, cancellation), `ensure()` prunes the branch
+ * automatically — the orphaned-branch leak is structurally impossible.
  */
-async function setupAgent(
+function* setupAgent(
   parent: Branch,
   task: AgentTaskSpec,
   ctx: SessionContext,
-): Promise<{ agent: AgentInternal; suffixTokens: number[] }> {
-  const branch = await parent.fork();
+): Operation<{ agent: AgentInternal; suffixTokens: number[] }> {
   const messages = [
     { role: 'system', content: task.systemPrompt },
     { role: 'user', content: task.content },
   ];
-  const tools = task.tools ? ensureReportTool(task.tools) : undefined;
-  const fmtOpts = tools ? { tools } : {};
-  const fmt = await ctx.formatChat(JSON.stringify(messages), fmtOpts);
-  if (tools && (fmt.format === CHAT_FORMAT_CONTENT_ONLY || fmt.format === CHAT_FORMAT_GENERIC)) {
+  const fmtOpts = task.tools ? { tools: task.tools } : {};
+  const fmt = ctx.formatChatSync(JSON.stringify(messages), fmtOpts);
+  if (task.tools && (fmt.format === CHAT_FORMAT_CONTENT_ONLY || fmt.format === CHAT_FORMAT_GENERIC)) {
+    // Error before fork — no branch to clean up
     throw new Error('Model does not support tool calling. Please use a model with native tool support (e.g. Qwen3, Llama 3.x, Mistral).');
   }
+  const branch = parent.forkSync();
+  yield* ensure(() => { if (!branch.disposed) branch.pruneSync(); });
   const sep = ctx.getTurnSeparator();
-  const suffixTokens = [...sep, ...await ctx.tokenize(fmt.prompt, false)];
+  const suffixTokens = [...sep, ...ctx.tokenizeSync(fmt.prompt, false)];
   if (task.seed != null) branch.reseedSampler(task.seed);
 
   return {
     agent: {
       id: branch.handle,
+      parentId: parent.handle,
       branch,
       state: 'generating',
       fmt: {
@@ -118,7 +170,6 @@ async function setupAgent(
       tokenCount: 0,
       toolCallCount: 0,
       turns: 0,
-      graceUsed: false,
       findings: null,
       traceBuffer: [],
     },
@@ -143,8 +194,8 @@ async function setupAgent(
  *
  * **Resource semantics:** `provide()` suspends after all agents complete,
  * keeping branches alive so the caller can fork from them (e.g. for
- * verification). Branches are pruned in the finally block when the
- * scope exits.
+ * verification). Branches are pruned when the scope exits — each branch's
+ * `ensure()` from `setupAgent` handles cleanup automatically.
  *
  * For automatic branch cleanup on return, use {@link runAgents} instead.
  *
@@ -188,31 +239,60 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
         yield* each.next();
       }
     });
-    const { tasks, tools, maxTurns = 100, nCtx = 0, gracePrompt, trace = false } = opts;
+    const { tasks, tools, maxTurns = 100, terminalTool, trace = false, pressure: pressureOpts } = opts;
+
+    // Whether the pool's tool registry contains tools besides the terminal tool.
+    // When false, agents are allowed to call the terminal tool as their first
+    // action (e.g. reporter sub-agents that only have `report()`). When true,
+    // the first tool call must be a non-terminal tool to prevent agents from
+    // immediately reporting without doing any work.
+    //
+    // IMPORTANT: this checks the pool's `tools` registry, not individual task
+    // schemas (`task.tools`). A reporter pool must pass only the terminal tool
+    // in its registry — passing the full tool map makes this flag true and
+    // traps reporters in an infinite rejection loop.
+    const hasNonTerminalTools = terminalTool ? [...tools.keys()].some(k => k !== terminalTool) : tools.size > 0;
 
     // ── Setup: fork branches, collect suffix tokens ──────────
+    // setupAgent is now a generator — each branch registers its own ensure()
+    // for cleanup. No manual try/finally needed here.
     const agents: AgentInternal[] = [];
     const prefillSetup: [Branch, number[]][] = [];
 
-    // try/finally wraps everything from agent creation through provide().
-    // Agent branches are plain Branch objects (not Effection resources) —
-    // their cleanup is manual. Placing it here guarantees any branch that
-    // makes it into agents[] is pruned on ANY exit path: normal completion,
-    // tick loop error, or scope cancellation.
-    try {
-
     for (const task of tasks) {
-      // Per-task parent for tree topology, or first task's parent as shared root
       const parent = task.parent;
       if (!parent) throw new Error('useAgentPool: each task must have a parent branch');
 
-      const { agent, suffixTokens } = yield* call(() => setupAgent(parent, task, ctx));
+      const { agent, suffixTokens } = yield* setupAgent(parent, task, ctx);
       agents.push(agent);
       prefillSetup.push([agent.branch, suffixTokens]);
     }
 
-    // Batch prefill all agent suffixes
-    yield* call(() => store.prefill(prefillSetup));
+    // Batch prefill all agent suffixes — pressure-gated.
+    // Each suffix is the full formatted chat (system prompt + tools JSON +
+    // user message + generation prompt), tokenized via formatChatSync().
+    // Suffix cost is model-dependent: ~250-400 tokens per agent depending
+    // on chat template verbosity and tool schema size.
+    const initPressure = new ContextPressure(ctx, pressureOpts);
+    const totalSuffix = prefillSetup.reduce((s, [, t]) => s + t.length, 0);
+    if (!initPressure.canFit(totalSuffix)) {
+      // Not enough room — drop agents from the end until it fits
+      while (prefillSetup.length > 0) {
+        const needed = prefillSetup.reduce((s, [, t]) => s + t.length, 0);
+        if (initPressure.canFit(needed)) break;
+        prefillSetup.pop();
+        const dropped = agents.pop()!;
+        dropped.state = 'done';
+      }
+    }
+    if (prefillSetup.length > 0) {
+      yield* call(() => store.prefill(prefillSetup));
+    }
+
+    // Emit spawn events — TUI uses parentAgentId to detect sub-agents
+    for (const a of agents) {
+      yield* events.send({ type: 'agent:spawn', agentId: a.id, parentAgentId: a.parentId });
+    }
 
     // ── Lazy grammar setup ───────────────────────────────────
     const applyLazyGrammar = (a: AgentInternal): void => {
@@ -288,7 +368,7 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
           const resultStr = JSON.stringify(result);
           yield* events.send({ type: 'agent:tool_result', agentId: agent.id, tool: tc.name, result: resultStr });
 
-          const prefillTokens: number[] = yield* call(() => buildToolResultDelta(ctx, resultStr, callId));
+          const prefillTokens = buildToolResultDelta(ctx, resultStr, callId);
           settledBuffer.push({ agentId: agent.id, prefillTokens, toolName: tc.name });
         } catch (err) {
           agent.state = 'done';
@@ -300,31 +380,21 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
       });
     }
 
-    // Context pressure thresholds (in tokens)
-    const GRACE_RESERVE = 1024;    // room for grace prompt + report generation
-    const CRITICAL_RESERVE = 128;  // absolute minimum — hard stop to prevent crash
-
     // ── Three-phase tick loop ────────────────────────────────
     for (;;) {
       // -- Phase 1: PRODUCE -- sample from active agents
+      const pressure = new ContextPressure(ctx, pressureOpts);
 
-      // Compute aggregate KV remaining once per tick.
-      // All branches (including done, not yet pruned) share one nCtx-sized KV cache.
-      // Shared prefix slots are counted once; divergent tails add per-branch.
-      let kvRemaining = Infinity;
-      if (nCtx > 0) {
-        const positions = agents.map(a => a.branch.position);
-        const sharedPrefix = Math.min(...positions);
-        const totalKV = positions.reduce((s, p) => s + p, 0) - (positions.length - 1) * sharedPrefix;
-        kvRemaining = nCtx - totalKV;
+      if (trace && (pressure.critical || pressure.headroom < 0)) {
+        const p = ctx._storeKvPressure();
+        try { process.stderr.write(`[PRODUCE] ${pressure.critical ? 'CRITICAL' : 'SOFT_LIMIT'} remaining=${p.remaining} headroom=${pressure.headroom} cellsUsed=${p.cellsUsed} nCtx=${p.nCtx}\n`); } catch {}
       }
 
       const entries: [Branch, number][] = [];
       for (const a of agents) {
         if (a.state !== 'generating') continue;
 
-        // Critical context pressure — hard stop before produceSync to prevent llama_decode crash
-        if (kvRemaining < CRITICAL_RESERVE) {
+        if (pressure.critical) {
           a.state = 'done';
           yield* events.send({ type: 'agent:done', agentId: a.id });
           continue;
@@ -349,51 +419,31 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
             continue;
           }
 
-          // Grace turn: context pressure or maxTurns reached, agent wants to call a tool
-          // that isn't report. Inject gracePrompt so the agent can synthesize findings.
-          // If grace already used (or no gracePrompt configured), hard-cut.
-          const contextPressure = kvRemaining < GRACE_RESERVE;
-          const shouldGrace = (a.turns >= maxTurns || contextPressure) && tc.name !== 'report';
+          // Over budget: deny non-terminal tool calls when the agent has
+          // exceeded maxTurns or KV headroom is negative. Terminal tools
+          // (e.g. `report()`) are always allowed through — an agent that has
+          // done research and wants to report should never be blocked by
+          // pressure, since the report call itself consumes minimal KV.
+          const overBudget = (a.turns >= maxTurns || pressure.headroom < 0)
+            && (!terminalTool || tc.name !== terminalTool);
 
-          if (shouldGrace) {
-            if (a.graceUsed || !gracePrompt) {
-              a.state = 'done';
-              yield* events.send({ type: 'agent:done', agentId: a.id });
-              continue;
-            }
-            a.graceUsed = true;
-            const callId = tc.id || `call_${a.toolCallCount}`;
-            a.turns++;
-            a.state = 'awaiting_tool';
-            pendingToolCount++;
-            scope.run(function*() {
-              try {
-                const prefillTokens: number[] = yield* call(() =>
-                  buildToolResultDelta(ctx, JSON.stringify({ error: gracePrompt }), callId)
-                );
-                settledBuffer.push({ agentId: a.id, prefillTokens, toolName: tc.name });
-              } finally {
-                pendingToolCount--;
-                if (wakeIdle) { wakeIdle(); wakeIdle = null; }
-              }
-            });
-            a.rawOutput = '';
+          if (overBudget) {
+            a.state = 'done';
+            yield* events.send({ type: 'agent:done', agentId: a.id });
             continue;
           }
 
-          // Report tool special case — reject if no prior research
-          if (tc.name === 'report') {
-            if (a.toolCallCount === 0) {
+          // Terminal tool — intercept, extract findings, mark done.
+          if (terminalTool && tc.name === terminalTool) {
+            if (a.toolCallCount === 0 && hasNonTerminalTools) {
               const callId = tc.id || `call_${a.toolCallCount}`;
-              const errorMsg = 'You must search or read the corpus before reporting. Use search, grep, or read_file first.';
+              const errorMsg = 'You must perform research before reporting. Call at least one tool first.';
               a.turns++;
               a.state = 'awaiting_tool';
               pendingToolCount++;
               scope.run(function*() {
                 try {
-                  const prefillTokens: number[] = yield* call(() =>
-                    buildToolResultDelta(ctx, JSON.stringify({ error: errorMsg }), callId)
-                  );
+                  const prefillTokens = buildToolResultDelta(ctx, JSON.stringify({ error: errorMsg }), callId);
                   settledBuffer.push({ agentId: a.id, prefillTokens, toolName: tc.name });
                 } finally {
                   pendingToolCount--;
@@ -407,7 +457,7 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
             a.state = 'done';
             a.toolCallCount++;
             totalToolCalls++;
-            yield* events.send({ type: 'agent:tool_call', agentId: a.id, tool: 'report', args: tc.arguments });
+            yield* events.send({ type: 'agent:tool_call', agentId: a.id, tool: tc.name, args: tc.arguments });
             yield* events.send({ type: 'agent:report', agentId: a.id, findings: a.findings! });
             yield* events.send({ type: 'agent:done', agentId: a.id });
             continue;
@@ -444,17 +494,42 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
       // -- Phase 3: SETTLE -- drain settled tool buffer, batch prefill
       const settled = settledBuffer.splice(0);
       if (settled.length > 0) {
+        // Fresh snapshot — Phase 2 commits may have advanced positions
+        const settlePressure = new ContextPressure(ctx, pressureOpts);
+        let headroom = settlePressure.headroom;
+
+        if (trace) {
+          const p = ctx._storeKvPressure();
+          const items = settled.map(s => `${s.toolName}:${s.prefillTokens.length}`).join(', ');
+          try { process.stderr.write(`[SETTLE] remaining=${p.remaining} headroom=${headroom} cellsUsed=${p.cellsUsed} nCtx=${p.nCtx} items=[${items}]\n`); } catch {}
+        }
+
         const prefillPairs: [Branch, number[]][] = [];
         const settledAgents: AgentInternal[] = [];
 
         for (const item of settled) {
           const a = agentById.get(item.agentId);
           if (!a || a.state === 'done') continue;
+
+          if (item.prefillTokens.length > headroom) {
+            if (trace) {
+              try { process.stderr.write(`[SETTLE] REJECT ${item.toolName}:${item.prefillTokens.length} > headroom=${headroom}\n`); } catch {}
+            }
+            a.state = 'done';
+            yield* events.send({ type: 'agent:done', agentId: a.id });
+            continue;
+          }
+
           prefillPairs.push([a.branch, item.prefillTokens]);
           settledAgents.push(a);
+          headroom -= item.prefillTokens.length;
         }
 
         if (prefillPairs.length > 0) {
+          if (trace) {
+            const totalPrefill = prefillPairs.reduce((s, [, t]) => s + t.length, 0);
+            try { process.stderr.write(`[SETTLE] PREFILL ${prefillPairs.length} branches, ${totalPrefill} tokens, headroom_after=${headroom}\n`); } catch {}
+          }
           yield* call(() => store.prefill(prefillPairs));
           counters.warmPrefillCalls++;
           counters.warmPrefillBranches += prefillPairs.length;
@@ -486,17 +561,20 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
     }
 
     // ── Provide result — suspends, branches stay alive ───────
+    // Branch cleanup is handled by each branch's ensure() from setupAgent —
+    // when this resource's scope exits, all ensure() callbacks fire.
     const result: AgentPoolResult = {
       agents: agents.map(a => ({
-        agentId: a.id,
-        branch: a.branch,
-        findings: a.findings,
-        toolCallCount: a.toolCallCount,
-        tokenCount: a.tokenCount,
-        ppl: a.branch.perplexity,
-        samplingPpl: a.branch.samplingPerplexity,
-        trace: trace ? a.traceBuffer : undefined,
-      })),
+          agentId: a.id,
+          parentAgentId: a.parentId,
+          branch: a.branch,
+          findings: a.findings,
+          toolCallCount: a.toolCallCount,
+          tokenCount: a.tokenCount,
+          ppl: a.branch.perplexity,
+          samplingPpl: a.branch.samplingPerplexity,
+          trace: trace ? a.traceBuffer : undefined,
+        })),
       totalTokens: agents.reduce((s, a) => s + a.tokenCount, 0),
       totalToolCalls,
       steps,
@@ -504,14 +582,5 @@ export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult>
     };
 
     yield* provide(result);
-
-    } finally {
-      // Structured cleanup: prune all agent branches when scope exits.
-      // Covers setup errors, tick loop errors, and normal scope teardown
-      // (provide() suspends via yield* suspend(), halting jumps to finally).
-      for (const a of agents) {
-        yield* call(() => a.branch.prune());
-      }
-    }
   });
 }
diff --git a/src/agents/deltas.ts b/src/agents/deltas.ts
index ba9d055..baf12d0 100644
--- a/src/agents/deltas.ts
+++ b/src/agents/deltas.ts
@@ -3,7 +3,7 @@ import type { SessionContext } from '../types';
 /**
  * Build a token delta for a user turn
  *
- * Composes `getTurnSeparator()` + `formatChat()` + `tokenize()` into a
+ * Composes `getTurnSeparator()` + `formatChatSync()` + `tokenizeSync()` into a
  * single token array suitable for `branch.prefill()`. Usable with any
  * branch — not tied to {@link Session}'s trunk.
  *
@@ -17,25 +17,25 @@ import type { SessionContext } from '../types';
  *
  * @category Agents
  */
-export async function buildUserDelta(
+export function buildUserDelta(
   ctx: SessionContext,
   content: string,
   opts: { tools?: string } = {}
-): Promise<number[]> {
+): number[] {
   const sep = ctx.getTurnSeparator();
   const fmtOpts = opts.tools ? { tools: opts.tools } : {};
-  const { prompt } = await ctx.formatChat(
+  const { prompt } = ctx.formatChatSync(
     JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]),
     fmtOpts
   );
-  const delta = await ctx.tokenize(prompt, false);
+  const delta = ctx.tokenizeSync(prompt, false);
   return [...sep, ...delta];
 }
 
 /**
  * Build a token delta for a tool result turn
  *
- * Composes `getTurnSeparator()` + `formatChat()` + `tokenize()` into a
+ * Composes `getTurnSeparator()` + `formatChatSync()` + `tokenizeSync()` into a
  * single token array suitable for `branch.prefill()`. Used by
  * {@link useAgentPool} to inject tool results back into agent context.
  *
@@ -46,18 +46,18 @@ export async function buildUserDelta(
  *
  * @category Agents
  */
-export async function buildToolResultDelta(
+export function buildToolResultDelta(
   ctx: SessionContext,
   resultStr: string,
   callId: string
-): Promise<number[]> {
+): number[] {
   const sep = ctx.getTurnSeparator();
-  const { prompt } = await ctx.formatChat(
+  const { prompt } = ctx.formatChatSync(
     JSON.stringify([
       { role: 'system', content: '' },
       { role: 'tool', content: resultStr, tool_call_id: callId },
     ])
   );
-  const delta = await ctx.tokenize(prompt, false);
+  const delta = ctx.tokenizeSync(prompt, false);
   return [...sep, ...delta];
 }
diff --git a/src/agents/diverge.ts b/src/agents/diverge.ts
index c383719..ed1be3e 100644
--- a/src/agents/diverge.ts
+++ b/src/agents/diverge.ts
@@ -1,7 +1,8 @@
-import { call } from 'effection';
+import { call, ensure } from 'effection';
 import type { Operation } from 'effection';
 import { Branch } from '../Branch';
 import { Ctx, Store } from './context';
+import { ContextPressure } from './agent-pool';
 import type { DivergeOptions, DivergeResult, DivergeAttempt } from './types';
 
 /**
@@ -17,9 +18,9 @@ import type { DivergeOptions, DivergeResult, DivergeAttempt } from './types';
  * are pruned. The caller owns the winning branch's lifecycle, typically
  * via {@link Session.promote}.
  *
- * Error-path cleanup: if generation throws, all forked branches are
- * pruned. If a fresh root was created (`opts.prompt` without `opts.parent`),
- * the root is also pruned if it has no surviving children.
+ * Cleanup is structured: each forked branch registers an `ensure()` callback
+ * that prunes it on scope exit unless `disposed` is already set. Losers are
+ * pruned eagerly after selection; the winner gets no special marking here.
  *
  * @param opts - Diverge options specifying parent or prompt, attempt count,
  *   and sampling parameters
@@ -53,81 +54,92 @@ export function* diverge(opts: DivergeOptions): Operation<DivergeResult> {
     prefixLength = root.position;
   } else {
     if (!opts.prompt) throw new Error('diverge() requires either opts.parent or opts.prompt');
-    const tokens: number[] = yield* call(() => ctx.tokenize(opts.prompt!));
+    const tokens = ctx.tokenizeSync(opts.prompt);
     root = Branch.create(ctx, 0, opts.params ?? {});
     yield* call(() => root.prefill(tokens));
     prefixLength = tokens.length;
     ownRoot = true;
+    // If we created the root, ensure it's cleaned up
+    yield* ensure(() => {
+      if (ownRoot && !root.disposed) {
+        try { root.pruneSync(); } catch { /* children may remain */ }
+      }
+    });
   }
 
   const live: { branch: Branch; output: string; done: boolean; tokenCount: number; ppl: number }[] = [];
 
-  try {
-    for (let i = 0; i < opts.attempts; i++) {
-      const branch: Branch = yield* call(() => root.fork());
-      branch.reseedSampler(2000 + i);
-      live.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity });
-    }
-
-    // Batched generation — produceSync/commit loop
-    let steps = 0;
-    for (;;) {
-      const entries: [Branch, number][] = [];
-      for (const a of live) {
-        if (a.done) continue;
-        const { token, text, isStop } = a.branch.produceSync();
-        if (isStop) {
-          const p = a.branch.perplexity;
-          a.ppl = Number.isFinite(p) ? p : Infinity;
-          a.done = true;
-          continue;
-        }
-        entries.push([a.branch, token]);
-        a.output += text;
-        a.tokenCount++;
+  for (let i = 0; i < opts.attempts; i++) {
+    const branch = root.forkSync();
+    // Each forked branch gets its own ensure() for structured cleanup
+    yield* ensure(() => {
+      if (!branch.disposed) {
+        try { branch.pruneSync(); } catch { /* already gone */ }
       }
-      if (entries.length === 0) break;
-      yield* call(() => store.commit(entries));
-      steps++;
-    }
-
-    // Select by lowest perplexity (most coherent)
-    const bestIdx = live.reduce((bi, a, i) => a.ppl <= live[bi].ppl ? i : bi, 0);
+    });
+    branch.reseedSampler(2000 + i);
+    live.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity });
+  }
 
-    // Prune losers — winner stays alive as caller's result.
-    for (let i = 0; i < live.length; i++) {
-      if (i !== bestIdx && !live[i].branch.disposed) {
-        yield* call(() => live[i].branch.prune());
-      }
+  // Batched generation — produceSync/commit loop
+  let steps = 0;
+  for (;;) {
+    const pressure = new ContextPressure(ctx);
+    if (pressure.critical) {
+      for (const a of live) { if (!a.done) a.done = true; }
+      break;
     }
 
-    const totalTokens = live.reduce((s, a) => s + a.tokenCount, 0);
-    const attempts: DivergeAttempt[] = live.map(a => ({
-      branch: a.branch,
-      output: a.output,
-      tokenCount: a.tokenCount,
-      ppl: a.ppl,
-    }));
-
-    return {
-      best: live[bestIdx].branch,
-      bestOutput: live[bestIdx].output,
-      attempts,
-      totalTokens,
-      steps,
-      prefixLength,
-    };
-  } catch (err) {
-    // Error path: prune all forked branches, then re-throw.
+    const entries: [Branch, number][] = [];
     for (const a of live) {
-      if (!a.branch.disposed) {
-        try { yield* call(() => a.branch.prune()); } catch { /* already gone */ }
+      if (a.done) continue;
+      const { token, text, isStop } = a.branch.produceSync();
+      if (isStop) {
+        const p = a.branch.perplexity;
+        a.ppl = Number.isFinite(p) ? p : Infinity;
+        a.done = true;
+        continue;
       }
+      entries.push([a.branch, token]);
+      a.output += text;
+      a.tokenCount++;
     }
-    // If we created the root and it has no surviving children, clean it up too.
-    if (ownRoot && !root.disposed) {
-      try { yield* call(() => root.prune()); } catch { /* children may remain */ }
+    if (entries.length === 0) break;
+    yield* call(() => store.commit(entries));
+    steps++;
+  }
+
+  // Select by lowest perplexity (most coherent)
+  const bestIdx = live.reduce((bi, a, i) => a.ppl <= live[bi].ppl ? i : bi, 0);
+
+  // Prune losers now — winner stays alive as caller's result.
+  // ensure() will be a no-op for these since they're already disposed.
+  for (let i = 0; i < live.length; i++) {
+    if (i !== bestIdx && !live[i].branch.disposed) {
+      live[i].branch.pruneSync();
     }
-    throw err;
   }
+
+  // If we created root and it's no longer needed, prune it now.
+  // (ensure() will be a no-op since it checks disposed)
+  if (ownRoot && !root.disposed && root.children.length === 0) {
+    root.pruneSync();
+  }
+
+  const totalTokens = live.reduce((s, a) => s + a.tokenCount, 0);
+  const attempts: DivergeAttempt[] = live.map(a => ({
+    branch: a.branch,
+    output: a.output,
+    tokenCount: a.tokenCount,
+    ppl: a.ppl,
+  }));
+
+  return {
+    best: live[bestIdx].branch,
+    bestOutput: live[bestIdx].output,
+    attempts,
+    totalTokens,
+    steps,
+    prefixLength,
+  };
 }
diff --git a/src/agents/generate.ts b/src/agents/generate.ts
index 4178e0a..4a37c66 100644
--- a/src/agents/generate.ts
+++ b/src/agents/generate.ts
@@ -37,7 +37,7 @@ export function* generate<T = unknown>(opts: GenerateOptions): Operation<Generat
   const branch = Branch.create(ctx, 0, samplerParams, undefined, opts.grammar);
 
   try {
-    const tokens: number[] = yield* call(() => ctx.tokenize(opts.prompt));
+    const tokens = ctx.tokenizeSync(opts.prompt);
     yield* call(() => branch.prefill(tokens));
 
     // Consume async iterator inside call() — generators can't use for-await
@@ -54,6 +54,6 @@ export function* generate<T = unknown>(opts: GenerateOptions): Operation<Generat
     const parsed = opts.parse ? opts.parse(output) as T : undefined;
     return { output, tokenCount, parsed };
   } finally {
-    if (!branch.disposed) yield* call(() => branch.prune());
+    if (!branch.disposed) branch.pruneSync();
   }
 }
diff --git a/src/agents/index.ts b/src/agents/index.ts
index a57d259..6d5b889 100644
--- a/src/agents/index.ts
+++ b/src/agents/index.ts
@@ -3,7 +3,7 @@ export { Tool } from './Tool';
 export { buildUserDelta, buildToolResultDelta } from './deltas';
 export { generate } from './generate';
 export { diverge } from './diverge';
-export { useAgentPool } from './agent-pool';
+export { useAgentPool, ContextPressure } from './agent-pool';
 export { runAgents } from './run-agents';
 export { createToolkit } from './toolkit';
 export { initAgents } from './init';
@@ -18,6 +18,7 @@ export type {
   JsonSchema,
   ToolSchema,
   ToolContext,
+  PressureThresholds,
   AgentTaskSpec,
   AgentPoolOptions,
   AgentResult,
diff --git a/src/agents/shared-root.ts b/src/agents/shared-root.ts
index a1e4987..101958a 100644
--- a/src/agents/shared-root.ts
+++ b/src/agents/shared-root.ts
@@ -66,8 +66,8 @@ export function* withSharedRoot<T>(
   const fmtOpts = opts.tools
     ? { tools: opts.tools, addGenerationPrompt: false }
     : { addGenerationPrompt: false };
-  const fmt = yield* call(() => ctx.formatChat(JSON.stringify(messages), fmtOpts));
-  const sharedTokens: number[] = yield* call(() => ctx.tokenize(fmt.prompt));
+  const fmt = ctx.formatChatSync(JSON.stringify(messages), fmtOpts);
+  const sharedTokens = ctx.tokenizeSync(fmt.prompt);
 
   const root = Branch.create(ctx, 0, opts.params ?? { temperature: 0.5 });
   yield* call(() => root.prefill(sharedTokens));
@@ -75,8 +75,6 @@ export function* withSharedRoot<T>(
   try {
     return yield* body(root, sharedTokens.length);
   } finally {
-    if (!root.disposed) {
-      yield* call(() => root.prune());
-    }
+    if (!root.disposed) root.pruneSubtreeSync();
   }
 }
diff --git a/src/agents/toolkit.ts b/src/agents/toolkit.ts
index f15faf1..86bcf0c 100644
--- a/src/agents/toolkit.ts
+++ b/src/agents/toolkit.ts
@@ -32,7 +32,6 @@ export interface Toolkit {
  *   new ReadFileTool(resources),
  *   new GrepTool(resources),
  * ]);
- * // report tool schema is auto-injected by useAgentPool()
  * ```
  *
  * @category Agents
diff --git a/src/agents/types.ts b/src/agents/types.ts
index 021c425..df8c468 100644
--- a/src/agents/types.ts
+++ b/src/agents/types.ts
@@ -125,6 +125,48 @@ export interface SamplingParams {
   [key: string]: unknown;
 }
 
+/**
+ * KV pressure thresholds controlling agent shutdown under context exhaustion
+ *
+ * Two thresholds govern what happens as remaining KV shrinks:
+ *
+ * **softLimit** (default 1024) — remaining KV floor for new work.
+ * Enforced at three points:
+ * - **SETTLE**: tool results that would cross this floor are rejected and
+ *   the agent is marked done. This is the primary enforcement point — tool
+ *   results (search results, etc.) are the largest KV consumers.
+ * - **PRODUCE (stop-token boundary)**: agents that want a non-terminal tool
+ *   call are hard-cut. Terminal tools (e.g. `report()`) still pass.
+ * - **INIT prefill**: agents that don't fit above this floor are dropped.
+ *
+ * Set to account for downstream pool needs (reporters, verification).
+ *
+ * **hardLimit** (default 128) — crash-prevention floor.
+ * When remaining drops below this, agents are killed immediately before
+ * `produceSync()`. Prevents `llama_decode` "no memory slot" failures.
+ * Pure safety net — should never be the primary budget control.
+ *
+ * @category Agents
+ */
+export interface PressureThresholds {
+  /**
+   * Remaining KV floor for new work (tokens). When remaining drops below
+   * this, SETTLE rejects tool results, PRODUCE hard-cuts non-terminal tool
+   * calls, and INIT drops agents that don't fit.
+   *
+   * Set to account for downstream pool needs (reporters, verification).
+   * Default: 1024
+   */
+  softLimit?: number;
+  /**
+   * Crash-prevention floor (tokens). When remaining drops below this,
+   * agents are killed immediately before `produceSync()`. Prevents
+   * `llama_decode` "no memory slot for batch" failures.
+   * Default: 128
+   */
+  hardLimit?: number;
+}
+
 /**
  * Configuration for {@link useAgentPool} and {@link runAgents}
  *
@@ -133,20 +175,35 @@ export interface SamplingParams {
 export interface AgentPoolOptions {
   /** Agent task specifications — one per concurrent agent */
   tasks: AgentTaskSpec[];
-  /** Tool registry mapping tool names to {@link Tool} instances */
+  /**
+   * Tool registry mapping tool names to {@link Tool} instances.
+   *
+   * This is the **execution registry** — it determines which tools can be
+   * dispatched at runtime. It is distinct from the per-task `task.tools`
+   * JSON schema that tells the model which tools are available.
+   *
+   * The registry also controls {@link AgentPoolOptions.terminalTool | terminalTool}
+   * gating: if the registry contains only the terminal tool, agents are
+   * allowed to call it as their first action (e.g. reporter sub-agents).
+   * If the registry contains other tools, the first call must be
+   * non-terminal to prevent agents from reporting without doing work.
+   */
   tools: Map<string, import('./Tool').Tool>;
   /** Sampling parameters applied to all agents */
   params?: SamplingParams;
   /** Maximum tool-call turns per agent before forced termination */
   maxTurns?: number;
-  /** Context window size — enables context-pressure detection when set.
-   *  Agents are gracefully stopped when remaining capacity drops below threshold. */
-  nCtx?: number;
-  /** Message injected as a tool error when an agent must stop and report.
-   *  Triggered by context pressure or maxTurns. If omitted, agents are hard-cut. */
-  gracePrompt?: string;
+  /** Tool name that signals agent completion. When the model calls this tool,
+   *  findings are extracted from arguments and the agent is marked done.
+   *  The tool is intercepted — never dispatched to execute(). If omitted,
+   *  agents complete only via stop token or hard-cut. */
+  terminalTool?: string;
   /** Enable per-token entropy/surprisal on `agent:produce` events */
   trace?: boolean;
+  /** KV pressure thresholds — tune per pool. Reporter pools typically use
+   *  lower thresholds than research pools since they complete in a single
+   *  terminal tool call. See {@link PressureThresholds} for tuning guidance. */
+  pressure?: PressureThresholds;
 }
 
 /**
@@ -157,9 +214,11 @@ export interface AgentPoolOptions {
 export interface AgentResult {
   /** Stable agent identifier (branch handle at creation time) */
   agentId: number;
+  /** Parent branch handle — shared root for top-level agents, parent agentId for sub-agents */
+  parentAgentId: number;
   /** The agent's branch — still alive when returned from {@link useAgentPool} */
   branch: Branch;
-  /** Agent's research findings (from `report` tool or final output), or null */
+  /** Agent's research findings (from terminal tool or final output), or null */
   findings: string | null;
   /** Number of tool calls the agent made */
   toolCallCount: number;
@@ -310,6 +369,7 @@ export interface DivergeResult {
  * @category Agents
  */
 export type AgentEvent =
+  | { type: 'agent:spawn'; agentId: number; parentAgentId: number }
   | { type: 'agent:produce'; agentId: number; text: string; tokenCount: number; entropy?: number; surprisal?: number }
   | { type: 'agent:tool_call'; agentId: number; tool: string; args: string }
   | { type: 'agent:tool_result'; agentId: number; tool: string; result: string }
diff --git a/src/types.ts b/src/types.ts
index 9d0268f..aa97a19 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -776,6 +776,20 @@ export interface SessionContext {
    */
   tokenize(text: string, addSpecial?: boolean): Promise<number[]>;
 
+  /**
+   * Tokenize text into model's vocabulary (sync — inline on main thread)
+   *
+   * Same as {@link tokenize} but synchronous. Use from Effection generators
+   * to avoid `yield* call()` overhead for CPU-only work.
+   *
+   * @param text Text to tokenize
+   * @param addSpecial Whether to add special tokens (BOS/EOS). Defaults to
+   *   model metadata setting (typically true). Pass false for mid-sequence
+   *   tokenization.
+   * @returns Array of token IDs
+   */
+  tokenizeSync(text: string, addSpecial?: boolean): number[];
+
   /**
    * Detokenize array of tokens back to text
    *
@@ -1081,6 +1095,21 @@ export interface SessionContext {
     options?: FormatChatOptions | string
   ): Promise<FormattedChatResult>;
 
+  /**
+   * Format messages using model's chat template (sync — inline on main thread)
+   *
+   * Same as {@link formatChat} but synchronous. Use from Effection generators
+   * to avoid `yield* call()` overhead for CPU-only work.
+   *
+   * @param messagesJson JSON string containing array of messages
+   * @param options Formatting options (tools, reasoning, grammar, etc.)
+   * @returns Formatted prompt with format-awareness metadata
+   */
+  formatChatSync(
+    messagesJson: string,
+    options?: FormatChatOptions | string
+  ): FormattedChatResult;
+
   /**
    * Parse model output into structured content
    *
@@ -1201,6 +1230,17 @@ export interface SessionContext {
    */
   jsonSchemaToGrammar(schemaJson: string): Promise<string>;
 
+  /**
+   * Convert JSON schema to GBNF grammar (sync — inline on main thread)
+   *
+   * Same as {@link jsonSchemaToGrammar} but synchronous. Use from Effection
+   * generators to avoid `yield* call()` overhead for CPU-only work.
+   *
+   * @param schemaJson JSON schema string
+   * @returns GBNF grammar string
+   */
+  jsonSchemaToGrammarSync(schemaJson: string): string;
+
   /**
    * Validate chat template syntax
    *
@@ -1422,6 +1462,10 @@ export interface SessionContext {
   /** @internal */
   _storeAvailable(): number;
 
+  /** @internal — KV cache pressure snapshot from native BranchStore.
+   *  `cellsUsed` is a monotonic counter reset on drain/retainOnly. */
+  _storeKvPressure(): { nCtx: number; cellsUsed: number; remaining: number };
+
   // ===== SCORING API =====
 
   /** @internal — processes ≤ n_seq_max prompts in a single group */

From 8f0cd427fec95d0bc7c5f0744f93e03ac9673d9e Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 5 Mar 2026 03:02:42 +1100
Subject: [PATCH 15/17] feat(agents): bump liblloyal

---
 liblloyal | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/liblloyal b/liblloyal
index 1d2bec3..4082e2e 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit 1d2bec35eece54a61780189e775b03e6b293eb24
+Subproject commit 4082e2eab6618b800753d462e8ad3773541fb5f3

From d3222614ca175d5a960e0164ffd7294a47655638 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 5 Mar 2026 03:18:49 +1100
Subject: [PATCH 16/17] feat(agents): address PR feedback

---
 .github/workflows/docs.yml |  3 +--
 README.md                  |  2 +-
 test/examples.ts           | 14 +++++++-------
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index aff706f..cdefd99 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -4,8 +4,7 @@ on:
   push:
     branches: [main]
     paths:
-      - 'src/types.ts'
-      - 'src/Branch.ts'
+      - 'src/**'
       - 'package.json'
       - 'typedoc.json'
       - 'README.md'
diff --git a/README.md b/README.md
index a455bc5..a1783d3 100644
--- a/README.md
+++ b/README.md
@@ -218,7 +218,7 @@ See [`examples/grammar/`](./examples/grammar/) for the full branch fork pattern.
 
 Full API documentation: **[lloyal-ai.github.io/lloyal.node](https://lloyal-ai.github.io/lloyal.node/)**
 
-Generated from [`src/types.ts`](./src/types.ts) with TypeDoc.
+Generated from [`src/index.ts`](./src/index.ts) with TypeDoc.
 
 ---
 
diff --git a/test/examples.ts b/test/examples.ts
index 57a8997..3005fec 100644
--- a/test/examples.ts
+++ b/test/examples.ts
@@ -79,16 +79,16 @@ function runExample(scriptPath: string, timeout: number = 600000, extraArgs: str
       stdio: ['ignore', 'pipe', 'pipe'],
     });
 
+    let buf = '';
     child.stdout!.on('data', (data: Buffer) => {
-      const lines: string[] = data.toString().split('\n');
-      for (const line of lines) {
+      buf += data.toString();
+      const parts = buf.split('\n');
+      buf = parts.pop()!; // carry partial line forward
+      for (const line of parts) {
         if (line.startsWith('{')) {
           try {
-            const event: ExampleEvent = JSON.parse(line);
-            events.push(event);
-          } catch {
-            // Ignore malformed JSON
-          }
+            events.push(JSON.parse(line));
+          } catch { /* malformed */ }
         }
       }
     });

From 188d4f10c2bf894d966e4630838cba6594217d7c Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 5 Mar 2026 11:21:47 +1100
Subject: [PATCH 17/17] feat(test): support TS tests

---
 .github/workflows/gpu-test.yml | 12 ++++++++++++
 .gitignore                     |  3 +++
 package.json                   |  1 +
 test/agents.ts                 |  4 ++--
 tsconfig.test.json             | 14 ++++++++++++++
 5 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 tsconfig.test.json

diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
index 56e7192..f165c51 100644
--- a/.github/workflows/gpu-test.yml
+++ b/.github/workflows/gpu-test.yml
@@ -107,6 +107,18 @@ jobs:
       - name: Configure Docker for Artifact Registry
         run: gcloud auth configure-docker ${{ secrets.GCP_REGION }}-docker.pkg.dev --quiet
 
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 24
+          cache: 'npm'
+
+      - name: Compile TypeScript (src + tests)
+        run: |
+          npm ci --ignore-scripts
+          npm run build:ts
+          npm run build:test
+
       - name: Download package artifact
         uses: actions/download-artifact@v4
         with:
diff --git a/.gitignore b/.gitignore
index 1ec8cb6..4deb625 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,5 +41,8 @@ tmp/
 dist/
 packages/darwin-arm64
 
+# Compiled test artifacts (built by tsconfig.test.json for GPU CI)
+test/*.js
+
 # CI infra scripts (injected from lloyal-infra during CI)
 ci/
\ No newline at end of file
diff --git a/package.json b/package.json
index 88583af..8cd4138 100644
--- a/package.json
+++ b/package.json
@@ -12,6 +12,7 @@
     "download-models": "bash scripts/download-test-models.sh",
     "build:native": "node scripts/build.js",
     "build:ts": "tsc",
+    "build:test": "tsc -p tsconfig.test.json",
     "build": "npm run build:ts && npm run build:native",
     "build:debug": "cmake-js compile --debug",
     "rebuild": "cmake-js rebuild",
diff --git a/test/agents.ts b/test/agents.ts
index cf52d5e..5e5dfd8 100644
--- a/test/agents.ts
+++ b/test/agents.ts
@@ -12,7 +12,7 @@
 import * as path from 'node:path';
 import * as fs from 'node:fs';
 import { run, call, spawn, ensure, each } from 'effection';
-import { loadBinary } from '../dist/index.js';
+import { loadBinary, Branch } from '../dist/index.js';
 import type { SessionContext, NativeBinding } from '../dist/index.js';
 import {
   initAgents, runAgents, withSharedRoot, Tool,
@@ -86,7 +86,7 @@ async function createTestContext(): Promise<SessionContext> {
   });
 }
 
-function makeTasks(parent: unknown, count: number) {
+function makeTasks(parent: Branch, count: number) {
   return Array.from({ length: count }, (_, i) => ({
     systemPrompt: 'You are a test agent.',
     content: `Test task ${i}`,
diff --git a/tsconfig.test.json b/tsconfig.test.json
new file mode 100644
index 0000000..7fd54b7
--- /dev/null
+++ b/tsconfig.test.json
@@ -0,0 +1,14 @@
+{
+  "extends": "./tsconfig.json",
+  "compilerOptions": {
+    "rootDir": ".",
+    "outDir": ".",
+    "declaration": false,
+    "declarationMap": false,
+    "sourceMap": false,
+    "skipLibCheck": true,
+    "noEmitOnError": false
+  },
+  "include": ["test/**/*.ts"],
+  "exclude": ["node_modules", "dist", "build"]
+}