diff --git a/README.md b/README.md
index a1783d3..35df831 100644
--- a/README.md
+++ b/README.md
@@ -6,67 +6,59 @@
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
 [![llama.cpp](https://img.shields.io/badge/llama.cpp-b8087-green.svg)](https://github.com/ggml-org/llama.cpp/releases/tag/b8087)
 
-**Covalent Inference for Node.js**
+**Native backend for the lloyal inference platform.**
 
-Composable inference primitives for forkable decode state, shared-prefix KV branching, and continuous tree batching. Branches share a KV prefix while keeping independent machinery — sampler chain, grammar, logits snapshot, perplexity tracker — for controlled divergence at decode time. `BranchStore` packs tokens from N branches (each at a different position, different seq_id, each needing independent logits captured) into a single `llama_batch` and dispatches once. `kv::tenancy` manages seq_id leases automatically — acquired on `create()`/`fork()`, evicted on `prune()`, rebuilt on `retainOnly()`.
+Prebuilt llama.cpp binaries for 13 platform/GPU combinations, exposing a `SessionContext` that powers the [`@lloyal-labs/sdk`](https://github.com/lloyal-ai/sdk) inference primitives (Branch, BranchStore, Session, Rerank) and [`@lloyal-labs/lloyal-agents`](https://github.com/lloyal-ai/sdk/tree/main/packages/agents) multi-agent framework. Built on [liblloyal](https://github.com/lloyal-ai/liblloyal), a header-only C++20 inference kernel for llama.cpp.
 
-Built on [liblloyal](https://github.com/lloyal-ai/liblloyal), a header-only C++20 inference kernel for llama.cpp.
+All SDK and agent exports are re-exported from this package for convenience — `import { Branch, runAgents } from "@lloyal-labs/lloyal.node"` works out of the box.
 
-## The Branch API
+## Install
+
+```bash
+npm install @lloyal-labs/lloyal.node
+```
+
+Prebuilt binaries ship for 13 platform/GPU combinations. The GPU variant is selected at runtime, not at install time.
+
+| Platform | Arch  | Acceleration        |
+| -------- | ----- | ------------------- |
+| macOS    | arm64 | Metal               |
+| macOS    | x64   | CPU                 |
+| Linux    | x64   | CPU / CUDA / Vulkan |
+| Linux    | arm64 | CPU / CUDA / Vulkan |
+| Windows  | x64   | CPU / CUDA / Vulkan |
+| Windows  | arm64 | CPU / Vulkan        |
+
+## Quick Start
 
 ```javascript
-import { createContext, Branch, BranchStore } from "@lloyal-labs/lloyal.node";
+import { createContext } from "@lloyal-labs/lloyal.node";
+import { Branch, BranchStore } from "@lloyal-labs/sdk";
 
-const ctx = await createContext({ modelPath: "./model.gguf", nSeqMax: 6 });
+const ctx = await createContext({ modelPath: "./model.gguf", nSeqMax: 4 });
 const store = new BranchStore(ctx);
 
-// Shared prompt: "Explain quantum entanglement"
-const prompt = await ctx.tokenize("Explain quantum entanglement");
-
 const root = Branch.create(ctx, 0, { temperature: 0.8 });
-await root.prefill(prompt);
-
-// Fork 4 branches — each gets a different reasoning prefix
-const analogy  = await root.fork();
-const formal   = await root.fork();
-const socratic = await root.fork();
-const visual   = await root.fork();
-
-// Scatter-prefill: inject divergent prefixes in one batched dispatch
-// 4 branches × variable lengths → auto bin-packed into minimal GPU calls
-await store.prefill([
-  [analogy,  await ctx.tokenize("Think of it like two coins...")],    // 12 tokens
-  [formal,   await ctx.tokenize("In quantum mechanics, the...")],     // 8 tokens
-  [socratic, await ctx.tokenize("What happens when you measure...")], // 10 tokens
-  [visual,   await ctx.tokenize("Imagine two particles...")],         // 7 tokens
-]);
-
-// Generate — all 4 in lockstep, 1 GPU call per step
-const branches = [analogy, formal, socratic, visual];
+await root.prefill(await ctx.tokenize("Explain quantum entanglement"));
+
+// Fork and generate — all branches in lockstep, 1 GPU call per step
+const branches = await Promise.all([root.fork(), root.fork(), root.fork()]);
 for (;;) {
-  const live = branches.filter(b => !b.disposed);
+  const live = branches.filter((b) => !b.disposed);
   if (!live.length) break;
-  const produced = live.map(b => ({ b, ...b.produce() }));
-
-  // Prune branches that hit stop tokens
-  for (const p of produced.filter(p => p.isStop)) await p.b.prune();
-
-  // Commit survivors — accept + decode in one GPU dispatch
+  const produced = live.map((b) => ({ b, ...b.produce() }));
+  for (const p of produced.filter((p) => p.isStop)) await p.b.prune();
   const items = produced
-    .filter(p => !p.isStop)
-    .map(p => { p.b.accept(p.token); return [p.b, p.token]; });
+    .filter((p) => !p.isStop)
+    .map((p) => {
+      p.b.accept(p.token);
+      return [p.b, p.token];
+    });
   await store.commit(items);
 }
-
-// Winner takes all — one seq_keep pass, losers vaporized
-const winner = branches
-  .filter(b => !b.disposed)
-  .reduce((a, b) => (a.perplexity < b.perplexity ? a : b));
-await store.retainOnly(winner);
-// store.available === nSeqMax - 1 — all leases recovered
 ```
 
-Or for single-branch generation, Branch is an async iterable — generate until EOG:
+Or for single-branch generation, Branch is an async iterable:
 
 ```javascript
 for await (const { token, text } of branch) {
@@ -74,162 +66,130 @@ for await (const { token, text } of branch) {
 }
 ```
 
-## Continuous Tree Batching
+See [`@lloyal-labs/sdk`](https://github.com/lloyal-ai/sdk) for the full Branch API, continuous tree batching, KV tenancy, and topology documentation.
 
-Tree search with N branches means N calls to `llama_decode()` — each paying GPU dispatch overhead, memory barriers, and PCIe round-trips. `BranchStore` eliminates this: tokens from N branches — each at a different position, different seq_id, each needing independent logits captured — are packed into a single `llama_batch` and dispatched once. N branches, 1 GPU call.
+### Without the SDK
 
-Two packing strategies for different access patterns:
+`createContext` returns a `SessionContext` — the native interface to llama.cpp. You can use it directly without the SDK's Branch/BranchStore layer:
 
 ```javascript
-// commit: 1 token per branch — one GPU dispatch for N branches
-await store.commit([[branch1, tok1], [branch2, tok2], [branch3, tok3]]);
-
-// prefill: variable tokens per branch — asymmetric injection
-await store.prefill([
-  [branchA, systemTokens],  // 200 tokens
-  [branchB, queryTokens],   //  12 tokens
-  [branchC, docTokens],     // 800 tokens
-]);
-// Greedy bin-packed into ceil(total / nBatch) dispatches
+import { createContext } from "@lloyal-labs/lloyal.node";
+
+const ctx = await createContext({ modelPath: "./model.gguf", nSeqMax: 4 });
+
+// Chat templates — model-agnostic formatting + tool calling
+const { prompt, grammar, format } = await ctx.formatChat(messages, {
+  addGenerationPrompt: true,
+  tools: [{ type: "function", function: { name: "search", parameters: schema } }],
+});
+const { content, toolCalls } = await ctx.parseChatOutput(output, format);
+
+// Branch primitives — what the SDK's Branch class wraps
+const handle = ctx._branchCreate(0, samplerParams);
+await ctx._branchPrefill(handle, tokens);
+const token = ctx._branchSample(handle);
+const text = ctx.tokenToText(token);
+const isStop = ctx.isStopToken(token);
+ctx._branchAccept(handle, token);
+const logits = ctx._branchGetLogits(handle);     // Float32Array(vocabSize)
+const entropy = ctx._branchModelEntropy(handle);
+const child = ctx._branchFork(handle);
+
+// Store primitives — what the SDK's BranchStore wraps
+await ctx._storeCommit([handle1, handle2], [tok1, tok2]);  // N branches, 1 GPU call
+await ctx._storePrefill([handle], [tokens]);
+await ctx._storeRetainOnly(winner);
+const available = ctx._storeAvailable();
+
+// KV cache — snapshot, copy, persist
+await ctx.kvSeqCopy(0, 1);                      // share prefix across sequences
+await ctx.kvCacheSave();                         // snapshot for rollback
+await ctx.kvCacheLoad();                         // restore checkpoint
+await ctx.kvCacheWriteFile("cache.bin");         // persist to disk
+
+// Embeddings
+const embeddings = await ctx.encode("query text");
+const dim = ctx.getEmbeddingDimension();
+
+// Grammar + tokenizer
+const schemaGrammar = await ctx.jsonSchemaToGrammar(schema);
+const tokens = await ctx.tokenize("Hello world");
+const sep = await ctx.getTurnSeparator();
 ```
 
-## KV Tenancy
+## What This Package Provides
 
-Two resources, two scales. Slots (65K) are how many branches can *exist* — cheap CPU state. Leases (`nSeqMax`) are how many can *decode* — scarce KV cache residency. Tenancy manages the scarce resource automatically: leases are acquired on `create()`/`fork()`, evicted on `prune()`, rebuilt on `retainOnly()`. No manual seq_id tracking, ever.
+**Native-only (not in the SDK):**
 
-```javascript
-store.available;                   // leases remaining — use for width/depth budget
-await store.retainOnly(winner);    // nuclear: 1 seq_keep, rebuild vacancy
-```
+- `createContext(options)` — load a GGUF model, return a `SessionContext`
+- `loadBinary(options?)` — explicit GPU variant selection with automatic fallback
+- Prebuilt binaries for 13 platform/GPU combinations
 
-The turn lifecycle: search is surgical (N × `prune()`), promotion is nuclear (1 × `retainOnly()`). Per turn, fork → expand → evaluate → prune losers → repeat. Between turns, promote winner → tree is gone → next turn starts fresh.
+**Re-exported from [`@lloyal-labs/sdk`](https://github.com/lloyal-ai/sdk):**
 
-## Topology
+- `Branch`, `BranchStore`, `Session`, `Rerank`
+- Per-token metrics: `modelEntropy()`, `modelSurprisal()`, `samplingPerplexity`
+- Chat formatting: `formatChat()`, `parseChatOutput()`
+- Grammar: `jsonSchemaToGrammar()`, `setGrammar()`
 
-Parent/child edges are always-on. Simple chat → best-of-N → deep search is one continuum.
+**Re-exported from [`@lloyal-labs/lloyal-agents`](https://github.com/lloyal-ai/sdk/tree/main/packages/agents):**
 
-```javascript
-branch.parent;       // handle or null if root
-branch.children;     // child handles
-branch.isLeaf;       // no children?
-branch.isActive;     // holds a KV lease?
-```
+- `runAgents`, `useAgentPool`, `generate`, `diverge`, `createToolkit`
+- Structured concurrency DAG via Effection generators
+- In-loop orchestration: agents as branches of a single running process
 
-| Method | FK analogy | Behavior |
-|--------|-----------|----------|
-| `prune()` | RESTRICT | Throws if children exist |
-| `pruneSubtree()` | CASCADE | Iterative post-order traversal |
+## GPU Variant Selection
 
----
+```javascript
+import { loadBinary, createContext } from "@lloyal-labs/lloyal.node";
 
-## Install
+// Automatic — uses Metal on macOS, CPU elsewhere
+const ctx = await createContext({ modelPath: "./model.gguf" });
 
-```bash
-npm install @lloyal-labs/lloyal.node
+// Explicit CUDA
+const binding = loadBinary({ gpuVariant: "cuda" });
+const cudaCtx = await binding.createContext({ modelPath: "./model.gguf" });
+// Falls back to CPU with a warning if the CUDA runtime is not available
 ```
 
-Prebuilt binaries for 13 platform/GPU combinations. GPU selection at runtime, not install time.
-
-| Platform | Arch  | Acceleration        |
-| -------- | ----- | ------------------- |
-| macOS    | arm64 | Metal               |
-| macOS    | x64   | CPU                 |
-| Linux    | x64   | CPU / CUDA / Vulkan |
-| Linux    | arm64 | CPU / CUDA / Vulkan |
-| Windows  | x64   | CPU / CUDA / Vulkan |
-| Windows  | arm64 | CPU / Vulkan        |
-
-CI integration testing (real inference):
-
-| Architecture | Test Model     | Template |
-| ------------ | -------------- | -------- |
-| Llama        | Llama 3.2 1B   | llama3   |
-| Phi          | Phi 3.5 Mini   | phi3     |
-| Qwen         | Qwen 3 1.7B    | chatml   |
-| Gemma        | Gemma 3 1B     | gemma    |
-| SmolLM       | SmolLM2 1.7B   | chatml   |
-| Ministral    | Ministral 3B   | mistral  |
-
-See [distribution.md](docs/distribution.md) for details.
-
----
-
 ## Examples
 
-| Example                                   | Pattern                                                                    |
-| ----------------------------------------- | -------------------------------------------------------------------------- |
-| [`best-of-n/`](./examples/best-of-n/)     | Branch API: fork, produce/commit, perplexity selection                     |
-| [`speculative/`](./examples/speculative/) | Branch API: draft/verify, fork/prune, bonus token sampling                 |
-| [`streaming/`](./examples/streaming/)     | Infinite context via BlinkKV reseeding with sidecar summarization          |
-| [`entropy/`](./examples/entropy/)         | `modelEntropy()` mid-generation as control signal                          |
-| [`grammar/`](./examples/grammar/)         | Pull loop with generators, JSON schema constraints, KV + grammar branching |
-| [`chat/`](./examples/chat/)               | Interactive streaming chat                                                 |
-| [`embed/`](./examples/embed/)             | Text embeddings extraction                                                 |
+| Example                           | Pattern                                           |
+| --------------------------------- | ------------------------------------------------- |
+| [`entropy/`](./examples/entropy/) | `modelEntropy()` mid-generation as control signal |
+| [`chat/`](./examples/chat/)       | Interactive streaming chat                        |
+| [`embed/`](./examples/embed/)     | Text embeddings extraction                        |
 
 ```bash
-node examples/best-of-n/best-of-n.mjs
-node examples/speculative/speculative.mjs
-```
-
-Each example has a README explaining the pattern.
-
----
-
-## Other Patterns
-
-### Entropy as Control Signal
-
-Model uncertainty mid-generation enables dynamic behavior:
-
-```javascript
-const entropy = ctx.modelEntropy("bits");
-
-if (entropy > 4.0) {
-  // High uncertainty — model is guessing
-  // Trigger retrieval, reduce temperature, or branch
-}
+npx tsx examples/best-of-n/best-of-n.ts
+npx tsx examples/chat/chat.ts ./model.gguf
 ```
 
-See [`examples/entropy/`](./examples/entropy/) for entropy-triggered sampling strategies.
+## CI Testing
 
-### Low-Level KV Operations
+Integration tests run real inference across architectures:
 
-For fine-grained control without Branch:
+| Architecture | Test Model   | Template |
+| ------------ | ------------ | -------- |
+| Llama        | Llama 3.2 1B | llama3   |
+| Phi          | Phi 3.5 Mini | phi3     |
+| Qwen         | Qwen 3 1.7B  | chatml   |
+| Gemma        | Gemma 3 1B   | gemma    |
+| SmolLM       | SmolLM2 1.7B | chatml   |
+| Ministral    | Ministral 3B | mistral  |
 
-| Approach             | Method                            | Use Case                                     |
-| -------------------- | --------------------------------- | -------------------------------------------- |
-| **Sequence copy**    | `kvSeqCopy(src, dst)`             | Share prefix across sequences                |
-| **Snapshot/restore** | `kvCacheSave()` / `kvCacheLoad()` | Sequential exploration, return to checkpoint |
-
-### Grammar-Constrained Generation
-
-```javascript
-const grammar = await ctx.jsonSchemaToGrammar(schema);
-const branch = Branch.create(ctx, 0, params, undefined, grammar);
-await branch.prefill(promptTokens);
-// Grammar state cloned automatically on fork()
-```
-
-See [`examples/grammar/`](./examples/grammar/) for the full branch fork pattern.
-
----
-
-## API Reference
-
-Full API documentation: **[lloyal-ai.github.io/lloyal.node](https://lloyal-ai.github.io/lloyal.node/)**
-
-Generated from [`src/index.ts`](./src/index.ts) with TypeDoc.
-
----
+See [distribution.md](docs/distribution.md) for details.
 
 ## Ecosystem
 
-| Package                                                 | Runtime      | Description                       |
-| ------------------------------------------------------- | ------------ | --------------------------------- |
-| [liblloyal](https://github.com/lloyal-ai/liblloyal)     | C++          | Header-only inference kernel      |
-| **lloyal.node**                                         | Node.js      | This package                      |
-| [nitro-llama](https://github.com/lloyal-ai/nitro-llama) | React Native | Mobile bindings via Nitro Modules |
-| [tsampler](https://github.com/lloyal-ai/tsampler)       | TypeScript   | Reference sampler implementation  |
+| Package                                                                                    | Description                                                                  |
+| ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------- |
+| [`@lloyal-labs/sdk`](https://github.com/lloyal-ai/sdk)                                     | Backend-agnostic inference primitives (Branch, BranchStore, Session, Rerank) |
+| [`@lloyal-labs/lloyal-agents`](https://github.com/lloyal-ai/sdk/tree/main/packages/agents) | Multi-agent framework — in-loop orchestration via structured concurrency     |
+| [liblloyal](https://github.com/lloyal-ai/liblloyal)                                        | Header-only C++20 inference kernel for llama.cpp                             |
+| **lloyal.node**                                                                            | This package — native backend + prebuilt binaries                            |
+| [nitro-llama](https://github.com/lloyal-ai/nitro-llama)                                    | React Native backend via Nitro Modules                                       |
+| [tsampler](https://github.com/lloyal-ai/tsampler)                                          | Reference sampler implementation                                             |
 
 ## Contributing
 
diff --git a/examples/deep-research/agreement.ts b/examples/deep-research/agreement.ts
deleted file mode 100644
index 58380e1..0000000
--- a/examples/deep-research/agreement.ts
+++ /dev/null
@@ -1,142 +0,0 @@
-/**
- * Per-section agreement analysis via bigram Jaccard similarity.
- *
- * Pure string math — no model calls. Used by the verify phase to quantify
- * where N diverge attempts agree (confident) vs disagree (hallucination risk).
- */
-
-export interface SectionAgreement {
-  label: string;       // section header or "¶1", "¶2", etc.
-  score: number;       // 0–1 average pairwise bigram Jaccard
-}
-
-export interface AgreementResult {
-  overall: number;                  // mean of section scores
-  sections: SectionAgreement[];     // per-section breakdown
-}
-
-// ── Internals ─────────────────────────────────────────────────────
-
-interface Section {
-  key: string;    // normalized header for matching, or positional index
-  label: string;  // display label
-  body: string;   // section text
-}
-
-const HEADER_RE = /^#{1,4}\s+/m;
-
-function normalizeKey(header: string): string {
-  return header.toLowerCase().replace(/[^\w\s]/g, '').trim();
-}
-
-function extractSections(text: string): Section[] {
-  const hasHeaders = HEADER_RE.test(text);
-
-  if (hasHeaders) {
-    const parts = text.split(/^(#{1,4}\s+.+)$/m).filter(Boolean);
-    const sections: Section[] = [];
-    for (let i = 0; i < parts.length; i++) {
-      const match = parts[i].match(/^#{1,4}\s+(.+)$/);
-      if (match) {
-        const header = match[1].trim();
-        const body = (parts[i + 1] ?? '').trim();
-        sections.push({ key: normalizeKey(header), label: header, body });
-        i++; // skip body part
-      }
-    }
-    return sections.length ? sections : paragraphSections(text);
-  }
-
-  return paragraphSections(text);
-}
-
-function paragraphSections(text: string): Section[] {
-  return text.split(/\n{2,}/)
-    .map(p => p.trim())
-    .filter(Boolean)
-    .map((body, i) => ({ key: String(i), label: `¶${i + 1}`, body }));
-}
-
-function wordBigrams(text: string): Set<string> {
-  const words = text.split(/\s+/).filter(Boolean);
-  const bigrams = new Set<string>();
-  for (let i = 0; i < words.length - 1; i++) {
-    bigrams.add(`${words[i]} ${words[i + 1]}`);
-  }
-  return bigrams;
-}
-
-function jaccard(a: Set<string>, b: Set<string>): number {
-  if (a.size === 0 && b.size === 0) return 1;
-  let intersection = 0;
-  const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];
-  for (const x of smaller) if (larger.has(x)) intersection++;
-  const union = a.size + b.size - intersection;
-  return union === 0 ? 1 : intersection / union;
-}
-
-function averagePairwiseJaccard(texts: string[]): number {
-  if (texts.length < 2) return 1;
-  const bigramSets = texts.map(wordBigrams);
-  let sum = 0;
-  let pairs = 0;
-  for (let i = 0; i < bigramSets.length; i++) {
-    for (let j = i + 1; j < bigramSets.length; j++) {
-      sum += jaccard(bigramSets[i], bigramSets[j]);
-      pairs++;
-    }
-  }
-  return sum / pairs;
-}
-
-// ── Public API ────────────────────────────────────────────────────
-
-export function computeAgreement(outputs: string[]): AgreementResult {
-  if (outputs.length < 2) return { overall: 1, sections: [] };
-
-  const allSections = outputs.map(extractSections);
-  const hasHeaders = allSections.some(ss => ss.length > 0 && ss[0].key !== '0');
-
-  if (hasHeaders) {
-    // Collect all unique section keys across attempts
-    const keySet = new Map<string, string>(); // key → label (first seen)
-    for (const ss of allSections) {
-      for (const s of ss) {
-        if (!keySet.has(s.key)) keySet.set(s.key, s.label);
-      }
-    }
-
-    const sections: SectionAgreement[] = [...keySet.entries()].map(([key, label]) => {
-      const bodies = allSections
-        .map(ss => ss.find(s => s.key === key)?.body)
-        .filter((b): b is string => b != null && b.length > 0);
-      // Sections present in only one attempt get score 0
-      const score = bodies.length < 2 ? 0 : averagePairwiseJaccard(bodies);
-      return { label, score };
-    });
-
-    const overall = sections.length
-      ? sections.reduce((s, x) => s + x.score, 0) / sections.length
-      : 0;
-
-    return { overall, sections };
-  }
-
-  // Positional matching for headerless content
-  const maxSections = Math.max(...allSections.map(ss => ss.length));
-  const sections: SectionAgreement[] = [];
-
-  for (let i = 0; i < maxSections; i++) {
-    const bodies = allSections
-      .map(ss => ss[i]?.body)
-      .filter((b): b is string => b != null && b.length > 0);
-    const score = bodies.length < 2 ? 0 : averagePairwiseJaccard(bodies);
-    sections.push({ label: `¶${i + 1}`, score });
-  }
-
-  const overall = sections.length
-    ? sections.reduce((s, x) => s + x.score, 0) / sections.length
-    : 0;
-
-  return { overall, sections };
-}
diff --git a/examples/deep-research/harness.ts b/examples/deep-research/harness.ts
deleted file mode 100644
index 8b6f687..0000000
--- a/examples/deep-research/harness.ts
+++ /dev/null
@@ -1,415 +0,0 @@
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import { call, scoped } from 'effection';
-import type { Operation, Channel } from 'effection';
-import { Branch, Session } from '../../dist';
-import type { SessionContext } from '../../dist';
-import {
-  Ctx,
-  generate, useAgentPool, runAgents, diverge, withSharedRoot,
-} from '../../dist/agents';
-import type { Tool, AgentPoolResult, DivergeResult } from '../../dist/agents';
-import type { WorkflowEvent, OpTiming } from './tui';
-import { computeAgreement } from './agreement';
-import { reportTool } from './tools';
-
-/** Load a task prompt file. Convention: system prompt above `---`, user content below. */
-function loadTask(name: string): { system: string; user: string } {
-  const raw = fs.readFileSync(path.resolve(__dirname, `tasks/${name}.md`), 'utf8').trim();
-  const sep = raw.indexOf('\n---\n');
-  if (sep === -1) return { system: raw, user: '' };
-  return { system: raw.slice(0, sep).trim(), user: raw.slice(sep + 5).trim() };
-}
-
-const PLAN = loadTask('plan');
-const RESEARCH = loadTask('research');
-const VERIFY = loadTask('verify');
-const EVAL = loadTask('eval');
-const REPORT = loadTask('report');
-
-// ── Options ──────────────────────────────────────────────────────
-
-export interface WorkflowOpts {
-  session: Session;
-  toolMap: Map<string, Tool>;
-  toolsJson: string;
-  agentCount: number;
-  verifyCount: number;
-  maxTurns: number;
-  trace: boolean;
-  events: Channel<WorkflowEvent, void>;
-}
-
-// ── Agent task builder ───────────────────────────────────────────
-
-function agentTasks(questions: string[], toolsJson: string, parent: Branch, seed?: number) {
-  return questions.map((q, i) => ({
-    systemPrompt: RESEARCH.system,
-    content: q,
-    tools: toolsJson,
-    parent,
-    seed: seed != null ? seed + i : undefined,
-  }));
-}
-
-const reportOnlyTools = JSON.stringify([reportTool.schema]);
-
-function* reportPass(
-  pool: AgentPoolResult,
-  opts: WorkflowOpts,
-): Operation<void> {
-  const hardCut = pool.agents.filter(a => !a.findings && !a.branch.disposed);
-  if (hardCut.length === 0) return;
-
-  // Free KV from successful agents before spawning reporters
-  for (const a of pool.agents) {
-    if (a.findings && !a.branch.disposed) a.branch.pruneSync();
-  }
-
-  const reporters = yield* runAgents({
-    tasks: hardCut.map(a => ({
-      systemPrompt: REPORT.system,
-      content: REPORT.user,
-      tools: reportOnlyTools,
-      parent: a.branch,
-    })),
-    tools: new Map([['report', reportTool]]),
-    terminalTool: 'report',
-    trace: opts.trace,
-    pressure: { softLimit: 200, hardLimit: 64 },
-  });
-
-  hardCut.forEach((a, i) => {
-    if (reporters.agents[i]?.findings) a.findings = reporters.agents[i].findings;
-  });
-}
-
-// ── Operations ───────────────────────────────────────────────────
-
-function* plan(query: string, opts: WorkflowOpts): Operation<{ questions: string[]; tokenCount: number; timeMs: number }> {
-  const ctx: SessionContext = yield* Ctx.expect();
-  const t = performance.now();
-
-  const schema = {
-    type: 'object',
-    properties: {
-      questions: {
-        type: 'array',
-        items: { type: 'string' },
-        minItems: 2,
-        maxItems: opts.agentCount,
-      },
-    },
-    required: ['questions'],
-  };
-  const grammar: string = yield* call(() => ctx.jsonSchemaToGrammar(JSON.stringify(schema)));
-
-  const userContent = PLAN.user
-    .replace('{{count}}', String(opts.agentCount))
-    .replace('{{query}}', query);
-
-  const messages = [
-    { role: 'system', content: PLAN.system },
-    { role: 'user', content: userContent },
-  ];
-  const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages)));
-
-  let output: string;
-  let tokenCount: number;
-
-  const parent = opts.session.trunk ?? undefined;
-  if (parent) {
-    const lead: Branch = yield* call(() => parent.fork());
-    try {
-      lead.setGrammar(grammar);
-      const sep = ctx.getTurnSeparator();
-      const delta: number[] = yield* call(() => ctx.tokenize(prompt, false));
-      yield* call(() => lead.prefill([...sep, ...delta]));
-
-      ({ output, tokenCount } = yield* call(async () => {
-        let o = '';
-        let tc = 0;
-        for await (const { text } of lead) { o += text; tc++; }
-        return { output: o, tokenCount: tc };
-      }));
-    } finally {
-      yield* call(() => lead.prune());
-    }
-  } else {
-    const result = yield* generate({ prompt, grammar, params: { temperature: 0.3 } });
-    output = result.output;
-    tokenCount = result.tokenCount;
-  }
-
-  let questions: string[];
-  try {
-    questions = JSON.parse(output).questions.slice(0, opts.agentCount);
-    if (!questions.length) throw new Error('empty');
-  } catch {
-    questions = Array.from({ length: opts.agentCount }, (_, i) => `${query} (aspect ${i + 1})`);
-  }
-
-  const timeMs = performance.now() - t;
-  yield* opts.events.send({ type: 'plan', questions, tokenCount, timeMs });
-  return { questions, tokenCount, timeMs };
-}
-
-function* research(
-  questions: string[],
-  opts: WorkflowOpts,
-): Operation<{ pool: AgentPoolResult; sharedPrefixLength: number; timeMs: number }> {
-  yield* opts.events.send({ type: 'research:start', agentCount: questions.length });
-  const t = performance.now();
-
-  const { result: pool, prefixLen: sharedPrefixLength } = yield* withSharedRoot(
-    { systemPrompt: RESEARCH.system, tools: opts.toolsJson },
-    function*(root, prefixLen) {
-      const pool = yield* useAgentPool({
-        tasks: agentTasks(questions, opts.toolsJson, root),
-        tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
-        terminalTool: 'report',
-        pressure: { softLimit: 2048 },
-      });
-
-      yield* reportPass(pool, opts);
-      return { result: pool, prefixLen };
-    },
-  );
-
-  const timeMs = performance.now() - t;
-  yield* opts.events.send({ type: 'research:done', pool, timeMs });
-  return { pool, sharedPrefixLength, timeMs };
-}
-
-function* warmResearch(
-  questions: string[],
-  opts: WorkflowOpts,
-): Operation<{ pool: AgentPoolResult; timeMs: number }> {
-  yield* opts.events.send({ type: 'research:start', agentCount: questions.length });
-  const t = performance.now();
-
-  const pool = yield* scoped(function*() {
-    const pool = yield* useAgentPool({
-      tasks: agentTasks(questions, opts.toolsJson, opts.session.trunk!, Date.now()),
-      tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace,
-      terminalTool: 'report',
-      pressure: { softLimit: 1024 },
-    });
-
-    yield* reportPass(pool, opts);
-    return pool;
-  });
-
-  const timeMs = performance.now() - t;
-  yield* opts.events.send({ type: 'research:done', pool, timeMs });
-  return { pool, timeMs };
-}
-
-function* verify(
-  pool: AgentPoolResult,
-  questions: string[],
-  query: string,
-  opts: WorkflowOpts,
-): Operation<{ result: DivergeResult; timeMs: number }> {
-  const ctx: SessionContext = yield* Ctx.expect();
-  const findingsText = pool.agents
-    .map((a, i) => `Q: ${questions[i]}\nA: ${(a.findings || '').trim()}`)
-    .join('\n\n');
-
-  const userContent = VERIFY.user
-    .replace('{{findings}}', findingsText)
-    .replace('{{query}}', query);
-
-  const messages = [
-    { role: 'system', content: VERIFY.system },
-    { role: 'user', content: userContent },
-  ];
-  const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages)));
-
-  yield* opts.events.send({ type: 'verify:start', count: opts.verifyCount });
-  const t = performance.now();
-  const result = yield* diverge({
-    prompt,
-    attempts: opts.verifyCount,
-    params: { temperature: 0.7 },
-  });
-  const timeMs = performance.now() - t;
-  const agreement = computeAgreement(result.attempts.map(a => a.output));
-  yield* opts.events.send({ type: 'verify:agreement', result: agreement });
-  yield* opts.events.send({ type: 'verify:done', result, timeMs });
-  return { result, timeMs };
-}
-
-function* evaluate(
-  verifyResult: DivergeResult,
-  opts: WorkflowOpts,
-): Operation<{ converged: boolean | null; tokenCount: number; timeMs: number }> {
-  const ctx: SessionContext = yield* Ctx.expect();
-
-  const responsesText = verifyResult.attempts
-    .map((a, i) => `Response ${i + 1}: ${a.output.trim()}`)
-    .join('\n\n');
-
-  const userContent = EVAL.user.replace('{{responses}}', responsesText);
-
-  const messages = [
-    { role: 'system', content: EVAL.system },
-    { role: 'user', content: userContent },
-  ];
-
-  const evalSchema = {
-    type: 'object',
-    properties: { converged: { type: 'boolean' } },
-    required: ['converged'],
-  };
-  const grammar: string = yield* call(() => ctx.jsonSchemaToGrammar(JSON.stringify(evalSchema)));
-  const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages)));
-
-  const t = performance.now();
-  const result = yield* generate({
-    prompt,
-    grammar,
-    params: { temperature: 0 },
-    parse: (output: string) => {
-      try { return JSON.parse(output).converged as boolean; }
-      catch { return null; }
-    },
-  });
-  const timeMs = performance.now() - t;
-  yield* opts.events.send({ type: 'eval:done', converged: result.parsed as boolean | null, tokenCount: result.tokenCount, timeMs });
-  return { converged: result.parsed as boolean | null, tokenCount: result.tokenCount, timeMs };
-}
-
-function* answer(verifyResult: DivergeResult, opts: WorkflowOpts): Operation<void> {
-  yield* opts.events.send({ type: 'answer', text: verifyResult.bestOutput });
-}
-
-function* promote(verifyResult: DivergeResult, opts: WorkflowOpts): Operation<void> {
-  yield* call(() => opts.session.promote(verifyResult.best));
-}
-
-function* respond(
-  pool: AgentPoolResult,
-  query: string,
-  opts: WorkflowOpts,
-): Operation<{ tokenCount: number; timeMs: number }> {
-  const agentFindings = pool.agents
-    .map((a: { findings: string | null }, i: number) =>
-      a.findings ? `[Agent ${i}] ${a.findings.trim()}` : null)
-    .filter(Boolean)
-    .join('\n\n');
-
-  yield* call(() => opts.session.prefillUser(agentFindings
-    ? `Research findings:\n${agentFindings}\n\nUser question: ${query}\n\nAnswer based on the research findings above.`
-    : query));
-
-  yield* opts.events.send({ type: 'response:start' });
-  const t = performance.now();
-  let tokenCount = 0;
-  const trunk = opts.session.trunk!;
-  for (;;) {
-    const { token, text, isStop } = trunk.produceSync();
-    if (isStop) break;
-    yield* call(() => trunk.commit(token));
-    tokenCount++;
-    yield* opts.events.send({ type: 'response:text', text });
-  }
-  const timeMs = performance.now() - t;
-  yield* opts.events.send({ type: 'response:done' });
-  return { tokenCount, timeMs };
-}
-
-function* summarize(
-  timings: OpTiming[],
-  opts: WorkflowOpts,
-  extra?: { kvLine?: string },
-): Operation<void> {
-  const ctx: SessionContext = yield* Ctx.expect();
-  const p = ctx._storeKvPressure();
-  const ctxTotal = p.nCtx || 1;
-  yield* opts.events.send({
-    type: 'stats', timings,
-    kvLine: extra?.kvLine,
-    ctxPct: Math.round(100 * p.cellsUsed / ctxTotal),
-    ctxPos: p.cellsUsed,
-    ctxTotal,
-  });
-}
-
-// ── Workflow compositions ────────────────────────────────────────
-
-function* coldQuery(query: string, opts: WorkflowOpts): Operation<void> {
-  const t0 = performance.now();
-
-  const p = yield* plan(query, opts);
-  const r = yield* research(p.questions, opts);
-  const v = yield* verify(r.pool, p.questions, query, opts);
-  const e = yield* evaluate(v.result, opts);
-  yield* answer(v.result, opts);
-  yield* promote(v.result, opts);
-
-  const timings: OpTiming[] = [
-    { label: 'Plan', tokens: p.tokenCount, detail: '', timeMs: p.timeMs },
-    {
-      label: 'Research', tokens: r.pool.totalTokens,
-      detail: `(${r.pool.agents.map(a => a.tokenCount).join(' + ')})  ${r.pool.totalToolCalls} tools`,
-      timeMs: r.timeMs,
-    },
-    {
-      label: 'Verify', tokens: v.result.totalTokens,
-      detail: `(${v.result.attempts.map(a => a.tokenCount).join(' + ')})`,
-      timeMs: v.timeMs,
-    },
-    { label: 'Eval', tokens: e.tokenCount, detail: `converged: ${e.converged ? 'yes' : 'no'}`, timeMs: e.timeMs },
-  ];
-
-  const kvSaved = r.sharedPrefixLength * (p.questions.length - 1)
-    + v.result.prefixLength * (v.result.attempts.length - 1);
-  const kvLine = `KV shared    ${r.sharedPrefixLength} \u00d7 ${p.questions.length - 1} + ${v.result.prefixLength} \u00d7 ${v.result.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved`;
-
-  yield* summarize(timings, opts, { kvLine });
-
-  yield* opts.events.send({
-    type: 'complete',
-    data: {
-      planTokens: p.tokenCount,
-      agentTokens: r.pool.totalTokens, researchSteps: r.pool.steps,
-      agentPpl: r.pool.agents.map(a => a.ppl),
-      verifyTokens: v.result.totalTokens, verifySteps: v.result.steps,
-      evalTokens: e.tokenCount, converged: e.converged,
-      totalToolCalls: r.pool.totalToolCalls,
-      prefixTokens: v.result.prefixLength,
-      sharedPrefixTokens: r.sharedPrefixLength,
-      agentCount: p.questions.length, attemptCount: v.result.attempts.length,
-      wallTimeMs: Math.round(performance.now() - t0),
-      planMs: Math.round(p.timeMs), researchMs: Math.round(r.timeMs),
-      verifyMs: Math.round(v.timeMs), evalMs: Math.round(e.timeMs),
-      ...r.pool.counters,
-    },
-  });
-}
-
-function* warmQuery(query: string, opts: WorkflowOpts): Operation<void> {
-  const p = yield* plan(query, opts);
-  const r = yield* warmResearch(p.questions, opts);
-  const resp = yield* respond(r.pool, query, opts);
-
-  const timings: OpTiming[] = [
-    { label: 'Plan', tokens: p.tokenCount, detail: '', timeMs: p.timeMs },
-    {
-      label: 'Research', tokens: r.pool.totalTokens,
-      detail: `(${r.pool.agents.map(a => a.tokenCount).join(' + ')})  ${r.pool.totalToolCalls} tools`,
-      timeMs: r.timeMs,
-    },
-    { label: 'Response', tokens: resp.tokenCount, detail: '', timeMs: resp.timeMs },
-  ];
-
-  yield* summarize(timings, opts);
-}
-
-// ── Entry point ──────────────────────────────────────────────────
-
-export function* handleQuery(query: string, opts: WorkflowOpts): Operation<void> {
-  yield* opts.events.send({ type: 'query', query, warm: !!opts.session.trunk });
-  yield* (opts.session.trunk ? warmQuery : coldQuery)(query, opts);
-}
diff --git a/examples/deep-research/main.ts b/examples/deep-research/main.ts
deleted file mode 100644
index edeaf10..0000000
--- a/examples/deep-research/main.ts
+++ /dev/null
@@ -1,219 +0,0 @@
-#!/usr/bin/env node
-/**
- * Deep Research — CLI entry point
- *
- * Wiring only: setup, TUI subscriber, REPL.
- * Orchestration lives in harness.ts. Presentation lives in tui.ts.
- *
- * Usage:
- *   npx tsx examples/deep-research/main.ts [model-path] --corpus <path> [--query <text>] [options]
- */
-
-import * as fs from "node:fs";
-import * as path from "node:path";
-import * as readline from "node:readline";
-import {
-  main,
-  ensure,
-  createSignal,
-  spawn,
-  each,
-  call,
-  action,
-} from "effection";
-import { createContext } from "../../dist";
-import type { SessionContext } from "../../dist";
-import { initAgents } from "../../dist/agents";
-import { c, log, setJsonlMode, setVerboseMode, fmtSize, createView } from "./tui";
-import type { WorkflowEvent } from "./tui";
-import { loadResources, chunkResources } from "./resources/files";
-import { createReranker } from "./reranker";
-import { createTools } from "./tools";
-import { handleQuery } from "./harness";
-import type { WorkflowOpts } from "./harness";
-
-// ── CLI args ─────────────────────────────────────────────────────
-
-const DEFAULT_MODEL = path.resolve(
-  __dirname,
-  "../../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf",
-);
-const DEFAULT_RERANKER = path.resolve(
-  __dirname,
-  "../../models/qwen3-reranker-0.6b-q4_k_m.gguf",
-);
-
-const args = process.argv.slice(2);
-const jsonlMode = args.includes("--jsonl");
-const verbose = args.includes("--verbose");
-const trace = args.includes("--trace");
-
-function argVal(flag: string): string | null {
-  const i = args.indexOf(flag);
-  return i !== -1 ? args[i + 1] : null;
-}
-const flagIndices = new Set(
-  ["--reranker", "--corpus", "--query"].flatMap((f) => {
-    const i = args.indexOf(f);
-    return i !== -1 ? [i, i + 1] : [];
-  }),
-);
-
-const rerankModelPath = argVal("--reranker") || DEFAULT_RERANKER;
-const corpusDir = argVal("--corpus");
-const initialQuery = argVal("--query");
-const modelPath =
-  args.find((a, i) => !a.startsWith("--") && !flagIndices.has(i)) ||
-  DEFAULT_MODEL;
-
-if (!corpusDir) {
-  process.stdout.write(
-    `Usage: npx tsx examples/deep-research/main.ts [model-path] --corpus <path> [--query <text>] [--reranker <path>]\nMissing: --corpus\n`,
-  );
-  process.exit(1);
-}
-
-if (jsonlMode) setJsonlMode(true);
-if (verbose) setVerboseMode(true);
-if (!verbose && !jsonlMode && !trace) {
-  try {
-    fs.closeSync(2);
-    fs.openSync(process.platform === "win32" ? "\\\\.\\NUL" : "/dev/null", "w");
-  } catch {
-    /* non-fatal */
-  }
-}
-
-const AGENT_COUNT = 3;
-const VERIFY_COUNT = 3;
-const MAX_TOOL_TURNS = 20;
-
-// ── Main ─────────────────────────────────────────────────────────
-
-main(function* () {
-  const resources = loadResources(corpusDir!);
-  const chunks = chunkResources(resources);
-
-  const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, "");
-  const rerankName = path
-    .basename(rerankModelPath)
-    .replace(/-q\w+\.gguf$/i, "");
-
-  log();
-  log(
-    `${c.bold}  Deep Research${c.reset} ${c.dim}\u2014 Structured Concurrency Runtime${c.reset}`,
-  );
-  log();
-  log(
-    `  ${c.green}\u25cf${c.reset} Loading ${c.bold}${modelName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(modelPath).size)}, KV: Q4_0)${c.reset}`,
-  );
-
-  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || "16384", 10);
-  const ctx: SessionContext = yield* call(() =>
-    createContext({
-      modelPath,
-      nCtx,
-      nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) * 2 + 1,
-      typeK: "q4_0",
-      typeV: "q4_0",
-    }),
-  );
-
-  log(
-    `  ${c.green}\u25cf${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(rerankModelPath).size)}, reranker)${c.reset}`,
-  );
-
-  const reranker = yield* call(() =>
-    createReranker(rerankModelPath, { nSeqMax: 8, nCtx: 4096 }),
-  );
-  yield* ensure(() => {
-    reranker.dispose();
-  });
-  yield* call(() => reranker.tokenizeChunks(chunks));
-
-  const corpusIsFile =
-    resources.length === 1 && fs.statSync(corpusDir!).isFile();
-  const corpusLabel = corpusIsFile
-    ? path.basename(corpusDir!)
-    : `${path.basename(corpusDir!)}/ \u2014 ${resources.length} files`;
-  log(
-    `  ${c.dim}  Corpus: ${corpusLabel} \u2192 ${chunks.length} chunks${c.reset}`,
-  );
-
-  const { toolMap, toolsJson } = createTools({ resources, chunks, reranker });
-  const { session, events } = yield* initAgents<WorkflowEvent>(ctx);
-
-  // View subscriber — all presentation lives here
-  const view = createView({
-    model: path.basename(modelPath),
-    reranker: path.basename(rerankModelPath),
-    agentCount: AGENT_COUNT,
-    verifyCount: VERIFY_COUNT,
-    chunkCount: chunks.length,
-  });
-  yield* spawn(function* () {
-    yield* view.subscribe(events);
-  });
-
-  const harnessOpts: WorkflowOpts = {
-    session,
-    toolMap,
-    toolsJson,
-    events,
-    agentCount: AGENT_COUNT,
-    verifyCount: VERIFY_COUNT,
-    maxTurns: MAX_TOOL_TURNS,
-    trace,
-  };
-
-  // Initial query
-  if (initialQuery) {
-    yield* handleQuery(initialQuery, harnessOpts);
-    if (jsonlMode) return;
-  }
-
-  // REPL — Signal bridges readline into Effection scope
-  log(
-    `  ${c.dim}${session.trunk ? "Ask a follow-up question" : "Enter your research question"} or /quit to exit${c.reset}`,
-  );
-  log();
-
-  const inputSignal = createSignal<string, void>();
-  const rl = readline.createInterface({
-    input: process.stdin,
-    output: process.stdout,
-  });
-  rl.setPrompt(`  ${c.dim}>${c.reset} `);
-
-  yield* spawn(function* () {
-    yield* action<void>((resolve) => {
-      rl.on("line", (line: string) => inputSignal.send(line.trim()));
-      rl.on("close", () => {
-        inputSignal.close();
-        resolve();
-      });
-      return () => rl.close();
-    });
-  });
-
-  rl.prompt();
-  for (const input of yield* each(inputSignal)) {
-    if (!input || input === "/quit") break;
-    try {
-      yield* handleQuery(input, harnessOpts);
-    } catch (err) {
-      log(`  ${c.red}Error: ${(err as Error).message}${c.reset}`);
-    }
-    yield* each.next();
-    try {
-      rl.prompt();
-    } catch {
-      break;
-    }
-  }
-}).catch((err: unknown) => {
-  process.stdout.write(
-    `Error: ${(err as Error).message}\n${(err as Error).stack}\n`,
-  );
-  process.exit(1);
-});
diff --git a/examples/deep-research/reranker.ts b/examples/deep-research/reranker.ts
deleted file mode 100644
index 118e17e..0000000
--- a/examples/deep-research/reranker.ts
+++ /dev/null
@@ -1,59 +0,0 @@
-import { Rerank } from "../../dist";
-import type { Chunk } from "./resources/types";
-import type { Reranker, ScoredResult } from "./tools/types";
-
-export async function createReranker(
-  modelPath: string,
-  opts?: { nSeqMax?: number; nCtx?: number },
-): Promise<Reranker> {
-  const rerank = await Rerank.create({ modelPath, ...opts });
-
-  return {
-    score(query: string, chunks: Chunk[]): AsyncIterable<ScoredResult> {
-      const inner = rerank.score(
-        query,
-        chunks.map((c) => c.tokens),
-        10,
-      );
-      return {
-        [Symbol.asyncIterator](): AsyncIterator<ScoredResult> {
-          const it = inner[Symbol.asyncIterator]();
-          return {
-            async next(): Promise<IteratorResult<ScoredResult>> {
-              const { value, done } = await it.next();
-              if (done)
-                return {
-                  value: undefined as unknown as ScoredResult,
-                  done: true,
-                };
-              return {
-                value: {
-                  filled: value.filled,
-                  total: value.total,
-                  results: value.results.map((r) => ({
-                    file: chunks[r.index].resource,
-                    heading: chunks[r.index].heading,
-                    score: r.score,
-                    startLine: chunks[r.index].startLine,
-                    endLine: chunks[r.index].endLine,
-                  })),
-                },
-                done: false,
-              };
-            },
-          };
-        },
-      };
-    },
-
-    async tokenizeChunks(chunks: Chunk[]): Promise<void> {
-      for (const chunk of chunks) {
-        chunk.tokens = await rerank.tokenize(chunk.text);
-      }
-    },
-
-    dispose() {
-      rerank.dispose();
-    },
-  };
-}
diff --git a/examples/deep-research/resources/files.ts b/examples/deep-research/resources/files.ts
deleted file mode 100644
index 4004374..0000000
--- a/examples/deep-research/resources/files.ts
+++ /dev/null
@@ -1,73 +0,0 @@
-import * as fs from 'node:fs';
-import * as path from 'node:path';
-import { loadBinary } from '../../../dist';
-import type { Resource, Chunk } from './types';
-
-interface Section { heading: string; level: number; startLine: number; endLine: number }
-const { parseMarkdown } = loadBinary() as unknown as { parseMarkdown(text: string): Section[] };
-
-export function loadResources(dir: string): Resource[] {
-  if (!fs.existsSync(dir)) {
-    process.stdout.write(`Error: corpus not found: ${dir}\n`);
-    process.exit(1);
-  }
-  const stat = fs.statSync(dir);
-  if (stat.isFile()) {
-    return [{ name: path.basename(dir), content: fs.readFileSync(dir, 'utf8') }];
-  }
-  const files = fs.readdirSync(dir).filter((f) => f.endsWith('.md'));
-  if (!files.length) {
-    process.stdout.write(`Error: no .md files in: ${dir}\n`);
-    process.exit(1);
-  }
-  return files.map((f) => ({
-    name: f,
-    content: fs.readFileSync(path.join(dir, f), 'utf8'),
-  }));
-}
-
-/** Split plain text into chunks on blank-line paragraph boundaries */
-function chunkByParagraph(res: Resource): Chunk[] {
-  const lines = res.content.split('\n');
-  const chunks: Chunk[] = [];
-  let start = 0;
-  for (let i = 0; i <= lines.length; i++) {
-    const blank = i === lines.length || !lines[i].trim();
-    if (blank && i > start) {
-      const text = lines.slice(start, i).join('\n').trim();
-      if (text) {
-        chunks.push({
-          resource: res.name,
-          heading: text.slice(0, 60).replace(/\n/g, ' ') + (text.length > 60 ? '…' : ''),
-          text, tokens: [],
-          startLine: start + 1,
-          endLine: i,
-        });
-      }
-    }
-    if (blank) start = i + 1;
-  }
-  return chunks;
-}
-
-export function chunkResources(resources: Resource[]): Chunk[] {
-  const out: Chunk[] = [];
-  for (const res of resources) {
-    const sections = parseMarkdown(res.content);
-    // Single section covering the whole file = no headings found → paragraph split
-    if (sections.length <= 1 && res.content.split('\n').length > 10) {
-      out.push(...chunkByParagraph(res));
-      continue;
-    }
-    const lines = res.content.split('\n');
-    for (const sec of sections) {
-      const text = lines.slice(sec.startLine - 1, sec.endLine).join('\n').trim();
-      if (!text) continue;
-      out.push({
-        resource: res.name, heading: sec.heading || res.name, text, tokens: [],
-        startLine: sec.startLine, endLine: sec.endLine,
-      });
-    }
-  }
-  return out;
-}
diff --git a/examples/deep-research/resources/types.ts b/examples/deep-research/resources/types.ts
deleted file mode 100644
index 17242b1..0000000
--- a/examples/deep-research/resources/types.ts
+++ /dev/null
@@ -1,10 +0,0 @@
-export interface Resource { name: string; content: string }
-
-export interface Chunk {
-  resource: string;
-  heading: string;
-  text: string;
-  tokens: number[];
-  startLine: number;
-  endLine: number;
-}
diff --git a/examples/deep-research/tasks/eval.md b/examples/deep-research/tasks/eval.md
deleted file mode 100644
index d555374..0000000
--- a/examples/deep-research/tasks/eval.md
+++ /dev/null
@@ -1,5 +0,0 @@
-You are a consistency checker. Compare the responses and determine if they convey the same core meaning. Output JSON only.
----
-Do these responses agree on the key points?
-
-{{responses}}
diff --git a/examples/deep-research/tasks/plan.md b/examples/deep-research/tasks/plan.md
deleted file mode 100644
index 05bba9a..0000000
--- a/examples/deep-research/tasks/plan.md
+++ /dev/null
@@ -1,3 +0,0 @@
-You break research queries into sub-questions. Output JSON only.
----
-Break this into {{count}} independent sub-questions for parallel research: "{{query}}"
diff --git a/examples/deep-research/tasks/report.md b/examples/deep-research/tasks/report.md
deleted file mode 100644
index 189a41b..0000000
--- a/examples/deep-research/tasks/report.md
+++ /dev/null
@@ -1,3 +0,0 @@
-You are a research reporter. Call the report tool with a concise summary (under 200 words) of the key findings from the research above. Focus on the most important discoveries and conclusions.
----
-Report your findings.
diff --git a/examples/deep-research/tasks/research.md b/examples/deep-research/tasks/research.md
deleted file mode 100644
index 60b25c2..0000000
--- a/examples/deep-research/tasks/research.md
+++ /dev/null
@@ -1,12 +0,0 @@
-You are a research assistant analyzing a knowledge base. Your tools:
-- **grep**: regex pattern matching — use for precise, exhaustive retrieval
-- **search**: semantic relevance ranking — use to discover related content
-- **read_file**: read specific line ranges — use to verify and get context
-- **report**: submit your final findings with evidence
-
-Process — follow every step in order:
-1. Grep with short, simple patterns first. Use single keywords or two-word phrases — never combine multiple clauses with `.*`. Run multiple greps if needed.
-2. Use search to discover content that grep may miss (different phrasing, synonyms).
-3. Read every matching line with read_file to verify in context. Do not rely on grep/search summaries alone.
-4. Grep again with a different pattern targeting what you have NOT yet found. This is a completeness check, not confirmation of existing results.
-5. Report with line numbers and direct quotes as evidence. State what you found and what you checked.
diff --git a/examples/deep-research/tasks/verify.md b/examples/deep-research/tasks/verify.md
deleted file mode 100644
index 0713358..0000000
--- a/examples/deep-research/tasks/verify.md
+++ /dev/null
@@ -1,7 +0,0 @@
-Synthesize the research findings into a coherent, concise summary.
----
-Research findings:
-
-{{findings}}
-
-Synthesize these into a brief summary answering: "{{query}}"
diff --git a/examples/deep-research/tools/grep.ts b/examples/deep-research/tools/grep.ts
deleted file mode 100644
index bc3ae5f..0000000
--- a/examples/deep-research/tools/grep.ts
+++ /dev/null
@@ -1,67 +0,0 @@
-import { Tool } from '../../../dist/agents';
-import type { JsonSchema } from '../../../dist/agents';
-import type { Resource } from '../resources/types';
-
-export class GrepTool extends Tool<{ pattern: string; ignoreCase?: boolean }> {
-  readonly name = 'grep';
-  readonly description = 'Search the entire corpus for a regex pattern. Returns every matching line with line numbers and total match count. Complements search() which ranks by relevance — grep scans exhaustively.';
-  readonly parameters: JsonSchema = {
-    type: 'object',
-    properties: {
-      pattern: { type: 'string', description: 'Regex pattern (e.g. "\\bshor\\b" for whole-word, "hidden_secret" for literal)' },
-      ignoreCase: { type: 'boolean', description: 'Case-insensitive matching (default: true)' },
-    },
-    required: ['pattern'],
-  };
-
-  private _resources: Resource[];
-
-  constructor(resources: Resource[]) {
-    super();
-    this._resources = resources;
-  }
-
-  async execute(args: { pattern: string; ignoreCase?: boolean }): Promise<unknown> {
-    const pattern = args.pattern?.trim();
-    if (!pattern) return { error: 'pattern must not be empty' };
-    const flags = (args.ignoreCase === false) ? 'g' : 'gi';
-    let re: RegExp;
-    try { re = new RegExp(pattern, flags); }
-    catch { return { error: `Invalid regex: ${pattern}` }; }
-
-    const matches: { file: string; line: number; text: string }[] = [];
-    let totalMatches = 0;
-
-    for (const res of this._resources) {
-      const lines = res.content.split('\n');
-      for (let i = 0; i < lines.length; i++) {
-        const hits = lines[i].match(re);
-        if (hits) {
-          totalMatches += hits.length;
-          const raw = lines[i].trim();
-          let text: string;
-          if (raw.length <= 200) {
-            text = raw;
-          } else {
-            const idx = raw.search(re);
-            const start = Math.max(0, idx - 40);
-            const end = Math.min(raw.length, start + 200);
-            text = (start > 0 ? '\u2026' : '') + raw.slice(start, end) + (end < raw.length ? '\u2026' : '');
-          }
-          matches.push({ file: res.name, line: i + 1, text });
-        }
-      }
-    }
-
-    if (totalMatches === 0) {
-      return {
-        totalMatches: 0, matchingLines: 0, matches: [],
-        note: 'Zero matches does NOT mean the topic is absent \u2014 only that this exact pattern was not found. Try search() for semantic matching or a broader/simpler regex.',
-      };
-    }
-
-    const limit = 50;
-    const truncated = matches.length > limit;
-    return { totalMatches, matchingLines: matches.length, truncated, matches: matches.slice(0, limit) };
-  }
-}
diff --git a/examples/deep-research/tools/index.ts b/examples/deep-research/tools/index.ts
deleted file mode 100644
index 2145f44..0000000
--- a/examples/deep-research/tools/index.ts
+++ /dev/null
@@ -1,23 +0,0 @@
-import { createToolkit } from '../../../dist/agents';
-import type { Toolkit } from '../../../dist/agents';
-import type { Resource, Chunk } from '../resources/types';
-import type { Reranker } from './types';
-import { SearchTool } from './search';
-import { ReadFileTool } from './read-file';
-import { GrepTool } from './grep';
-import { ReportTool } from './report';
-
-export const reportTool = new ReportTool();
-
-export function createTools(opts: {
-  resources: Resource[];
-  chunks: Chunk[];
-  reranker: Reranker;
-}): Toolkit {
-  return createToolkit([
-    new SearchTool(opts.chunks, opts.reranker),
-    new ReadFileTool(opts.resources),
-    new GrepTool(opts.resources),
-    reportTool,
-  ]);
-}
diff --git a/examples/deep-research/tools/read-file.ts b/examples/deep-research/tools/read-file.ts
deleted file mode 100644
index 164a5c5..0000000
--- a/examples/deep-research/tools/read-file.ts
+++ /dev/null
@@ -1,41 +0,0 @@
-import { Tool } from '../../../dist/agents';
-import type { JsonSchema } from '../../../dist/agents';
-import type { Resource } from '../resources/types';
-
-export class ReadFileTool extends Tool<{ filename: string; startLine?: number; endLine?: number }> {
-  readonly name = 'read_file';
-  readonly description = 'Read content from a file at specific line ranges. Use startLine/endLine from search results.';
-  readonly parameters: JsonSchema;
-
-  private _resources: Resource[];
-
-  constructor(resources: Resource[]) {
-    super();
-    this._resources = resources;
-    this.parameters = {
-      type: 'object',
-      properties: {
-        filename: {
-          type: 'string',
-          description: 'Filename from search results',
-          enum: resources.map(r => r.name),
-        },
-        startLine: { type: 'number', description: 'Start line (1-indexed, from search results)' },
-        endLine: { type: 'number', description: 'End line (1-indexed, from search results)' },
-      },
-      required: ['filename'],
-    };
-  }
-
-  async execute(args: { filename: string; startLine?: number; endLine?: number } & Record<string, unknown>): Promise<unknown> {
-    const filename = args.filename || (args.path as string) || '';
-    const file = this._resources.find(r => r.name === filename);
-    if (!file) {
-      return { error: `File not found: ${filename}. Available: ${this._resources.map(r => r.name).join(', ')}` };
-    }
-    const lines = file.content.split('\n');
-    const s = Math.max(0, (args.startLine ?? 1) - 1);
-    const e = Math.min(lines.length, args.endLine ?? Math.min(100, lines.length));
-    return { file: file.name, content: lines.slice(s, e).join('\n') };
-  }
-}
diff --git a/examples/deep-research/tools/report.ts b/examples/deep-research/tools/report.ts
deleted file mode 100644
index 97f061a..0000000
--- a/examples/deep-research/tools/report.ts
+++ /dev/null
@@ -1,14 +0,0 @@
-import { Tool } from '../../../dist/agents';
-import type { JsonSchema } from '../../../dist/agents';
-
-export class ReportTool extends Tool<{ findings: string }> {
-  readonly name = 'report';
-  readonly description = 'Submit your final research findings. Call this when you have gathered enough information to answer the question.';
-  readonly parameters: JsonSchema = {
-    type: 'object',
-    properties: { findings: { type: 'string', description: 'Your research findings and answer' } },
-    required: ['findings'],
-  };
-
-  async execute(): Promise<unknown> { return {}; }
-}
diff --git a/examples/deep-research/tools/search.ts b/examples/deep-research/tools/search.ts
deleted file mode 100644
index 034bc55..0000000
--- a/examples/deep-research/tools/search.ts
+++ /dev/null
@@ -1,34 +0,0 @@
-import { Tool } from '../../../dist/agents';
-import type { JsonSchema, ToolContext } from '../../../dist/agents';
-import type { Chunk } from '../resources/types';
-import type { Reranker } from './types';
-
-export class SearchTool extends Tool<{ query: string }> {
-  readonly name = 'search';
-  readonly description = 'Search the knowledge base. Returns sections ranked by relevance with line ranges for read_file.';
-  readonly parameters: JsonSchema = {
-    type: 'object',
-    properties: { query: { type: 'string', description: 'Search query' } },
-    required: ['query'],
-  };
-
-  private _chunks: Chunk[];
-  private _reranker: Reranker;
-
-  constructor(chunks: Chunk[], reranker: Reranker) {
-    super();
-    this._chunks = chunks;
-    this._reranker = reranker;
-  }
-
-  async execute(args: { query: string }, context?: ToolContext): Promise<unknown> {
-    const query = args.query?.trim();
-    if (!query) return { error: 'query must not be empty' };
-    let last;
-    for await (const { results, filled, total } of this._reranker.score(query, this._chunks)) {
-      if (context?.onProgress) context.onProgress({ filled, total });
-      last = results;
-    }
-    return last;
-  }
-}
diff --git a/examples/deep-research/tools/types.ts b/examples/deep-research/tools/types.ts
deleted file mode 100644
index 3f0012a..0000000
--- a/examples/deep-research/tools/types.ts
+++ /dev/null
@@ -1,21 +0,0 @@
-import type { Chunk } from '../resources/types';
-
-export interface ScoredChunk {
-  file: string;
-  heading: string;
-  score: number;
-  startLine: number;
-  endLine: number;
-}
-
-export interface ScoredResult {
-  results: ScoredChunk[];
-  filled: number;
-  total: number;
-}
-
-export interface Reranker {
-  score(query: string, chunks: Chunk[]): AsyncIterable<ScoredResult>;
-  tokenizeChunks(chunks: Chunk[]): Promise<void>;
-  dispose(): void;
-}
diff --git a/examples/deep-research/tui.ts b/examples/deep-research/tui.ts
deleted file mode 100644
index f720095..0000000
--- a/examples/deep-research/tui.ts
+++ /dev/null
@@ -1,500 +0,0 @@
-import * as fs from 'node:fs';
-import { each } from 'effection';
-import type { Channel, Operation } from 'effection';
-import type { AgentEvent, AgentPoolResult, DivergeResult } from '../../dist/agents';
-import type { AgreementResult } from './agreement';
-
-// ── Event types ──────────────────────────────────────────────────
-
-export interface OpTiming {
-  label: string;
-  tokens: number;
-  detail: string;
-  timeMs: number;
-}
-
-export type StepEvent =
-  | { type: 'query'; query: string; warm: boolean }
-  | { type: 'plan'; questions: string[]; tokenCount: number; timeMs: number }
-  | { type: 'research:start'; agentCount: number }
-  | { type: 'research:done'; pool: AgentPoolResult; timeMs: number }
-  | { type: 'verify:start'; count: number }
-  | { type: 'verify:done'; result: DivergeResult; timeMs: number }
-  | { type: 'verify:agreement'; result: AgreementResult }
-  | { type: 'eval:done'; converged: boolean | null; tokenCount: number; timeMs: number }
-  | { type: 'answer'; text: string }
-  | { type: 'response:start' }
-  | { type: 'response:text'; text: string }
-  | { type: 'response:done' }
-  | { type: 'stats'; timings: OpTiming[]; kvLine?: string; ctxPct: number; ctxPos: number; ctxTotal: number }
-  | { type: 'complete'; data: Record<string, unknown> };
-
-export type WorkflowEvent = AgentEvent | StepEvent;
-
-// ── Mode + color ─────────────────────────────────────────────────
-
-let _jsonlMode = false;
-let _verboseMode = false;
-
-export function setJsonlMode(on: boolean): void { _jsonlMode = on; }
-export function setVerboseMode(on: boolean): void { _verboseMode = on; }
-
-const isTTY = process.stdout.isTTY;
-
-export const c = isTTY ? {
-  bold: '\x1b[1m', dim: '\x1b[2m', reset: '\x1b[0m',
-  green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m',
-} : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' };
-
-// ── Primitives ───────────────────────────────────────────────────
-
-let _statusText = '';
-
-function status(text: string): void {
-  if (_jsonlMode || !isTTY) return;
-  _statusText = text;
-  process.stdout.write('\r\x1b[K' + text);
-}
-
-function statusClear(): void {
-  if (!_statusText) return;
-  _statusText = '';
-  process.stdout.write('\r\x1b[K');
-}
-
-export const log = (...a: unknown[]): void => {
-  if (_jsonlMode) return;
-  statusClear();
-  console.log(...a);
-};
-
-function emit(event: string, data: Record<string, unknown>): void {
-  if (_jsonlMode) console.log(JSON.stringify({ event, ...data }));
-}
-
-export const fmtSize = (bytes: number): string => bytes > 1e9
-  ? (bytes / 1e9).toFixed(1) + ' GB'
-  : (bytes / 1e6).toFixed(0) + ' MB';
-
-const pad = (s: unknown, n: number): string => String(s).padStart(n);
-
-// ── View state + handler type ────────────────────────────────────
-
-interface ViewState {
-  agentLabel: Map<number, string>;
-  nextLabel: number;
-  agentText: Map<number, string>;
-  agentStatus: Map<number, { state: string; tokenCount: number; detail: string }>;
-  agentParent: Map<number, number>;  // childId → parentId (sub-agent tracking)
-  traceQuery: string;
-}
-
-type ViewHandler = (ev: WorkflowEvent) => void;
-
-function isSubAgent(state: ViewState, agentId: number): boolean {
-  return state.agentParent.has(agentId);
-}
-
-function parentLabel(state: ViewState, agentId: number): string {
-  return label(state, state.agentParent.get(agentId)!);
-}
-
-function label(state: ViewState, agentId: number): string {
-  let l = state.agentLabel.get(agentId);
-  if (!l) { l = `A${state.nextLabel++}`; state.agentLabel.set(agentId, l); }
-  return l;
-}
-
-function resetLabels(state: ViewState): void {
-  state.nextLabel = 0;
-  state.agentLabel.clear();
-  state.agentStatus.clear();
-  state.agentText.clear();
-  state.agentParent.clear();
-}
-
-function renderStatus(state: ViewState): void {
-  const active = [...state.agentStatus.entries()]
-    .filter(([id, s]) => s.state !== 'done' && !isSubAgent(state, id));
-  if (active.length === 0) return;
-
-  const generating = active.filter(([, s]) => s.state === 'gen');
-  if (generating.length === 1 && active.length === 1) {
-    const [id] = generating[0];
-    const raw = (state.agentText.get(id) ?? '').replace(/\n/g, ' ').trimStart();
-    const cols = process.stdout.columns || 80;
-    const maxLen = cols - 12;
-    const text = raw.length > maxLen ? raw.slice(raw.length - maxLen) : raw;
-    status(`    ${c.dim}\u25c6${c.reset} ${c.yellow}${label(state, id)}${c.reset} ${text}`);
-    return;
-  }
-
-  const parts = active.map(([id, s]) => {
-    const lbl = `${c.yellow}${label(state, id)}${c.reset}`;
-    if (s.state === 'gen') return `${lbl}: ${s.tokenCount} tok`;
-    const detail = s.detail ? ` ${s.detail}` : '';
-    return `${lbl}: ${c.cyan}${s.state}${c.reset}${detail}`;
-  });
-  status(`    ${c.dim}\u25c6${c.reset} ${parts.join('  ')}`);
-}
-
-// ── View handlers ────────────────────────────────────────────────
-
-function queryHandler(state: ViewState, opts: ViewOpts): ViewHandler {
-  return (ev) => {
-    if (ev.type !== 'query') return;
-    state.traceQuery = ev.query;
-    if (!ev.warm) {
-      emit('start', {
-        model: opts.model, reranker: opts.reranker, query: ev.query,
-        agentCount: opts.agentCount, verifyCount: opts.verifyCount, chunks: opts.chunkCount,
-      });
-      log();
-      log(`  ${c.dim}Query${c.reset}`);
-      log(`  ${c.bold}${ev.query}${c.reset}`);
-    }
-  };
-}
-
-function planHandler(): ViewHandler {
-  return (ev) => {
-    if (ev.type !== 'plan') return;
-    emit('plan', { questions: ev.questions, planTokens: ev.tokenCount });
-    log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
-    ev.questions.forEach((q: string, i: number) => log(`    ${c.dim}${i + 1}.${c.reset} ${q}`));
-  };
-}
-
-function agentHandler(state: ViewState): ViewHandler {
-  return (ev) => {
-    switch (ev.type) {
-      case 'agent:spawn': {
-        // If parent is a known labeled agent, this is a sub-agent
-        if (state.agentLabel.has(ev.parentAgentId)) {
-          state.agentParent.set(ev.agentId, ev.parentAgentId);
-        }
-        break;
-      }
-      case 'agent:produce': {
-        const sub = isSubAgent(state, ev.agentId);
-        state.agentText.set(ev.agentId, (state.agentText.get(ev.agentId) ?? '') + ev.text);
-        state.agentStatus.set(ev.agentId, { state: 'gen', tokenCount: ev.tokenCount, detail: '' });
-        if (sub) break;  // sub-agents: skip verbose/status output
-        if (_verboseMode) {
-          const lbl = label(state, ev.agentId);
-          if (ev.tokenCount === 1) {
-            statusClear();
-            process.stdout.write(`\n    ${c.dim}───${c.reset} ${c.yellow}${lbl}${c.reset} ${c.dim}tokens${c.reset} ${c.dim}───${c.reset}\n    `);
-          }
-          process.stdout.write(ev.text);
-        } else {
-          renderStatus(state);
-        }
-        break;
-      }
-      case 'agent:tool_call': {
-        const sub = isSubAgent(state, ev.agentId);
-        if (_verboseMode && !sub) process.stdout.write('\n');
-        state.agentText.delete(ev.agentId);
-        state.agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: '' });
-        emit('tool_call', { agentId: ev.agentId, toolName: ev.tool, arguments: ev.args });
-        let toolArgs: Record<string, string>;
-        try { toolArgs = JSON.parse(ev.args); } catch { toolArgs = {}; }
-        const argSummary = ev.tool === 'search'
-          ? `"${toolArgs.query || ''}"`
-          : ev.tool === 'grep'
-          ? `/${toolArgs.pattern || ''}/`
-          : ev.tool === 'report' ? ''
-          : `${toolArgs.filename}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : '');
-        if (sub) {
-          const plbl = `${c.yellow}${parentLabel(state, ev.agentId)}${c.reset}`;
-          log(`    ${c.dim}\u2502${c.reset}  ${c.dim}\u2514${c.reset} ${plbl} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
-        } else {
-          log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`);
-        }
-        break;
-      }
-      case 'agent:tool_result': {
-        emit('tool_result', {
-          agentId: ev.agentId, toolName: ev.tool,
-          result: ev.result.length > 200 ? ev.result.slice(0, 200) + '...' : ev.result,
-        });
-        let preview = '';
-        if (ev.tool === 'read_file') {
-          try {
-            const firstLine = (JSON.parse(ev.result) as { content: string }).content.split('\n').find((l: string) => l.trim());
-            if (firstLine) preview = ` \u00b7 ${firstLine.trim().slice(0, 60)}${firstLine.trim().length > 60 ? '\u2026' : ''}`;
-          } catch { /* non-fatal */ }
-        } else if (ev.tool === 'search') {
-          try {
-            const top = (JSON.parse(ev.result) as { heading: string }[])[0];
-            if (top?.heading) preview = ` \u00b7 ${top.heading}`;
-          } catch { /* non-fatal */ }
-        } else if (ev.tool === 'grep') {
-          try {
-            const r = JSON.parse(ev.result) as { totalMatches: number; matchingLines: number };
-            preview = ` \u00b7 ${r.totalMatches} matches in ${r.matchingLines} lines`;
-          } catch { /* non-fatal */ }
-        }
-        if (isSubAgent(state, ev.agentId)) {
-          const plbl = `${c.yellow}${parentLabel(state, ev.agentId)}${c.reset}`;
-          log(`    ${c.dim}\u2502${c.reset}  ${c.dim}\u2514${c.reset} ${plbl} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`);
-        } else {
-          log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`);
-        }
-        break;
-      }
-      case 'agent:tool_progress': {
-        state.agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: `${ev.filled}/${ev.total}` });
-        renderStatus(state);
-        break;
-      }
-      case 'agent:report': {
-        state.agentStatus.set(ev.agentId, { state: 'done', tokenCount: 0, detail: '' });
-        const sub = isSubAgent(state, ev.agentId);
-        const cols = process.stdout.columns || 80;
-        const displayLabel = sub ? parentLabel(state, ev.agentId) : label(state, ev.agentId);
-        const lbl = `${c.yellow}${displayLabel}${c.reset}`;
-        const indent = sub ? `    ${c.dim}\u2502${c.reset}  ` : '    ';
-        const prefix = `${indent}${c.dim}\u2502${c.reset}   `;
-        const wrap = cols - (sub ? 11 : 8);
-
-        log(`${indent}${c.dim}\u2502${c.reset}`);
-        log(`${indent}${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`);
-
-        for (const para of ev.findings.split('\n')) {
-          if (!para.trim()) { log(prefix); continue; }
-          const words = para.split(/\s+/);
-          let line = '';
-          for (const word of words) {
-            if (line && line.length + 1 + word.length > wrap) {
-              log(`${prefix}${c.dim}${line}${c.reset}`);
-              line = word;
-            } else {
-              line = line ? `${line} ${word}` : word;
-            }
-          }
-          if (line) log(`${prefix}${c.dim}${line}${c.reset}`);
-        }
-        log(`${indent}${c.dim}\u2502${c.reset}`);
-        break;
-      }
-      case 'agent:done':
-        if (_verboseMode && !isSubAgent(state, ev.agentId)) process.stdout.write('\n');
-        break;
-    }
-  };
-}
-
-function researchSummaryHandler(state: ViewState): ViewHandler {
-  function flushTrace(pool: AgentPoolResult): void {
-    if (!pool.agents.some(a => a.trace?.length)) return;
-    const filename = `trace-${Date.now()}.json`;
-    fs.writeFileSync(filename, JSON.stringify({
-      query: state.traceQuery,
-      timestamp: new Date().toISOString(),
-      agents: pool.agents.map(a => ({
-        agentId: a.agentId, label: label(state, a.agentId),
-        ppl: a.ppl, samplingPpl: a.samplingPpl,
-        tokenCount: a.tokenCount, toolCallCount: a.toolCallCount,
-        findings: a.findings, trace: a.trace ?? [],
-      })),
-    }, null, 2));
-    log(`  ${c.dim}Trace written to ${filename}${c.reset}`);
-  }
-
-  return (ev) => {
-    switch (ev.type) {
-      case 'research:start': {
-        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Research${c.reset} ${c.dim}${ev.agentCount} agents${c.reset}`);
-        resetLabels(state);
-        break;
-      }
-      case 'research:done': {
-        statusClear();
-        ev.pool.agents.forEach((a, i) => {
-          const tree = i === ev.pool.agents.length - 1 ? '\u2514' : '\u251c';
-          emit('agent_done', {
-            index: i, findings: (a.findings || '').slice(0, 500),
-            toolCalls: a.toolCallCount, tokenCount: a.tokenCount,
-            ppl: a.ppl, samplingPpl: a.samplingPpl,
-          });
-          const raw = (state.agentText.get(a.agentId) ?? '').replace(/\n/g, ' ').trim();
-          if (raw) log(`    ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, a.agentId)}${c.reset} ${c.dim}\u25b8 ${raw.slice(0, 120)}${raw.length > 120 ? '\u2026' : ''}${c.reset}`);
-          const pplStr = Number.isFinite(a.ppl) ? ` \u00b7 ppl ${a.ppl.toFixed(2)}` : '';
-          log(`    ${c.dim}${tree}${c.reset} ${c.yellow}${label(state, a.agentId)}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok \u00b7 ${a.toolCallCount} tools${pplStr}${c.reset}`);
-        });
-        log(`    ${c.dim}${ev.pool.totalTokens} tok \u00b7 ${ev.pool.totalToolCalls} tools \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
-        flushTrace(ev.pool);
-        break;
-      }
-    }
-  };
-}
-
-function verifyHandler(): ViewHandler {
-  let pendingAgreement: AgreementResult | null = null;
-
-  return (ev) => {
-    switch (ev.type) {
-      case 'verify:start': {
-        log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${ev.count} attempts${c.reset}`);
-        pendingAgreement = null;
-        break;
-      }
-      case 'verify:agreement': {
-        pendingAgreement = ev.result;
-        emit('verify_agreement', {
-          overall: ev.result.overall,
-          sections: ev.result.sections.map(s => ({ label: s.label, score: s.score })),
-        });
-        break;
-      }
-      case 'verify:done': {
-        ev.result.attempts.forEach((a, i) => {
-          const tree = i === ev.result.attempts.length - 1
-            ? (pendingAgreement ? '\u251c' : '\u2514')
-            : '\u251c';
-          emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl });
-          log(`    ${c.dim}${tree} ${a.tokenCount} tok \u00b7 ppl ${a.ppl.toFixed(2)}${c.reset}`);
-        });
-        if (pendingAgreement && pendingAgreement.sections.length > 0) {
-          const pct = Math.round(pendingAgreement.overall * 100);
-          log(`    ${c.dim}\u251c${c.reset} Agreement: ${c.bold}${pct}%${c.reset}`);
-          const sorted = [...pendingAgreement.sections].sort((a, b) => b.score - a.score);
-          const show = sorted.slice(0, 5);
-          const maxLabelLen = Math.max(...show.map(s => s.label.length));
-          show.forEach((s, i) => {
-            const tree = i === show.length - 1 && sorted.length <= 5 ? '\u2514' : '\u251c';
-            const filled = Math.round(s.score * 10);
-            const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(10 - filled);
-            const sPct = pad(Math.round(s.score * 100), 3);
-            const label = `"${s.label}"`.padEnd(maxLabelLen + 2);
-            log(`    ${c.dim}${tree}${c.reset} ${c.dim}${label}${c.reset} ${sPct}%  ${bar}`);
-          });
-          if (sorted.length > 5) {
-            log(`    ${c.dim}\u2514 \u2026 ${sorted.length - 5} more${c.reset}`);
-          }
-        }
-        log(`    ${c.dim}${ev.result.totalTokens} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
-        pendingAgreement = null;
-        break;
-      }
-    }
-  };
-}
-
-function evalHandler(): ViewHandler {
-  return (ev) => {
-    if (ev.type !== 'eval:done') return;
-    emit('convergence', { converged: ev.converged, evalTokens: ev.tokenCount });
-    const verdict = ev.converged === true ? `${c.green}yes${c.reset}`
-      : ev.converged === false ? `${c.red}no${c.reset}`
-      : `${c.yellow}unknown${c.reset}`;
-    log(`\n  ${c.green}\u25cf${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`);
-    log(`    Converged: ${verdict}`);
-  };
-}
-
-function answerHandler(): ViewHandler {
-  return (ev) => {
-    if (ev.type !== 'answer') return;
-    log(`\n  ${c.dim}${'\u2500'.repeat(58)}${c.reset}\n`);
-    const prose = ev.text.trim()
-      .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`)
-      .split('\n').map((l: string) => `  ${l}`).join('\n');
-    log(prose);
-  };
-}
-
-function responseHandler(): ViewHandler {
-  return (ev) => {
-    switch (ev.type) {
-      case 'response:start':
-        process.stdout.write(`  ${c.dim}<${c.reset} `);
-        break;
-      case 'response:text':
-        process.stdout.write(ev.text);
-        break;
-      case 'response:done':
-        console.log('\n');
-        break;
-    }
-  };
-}
-
-function statsHandler(): ViewHandler {
-  return (ev) => {
-    if (ev.type !== 'stats') return;
-    const { timings, kvLine, ctxPct, ctxPos, ctxTotal } = ev;
-    const totalTokens = timings.reduce((s, p) => s + p.tokens, 0);
-    const totalMs = timings.reduce((s, p) => s + p.timeMs, 0);
-
-    log(`\n  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
-    for (const p of timings) {
-      const left = `${p.label.padEnd(10)} ${pad(p.tokens, 5)} tok`;
-      const detail = p.detail ? `  ${p.detail}` : '';
-      const right = p.timeMs > 0 ? `${pad((p.timeMs / 1000).toFixed(1), 6)}s` : '';
-      log(`  ${c.dim}${left}${detail}${' '.repeat(Math.max(1, 58 - left.length - detail.length - right.length))}${right}${c.reset}`);
-    }
-    log(`  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
-    log(`  ${c.bold}Total${c.reset}      ${c.bold}${pad(totalTokens, 5)}${c.reset} tok         ${c.bold}${pad((totalMs / 1000).toFixed(1), 6)}s${c.reset}`);
-    if (kvLine) log(`  ${c.dim}${kvLine}${c.reset}`);
-    if (ctxPct != null && ctxPos != null && ctxTotal != null) {
-      const ctxStr = `ctx: ${ctxPct}% (${ctxPos.toLocaleString()}/${ctxTotal.toLocaleString()})`;
-      log(`  ${c.dim}${'\u2501'.repeat(58)}${c.reset}`);
-      log(`  ${c.dim}${' '.repeat(58 - ctxStr.length)}${ctxStr}${c.reset}`);
-    }
-    log();
-  };
-}
-
-function completeHandler(): ViewHandler {
-  return (ev) => {
-    if (ev.type !== 'complete') return;
-    emit('complete', ev.data);
-  };
-}
-
-// ── createView — composable view factory ─────────────────────────
-
-export interface ViewOpts {
-  model: string;
-  reranker: string;
-  agentCount: number;
-  verifyCount: number;
-  chunkCount: number;
-}
-
-export function createView(opts: ViewOpts) {
-  const state: ViewState = {
-    agentLabel: new Map(),
-    nextLabel: 0,
-    agentText: new Map(),
-    agentStatus: new Map(),
-    agentParent: new Map(),
-    traceQuery: '',
-  };
-
-  const handlers: ViewHandler[] = [
-    queryHandler(state, opts),
-    planHandler(),
-    agentHandler(state),
-    researchSummaryHandler(state),
-    verifyHandler(),
-    evalHandler(),
-    answerHandler(),
-    responseHandler(),
-    statsHandler(),
-    completeHandler(),
-  ];
-
-  return {
-    *subscribe(events: Channel<WorkflowEvent, void>): Operation<void> {
-      for (const ev of yield* each(events)) {
-        for (const h of handlers) h(ev);
-        yield* each.next();
-      }
-    },
-  };
-}
diff --git a/package-lock.json b/package-lock.json
index e618748..cdf46c5 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,8 +9,9 @@
       "version": "1.6.0",
       "license": "Apache-2.0",
       "dependencies": {
+        "@lloyal-labs/lloyal-agents": "^1.0.0",
+        "@lloyal-labs/sdk": "^1.0.0",
         "@lloyal-labs/tsampler": "^0.2.0",
-        "effection": "^4.0.2",
         "node-addon-api": "^8.5.0"
       },
       "devDependencies": {
@@ -528,6 +529,16 @@
         "node": ">=18.0.0"
       }
     },
+    "node_modules/@lloyal-labs/lloyal-agents": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal-agents/-/lloyal-agents-1.0.0.tgz",
+      "integrity": "sha512-wBoUH8xxhV+qvfvlfFvqMETuOggVA8o3qIfN2c9ffyu22+lWSUKESto++2OqzROYlig7YQBPop7+ar+o4yjN/w==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@lloyal-labs/sdk": "*",
+        "effection": "^4.0.2"
+      }
+    },
     "node_modules/@lloyal-labs/lloyal.node-darwin-arm64": {
       "version": "1.6.0",
       "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-arm64/-/lloyal.node-darwin-arm64-1.6.0.tgz",
@@ -697,6 +708,12 @@
         "win32"
       ]
     },
+    "node_modules/@lloyal-labs/sdk": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/sdk/-/sdk-1.0.0.tgz",
+      "integrity": "sha512-Xu0LeSAgc+V4jeOXbtl+cgBHZE1LX31q0k0eQWJ8FX3LECYKIYv97SW9dkYNsxoOC8BJWQYPAtOyJbScgE6rzw==",
+      "license": "Apache-2.0"
+    },
     "node_modules/@lloyal-labs/tsampler": {
       "version": "0.2.0",
       "resolved": "https://registry.npmjs.org/@lloyal-labs/tsampler/-/tsampler-0.2.0.tgz",
diff --git a/package.json b/package.json
index 8cd4138..0d86e96 100644
--- a/package.json
+++ b/package.json
@@ -21,7 +21,6 @@
     "docs": "npx typedoc",
     "test": "npm run test:integration",
     "test:integration": "npx tsx test/integration.ts",
-    "test:agents": "npx tsx test/agents.ts",
     "test:examples": "npx tsx test/examples.ts",
     "sync:llama-cpp": "node scripts/sync-llama-cpp.js",
     "example": "npx tsx examples/chat/chat.ts"
@@ -47,8 +46,9 @@
   },
   "homepage": "https://github.com/lloyal-ai/lloyal.node#readme",
   "dependencies": {
+    "@lloyal-labs/lloyal-agents": "^1.0.0",
+    "@lloyal-labs/sdk": "^1.0.0",
     "@lloyal-labs/tsampler": "^0.2.0",
-    "effection": "^4.0.2",
     "node-addon-api": "^8.5.0"
   },
   "devDependencies": {
diff --git a/src/Branch.ts b/src/Branch.ts
deleted file mode 100644
index e44300e..0000000
--- a/src/Branch.ts
+++ /dev/null
@@ -1,650 +0,0 @@
-import type { SessionContext, SamplingParams, Produced, GrammarTrigger } from './types';
-import { GrammarTriggerType } from './types';
-
-/**
- * Forkable inference handle for covalent generation
- *
- * A Branch owns everything needed for independent generation: a KV cache
- * sequence, sampler chain, logits snapshot, and perplexity tracker.
- *
- * Forking is cheap — the KV prefix is shared in memory (metadata-only operation under unified KV —
- * no KV tensor buffers are copied), so sibling branches read from the same physical KV entries.
- * Only tokens decoded after the fork point are exclusive to each branch.
- *
- * Branches form trees, not just flat lists. Fork from root for best-of-N,
- * fork from children for tree search/beam search, fork from a draft for speculative
- * decoding.
- *
- * The produce/commit protocol separates sampling from state advancement:
- * produce() samples without writing to KV, letting you inspect the result
- * before deciding to commit().
- *
- * @example Best-of-N with perplexity selection
- * ```typescript
- * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 });
- * await root.prefill(tokens);
- *
- * const results = [];
- * for (let i = 0; i < 5; i++) {
- *   const branch = await root.fork();
- *   branch.reseedSampler(1000 + i);
- *   const tokens = [];
- *   for await (const { token } of branch) tokens.push(token);
- *   results.push({ branch, tokens, ppl: branch.perplexity });
- * }
- *
- * const best = results.reduce((a, b) => a.ppl < b.ppl ? a : b);
- * for (const r of results) { if (r !== best) await r.branch.prune(); }
- * ```
- *
- * @category Branching
- */
-export class Branch {
-  private _ctx: SessionContext;
-  private _handle: number;
-  private _disposed: boolean;
-
-  constructor(ctx: SessionContext, handle: number) {
-    this._ctx = ctx;
-    this._handle = handle;
-    this._disposed = false;
-  }
-
-  /**
-   * Create a root branch at the given position
-   *
-   * The branch takes ownership of the sequence and creates its own sampler
-   * chain from the provided params. Call prefill() to decode prompt tokens
-   * and capture the logit distribution before forking.
-   *
-   * @param ctx - SessionContext to create branch on
-   * @param position - Starting position (typically prompt token count)
-   * @param params - Sampling parameters (temperature, topP, etc.)
-   * @param nBatch - Per-branch batch size override (defaults to context nBatch).
-   *   Controls chunk size for prefill(). Has no effect on
-   *   single-token commit() which uses a zero-allocation fast path.
-   * @param grammar - GBNF grammar string for constrained generation.
-   *   When provided, sample() returns only grammar-valid tokens. The grammar state
-   *   is cloned on fork(), so sibling branches can diverge independently.
-   * @returns New Branch instance
-   */
-  static create(
-    ctx: SessionContext,
-    position: number,
-    params?: SamplingParams,
-    nBatch?: number,
-    grammar?: string
-  ): Branch {
-    const handle = ctx._branchCreate(position, params, nBatch, grammar);
-    return new Branch(ctx, handle);
-  }
-
-  /**
-   * Fork this branch to a new sequence (async)
-   *
-   * Async contract: local branches resolve immediately; cloud branches
-   * may perform an HTTP round-trip. Use {@link forkSync} when you know
-   * the branch is local and want zero-overhead forking.
-   *
-   * @returns New forked Branch
-   */
-  async fork(): Promise<Branch> {
-    return this.forkSync();
-  }
-
-  /**
-   * Fork this branch to a new sequence (sync)
-   *
-   * The child shares the parent's KV prefix in memory (metadata-only under unified KV, no KV buffer copy).
-   * Logits, sampler state, and perplexity tracker are cloned so the child
-   * can diverge independently. Fork from any branch — root or intermediate —
-   * to build arbitrarily deep trees.
-   *
-   * Call reseedSampler() on each child for stochastic diversity.
-   *
-   * @returns New forked Branch
-   */
-  forkSync(): Branch {
-    this._ensureNotDisposed();
-    const newHandle = this._ctx._branchFork(this._handle);
-    return new Branch(this._ctx, newHandle);
-  }
-
-  /**
-   * Get a copy of this branch's captured logits snapshot.
-   *
-   * Returns n_vocab floats — the raw logit distribution from the last
-   * prefill() or commit() call.
-   *
-   * Returns an independent copy of the branch's internal snapshot.
-   * The returned Float32Array is safe to hold across async boundaries
-   * and is not affected by subsequent decode operations.
-   *
-   * @returns Independent copy of the logits snapshot (n_vocab elements)
-   * @throws If no logits have been captured yet
-   */
-  getLogits(): Float32Array {
-    this._ensureNotDisposed();
-    return this._ctx._branchGetLogits(this._handle);
-  }
-
-  /**
-   * Bulk-decode tokens into the branch's KV cache and capture logits.
-   *
-   * `tokens.length` is the total count to process; the branch's `nBatch`
-   * (set at `Branch.create`) controls how many are sent per `llama_decode`
-   * call. E.g. 500 tokens with `nBatch=64` → 8 calls (7×64 + 1×52).
-   *
-   * Advances `position` by `tokens.length`. Stores final logits into the
-   * branch's internal snapshot — the next `produce()`/`sample()` reads
-   * from it.
-   *
-   * Does NOT accept tokens into the repeat-penalty window — for external
-   * tokens (user input between turns), not model-generated tokens.
-   * For model output, use `commit()` which does accept + decode.
-   *
-   * The primary way to feed tokens into a branch's KV cache.
-   *
-   * @param tokens - Token IDs to decode
-   */
-  async prefill(tokens: number[]): Promise<void> {
-    this._ensureNotDisposed();
-    await this._ctx._branchPrefill(this._handle, tokens);
-  }
-
-  /**
-   * Sample next token from branch's logits snapshot
-   *
-   * Applies the branch's full sampler chain (top-k, top-p, temperature,
-   * repeat/presence penalties) to the captured logits.
-   *
-   * @returns Sampled token ID
-   */
-  sample(): number {
-    this._ensureNotDisposed();
-    return this._ctx._branchSample(this._handle);
-  }
-
-  /**
-   * Record token in the sampler's repeat/presence penalty window
-   *
-   * @param token - Token to accept
-   */
-  accept(token: number): void {
-    this._ensureNotDisposed();
-    this._ctx._branchAccept(this._handle, token);
-  }
-
-  /**
-   * Discard this branch (async)
-   *
-   * Async contract: local branches resolve immediately; cloud branches
-   * may perform an HTTP round-trip. Use {@link pruneSync} when you know
-   * the branch is local.
-   *
-   * RESTRICT mode: throws if children exist. Use {@link pruneSubtree} to
-   * cascade-delete an entire subtree.
-   */
-  async prune(): Promise<void> {
-    this.pruneSync();
-  }
-
-  /**
-   * Discard this branch — remove its divergent KV entries and free the handle (sync)
-   *
-   * Only removes KV entries divergent from the shared prefix; sibling branches
-   * are unaffected. The disposed flag is set synchronously — any call to
-   * produce(), commit(), etc. after prune() will throw immediately.
-   *
-   * RESTRICT mode: throws if children exist. Use {@link pruneSubtreeSync} to
-   * cascade-delete an entire subtree.
-   */
-  pruneSync(): void {
-    if (this._disposed) return;
-    const kids = this.children;
-    if (kids.length > 0) {
-      throw new Error(
-        `Branch.prune(): branch ${this._handle} has ${kids.length} active child(ren) ` +
-        `[${kids.join(', ')}]. Prune children first or use pruneSubtree().`,
-      );
-    }
-    this._ctx._branchPrune(this._handle);
-    this._disposed = true;
-  }
-
-  /**
-   * Discard this branch and all its descendants (async)
-   *
-   * Async contract: local branches resolve immediately; cloud branches
-   * may perform an HTTP round-trip. Use {@link pruneSubtreeSync} when you know
-   * the branch is local.
-   */
-  async pruneSubtree(): Promise<void> {
-    this.pruneSubtreeSync();
-  }
-
-  /**
-   * Discard this branch and all its descendants — CASCADE delete (sync)
-   *
-   * Iterative post-order traversal: prunes children first, then this branch.
-   * Use when tearing down an entire subtree (e.g. abandoned search path).
-   * Sets disposed synchronously.
-   */
-  pruneSubtreeSync(): void {
-    if (this._disposed) return;
-    this._ctx._branchPruneSubtree(this._handle);
-    this._disposed = true;
-  }
-
-  /**
-   * Reseed the sampler's PRNG for diversity after fork()
-   *
-   * CRITICAL for parallel generation: Without reseeding, all forked branches
-   * produce identical outputs because they share the same PRNG state.
-   *
-   * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged.
-   *
-   * @param seed - New seed for the PRNG
-   */
-  reseedSampler(seed: number): void {
-    this._ensureNotDisposed();
-    this._ctx._branchSamplerChainReseed(this._handle, seed);
-  }
-
-  /**
-   * Apply dynamic logit adjustments for this branch only
-   *
-   * Unlike `logit_bias` in sampling params (which is cloned on fork), steer biases
-   * are NOT inherited by child branches. Each branch manages its own steer state
-   * independently. This makes steer ideal for path-dependent constraints.
-   *
-   * **Use cases:**
-   * - **tsampler**: Block tokens that would create repeated N-grams based on
-   *   this branch's specific generation history
-   * - **Diverse beam search**: Penalize tokens already chosen by sibling beams
-   *   to encourage output diversity across the beam
-   * - **Dynamic constraints**: Apply token restrictions that change per-step
-   *
-   * **Sampling order:** Grammar → Logit Bias → Steer → Sampler Chain
-   *
-   * @param biases - Array of token adjustments. Use `-Infinity` to completely
-   *   block a token, positive values to boost probability, negative to reduce.
-   *
-   * @example Block tokens for N-gram deduplication (tsampler pattern)
-   * ```ts
-   * // Compute which tokens would create repeated 4-grams
-   * const blocked = computeNgramBlocks(generatedTokens, n=4);
-   *
-   * // Block those tokens for this sample only
-   * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
-   *
-   * const { token } = await branch.produce();  // Blocked tokens won't be sampled
-   * await branch.commit(token);
-   *
-   * // Clear for next iteration (recompute based on new history)
-   * branch.clearSteer();
-   * ```
-   *
-   * @example Diverse beam search
-   * ```ts
-   * // Each beam penalizes tokens chosen by siblings this step
-   * for (const beam of beams) {
-   *   // Collect tokens chosen by other beams
-   *   const siblingTokens = beams
-   *     .filter(b => b !== beam && b.lastToken !== undefined)
-   *     .map(b => b.lastToken);
-   *
-   *   // Penalize sibling choices to encourage diversity
-   *   beam.branch.steer(siblingTokens.map(t => ({ token: t, bias: -2.0 })));
-   *
-   *   const { token } = await beam.branch.produce();
-   *   await beam.branch.commit(token);
-   *   beam.lastToken = token;
-   *   beam.branch.clearSteer();
-   * }
-   * ```
-   *
-   * @example Boost specific tokens
-   * ```ts
-   * // Boost "yes" and "no" tokens for a yes/no question
-   * branch.steer([
-   *   { token: yesTokenId, bias: 5.0 },
-   *   { token: noTokenId, bias: 5.0 }
-   * ]);
-   * ```
-   */
-  steer(biases: Array<{ token: number; bias: number }>): void {
-    this._ensureNotDisposed();
-    this._ctx._branchSteer(this._handle, biases);
-  }
-
-  /**
-   * Clear all steer biases from this branch
-   *
-   * Removes any dynamic logit adjustments set by `steer()`. Call this after
-   * each generation step if your steer constraints are computed per-step
-   * (e.g., N-gram blocking where the blocked set changes as text grows).
-   *
-   * @example Per-step steer pattern
-   * ```ts
-   * for (let i = 0; i < maxTokens; i++) {
-   *   // Compute constraints based on current state
-   *   const blocked = computeConstraints(generatedTokens);
-   *   branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
-   *
-   *   const { token, isStop } = await branch.produce();
-   *   if (isStop) break;
-   *
-   *   await branch.commit(token);
-   *   branch.clearSteer();  // Reset for next iteration
-   *   generatedTokens.push(token);
-   * }
-   * ```
-   */
-  clearSteer(): void {
-    this._ensureNotDisposed();
-    this._ctx._branchClearSteer(this._handle);
-  }
-
-  /**
-   * Replace the sampler chain with new parameters (memoized)
-   *
-   * If the new params match the current chain's params, this is a no-op.
-   * Otherwise the old chain is freed and a new one is created. Use for
-   * Entropy-Driven Temperature (EDT) and other adaptive sampling strategies
-   * that adjust parameters per-step.
-   *
-   * @param params - New sampling parameters
-   *
-   * @example Entropy-Driven Temperature
-   * ```typescript
-   * const entropy = branch.modelEntropy('nats');
-   * branch.setSamplerParams({ temperature: edtTemperature(entropy) });
-   * const { token } = await branch.produce();
-   * await branch.commit(token);
-   * ```
-   */
-  setSamplerParams(params: SamplingParams): void {
-    this._ensureNotDisposed();
-    this._ctx._branchSetSamplerParams(this._handle, params);
-  }
-
-  /**
-   * Replace or remove the grammar constraint
-   *
-   * Pass a GBNF grammar string to constrain generation. Pass empty string
-   * or undefined to remove the constraint. The grammar state is cloned on
-   * fork(), so sibling branches can diverge independently after hot-swap.
-   *
-   * @param grammarStr - GBNF grammar string, or empty/undefined to remove
-   *
-   * @example Hot-swap grammar mid-generation
-   * ```typescript
-   * // Start unconstrained, then switch to JSON after detecting tool call
-   * branch.setGrammar(jsonGrammar);
-   * const { token } = await branch.produce();
-   * ```
-   */
-  setGrammar(grammarStr?: string): void {
-    this._ensureNotDisposed();
-    this._ctx._branchSetGrammar(this._handle, grammarStr || '');
-  }
-
-  /**
-   * Set lazy grammar — unconstrained until trigger, then grammar-constrained
-   *
-   * Generation runs freely until a trigger pattern or token fires, at which
-   * point the grammar activates and constrains subsequent tokens. Used for
-   * tool-call generation: model writes freely until `<tool_call>`, then
-   * grammar forces valid XML structure.
-   *
-   * The grammar state is cloned on fork(), so sibling branches can diverge
-   * independently. Call again after a tool result prefill to reset.
-   *
-   * @param grammar - GBNF grammar string
-   * @param triggers - Trigger conditions from formatChat().grammarTriggers
-   */
-  setGrammarLazy(grammar: string, triggers: GrammarTrigger[]): void {
-    this._ensureNotDisposed();
-    const escapeRegex = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-    const patterns: string[] = [];
-    const tokens: number[] = [];
-    for (const t of triggers) {
-      switch (t.type) {
-        case GrammarTriggerType.WORD:
-          patterns.push(escapeRegex(t.value));
-          break;
-        case GrammarTriggerType.PATTERN:
-          patterns.push(t.value);
-          break;
-        case GrammarTriggerType.PATTERN_FULL: {
-          const p = t.value;
-          patterns.push((p[0] !== '^' ? '^' : '') + p + (p[p.length - 1] !== '$' ? '$' : ''));
-          break;
-        }
-        case GrammarTriggerType.TOKEN:
-          tokens.push(t.token);
-          break;
-      }
-    }
-    this._ctx._branchSetGrammarLazy(this._handle, grammar, patterns, tokens);
-  }
-
-  /**
-   * Sample next token without advancing state (async)
-   *
-   * Async contract: local branches resolve immediately; cloud branches
-   * may perform an HTTP round-trip. Use {@link produceSync} when you know
-   * the branch is local and want zero-overhead sampling.
-   */
-  async produce(): Promise<Produced> {
-    return this.produceSync();
-  }
-
-  /**
-   * Sample next token without advancing state (sync)
-   *
-   * Same as {@link produce} but synchronous. Use when you know the branch
-   * is local and want to avoid the microtick overhead of a promise.
-   */
-  produceSync(): Produced {
-    this._ensureNotDisposed();
-    const token = this.sample();
-    return {
-      token,
-      text: this._ctx.tokenToText(token),
-      isStop: this._ctx.isStopToken(token),
-    };
-  }
-
-  /**
-   * Accept and decode — update branch state, then write token to KV
-   *
-   * Accepts the token into the sampler penalty window (for correct PPL
-   * measurement), then decodes (writing to KV cache via AsyncWorker on
-   * the libuv thread pool) and captures the resulting logits for the next
-   * produce() call. Accept-first ordering with rollback: if decode throws,
-   * sampler/grammar/metrics are restored from clones.
-   *
-   * @param token Token to commit (from produce())
-   */
-  async commit(token: number): Promise<void> {
-    this._ensureNotDisposed();
-    await this._ctx._storeCommit([this._handle], [token]);
-  }
-
-  // ===== METRICS =====
-
-  /**
-   * Compute entropy of the branch's logits distribution
-   *
-   * Measures model uncertainty from the branch's captured logits snapshot:
-   * - Low entropy: Model is confident (peaked distribution)
-   * - High entropy: Model is uncertain (flat distribution)
-   *
-   * Operates directly on `state->logits_snapshot` — no JS round-trip.
-   *
-   * @param base - Logarithm base: "nats" (default) or "bits"
-   * @returns Entropy value in specified base
-   *
-   * COST: O(n_vocab) - must sum over all token probabilities
-   */
-  modelEntropy(base: 'nats' | 'bits' = 'nats'): number {
-    this._ensureNotDisposed();
-    return this._ctx._branchModelEntropy(this._handle, base);
-  }
-
-  /**
-   * Compute surprisal (negative log-likelihood) for a specific token
-   *
-   * Measures how "surprising" the model finds the given token from
-   * the branch's captured logits snapshot:
-   * - Low surprisal: Model expected this token (high probability)
-   * - High surprisal: Model didn't expect this token (low probability)
-   *
-   * Operates directly on `state->logits_snapshot` — no JS round-trip.
-   *
-   * @param token - Token ID to compute surprisal for
-   * @param base - Logarithm base: "nats" (default) or "bits"
-   * @returns Surprisal value in specified base
-   *
-   * COST: O(n_vocab) - softmax normalization required
-   */
-  modelSurprisal(token: number, base: 'nats' | 'bits' = 'nats'): number {
-    this._ensureNotDisposed();
-    return this._ctx._branchModelSurprisal(this._handle, token, base);
-  }
-
-  /**
-   * Sampling-level perplexity (from filtered distribution)
-   *
-   * Returns perplexity from the distribution actually sampled from
-   * (after top-k/p/temp/penalties). Useful for policy priors and
-   * monitoring sampler chain impact.
-   *
-   * Compare with {@link perplexity} which is model-level (raw logits).
-   */
-  get samplingPerplexity(): number {
-    this._ensureNotDisposed();
-    return this._ctx._branchGetSamplingPerplexity(this._handle);
-  }
-
-  /**
-   * Set static logit biases on this branch
-   *
-   * Unlike {@link steer} (which is NOT inherited on fork), logit biases
-   * ARE cloned when forking. Use for persistent constraints that should
-   * propagate to child branches.
-   *
-   * Applied during sample() in order: Grammar -> Logit Bias -> Steer -> Sampler Chain
-   *
-   * @param biases - Array of token adjustments. Use `-Infinity` to block,
-   *   positive to boost, negative to reduce.
-   */
-  setLogitBias(biases: Array<{ token: number; bias: number }>): void {
-    this._ensureNotDisposed();
-    this._ctx._branchSetLogitBias(this._handle, biases);
-  }
-
-  /**
-   * Clear all static logit biases from this branch
-   */
-  clearLogitBias(): void {
-    this._ensureNotDisposed();
-    this._ctx._branchClearLogitBias(this._handle);
-  }
-
-  // ===== ACCESSORS =====
-
-  /** Branch's current position (number of tokens decoded) */
-  get position(): number {
-    this._ensureNotDisposed();
-    return this._ctx._branchGetPosition(this._handle);
-  }
-
-  /** Branch's perplexity (exp of mean surprisal) */
-  get perplexity(): number {
-    this._ensureNotDisposed();
-    return this._ctx._branchGetPerplexity(this._handle);
-  }
-
-  /** Internal handle (for debugging) */
-  get handle(): number {
-    return this._handle;
-  }
-
-  /** Whether this branch has been disposed */
-  get disposed(): boolean {
-    return this._disposed;
-  }
-
-  /** Parent branch handle, or null if root */
-  get parent(): number | null {
-    this._ensureNotDisposed();
-    const h = this._ctx._branchParent(this._handle);
-    return h === 0 ? null : h;
-  }
-
-  /** Child branch handles */
-  get children(): number[] {
-    this._ensureNotDisposed();
-    return this._ctx._branchChildren(this._handle);
-  }
-
-  /** True if this branch has no children */
-  get isLeaf(): boolean {
-    this._ensureNotDisposed();
-    return this._ctx._branchIsLeaf(this._handle);
-  }
-
-  /** True if this branch holds a KV lease */
-  get isActive(): boolean {
-    this._ensureNotDisposed();
-    return this._ctx._branchIsActive(this._handle);
-  }
-
-  // ===== ASYNC ITERATION =====
-
-  /**
-   * Async iterator — generate tokens until EOG
-   *
-   * Commit-before-yield semantics: every yielded token is already written
-   * to KV and accepted into the sampler. Breaking out of the loop is clean —
-   * no orphaned uncommitted tokens, perplexity reflects all yielded tokens.
-   *
-   * For inspect-before-commit (speculative decoding, tree search), use
-   * the {@link produce}/{@link commit} protocol directly.
-   *
-   * @example Generate to completion
-   * ```typescript
-   * for await (const { token, text } of branch) {
-   *   process.stdout.write(text);
-   * }
-   * ```
-   *
-   * @example Generate with consumer-side bound
-   * ```typescript
-   * const tokens = [];
-   * for await (const { token } of branch) {
-   *   tokens.push(token);
-   *   if (tokens.length >= limit) break;
-   * }
-   * ```
-   */
-  async *[Symbol.asyncIterator](): AsyncIterableIterator<{ token: number; text: string }> {
-    while (!this._disposed) {
-      const { token, text, isStop } = await this.produce();
-      if (isStop) return;
-      await this.commit(token);
-      yield { token, text };
-    }
-  }
-
-  // ===== INTERNAL =====
-
-  private _ensureNotDisposed(): void {
-    if (this._disposed) {
-      throw new Error('Branch has been disposed');
-    }
-  }
-}
diff --git a/src/BranchStore.ts b/src/BranchStore.ts
deleted file mode 100644
index c4813b9..0000000
--- a/src/BranchStore.ts
+++ /dev/null
@@ -1,155 +0,0 @@
-import type { Branch } from './Branch';
-import type { SessionContext } from './types';
-
-/**
- * High-throughput multi-branch decode operations
- *
- * The naive approach to N-branch generation is N sequential llama_decode()
- * calls — each paying full GPU kernel launch overhead, memory barrier, and
- * PCIe round-trip. BranchStore eliminates this by packing all branches into
- * a single llama_batch and dispatching once: O(1) GPU round-trips regardless
- * of branch count. The GPU parallelizes across sequences within the batch,
- * so N branches approach the wall-time cost of 1.
- *
- * Two operations, two packing strategies:
- *
- * **commit()** — Generation step. Each branch contributes exactly 1 token.
- * Packs N tokens into a single batch via `decode_each` (one row per sequence,
- * all at their respective positions). Single `llama_decode()` call. Logits
- * captured per-branch at batch index `i`. O(N) total work, O(1) GPU
- * dispatches, O(1) amortized dispatch overhead per branch. Accept-first
- * ordering with rollback: accepts each token into its branch's repeat-penalty
- * window before decode, restores from clones if decode throws.
- *
- * **prefill()** — Bulk token injection. Each branch contributes a
- * variable-length token array. Uses a two-pass bin-packing algorithm:
- *
- * - *Pass 1 (planning)*: Greedy first-fit packs items into chunks ≤ nBatch.
- *   Items larger than nBatch get a dedicated chunk and fall through to
- *   decode_many's internal auto-chunking (ceil(nTokens / nBatch) calls).
- * - *Pass 2 (dispatch)*: Normal chunks dispatch via `decode_scatter` (one
- *   `llama_decode` per chunk). Logits are indexed by flattened cursor
- *   position: for item k in a chunk, logits live at `cursor + nTokens[k] - 1`.
- *
- * For T total tokens across N branches with batch capacity B:
- * - Best case (T ≤ B): 1 GPU dispatch, all branches in one batch.
- * - Worst case: ceil(T / B) dispatches. Each dispatch is fully packed.
- * - Amortized per-token GPU overhead: O(1/B) — vanishes as batch fills.
- *
- * Does NOT accept tokens into the sampler penalty window — use for
- * external/replayed tokens where repeat-penalty tracking is unwanted.
- * For model-generated tokens, use {@link commit} instead.
- *
- * Both methods take `[branch, token(s)]` tuples — the branch-to-token
- * binding is structural, not positional. After either call, each branch's
- * logits snapshot is updated with the output distribution from its decoded
- * token(s), ready for the next `produce()`/`sample()` call.
- *
- * @example 32-branch generation step — one GPU dispatch
- * ```typescript
- * const store = new BranchStore(ctx);
- * const entries = await Promise.all(branches.map(async b => [b, (await b.produce()).token] as [Branch, number]));
- * await store.commit(entries);  // 32 tokens, 1 llama_decode()
- * ```
- *
- * @example Best-of-N with batched commit
- * ```typescript
- * const store = new BranchStore(ctx);
- * const branches = [];
- * for (const _ of [1, 2, 3]) branches.push(await root.fork());
- *
- * for (let step = 0; step < 50; step++) {
- *   const produced = await Promise.all(branches.map(async b => [b, await b.produce()] as const));
- *   const live = produced.filter(([, p]) => !p.isStop);
- *   if (!live.length) break;
- *   await store.commit(live.map(([b, p]) => [b, p.token]));
- * }
- * ```
- *
- * @example Asymmetric prefill — variable-length injections, auto-chunked
- * ```typescript
- * await store.prefill([
- *   [branchA, systemPromptTokens],   // 200 tokens
- *   [branchB, shortQueryTokens],     //  12 tokens
- *   [branchC, longDocumentTokens],   // 800 tokens
- * ]);
- * // Bin-packed into ceil(1012 / nBatch) GPU dispatches
- * ```
- *
- * @category Branching
- */
-export class BranchStore {
-  private _ctx: SessionContext;
-
-  constructor(ctx: SessionContext) {
-    this._ctx = ctx;
-  }
-
-  /**
-   * Batched single-token commit for model-generated tokens
-   *
-   * Each tuple `[branch, token]` binds one token to one branch.
-   * Accepts each token into its branch's repeat-penalty window (for correct
-   * PPL measurement), then decodes all N tokens in a single llama_decode()
-   * call via decode_each and captures logits per-branch. Accept-first
-   * ordering with rollback: if decode throws, sampler/grammar/metrics are
-   * restored from clones taken before the accept.
-   *
-   * @param entries - Array of `[branch, token]` tuples (branches must not be disposed)
-   * @throws If any branch is disposed
-   */
-  async commit(entries: [Branch, number][]): Promise<void> {
-    const handles: number[] = [];
-    const tokens: number[] = [];
-    for (const [branch, token] of entries) {
-      if (branch.disposed) throw new Error('BranchStore.commit: branch is disposed');
-      handles.push(branch.handle);
-      tokens.push(token);
-    }
-    await this._ctx._storeCommit(handles, tokens);
-  }
-
-  /**
-   * Batched variable-length prefill for external tokens
-   *
-   * Each tuple `[branch, tokens]` binds a token array to one branch.
-   * Each branch can receive a different number of tokens — decode_scatter
-   * handles variable-length runs and auto-chunks to fit nBatch.
-   *
-   * Does NOT call accept_token — use for external/replayed tokens where
-   * repeat-penalty tracking is unwanted. For model-generated tokens,
-   * use {@link commit} instead.
-   *
-   * @param entries - Array of `[branch, tokens]` tuples (branches must not be disposed)
-   * @throws If any branch is disposed
-   */
-  async prefill(entries: [Branch, number[]][]): Promise<void> {
-    const handles: number[] = [];
-    const tokenArrays: number[][] = [];
-    for (const [branch, tokens] of entries) {
-      if (branch.disposed) throw new Error('BranchStore.prefill: branch is disposed');
-      handles.push(branch.handle);
-      tokenArrays.push(tokens);
-    }
-    await this._ctx._storePrefill(handles, tokenArrays);
-  }
-
-  /**
-   * Retain only the winner branch — evict all other leases and free their slots.
-   *
-   * Nuclear operation: calls `kv::seq_keep` on the winner's seq_id (stripping all
-   * other sequences from KV cache in a single pass), then frees all loser slots
-   * and rebuilds the vacancy list. The winner's topology is reset (no parent, no children).
-   *
-   * @param winner - The branch to keep (must not be disposed, must hold a lease)
-   * @throws If winner is disposed or has no lease
-   */
-  async retainOnly(winner: Branch): Promise<void> {
-    if (winner.disposed) throw new Error('BranchStore.retainOnly: winner is disposed');
-    this._ctx._storeRetainOnly(winner.handle);
-  }
-
-  get available(): number {
-    return this._ctx._storeAvailable();
-  }
-}
diff --git a/src/Rerank.ts b/src/Rerank.ts
deleted file mode 100644
index 0771fef..0000000
--- a/src/Rerank.ts
+++ /dev/null
@@ -1,268 +0,0 @@
-import { createContext } from './index.js';
-import type { SessionContext, RerankOptions, RerankResult, RerankProgress } from './types';
-
-const SYSTEM_PROMPT =
-  'Judge whether the Document meets the requirements based on the Query ' +
-  'and the Instruct provided. Note that the answer can only be "yes" or "no".';
-
-const USER_PREFIX =
-  '<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n\n' +
-  '<Query>: ';
-
-interface ScoringRequest {
-  tokenArrays: number[][];
-  cursor: number;
-  scores: number[];
-  filled: number;
-  topK: number | undefined;
-  total: number;
-  push: (progress: RerankProgress) => void;
-  finish: () => void;
-  error: (err: Error) => void;
-}
-
-/** Simple async channel — _drain pushes, consumer pulls via for-await */
-function channel<T>(): {
-  push: (value: T) => void;
-  finish: () => void;
-  error: (err: Error) => void;
-  iterable: AsyncIterable<T>;
-} {
-  const buffer: T[] = [];
-  let done = false;
-  let err: Error | null = null;
-  let notify: (() => void) | null = null;
-
-  const wait = () => new Promise<void>((r) => { notify = r; });
-
-  return {
-    push(value: T) {
-      buffer.push(value);
-      notify?.();
-      notify = null;
-    },
-    finish() {
-      done = true;
-      notify?.();
-      notify = null;
-    },
-    error(e: Error) {
-      err = e;
-      notify?.();
-      notify = null;
-    },
-    iterable: {
-      [Symbol.asyncIterator](): AsyncIterator<T> {
-        return {
-          async next(): Promise<IteratorResult<T>> {
-            while (buffer.length === 0 && !done && !err) await wait();
-            if (err) throw err;
-            if (buffer.length > 0) return { value: buffer.shift()!, done: false };
-            return { value: undefined as unknown as T, done: true };
-          },
-        };
-      },
-    },
-  };
-}
-
-export class Rerank {
-  private _ctx: SessionContext;
-  private _nSeqMax: number;
-  private _nCtx: number;
-  private _yesId: number;
-  private _noId: number;
-  private _prefixTokens: number[];
-  private _midTokens: number[];
-  private _suffixTokens: number[];
-  private _pending: ScoringRequest[] = [];
-  private _draining = false;
-  private _disposed = false;
-
-  private constructor(
-    ctx: SessionContext,
-    nSeqMax: number,
-    nCtx: number,
-    yesId: number,
-    noId: number,
-    prefixTokens: number[],
-    midTokens: number[],
-    suffixTokens: number[],
-  ) {
-    this._ctx = ctx;
-    this._nSeqMax = nSeqMax;
-    this._nCtx = nCtx;
-    this._yesId = yesId;
-    this._noId = noId;
-    this._prefixTokens = prefixTokens;
-    this._midTokens = midTokens;
-    this._suffixTokens = suffixTokens;
-  }
-
-  static async create(options: RerankOptions): Promise<Rerank> {
-    const nSeqMax = options.nSeqMax ?? 8;
-    const nCtx = options.nCtx ?? 4096;
-    const ctx = await createContext({
-      modelPath: options.modelPath,
-      nCtx,
-      nSeqMax,
-      typeK: options.typeK ?? 'q4_0',
-      typeV: options.typeV ?? 'q4_0',
-    });
-
-    const [yesId] = await ctx.tokenize('yes', false);
-    const [noId] = await ctx.tokenize('no', false);
-
-    const SENTINEL_Q = '\x00QUERY\x00';
-    const SENTINEL_D = '\x00DOC\x00';
-    const probe = await ctx.formatChat(JSON.stringify([
-      { role: 'system', content: SYSTEM_PROMPT },
-      { role: 'user', content: `${USER_PREFIX}${SENTINEL_Q}\n\n<Document>: ${SENTINEL_D}` },
-    ]), { addGenerationPrompt: true, enableThinking: false });
-
-    const p = probe.prompt;
-    const qi = p.indexOf(SENTINEL_Q);
-    const di = p.indexOf(SENTINEL_D);
-    const prefixTokens = await ctx.tokenize(p.slice(0, qi), true);
-    const midTokens = await ctx.tokenize(p.slice(qi + SENTINEL_Q.length, di), false);
-    const suffixTokens = await ctx.tokenize(p.slice(di + SENTINEL_D.length), false);
-
-    return new Rerank(ctx, nSeqMax, nCtx, yesId, noId, prefixTokens, midTokens, suffixTokens);
-  }
-
-  score(query: string, documents: number[][], topK?: number): AsyncIterable<RerankProgress> {
-    if (this._disposed) throw new Error('Rerank disposed');
-
-    const self = this;
-    const ch = channel<RerankProgress>();
-
-    (async () => {
-      try {
-        const queryTokens = await self._ctx.tokenize(query, false);
-        const shared = [...self._prefixTokens, ...queryTokens, ...self._midTokens];
-        const maxDoc = Math.floor(self._nCtx / self._nSeqMax) - shared.length - self._suffixTokens.length;
-
-        const tokenArrays = documents.map((doc) => {
-          const trimmed = doc.length > maxDoc ? doc.slice(0, maxDoc) : doc;
-          return [...shared, ...trimmed, ...self._suffixTokens];
-        });
-
-        self._enqueue(tokenArrays, topK, ch.push, ch.finish, ch.error);
-      } catch (err) {
-        ch.error(err instanceof Error ? err : new Error(String(err)));
-      }
-    })();
-
-    return ch.iterable;
-  }
-
-  async tokenize(text: string): Promise<number[]> {
-    return this._ctx.tokenize(text, false);
-  }
-
-  dispose(): void {
-    this._disposed = true;
-    const err = new Error('Rerank disposed');
-    for (const req of this._pending) req.error(err);
-    this._pending.length = 0;
-    this._ctx.dispose();
-  }
-
-  // ── Queue internals ──────────────────────────────────────────
-
-  private _sortResults(scores: number[], topK: number | undefined): RerankResult[] {
-    const sorted = scores
-      .map((score, index) => ({ score: Math.round(score * 1000) / 1000, index }))
-      .sort((a, b) => b.score - a.score);
-    return topK != null ? sorted.slice(0, topK) : sorted;
-  }
-
-  private _enqueue(
-    tokenArrays: number[][],
-    topK: number | undefined,
-    push: (progress: RerankProgress) => void,
-    finish: () => void,
-    error: (err: Error) => void,
-  ): void {
-    this._pending.push({
-      tokenArrays, cursor: 0,
-      scores: new Array(tokenArrays.length),
-      filled: 0,
-      topK,
-      total: tokenArrays.length,
-      push, finish, error,
-    });
-    this._drain();
-  }
-
-  private _fillGroup(): { reqIdx: number; promptIdx: number; tokens: number[] }[] {
-    const group: { reqIdx: number; promptIdx: number; tokens: number[] }[] = [];
-    let added = true;
-    while (group.length < this._nSeqMax && added) {
-      added = false;
-      for (let r = 0; r < this._pending.length && group.length < this._nSeqMax; r++) {
-        const req = this._pending[r];
-        if (req.cursor < req.tokenArrays.length) {
-          group.push({ reqIdx: r, promptIdx: req.cursor, tokens: req.tokenArrays[req.cursor] });
-          req.cursor++;
-          added = true;
-        }
-      }
-    }
-    return group;
-  }
-
-  private async _drain(): Promise<void> {
-    if (this._draining) return;
-    this._draining = true;
-
-    try {
-      while (this._pending.length > 0) {
-        const group = this._fillGroup();
-        if (group.length === 0) break;
-
-        let logits: Float32Array[];
-        try {
-          logits = await this._ctx._scoreGroup(group.map((g) => g.tokens));
-        } catch (err) {
-          const error = err instanceof Error ? err : new Error(String(err));
-          for (const req of this._pending) req.error(error);
-          this._pending.length = 0;
-          return;
-        }
-
-        // Track which requests got new scores this group
-        const touched = new Set<number>();
-        for (let i = 0; i < group.length; i++) {
-          const req = this._pending[group[i].reqIdx];
-          req.scores[group[i].promptIdx] = this._rerankScore(logits[i]);
-          req.filled++;
-          touched.add(group[i].reqIdx);
-        }
-
-        // Push progress for each request that advanced, finish completed ones
-        for (let r = this._pending.length - 1; r >= 0; r--) {
-          const req = this._pending[r];
-          if (!touched.has(r)) continue;
-
-          const results = this._sortResults(req.scores, req.topK);
-          req.push({ filled: req.filled, total: req.total, results });
-
-          if (req.filled === req.total) {
-            req.finish();
-            this._pending.splice(r, 1);
-          }
-        }
-      }
-    } finally {
-      this._draining = false;
-    }
-  }
-
-  private _rerankScore(logits: Float32Array): number {
-    const max = Math.max(logits[this._yesId], logits[this._noId]);
-    const yesExp = Math.exp(logits[this._yesId] - max);
-    const noExp = Math.exp(logits[this._noId] - max);
-    return yesExp / (yesExp + noExp);
-  }
-}
diff --git a/src/Session.ts b/src/Session.ts
deleted file mode 100644
index 4ce87fb..0000000
--- a/src/Session.ts
+++ /dev/null
@@ -1,99 +0,0 @@
-import type { Branch } from './Branch';
-import type { BranchStore } from './BranchStore';
-import type { SessionContext } from './types';
-import { buildUserDelta, buildToolResultDelta } from './agents/deltas';
-
-/**
- * Session - Trunk lifecycle + conversation delta helpers
- *
- * Owns the current "trunk" branch and provides promote() to crown a winner,
- * plus delta helpers that centralize the sep + formatChat + tokenize + prefill
- * pattern for injecting new turns into an ongoing conversation.
- *
- * Session does NOT own the SessionContext or BranchStore — the consumer
- * creates those and passes them in. dispose() prunes trunk only.
- *
- * @example
- * ```typescript
- * const session = new Session({ ctx, store });
- * session.trunk = initialBranch;
- *
- * // After verification, promote the best attempt
- * await session.promote(bestAttempt.branch);
- *
- * // Inject a user turn and generate
- * await session.prefillUser('What about X?');
- * for await (const { text } of session.trunk) {
- *   process.stdout.write(text);
- * }
- *
- * // Cleanup
- * await session.dispose();
- * ctx.dispose();
- * ```
- *
- * @category Branching
- */
-export class Session {
-  private _ctx: SessionContext;
-  private _store: BranchStore;
-  private _trunk: Branch | null;
-
-  constructor({ ctx, store }: { ctx: SessionContext; store: BranchStore }) {
-    this._ctx = ctx;
-    this._store = store;
-    this._trunk = null;
-  }
-
-  /** Current trunk branch */
-  get trunk(): Branch | null {
-    return this._trunk;
-  }
-
-  /** Assign initial trunk (no promote) */
-  set trunk(branch: Branch | null) {
-    this._trunk = branch;
-  }
-
-  /**
-   * Promote a winner to trunk — retainOnly + reassign
-   *
-   * Safe even if winner is the only branch (resets topology, no-op on KV).
-   */
-  async promote(winner: Branch): Promise<void> {
-    await this._store.retainOnly(winner);
-    this._trunk = winner;
-  }
-
-  /**
-   * Dispose trunk only — consumer owns ctx and other resources
-   */
-  async dispose(): Promise<void> {
-    if (this._trunk && !this._trunk.disposed) {
-      await this._trunk.prune();
-    }
-    this._trunk = null;
-  }
-
-  /**
-   * Prefill a user turn into trunk
-   *
-   * @param content - User message content
-   * @param opts - Optional tools JSON string
-   */
-  async prefillUser(content: string, opts: { tools?: string } = {}): Promise<void> {
-    const tokens = buildUserDelta(this._ctx, content, opts);
-    await this._trunk!.prefill(tokens);
-  }
-
-  /**
-   * Prefill a tool result turn into trunk
-   *
-   * @param resultStr - JSON-stringified tool result
-   * @param callId - Tool call ID
-   */
-  async prefillToolResult(resultStr: string, callId: string): Promise<void> {
-    const tokens = buildToolResultDelta(this._ctx, resultStr, callId);
-    await this._trunk!.prefill(tokens);
-  }
-}
diff --git a/src/agents/Tool.ts b/src/agents/Tool.ts
deleted file mode 100644
index 20c34d8..0000000
--- a/src/agents/Tool.ts
+++ /dev/null
@@ -1,76 +0,0 @@
-import type { JsonSchema, ToolSchema, ToolContext } from './types';
-
-/**
- * Abstract base class for tools usable by agents in the runtime
- *
- * Subclass to define tools that agents can invoke during generation.
- * Implement `name`, `description`, `parameters`, and `execute()`. The
- * {@link schema} getter auto-generates the OpenAI-compatible function
- * schema expected by `formatChat()`.
- *
- * Pass tool instances to {@link createToolkit} to build the `toolMap`
- * and `toolsJson` pair consumed by {@link useAgentPool} and
- * {@link runAgents}.
- *
- * @example Search tool
- * ```typescript
- * class SearchTool extends Tool<{ query: string; topK?: number }> {
- *   readonly name = 'search';
- *   readonly description = 'Search the corpus for relevant passages';
- *   readonly parameters = {
- *     type: 'object',
- *     properties: {
- *       query: { type: 'string', description: 'Search query' },
- *       topK: { type: 'number', description: 'Number of results' },
- *     },
- *     required: ['query'],
- *   };
- *
- *   async execute(args: { query: string; topK?: number }, ctx?: ToolContext) {
- *     const results = await this.reranker.rank(args.query, args.topK ?? 5);
- *     return { results };
- *   }
- * }
- * ```
- *
- * @category Agents
- */
-export abstract class Tool<TArgs = Record<string, unknown>> {
-  /** Tool name — used as the function identifier in tool calls */
-  abstract readonly name: string;
-  /** Human-readable description shown to the model */
-  abstract readonly description: string;
-  /** JSON Schema describing the tool's expected arguments */
-  abstract readonly parameters: JsonSchema;
-
-  /**
-   * Execute the tool with parsed arguments
-   *
-   * Called by the agent pool when the model emits a tool call matching
-   * this tool's name. The return value is JSON-serialized and prefilled
-   * back into the agent's context as a tool result.
-   *
-   * @param args - Parsed arguments from the model's tool call
-   * @param context - Execution context with progress reporting callback
-   * @returns Tool result (will be JSON-serialized)
-   */
-  abstract execute(args: TArgs, context?: ToolContext): Promise<unknown>;
-
-  /**
-   * OpenAI-compatible function tool schema
-   *
-   * Auto-generated from `name`, `description`, and `parameters`.
-   * Used by {@link createToolkit} to build the JSON string passed
-   * to `formatChat()`.
-   */
-  get schema(): ToolSchema {
-    return {
-      type: 'function',
-      function: {
-        name: this.name,
-        description: this.description,
-        parameters: this.parameters,
-      },
-    };
-  }
-}
diff --git a/src/agents/agent-pool.ts b/src/agents/agent-pool.ts
deleted file mode 100644
index 7ff9316..0000000
--- a/src/agents/agent-pool.ts
+++ /dev/null
@@ -1,586 +0,0 @@
-import { resource, call, action, ensure, useScope, createSignal, spawn, each } from 'effection';
-import type { Operation, Scope, Channel } from 'effection';
-import type { Branch } from '../Branch';
-import { CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, GrammarTriggerType, type GrammarTrigger, type ParsedToolCall, type SessionContext } from '../types';
-import type { BranchStore } from '../BranchStore';
-import { Ctx, Store, Events } from './context';
-import { buildToolResultDelta } from './deltas';
-import type {
-  TraceToken,
-  PressureThresholds,
-  AgentTaskSpec,
-  AgentPoolOptions,
-  AgentPoolResult,
-  AgentEvent,
-} from './types';
-
-// ── Internal agent state machine ───────────────────────────────
-// generating → awaiting_tool → generating  (tool result prefilled)
-// generating → done                         (stop + no tool call, or report)
-// awaiting_tool → done                      (tool error)
-
-type AgentInternalState = 'generating' | 'awaiting_tool' | 'done';
-
-interface AgentInternal {
-  id: number;           // = branch.handle
-  parentId: number;     // = parent.handle
-  branch: Branch;
-  state: AgentInternalState;
-  fmt: {
-    format: number;
-    reasoningFormat: number;
-    thinkingForcedOpen: boolean;
-    parser: string;
-    grammar: string;
-    grammarLazy: boolean;
-    grammarTriggers: GrammarTrigger[];
-  };
-  rawOutput: string;
-  tokenCount: number;
-  toolCallCount: number;
-  turns: number;
-  findings: string | null;
-  traceBuffer: TraceToken[];
-}
-
-interface SettledTool {
-  agentId: number;
-  prefillTokens: number[];
-  toolName: string;
-}
-
-/**
- * Immutable KV budget snapshot for one tick of the agent loop
- *
- * Created from `SessionContext._storeKvPressure()` which returns
- * `{ nCtx, cellsUsed, remaining }` where `remaining = nCtx - cellsUsed`.
- * `cellsUsed` is a monotonic counter in `BranchStore` — it increments on
- * every `decode_each` / `decode_scatter` but does **not** decrement on
- * individual branch prune (only resets on bulk ops like `retainOnly` and
- * `drain`). This means `remaining` is a conservative lower bound that
- * becomes increasingly pessimistic as branches are pruned mid-run.
- *
- * Two thresholds partition `remaining` into three zones:
- *
- * ```
- * ┌──────────────────────────────────────────────────────┐
- * │                    nCtx                              │
- * │  ┌──────────┬───────────────────┬──────────────────┐ │
- * │  │cellsUsed │    headroom > 0   │    softLimit     │ │
- * │  │ (in use) │   (new work OK)   │   (reserved)     │ │
- * │  └──────────┴───────────────────┴──────────────────┘ │
- * │              ◄── remaining ──►  │                    │
- * │                                 │                    │
- * │  headroom = remaining - softLimit                    │
- * │  critical = remaining < hardLimit                    │
- * └──────────────────────────────────────────────────────┘
- * ```
- *
- * - **headroom > 0** — room for new work (tool results, generation)
- * - **headroom ≤ 0** — over budget. SETTLE rejects tool results, PRODUCE
- *   hard-cuts non-terminal tool calls. Terminal tools still pass.
- * - **critical** — remaining below hardLimit. Agents killed before
- *   `produceSync()` to prevent llama_decode crashes.
- *
- * @category Agents
- */
-export class ContextPressure {
-  /** Default softLimit: 1024 tokens reserved for downstream work */
-  static readonly DEFAULT_SOFT_LIMIT = 1024;
-  /** Default hardLimit: 128 tokens crash-prevention floor */
-  static readonly DEFAULT_HARD_LIMIT = 128;
-
-  /**
-   * KV slots remaining (`nCtx - cellsUsed`).
-   * Infinity when nCtx ≤ 0 (no context limit).
-   * Conservative: may undercount actual free space when branches have been
-   * pruned, since `cellsUsed` is monotonic.
-   */
-  readonly remaining: number;
-  /** Remaining KV floor — tokens reserved for downstream work */
-  readonly softLimit: number;
-  /** Crash-prevention floor — agents killed when remaining drops below */
-  readonly hardLimit: number;
-
-  constructor(ctx: SessionContext, opts?: PressureThresholds) {
-    const p = ctx._storeKvPressure();
-    this.remaining = p.nCtx <= 0 ? Infinity : p.remaining;
-    this.softLimit = opts?.softLimit ?? ContextPressure.DEFAULT_SOFT_LIMIT;
-    this.hardLimit = opts?.hardLimit ?? ContextPressure.DEFAULT_HARD_LIMIT;
-  }
-
-  /**
-   * Tokens available for new work: `remaining - softLimit`.
-   * Positive means room to accept tool results or continue generating.
-   * Negative means over budget — SETTLE rejects, PRODUCE hard-cuts.
-   */
-  get headroom(): number { return this.remaining - this.softLimit; }
-
-  /** `remaining < hardLimit` — agent must not call `produceSync()`. */
-  get critical(): boolean { return this.remaining < this.hardLimit; }
-
-  /** Can `tokenCount` tokens fit while staying above softLimit? */
-  canFit(tokenCount: number): boolean { return tokenCount <= this.headroom; }
-}
-
-/**
- * Fork an agent from a parent branch with its own system prompt and task.
- *
- * Generator — uses sync native calls so Effection sees everything.
- * On scope exit (error, cancellation), `ensure()` prunes the branch
- * automatically — the orphaned-branch leak is structurally impossible.
- */
-function* setupAgent(
-  parent: Branch,
-  task: AgentTaskSpec,
-  ctx: SessionContext,
-): Operation<{ agent: AgentInternal; suffixTokens: number[] }> {
-  const messages = [
-    { role: 'system', content: task.systemPrompt },
-    { role: 'user', content: task.content },
-  ];
-  const fmtOpts = task.tools ? { tools: task.tools } : {};
-  const fmt = ctx.formatChatSync(JSON.stringify(messages), fmtOpts);
-  if (task.tools && (fmt.format === CHAT_FORMAT_CONTENT_ONLY || fmt.format === CHAT_FORMAT_GENERIC)) {
-    // Error before fork — no branch to clean up
-    throw new Error('Model does not support tool calling. Please use a model with native tool support (e.g. Qwen3, Llama 3.x, Mistral).');
-  }
-  const branch = parent.forkSync();
-  yield* ensure(() => { if (!branch.disposed) branch.pruneSync(); });
-  const sep = ctx.getTurnSeparator();
-  const suffixTokens = [...sep, ...ctx.tokenizeSync(fmt.prompt, false)];
-  if (task.seed != null) branch.reseedSampler(task.seed);
-
-  return {
-    agent: {
-      id: branch.handle,
-      parentId: parent.handle,
-      branch,
-      state: 'generating',
-      fmt: {
-        format: fmt.format,
-        reasoningFormat: fmt.reasoningFormat,
-        thinkingForcedOpen: fmt.thinkingForcedOpen,
-        parser: fmt.parser,
-        grammar: fmt.grammar,
-        grammarLazy: fmt.grammarLazy,
-        grammarTriggers: fmt.grammarTriggers,
-      },
-      rawOutput: '',
-      tokenCount: 0,
-      toolCallCount: 0,
-      turns: 0,
-      findings: null,
-      traceBuffer: [],
-    },
-    suffixTokens,
-  };
-}
-
-/**
- * Concurrent agent generation loop as an Effection resource
- *
- * Runs N agents in parallel using a three-phase tick loop over shared
- * {@link BranchStore} infrastructure. Each agent forks from a parent
- * branch, generates tokens, invokes tools, and reports findings.
- *
- * **Three-phase tick loop:**
- * 1. **PRODUCE** — sample all active agents via `produceSync()` (no async gap)
- * 2. **COMMIT** — single GPU call via `store.commit()` for all produced tokens
- * 3. **SETTLE** — drain settled tool results, batch prefill, reset grammars
- *
- * Tool dispatch uses `scope.run()` for eager start — tool executions run as
- * children of the agent pool scope and are cancelled if the scope exits.
- *
- * **Resource semantics:** `provide()` suspends after all agents complete,
- * keeping branches alive so the caller can fork from them (e.g. for
- * verification). Branches are pruned when the scope exits — each branch's
- * `ensure()` from `setupAgent` handles cleanup automatically.
- *
- * For automatic branch cleanup on return, use {@link runAgents} instead.
- *
- * @param opts - Pool configuration: tasks, tools, sampling params, max turns
- * @returns Agent pool result with per-agent findings and aggregate statistics
- *
- * @example Shared root with agent pool
- * ```typescript
- * const pool = yield* withSharedRoot(
- *   { systemPrompt: RESEARCH_PROMPT, tools: toolsJson },
- *   function*(root) {
- *     return yield* useAgentPool({
- *       tasks: questions.map(q => ({
- *         systemPrompt: RESEARCH_PROMPT,
- *         content: q,
- *         tools: toolsJson,
- *         parent: root,
- *       })),
- *       tools: toolMap,
- *       maxTurns: 6,
- *     });
- *   },
- * );
- * ```
- *
- * @category Agents
- */
-export function useAgentPool(opts: AgentPoolOptions): Operation<AgentPoolResult> {
-  return resource(function*(provide) {
-    const ctx: SessionContext = yield* Ctx.expect();
-    const store: BranchStore = yield* Store.expect();
-    const events: Channel<AgentEvent, void> = yield* Events.expect();
-    const scope: Scope = yield* useScope();
-
-    // Bridge for onProgress callbacks — Signal is correct here (external callback).
-    // A spawned forwarder drains the bridge into the Channel with proper scope context.
-    const progressBridge = createSignal<AgentEvent, void>();
-    yield* spawn(function*() {
-      for (const ev of yield* each(progressBridge)) {
-        yield* events.send(ev);
-        yield* each.next();
-      }
-    });
-    const { tasks, tools, maxTurns = 100, terminalTool, trace = false, pressure: pressureOpts } = opts;
-
-    // Whether the pool's tool registry contains tools besides the terminal tool.
-    // When false, agents are allowed to call the terminal tool as their first
-    // action (e.g. reporter sub-agents that only have `report()`). When true,
-    // the first tool call must be a non-terminal tool to prevent agents from
-    // immediately reporting without doing any work.
-    //
-    // IMPORTANT: this checks the pool's `tools` registry, not individual task
-    // schemas (`task.tools`). A reporter pool must pass only the terminal tool
-    // in its registry — passing the full tool map makes this flag true and
-    // traps reporters in an infinite rejection loop.
-    const hasNonTerminalTools = terminalTool ? [...tools.keys()].some(k => k !== terminalTool) : tools.size > 0;
-
-    // ── Setup: fork branches, collect suffix tokens ──────────
-    // setupAgent is now a generator — each branch registers its own ensure()
-    // for cleanup. No manual try/finally needed here.
-    const agents: AgentInternal[] = [];
-    const prefillSetup: [Branch, number[]][] = [];
-
-    for (const task of tasks) {
-      const parent = task.parent;
-      if (!parent) throw new Error('useAgentPool: each task must have a parent branch');
-
-      const { agent, suffixTokens } = yield* setupAgent(parent, task, ctx);
-      agents.push(agent);
-      prefillSetup.push([agent.branch, suffixTokens]);
-    }
-
-    // Batch prefill all agent suffixes — pressure-gated.
-    // Each suffix is the full formatted chat (system prompt + tools JSON +
-    // user message + generation prompt), tokenized via formatChatSync().
-    // Suffix cost is model-dependent: ~250-400 tokens per agent depending
-    // on chat template verbosity and tool schema size.
-    const initPressure = new ContextPressure(ctx, pressureOpts);
-    const totalSuffix = prefillSetup.reduce((s, [, t]) => s + t.length, 0);
-    if (!initPressure.canFit(totalSuffix)) {
-      // Not enough room — drop agents from the end until it fits
-      while (prefillSetup.length > 0) {
-        const needed = prefillSetup.reduce((s, [, t]) => s + t.length, 0);
-        if (initPressure.canFit(needed)) break;
-        prefillSetup.pop();
-        const dropped = agents.pop()!;
-        dropped.state = 'done';
-      }
-    }
-    if (prefillSetup.length > 0) {
-      yield* call(() => store.prefill(prefillSetup));
-    }
-
-    // Emit spawn events — TUI uses parentAgentId to detect sub-agents
-    for (const a of agents) {
-      yield* events.send({ type: 'agent:spawn', agentId: a.id, parentAgentId: a.parentId });
-    }
-
-    // ── Lazy grammar setup ───────────────────────────────────
-    const applyLazyGrammar = (a: AgentInternal): void => {
-      if (a.fmt.grammar && a.fmt.grammarLazy && a.fmt.grammarTriggers.length > 0) {
-        const triggers = a.fmt.grammarTriggers.map(t => {
-          if (t.type === GrammarTriggerType.WORD) {
-            const nlIdx = t.value.indexOf('\n');
-            if (nlIdx >= 0 && nlIdx < t.value.length - 1) {
-              return { ...t, value: t.value.slice(0, nlIdx + 1) };
-            }
-          }
-          return t;
-        });
-        a.branch.setGrammarLazy(a.fmt.grammar, triggers);
-      }
-    };
-    for (const a of agents) applyLazyGrammar(a);
-
-    // ── Tool dispatch coordination ───────────────────────────
-    // Plain JS buffer: spawned tool tasks push synchronously on completion.
-    // SETTLE drains with splice(0). Safe because generators are synchronous
-    // between yields — spawns can only push at yield points (during COMMIT's
-    // yield* call()), and SETTLE runs after COMMIT in the same tick.
-    const settledBuffer: SettledTool[] = [];
-    const agentById = new Map(agents.map(a => [a.id, a]));
-
-    // Track pending tool count for idle detection
-    let pendingToolCount = 0;
-
-    // Resolve function for idle wake — set when all agents stall
-    let wakeIdle: (() => void) | null = null;
-
-    let steps = 0;
-    let totalToolCalls = 0;
-    const counters = {
-      warmPrefillCalls: 0,
-      warmPrefillBranches: 0,
-      stalledTicks: 0,
-      maxConcurrentTools: 0,
-      idleTicks: 0,
-    };
-
-    function* dispatchTool(agent: AgentInternal, tc: ParsedToolCall): Operation<void> {
-      let toolArgs: Record<string, unknown>;
-      try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; }
-      const callId = tc.id || `call_${agent.toolCallCount}`;
-
-      agent.toolCallCount++;
-      totalToolCalls++;
-      agent.turns++;
-      agent.state = 'awaiting_tool';
-
-      yield* events.send({ type: 'agent:tool_call', agentId: agent.id, tool: tc.name, args: tc.arguments });
-
-      const tool = tools.get(tc.name);
-      pendingToolCount++;
-      counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingToolCount);
-
-      // scope.run() — eager start, child of agent pool scope, cancelled if scope exits.
-      // spawn() is lazy (Operation), but we're in a generator — scope.run() is eager.
-      scope.run(function*() {
-        try {
-          const toolContext = {
-            onProgress: (p: { filled: number; total: number }) => {
-              // Signal bridge — onProgress is an external callback, Signal.send() is correct here.
-              progressBridge.send({ type: 'agent:tool_progress', agentId: agent.id, tool: tc.name, filled: p.filled, total: p.total });
-            },
-          };
-
-          const result: unknown = yield* call(() =>
-            tool ? tool.execute(toolArgs, toolContext) : Promise.resolve({ error: `Unknown tool: ${tc.name}` })
-          );
-          const resultStr = JSON.stringify(result);
-          yield* events.send({ type: 'agent:tool_result', agentId: agent.id, tool: tc.name, result: resultStr });
-
-          const prefillTokens = buildToolResultDelta(ctx, resultStr, callId);
-          settledBuffer.push({ agentId: agent.id, prefillTokens, toolName: tc.name });
-        } catch (err) {
-          agent.state = 'done';
-          agent.findings = `Tool error: ${(err as Error).message}`;
-        } finally {
-          pendingToolCount--;
-          if (wakeIdle) { wakeIdle(); wakeIdle = null; }
-        }
-      });
-    }
-
-    // ── Three-phase tick loop ────────────────────────────────
-    for (;;) {
-      // -- Phase 1: PRODUCE -- sample from active agents
-      const pressure = new ContextPressure(ctx, pressureOpts);
-
-      if (trace && (pressure.critical || pressure.headroom < 0)) {
-        const p = ctx._storeKvPressure();
-        try { process.stderr.write(`[PRODUCE] ${pressure.critical ? 'CRITICAL' : 'SOFT_LIMIT'} remaining=${p.remaining} headroom=${pressure.headroom} cellsUsed=${p.cellsUsed} nCtx=${p.nCtx}\n`); } catch {}
-      }
-
-      const entries: [Branch, number][] = [];
-      for (const a of agents) {
-        if (a.state !== 'generating') continue;
-
-        if (pressure.critical) {
-          a.state = 'done';
-          yield* events.send({ type: 'agent:done', agentId: a.id });
-          continue;
-        }
-
-        const { token, text, isStop } = a.branch.produceSync();
-        if (isStop) {
-          const parsed = ctx.parseChatOutput(a.rawOutput, a.fmt.format, {
-            reasoningFormat: a.fmt.reasoningFormat,
-            thinkingForcedOpen: a.fmt.thinkingForcedOpen,
-            parser: a.fmt.parser,
-          });
-
-          const tc = parsed.toolCalls[0];
-          if (!tc) {
-            a.state = 'done';
-            if (!a.findings && a.toolCallCount > 0 && parsed.content) {
-              a.findings = parsed.content;
-              yield* events.send({ type: 'agent:report', agentId: a.id, findings: a.findings });
-            }
-            yield* events.send({ type: 'agent:done', agentId: a.id });
-            continue;
-          }
-
-          // Over budget: deny non-terminal tool calls when the agent has
-          // exceeded maxTurns or KV headroom is negative. Terminal tools
-          // (e.g. `report()`) are always allowed through — an agent that has
-          // done research and wants to report should never be blocked by
-          // pressure, since the report call itself consumes minimal KV.
-          const overBudget = (a.turns >= maxTurns || pressure.headroom < 0)
-            && (!terminalTool || tc.name !== terminalTool);
-
-          if (overBudget) {
-            a.state = 'done';
-            yield* events.send({ type: 'agent:done', agentId: a.id });
-            continue;
-          }
-
-          // Terminal tool — intercept, extract findings, mark done.
-          if (terminalTool && tc.name === terminalTool) {
-            if (a.toolCallCount === 0 && hasNonTerminalTools) {
-              const callId = tc.id || `call_${a.toolCallCount}`;
-              const errorMsg = 'You must perform research before reporting. Call at least one tool first.';
-              a.turns++;
-              a.state = 'awaiting_tool';
-              pendingToolCount++;
-              scope.run(function*() {
-                try {
-                  const prefillTokens = buildToolResultDelta(ctx, JSON.stringify({ error: errorMsg }), callId);
-                  settledBuffer.push({ agentId: a.id, prefillTokens, toolName: tc.name });
-                } finally {
-                  pendingToolCount--;
-                  if (wakeIdle) { wakeIdle(); wakeIdle = null; }
-                }
-              });
-              a.rawOutput = '';
-              continue;
-            }
-            try { a.findings = JSON.parse(tc.arguments).findings; } catch { a.findings = tc.arguments; }
-            a.state = 'done';
-            a.toolCallCount++;
-            totalToolCalls++;
-            yield* events.send({ type: 'agent:tool_call', agentId: a.id, tool: tc.name, args: tc.arguments });
-            yield* events.send({ type: 'agent:report', agentId: a.id, findings: a.findings! });
-            yield* events.send({ type: 'agent:done', agentId: a.id });
-            continue;
-          }
-
-          // Fire-and-forget — dispatch tool without blocking the decode loop
-          yield* dispatchTool(a, tc);
-          a.rawOutput = '';
-          continue;
-        }
-
-        entries.push([a.branch, token]);
-        a.rawOutput += text;
-        a.tokenCount++;
-        if (trace) {
-          const entropy = a.branch.modelEntropy();
-          const surprisal = a.branch.modelSurprisal(token);
-          a.traceBuffer.push({ text, entropy, surprisal });
-          yield* events.send({
-            type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount,
-            entropy, surprisal,
-          });
-        } else {
-          yield* events.send({ type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount });
-        }
-      }
-
-      // -- Phase 2: COMMIT -- batch-decode produced tokens
-      if (entries.length > 0) {
-        yield* call(() => store.commit(entries));
-        steps++;
-      }
-
-      // -- Phase 3: SETTLE -- drain settled tool buffer, batch prefill
-      const settled = settledBuffer.splice(0);
-      if (settled.length > 0) {
-        // Fresh snapshot — Phase 2 commits may have advanced positions
-        const settlePressure = new ContextPressure(ctx, pressureOpts);
-        let headroom = settlePressure.headroom;
-
-        if (trace) {
-          const p = ctx._storeKvPressure();
-          const items = settled.map(s => `${s.toolName}:${s.prefillTokens.length}`).join(', ');
-          try { process.stderr.write(`[SETTLE] remaining=${p.remaining} headroom=${headroom} cellsUsed=${p.cellsUsed} nCtx=${p.nCtx} items=[${items}]\n`); } catch {}
-        }
-
-        const prefillPairs: [Branch, number[]][] = [];
-        const settledAgents: AgentInternal[] = [];
-
-        for (const item of settled) {
-          const a = agentById.get(item.agentId);
-          if (!a || a.state === 'done') continue;
-
-          if (item.prefillTokens.length > headroom) {
-            if (trace) {
-              try { process.stderr.write(`[SETTLE] REJECT ${item.toolName}:${item.prefillTokens.length} > headroom=${headroom}\n`); } catch {}
-            }
-            a.state = 'done';
-            yield* events.send({ type: 'agent:done', agentId: a.id });
-            continue;
-          }
-
-          prefillPairs.push([a.branch, item.prefillTokens]);
-          settledAgents.push(a);
-          headroom -= item.prefillTokens.length;
-        }
-
-        if (prefillPairs.length > 0) {
-          if (trace) {
-            const totalPrefill = prefillPairs.reduce((s, [, t]) => s + t.length, 0);
-            try { process.stderr.write(`[SETTLE] PREFILL ${prefillPairs.length} branches, ${totalPrefill} tokens, headroom_after=${headroom}\n`); } catch {}
-          }
-          yield* call(() => store.prefill(prefillPairs));
-          counters.warmPrefillCalls++;
-          counters.warmPrefillBranches += prefillPairs.length;
-
-          // Only NOW transition state + reset grammar
-          for (const a of settledAgents) {
-            a.state = 'generating';
-            a.rawOutput = '';
-            applyLazyGrammar(a);
-          }
-        }
-      }
-
-      // -- Termination + idle yield
-      const allDone = agents.every(a => a.state === 'done') && pendingToolCount === 0;
-      if (allDone) break;
-
-      if (entries.length === 0 && pendingToolCount > 0) {
-        counters.stalledTicks++;
-        if (settled.length === 0) {
-          // Nothing produced, nothing settled — yield until a tool resolves
-          yield* action<void>((resolve) => {
-            wakeIdle = resolve;
-            return () => { wakeIdle = null; };
-          });
-          counters.idleTicks++;
-        }
-      }
-    }
-
-    // ── Provide result — suspends, branches stay alive ───────
-    // Branch cleanup is handled by each branch's ensure() from setupAgent —
-    // when this resource's scope exits, all ensure() callbacks fire.
-    const result: AgentPoolResult = {
-      agents: agents.map(a => ({
-          agentId: a.id,
-          parentAgentId: a.parentId,
-          branch: a.branch,
-          findings: a.findings,
-          toolCallCount: a.toolCallCount,
-          tokenCount: a.tokenCount,
-          ppl: a.branch.perplexity,
-          samplingPpl: a.branch.samplingPerplexity,
-          trace: trace ? a.traceBuffer : undefined,
-        })),
-      totalTokens: agents.reduce((s, a) => s + a.tokenCount, 0),
-      totalToolCalls,
-      steps,
-      counters,
-    };
-
-    yield* provide(result);
-  });
-}
diff --git a/src/agents/context.ts b/src/agents/context.ts
deleted file mode 100644
index 3fc593a..0000000
--- a/src/agents/context.ts
+++ /dev/null
@@ -1,36 +0,0 @@
-import { createContext } from 'effection';
-import type { SessionContext } from '../types';
-import type { BranchStore } from '../BranchStore';
-import type { Channel } from 'effection';
-import type { AgentEvent } from './types';
-
-/**
- * Effection context holding the active {@link SessionContext}
- *
- * Set by {@link initAgents} in the caller's scope. All agent operations
- * (`generate`, `diverge`, `useAgentPool`, `withSharedRoot`) read from this
- * context via `yield* Ctx.expect()`.
- *
- * @category Agents
- */
-export const Ctx = createContext<SessionContext>('lloyal.ctx');
-
-/**
- * Effection context holding the active {@link BranchStore}
- *
- * Set by {@link initAgents}. Used by {@link diverge} and {@link useAgentPool}
- * for batched commit/prefill across multiple branches.
- *
- * @category Agents
- */
-export const Store = createContext<BranchStore>('lloyal.store');
-
-/**
- * Effection context holding the agent event channel
- *
- * Set by {@link initAgents}. {@link useAgentPool} emits {@link AgentEvent}
- * values through this channel via `yield* channel.send()`.
- *
- * @category Agents
- */
-export const Events = createContext<Channel<AgentEvent, void>>('lloyal.events');
diff --git a/src/agents/deltas.ts b/src/agents/deltas.ts
deleted file mode 100644
index baf12d0..0000000
--- a/src/agents/deltas.ts
+++ /dev/null
@@ -1,63 +0,0 @@
-import type { SessionContext } from '../types';
-
-/**
- * Build a token delta for a user turn
- *
- * Composes `getTurnSeparator()` + `formatChatSync()` + `tokenizeSync()` into a
- * single token array suitable for `branch.prefill()`. Usable with any
- * branch — not tied to {@link Session}'s trunk.
- *
- * This is the canonical way to build a user-turn delta for warm prefill
- * in multi-turn conversations.
- *
- * @param ctx - Active session context
- * @param content - User message content
- * @param opts - Optional tools JSON for tool-aware formatting
- * @returns Token array ready for `branch.prefill()`
- *
- * @category Agents
- */
-export function buildUserDelta(
-  ctx: SessionContext,
-  content: string,
-  opts: { tools?: string } = {}
-): number[] {
-  const sep = ctx.getTurnSeparator();
-  const fmtOpts = opts.tools ? { tools: opts.tools } : {};
-  const { prompt } = ctx.formatChatSync(
-    JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]),
-    fmtOpts
-  );
-  const delta = ctx.tokenizeSync(prompt, false);
-  return [...sep, ...delta];
-}
-
-/**
- * Build a token delta for a tool result turn
- *
- * Composes `getTurnSeparator()` + `formatChatSync()` + `tokenizeSync()` into a
- * single token array suitable for `branch.prefill()`. Used by
- * {@link useAgentPool} to inject tool results back into agent context.
- *
- * @param ctx - Active session context
- * @param resultStr - JSON-serialized tool result
- * @param callId - Tool call identifier from the model's parsed output
- * @returns Token array ready for `branch.prefill()`
- *
- * @category Agents
- */
-export function buildToolResultDelta(
-  ctx: SessionContext,
-  resultStr: string,
-  callId: string
-): number[] {
-  const sep = ctx.getTurnSeparator();
-  const { prompt } = ctx.formatChatSync(
-    JSON.stringify([
-      { role: 'system', content: '' },
-      { role: 'tool', content: resultStr, tool_call_id: callId },
-    ])
-  );
-  const delta = ctx.tokenizeSync(prompt, false);
-  return [...sep, ...delta];
-}
diff --git a/src/agents/diverge.ts b/src/agents/diverge.ts
deleted file mode 100644
index ed1be3e..0000000
--- a/src/agents/diverge.ts
+++ /dev/null
@@ -1,145 +0,0 @@
-import { call, ensure } from 'effection';
-import type { Operation } from 'effection';
-import { Branch } from '../Branch';
-import { Ctx, Store } from './context';
-import { ContextPressure } from './agent-pool';
-import type { DivergeOptions, DivergeResult, DivergeAttempt } from './types';
-
-/**
- * Multi-branch perplexity selection as an Effection operation
- *
- * Forks N branches from a parent (or a fresh root), generates to EOG via
- * batched {@link BranchStore.commit}, then selects the lowest-perplexity
- * attempt. Loser branches are pruned; the caller receives the best branch
- * still alive.
- *
- * When `opts.parent` is provided, the parent branch is NOT pruned — it's
- * owned by the calling scope. Only the forked attempt branches (losers)
- * are pruned. The caller owns the winning branch's lifecycle, typically
- * via {@link Session.promote}.
- *
- * Cleanup is structured: each forked branch registers an `ensure()` callback
- * that prunes it on scope exit. Winners are marked disposed-safe (already
- * pruned or ownership transferred) before the ensure fires.
- *
- * @param opts - Diverge options specifying parent or prompt, attempt count,
- *   and sampling parameters
- * @returns Result containing the best branch, all attempt outputs, and
- *   aggregate statistics
- *
- * @example Verify with perplexity selection
- * ```typescript
- * const verified = yield* diverge({
- *   prompt: verifyPrompt,
- *   attempts: 3,
- *   params: { temperature: 0.7 },
- * });
- * // verified.best is the lowest-perplexity branch, still alive
- * yield* call(() => session.promote(verified.best));
- * ```
- *
- * @category Agents
- */
-export function* diverge(opts: DivergeOptions): Operation<DivergeResult> {
-  const ctx = yield* Ctx.expect();
-  const store = yield* Store.expect();
-
-  // If parent provided, fork from it. Otherwise create a fresh root.
-  let root: Branch;
-  let ownRoot = false;
-  let prefixLength: number;
-
-  if (opts.parent) {
-    root = opts.parent;
-    prefixLength = root.position;
-  } else {
-    if (!opts.prompt) throw new Error('diverge() requires either opts.parent or opts.prompt');
-    const tokens = ctx.tokenizeSync(opts.prompt);
-    root = Branch.create(ctx, 0, opts.params ?? {});
-    yield* call(() => root.prefill(tokens));
-    prefixLength = tokens.length;
-    ownRoot = true;
-    // If we created the root, ensure it's cleaned up
-    yield* ensure(() => {
-      if (ownRoot && !root.disposed) {
-        try { root.pruneSync(); } catch { /* children may remain */ }
-      }
-    });
-  }
-
-  const live: { branch: Branch; output: string; done: boolean; tokenCount: number; ppl: number }[] = [];
-
-  for (let i = 0; i < opts.attempts; i++) {
-    const branch = root.forkSync();
-    // Each forked branch gets its own ensure() for structured cleanup
-    yield* ensure(() => {
-      if (!branch.disposed) {
-        try { branch.pruneSync(); } catch { /* already gone */ }
-      }
-    });
-    branch.reseedSampler(2000 + i);
-    live.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity });
-  }
-
-  // Batched generation — produceSync/commit loop
-  let steps = 0;
-  for (;;) {
-    const pressure = new ContextPressure(ctx);
-    if (pressure.critical) {
-      for (const a of live) { if (!a.done) a.done = true; }
-      break;
-    }
-
-    const entries: [Branch, number][] = [];
-    for (const a of live) {
-      if (a.done) continue;
-      const { token, text, isStop } = a.branch.produceSync();
-      if (isStop) {
-        const p = a.branch.perplexity;
-        a.ppl = Number.isFinite(p) ? p : Infinity;
-        a.done = true;
-        continue;
-      }
-      entries.push([a.branch, token]);
-      a.output += text;
-      a.tokenCount++;
-    }
-    if (entries.length === 0) break;
-    yield* call(() => store.commit(entries));
-    steps++;
-  }
-
-  // Select by lowest perplexity (most coherent)
-  const bestIdx = live.reduce((bi, a, i) => a.ppl <= live[bi].ppl ? i : bi, 0);
-
-  // Prune losers now — winner stays alive as caller's result.
-  // ensure() will be a no-op for these since they're already disposed.
-  for (let i = 0; i < live.length; i++) {
-    if (i !== bestIdx && !live[i].branch.disposed) {
-      live[i].branch.pruneSync();
-    }
-  }
-
-  // If we created root and it's no longer needed, prune it now.
-  // (ensure() will be a no-op since it checks disposed)
-  if (ownRoot && !root.disposed && root.children.length === 0) {
-    root.pruneSync();
-  }
-
-  const totalTokens = live.reduce((s, a) => s + a.tokenCount, 0);
-  const attempts: DivergeAttempt[] = live.map(a => ({
-    branch: a.branch,
-    output: a.output,
-    tokenCount: a.tokenCount,
-    ppl: a.ppl,
-  }));
-
-  return {
-    best: live[bestIdx].branch,
-    bestOutput: live[bestIdx].output,
-    attempts,
-    totalTokens,
-    steps,
-    prefixLength,
-  };
-}
diff --git a/src/agents/generate.ts b/src/agents/generate.ts
deleted file mode 100644
index 4a37c66..0000000
--- a/src/agents/generate.ts
+++ /dev/null
@@ -1,59 +0,0 @@
-import { call } from 'effection';
-import type { Operation } from 'effection';
-import { Branch } from '../Branch';
-import { Ctx } from './context';
-import type { GenerateOptions, GenerateResult } from './types';
-
-/**
- * Single-branch grammar-constrained generation as an Effection operation
- *
- * Creates a fresh branch at position 0, prefills the prompt, generates
- * to EOG, and prunes the branch. Uses {@link Branch}'s async iterator
- * — single-branch generation doesn't need batched commit.
- *
- * The branch is always cleaned up via try/finally, even on error or
- * scope cancellation.
- *
- * @param opts - Generation options (prompt, grammar, params, parse)
- * @returns Generated text, token count, and optionally parsed result
- *
- * @example Grammar-constrained JSON generation
- * ```typescript
- * const plan = yield* generate({
- *   prompt: planPrompt,
- *   grammar: planGrammar,
- *   params: { temperature: 0.3 },
- *   parse: output => JSON.parse(output),
- * });
- * console.log(plan.parsed); // typed result from parse()
- * ```
- *
- * @category Agents
- */
-export function* generate<T = unknown>(opts: GenerateOptions): Operation<GenerateResult<T>> {
-  const ctx = yield* Ctx.expect();
-
-  const samplerParams = opts.params ?? {};
-  const branch = Branch.create(ctx, 0, samplerParams, undefined, opts.grammar);
-
-  try {
-    const tokens = ctx.tokenizeSync(opts.prompt);
-    yield* call(() => branch.prefill(tokens));
-
-    // Consume async iterator inside call() — generators can't use for-await
-    const { output, tokenCount } = yield* call(async () => {
-      let output = '';
-      let tokenCount = 0;
-      for await (const { text } of branch) {
-        output += text;
-        tokenCount++;
-      }
-      return { output, tokenCount };
-    });
-
-    const parsed = opts.parse ? opts.parse(output) as T : undefined;
-    return { output, tokenCount, parsed };
-  } finally {
-    if (!branch.disposed) branch.pruneSync();
-  }
-}
diff --git a/src/agents/index.ts b/src/agents/index.ts
deleted file mode 100644
index 6d5b889..0000000
--- a/src/agents/index.ts
+++ /dev/null
@@ -1,32 +0,0 @@
-export { Ctx, Store, Events } from './context';
-export { Tool } from './Tool';
-export { buildUserDelta, buildToolResultDelta } from './deltas';
-export { generate } from './generate';
-export { diverge } from './diverge';
-export { useAgentPool, ContextPressure } from './agent-pool';
-export { runAgents } from './run-agents';
-export { createToolkit } from './toolkit';
-export { initAgents } from './init';
-export { withSharedRoot } from './shared-root';
-
-export type { Toolkit } from './toolkit';
-export type { AgentHandle } from './init';
-export type { SharedRootOptions } from './shared-root';
-
-export type {
-  TraceToken,
-  JsonSchema,
-  ToolSchema,
-  ToolContext,
-  PressureThresholds,
-  AgentTaskSpec,
-  AgentPoolOptions,
-  AgentResult,
-  AgentPoolResult,
-  GenerateOptions,
-  GenerateResult,
-  DivergeOptions,
-  DivergeAttempt,
-  DivergeResult,
-  AgentEvent,
-} from './types';
diff --git a/src/agents/init.ts b/src/agents/init.ts
deleted file mode 100644
index d7ebbd6..0000000
--- a/src/agents/init.ts
+++ /dev/null
@@ -1,78 +0,0 @@
-import { ensure, createChannel, call } from 'effection';
-import type { Operation, Channel } from 'effection';
-import { BranchStore } from '../BranchStore';
-import { Session } from '../Session';
-import type { SessionContext } from '../types';
-import { Ctx, Store, Events } from './context';
-import type { AgentEvent } from './types';
-
-/**
- * Handle returned by {@link initAgents} containing all agent resources
- *
- * @category Agents
- */
-export interface AgentHandle<E = AgentEvent> {
-  /** The session context (model, tokenizer, KV cache) */
-  ctx: SessionContext;
-  /** Branch store for batched commit/prefill across branches */
-  store: BranchStore;
-  /** Session managing conversation trunk and branch lifecycle */
-  session: Session;
-  /** Channel for subscribing to agent events */
-  events: Channel<E, void>;
-}
-
-/**
- * Bootstrap the agent infrastructure and register structured cleanup
- *
- * Creates {@link BranchStore}, {@link Session}, and an event channel, then
- * sets all three Effection contexts ({@link Ctx}, {@link Store},
- * {@link Events}) in the caller's scope. Cleanup runs on scope exit
- * (Ctrl-C, error, normal completion) via `ensure()`.
- *
- * Context values are set in the caller's scope — visible to all subsequent
- * operations. This is why `initAgents` uses `ensure()` rather than
- * `resource()`: a resource creates a child scope where `Ctx.set()` would
- * be invisible to sibling operations.
- *
- * The caller creates the {@link SessionContext} (model path, nCtx, KV types
- * are harness-specific decisions) and passes it in.
- *
- * @param ctx - Session context created via `createContext()`
- * @returns Agent handle with session, store, and event channel
- *
- * @example Canonical bootstrap
- * ```typescript
- * main(function*() {
- *   const ctx = yield* call(() => createContext({
- *     modelPath, nCtx: 16384,
- *     nSeqMax: 4, typeK: 'q4_0', typeV: 'q4_0',
- *   }));
- *
- *   const { session, events } = yield* initAgents(ctx);
- *   // Ctx, Store, Events are now set — generate(), diverge(),
- *   // useAgentPool() will find them automatically.
- *   // Cleanup runs on scope exit.
- * });
- * ```
- *
- * @category Agents
- */
-export function* initAgents<E = AgentEvent>(
-  ctx: SessionContext,
-): Operation<AgentHandle<E>> {
-  const store = new BranchStore(ctx);
-  const session = new Session({ ctx, store });
-  const events: Channel<E, void> = createChannel<E, void>();
-
-  yield* Ctx.set(ctx);
-  yield* Store.set(store);
-  yield* Events.set(events as unknown as Channel<AgentEvent, void>);
-
-  yield* ensure(function*() {
-    yield* call(() => session.dispose());
-    ctx.dispose();
-  });
-
-  return { ctx, store, session, events };
-}
diff --git a/src/agents/run-agents.ts b/src/agents/run-agents.ts
deleted file mode 100644
index b2c71dc..0000000
--- a/src/agents/run-agents.ts
+++ /dev/null
@@ -1,45 +0,0 @@
-import { scoped } from 'effection';
-import type { Operation } from 'effection';
-import { useAgentPool } from './agent-pool';
-import type { AgentPoolOptions, AgentPoolResult } from './types';
-
-/**
- * Run an agent pool with automatic branch cleanup on return
- *
- * Wraps {@link useAgentPool} in `scoped()` — agent branches are pruned
- * when the scope exits, before this operation returns. Use this when you
- * don't need to fork from agent branches after the pool completes.
- *
- * For multi-level tree topology (forking from agent branches for
- * verification or follow-up), use {@link useAgentPool} directly within
- * your own scope management.
- *
- * @param opts - Pool configuration: tasks, tools, sampling params, max turns
- * @returns Agent pool result (branches already pruned)
- *
- * @example Research agents with shared root
- * ```typescript
- * const pool = yield* withSharedRoot(
- *   { systemPrompt: RESEARCH_PROMPT, tools: toolsJson },
- *   function*(root, prefixLen) {
- *     return yield* runAgents({
- *       tasks: questions.map(q => ({
- *         systemPrompt: RESEARCH_PROMPT,
- *         content: q,
- *         tools: toolsJson,
- *         parent: root,
- *       })),
- *       tools: toolMap,
- *       maxTurns: 6,
- *     });
- *   },
- * );
- * ```
- *
- * @category Agents
- */
-export function* runAgents(opts: AgentPoolOptions): Operation<AgentPoolResult> {
-  return yield* scoped(function*() {
-    return yield* useAgentPool(opts);
-  });
-}
diff --git a/src/agents/shared-root.ts b/src/agents/shared-root.ts
deleted file mode 100644
index 101958a..0000000
--- a/src/agents/shared-root.ts
+++ /dev/null
@@ -1,80 +0,0 @@
-import { call } from 'effection';
-import type { Operation } from 'effection';
-import { Branch } from '../Branch';
-import type { SessionContext } from '../types';
-import { Ctx } from './context';
-import type { SamplingParams } from './types';
-
-/**
- * Configuration for {@link withSharedRoot}
- *
- * @category Agents
- */
-export interface SharedRootOptions {
-  /** System prompt to tokenize and prefill into the shared root */
-  systemPrompt: string;
-  /** JSON-serialized tool schemas for tool-aware prompt formatting */
-  tools?: string;
-  /** Sampling parameters for the root branch */
-  params?: SamplingParams;
-}
-
-/**
- * Scoped shared root branch with guaranteed cleanup
- *
- * Creates a root branch, prefills the system prompt, and passes it to
- * the body function. The root is pruned via try/finally when the body
- * returns or throws, regardless of whether children still exist.
- *
- * Use this for the cold-path pattern where multiple agents share a
- * tokenized system prompt prefix. The `sharedPrefixLength` passed to
- * the body enables KV savings calculation.
- *
- * @param opts - System prompt, tools, and sampling parameters
- * @param body - Operation that receives the root branch and prefix length.
- *   Typically calls {@link runAgents} or {@link useAgentPool} inside.
- * @returns The body's return value
- *
- * @example Cold-path research with shared prefix
- * ```typescript
- * const { result, prefixLen } = yield* withSharedRoot(
- *   { systemPrompt: RESEARCH_PROMPT, tools: toolsJson },
- *   function*(root, prefixLen) {
- *     const result = yield* runAgents({
- *       tasks: questions.map(q => ({
- *         systemPrompt: RESEARCH_PROMPT,
- *         content: q,
- *         tools: toolsJson,
- *         parent: root,
- *       })),
- *       tools: toolMap,
- *     });
- *     return { result, prefixLen };
- *   },
- * );
- * ```
- *
- * @category Agents
- */
-export function* withSharedRoot<T>(
-  opts: SharedRootOptions,
-  body: (root: Branch, sharedPrefixLength: number) => Operation<T>,
-): Operation<T> {
-  const ctx: SessionContext = yield* Ctx.expect();
-
-  const messages = [{ role: 'system', content: opts.systemPrompt }];
-  const fmtOpts = opts.tools
-    ? { tools: opts.tools, addGenerationPrompt: false }
-    : { addGenerationPrompt: false };
-  const fmt = ctx.formatChatSync(JSON.stringify(messages), fmtOpts);
-  const sharedTokens = ctx.tokenizeSync(fmt.prompt);
-
-  const root = Branch.create(ctx, 0, opts.params ?? { temperature: 0.5 });
-  yield* call(() => root.prefill(sharedTokens));
-
-  try {
-    return yield* body(root, sharedTokens.length);
-  } finally {
-    if (!root.disposed) root.pruneSubtreeSync();
-  }
-}
diff --git a/src/agents/toolkit.ts b/src/agents/toolkit.ts
deleted file mode 100644
index 86bcf0c..0000000
--- a/src/agents/toolkit.ts
+++ /dev/null
@@ -1,44 +0,0 @@
-import type { Tool } from './Tool';
-
-/**
- * Aggregated tool registry for agent pool consumption
- *
- * Contains the `toolMap` for dispatch and `toolsJson` for prompt
- * formatting. Created by {@link createToolkit}.
- *
- * @category Agents
- */
-export interface Toolkit {
-  /** Name-to-instance map used by {@link useAgentPool} for tool dispatch */
-  toolMap: Map<string, Tool>;
-  /** JSON-serialized tool schemas passed to `formatChat()` via task specs */
-  toolsJson: string;
-}
-
-/**
- * Aggregate an array of {@link Tool} instances into a toolkit
- *
- * Builds both the dispatch map and the JSON schema string from the
- * tool array. Pass the result directly to {@link AgentPoolOptions}
- * and {@link AgentTaskSpec}.
- *
- * @param tools - Tool instances to aggregate
- * @returns Toolkit with `toolMap` and `toolsJson`
- *
- * @example
- * ```typescript
- * const { toolMap, toolsJson } = createToolkit([
- *   new SearchTool(chunks, reranker),
- *   new ReadFileTool(resources),
- *   new GrepTool(resources),
- * ]);
- * ```
- *
- * @category Agents
- */
-export function createToolkit(tools: Tool[]): Toolkit {
-  return {
-    toolMap: new Map(tools.map(t => [t.name, t])),
-    toolsJson: JSON.stringify(tools.map(t => t.schema)),
-  };
-}
diff --git a/src/agents/types.ts b/src/agents/types.ts
deleted file mode 100644
index df8c468..0000000
--- a/src/agents/types.ts
+++ /dev/null
@@ -1,378 +0,0 @@
-import type { Branch } from '../Branch';
-import type { SessionContext } from '../types';
-
-// ── Tool base class types ──────────────────────────────────────
-
-/**
- * JSON Schema definition for tool parameter validation
- *
- * Describes the shape of arguments a {@link Tool} accepts. Passed to the
- * model via `formatChat()` so it can generate valid tool-call arguments.
- *
- * @category Agents
- */
-export interface JsonSchema {
-  /** JSON Schema type (e.g. `"object"`, `"string"`, `"array"`) */
-  type: string;
-  /** Property definitions when `type` is `"object"` */
-  properties?: Record<string, unknown>;
-  /** Required property names when `type` is `"object"` */
-  required?: string[];
-  /** Additional schema constraints (minItems, enum, etc.) */
-  [key: string]: unknown;
-}
-
-/**
- * OpenAI-compatible function tool schema
- *
- * The wrapper format expected by `formatChat()` when passing tools to the
- * model. {@link Tool.schema} generates this automatically from the tool's
- * `name`, `description`, and `parameters`.
- *
- * @category Agents
- */
-export interface ToolSchema {
-  /** Always `"function"` for function-calling tools */
-  type: 'function';
-  /** Function definition containing name, description, and parameter schema */
-  function: {
-    /** Tool name — used as the function identifier in tool calls */
-    name: string;
-    /** Human-readable description shown to the model */
-    description: string;
-    /** JSON Schema describing the tool's arguments */
-    parameters: JsonSchema;
-  };
-}
-
-/**
- * Execution context passed to {@link Tool.execute}
- *
- * Provides callbacks for reporting progress during long-running tool
- * operations (e.g. reranker scoring chunks).
- *
- * @category Agents
- */
-export interface ToolContext {
-  /** Progress callback for long-running operations */
-  onProgress?: (p: { filled: number; total: number }) => void;
-}
-
-// ── Trace types ───────────────────────────────────────────────
-
-/**
- * Per-token trace entry captured when {@link AgentPoolOptions.trace} is true
- *
- * Each entry corresponds to one sampled token and the distribution state
- * at the moment it was drawn. Available on {@link AgentResult.trace} after
- * pool completion.
- *
- * @category Agents
- */
-export interface TraceToken {
-  /** Decoded text for this token */
-  text: string;
-  /** Shannon entropy of the full vocabulary distribution (bits, base-2) */
-  entropy: number;
-  /** Surprisal of the chosen token: -log2(p) */
-  surprisal: number;
-}
-
-// ── Agent pool types ───────────────────────────────────────────
-
-/**
- * Task specification for a single agent in {@link useAgentPool}
- *
- * Each task defines the agent's system prompt, user content, available
- * tools, and parent branch to fork from. The parent branch determines
- * the agent's KV prefix — fork from a shared root to amortize system
- * prompt tokenization across agents.
- *
- * @category Agents
- */
-export interface AgentTaskSpec {
-  /** System prompt defining the agent's role and behavior */
-  systemPrompt: string;
-  /** User message content — the agent's specific sub-question or task */
-  content: string;
-  /** JSON-serialized tool schemas (from {@link createToolkit}) */
-  tools?: string;
-  /** PRNG seed for sampler diversity — pass different seeds per agent */
-  seed?: number;
-  /** Parent branch to fork from (required by {@link useAgentPool}) */
-  parent?: Branch;
-}
-
-/**
- * Sampling parameters for generation
- *
- * Controls the sampler chain applied during token generation. Passed to
- * {@link Branch.create}, {@link generate}, {@link diverge}, and agent
- * pool tasks.
- *
- * @category Agents
- */
-export interface SamplingParams {
-  /** Temperature for softmax scaling (0 = greedy, higher = more random) */
-  temperature?: number;
-  /** Nucleus sampling threshold — cumulative probability cutoff */
-  topP?: number;
-  /** Top-K sampling — keep only the K most likely tokens */
-  topK?: number;
-  /** Minimum probability threshold relative to the most likely token */
-  minP?: number;
-  /** Additional sampler-specific parameters */
-  [key: string]: unknown;
-}
-
-/**
- * KV pressure thresholds controlling agent shutdown under context exhaustion
- *
- * Two thresholds govern what happens as remaining KV shrinks:
- *
- * **softLimit** (default 1024) — remaining KV floor for new work.
- * Enforced at three points:
- * - **SETTLE**: tool results that would cross this floor are rejected and
- *   the agent is marked done. This is the primary enforcement point — tool
- *   results (search results, etc.) are the largest KV consumers.
- * - **PRODUCE (stop-token boundary)**: agents that want a non-terminal tool
- *   call are hard-cut. Terminal tools (e.g. `report()`) still pass.
- * - **INIT prefill**: agents that don't fit above this floor are dropped.
- *
- * Set to account for downstream pool needs (reporters, verification).
- *
- * **hardLimit** (default 128) — crash-prevention floor.
- * When remaining drops below this, agents are killed immediately before
- * `produceSync()`. Prevents `llama_decode` "no memory slot" failures.
- * Pure safety net — should never be the primary budget control.
- *
- * @category Agents
- */
-export interface PressureThresholds {
-  /**
-   * Remaining KV floor for new work (tokens). When remaining drops below
-   * this, SETTLE rejects tool results, PRODUCE hard-cuts non-terminal tool
-   * calls, and INIT drops agents that don't fit.
-   *
-   * Set to account for downstream pool needs (reporters, verification).
-   * Default: 1024
-   */
-  softLimit?: number;
-  /**
-   * Crash-prevention floor (tokens). When remaining drops below this,
-   * agents are killed immediately before `produceSync()`. Prevents
-   * `llama_decode` "no memory slot for batch" failures.
-   * Default: 128
-   */
-  hardLimit?: number;
-}
-
-/**
- * Configuration for {@link useAgentPool} and {@link runAgents}
- *
- * @category Agents
- */
-export interface AgentPoolOptions {
-  /** Agent task specifications — one per concurrent agent */
-  tasks: AgentTaskSpec[];
-  /**
-   * Tool registry mapping tool names to {@link Tool} instances.
-   *
-   * This is the **execution registry** — it determines which tools can be
-   * dispatched at runtime. It is distinct from the per-task `task.tools`
-   * JSON schema that tells the model which tools are available.
-   *
-   * The registry also controls {@link AgentPoolOptions.terminalTool | terminalTool}
-   * gating: if the registry contains only the terminal tool, agents are
-   * allowed to call it as their first action (e.g. reporter sub-agents).
-   * If the registry contains other tools, the first call must be
-   * non-terminal to prevent agents from reporting without doing work.
-   */
-  tools: Map<string, import('./Tool').Tool>;
-  /** Sampling parameters applied to all agents */
-  params?: SamplingParams;
-  /** Maximum tool-call turns per agent before forced termination */
-  maxTurns?: number;
-  /** Tool name that signals agent completion. When the model calls this tool,
-   *  findings are extracted from arguments and the agent is marked done.
-   *  The tool is intercepted — never dispatched to execute(). If omitted,
-   *  agents complete only via stop token or hard-cut. */
-  terminalTool?: string;
-  /** Enable per-token entropy/surprisal on `agent:produce` events */
-  trace?: boolean;
-  /** KV pressure thresholds — tune per pool. Reporter pools typically use
-   *  lower thresholds than research pools since they complete in a single
-   *  terminal tool call. See {@link PressureThresholds} for tuning guidance. */
-  pressure?: PressureThresholds;
-}
-
-/**
- * Result for a single completed agent
- *
- * @category Agents
- */
-export interface AgentResult {
-  /** Stable agent identifier (branch handle at creation time) */
-  agentId: number;
-  /** Parent branch handle — shared root for top-level agents, parent agentId for sub-agents */
-  parentAgentId: number;
-  /** The agent's branch — still alive when returned from {@link useAgentPool} */
-  branch: Branch;
-  /** Agent's research findings (from terminal tool or final output), or null */
-  findings: string | null;
-  /** Number of tool calls the agent made */
-  toolCallCount: number;
-  /** Total tokens generated by this agent */
-  tokenCount: number;
-  /** Model-level perplexity at completion (exp of mean NLL from raw logits) */
-  ppl: number;
-  /** Sampling-level perplexity at completion (from filtered distribution) */
-  samplingPpl: number;
-  /** Per-token trace data (present only when {@link AgentPoolOptions.trace} is true) */
-  trace?: TraceToken[];
-}
-
-/**
- * Aggregate result from a completed agent pool run
- *
- * Returned by both {@link useAgentPool} and {@link runAgents}. Contains
- * per-agent results plus aggregate statistics for display and telemetry.
- *
- * @category Agents
- */
-export interface AgentPoolResult {
-  /** Per-agent results in task order */
-  agents: AgentResult[];
-  /** Sum of all agent token counts */
-  totalTokens: number;
-  /** Sum of all agent tool calls */
-  totalToolCalls: number;
-  /** Number of batched commit steps in the tick loop */
-  steps: number;
-  /** Internal performance counters for telemetry */
-  counters: {
-    /** Number of batch prefill calls for tool result injection */
-    warmPrefillCalls: number;
-    /** Total branches across all warm prefill batches */
-    warmPrefillBranches: number;
-    /** Ticks where no agent was generating (all awaiting tools) */
-    stalledTicks: number;
-    /** Peak concurrent tool executions */
-    maxConcurrentTools: number;
-    /** Ticks spent idle-waiting via action() */
-    idleTicks: number;
-  };
-}
-
-// ── Generate types ─────────────────────────────────────────────
-
-/**
- * Options for single-branch {@link generate}
- *
- * @category Agents
- */
-export interface GenerateOptions {
-  /** Pre-formatted prompt string (from `formatChat()` + `tokenize()`) */
-  prompt: string;
-  /** GBNF grammar string for constrained generation */
-  grammar?: string;
-  /** Sampling parameters */
-  params?: SamplingParams;
-  /** Optional parser applied to the raw output string */
-  parse?: (output: string) => unknown;
-}
-
-/**
- * Result from single-branch {@link generate}
- *
- * @category Agents
- */
-export interface GenerateResult<T = unknown> {
-  /** Raw generated text */
-  output: string;
-  /** Number of tokens generated */
-  tokenCount: number;
-  /** Parsed output (present only when `parse` was provided in options) */
-  parsed?: T;
-}
-
-// ── Diverge types ──────────────────────────────────────────────
-
-/**
- * Options for multi-branch {@link diverge}
- *
- * Either `parent` or `prompt` must be provided. When `parent` is given,
- * branches fork from it and no new root is created. When only `prompt`
- * is given, a fresh root is created, prefilled, and cleaned up on error.
- *
- * @category Agents
- */
-export interface DivergeOptions {
-  /** Pre-formatted prompt for creating a fresh root (mutually exclusive with parent) */
-  prompt?: string;
-  /** Number of parallel generation attempts */
-  attempts: number;
-  /** Parent branch to fork from (mutually exclusive with prompt) */
-  parent?: Branch;
-  /** Sampling parameters for all attempts */
-  params?: SamplingParams;
-}
-
-/**
- * Single attempt result from {@link diverge}
- *
- * @category Agents
- */
-export interface DivergeAttempt {
-  /** The attempt's branch (only the best branch survives after diverge) */
-  branch: Branch;
-  /** Generated text for this attempt */
-  output: string;
-  /** Number of tokens generated */
-  tokenCount: number;
-  /** Model perplexity — lower indicates more coherent generation */
-  ppl: number;
-}
-
-/**
- * Aggregate result from {@link diverge}
- *
- * The `best` branch is still alive; all other attempt branches have been
- * pruned. The caller owns cleanup — typically via {@link Session.promote}
- * to make the best branch the new conversation trunk.
- *
- * @category Agents
- */
-export interface DivergeResult {
-  /** Lowest-perplexity branch — still alive, caller owns cleanup */
-  best: Branch;
-  /** Text output from the best attempt */
-  bestOutput: string;
-  /** All attempts (losers already pruned, branches disposed) */
-  attempts: DivergeAttempt[];
-  /** Sum of all attempt token counts */
-  totalTokens: number;
-  /** Number of batched commit steps */
-  steps: number;
-  /** Shared prefix length in tokens (for KV savings calculation) */
-  prefixLength: number;
-}
-
-// ── Runtime events ─────────────────────────────────────────────
-
-/**
- * Events emitted by the runtime during agent pool execution
- *
- * Subscribe to these via the `events` channel from {@link initAgents}.
- * Harnesses can extend this union with phase-level events for display.
- *
- * @category Agents
- */
-export type AgentEvent =
-  | { type: 'agent:spawn'; agentId: number; parentAgentId: number }
-  | { type: 'agent:produce'; agentId: number; text: string; tokenCount: number; entropy?: number; surprisal?: number }
-  | { type: 'agent:tool_call'; agentId: number; tool: string; args: string }
-  | { type: 'agent:tool_result'; agentId: number; tool: string; result: string }
-  | { type: 'agent:tool_progress'; agentId: number; tool: string; filled: number; total: number }
-  | { type: 'agent:report'; agentId: number; findings: string }
-  | { type: 'agent:done'; agentId: number };
diff --git a/src/index.ts b/src/index.ts
index 1ee3f6d..abc300d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -30,18 +30,15 @@
  */
 
 import type {
-  ContextOptions,
   GpuVariant,
   LoadOptions,
   NativeBinding,
-  SessionContext,
 } from './types';
 
-import { Branch } from './Branch';
-import { BranchStore } from './BranchStore';
-import { Session } from './Session';
-import { buildUserDelta, buildToolResultDelta } from './agents/deltas';
-import { Rerank } from './Rerank';
+import type {
+  ContextOptions,
+  SessionContext,
+} from '@lloyal-labs/sdk';
 
 /**
  * Platform package naming: @lloyal-labs/lloyal.node-{platform}-{arch}[-{gpu}]
@@ -251,10 +248,34 @@ export const createContext = async (
   return binary.createContext(options);
 };
 
-// ── Layer 1: Substrate (unchanged) ──────────────────────────────
-export { Branch, BranchStore, Session, buildUserDelta, buildToolResultDelta, Rerank };
+// ── Re-export from @lloyal-labs/sdk ──────────────────────────────
+export { Branch, BranchStore, Session, Rerank, buildUserDelta, buildToolResultDelta } from '@lloyal-labs/sdk';
+
+export { PoolingType, CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, ReasoningFormat, GrammarTriggerType } from '@lloyal-labs/sdk';
+export type { ChatFormat } from '@lloyal-labs/sdk';
+export type {
+  ContextOptions,
+  FormatChatOptions,
+  GrammarTrigger,
+  FormattedChatResult,
+  ParseChatOutputOptions,
+  ParsedToolCall,
+  ParseChatOutputResult,
+  PenaltyParams,
+  MirostatParams,
+  DryParams,
+  XtcParams,
+  AdvancedSamplingParams,
+  SamplingParams,
+  SessionContext,
+  Produced,
+  RerankOptions,
+  RerankResult,
+  RerankProgress,
+  KvCacheType,
+} from '@lloyal-labs/sdk';
 
-// ── Layer 2: Agents (structured concurrency) ────────────────────
+// ── Re-export from @lloyal-labs/lloyal-agents ────────────────────
 export {
   Ctx, Store, Events,
   Tool,
@@ -265,7 +286,7 @@ export {
   createToolkit,
   initAgents,
   withSharedRoot,
-} from './agents/index';
+} from '@lloyal-labs/lloyal-agents';
 
 export type {
   Toolkit,
@@ -284,32 +305,7 @@ export type {
   DivergeAttempt,
   DivergeResult,
   AgentEvent,
-} from './agents/index';
+} from '@lloyal-labs/lloyal-agents';
 
-// ── Enums + types from types.ts ─────────────────────────────────
-export { PoolingType, CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, ReasoningFormat, GrammarTriggerType } from './types';
-export type { ChatFormat } from './types';
-export type {
-  GpuVariant,
-  KvCacheType,
-  LoadOptions,
-  ContextOptions,
-  FormatChatOptions,
-  GrammarTrigger,
-  FormattedChatResult,
-  ParseChatOutputOptions,
-  ParsedToolCall,
-  ParseChatOutputResult,
-  PenaltyParams,
-  MirostatParams,
-  DryParams,
-  XtcParams,
-  AdvancedSamplingParams,
-  SamplingParams,
-  SessionContext,
-  Produced,
-  RerankOptions,
-  RerankResult,
-  RerankProgress,
-  NativeBinding,
-} from './types';
+// ── Native-only types (stay in lloyal.node) ──────────────────────
+export type { GpuVariant, LoadOptions, NativeBinding } from './types';
diff --git a/src/types.ts b/src/types.ts
index aa97a19..69ac7a1 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,23 +1,12 @@
 /**
- * liblloyal-node TypeScript Definitions
+ * liblloyal-node — native-only type definitions
  *
- * N-API bindings for liblloyal - Node.js native addon for llama.cpp inference
- *
- * @categoryDescription Core
- * Entry points, context lifecycle, and the main inference interface.
- *
- * @categoryDescription Sampling
- * Sampler chain configuration — temperature, penalties, nucleus sampling, and advanced filters.
- *
- * @categoryDescription Chat
- * Chat template formatting, output parsing, tool calls, and reasoning extraction.
- *
- * @categoryDescription Branching
- * Parallel and tree-structured generation with batched GPU dispatch.
+ * Types specific to the Node.js native addon (binary loading, GPU variant
+ * selection). All inference primitives and shared types are in
+ * {@link @lloyal-labs/sdk | @lloyal-labs/sdk}.
  */
 
-import type { Branch } from './Branch';
-import type { BranchStore } from './BranchStore';
+import type { ContextOptions, SessionContext } from '@lloyal-labs/sdk';
 
 /**
  * GPU variant for binary loading
@@ -34,16 +23,6 @@ import type { BranchStore } from './BranchStore';
  */
 export type GpuVariant = 'default' | 'cuda' | 'vulkan';
 
-/**
- * Supported KV cache quantization types
- *
- * Matches llama.cpp CLI `-ctk` / `-ctv` flags.
- * Lower precision = less GPU memory, slight quality tradeoff.
- *
- * @category Core
- */
-export type KvCacheType = 'f32' | 'f16' | 'bf16' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1';
-
 /**
  * Options for binary loading
  *
@@ -62,1474 +41,10 @@ export interface LoadOptions {
    *
    * If the requested variant is unavailable (missing runtime libraries),
    * automatically falls back to CPU with a console warning.
-   *
-   * @example
-   * ```typescript
-   * // Request CUDA with automatic fallback to CPU
-   * const ctx = await createContext(
-   *   { modelPath: './model.gguf' },
-   *   { gpuVariant: 'cuda' }
-   * );
-   * ```
    */
   gpuVariant?: GpuVariant;
 }
 
-/**
- * Pooling type for embedding extraction
- *
- * @category Core
- */
-export enum PoolingType {
-  /** No pooling - raw per-token embeddings */
-  NONE = 0,
-  /** Mean pooling - average of all token embeddings */
-  MEAN = 1,
-  /** CLS pooling - use first token embedding */
-  CLS = 2,
-  /** Last token pooling - use last token embedding */
-  LAST = 3,
-  /** Rank pooling - classification head output for reranking models */
-  RANK = 4,
-}
-
-/**
- * Chat format detected by the template engine
- *
- * Identifies how the model formats tool calls, reasoning blocks, and content.
- * Opaque chat format identifier returned by
- * {@link SessionContext.formatChat | formatChat()} and consumed by
- * {@link SessionContext.parseChatOutput | parseChatOutput()}.
- *
- * Maps 1:1 to llama.cpp's `common_chat_format` enum (30+ values).
- * Treat as an opaque number — pass through, don't switch on it.
- *
- * @category Chat
- */
-export type ChatFormat = number;
-
-/** Model template has no tool/structured-output support. */
-export const CHAT_FORMAT_CONTENT_ONLY: ChatFormat = 0;
-
-/** llama.cpp's generic JSON fallback — imposes format the model wasn't trained on. */
-export const CHAT_FORMAT_GENERIC: ChatFormat = 1;
-
-/**
- * Reasoning/thinking block format
- *
- * Controls how `<think>` blocks are handled during formatting and parsing.
- *
- * @see {@link FormatChatOptions.reasoningFormat} for input-side usage
- * @see {@link ParseChatOutputOptions.reasoningFormat} for output-side usage
- *
- * @category Chat
- */
-export enum ReasoningFormat {
-  /** No reasoning extraction (default) */
-  NONE = 0,
-  /** Auto-detect reasoning format from model template */
-  AUTO = 1,
-  /** DeepSeek legacy format (`<think>...</think>` in content) */
-  DEEPSEEK_LEGACY = 2,
-  /** DeepSeek format (structured reasoning extraction) */
-  DEEPSEEK = 3,
-}
-
-/**
- * Grammar trigger type
- *
- * Determines how lazy grammar activation is triggered during generation.
- *
- * @see {@link GrammarTrigger}
- * @see {@link FormattedChatResult.grammarTriggers}
- *
- * @category Chat
- */
-export enum GrammarTriggerType {
-  /** Trigger on a specific token ID */
-  TOKEN = 0,
-  /** Trigger on a word boundary match */
-  WORD = 1,
-  /** Trigger on a regex pattern match */
-  PATTERN = 2,
-  /** Trigger on a full-string regex pattern match */
-  PATTERN_FULL = 3,
-}
-
-/**
- * Configuration for context creation
- *
- * Controls the resource envelope for inference: context window size (`nCtx`),
- * batch throughput (`nBatch`), compute parallelism (`nThreads`), and
- * multi-sequence capacity (`nSeqMax`). These map directly to
- * `llama_context_params` and are fixed for the context's lifetime.
- *
- * Key tradeoffs:
- * - **nCtx**: Larger = longer conversations, but linear KV memory growth.
- * - **nBatch**: Larger = faster prompt prefill (more tokens per GPU dispatch),
- *   but higher peak memory. Also sets the bin-packing capacity for
- *   {@link BranchStore.prefill}.
- * - **nSeqMax**: Set ≥ your max concurrent branch count + 1 (root sequence).
- *   Each sequence shares the same KV cache memory pool — cost is metadata only
- *   under unified KV, not a per-sequence memory multiplier.
- *
- * @category Core
- */
-export interface ContextOptions {
-  /** Path to .gguf model file */
-  modelPath: string;
-
-  /** Context size (default: 2048) */
-  nCtx?: number;
-
-  /** Number of threads (default: 4) */
-  nThreads?: number;
-
-  /**
-   * Batch size for token processing
-   *
-   * Controls how many tokens are processed per llama_decode call.
-   * Higher values improve throughput for prompt prefill at the cost of memory.
-   * Also sets llama_context_params.n_batch and n_ubatch at context creation.
-   * Default: 512
-   */
-  nBatch?: number;
-
-  /**
-   * Enable embedding extraction mode
-   *
-   * When true, context is optimized for embedding extraction.
-   * Use with encode() and getEmbeddings() methods.
-   * Default: false (text generation mode)
-   */
-  embeddings?: boolean;
-
-  /**
-   * Pooling type for embedding extraction
-   *
-   * Only relevant when embeddings=true.
-   * Default: MEAN for embedding contexts, NONE otherwise
-   */
-  poolingType?: PoolingType;
-
-  /**
-   * Maximum number of sequences for multi-sequence support
-   *
-   * Set > 1 to enable multiple independent KV cache sequences.
-   * Useful for parallel decoding or conversation branching.
-   * Default: 1 (single sequence)
-   */
-  nSeqMax?: number;
-
-  /**
-   * KV cache data type for keys
-   *
-   * Quantize the key cache to reduce GPU memory. For a Q4_K_M model,
-   * F16 cache wastes precision — Q8_0 halves memory with minimal quality loss.
-   *
-   * Memory at nCtx=8192 (Qwen3-4B, 36 layers, 8 KV heads, 128 dim):
-   *   f16:  1152 MB    q8_0: ~576 MB    q4_0: ~288 MB
-   *
-   * Default: 'f16'
-   */
-  typeK?: KvCacheType;
-
-  /**
-   * KV cache data type for values
-   *
-   * Same options as typeK. V cache is slightly more quality-sensitive than K.
-   * Default: 'f16'
-   */
-  typeV?: KvCacheType;
-}
-
-/**
- * Options for chat template formatting
- *
- * Controls format-awareness fields passed to the chat template engine.
- * All fields are optional -- sensible defaults are used when omitted.
- *
- * @example With tools and reasoning
- * ```typescript
- * const result = await ctx.formatChat(messagesJson, {
- *   tools: JSON.stringify(tools),
- *   toolChoice: 'auto',
- *   reasoningFormat: 'auto',
- * });
- * ```
- *
- * @category Chat
- */
-export interface FormatChatOptions {
-  /** Custom Jinja2 template override (bypasses model's built-in template) */
-  templateOverride?: string;
-
-  /**
-   * JSON array of OpenAI-format tool definitions
-   *
-   * @example
-   * ```typescript
-   * const tools = [{ type: 'function', function: {
-   *   name: 'get_weather',
-   *   description: 'Get current weather',
-   *   parameters: { type: 'object', properties: { location: { type: 'string' } } }
-   * }}];
-   * options.tools = JSON.stringify(tools);
-   * ```
-   */
-  tools?: string;
-
-  /** Tool choice strategy (default: "auto") */
-  toolChoice?: 'auto' | 'required' | 'none';
-
-  /** Allow parallel tool calls (default: false) */
-  parallelToolCalls?: boolean;
-
-  /**
-   * Reasoning format (default: "none")
-   *
-   * Controls `<think>` block handling in the template.
-   * Use "auto" to let the model's template decide.
-   */
-  reasoningFormat?: 'none' | 'auto' | 'deepseek' | 'deepseek_legacy';
-
-  /** Enable `<think>` blocks (default: true). Pairs with reasoningFormat. */
-  enableThinking?: boolean;
-
-  /**
-   * JSON schema for constrained output. Converted to GBNF grammar internally.
-   * Mutually exclusive with `grammar`.
-   *
-   * @see {@link SessionContext.jsonSchemaToGrammar}
-   */
-  jsonSchema?: string;
-
-  /**
-   * Explicit GBNF grammar string for constrained generation.
-   * Mutually exclusive with `jsonSchema`.
-   */
-  grammar?: string;
-
-  /**
-   * Append assistant prompt prefix (default: true).
-   * Set false when formatting partial conversations or for
-   * non-generation use cases like template validation.
-   */
-  addGenerationPrompt?: boolean;
-}
-
-/**
- * Grammar trigger from format-aware chat template
- *
- * Defines conditions for lazy grammar activation. When `grammarLazy` is true
- * in {@link FormattedChatResult}, generation runs unconstrained until one of
- * these triggers fires, at which point the grammar is activated.
- *
- * @category Chat
- */
-export interface GrammarTrigger {
-  /** Trigger type */
-  type: GrammarTriggerType;
-  /** Trigger value (token text, word, or regex pattern depending on type) */
-  value: string;
-  /** Token ID (for TOKEN-type triggers, -1 when not applicable) */
-  token: number;
-}
-
-/**
- * Result from chat template formatting
- *
- * Includes format-awareness fields for proper output parsing.
- * Pass `format` and `reasoningFormat` directly to
- * {@link SessionContext.parseChatOutput | parseChatOutput()} to decode
- * the model's response.
- *
- * @example Roundtrip: format -> generate -> parse
- * ```typescript
- * const fmt = await ctx.formatChat(messagesJson, { tools: toolsJson });
- * // ... generate tokens using fmt.prompt and fmt.grammar ...
- * const parsed = ctx.parseChatOutput(output, fmt.format, {
- *   reasoningFormat: fmt.reasoningFormat,
- *   thinkingForcedOpen: fmt.thinkingForcedOpen,
- *   parser: fmt.parser,
- * });
- * ```
- *
- * @see {@link SessionContext.parseChatOutput}
- *
- * @category Chat
- */
-export interface FormattedChatResult {
-  /** Formatted prompt string ready for tokenization */
-  prompt: string;
-  /** Additional stop strings from the template */
-  stopTokens: string[];
-
-  /**
-   * Detected chat format (pass to parseChatOutput)
-   * @see {@link SessionContext.parseChatOutput}
-   */
-  format: ChatFormat;
-
-  /** Grammar string for constrained generation (empty if no tools/schema) */
-  grammar: string;
-  /** Whether grammar should be applied lazily (only after triggers fire) */
-  grammarLazy: boolean;
-  /** Whether the thinking tag was forced open by the template */
-  thinkingForcedOpen: boolean;
-
-  /**
-   * Reasoning format (pass to parseChatOutput options)
-   * @see {@link ParseChatOutputOptions.reasoningFormat}
-   */
-  reasoningFormat: ReasoningFormat;
-
-  /** PEG parser definition for PEG format models (pass to parseChatOutput options) */
-  parser: string;
-  /** Grammar trigger conditions for lazy grammar activation */
-  grammarTriggers: GrammarTrigger[];
-  /** Token strings preserved from grammar masking */
-  preservedTokens: string[];
-}
-
-/**
- * Options for parsing chat output
- *
- * All fields are optional. For correct parsing, pass through the corresponding
- * fields from {@link FormattedChatResult}.
- *
- * @see {@link FormattedChatResult}
- *
- * @category Chat
- */
-export interface ParseChatOutputOptions {
-  /**
-   * Reasoning format (from {@link FormattedChatResult.reasoningFormat})
-   */
-  reasoningFormat?: ReasoningFormat;
-
-  /**
-   * True if output is incomplete (streaming).
-   * When true, the parser tolerates unterminated tool calls and open
-   * thinking blocks, returning partial content as-is rather than
-   * treating them as parse errors.
-   */
-  isPartial?: boolean;
-
-  /** Whether thinking tag was forced open (from {@link FormattedChatResult.thinkingForcedOpen}) */
-  thinkingForcedOpen?: boolean;
-
-  /** PEG parser definition for PEG format models (from {@link FormattedChatResult.parser}) */
-  parser?: string;
-}
-
-/**
- * A tool call extracted from model output
- *
- * @example
- * ```typescript
- * for (const tc of result.toolCalls) {
- *   const args = JSON.parse(tc.arguments);
- *   await executeTool(tc.name, args);
- * }
- * ```
- *
- * @category Chat
- */
-export interface ParsedToolCall {
-  /** Tool/function name */
-  name: string;
-  /** JSON string of arguments */
-  arguments: string;
-  /** Tool call ID (may be empty depending on model format) */
-  id: string;
-}
-
-/**
- * Result from parsing chat output
- *
- * @example
- * ```typescript
- * const result = ctx.parseChatOutput(output, fmt.format);
- * if (result.toolCalls.length > 0) {
- *   for (const tc of result.toolCalls) {
- *     const args = JSON.parse(tc.arguments);
- *     await executeTool(tc.name, args);
- *   }
- * } else {
- *   console.log(result.content);
- * }
- * ```
- *
- * @category Chat
- */
-export interface ParseChatOutputResult {
-  /** Main response text */
-  content: string;
-  /**
-   * Extracted thinking/reasoning content (empty string if none).
-   * For thinking models (e.g. Qwen3), this contains the text inside
-   * `<think>...</think>` blocks. Store as `reasoning_content` in your
-   * messages array so formatChat() can reconstruct the template correctly
-   * on subsequent turns.
-   */
-  reasoningContent: string;
-  /** Extracted tool calls (empty array if none) */
-  toolCalls: ParsedToolCall[];
-}
-
-/**
- * Penalty parameters for repetition control
- *
- * @category Sampling
- */
-export interface PenaltyParams {
-  /** Repetition penalty (1.0 = disabled, >1.0 = penalize repeats) */
-  repeat?: number;
-
-  /** Frequency penalty (0.0 = disabled) */
-  frequency?: number;
-
-  /** Presence penalty (0.0 = disabled) */
-  presence?: number;
-
-  /** Tokens to consider for penalties (-1 = context size) */
-  lastN?: number;
-}
-
-/**
- * Mirostat sampling configuration
- *
- * Mirostat dynamically adjusts sampling to maintain target perplexity,
- * preventing both repetition and incoherence. Useful for long-form generation
- * where temperature alone produces inconsistent quality.
- *
- * Use Mirostat v2 (mode: 2) for most cases - it's more stable than v1.
- *
- * @category Sampling
- */
-export interface MirostatParams {
-  /** Mirostat mode (0 = disabled, 1 = v1, 2 = v2). Recommended: 2 */
-  mode?: number;
-
-  /** Target entropy (perplexity = exp(tau)). Default: 5.0. Lower = more focused */
-  tau?: number;
-
-  /** Learning rate for entropy adjustment. Default: 0.1. Higher = faster adaptation */
-  eta?: number;
-}
-
-/**
- * DRY (Don't Repeat Yourself) sampling parameters
- *
- * Penalizes repetition of token sequences, more sophisticated than
- * simple repetition penalty. Useful for reducing loops and redundancy
- * in generated text.
- *
- * @category Sampling
- */
-export interface DryParams {
-  /** Penalty strength (0.0 = disabled, higher = stronger penalty) */
-  multiplier?: number;
-
-  /** Base penalty value (typically 1.75) */
-  base?: number;
-
-  /** Minimum sequence length to trigger penalty (typically 2) */
-  allowedLength?: number;
-
-  /** Number of recent tokens to scan for repetitions */
-  penaltyLastN?: number;
-}
-
-/**
- * XTC (eXclude Top Choices) sampler parameters
- *
- * Excludes very high probability tokens to increase output diversity.
- * Useful when model is overly confident and produces repetitive text.
- *
- * @category Sampling
- */
-export interface XtcParams {
-  /** Probability of applying XTC (0.0 = disabled, 1.0 = always). Typical: 0.1 */
-  probability?: number;
-
-  /** Confidence threshold above which tokens are excluded. Typical: 0.1 */
-  threshold?: number;
-}
-
-/**
- * Advanced sampling parameters
- *
- * @category Sampling
- */
-export interface AdvancedSamplingParams {
-  /** Locally typical sampling (1.0 = disabled) */
-  typicalP?: number;
-
-  /** Mirostat sampling configuration */
-  mirostat?: MirostatParams;
-
-  /** DRY (Don't Repeat Yourself) sampling */
-  dry?: DryParams;
-
-  /** XTC sampler */
-  xtc?: XtcParams;
-}
-
-/**
- * Sampling parameters for token generation
- *
- * Configures the sampler chain — a pipeline of composable filters and
- * transforms applied to raw logits before token selection. The chain is
- * built once at branch creation and persists across decode steps
- * (penalty state accumulates, PRNG advances).
- *
- * **Chain order**: penalties → top_k → typical_p → top_p → min_p →
- * temperature → dist (stochastic) or greedy (temperature ≤ 0).
- *
- * For tree search, each {@link Branch} owns an independent clone of the
- * chain. `reseedSampler()` replaces the terminal dist sampler's PRNG seed
- * so forked branches diverge. Greedy chains (temperature ≤ 0) are
- * deterministic and unaffected by reseeding.
- *
- * Common presets:
- * - Factual/Precise: `{ temperature: 0.1 }`
- * - Balanced: `{ temperature: 0.7 }`
- * - Creative: `{ temperature: 1.0 }`
- * - Deterministic greedy: `{ temperature: 0, topK: 0, topP: 1.0, minP: 0 }`
- *
- * @category Sampling
- */
-export interface SamplingParams {
-  // ===== COMMON CONTROLS =====
-
-  /** Randomness (0.0 = always most likely, 2.0 = very random) */
-  temperature?: number;
-
-  /** Only consider top K most likely tokens (0 = disabled) */
-  topK?: number;
-
-  /** Nucleus sampling threshold (1.0 = disabled) */
-  topP?: number;
-
-  /** Minimum probability threshold */
-  minP?: number;
-
-  /** Random seed for reproducible generation (-1 = random) */
-  seed?: number;
-
-  /** GBNF grammar string for constrained generation */
-  grammar?: string;
-
-  // ===== GROUPED CONTROLS =====
-
-  /** Penalty parameters for repetition control */
-  penalties?: PenaltyParams;
-
-  /** Advanced sampling parameters */
-  advanced?: AdvancedSamplingParams;
-}
-
-/**
- * Inference context — the runtime surface for a loaded model
- *
- * A SessionContext owns a llama_context (KV cache + compute graph) bound to a
- * shared model. It provides tokenization, logit access, KV cache management,
- * chat template formatting, and embedding extraction.
- *
- * **All generation flows through {@link Branch}.** Create a branch at position 0,
- * prefill prompt tokens, then use the produce/commit loop or async iterator:
- *
- * ```typescript
- * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
- * await branch.prefill(promptTokens);
- * for await (const { token, text } of branch) {
- *   process.stdout.write(text);
- * }
- * ```
- *
- * For tree-structured generation (best-of-N, beam search, speculative
- * decoding), use {@link Branch.fork} and {@link BranchStore} — they manage
- * per-branch KV sequences, sampler chains, and logits snapshots with O(1)
- * GPU dispatches via batched decode.
- *
- * **Logits**: For branch-level logits, use {@link Branch.getLogits} which
- * returns an independent copy of the branch's snapshot. For metrics, use
- * {@link Branch.modelEntropy} and {@link Branch.modelSurprisal} which
- * operate directly on the branch's logits without JS round-trips.
- *
- * **KV cache**: Supports multi-sequence operation (`nSeqMax > 1`), per-sequence
- * copy/clear/eviction, file-based persistence, and context compression via
- * `clearAndReseed()`.
- *
- * **Chat templates**: `formatChat()` and `parseChatOutput()` handle the full
- * round-trip of chat formatting, including tool calls, reasoning blocks, and
- * grammar-constrained generation — using the model's native Jinja template.
- *
- * Use {@link createContext} to initialize, and `dispose()` when done to free
- * GPU/CPU memory.
- *
- * @category Core
- */
-export interface SessionContext {
-
-  /**
-   * Convert token ID to text piece
-   *
-   * Fast synchronous lookup in vocabulary table.
-   * Call this on each generated token for streaming display.
-   *
-   * Optimized for per-token conversion during generation.
-   * For batch conversion of many tokens, use detokenize() instead.
-   *
-   * Cost: ~0.05ms
-   *
-   * @param token Token ID
-   * @returns Text string for this token
-   */
-  tokenToText(token: number): string;
-
-  /**
-   * Check if token is a model stop token
-   *
-   * Returns true for built-in end-of-generation tokens:
-   * - </s> (Llama 2)
-   * - <|endoftext|> (GPT)
-   * - <|eot_id|> (Llama 3)
-   * - Model-specific EOS tokens
-   *
-   * Note: This checks vocabulary stop tokens, not custom stop sequences.
-   * For custom stops (e.g., "\n\n", "###"), compare generated text
-   * against your stop strings in application code.
-   *
-   * Cost: <0.01ms (fast vocabulary lookup)
-   *
-   * @param token Token ID to check
-   */
-  isStopToken(token: number): boolean;
-
-  /**
-   * Get the model's end-of-generation token ID
-   *
-   * Returns the EOT token (e.g. <|im_end|> for ChatML), falling back
-   * to EOS (e.g. </s>) for Zephyr-style models. This is the inverse
-   * of isStopToken() — "what IS the stop token?" vs "is this a stop token?"
-   *
-   * Use case: warm multi-turn continuation prepends this token to close
-   * the previous assistant turn before injecting new user content.
-   *
-   * @returns Token ID (integer)
-   * @throws If model has neither EOT nor EOS token
-   */
-  getEogToken(): number;
-
-  /**
-   * Get the model's turn separator token IDs
-   *
-   * Returns the tokens that close an assistant turn and transition to the
-   * next message, as determined by the model's chat template. Computed once
-   * per model, cached.
-   *
-   * For ChatML templates: [im_end_id, newline_id] (e.g., [2, 198])
-   * For Llama 3 templates: [eot_id] (e.g., [128009])
-   *
-   * Use case: warm multi-turn prefill to achieve exact parity with cold path.
-   *
-   * @returns Array of token IDs (cached after first call)
-   *
-   * @example
-   * ```typescript
-   * const separator = ctx.getTurnSeparator();
-   * console.log(separator.map(t => ctx.tokenToText(t)).join(''));  // "<|im_end|>\n"
-   *
-   * // Warm prefill with exact cold/warm parity
-   * const deltaTokens = await ctx.tokenize(deltaPrompt, false);
-   * await branch.prefill([...separator, ...deltaTokens]);
-   * ```
-   */
-  getTurnSeparator(): number[];
-
-  // ===== PROMPT PREPARATION =====
-
-  /**
-   * Tokenize text into model's vocabulary
-   *
-   * Converts human text → token IDs for decode().
-   * Same text always produces same tokens for a given model.
-   *
-   * Cost: ~1ms per 100 characters
-   *
-   * @param text Text to tokenize
-   * @param addSpecial Whether to add special tokens (BOS/EOS). Defaults to
-   *   model metadata setting (typically true). Pass false for mid-sequence
-   *   tokenization (e.g., warm multi-turn continuation deltas).
-   * @returns Array of token IDs
-   * @example
-   * ```typescript
-   * // Full sequence (default — includes BOS)
-   * const tokens = await ctx.tokenize("Hello world");
-   *
-   * // Mid-sequence delta (no BOS)
-   * const delta = await ctx.tokenize("continuation text", false);
-   * ```
-   */
-  tokenize(text: string, addSpecial?: boolean): Promise<number[]>;
-
-  /**
-   * Tokenize text into model's vocabulary (sync — inline on main thread)
-   *
-   * Same as {@link tokenize} but synchronous. Use from Effection generators
-   * to avoid `yield* call()` overhead for CPU-only work.
-   *
-   * @param text Text to tokenize
-   * @param addSpecial Whether to add special tokens (BOS/EOS). Defaults to
-   *   model metadata setting (typically true). Pass false for mid-sequence
-   *   tokenization.
-   * @returns Array of token IDs
-   */
-  tokenizeSync(text: string, addSpecial?: boolean): number[];
-
-  /**
-   * Detokenize array of tokens back to text
-   *
-   * Inverse of tokenize(). Use for reconstructing complete text
-   * from token sequences (e.g., after KV cache operations).
-   *
-   * Optimized for batch conversion of many tokens.
-   * For single-token conversion during generation, use tokenToText().
-   *
-   * Cost: ~1ms per 100 tokens
-   *
-   * @param tokens Array of token IDs
-   * @returns Complete text representation
-   * @example
-   * ```typescript
-   * const tokens = [15496, 1917]; // "Hello world"
-   * const text = await ctx.detokenize(tokens);
-   * console.log(text); // "Hello world"
-   * ```
-   */
-  detokenize(tokens: number[]): Promise<string>;
-
-  // ===== KV CACHE MANAGEMENT =====
-
-  /**
-   * Get max position in the KV cache for a sequence
-   *
-   * Returns the highest position index in the specified sequence,
-   * or -1 if the sequence is empty. This is the same value as
-   * {@link kvSeqPosMax}. To get the token count, add 1.
-   *
-   * Think of this as: "How much has the model read so far?"
-   *
-   * Cost: <0.01ms (fast sync operation - safe to call frequently)
-   *
-   * @param sequenceId Sequence ID (defaults to 0 for single conversation)
-   * @returns Highest position index, or -1 if empty
-   */
-  kvCacheSize(sequenceId?: number): number;
-
-  /**
-   * Remove token range from KV cache
-   *
-   * Deletes tokens from model's memory. Use cases:
-   * - Removing old context when hitting limit (sliding window)
-   * - Implementing conversation pruning
-   * - Forgetting specific messages
-   * - Preparing for injection of new context
-   *
-   * CRITICAL: Call BEFORE next decode(), not after!
-   * The model needs to know about the removal before processing new tokens.
-   *
-   * Cost: ~1-5ms depending on range
-   *
-   * @param sequenceId Sequence ID (use 0 for single sequence)
-   * @param start Start position (inclusive)
-   * @param end End position (exclusive), -1 = to end
-   */
-  kvCacheRemove(sequenceId: number, start: number, end: number): Promise<void>;
-
-  /**
-   * Snapshot KV cache state for branching/undo
-   *
-   * Serializes entire model state to Buffer.
-   * Restore later with kvCacheLoad() for:
-   * - Conversation branching ("what if I said X instead?")
-   * - Undo/redo functionality
-   * - Checkpointing long conversations
-   *
-   * Size: ~500MB-2GB depending on context length and model
-   *
-   * Cost: ~100-500ms depending on cache size
-   *
-   * @param sequenceId Sequence ID (use 0 for single sequence)
-   * @returns Serialized state buffer
-   */
-  kvCacheSave(sequenceId?: number): Promise<Buffer>;
-
-  /**
-   * Restore KV cache from previous snapshot
-   *
-   * Loads saved model state. Context returns to exact state
-   * when snapshot was taken.
-   *
-   * Cost: ~100-500ms depending on snapshot size
-   *
-   * @param sequenceId Sequence ID (use 0 for single sequence)
-   * @param state Buffer from kvCacheSave()
-   * @example
-   * ```typescript
-   * const snapshot = await ctx.kvCacheSave(0);
-   *
-   * // ... many operations later ...
-   *
-   * // Restore to saved state
-   * await ctx.kvCacheLoad(0, snapshot);
-   * ```
-   */
-  kvCacheLoad(sequenceId: number, state: Buffer): Promise<void>;
-
-  /**
-   * Clear all KV cache (fresh start)
-   *
-   * Removes all cached tokens. Model returns to initial state
-   * as if no text has been processed.
-   *
-   * Use when starting a completely new conversation.
-   *
-   * Cost: ~1ms
-   */
-  kvCacheClear(): Promise<void>;
-
-  /**
-   * Blink KV — cache-local reconstruction for bounded-memory streaming
-   *
-   * Implements the [Blink KV](https://github.com/lloyal-ai/blink-kv/blob/main/blink_kv.pdf)
-   * protocol (Naqvi, 2026): when the KV cache fills, clear it entirely and
-   * re-decode retained tokens at contiguous positions `[0, 1, ..., N-1]`.
-   * This achieves cache-local position IDs — the operative requirement for
-   * stable bounded-memory streaming — without backend-specific knowledge of
-   * key storage format. Works on post-RoPE engines (where StreamingLLM's
-   * pos-shift is unavailable) and any backend exposing `clear()` + `decode()`.
-   *
-   * **Why not naive eviction?** Selective eviction (`kvCacheRemove`) preserves
-   * original position IDs, which grow without bound. Across 5 architectures,
-   * naive eviction produces PPL spanning 3 orders of magnitude — ranging from
-   * 1.15x baseline (Llama, lucky config) to 198x (Phi, sinks present).
-   * Under Blink KV reconstruction, all 5 converge to 3-16% of baseline.
-   *
-   * **Sinks are optional.** Under reconstruction, the 0+N (sinkless) config
-   * matches 4+N (with sinks) within <2% across all tested architectures.
-   * Pass an empty sinks array if you don't need them.
-   *
-   * **Algorithm:**
-   * 1. Clear entire KV cache (zero fragmentation)
-   * 2. Re-decode `sinks` at position 0 (optional attention anchors)
-   * 3. Re-decode `tail` at position `sinks.length` (recent context)
-   *
-   * **Cost:** Re-decodes `sinks.length + tail.length` tokens. At per-boundary
-   * trigger (reconstruct when cache reaches `nCtx`), amortized cost is
-   * O(cacheSize / interval) decode ops per token — ~0.14 at typical settings.
-   *
-   * @param sinks First N tokens from conversation start (typically 4, or empty).
-   *   Must be the same tokens every reseed — reusing different tokens degrades
-   *   any attention-sink patterns the model may have learned for early positions.
-   * @param tail Recent M tokens to preserve (typically 252-1020)
-   * @returns Promise that resolves when reconstruction completes.
-   *   Next decode continues at position `sinks.length + tail.length`.
-   *
-   * @example Per-boundary reconstruction
-   * ```typescript
-   * // Capture sinks once at conversation start
-   * const SINKS = allTokens.slice(0, 4);
-   *
-   * // On cache fill: compress to 512 tokens (4 sinks + 508 tail)
-   * if (position >= ctx.nCtx) {
-   *   const tail = allTokens.slice(-508);
-   *   await ctx.clearAndReseed(SINKS, tail);
-   *   position = 512;  // sinks.length + tail.length
-   * }
-   * ```
-   *
-   * @example Sinkless reconstruction (equally effective)
-   * ```typescript
-   * const tail = allTokens.slice(-256);
-   * await ctx.clearAndReseed([], tail);  // No sinks needed
-   * position = 256;
-   * ```
-   *
-   * @see [Blink KV paper](https://github.com/lloyal-ai/blink-kv/blob/main/blink_kv.pdf)
-   */
-  clearAndReseed(sinks: number[], tail: number[]): Promise<void>;
-
-  // ===== KV SEQUENCE OPERATIONS =====
-
-  /**
-   * Fork a KV cache sequence — the primitive behind {@link Branch.fork}
-   *
-   * Copies all KV cache entries from `srcSeqId` to `dstSeqId`. Under
-   * llama.cpp's unified KV cache, this is a **metadata-only operation** —
-   * no key/value tensors are copied. Both sequences reference the same
-   * physical KV entries for the shared prefix; only tokens decoded after
-   * the fork point allocate new storage. This is what makes tree-structured
-   * generation (best-of-N, beam search, speculative decoding) memory-efficient:
-   * N branches sharing a 1000-token prefix cost ~1000 KV entries, not N*1000.
-   *
-   * The higher-level {@link Branch.fork} wraps this and additionally clones
-   * the sampler chain, grammar state, logits snapshot, and perplexity tracker.
-   * Use `kvSeqCopy` directly when you need raw sequence management without
-   * the Branch abstraction.
-   *
-   * NOTE: Only full-sequence copies are supported. The p0/p1 parameters
-   * must use default values (0 and -1).
-   *
-   * Cost: O(1) metadata — no tensor copy under unified KV
-   *
-   * @param srcSeqId Source sequence to copy from
-   * @param dstSeqId Destination sequence to copy to
-   * @param p0 Start position (must be 0, default: 0)
-   * @param p1 End position (must be -1 for full copy, default: -1)
-   */
-  kvSeqCopy(srcSeqId: number, dstSeqId: number, p0?: number, p1?: number): void;
-
-  /**
-   * Keep only specified sequence, remove all others
-   *
-   * Removes all sequences except the one specified.
-   * For complete cleanup of unwanted sequences, consider using
-   * kvCacheRemove(seqId, 0, -1) on each sequence instead.
-   *
-   * @param seqId Sequence ID to keep
-   */
-  kvSeqKeep(seqId: number): void;
-
-  /**
-   * Get max position in sequence
-   *
-   * Returns the highest position index in the specified sequence,
-   * or -1 if the sequence is empty.
-   *
-   * Cost: <0.01ms (fast sync operation)
-   *
-   * @param seqId Sequence ID to query
-   * @returns Max position index, or -1 if empty
-   * @example
-   * ```typescript
-   * const pos = ctx.kvSeqPosMax(0);
-   * if (pos === -1) {
-   *   console.log('Sequence is empty');
-   * } else {
-   *   console.log(`Sequence has ${pos + 1} tokens`);
-   * }
-   * ```
-   */
-  kvSeqPosMax(seqId: number): number;
-
-  // ===== KV CACHE FILE PERSISTENCE =====
-
-  /**
-   * Write KV cache state + tokens to file
-   *
-   * Persists KV cache state for later restoration.
-   * Useful for checkpointing long conversations.
-   *
-   * @param sequenceId Sequence ID to save
-   * @param filepath Path to save file
-   * @param tokens Tokens that were decoded into this sequence
-   * @returns Promise resolving to bytes written
-   */
-  kvCacheWriteFile(
-    sequenceId: number,
-    filepath: string,
-    tokens: number[]
-  ): Promise<number>;
-
-  /**
-   * Read KV cache state + tokens from file
-   *
-   * Restores KV cache state from a previous kvCacheWriteFile call.
-   *
-   * @param sequenceId Sequence ID to restore to
-   * @param filepath Path to saved file
-   * @returns Promise resolving to tokens and bytes read
-   */
-  kvCacheReadFile(
-    sequenceId: number,
-    filepath: string
-  ): Promise<{ tokens: number[]; bytesRead: number }>;
-
-  // ===== HELPERS =====
-
-  /**
-   * Format messages using model's chat template
-   *
-   * Converts [{role, content}] -> formatted prompt string with full format awareness.
-   * Uses model's built-in template (ChatML, Llama, Mistral, etc.).
-   *
-   * The returned `format` and `reasoningFormat` fields should be passed to
-   * `parseChatOutput()` after generation to correctly decode the response.
-   *
-   * Cost: ~1-5ms depending on message count
-   *
-   * @param messagesJson JSON string containing array of messages
-   * @param options Formatting options (tools, reasoning, grammar, etc.)
-   * @returns Formatted prompt with format-awareness metadata
-   *
-   * @see {@link parseChatOutput}
-   *
-   * @example Basic usage
-   * ```typescript
-   * const result = await ctx.formatChat(JSON.stringify([
-   *   { role: "system", content: "You are a helpful assistant" },
-   *   { role: "user", content: "Hello!" }
-   * ]));
-   *
-   * const tokens = await ctx.tokenize(result.prompt);
-   * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
-   * await branch.prefill(tokens);
-   * ```
-   */
-  formatChat(
-    messagesJson: string,
-    options?: FormatChatOptions | string
-  ): Promise<FormattedChatResult>;
-
-  /**
-   * Format messages using model's chat template (sync — inline on main thread)
-   *
-   * Same as {@link formatChat} but synchronous. Use from Effection generators
-   * to avoid `yield* call()` overhead for CPU-only work.
-   *
-   * @param messagesJson JSON string containing array of messages
-   * @param options Formatting options (tools, reasoning, grammar, etc.)
-   * @returns Formatted prompt with format-awareness metadata
-   */
-  formatChatSync(
-    messagesJson: string,
-    options?: FormatChatOptions | string
-  ): FormattedChatResult;
-
-  /**
-   * Parse model output into structured content
-   *
-   * Extracts plain text, reasoning/thinking blocks, and tool calls from
-   * raw model output. Uses the format detected by {@link formatChat} to apply
-   * the correct parser for the model's output format.
-   *
-   * Cost: <0.1ms (synchronous string parsing, no I/O)
-   *
-   * @param output Raw model output text
-   * @param format Chat format enum (from {@link FormattedChatResult.format})
-   * @param options Optional parsing parameters
-   * @returns Parsed content with tool calls and reasoning
-   *
-   * @see {@link formatChat}
-   *
-   * @example Basic parsing
-   * ```typescript
-   * const fmt = await ctx.formatChat(JSON.stringify(messages), { tools: toolsJson });
-   * // ... generate tokens ...
-   * const parsed = ctx.parseChatOutput(generatedText, fmt.format, {
-   *   reasoningFormat: fmt.reasoningFormat,
-   *   thinkingForcedOpen: fmt.thinkingForcedOpen,
-   *   parser: fmt.parser
-   * });
-   * if (parsed.toolCalls.length > 0) {
-   *   // Handle tool calls
-   * }
-   * ```
-   *
-   * @example Multi-turn warm continuation with reasoning models
-   * ```typescript
-   * // parseChatOutput separates <think>...</think> blocks into reasoningContent.
-   * // This is REQUIRED for correct warm continuation on thinking models (e.g. Qwen3):
-   * // if raw output containing <think> tags is stored as content, re-formatting
-   * // the conversation produces different tokens, breaking cold/warm parity.
-   *
-   * const messages: Array<{role: string; content: string; reasoning_content?: string}> = [];
-   * const sep = ctx.getTurnSeparator();
-   * let branch: Branch | null = null;
-   * let fmt: FormattedChatResult;
-   *
-   * async function handleTurn(userContent: string) {
-   *   messages.push({ role: 'user', content: userContent });
-   *
-   *   if (!branch) {
-   *     // Cold path: format full conversation, tokenize with BOS, prefill
-   *     fmt = await ctx.formatChat(JSON.stringify(messages));
-   *     const tokens = await ctx.tokenize(fmt.prompt);
-   *     branch = Branch.create(ctx, 0, { temperature: 0.7 });
-   *     await branch.prefill(tokens);
-   *   } else {
-   *     // Warm path: string-diff for delta tokens
-   *     const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
-   *     const { prompt: prefix } = await ctx.formatChat(
-   *       JSON.stringify(messages.slice(0, -1)),
-   *       { addGenerationPrompt: false }
-   *     );
-   *     const delta = await ctx.tokenize(full.substring(prefix.length), false);
-   *     await branch.prefill([...sep, ...delta]);
-   *   }
-   *
-   *   // Generate
-   *   let rawOutput = '';
-   *   while (true) {
-   *     const { token, text, isStop } = await branch.produce();
-   *     if (isStop) break;
-   *     rawOutput += text;
-   *     await branch.commit(token);
-   *   }
-   *
-   *   // Parse output: separates reasoning from content
-   *   const parsed = ctx.parseChatOutput(rawOutput, fmt.format, {
-   *     reasoningFormat: fmt.reasoningFormat,
-   *     thinkingForcedOpen: fmt.thinkingForcedOpen,
-   *     parser: fmt.parser
-   *   });
-   *
-   *   // Store parsed fields — formatChat reconstructs thinking blocks correctly
-   *   messages.push({
-   *     role: 'assistant',
-   *     content: parsed.content,
-   *     reasoning_content: parsed.reasoningContent || undefined
-   *   });
-   * }
-   * ```
-   */
-  parseChatOutput(
-    output: string,
-    format: ChatFormat,
-    options?: ParseChatOutputOptions
-  ): ParseChatOutputResult;
-
-  /**
-   * Convert JSON schema to GBNF grammar
-   *
-   * Generates grammar string for constrained JSON generation.
-   * Use with {@link Branch.create} grammar parameter for constrained generation.
-   *
-   * Cost: ~1-10ms depending on schema complexity
-   *
-   * @param schemaJson JSON schema string
-   * @returns GBNF grammar string
-   * @example
-   * ```typescript
-   * const schema = {
-   *   type: "object",
-   *   properties: {
-   *     name: { type: "string" },
-   *     age: { type: "number" }
-   *   },
-   *   required: ["name"]
-   * };
-   *
-   * const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
-   * const branch = Branch.create(ctx, 0, params, undefined, grammar);
-   * ```
-   */
-  jsonSchemaToGrammar(schemaJson: string): Promise<string>;
-
-  /**
-   * Convert JSON schema to GBNF grammar (sync — inline on main thread)
-   *
-   * Same as {@link jsonSchemaToGrammar} but synchronous. Use from Effection
-   * generators to avoid `yield* call()` overhead for CPU-only work.
-   *
-   * @param schemaJson JSON schema string
-   * @returns GBNF grammar string
-   */
-  jsonSchemaToGrammarSync(schemaJson: string): string;
-
-  /**
-   * Validate chat template syntax
-   *
-   * Checks if template string is valid before using.
-   *
-   * Cost: ~0.1-1ms
-   *
-   * @param templateString Template string to validate
-   * @returns True if template syntax is valid
-   */
-  validateChatTemplate(templateString: string): Promise<boolean>;
-
-  // ===== EMBEDDING EXTRACTION =====
-
-  /**
-   * Encode tokens for embedding extraction
-   *
-   * Unlike decode(), this marks ALL tokens with logits=true which is
-   * required for embedding extraction. Use with embeddings=true context.
-   *
-   * Workflow:
-   * 1. Create context with { embeddings: true, poolingType: PoolingType.MEAN }
-   * 2. Tokenize your text
-   * 3. Clear KV cache (important between different texts!)
-   * 4. Call encode() with tokens
-   * 5. Call getEmbeddings() to get the vector
-   *
-   * Cost: ~5-50ms depending on text length and model
-   *
-   * @param tokens Token IDs from tokenize()
-   * @example
-   * ```typescript
-   * // Create embedding context
-   * const ctx = await createContext({
-   *   modelPath: './nomic-embed.gguf',
-   *   embeddings: true,
-   *   poolingType: PoolingType.MEAN
-   * });
-   *
-   * // Get embedding for text
-   * const tokens = await ctx.tokenize("Hello world");
-   * await ctx.kvCacheClear();  // Important between texts!
-   * await ctx.encode(tokens);
-   * const embedding = ctx.getEmbeddings();
-   * ```
-   */
-  encode(tokens: number[]): Promise<void>;
-
-  /**
-   * Get embedding vector from context (after encode)
-   *
-   * Returns the embedding vector for the encoded text.
-   * Call after encode() to extract embeddings.
-   *
-   * The vector dimension depends on the model (e.g., 768 for nomic-embed).
-   * Use getEmbeddingDimension() to get the size.
-   *
-   * Cost: ~0.5ms (extraction from model state)
-   *
-   * @param normalize Apply L2 normalization (default: true for cosine similarity)
-   * @returns Float32Array of embedding values
-   * @example
-   * ```typescript
-   * await ctx.encode(tokens);
-   *
-   * // Get L2-normalized embedding (for cosine similarity)
-   * const embedding = ctx.getEmbeddings();
-   *
-   * // Or raw embedding without normalization
-   * const rawEmbedding = ctx.getEmbeddings(false);
-   * ```
-   */
-  getEmbeddings(normalize?: boolean): Float32Array;
-
-  /**
-   * Get embedding dimension for model
-   *
-   * Returns the size of embedding vectors this model produces.
-   * Common values: 768 (BERT-like), 1024, 2048, 4096.
-   *
-   * Cost: <0.01ms (fast model property lookup)
-   *
-   * @returns Embedding dimension
-   * @example
-   * ```typescript
-   * const dim = ctx.getEmbeddingDimension();
-   * console.log(`Model produces ${dim}-dimensional embeddings`);
-   * ```
-   */
-  getEmbeddingDimension(): number;
-
-  /**
-   * Check if context has pooling enabled
-   *
-   * Returns true if context was created with embeddings=true and
-   * a pooling type other than NONE.
-   *
-   * Cost: <0.01ms
-   *
-   * @returns True if pooling is enabled
-   */
-  hasPooling(): boolean;
-
-  // ===== PROPERTIES =====
-
-  /**
-   * Model vocabulary size (number of possible tokens)
-   *
-   * This is the length of the logits array from Branch.getLogits().
-   */
-  readonly vocabSize: number;
-
-  /**
-   * Memory used by this context (bytes)
-   *
-   * Reports native memory for monitoring.
-   * Includes model weights, KV cache, and context state.
-   */
-  readonly memorySize: number;
-
-  // ===== LIFECYCLE =====
-
-  /**
-   * Free native resources
-   *
-   * Call when done with context to release model and KV cache memory.
-   * Context becomes unusable after disposal.
-   */
-  dispose(): void;
-
-  // ===== BRANCH API (internal, wrapped by Branch class) =====
-
-  /** @internal */
-  _branchCreate(position: number, params?: SamplingParams, nBatch?: number, grammar?: string): number;
-
-  /** @internal */
-  _branchFork(handle: number): number;
-
-  /** @internal */
-  _branchPrefill(handle: number, tokens: number[]): Promise<void>;
-
-  /** @internal */
-  _branchSample(handle: number): number;
-
-  /** @internal */
-  _branchAccept(handle: number, token: number): void;
-
-  /** @internal */
-  _branchGetPosition(handle: number): number;
-
-  /** @internal */
-  _branchGetPerplexity(handle: number): number;
-
-  /** @internal */
-  _branchGetLogits(handle: number): Float32Array;
-
-  /** @internal */
-  _branchPrune(handle: number): void;
-
-  /** @internal */
-  _branchPruneSubtree(handle: number): void;
-
-  /** @internal */
-  _branchParent(handle: number): number;
-
-  /** @internal */
-  _branchChildren(handle: number): number[];
-
-  /** @internal */
-  _branchIsLeaf(handle: number): boolean;
-
-  /** @internal */
-  _branchIsActive(handle: number): boolean;
-
-  /** @internal */
-  _branchSamplerChainReseed(handle: number, seed: number): void;
-
-  /** @internal */
-  _branchSteer(handle: number, biases: Array<{ token: number; bias: number }>): void;
-
-  /** @internal */
-  _branchClearSteer(handle: number): void;
-
-  /** @internal */
-  _branchSetSamplerParams(handle: number, params: SamplingParams): void;
-
-  /** @internal */
-  _branchSetGrammar(handle: number, grammarStr: string): void;
-
-  /** @internal */
-  _branchSetGrammarLazy(handle: number, grammar: string, patterns: string[], tokens: number[]): void;
-
-  /** @internal */
-  _branchModelEntropy(handle: number, base?: string): number;
-
-  /** @internal */
-  _branchModelSurprisal(handle: number, token: number, base?: string): number;
-
-  /** @internal */
-  _branchGetSamplingPerplexity(handle: number): number;
-
-  /** @internal */
-  _branchSetLogitBias(handle: number, biases: Array<{ token: number; bias: number }>): void;
-
-  /** @internal */
-  _branchClearLogitBias(handle: number): void;
-
-  // ===== STORE API (internal, wrapped by BranchStore) =====
-
-  /** @internal */
-  _storeCommit(handles: number[], tokens: number[]): Promise<void>;
-
-  /** @internal */
-  _storePrefill(handles: number[], tokenArrays: number[][]): Promise<void>;
-
-  /** @internal */
-  _storeRetainOnly(handle: number): void;
-
-  /** @internal */
-  _storeAvailable(): number;
-
-  /** KV cache pressure snapshot from native BranchStore.
-   *  cells_used is a monotonic counter reset on drain/retainOnly. */
-  _storeKvPressure(): { nCtx: number; cellsUsed: number; remaining: number };
-
-  // ===== SCORING API =====
-
-  /** @internal — processes ≤ n_seq_max prompts in a single group */
-  _scoreGroup(tokenArrays: number[][]): Promise<Float32Array[]>;
-}
-
-/**
- * Result from Branch.produce()
- *
- * @category Branching
- */
-export interface Produced {
-  /** Sampled token ID */
-  token: number;
-  /** Text representation of the token */
-  text: string;
-  /** Whether this is a stop token (EOS) */
-  isStop: boolean;
-}
-
-// AgentTask, AgentState, RunAgentsOptions, RunAgentsResult removed —
-// superseded by src/runtime/ (useAgentPool, AgentTaskSpec, AgentPoolResult)
-
-/**
- * Options for Rerank context creation
- * @category Core
- */
-export interface RerankOptions {
-  /** Path to reranker .gguf model */
-  modelPath: string;
-  /** Max prompts per GPU dispatch (default: 8) */
-  nSeqMax?: number;
-  /** Context window size (default: 4096) */
-  nCtx?: number;
-  /** KV cache key quantization (default: 'q4_0') */
-  typeK?: KvCacheType;
-  /** KV cache value quantization (default: 'q4_0') */
-  typeV?: KvCacheType;
-}
-
-/**
- * A single rerank result — score for one document
- * @category Core
- */
-export interface RerankResult {
-  /** Relevance probability (0–1) */
-  score: number;
-  /** Original index in the input array */
-  index: number;
-}
-
-/**
- * Progress yielded by Rerank.score() after each scoring group completes
- * @category Core
- */
-export interface RerankProgress {
-  /** Number of documents scored so far */
-  filled: number;
-  /** Total documents to score */
-  total: number;
-  /** Sorted results — partial until filled === total */
-  results: RerankResult[];
-}
-
 /**
  * Native binding interface — what loadBinary() returns
  *
diff --git a/test/agents.ts b/test/agents.ts
deleted file mode 100644
index 5e5dfd8..0000000
--- a/test/agents.ts
+++ /dev/null
@@ -1,272 +0,0 @@
-/**
- * Structured concurrency tests for the agent system
- *
- * Verifies Effection v4 SC guarantees: branch cleanup on all exit paths,
- * scope teardown ordering, ensure() lifecycle.
- *
- * Usage:
- *   npm run test:agents
- *   LLAMA_TEST_MODEL=models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf npm run test:agents
- */
-
-import * as path from 'node:path';
-import * as fs from 'node:fs';
-import { run, call, spawn, ensure, each } from 'effection';
-import { loadBinary, Branch } from '../dist/index.js';
-import type { SessionContext, NativeBinding } from '../dist/index.js';
-import {
-  initAgents, runAgents, withSharedRoot, Tool,
-} from '../dist/agents/index.js';
-import type { AgentPoolResult, JsonSchema } from '../dist/agents/index.js';
-
-const MODEL_PATH: string = process.env.LLAMA_TEST_MODEL
-  ? path.resolve(process.env.LLAMA_TEST_MODEL)
-  : path.join(__dirname, '../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf');
-
-const CTX_SIZE = 2048;
-
-if (!fs.existsSync(MODEL_PATH)) {
-  console.error('Test model not found:', MODEL_PATH);
-  process.exit(1);
-}
-
-console.log('=== lloyal.node SC Agent Tests ===\n');
-console.log(`Model: ${path.basename(MODEL_PATH)}`);
-console.log(`Size: ${(fs.statSync(MODEL_PATH).size / 1024 / 1024).toFixed(1)} MB\n`);
-
-let addon: NativeBinding;
-try {
-  addon = require('../build/Release/lloyal.node') as NativeBinding;
-} catch {
-  addon = loadBinary();
-}
-
-let passed = 0;
-let failed = 0;
-
-function ok(msg: string): void {
-  passed++;
-  console.log(`  [PASS] ${msg}`);
-}
-
-function fail(msg: string): void {
-  failed++;
-  console.log(`  [FAIL] ${msg}`);
-}
-
-function assert(condition: boolean, msg: string): void {
-  if (condition) ok(msg);
-  else { fail(msg); throw new Error(msg); }
-}
-
-// ── Test tools ────────────────────────────────────────────────────
-
-class ThrowingTool extends Tool<Record<string, unknown>> {
-  readonly name = 'explode';
-  readonly description = 'A tool that always throws';
-  readonly parameters: JsonSchema = {
-    type: 'object',
-    properties: { input: { type: 'string' } },
-  };
-  async execute(): Promise<unknown> {
-    throw new Error('intentional_tool_error');
-  }
-}
-
-// ── Helpers ────────────────────────────────────────────────────────
-
-async function createTestContext(): Promise<SessionContext> {
-  return addon.createContext({
-    modelPath: MODEL_PATH,
-    nCtx: CTX_SIZE,
-    nThreads: 4,
-    nSeqMax: 4,
-    typeK: 'f16',
-    typeV: 'f16',
-  });
-}
-
-function makeTasks(parent: Branch, count: number) {
-  return Array.from({ length: count }, (_, i) => ({
-    systemPrompt: 'You are a test agent.',
-    content: `Test task ${i}`,
-    parent,
-  }));
-}
-
-/** Bootstrap agent infra via initAgents + drain events to prevent backpressure */
-function* setupTest(ctx: SessionContext) {
-  const { events } = yield* initAgents(ctx);
-  yield* spawn(function*() {
-    for (const _ev of yield* each(events)) {
-      yield* each.next();
-    }
-  });
-}
-
-// ═══════════════════════════════════════════════════════════════════
-// TEST 1: ensure() cleanup — runs on scope exit regardless of how
-// ═══════════════════════════════════════════════════════════════════
-
-async function testEnsureCleanup(): Promise<void> {
-  console.log('\n--- ensure() cleanup: runs on normal exit and on error ---');
-
-  // Test A: ensure runs on normal exit
-  let cleanupRanNormal = false;
-  await run(function*() {
-    yield* ensure(() => { cleanupRanNormal = true; });
-  });
-  assert(cleanupRanNormal, 'ensure() ran on normal scope exit');
-
-  // Test B: ensure runs on error exit
-  let cleanupRanError = false;
-  try {
-    await run(function*() {
-      yield* ensure(() => { cleanupRanError = true; });
-      throw new Error('intentional_test_error');
-    });
-  } catch {
-    // expected
-  }
-  assert(cleanupRanError, 'ensure() ran on error scope exit');
-}
-
-// ═══════════════════════════════════════════════════════════════════
-// TEST 2: Normal lifecycle — branches pruned after runAgents returns
-// ═══════════════════════════════════════════════════════════════════
-
-async function testNormalLifecycle(): Promise<void> {
-  console.log('\n--- Normal lifecycle: branches pruned after runAgents ---');
-
-  await run(function*() {
-    const ctx: SessionContext = yield* call(() => createTestContext());
-    yield* setupTest(ctx);
-
-    yield* withSharedRoot(
-      { systemPrompt: 'You are a test agent.' },
-      function*(root, prefixLen) {
-        assert(prefixLen > 0, `shared prefix has tokens (${prefixLen})`);
-
-        const pool: AgentPoolResult = yield* runAgents({
-          tasks: makeTasks(root, 2),
-          tools: new Map(),
-          maxTurns: 1,
-        });
-
-        assert(pool.agents.length === 2, 'pool has 2 agents');
-        assert(root.children.length === 0, 'agent branches pruned before body returns');
-
-        return pool;
-      },
-    );
-
-    ok('withSharedRoot completed without error');
-  });
-}
-
-// ═══════════════════════════════════════════════════════════════════
-// TEST 3: scoped() cleanup — runAgents prunes before returning
-// ═══════════════════════════════════════════════════════════════════
-
-async function testScopedCleanup(): Promise<void> {
-  console.log('\n--- Scoped cleanup: runAgents prunes before returning to caller ---');
-
-  await run(function*() {
-    const ctx: SessionContext = yield* call(() => createTestContext());
-    yield* setupTest(ctx);
-
-    yield* withSharedRoot(
-      { systemPrompt: 'You are a test agent.' },
-      function*(root) {
-        const childCountBefore = root.children.length;
-        assert(childCountBefore === 0, 'root starts with no children');
-
-        const pool = yield* runAgents({
-          tasks: makeTasks(root, 2),
-          tools: new Map(),
-          maxTurns: 1,
-        });
-
-        // Critical SC assertion: scoped() in runAgents must have torn
-        // down the pool scope and pruned agent branches BEFORE returning.
-        const childCountAfter = root.children.length;
-        assert(childCountAfter === 0, `scoped() pruned all children before returning (was ${childCountBefore}, now ${childCountAfter})`);
-
-        return pool;
-      },
-    );
-
-    ok('scoped() teardown ordering correct');
-  });
-}
-
-// ═══════════════════════════════════════════════════════════════════
-// TEST 4: Tool error — branches pruned, error does not crash pool
-// ═══════════════════════════════════════════════════════════════════
-
-async function testToolErrorCleanup(): Promise<void> {
-  console.log('\n--- Tool error: branches pruned, pool completes gracefully ---');
-
-  await run(function*() {
-    const ctx: SessionContext = yield* call(() => createTestContext());
-    yield* setupTest(ctx);
-
-    try {
-      yield* withSharedRoot(
-        { systemPrompt: 'You are a test agent. Always call the explode tool.' },
-        function*(root) {
-          const toolMap = new Map<string, Tool>([['explode', new ThrowingTool()]]);
-          const toolsJson = JSON.stringify([{
-            type: 'function',
-            function: {
-              name: 'explode',
-              description: 'A tool that always throws',
-              parameters: { type: 'object', properties: { input: { type: 'string' } } },
-            },
-          }]);
-
-          const pool = yield* runAgents({
-            tasks: [{
-              systemPrompt: 'You are a test agent. Call the explode tool immediately.',
-              content: 'Do it now.',
-              tools: toolsJson,
-              parent: root,
-            }],
-            tools: toolMap,
-            maxTurns: 2,
-          });
-
-          assert(root.children.length === 0, 'agent branches pruned after tool error');
-          assert(pool.agents.length === 1, 'pool has 1 agent');
-          return pool;
-        },
-      );
-
-      ok('withSharedRoot completed — tool error did not crash the pool');
-    } catch (err) {
-      // Tool errors should be handled internally (agent → done state).
-      // If we reach here, something unexpected propagated.
-      fail(`unexpected error escaped pool: ${(err as Error).message}`);
-    }
-  });
-}
-
-// ═══════════════════════════════════════════════════════════════════
-// RUNNER
-// ═══════════════════════════════════════════════════════════════════
-
-async function main_(): Promise<void> {
-  await testEnsureCleanup();
-  await testNormalLifecycle();
-  await testScopedCleanup();
-  await testToolErrorCleanup();
-
-  console.log(`\n${'='.repeat(40)}`);
-  console.log(`Results: ${passed} passed, ${failed} failed`);
-  if (failed > 0) process.exit(1);
-}
-
-main_().catch((err: unknown) => {
-  console.error(`\nFatal: ${(err as Error).message}\n${(err as Error).stack}`);
-  process.exit(1);
-});
diff --git a/test/examples.ts b/test/examples.ts
index 3005fec..e955cff 100644
--- a/test/examples.ts
+++ b/test/examples.ts
@@ -51,13 +51,6 @@ const EMBED_MODEL_PATH: string = process.env.EMBED_MODEL_PATH
   ? path.resolve(process.env.EMBED_MODEL_PATH)
   : path.join(__dirname, '../liblloyal/tests/fixtures/nomic-embed-text-v1.5.Q4_K_M.gguf');
 
-const QWEN3_PATH: string = process.env.QWEN3_MODEL
-  ? path.resolve(process.env.QWEN3_MODEL)
-  : path.join(__dirname, '../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf');
-
-const RERANKER_PATH: string = process.env.RERANKER_MODEL
-  ? path.resolve(process.env.RERANKER_MODEL)
-  : path.join(__dirname, '../models/qwen3-reranker-0.6b-q4_k_m.gguf');
 
 
 if (!fs.existsSync(MODEL_PATH)) {
@@ -185,48 +178,6 @@ const EXAMPLES: Record<string, ExampleConfig> = {
     },
   },
 
-  'deep-research': {
-    path: 'deep-research/deep-research.ts',
-    timeout: 300000,
-    modelPath: QWEN3_PATH,
-    extraArgs: [
-      '--reranker', RERANKER_PATH,
-      '--corpus', process.env.DEEP_RESEARCH_CORPUS || '',
-      '--query', process.env.DEEP_RESEARCH_QUERY || '',
-    ],
-    skip: !fs.existsSync(QWEN3_PATH) || !fs.existsSync(RERANKER_PATH)
-      || !process.env.DEEP_RESEARCH_CORPUS || !process.env.DEEP_RESEARCH_QUERY,
-    skipReason: 'Requires QWEN3_MODEL, RERANKER_MODEL, DEEP_RESEARCH_CORPUS, and DEEP_RESEARCH_QUERY env vars',
-    validate(events: ExampleEvent[]): void {
-      const start: ExampleEvent | undefined = events.find(e => e.event === 'start');
-      assert(start, 'should have start event');
-      assert(start.agentCount === 3, 'should have 3 agents');
-      assert(start.chunks > 0, 'should have corpus chunks');
-
-      const plan: ExampleEvent | undefined = events.find(e => e.event === 'plan');
-      assert(plan, 'should have plan event');
-      assert(plan.questions.length >= 2, 'should plan at least 2 sub-questions');
-
-      const researchStart: ExampleEvent | undefined = events.find(e => e.event === 'research_start');
-      assert(researchStart, 'should have research_start event');
-      assert(researchStart.sharedPrefixTokens > 0, 'should have shared prefix');
-
-      const toolCalls: ExampleEvent[] = events.filter(e => e.event === 'tool_call');
-      assert(toolCalls.length > 0, 'should make at least one tool call');
-
-      const agentsDone: ExampleEvent[] = events.filter(e => e.event === 'agent_done');
-      assert(agentsDone.length === 3, 'all 3 agents should finish');
-      for (const a of agentsDone) {
-        assert(a.tokenCount > 0, `agent ${a.index} should generate tokens`);
-      }
-
-      const complete: ExampleEvent | undefined = events.find(e => e.event === 'complete');
-      assert(complete, 'should have complete event');
-      assert(complete.totalToolCalls > 0, 'should have tool calls');
-      assert(complete.wallTimeMs > 0, 'should have wall time');
-      assert(complete.converged !== undefined, 'should have convergence result');
-    },
-  },
 };
 
 async function runTest(name: string, config: ExampleConfig): Promise<TestResult> {
diff --git a/test/integration.ts b/test/integration.ts
index 42b042c..77eca9c 100644
--- a/test/integration.ts
+++ b/test/integration.ts
@@ -17,7 +17,7 @@
 
 import * as path from 'node:path';
 import * as fs from 'node:fs';
-import { loadBinary, Branch, BranchStore, Rerank } from '../dist/index.js';
+import { loadBinary, createContext, Branch, BranchStore, Rerank } from '../dist/index.js';
 import type { SessionContext, NativeBinding, FormattedChatResult, Produced } from '../dist/index.js';
 
 const MODEL_PATH: string = process.env.LLAMA_TEST_MODEL
@@ -1908,7 +1908,14 @@ async function testRerank(): Promise<void> {
   console.log('\n--- Rerank ---');
   console.log(`  Model: ${path.basename(RERANK_MODEL_PATH)}`);
 
-  const rerank = await Rerank.create({ modelPath: RERANK_MODEL_PATH });
+  const rerankCtx = await createContext({
+    modelPath: RERANK_MODEL_PATH,
+    nCtx: 4096,
+    nSeqMax: 8,
+    typeK: 'q4_0',
+    typeV: 'q4_0',
+  });
+  const rerank = await Rerank.create(rerankCtx, { nSeqMax: 8, nCtx: 4096 });
 
   try {
     // Tokenize documents
@@ -1987,7 +1994,14 @@ async function testRerankLargeCorpus(): Promise<void> {
   console.log(`  Model: ${path.basename(RERANK_MODEL_PATH)}`);
 
   // n_seq_max=8 so 20 documents requires 3 groups (8+8+4)
-  const rerank = await Rerank.create({ modelPath: RERANK_MODEL_PATH, nSeqMax: 8 });
+  const rerankCtx = await createContext({
+    modelPath: RERANK_MODEL_PATH,
+    nCtx: 4096,
+    nSeqMax: 8,
+    typeK: 'q4_0',
+    typeV: 'q4_0',
+  });
+  const rerank = await Rerank.create(rerankCtx, { nSeqMax: 8, nCtx: 4096 });
 
   try {
     const query = 'What is the capital of France?';
@@ -2065,7 +2079,14 @@ async function testRerankConcurrent(): Promise<void> {
 
   console.log('\n--- Rerank Concurrent ---');
 
-  const rerank = await Rerank.create({ modelPath: RERANK_MODEL_PATH, nSeqMax: 4 });
+  const rerankCtx = await createContext({
+    modelPath: RERANK_MODEL_PATH,
+    nCtx: 4096,
+    nSeqMax: 4,
+    typeK: 'q4_0',
+    typeV: 'q4_0',
+  });
+  const rerank = await Rerank.create(rerankCtx, { nSeqMax: 4, nCtx: 4096 });
 
   try {
     const docs = [