From 3952867063ff89406a9e55d3888ed4f5c06a8d6a Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Fri, 20 Feb 2026 13:04:39 +1100
Subject: [PATCH 1/3] feat(api): remove flat ctx inference path, make branch
 API the only path

---
 README.md                                 |  12 +-
 examples/best-of-n/best-of-n.mjs          |  25 +-
 examples/chat/README.md                   |  40 +-
 examples/chat/chat.mjs                    |   7 +-
 examples/entropy/entropy.mjs              |  39 +-
 examples/grammar/README.md                | 122 ++--
 examples/grammar/grammar.mjs              |  96 +--
 examples/speculative/README.md            |  96 +--
 examples/speculative/speculative.mjs      |  16 +-
 examples/streaming/streaming-summary.mjs  |  56 +-
 examples/streaming/streaming-tsampler.mjs |  50 +-
 examples/streaming/streaming.mjs          |  51 +-
 lib/Branch.js                             |  52 +-
 lib/index.d.ts                            | 650 ++++-----------------
 lib/index.js                              |  19 +-
 liblloyal                                 |   2 +-
 src/SessionContext.cpp                    | 679 ++--------------------
 src/SessionContext.hpp                    | 130 +----
 test/integration.js                       | 554 ++++++++++--------
 19 files changed, 825 insertions(+), 1871 deletions(-)

diff --git a/README.md b/README.md
index eca0b26..49fb41d 100644
--- a/README.md
+++ b/README.md
@@ -22,10 +22,9 @@ const store = new BranchStore(ctx);
 
 // Shared prompt: "Explain quantum entanglement"
 const prompt = await ctx.tokenize("Explain quantum entanglement");
-await ctx.decode(prompt, 0, 0);
 
-const root = Branch.create(ctx, prompt.length, { temperature: 0.8 });
-root.captureLogits();
+const root = Branch.create(ctx, 0, { temperature: 0.8 });
+await root.prefill(prompt);
 
 // Fork 4 branches — each gets a different reasoning prefix
 const analogy  = await root.fork();
@@ -206,11 +205,12 @@ For fine-grained control without Branch:
 
 ```javascript
 const grammar = await ctx.jsonSchemaToGrammar(schema);
-const handle = ctx.createSampler(grammar);
-// Pull loop — consumer controls pace, can branch at any point
+const branch = Branch.create(ctx, 0, params, undefined, grammar);
+await branch.prefill(promptTokens);
+// Grammar state cloned automatically on fork()
 ```
 
-See [`examples/grammar/`](./examples/grammar/) for the full pull loop pattern.
+See [`examples/grammar/`](./examples/grammar/) for the full branch fork pattern.
 
 ---
 
diff --git a/examples/best-of-n/best-of-n.mjs b/examples/best-of-n/best-of-n.mjs
index 1519eba..22c9328 100644
--- a/examples/best-of-n/best-of-n.mjs
+++ b/examples/best-of-n/best-of-n.mjs
@@ -11,10 +11,10 @@
  * See: Stiennon et al. 2020 "Learning to summarize from human feedback"
  *
  * KEY IMPLEMENTATION DETAIL:
- * Uses the Branch API for parallel generation. After prefilling the prompt,
- * we create a root branch and call captureLogits(). When forking to multiple
- * candidates, each fork inherits the root's logits snapshot, ensuring all
- * candidates start from the same probability distribution.
+ * Uses the Branch API for parallel generation. The root branch prefills the
+ * prompt and captures logits. When forking to multiple candidates, each fork
+ * inherits the root's logits snapshot, ensuring all candidates start from
+ * the same probability distribution.
  *
  * Usage:
  *   node best-of-n.mjs [model-path]          # Human-readable output
@@ -92,21 +92,18 @@ async function main() {
     console.log(`\nPrompt: "${userPrompt}"`);
   }
 
-  // Prefill prompt
+  // Prefill prompt via root branch
   const promptTokens = await ctx.tokenize(prompt);
-  await ctx.decode(promptTokens, 0, 0);
 
-  if (!jsonlMode) {
-    console.log(`\nPrefill complete. Prompt length: ${promptTokens.length} tokens`);
-  }
-
-  // CRITICAL: Create root branch IMMEDIATELY after prefill to capture logits
-  // The root branch stores a snapshot of the logits for fork operations
-  const root = Branch.create(ctx, promptTokens.length, {
+  const root = Branch.create(ctx, 0, {
     temperature: HIGH_TEMP,
     topP: 0.95,
   });
-  root.captureLogits();
+  await root.prefill(promptTokens);
+
+  if (!jsonlMode) {
+    console.log(`\nPrefill complete. Prompt length: ${promptTokens.length} tokens`);
+  }
 
   // === Baseline: Single generation with low temperature ===
   if (!jsonlMode) {
diff --git a/examples/chat/README.md b/examples/chat/README.md
index f7140de..04fcedd 100644
--- a/examples/chat/README.md
+++ b/examples/chat/README.md
@@ -14,37 +14,33 @@ npm run example -- /path/to/model.gguf    # custom model
 - `/clear` - Reset conversation and clear terminal
 - `/quit` - Exit
 
-## The Pattern: Sync Produce, Async Commit
+## The Pattern: Branch Produce/Commit
 
 ```javascript
-// Sync generator - all operations are synchronous
-function* produceTokens(ctx, params) {
-  while (true) {
-    const tokenId = ctx.sample(params);      // sync
-    if (ctx.isStopToken(tokenId)) return;    // sync
-    const text = ctx.tokenToText(tokenId);   // sync
-    yield { text, tokenId };
-  }
-}
+// Create branch and prefill prompt
+const branch = Branch.create(ctx, 0, { temperature: 0.7 });
+await branch.prefill(promptTokens);
 
-// Usage - async commit is explicit in caller's loop
-for (const { text, tokenId } of produceTokens(ctx, params)) {
+// Async iterator - commit-before-yield
+for await (const { token, text } of branch) {
   process.stdout.write(text);
-  await ctx.decode([tokenId], position);     // async commit to KV
-  position += 1;
 }
+await branch.prune();
 ```
 
-**Key insight:** Token production is synchronous. Only the KV cache commit (`decode`) is async. This separation makes the control flow explicit.
+**Key insight:** The async iterator handles produce/commit internally. Each yielded token is already committed to KV. Breaking out is clean — no orphaned state.
 
 ## API Reference
 
 | Method | Sync/Async | Purpose |
 |--------|------------|---------|
-| `sample(params)` | sync | Sample next token from logits |
-| `isStopToken(id)` | sync | Check if token ends generation |
-| `tokenToText(id)` | sync | Convert token ID to text |
-| `decode(tokens, pos)` | async | Commit tokens to KV cache |
-| `tokenize(text)` | async | Convert text to token IDs |
-| `formatChat(json)` | async | Apply chat template |
-| `kvCacheClear()` | async | Reset KV cache |
+| `Branch.create(ctx, pos, params)` | sync | Create a branch for generation |
+| `branch.prefill(tokens)` | async | Feed tokens into branch's KV cache |
+| `branch.produce()` | async | Sample next token (no KV write) |
+| `branch.commit(token)` | async | Accept + decode into KV |
+| `branch.prune()` | async | Discard branch and its KV entries |
+| `ctx.isStopToken(id)` | sync | Check if token ends generation |
+| `ctx.tokenToText(id)` | sync | Convert token ID to text |
+| `ctx.tokenize(text)` | async | Convert text to token IDs |
+| `ctx.formatChat(json)` | async | Apply chat template |
+| `ctx.kvCacheClear()` | async | Reset KV cache |
diff --git a/examples/chat/chat.mjs b/examples/chat/chat.mjs
index 8f058dd..4ec2ea0 100644
--- a/examples/chat/chat.mjs
+++ b/examples/chat/chat.mjs
@@ -82,16 +82,15 @@ async function main() {
     messages.push({ role: "user", content: trimmed });
 
     if (!branch) {
-      // === COLD (position === 0): full format → tokenize with BOS → decode ===
+      // === COLD (position === 0): full format → tokenize with BOS → prefill ===
       fmt = await ctx.formatChat(JSON.stringify(messages));
       const tokens = await ctx.tokenize(fmt.prompt);
-      await ctx.decode(tokens, 0, 0);
-      branch = Branch.create(ctx, tokens.length, {
+      branch = Branch.create(ctx, 0, {
         temperature: 0.7,
         topK: 40,
         topP: 0.9,
       });
-      branch.captureLogits();
+      await branch.prefill(tokens);
     } else {
       // === WARM (position > 0): format only the new message ===
       fmt = await ctx.formatChat(
diff --git a/examples/entropy/entropy.mjs b/examples/entropy/entropy.mjs
index c9204fe..6453e22 100644
--- a/examples/entropy/entropy.mjs
+++ b/examples/entropy/entropy.mjs
@@ -10,6 +10,8 @@
  * - EDT formula: T = T₀ · N^(θ/Entropy)
  * - Side-by-side comparison with fixed temperature
  * - Different prompt types: factual, creative, mixed
+ * - Branch API for token generation (produce/commit loop)
+ *
  *
  * Usage:
  *   node entropy.mjs [model-path]          # Human-readable output
@@ -18,7 +20,7 @@
 
 import * as path from 'node:path';
 import { fileURLToPath } from 'node:url';
-import { createContext } from '../../lib/index.js';
+import { createContext, Branch } from '../../lib/index.js';
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
@@ -53,47 +55,48 @@ function edtTemperature(entropy) {
 
 /**
  * Generate with a specific sampling strategy
+ *
+ * Uses the Branch API. For 'edt', setSamplerParams() sets a per-token temperature
+ * from the current logit entropy; fixed strategies keep the creation-time temperature.
  */
 async function generate(ctx, prompt, strategy, strategyName, maxTokens = 50) {
   const messages = [{ role: 'user', content: prompt }];
   const { prompt: formatted } = await ctx.formatChat(JSON.stringify(messages));
-
   const tokens = await ctx.tokenize(formatted);
-  await ctx.decode(tokens, 0, 0);
+
+  const baseTemp = strategy === 'edt' ? 0.8 : strategy;
+  const branch = Branch.create(ctx, 0, { temperature: baseTemp, topP: 0.9 });
+  await branch.prefill(tokens);
 
   const output = [];
   const temps = [];
   const entropies = [];
-  let pos = tokens.length;
 
   for (let i = 0; i < maxTokens; i++) {
-    const entropy = ctx.modelEntropy('nats');
+    const branchLogits = branch.getLogits();
+    const entropy = ctx.modelEntropy('nats', branchLogits);
     entropies.push(entropy);
 
-    let temp;
-    if (strategy === 'edt') {
-      temp = edtTemperature(entropy);
-    } else {
-      temp = strategy; // Fixed temperature
-    }
+    const temp = strategy === 'edt' ? edtTemperature(entropy) : strategy;
     temps.push(temp);
 
-    const token = ctx.sample({ temperature: temp });
-    if (ctx.isStopToken(token)) break;
+    if (strategy === 'edt') branch.setSamplerParams({ temperature: temp, topP: 0.9 });
+
+    const { token, isStop } = await branch.produce();
+    if (isStop) break;
 
     const text = ctx.tokenToText(token);
     emit('token', { strategy: strategyName, token, text, entropy, temp });
 
     output.push(token);
-    await ctx.decode([token], pos++, 0);
+    await branch.commit(token);
   }
 
-  // Clear KV cache for next run
-  await ctx.kvCacheClear();
+  await branch.prune();
 
   const text = await ctx.detokenize(output);
-  const avgEntropy = entropies.reduce((a, b) => a + b, 0) / entropies.length;
-  const avgTemp = temps.reduce((a, b) => a + b, 0) / temps.length;
+  const avgEntropy = entropies.length > 0 ? entropies.reduce((a, b) => a + b, 0) / entropies.length : 0;
+  const avgTemp = temps.length > 0 ? temps.reduce((a, b) => a + b, 0) / temps.length : 0;
 
   return { text, avgEntropy, avgTemp, tokenCount: output.length, temps, entropies };
 }
diff --git a/examples/grammar/README.md b/examples/grammar/README.md
index 4326511..57ac23a 100644
--- a/examples/grammar/README.md
+++ b/examples/grammar/README.md
@@ -1,6 +1,6 @@
-# Grammar-Constrained Generation with Pull Loop
+# Grammar-Constrained Generation with Branch Forking
 
-Demonstrates generator-based token streaming with grammar constraints and forkable state.
+Demonstrates grammar-constrained generation using the Branch API with automatic grammar cloning on fork.
 
 ## Run It
 
@@ -17,109 +17,61 @@ Generating until "city" field...
   "age": 30,
   "city":
 
-Saving KV cache and grammar state at branch point...
-
-Exploring 3 city branches:
+Forking into 3 branches at branch point...
 
   [NYC branch]: { "name": "John Doe", "age": 30, "city": "Seattle" }
   [LA branch]: { "name": "John Doe", "age": 30, "city": "Chicago" }
   [Chicago branch]: { "name": "John Doe", "age": 30, "city": "LA" }
 ```
 
-## The Pull Loop Pattern
+## The Branch Fork Pattern
 
-This example uses a **pull loop** via JS generators. The consumer requests tokens one at a time and decides when to stop:
+Grammar state is integrated into the branch and cloned automatically on fork:
 
 ```javascript
-function* tokenGenerator(ctx, grammarHandle, maxTokens = 100) {
-  for (let i = 0; i < maxTokens; i++) {
-    const logits = ctx.getLogits();
-    ctx.applySampler(grammarHandle, logits);
-
-    const token = ctx.sample({ temperature: 0.7 });
-    if (ctx.isStopToken(token)) return;
-
-    ctx.acceptSamplerToken(grammarHandle, token);
-
-    // Yield control back to caller
-    yield { token, text: ctx.tokenToText(token) };
-  }
+// Create root branch with grammar constraint
+const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
+const root = Branch.create(ctx, 0, params, undefined, grammar);
+await root.prefill(promptTokens);
+
+// Generate until branch point
+for (let i = 0; i < 100; i++) {
+  const { token, text, isStop } = await root.produce();
+  if (isStop) break;
+  await root.commit(token);
+  if (accumulated.includes('"city"')) break;
 }
-```
 
-Consumer decides when to continue:
+// Fork — grammar state cloned automatically
+for (const city of cities) {
+  const child = await root.fork();
+  child.reseedSampler(seed++);
 
-```javascript
-for (const { token, text } of gen) {
-  accumulated += text;
-  await ctx.decode([token], pos++, 0);
-
-  // Stop at decision point - generator pauses here
-  if (accumulated.includes('"city"')) {
-    break;  // Generator stays paused, state preserved
+  for await (const { text } of child) {
+    // Each branch generates independently with its own grammar state
   }
+  await child.prune();
 }
+await root.prune();
 ```
 
-## Why Pull Loop Here?
+## Why Branch Fork Here?
 
-For this branching use case, pull made the code simpler:
-
-```javascript
-// Stop when we see the branch point - just break
-for (const { token, text } of gen) {
-  accumulated += text;
-  if (accumulated.includes('"city"')) break;
-}
-// Generator paused mid-iteration, grammar state intact
-// Now save and branch
-```
+For grammar-constrained branching, fork handles everything atomically:
+- **KV cache**: Shared prefix, divergent-only storage per branch
+- **Grammar state**: Parser position cloned automatically
+- **Sampler chain**: Penalties and PRNG cloned; call `reseedSampler()` on each fork to diversify
 
-With a push loop you'd need callbacks or flags to signal "stop here" - doable, but the control flow is inverted. Pull keeps the branching logic linear and readable.
-
-## Branching Pattern
-
-1. **Generate** until decision point (pull loop pauses naturally)
-2. **Save** both KV cache and grammar state
-3. **Fork** for each branch exploration
-4. **Restore** and continue independently
-
-```javascript
-// Pause at branch point
-if (accumulated.includes('"city"')) break;
-
-// Save state
-const kvSnapshot = await ctx.kvCacheSave(0);
-const grammarSnapshot = ctx.cloneSampler(grammarHandle);
-
-// Explore branches
-for (const branch of branches) {
-  await ctx.kvCacheLoad(0, kvSnapshot);
-  const branchGrammar = ctx.cloneSampler(grammarSnapshot);
-
-  // Each branch continues independently
-  for (const { token, text } of tokenGenerator(ctx, branchGrammar)) {
-    // ...
-  }
-}
-```
+No manual KV save/load or grammar cloning needed — `fork()` is a single operation.
 
 ## Key APIs
 
 | Method | Description |
 |--------|-------------|
-| `getLogits()` | Get logits buffer (modified in-place by applySampler) |
-| `applySampler(handle, logits)` | Apply grammar constraints to logits |
-| `sample()` | Sample from modified logits |
-| `acceptSamplerToken(handle, id)` | Advance grammar parser state |
-| `createSampler(grammar)` | Create grammar handle |
-| `cloneSampler(handle)` | Clone grammar state for branching |
-| `kvCacheSave(seq)` / `kvCacheLoad(seq, buf)` | Snapshot/restore KV state |
-
-## Grammar + KV Travel Together
-
-For valid branching, fork **both**:
-- **KV cache**: Model's memory of what it has seen
-- **Grammar state**: Parser's position in the grammar
-
-Missing either causes invalid completions or grammar errors.
+| `Branch.create(ctx, pos, params, nBatch, grammar)` | Create branch with grammar constraint |
+| `branch.fork()` | Clone branch: KV prefix + grammar + sampler |
+| `branch.reseedSampler(seed)` | Diversify forked branch's PRNG |
+| `branch.produce()` | Sample grammar-valid token |
+| `branch.commit(token)` | Advance grammar + KV state |
+| `branch.prune()` | Clean up branch resources |
+| `ctx.jsonSchemaToGrammar(json)` | Convert JSON schema to GBNF grammar |
diff --git a/examples/grammar/grammar.mjs b/examples/grammar/grammar.mjs
index 1d7463b..6f96f2c 100644
--- a/examples/grammar/grammar.mjs
+++ b/examples/grammar/grammar.mjs
@@ -2,8 +2,9 @@
 /**
  * Grammar-constrained generation with forkable state
  *
- * Uses JS generators for backpressure - generation pauses at each yield,
- * allowing precise control over when to branch.
+ * Uses Branch API for grammar-constrained generation with tree branching.
+ * Grammar state is automatically cloned on fork(), so each branch can
+ * diverge independently while maintaining valid JSON output.
  *
  * Usage:
  *   node grammar.mjs [model-path]          # Human-readable output
@@ -12,7 +13,7 @@
 
 import * as path from 'node:path';
 import { fileURLToPath } from 'node:url';
-import { createContext } from '../../lib/index.js';
+import { createContext, Branch } from '../../lib/index.js';
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
@@ -32,27 +33,6 @@ function emit(event, data) {
   }
 }
 
-/**
- * Generator that yields tokens one at a time
- * Caller controls pace via next() - natural backpressure
- */
-function* tokenGenerator(ctx, grammarHandle, maxTokens = 100) {
-  for (let i = 0; i < maxTokens; i++) {
-    // Apply grammar constraints to context logits
-    const logits = ctx.getLogits();
-    ctx.applySampler(grammarHandle, logits);
-
-    const token = ctx.sample();
-    if (ctx.isStopToken(token)) return;
-
-    // Advance grammar state
-    ctx.acceptSamplerToken(grammarHandle, token);
-
-    // Yield token and text - caller decides when to continue
-    yield { token, text: ctx.tokenToText(token) };
-  }
-}
-
 async function main() {
   if (!jsonlMode) {
     console.log(`Loading model: ${path.basename(modelPath)}`);
@@ -89,16 +69,16 @@ async function main() {
     console.log(grammar.slice(0, 200) + '...\n');
   }
 
-  const grammarHandle = ctx.createSampler(grammar);
-
   const prompt = 'Generate a person as JSON:\n';
   if (!jsonlMode) {
     console.log(`Prompt: "${prompt}"`);
   }
 
   const tokens = await ctx.tokenize(prompt);
-  await ctx.decode(tokens, 0, 0);
-  let pos = tokens.length;
+
+  // Root branch with grammar constraint — grammar state cloned automatically on fork()
+  const root = Branch.create(ctx, 0, { temperature: 0.7, topP: 0.9 }, undefined, grammar);
+  await root.prefill(tokens);
 
   // ===== PHASE 1: Generate until we see "city" key =====
   if (!jsonlMode) {
@@ -106,19 +86,19 @@ async function main() {
     process.stdout.write('  ');
   }
 
-  const gen = tokenGenerator(ctx, grammarHandle);
-  const collectedTokens = [];
   let accumulated = '';
 
-  for (const { token, text } of gen) {
-    collectedTokens.push(token);
+  for (let i = 0; i < 100; i++) {
+    const { token, text, isStop } = await root.produce();
+    if (isStop) break;
+
     accumulated += text;
     if (!jsonlMode) {
       process.stdout.write(text);
     }
     emit('token', { phase: 'prefix', token, text });
 
-    await ctx.decode([token], pos++, 0);
+    await root.commit(token);
 
     // Stop when we see "city": - we want to branch here
     if (accumulated.includes('"city"')) {
@@ -129,55 +109,46 @@ async function main() {
     console.log('\n');
   }
 
-  // ===== PHASE 2: Save state for branching =====
-  if (!jsonlMode) {
-    console.log('Saving KV cache and grammar state at branch point...');
-  }
-  const kvSnapshot = await ctx.kvCacheSave(0);
-  const grammarSnapshot = ctx.cloneSampler(grammarHandle);
-  const branchPos = pos;
-
-  emit('branch_point', { prefix: accumulated, position: branchPos });
-
-  // ===== PHASE 3: Complete with different cities =====
+  // ===== PHASE 2: Fork and complete with different branches =====
   const cities = ['NYC', 'LA', 'Chicago'];
   if (!jsonlMode) {
-    console.log(`\nExploring ${cities.length} city branches:\n`);
+    console.log(`Forking into ${cities.length} branches at branch point...\n`);
   }
 
-  const branches = [];
-  for (const city of cities) {
-    // Restore KV cache
-    await ctx.kvCacheLoad(0, kvSnapshot);
-    pos = branchPos;
+  emit('branch_point', { prefix: accumulated, position: root.position });
 
-    // Fresh grammar clone for this branch
-    const branchGrammar = ctx.cloneSampler(grammarSnapshot);
+  const results = [];
+  for (const city of cities) {
+    const child = await root.fork();
+    child.reseedSampler(results.length + 42);
 
-    // Generate completion for this branch
-    const branchGen = tokenGenerator(ctx, branchGrammar, 30);
     let branchText = '';
+    for (let i = 0; i < 30; i++) {
+      const { token, text, isStop } = await child.produce();
+      if (isStop) break;
 
-    for (const { token, text } of branchGen) {
       branchText += text;
       emit('token', { phase: 'branch', city, token, text });
-      await ctx.decode([token], pos++, 0);
+
+      await child.commit(token);
     }
 
     const fullOutput = accumulated + branchText;
-    branches.push({ city, output: fullOutput });
+    results.push({ city, output: fullOutput });
 
     if (!jsonlMode) {
       console.log(`  [${city} branch]: ${fullOutput}`);
     }
     emit('branch_complete', { city, output: fullOutput });
 
-    ctx.freeSamplerHandle(branchGrammar);
+    await child.prune();
   }
 
+  await root.prune();
+
   // Validate JSON outputs
   let validJsonCount = 0;
-  for (const b of branches) {
+  for (const b of results) {
     try {
       JSON.parse(b.output);
       validJsonCount++;
@@ -187,14 +158,11 @@ async function main() {
   }
 
   emit('complete', {
-    branchCount: branches.length,
+    branchCount: results.length,
     validJsonCount,
-    branches: branches.map(b => ({ city: b.city, output: b.output })),
+    branches: results.map(b => ({ city: b.city, output: b.output })),
   });
 
-  // Cleanup
-  ctx.freeSamplerHandle(grammarHandle);
-  ctx.freeSamplerHandle(grammarSnapshot);
   ctx.dispose();
 
   if (!jsonlMode) {
diff --git a/examples/speculative/README.md b/examples/speculative/README.md
index 27e5537..7433c77 100644
--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@@ -1,6 +1,6 @@
-# Speculative Decoding with Forkable KV State
+# Speculative Decoding with Branch API
 
-Demonstrates the KV cache primitives needed for speculative decoding: draft, fork, verify, accept/reject.
+Demonstrates speculative decoding using the Branch primitive: fork a draft, verify, accept/reject, sample bonus token.
 
 ## Run It
 
@@ -31,56 +31,74 @@ Statistics
 
 | Phase | What Happens |
 |-------|--------------|
-| **1. DRAFT** | Generate N tokens greedily (fast, low quality ok) |
-| **2. FORK** | `kvSeqCopy(0, 1)` - copy KV state for verification |
-| **3. VERIFY** | Run target model on all N tokens (one batch) |
-| **4. ACCEPT** | Keep tokens where target agrees with draft |
-| **5. BONUS** | Sample one token from target at rejection point |
-| **6. CLEANUP** | `kvCacheRemove()` rejected tokens, repeat |
+| **1. MAIN** | Create main branch tracking committed state |
+| **2. FORK** | Fork draft branch (shares KV prefix with main) |
+| **3. DRAFT** | produce/commit N tokens on draft branch |
+| **4. VERIFY** | Check draft confidence (entropy threshold) |
+| **5. PRUNE** | Remove draft branch (cleans up divergent KV) |
+| **6. ACCEPT** | Commit accepted tokens to main branch |
+| **7. BONUS** | Sample one token from main at rejection point |
 
-## Key Pattern: Accept/Reject with KV Cleanup
+## Key Pattern: Fork/Draft/Verify with Branch API
 
 ```javascript
-// Draft N tokens on seq 0
-for (let i = 0; i < N; i++) {
-  const token = ctx.sample({ temperature: 0.0 });
-  await ctx.decode([token], pos++, 0);
-  drafts.push(token);
-}
-
-// Fork for verification
-ctx.kvSeqCopy(0, 1);
-
-// Verify (compare draft vs target distributions)
-const acceptedCount = verify(drafts);
-
-// Remove rejected tokens from KV cache
-if (acceptedCount < drafts.length) {
-  const rejectPos = startPos + acceptedCount;
-  await ctx.kvCacheRemove(0, rejectPos, -1);  // Critical!
-
-  // Sample bonus token from target at rejection point
-  const bonus = ctx.sample({ temperature: 0.7 });
-  await ctx.decode([bonus], rejectPos, 0);
+// Main branch tracks committed state
+const main = Branch.create(ctx, 0, { temperature: 0.7 });
+await main.prefill(promptTokens);
+
+while (output.length < maxTokens) {
+  // Fork draft from main — shares KV prefix
+  const draft = await main.fork();
+  draft.reseedSampler(iteration);
+
+  // Draft N tokens
+  const drafts = [];
+  for (let i = 0; i < N; i++) {
+    const entropy = ctx.modelEntropy('nats', draft.getLogits());
+    const { token, text, isStop } = draft.produceSync();
+    if (isStop) break;
+    drafts.push({ token, text, entropy });
+    await draft.commit(token);
+  }
+
+  // Verify and prune draft
+  const acceptedCount = verify(drafts);
+  await draft.prune();
+
+  // Commit accepted tokens to main
+  for (const d of drafts.slice(0, acceptedCount)) {
+    await main.commit(d.token);
+  }
+
+  // Bonus token from main at rejection point
+  if (acceptedCount < drafts.length) {
+    const { token } = main.produceSync();
+    await main.commit(token);
+  }
 }
+await main.prune();
 ```
 
-## Why Fork Before Verify?
+## Why Branch API?
 
-In real speculative decoding with two models:
-- Draft model: small, fast, generates candidates
-- Target model: large, slow, verifies quality
+The produce/commit separation is what makes speculative decoding natural:
 
-The fork lets you run the target model on seq 1 while keeping the draft state on seq 0. After verification, you collapse to the accepted prefix.
+- **produce()** / **produceSync()** sample without writing to KV — inspect before deciding
+- **commit()** accepts + decodes — advance state only for accepted tokens
+- **fork()** shares KV prefix — draft branch doesn't duplicate the prompt
+- **prune()** removes divergent KV — clean rejection without manual bookkeeping
 
 ## Key APIs
 
 | Method | Description |
 |--------|-------------|
-| `kvSeqCopy(src, dst)` | Fork KV cache (O(1) tag copy) |
-| `kvCacheRemove(seq, start, end)` | Remove token range from cache |
-| `modelEntropy('nats')` | Check draft confidence |
-| `nSeqMax` | Context option for multi-sequence |
+| `Branch.create(ctx, pos, params)` | Create branch at position |
+| `branch.fork()` | Fork: shared KV prefix + cloned sampler |
+| `branch.produce()` / `branch.produceSync()` | Sample without KV write (async / sync) |
+| `branch.commit(token)` | Accept + decode into KV |
+| `branch.prune()` | Remove divergent KV entries |
+| `branch.reseedSampler(seed)` | Diversify forked branch |
+| `ctx.modelEntropy('nats', logits)` | Check draft confidence |
 
 ## Accept Rate
 
diff --git a/examples/speculative/speculative.mjs b/examples/speculative/speculative.mjs
index ae709c2..cf927d7 100644
--- a/examples/speculative/speculative.mjs
+++ b/examples/speculative/speculative.mjs
@@ -104,15 +104,13 @@ async function main() {
     console.log(`\nPrompt: "${prompt}"`);
   }
 
-  // Prefill prompt
+  // Prefill prompt via main branch
   const promptTokens = await ctx.tokenize(prompt);
-  await ctx.decode(promptTokens, 0, 0);
 
-  // Create main branch — tracks committed state
-  const main = Branch.create(ctx, promptTokens.length, {
+  const main = Branch.create(ctx, 0, {
     temperature: 0.7, // For bonus token sampling
   });
-  main.captureLogits();
+  await main.prefill(promptTokens);
 
   const output = [];
   let totalDrafted = 0;
@@ -138,11 +136,11 @@ async function main() {
     const drafts = [];
 
     for (let i = 0; i < DRAFT_COUNT && output.length + drafts.length < GENERATION_LENGTH; i++) {
-      // Get entropy BEFORE sampling (from current logits)
-      const entropy = ctx.modelEntropy('nats');
+      // Get entropy BEFORE sampling (from draft branch's logits snapshot)
+      const entropy = ctx.modelEntropy('nats', draft.getLogits());
 
       // produce() samples from captured logits (no KV write yet)
-      const { token, text, isStop } = draft.produce();
+      const { token, text, isStop } = draft.produceSync();
 
       if (isStop) break;
 
@@ -191,7 +189,7 @@ async function main() {
     const rejected = drafts.slice(acceptedCount);
     if (rejected.length > 0) {
       // produce() samples from main's current logits (at rejection point)
-      const { token: bonusToken, text: bonusText, isStop } = main.produce();
+      const { token: bonusToken, text: bonusText, isStop } = main.produceSync();
 
       if (!isStop) {
         await main.commit(bonusToken);
diff --git a/examples/streaming/streaming-summary.mjs b/examples/streaming/streaming-summary.mjs
index 6de9f56..769b86c 100644
--- a/examples/streaming/streaming-summary.mjs
+++ b/examples/streaming/streaming-summary.mjs
@@ -13,6 +13,7 @@
  * - Sidecar mode: optional slim-summarize model for summarization (--sidecar)
  * - Outline detection with structural progress tracking
  * - Pattern matching (not instruction following) to guide continuation
+ * - Branch API for generation (produce/commit loop)
  *
  * After reseed, KV cache contains: [progress][tail]
  * - progress = minimal anchor + checklist of done/current sections + summary
@@ -25,7 +26,7 @@
 import * as fs from 'node:fs';
 import * as path from 'node:path';
 import { fileURLToPath } from 'node:url';
-import { createContext } from '../../lib/index.js';
+import { createContext, Branch } from '../../lib/index.js';
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
@@ -111,16 +112,17 @@ async function generateSummary(summaryCtx, text, options = {}) {
   }
 
   await summaryCtx.kvCacheClear();
-  await summaryCtx.decode(tokens, 0, 0);
+  const branch = Branch.create(summaryCtx, 0, { temperature: 0.3 });
+  await branch.prefill(tokens);
 
   let response = '';
-  let pos = tokens.length;
   for (let i = 0; i < maxTokens; i++) {
-    const token = summaryCtx.sample({ temperature: 0.3 });
-    if (summaryCtx.isStopToken(token)) break;
-    response += summaryCtx.tokenToText(token);
-    await summaryCtx.decode([token], pos++, 0);
+    const { token, text: t, isStop } = await branch.produce();
+    if (isStop) break;
+    response += t;
+    await branch.commit(token);
   }
+  await branch.prune();
 
   // Only parse slim-summarize Python-style list format
   return format === 'slim-summarize'
@@ -293,7 +295,9 @@ Begin:
     ? MAX_SINK_TOKENS
     : MAX_SINK_TOKENS - (anchorTokens?.length || 0);
 
-  await ctx.decode(promptTokens, 0, 0);
+  const samplingParams = { temperature: 0.8, topP: 0.9 };
+  let branch = Branch.create(ctx, 0, samplingParams);
+  await branch.prefill(promptTokens);
 
   if (!jsonlMode) {
     console.log(`\nContext size: ${nCtx}`);
@@ -305,8 +309,8 @@ Begin:
   }
 
   const allTokens = [...promptTokens];
-  const tracker = ctx.createPerplexityTracker();
-  let cachePos = promptTokens.length;
+  // Manual PPL tracking (persists across branch reseeds)
+  let nllSum = 0, nllCount = 0;
   let reseedCount = 0;
   let currentSegmentText = '';
   let allGeneratedText = '';
@@ -314,12 +318,9 @@ Begin:
   let pendingSummaryTokens = [];
 
   for (let t = 0; t < TARGET_TOKENS; t++) {
-    const token = ctx.sample({
-      temperature: 0.8,
-      topP: 0.9,
-    });
+    const { token, isStop } = await branch.produce();
 
-    if (ctx.isStopToken(token)) {
+    if (isStop) {
       if (!jsonlMode) {
         console.log('\n[EOS token reached]');
       }
@@ -327,8 +328,10 @@ Begin:
       break;
     }
 
-    const surprisal = ctx.modelSurprisal(token);
-    ctx.addSurprisal(tracker, surprisal);
+    const branchLogits = branch.getLogits();
+    const surprisal = ctx.modelSurprisal(token, 'nats', branchLogits);
+    nllSum += Math.max(0, surprisal);
+    nllCount++;
 
     const text = ctx.tokenToText(token);
     if (!jsonlMode) {
@@ -339,10 +342,10 @@ Begin:
     currentSegmentText += text;
     allGeneratedText += text;
     allTokens.push(token);
-    await ctx.decode([token], cachePos++, 0);
+    await branch.commit(token);
 
     // Cache full? Reseed with dynamic sinks
-    if (cachePos >= nCtx) {
+    if (branch.position >= nCtx) {
       // Estimate evicted portion of current segment only
       const tailCharsEstimate = TAIL_SIZE * 4;
       const evictedFromSegment = currentSegmentText.length > tailCharsEstimate
@@ -481,11 +484,16 @@ Begin:
       }
 
       const tail = allTokens.slice(-TAIL_SIZE);
-      await ctx.clearAndReseed(sinks, tail);
-      cachePos = sinks.length + TAIL_SIZE;
+
+      // Destroy current branch, clear KV, create fresh branch with re-prefill
+      await branch.prune();
+      await ctx.kvCacheClear();
+      branch = Branch.create(ctx, 0, samplingParams);
+      await branch.prefill([...sinks, ...tail]);
+
       reseedCount++;
 
-      const ppl = ctx.getPerplexity(tracker);
+      const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
       emit('reseed', {
         count: reseedCount,
         tokenIndex: t + 1,
@@ -509,8 +517,8 @@ Begin:
     }
   }
 
-  const finalPpl = ctx.getPerplexity(tracker);
-  ctx.freePerplexityTracker(tracker);
+  const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
+  await branch.prune();
 
   const generatedTokens = allTokens.length - promptTokens.length;
   const finalChain = summaries.join('\n');
diff --git a/examples/streaming/streaming-tsampler.mjs b/examples/streaming/streaming-tsampler.mjs
index 4fd3207..0d698bf 100644
--- a/examples/streaming/streaming-tsampler.mjs
+++ b/examples/streaming/streaming-tsampler.mjs
@@ -6,7 +6,8 @@
  * - TypeScript sampling via tsampler (TTA pattern)
  * - N-gram tracking to detect sequence repetition
  * - Logit steering to prevent repeated sequences
- * - clearAndReseed() for infinite context
+ * - Branch API for KV management (prefill/decodeAndCaptureOne)
+ * - KV cache clear + re-prefill for infinite context
  *
  * The key insight: llama.cpp's token-level penalties degrade prose quality.
  * Instead, we track N-grams at the app level and steer away from repeats.
@@ -18,7 +19,7 @@
 
 import * as path from 'node:path';
 import { fileURLToPath } from 'node:url';
-import { createContext } from '../../lib/index.js';
+import { createContext, Branch } from '../../lib/index.js';
 
 // Import tsampler from npm package
 import {
@@ -172,7 +173,6 @@ Begin:
   }
 
   const promptTokens = await ctx.tokenize(prompt);
-  await ctx.decode(promptTokens, 0, 0);
 
   // Track all generated tokens
   const allTokens = [...promptTokens];
@@ -201,15 +201,19 @@ Begin:
     process.stdout.write(prompt);
   }
 
-  const tracker = ctx.createPerplexityTracker();
-  let cachePos = promptTokens.length;
+  // Branch used purely for KV management — sampling done externally via tsampler
+  let branch = Branch.create(ctx, 0, { temperature: 0 });
+  await branch.prefill(promptTokens);
+
+  // Manual PPL tracking (persists across branch reseeds)
+  let nllSum = 0, nllCount = 0;
   let reseedCount = 0;
   let blockedCount = 0;
 
   for (let t = 0; t < TARGET_TOKENS; t++) {
-    // Get logits from native layer
-    const logitsBuffer = ctx.getLogits();
-    const logits = new Float32Array(logitsBuffer);
+    // Get logits from branch snapshot
+    const originalLogits = branch.getLogits();
+    const logits = new Float32Array(originalLogits);
 
     // N-gram deduplication: Check if we're about to repeat a sequence
     const blockedToken = ngramTracker.getBlockedToken();
@@ -244,9 +248,10 @@ Begin:
     // tokenHistory.accept(token); // Disabled - matching baseline
     ngramTracker.accept(token);
 
-    // Track surprisal
-    const surprisal = ctx.modelSurprisal(token);
-    ctx.addSurprisal(tracker, surprisal);
+    // Track surprisal from original (unmodified) logits
+    const surprisal = ctx.modelSurprisal(token, 'nats', originalLogits);
+    nllSum += Math.max(0, surprisal);
+    nllCount++;
 
     // Output token
     const text = ctx.tokenToText(token);
@@ -255,18 +260,23 @@ Begin:
     }
     emit('token', { index: t, token, text, surprisal, blocked: wasBlocked });
 
-    // Store and decode
+    // Store and advance KV (no sampler accept — we're using tsampler externally)
     allTokens.push(token);
-    await ctx.decode([token], cachePos++, 0);
+    await branch.decodeAndCaptureOne(token);
 
     // Cache full? Reseed at boundary
-    if (cachePos >= nCtx) {
+    if (branch.position >= nCtx) {
       const tail = allTokens.slice(-TAIL_SIZE);
-      await ctx.clearAndReseed(sinks, tail);
-      cachePos = sinks.length + TAIL_SIZE;
+
+      // Destroy current branch, clear KV, create fresh branch with re-prefill
+      await branch.prune();
+      await ctx.kvCacheClear();
+      branch = Branch.create(ctx, 0, { temperature: 0 });
+      await branch.prefill([...sinks, ...tail]);
+
       reseedCount++;
 
-      const ppl = ctx.getPerplexity(tracker);
+      const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
       const stats = ngramTracker.stats();
 
       emit('reseed', { count: reseedCount, tokenIndex: t + 1, ppl, blockedCount, uniqueNgrams: stats.uniqueNgrams });
@@ -277,15 +287,15 @@ Begin:
     }
 
     // Progress every 1000 tokens
-    if ((t + 1) % 1000 === 0 && cachePos < nCtx && !jsonlMode) {
+    if ((t + 1) % 1000 === 0 && branch.position < nCtx && !jsonlMode) {
       const stats = ngramTracker.stats();
       console.log(`\n  [${t + 1}/${TARGET_TOKENS} | Blocked repeats: ${blockedCount} | Unique ${NGRAM_SIZE}-grams: ${stats.uniqueNgrams}]`);
     }
   }
 
-  const finalPpl = ctx.getPerplexity(tracker);
+  const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
   const finalStats = ngramTracker.stats();
-  ctx.freePerplexityTracker(tracker);
+  await branch.prune();
 
   const generatedTokens = allTokens.length - promptTokens.length;
   emit('complete', {
diff --git a/examples/streaming/streaming.mjs b/examples/streaming/streaming.mjs
index 4bc52e8..96e4e20 100644
--- a/examples/streaming/streaming.mjs
+++ b/examples/streaming/streaming.mjs
@@ -8,15 +8,16 @@
  *
  * This example demonstrates:
  * - Generating tokens beyond context window limit
- * - clearAndReseed() for cache-local position reindexing
+ * - KV cache clear + re-prefill for cache-local position reindexing
  * - Per-token perplexity measurement across reseeds
+ * - Branch API for generation (produce/commit loop)
  *
  * Parameters from BlinkKV paper: 2048 context, 4 sinks, 256 tail
  */
 
 import * as path from 'node:path';
 import { fileURLToPath } from 'node:url';
-import { createContext } from '../../lib/index.js';
+import { createContext, Branch } from '../../lib/index.js';
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
@@ -80,7 +81,6 @@ Begin:
   }
 
   const promptTokens = await ctx.tokenize(prompt);
-  await ctx.decode(promptTokens, 0, 0);
 
   // Track all generated tokens (needed for reseeding)
   const allTokens = [...promptTokens];
@@ -97,22 +97,22 @@ Begin:
     process.stdout.write(prompt);
   }
 
-  const tracker = ctx.createPerplexityTracker();
-  let cachePos = promptTokens.length;
+  const samplingParams = { temperature: 0.8, topP: 0.9 };
+  let branch = Branch.create(ctx, 0, samplingParams);
+  await branch.prefill(promptTokens);
+
+  // Manual PPL tracking (persists across branch reseeds)
+  let nllSum = 0, nllCount = 0;
   let reseedCount = 0;
 
   for (let t = 0; t < TARGET_TOKENS; t++) {
-    // Sample next token
     // NOTE: Token-level repeat penalties are NOT used for long-form generation.
     // llama.cpp's penalty system penalizes individual tokens (not sequences),
     // which degrades prose quality over 100+ tokens as common words accumulate
     // in the penalty buffer. For sequence-level deduplication, use N-gram
     // tracking with logit steering (TTA pattern) instead.
-    const token = ctx.sample({
-      temperature: 0.8,
-      topP: 0.9,
-    });
-    if (ctx.isStopToken(token)) {
+    const { token, isStop } = await branch.produce();
+    if (isStop) {
       if (!jsonlMode) {
         console.log('\n[EOS token reached]');
       }
@@ -120,9 +120,11 @@ Begin:
       break;
     }
 
-    // Track surprisal
-    const surprisal = ctx.modelSurprisal(token);
-    ctx.addSurprisal(tracker, surprisal);
+    // Track surprisal from the logits used by produce()
+    const branchLogits = branch.getLogits();
+    const surprisal = ctx.modelSurprisal(token, 'nats', branchLogits);
+    nllSum += Math.max(0, surprisal);
+    nllCount++;
 
     // Output token
     const text = ctx.tokenToText(token);
@@ -131,18 +133,23 @@ Begin:
     }
     emit('token', { index: t, token, text, surprisal });
 
-    // Store token and decode
+    // Store token and commit (decode + capture new logits)
     allTokens.push(token);
-    await ctx.decode([token], cachePos++, 0);
+    await branch.commit(token);
 
     // Cache full? Reseed at boundary
-    if (cachePos >= nCtx) {
+    if (branch.position >= nCtx) {
       const tail = allTokens.slice(-TAIL_SIZE);
-      await ctx.clearAndReseed(sinks, tail);
-      cachePos = sinks.length + TAIL_SIZE;
+
+      // Destroy current branch, clear KV, create fresh branch with re-prefill
+      await branch.prune();
+      await ctx.kvCacheClear();
+      branch = Branch.create(ctx, 0, samplingParams);
+      await branch.prefill([...sinks, ...tail]);
+
       reseedCount++;
 
-      const ppl = ctx.getPerplexity(tracker);
+      const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
       emit('reseed', { count: reseedCount, tokenIndex: t + 1, ppl });
 
       if (!jsonlMode) {
@@ -156,8 +163,8 @@ Begin:
     }
   }
 
-  const finalPpl = ctx.getPerplexity(tracker);
-  ctx.freePerplexityTracker(tracker);
+  const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1;
+  await branch.prune();
 
   const generatedTokens = allTokens.length - promptTokens.length;
   emit('complete', { generatedTokens, reseeds: reseedCount, finalPpl });
diff --git a/lib/Branch.js b/lib/Branch.js
index 941937a..08fe4bf 100644
--- a/lib/Branch.js
+++ b/lib/Branch.js
@@ -151,7 +151,7 @@ class Branch {
    * this for external tokens (user input between turns), not model-generated
    * tokens. For model output, use commit() which does accept + decode.
    *
-   * This is the branch-level equivalent of ctx.decode().
+   * The primary way to feed tokens into a branch's KV cache.
    *
    * @param {number[]} tokens - Token IDs to decode
    * @returns {Promise<void>}
@@ -263,7 +263,7 @@ class Branch {
    * const blocked = computeNgramBlocks(generatedText);
    * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
    *
-   * const { token } = branch.produce();  // Blocked tokens won't be sampled
+   * const { token } = await branch.produce();  // Blocked tokens won't be sampled
    * await branch.commit(token);
    *
    * branch.clearSteer();  // Reset for next iteration
@@ -285,15 +285,57 @@ class Branch {
   }
 
   /**
-   * Sample the next token without advancing state
+   * Replace the sampler chain with new parameters (memoized)
+   *
+   * If the new params match the current chain's params, this is a no-op.
+   * Otherwise the old chain is freed and a new one is created.
+   *
+   * @param {SamplingParams} params - New sampling parameters
+   */
+  setSamplerParams(params) {
+    this._ensureNotDisposed();
+    this._ctx._branchSetSamplerParams(this._handle, params);
+  }
+
+  /**
+   * Replace or remove the grammar constraint
+   *
+   * Pass a GBNF grammar string to constrain generation, or empty string / null
+   * to remove the constraint. The grammar state is cloned on fork().
+   *
+   * @param {string|null} [grammarStr] - GBNF grammar string, or empty/null to remove
+   */
+  setGrammar(grammarStr) {
+    this._ensureNotDisposed();
+    this._ctx._branchSetGrammar(this._handle, grammarStr || '');
+  }
+
+  /**
+   * Sample the next token without advancing state (async)
    *
    * No KV write, no position update. Inspect the result before deciding
    * to commit() — this separation is what enables speculative verification
    * and conditional branching.
    *
+   * Async contract: local branches resolve immediately; cloud branches
+   * may perform an HTTP round-trip. Use produceSync() when you know the
+   * branch is local and want zero-overhead sampling.
+   *
+   * @returns {Promise<{ token: number, text: string, isStop: boolean }>}
+   */
+  async produce() {
+    return this.produceSync();
+  }
+
+  /**
+   * Sample the next token without advancing state (sync)
+   *
+   * Same as produce() but synchronous. Use when you know the branch is
+   * local and want to avoid the microtask overhead of a promise.
+   *
    * @returns {{ token: number, text: string, isStop: boolean }}
    */
-  produce() {
+  produceSync() {
     this._ensureNotDisposed();
     const token = this.sample();
     return {
@@ -383,7 +425,7 @@ class Branch {
    */
   async *[Symbol.asyncIterator]() {
     while (!this._disposed) {
-      const { token, text, isStop } = this.produce();
+      const { token, text, isStop } = await this.produce();
       if (isStop) return;
       await this.commit(token);
       yield { token, text };
diff --git a/lib/index.d.ts b/lib/index.d.ts
index b7d6711..0b1adcf 100644
--- a/lib/index.d.ts
+++ b/lib/index.d.ts
@@ -273,8 +273,6 @@ export interface FormatChatOptions {
   /**
    * Explicit GBNF grammar string for constrained generation.
    * Mutually exclusive with `jsonSchema`.
-   *
-   * @see {@link SessionContext.createSampler}
    */
   grammar?: string;
 
@@ -550,7 +548,7 @@ export interface AdvancedSamplingParams {
  *
  * Configures the sampler chain — a pipeline of composable filters and
  * transforms applied to raw logits before token selection. The chain is
- * built once at branch/context creation and persists across decode steps
+ * built once at branch creation and persists across decode steps
  * (penalty state accumulates, PRNG advances).
  *
  * **Chain order**: penalties → top_k → typical_p → top_p → min_p →
@@ -603,24 +601,29 @@ export interface SamplingParams {
  * Inference context — the runtime surface for a loaded model
  *
  * A SessionContext owns a llama_context (KV cache + compute graph) bound to a
- * shared model. All inference flows through this interface: tokenization,
- * forward passes, logit access, sampling, KV cache management, chat template
- * formatting, and embedding extraction.
+ * shared model. It provides tokenization, logit access, KV cache management,
+ * chat template formatting, and embedding extraction.
+ *
+ * **All generation flows through {@link Branch}.** Create a branch at position 0,
+ * prefill prompt tokens, then use the produce/commit loop or async iterator:
  *
- * The core generation loop is three steps, repeated:
- * 1. **decode()** — Feed tokens through the transformer, populating KV cache.
- * 2. **getLogits()** — Zero-copy view into the model's output distribution.
- * 3. **sample()** — Select the next token via the configured sampler chain.
+ * ```typescript
+ * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
+ * await branch.prefill(promptTokens);
+ * for await (const { token, text } of branch) {
+ *   process.stdout.write(text);
+ * }
+ * ```
  *
  * For tree-structured generation (best-of-N, beam search, speculative
- * decoding), use the {@link Branch} and {@link BranchStore} APIs instead —
- * they manage per-branch KV sequences, sampler chains, and logits snapshots
- * with O(1) GPU dispatches via batched decode.
+ * decoding), use {@link Branch.fork} and {@link BranchStore} — they manage
+ * per-branch KV sequences, sampler chains, and logits snapshots with O(1)
+ * GPU dispatches via batched decode.
  *
  * **Logits lifetime**: `getLogits()` returns a zero-copy Float32Array wrapping
  * llama.cpp's internal buffer. It is invalidated (ArrayBuffer detached) on
- * the next `decode()`, `encode()`, or `dispose()`. Use {@link withLogits} for
- * safe scoped access.
+ * the next `encode()` or `dispose()`. Use {@link withLogits} for safe scoped
+ * access. For branch-level logits, use {@link Branch.getLogits} instead.
  *
  * **KV cache**: Supports multi-sequence operation (`nSeqMax > 1`), per-sequence
  * copy/clear/eviction, file-based persistence, and context compression via
@@ -636,54 +639,17 @@ export interface SamplingParams {
  * @category Core
  */
 export interface SessionContext {
-  // ===== THE GENERATION LOOP =====
-
-  /**
-   * STEP 1: Process tokens through the model (forward pass)
-   *
-   * This feeds tokens through the transformer and updates the KV cache.
-   * After decoding, the model has "read" this text and is ready to predict.
-   *
-   * Think of this as: "the model reads your prompt"
-   *
-   * Why async? Model inference takes time (~45ms per token)
-   * Why position? Model needs to know where in conversation this text appears
-   *
-   * Cost: ~45ms per token (generation), ~120ms for 50 tokens (prompt)
-   *
-   * @param tokens Token IDs from tokenize()
-   * @param position Where these tokens start in the sequence
-   * @param seqId Sequence ID (default: 0)
-   * @example
-   * ```typescript
-   * const tokens = await ctx.tokenize("Hello world");
-   * await ctx.decode(tokens, 0);
-   * let position = tokens.length;
-   *
-   * // Generate next token
-   * await ctx.decode([nextToken], position++);
-   *
-   * // Multi-sequence: decode to different sequences
-   * await ctx.decode(tokens, 0, 0);  // Sequence 0
-   * await ctx.decode(tokens, 0, 1);  // Sequence 1
-   * ```
-   */
-  decode(tokens: number[], position: number, seqId?: number): Promise<void>;
 
   /**
-   * STEP 2: Get logits (zero-copy view into model memory)
+   * Get logits (zero-copy view into model memory)
    *
    * Returns unnormalized scores for every possible next token.
    * Higher score = model thinks this token is more likely.
    * The returned Float32Array wraps llama.cpp's internal buffer directly
-   * (zero-copy). It is mutable — you can write to it for custom sampling
-   * (e.g., setting banned tokens to -Infinity before calling sample()).
-   *
-   * Memoized per decode step: calling getLogits() twice between decodes
-   * returns the same Float32Array backed by the same ArrayBuffer.
+   * (zero-copy).
    *
    * LIFETIME CONSTRAINTS:
-   * - Valid ONLY until the next decode(), encode(), or dispose() call
+   * - Valid ONLY until the next encode() or dispose() call
    * - The ArrayBuffer is detached on invalidation — accessing a stale
    *   buffer throws a TypeError
    * - DO NOT retain references across async boundaries
@@ -697,60 +663,9 @@ export interface SessionContext {
    * Cost: ~0.5ms (zero-copy pointer, no data copied)
    *
    * @returns Float32Array of unnormalized logits (vocabSize elements)
-   * @example
-   * ```typescript
-   * await ctx.decode(tokens, 0);
-   *
-   * // Read logits for analysis
-   * const logits = ctx.getLogits();
-   * const entropy = ctx.modelEntropy("bits", logits);
-   *
-   * // Or modify in-place for custom sampling
-   * logits[BANNED_TOKEN] = -Infinity;
-   * const token = ctx.sample({ temperature: 0.7 });
-   *
-   * // Next decode invalidates the buffer
-   * await ctx.decode([token], position++);
-   * // logits is now DETACHED - do not access!
-   * ```
    */
   getLogits(): Float32Array;
 
-  /**
-   * STEP 3: Sample a token from logits
-   *
-   * Converts raw logits into a token decision using:
-   * - Temperature: controls randomness
-   * - Top-K/Top-P: filters unlikely tokens
-   * - Repeat/frequency/presence penalties (tracked across calls)
-   *
-   * NOTE: Grammar constraints are NOT applied by sample(). For grammar-
-   * constrained generation, use the handle-based API (createSampler /
-   * applySampler) or the Branch API which integrates grammar natively.
-   *
-   * This is where generation strategy happens.
-   *
-   * Cost: ~0.1ms (native sampling)
-   *
-   * @param params Sampling strategy (greedy if omitted)
-   * @returns Selected token ID
-   * @example
-   * ```typescript
-   * // Greedy (always pick most likely)
-   * const token = ctx.sample();
-   *
-   * // Creative generation
-   * const token = ctx.sample({ temperature: 0.9 });
-   *
-   * // Constrained to valid JSON (handle-based API)
-   * const grammarHandle = ctx.createSampler(grammar);
-   * ctx.applySampler(grammarHandle, ctx.getLogits());
-   * const token = ctx.sample({ temperature: 0.7 });
-   * ctx.acceptSamplerToken(grammarHandle, token);
-   * ```
-   */
-  sample(params?: SamplingParams): number;
-
   /**
    * Convert token ID to text piece
    *
@@ -762,20 +677,8 @@ export interface SessionContext {
    *
    * Cost: ~0.05ms
    *
-   * @param token Token ID from sample()
+   * @param token Token ID
    * @returns Text string for this token
-   * @example
-   * ```typescript
-   * while (true) {
-   *   const token = ctx.sample({ temperature: 0.8 });
-   *   if (ctx.isStopToken(token)) break;
-   *
-   *   const text = ctx.tokenToText(token);
-   *   process.stdout.write(text); // Stream to output
-   *
-   *   await ctx.decode([token], position++);
-   * }
-   * ```
    */
   tokenToText(token: number): string;
 
@@ -795,14 +698,6 @@ export interface SessionContext {
    * Cost: <0.01ms (fast vocabulary lookup)
    *
    * @param token Token ID to check
-   * @example
-   * ```typescript
-   * const token = ctx.sample();
-   * if (ctx.isStopToken(token)) {
-   *   console.log('Generation complete');
-   *   break;
-   * }
-   * ```
    */
   isStopToken(token: number): boolean;
 
@@ -910,14 +805,6 @@ export interface SessionContext {
    *
    * @param sequenceId Sequence ID (defaults to 0 for single conversation)
    * @returns Highest position index, or -1 if empty
-   * @example
-   * ```typescript
-   * const tokens = await ctx.tokenize("Hello world");
-   * await ctx.decode(tokens, 0);
-   *
-   * const maxPos = ctx.kvCacheSize(0);
-   * console.log(`${maxPos + 1} tokens in cache`);
-   * ```
    */
   kvCacheSize(sequenceId?: number): number;
 
@@ -938,18 +825,6 @@ export interface SessionContext {
    * @param sequenceId Sequence ID (use 0 for single sequence)
    * @param start Start position (inclusive)
    * @param end End position (exclusive), -1 = to end
-   * @example
-   * ```typescript
-   * // Remove old tokens to stay under context limit
-   * const currentLength = ctx.kvCacheSize(0);
-   * if (currentLength > 2000) {
-   *   // Remove oldest 500 tokens
-   *   await ctx.kvCacheRemove(0, 0, 500);
-   *
-   *   // THEN decode new tokens
-   *   await ctx.decode(newTokens, currentLength - 500);
-   * }
-   * ```
    */
   kvCacheRemove(sequenceId: number, start: number, end: number): Promise<void>;
 
@@ -968,17 +843,6 @@ export interface SessionContext {
    *
    * @param sequenceId Sequence ID (use 0 for single sequence)
    * @returns Serialized state buffer
-   * @example
-   * ```typescript
-   * // Save state before risky operation
-   * const snapshot = await ctx.kvCacheSave(0);
-   *
-   * // Try something
-   * await ctx.decode(riskyTokens, position);
-   *
-   * // Didn't work - restore previous state
-   * await ctx.kvCacheLoad(0, snapshot);
-   * ```
    */
   kvCacheSave(sequenceId?: number): Promise<Buffer>;
 
@@ -1014,14 +878,6 @@ export interface SessionContext {
    *
    * Cost: ~1ms
    *
-   * @example
-   * ```typescript
-   * // Start fresh conversation
-   * await ctx.kvCacheClear();
-   *
-   * const tokens = await ctx.tokenize("New conversation");
-   * await ctx.decode(tokens, 0);
-   * ```
    */
   kvCacheClear(): Promise<void>;
 
@@ -1113,19 +969,6 @@ export interface SessionContext {
    * @param dstSeqId Destination sequence to copy to
    * @param p0 Start position (must be 0, default: 0)
    * @param p1 End position (must be -1 for full copy, default: -1)
-   * @example
-   * ```typescript
-   * // Decode shared prompt to seq 0
-   * await ctx.decode(promptTokens, 0);
-   *
-   * // Fork to seq 1 and seq 2 (metadata-only, instant)
-   * ctx.kvSeqCopy(0, 1);
-   * ctx.kvSeqCopy(0, 2);
-   *
-   * // Divergent generation — only new tokens allocate KV entries
-   * await ctx.decode([tokenA], position, 1);
-   * await ctx.decode([tokenB], position, 2);
-   * ```
    */
   kvSeqCopy(srcSeqId: number, dstSeqId: number, p0?: number, p1?: number): void;
 
@@ -1162,91 +1005,6 @@ export interface SessionContext {
    */
   kvSeqPosMax(seqId: number): number;
 
-  // ===== HANDLE-BASED GRAMMAR =====
-
-  /**
-   * Create a new grammar sampler (returns handle)
-   *
-   * Creates an independent grammar sampler instance with its own state.
-   * Returns a handle that can be used with applySampler/acceptSamplerToken.
-   * Multiple handles can coexist with independent parser states.
-   *
-   * Cost: ~0.1-1ms depending on grammar complexity
-   *
-   * @param grammarStr GBNF grammar string
-   * @returns Handle to the created sampler
-   * @example
-   * ```typescript
-   * const grammarHandle = ctx.createSampler(jsonGrammar);
-   *
-   * // Apply grammar constraints to logits
-   * ctx.applySampler(grammarHandle, logitsBuffer);
-   * ctx.acceptSamplerToken(grammarHandle, token);
-   *
-   * // Create independent copy with same grammar
-   * const clonedHandle = ctx.cloneSampler(grammarHandle);
-   *
-   * // Cleanup when done
-   * ctx.freeSamplerHandle(grammarHandle);
-   * ctx.freeSamplerHandle(clonedHandle);
-   * ```
-   */
-  createSampler(grammarStr: string): number;
-
-  /**
-   * Apply grammar constraints using handle-based sampler
-   *
-   * Masks invalid tokens with -Infinity based on parser state.
-   * Modifies the logits buffer in-place.
-   *
-   * @param handle Sampler handle from createSampler()
-   * @param logitsBuffer ArrayBuffer or TypedArray containing logits
-   */
-  applySampler(handle: number, logitsBuffer: ArrayBuffer | Float32Array): void;
-
-  /**
-   * Accept token to advance grammar parser state (handle-based)
-   *
-   * Must be called after sampling to advance the grammar parser.
-   *
-   * @param handle Sampler handle from createSampler()
-   * @param tokenId Token that was sampled
-   */
-  acceptSamplerToken(handle: number, tokenId: number): void;
-
-  /**
-   * Clone a grammar sampler
-   *
-   * Creates a copy of the sampler with identical parser state.
-   * Both handles can then be used independently with their own state.
-   *
-   * @param handle Sampler handle to clone
-   * @returns New handle to cloned sampler
-   * @example
-   * ```typescript
-   * const original = ctx.createSampler(jsonGrammar);
-   * ctx.acceptSamplerToken(original, openBrace);
-   *
-   * // Clone preserves parser state (already accepted openBrace)
-   * const copy = ctx.cloneSampler(original);
-   *
-   * // Both can now continue independently
-   * ctx.acceptSamplerToken(original, tokenA);
-   * ctx.acceptSamplerToken(copy, tokenB);
-   * ```
-   */
-  cloneSampler(handle: number): number;
-
-  /**
-   * Free a grammar sampler handle
-   *
-   * Releases memory for the specified sampler.
-   * Handle becomes invalid after this call.
-   *
-   * @param handle Sampler handle to free
-   */
-  freeSamplerHandle(handle: number): void;
-
   // ===== METRICS API =====
 
   /**
@@ -1256,30 +1014,18 @@ export interface SessionContext {
    * - Low surprisal: Model expected this token (high probability)
    * - High surprisal: Model didn't expect this token (low probability)
    *
-   * Call after decode() to compute surprisal for any token based on
-   * the current logits distribution, or pass captured logits for
-   * offline computation (e.g., best-of-n scoring from prefill logits).
+   * Pass captured logits (e.g., from {@link Branch.getLogits}) for
+   * offline computation, or omit to use the current context logits.
    *
    * @param pickedTokenId - Token ID to compute surprisal for
    * @param base - Logarithm base: "nats" (default) or "bits"
    * @param logits - Optional Float32Array of logits (uses current context logits if omitted)
    * @returns Surprisal value in specified base
    *
-   * @example Current context logits (default)
-   * ```typescript
-   * await ctx.decode(tokens, position);
-   * const token = ctx.sample();
-   * const surprisal = ctx.modelSurprisal(token, "bits");
-   * console.log(`Model surprise: ${surprisal.toFixed(2)} bits`);
-   * ```
-   *
-   * @example Captured/arbitrary logits (for best-of-n, verification, etc.)
+   * @example With branch logits
    * ```typescript
-   * // Capture logits after prefill
-   * const capturedLogits = new Float32Array(ctx.getLogits());
-   *
-   * // Later: compute surprisal from captured logits
-   * const surprisal = ctx.modelSurprisal(token, "nats", capturedLogits);
+   * const { token } = await branch.produce();
+   * const surprisal = ctx.modelSurprisal(token, "bits", branch.getLogits());
    * ```
    *
    * COST: O(n_vocab) - softmax normalization required
@@ -1293,181 +1039,22 @@ export interface SessionContext {
    * - Low entropy: Model is confident (peaked distribution)
    * - High entropy: Model is uncertain (flat distribution)
    *
-   * Call after decode() to analyze the current prediction distribution,
-   * or pass captured logits for offline analysis.
+   * Pass captured logits (e.g., from {@link Branch.getLogits}) for
+   * offline analysis, or omit to use the current context logits.
    *
    * @param base - Logarithm base: "nats" (default) or "bits"
    * @param logits - Optional Float32Array of logits (uses current context logits if omitted)
    * @returns Entropy value in specified base
    *
-   * @example Current context logits (default)
+   * @example With branch logits
    * ```typescript
-   * await ctx.decode(tokens, position);
-   * const entropy = ctx.modelEntropy("bits");
-   * if (entropy > 5.0) {
-   *   console.log("Model is very uncertain - consider adjusting parameters");
-   * }
-   * ```
-   *
-   * @example Captured/arbitrary logits
-   * ```typescript
-   * const capturedLogits = new Float32Array(ctx.getLogits());
-   * const entropy = ctx.modelEntropy("nats", capturedLogits);
+   * const entropy = ctx.modelEntropy("bits", branch.getLogits());
    * ```
    *
    * COST: O(n_vocab) - must sum over all token probabilities
    */
   modelEntropy(base?: 'nats' | 'bits', logits?: Float32Array): number;
 
-  /**
-   * Create a new perplexity tracker.
-   *
-   * @returns Integer handle to the tracker
-   *
-   * @example
-   * ```typescript
-   * const tracker = ctx.createPerplexityTracker();
-   *
-   * // Add surprisals during generation
-   * for (let i = 0; i < tokens.length; i++) {
-   *   const surprisal = ctx.modelSurprisal(tokens[i]);
-   *   ctx.addSurprisal(tracker, surprisal);
-   * }
-   *
-   * const ppl = ctx.getPerplexity(tracker);
-   * console.log(`Sequence perplexity: ${ppl.toFixed(2)}`);
-   *
-   * ctx.freePerplexityTracker(tracker);
-   * ```
-   */
-  createPerplexityTracker(): number;
-
-  /**
-   * Add a surprisal value to the rolling tracker.
-   *
-   * @param handle - Tracker handle from createPerplexityTracker()
-   * @param surprisal - Surprisal value (from modelSurprisal or computed)
-   *
-   * @example
-   * ```typescript
-   * const surprisal = ctx.modelSurprisal(tokenId, "nats");
-   * ctx.addSurprisal(tracker, surprisal);
-   * ```
-   *
-   * COST: O(1) - numerically stable accumulation
-   * THREAD-SAFETY: Not thread-safe (handle is session-local)
-   */
-  addSurprisal(handle: number, surprisal: number): void;
-
-  /**
-   * Get current perplexity value.
-   *
-   * @param handle - Tracker handle
-   * @returns Perplexity = exp(average_surprisal_in_nats)
-   *
-   * @example
-   * ```typescript
-   * const ppl = ctx.getPerplexity(tracker);
-   * console.log(`Current PPL: ${ppl.toFixed(2)}`);
-   * ```
-   *
-   * FORMULA: PPL = exp(sum_surprisals / count)
-   * RANGE: [1, ∞) where 1 = perfect prediction
-   */
-  getPerplexity(handle: number): number;
-
-  /**
-   * Clone a perplexity tracker (for fork/branch scenarios).
-   *
-   * @param sourceHandle - Handle to clone from
-   * @returns New handle with same accumulated state
-   *
-   * @example
-   * ```typescript
-   * // Branch A and B start from same base perplexity
-   * const baseTracker = ctx.createPerplexityTracker();
-   * // ... accumulate base surprisals ...
-   *
-   * const branchA = ctx.clonePerplexityTracker(baseTracker);
-   * const branchB = ctx.clonePerplexityTracker(baseTracker);
-   *
-   * // Branch A and B now track independently
-   * ctx.addSurprisal(branchA, surprisalA);
-   * ctx.addSurprisal(branchB, surprisalB);
-   * ```
-   */
-  clonePerplexityTracker(sourceHandle: number): number;
-
-  /**
-   * Reset tracker to initial state (count=0, sum=0).
-   *
-   * @param handle - Tracker handle to reset
-   *
-   * @example
-   * ```typescript
-   * // Reuse tracker for multiple sequences
-   * const tracker = ctx.createPerplexityTracker();
-   *
-   * for (const sequence of sequences) {
-   *   ctx.resetPerplexityTracker(tracker);
-   *   // ... process sequence ...
-   *   const ppl = ctx.getPerplexity(tracker);
-   * }
-   * ```
-   */
-  resetPerplexityTracker(handle: number): void;
-
-  /**
-   * Get number of tokens tracked.
-   *
-   * @param handle - Tracker handle
-   * @returns Number of surprisal values added
-   */
-  getPerplexityCount(handle: number): number;
-
-  /**
-   * Free perplexity tracker resources.
-   *
-   * @param handle - Tracker handle to free
-   *
-   * NOTE: Auto-freed in dispose() if not manually freed
-   */
-  freePerplexityTracker(handle: number): void;
-
-  // ===== ATOMIC DECODE+CAPTURE =====
-
-  /**
-   * Decode tokens and capture logits atomically
-   *
-   * Performs decode and logits capture as a single atomic operation,
-   * ensuring the captured logits correspond exactly to the decoded tokens.
-   *
-   * Use this instead of separate decode() + getLogits() calls when
-   * you need guaranteed consistency between decode and logits capture.
-   *
-   * @param tokens Token IDs to decode
-   * @param position Start position in sequence
-   * @param seqId Sequence ID
-   * @param destBuffer Pre-allocated buffer to receive logits (vocabSize floats)
-   * @example
-   * ```typescript
-   * // Pre-allocate buffer (reuse across calls)
-   * const logitsBuffer = new Float32Array(ctx.vocabSize);
-   *
-   * // Atomic decode + capture
-   * await ctx.decodeAndCapture([token], position, seqId, logitsBuffer);
-   *
-   * // Safe to process logitsBuffer - it's an independent copy
-   * const nextToken = sampleFromLogits(logitsBuffer);
-   * ```
-   */
-  decodeAndCapture(
-    tokens: number[],
-    position: number,
-    seqId: number,
-    destBuffer: ArrayBuffer | Float32Array
-  ): Promise<void>;
-
   // ===== KV CACHE FILE PERSISTENCE =====
 
   /**
@@ -1528,26 +1115,8 @@ export interface SessionContext {
    * ]));
    *
    * const tokens = await ctx.tokenize(result.prompt);
-   * await ctx.decode(tokens, 0);
-   * ```
-   *
-   * @example With tools
-   * ```typescript
-   * const tools = [{ type: 'function', function: {
-   *   name: 'get_weather', description: 'Get weather',
-   *   parameters: { type: 'object', properties: { location: { type: 'string' } } }
-   * }}];
-   * const result = await ctx.formatChat(JSON.stringify(messages), {
-   *   tools: JSON.stringify(tools),
-   *   toolChoice: 'auto'
-   * });
-   * // result.grammar contains GBNF for constrained tool call generation
-   * // result.format identifies the chat format for output parsing
-   * ```
-   *
-   * @example Backward compatible (string as second arg)
-   * ```typescript
-   * const result = await ctx.formatChat(messagesJson, templateOverrideString);
+   * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
+   * await branch.prefill(tokens);
    * ```
    */
   formatChat(
@@ -1601,12 +1170,11 @@ export interface SessionContext {
    *   messages.push({ role: 'user', content: userContent });
    *
    *   if (!branch) {
-   *     // Cold path: format full conversation, tokenize with BOS, decode all
+   *     // Cold path: format full conversation, tokenize with BOS, prefill
    *     fmt = await ctx.formatChat(JSON.stringify(messages));
    *     const tokens = await ctx.tokenize(fmt.prompt);
-   *     await ctx.decode(tokens, 0, 0);
-   *     branch = Branch.create(ctx, tokens.length, { temperature: 0.7 });
-   *     branch.captureLogits();
+   *     branch = Branch.create(ctx, 0, { temperature: 0.7 });
+   *     await branch.prefill(tokens);
    *   } else {
    *     // Warm path: string-diff for delta tokens
    *     const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
@@ -1621,7 +1189,7 @@ export interface SessionContext {
    *   // Generate
    *   let rawOutput = '';
    *   while (true) {
-   *     const { token, text, isStop } = branch.produce();
+   *     const { token, text, isStop } = await branch.produce();
    *     if (isStop) break;
    *     rawOutput += text;
    *     await branch.commit(token);
@@ -1653,7 +1221,7 @@ export interface SessionContext {
    * Convert JSON schema to GBNF grammar
    *
    * Generates grammar string for constrained JSON generation.
-   * Use with createSampler() for grammar-constrained generation.
+   * Use with the grammar parameter of {@link Branch.create} for constrained generation.
    *
    * Cost: ~1-10ms depending on schema complexity
    *
@@ -1671,7 +1239,7 @@ export interface SessionContext {
    * };
    *
    * const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
-   * const handle = ctx.createSampler(grammar);
+   * const branch = Branch.create(ctx, 0, params, undefined, grammar);
    * ```
    */
   jsonSchemaToGrammar(schemaJson: string): Promise<string>;
@@ -1779,18 +1347,6 @@ export interface SessionContext {
    */
   hasPooling(): boolean;
 
-  // ===== NATIVE REFERENCE IMPLEMENTATIONS =====
-
-  /**
-   * Sample greedily from current logits
-   *
-   * Selects token with highest logit value (deterministic).
-   * Equivalent to sample() with temperature=0.
-   *
-   * @returns Token ID with highest probability
-   */
-  greedySample(): number;
-
   // ===== PROPERTIES =====
 
   /**
@@ -1877,6 +1433,12 @@ export interface SessionContext {
   /** @internal Clear all dynamic logit biases from a branch */
   _branchClearSteer(handle: number): void;
 
+  /** @internal Replace sampler chain with new parameters (memoized) */
+  _branchSetSamplerParams(handle: number, params: SamplingParams): void;
+
+  /** @internal Replace or remove grammar constraint */
+  _branchSetGrammar(handle: number, grammarStr: string): void;
+
   // ===== STORE API (internal, wrapped by BranchStore) =====
 
   /** @internal Batched accept + decode_each + capture for N branches */
@@ -1905,7 +1467,6 @@ export interface SessionContext {
  * - KV cache: `nCtx * 2 * nLayers * dHead` bytes per KV type (fp16 default).
  *   For a 7B model with `nCtx: 4096`, expect ~1-2 GB of KV memory.
  * - Compute scratch: temporary buffers for the forward pass, sized to `nBatch`.
- * - Sampler state: penalty tracking window, PRNG state.
  *
  * **Model sharing:** If two contexts use the same `modelPath`, the model
  * weights are loaded once and shared. Only the KV cache and compute buffers
@@ -1926,8 +1487,9 @@ export interface SessionContext {
  *
  * try {
  *   const tokens = await ctx.tokenize("Hello");
- *   await ctx.decode(tokens, 0);
- *   const token = ctx.sample({ temperature: 0.7 });
+ *   const branch = Branch.create(ctx, 0, { temperature: 0.7 });
+ *   await branch.prefill(tokens);
+ *   for await (const { text } of branch) process.stdout.write(text);
  * } finally {
  *   ctx.dispose();
  * }
@@ -2015,56 +1577,16 @@ export function loadBinary(variant?: GpuVariant): {
  * The callback MUST NOT:
  * - Store the logits reference
  * - Return a Promise (will throw)
- * - Call decode() (would invalidate logits)
  *
  * This prevents common bugs where logits become invalid due to
  * async operations between access and usage.
  *
- * How it works:
- * - Memoization: Multiple getLogits() calls in same step return same buffer
- * - Revocation: Next decode() invalidates previous buffer
- *
  * @template T Return type of the callback
  * @param ctx The session context
  * @param fn Synchronous callback that uses logits - must not return a Promise
  * @returns The result from the callback
  * @throws Error if callback returns a Promise (async usage not allowed)
  *
- * @example Safe synchronous usage
- * ```typescript
- * // Compute entropy synchronously
- * const entropy = withLogits(ctx, (logits) => {
- *   let maxLogit = logits[0];
- *   for (let i = 1; i < logits.length; i++) {
- *     if (logits[i] > maxLogit) maxLogit = logits[i];
- *   }
- *
- *   let sumExp = 0;
- *   for (let i = 0; i < logits.length; i++) {
- *     sumExp += Math.exp(logits[i] - maxLogit);
- *   }
- *
- *   let entropy = 0;
- *   for (let i = 0; i < logits.length; i++) {
- *     const p = Math.exp(logits[i] - maxLogit) / sumExp;
- *     if (p > 0) entropy -= p * Math.log(p);
- *   }
- *   return entropy;
- * });
- *
- * // Now safe to decode (previous logits buffer is revoked)
- * await ctx.decode([nextToken], position++);
- * ```
- *
- * @example Error: async callback
- * ```typescript
- * // This will throw!
- * withLogits(ctx, async (logits) => {
- *   await something();  // NOT ALLOWED
- *   return logits[0];
- * });
- * ```
- *
  * @category Core
  */
 export function withLogits<T>(
@@ -2208,7 +1730,7 @@ export class Branch {
    * tokens (user input between turns), not model-generated tokens.
    * For model output, use `commit()` which does accept + decode.
    *
-   * Branch-level equivalent of `ctx.decode()`.
+   * The primary way to feed tokens into a branch's KV cache.
    *
    * @param tokens - Token IDs to decode
    */
@@ -2281,7 +1803,7 @@ export class Branch {
    * // Block those tokens for this sample only
    * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
    *
-   * const { token } = branch.produce();  // Blocked tokens won't be sampled
+   * const { token } = await branch.produce();  // Blocked tokens won't be sampled
    * await branch.commit(token);
    *
    * // Clear for next iteration (recompute based on new history)
@@ -2300,7 +1822,7 @@ export class Branch {
    *   // Penalize sibling choices to encourage diversity
    *   beam.branch.steer(siblingTokens.map(t => ({ token: t, bias: -2.0 })));
    *
-   *   const { token } = beam.branch.produce();
+   *   const { token } = await beam.branch.produce();
    *   await beam.branch.commit(token);
    *   beam.lastToken = token;
    *   beam.branch.clearSteer();
@@ -2332,7 +1854,7 @@ export class Branch {
    *   const blocked = computeConstraints(generatedTokens);
    *   branch.steer(blocked.map(t => ({ token: t, bias: -Infinity })));
    *
-   *   const { token, isStop } = branch.produce();
+   *   const { token, isStop } = await branch.produce();
    *   if (isStop) break;
    *
    *   await branch.commit(token);
@@ -2343,8 +1865,60 @@ export class Branch {
    */
   clearSteer(): void;
 
-  /** Sample next token without advancing state. Inspect before committing. */
-  produce(): Produced;
+  /**
+   * Replace the sampler chain with new parameters (memoized)
+   *
+   * If the new params match the current chain's params, this is a no-op.
+   * Otherwise the old chain is freed and a new one is created. Use for
+   * Entropy-Driven Temperature (EDT) and other adaptive sampling strategies
+   * that adjust parameters per-step.
+   *
+   * @param params - New sampling parameters
+   *
+   * @example Entropy-Driven Temperature
+   * ```typescript
+   * const entropy = ctx.modelEntropy('nats', branch.getLogits());
+   * branch.setSamplerParams({ temperature: edtTemperature(entropy) });
+   * const { token } = await branch.produce();
+   * await branch.commit(token);
+   * ```
+   */
+  setSamplerParams(params: SamplingParams): void;
+
+  /**
+   * Replace or remove the grammar constraint
+   *
+   * Pass a GBNF grammar string to constrain generation. Pass empty string
+   * or undefined to remove the constraint. The grammar state is cloned on
+   * fork(), so sibling branches can diverge independently after hot-swap.
+   *
+   * @param grammarStr - GBNF grammar string, or empty/undefined to remove
+   *
+   * @example Hot-swap grammar mid-generation
+   * ```typescript
+   * // Start unconstrained, then switch to JSON after detecting tool call
+   * branch.setGrammar(jsonGrammar);
+   * const { token } = await branch.produce();
+   * ```
+   */
+  setGrammar(grammarStr?: string): void;
+
+  /**
+   * Sample next token without advancing state (async)
+   *
+   * Async contract: local branches resolve immediately; cloud branches
+   * may perform an HTTP round-trip. Use {@link produceSync} when you know
+   * the branch is local and want zero-overhead sampling.
+   */
+  produce(): Promise<Produced>;
+
+  /**
+   * Sample next token without advancing state (sync)
+   *
+   * Same as {@link produce} but synchronous. Use when you know the branch
+   * is local and want to avoid the microtask overhead of a promise.
+   */
+  produceSync(): Produced;
 
   /**
    * Accept and decode — update branch state, then write token to KV
@@ -2428,9 +2002,9 @@ export class Branch {
  * Packs N tokens into a single batch via `decode_each` (one row per sequence,
  * all at their respective positions). Single `llama_decode()` call. Logits
  * captured per-branch at batch index `i`. O(N) total work, O(1) GPU
- * dispatches, O(1) amortized dispatch overhead per branch. Post-decode,
- * accepts each token into its branch's repeat-penalty window. Decode-first
- * ordering ensures sampler state stays consistent if decode throws.
+ * dispatches, O(1) amortized dispatch overhead per branch. Accept-first
+ * ordering with rollback: accepts each token into its branch's repeat-penalty
+ * window before decode, and restores from clones if decode throws.
  *
  * **prefill()** — Bulk token injection. Each branch contributes a
  * variable-length token array. Uses a two-pass bin-packing algorithm:
@@ -2459,7 +2033,7 @@ export class Branch {
  * @example 32-branch generation step — one GPU dispatch
  * ```typescript
  * const store = new BranchStore(ctx);
- * const entries = branches.map(b => [b, b.produce().token] as [Branch, number]);
+ * const entries = await Promise.all(branches.map(async b => [b, (await b.produce()).token] as [Branch, number]));
  * await store.commit(entries);  // 32 tokens, 1 llama_decode()
  * ```
  *
@@ -2470,8 +2044,8 @@ export class Branch {
  * for (const _ of [1, 2, 3]) branches.push(await root.fork());
  *
  * for (let step = 0; step < 50; step++) {
- *   const live = branches.map(b => [b, b.produce()] as const)
- *     .filter(([, p]) => !p.isStop);
+ *   const produced = await Promise.all(branches.map(async b => [b, await b.produce()] as const));
+ *   const live = produced.filter(([, p]) => !p.isStop);
  *   if (!live.length) break;
  *   await store.commit(live.map(([b, p]) => [b, p.token]));
  * }
diff --git a/lib/index.js b/lib/index.js
index 3541a22..928c3f7 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -17,18 +17,13 @@
  * // Tokenize
  * const tokens = await ctx.tokenize("Hello world");
  *
- * // Decode
- * await ctx.decode(tokens, 0);
- *
- * // Safe logits access (Runtime Borrow Checker pattern)
- * const entropy = withLogits(ctx, (logits) => {
- *   // logits is valid here - use synchronously only!
- *   return myComputeEntropy(logits);
- * });
- *
- * // Or with native reference implementations (for testing)
- * const entropy = ctx.modelEntropy();
- * const token = ctx.greedySample();
+ * // Generate via Branch API
+ * const branch = Branch.create(ctx, 0, { temperature: 0.7 });
+ * await branch.prefill(tokens);
+ * for await (const { text } of branch) {
+ *   process.stdout.write(text);
+ * }
+ * await branch.prune();
  *
  * // Cleanup
  * ctx.dispose();
diff --git a/liblloyal b/liblloyal
index 4c932ea..557c4ef 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit 4c932ea0b74d5dd8392458f12027c5c55875f2a3
+Subproject commit 557c4ef6c7f88824c6fdbc029ad9e9b8bea4f73d
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index 0fb6696..b812a76 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -1,7 +1,6 @@
 #include "SessionContext.hpp"
 #include "BackendManager.hpp"
 #include "FileSystem.h"
-#include <lloyal/decode.hpp>
 #include <lloyal/sampler.hpp>
 #include <lloyal/tokenizer.hpp>
 #include <lloyal/common.hpp>
@@ -409,39 +408,6 @@ class TokenizeWorker : public Napi::AsyncWorker {
 /**
  * AsyncWorker for decode operation
  */
-class DecodeWorker : public Napi::AsyncWorker {
-public:
-  DecodeWorker(Napi::Env env, llama_context* ctx, const std::vector<llama_token>& tokens,
-               int32_t pos, llama_seq_id seqId, int32_t nBatch)
-    : AsyncWorker(env), _deferred(env), _ctx(ctx), _tokens(tokens), _pos(pos), _seqId(seqId), _nBatch(nBatch) {}
-
-  void Execute() override {
-    try {
-      lloyal::decode::many(_ctx, _tokens, _pos, _nBatch, _seqId);
-    } catch (const std::exception& e) {
-      SetError(e.what());
-    }
-  }
-
-  void OnOK() override {
-    _deferred.Resolve(Env().Undefined());
-  }
-
-  void OnError(const Napi::Error& err) override {
-    _deferred.Reject(err.Value());
-  }
-
-  Napi::Promise GetPromise() { return _deferred.Promise(); }
-
-private:
-  Napi::Promise::Deferred _deferred;
-  llama_context* _ctx;
-  std::vector<llama_token> _tokens;
-  int32_t _pos;
-  llama_seq_id _seqId;
-  int32_t _nBatch;
-};
-
 /**
  * AsyncWorker for encode operation (embedding extraction)
  * Unlike DecodeWorker, marks ALL tokens with logits=true
@@ -664,14 +630,16 @@ class StoreCommitWorker : public Napi::AsyncWorker {
     // RAII snapshot of accept-mutable state. Destructor frees anything still
     // owned, so partial clones from a throwing OOM don't leak.
     struct Snapshot {
-      llama_sampler* sampler = nullptr;
-      llama_sampler* grammar = nullptr;
-      lloyal::metrics::BranchMetricsHandle metrics = 0;
+      lloyal::branch::SamplerChainHandle sampler = 0;
+      lloyal::branch::GrammarHandle grammar = 0;
+      lloyal::branch::MetricsHandle metrics = 0;
+      lloyal::branch::BranchStore* store = nullptr;
 
       ~Snapshot() {
-        if (sampler) lloyal::sampler::free_chain(sampler);
-        if (grammar) lloyal::grammar::free_sampler(grammar);
-        if (metrics) lloyal::metrics::free_branch_metrics(metrics);
+        if (!store) return;
+        if (sampler) store->free_sampler(sampler);
+        if (grammar) store->free_grammar(grammar);
+        if (metrics) store->free_metrics(metrics);
       }
 
       void restore_into(lloyal::branch::BranchState& st) {
@@ -691,12 +659,13 @@ class StoreCommitWorker : public Napi::AsyncWorker {
         if (!st) throw std::runtime_error("StoreCommitWorker: invalid handle");
 
         auto s = std::make_unique<Snapshot>();
-        s->sampler = st->sampler_chain
-            ? lloyal::sampler::clone_chain(st->sampler_chain) : nullptr;
-        s->grammar = st->grammar
-            ? lloyal::grammar::clone_sampler(st->grammar) : nullptr;
+        s->store = &_store;
+        s->sampler = st->sampler_chain != 0
+            ? _store.clone_sampler(st->sampler_chain) : 0;
+        s->grammar = st->grammar != 0
+            ? _store.clone_grammar(st->grammar) : 0;
         s->metrics = st->metrics != 0
-            ? lloyal::metrics::clone_branch_metrics(st->metrics) : 0;
+            ? _store.clone_metrics(st->metrics) : 0;
         snaps[i] = std::move(s);
       }
 
@@ -768,45 +737,6 @@ class StorePrefillWorker : public Napi::AsyncWorker {
   std::vector<std::vector<llama_token>> _tokenStorage;
 };
 
-/**
- * AsyncWorker for decode + logits capture into a JS ArrayBuffer
- * Pins the dest ArrayBuffer via Napi::Reference to prevent GC during Execute()
- */
-class DecodeAndCaptureWorker : public Napi::AsyncWorker {
-public:
-  DecodeAndCaptureWorker(Napi::Env env, llama_context* ctx,
-                          std::vector<llama_token> tokens,
-                          int32_t pos, llama_seq_id seqId, int32_t nBatch,
-                          float* dest, int nVocab,
-                          Napi::Reference<Napi::ArrayBuffer> bufRef)
-    : AsyncWorker(env), _deferred(env), _ctx(ctx), _tokens(std::move(tokens)),
-      _pos(pos), _seqId(seqId), _nBatch(nBatch), _dest(dest), _nVocab(nVocab),
-      _bufRef(std::move(bufRef)) {}
-
-  void Execute() override {
-    try {
-      lloyal::decode::many(_ctx, _tokens, _pos, _nBatch, _seqId);
-      float* logits = lloyal::logits::get(_ctx, -1);
-      std::memcpy(_dest, logits, _nVocab * sizeof(float));
-    } catch (const std::exception& e) { SetError(e.what()); }
-  }
-
-  void OnOK() override { _deferred.Resolve(Env().Undefined()); }
-  void OnError(const Napi::Error& err) override { _deferred.Reject(err.Value()); }
-  Napi::Promise GetPromise() { return _deferred.Promise(); }
-
-private:
-  Napi::Promise::Deferred _deferred;
-  llama_context* _ctx;
-  std::vector<llama_token> _tokens;
-  int32_t _pos;
-  llama_seq_id _seqId;
-  int32_t _nBatch;
-  float* _dest;
-  int _nVocab;
-  Napi::Reference<Napi::ArrayBuffer> _bufRef;  // prevent GC of dest buffer
-};
-
 /**
  * AsyncWorker for JSON schema → GBNF grammar conversion
  * Pure CPU, no shared state — cleanest worker
@@ -836,10 +766,8 @@ class JsonSchemaToGrammarWorker : public Napi::AsyncWorker {
 
 Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
   Napi::Function func = DefineClass(env, "SessionContext", {
-    // ===== THE GENERATION LOOP =====
-    InstanceMethod("decode", &SessionContext::decode),
+    // ===== CORE =====
     InstanceMethod("getLogits", &SessionContext::getLogits),
-    InstanceMethod("sample", &SessionContext::sample),
     InstanceMethod("tokenToText", &SessionContext::tokenToText),
     InstanceMethod("isStopToken", &SessionContext::isStopToken),
     InstanceMethod("getEogToken", &SessionContext::getEogToken),
@@ -864,16 +792,6 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     InstanceMethod("kvSeqKeep", &SessionContext::kvSeqKeep),
     InstanceMethod("kvSeqPosMax", &SessionContext::kvSeqPosMax),
 
-    // ===== HANDLE-BASED GRAMMAR =====
-    InstanceMethod("createSampler", &SessionContext::createSampler),
-    InstanceMethod("applySampler", &SessionContext::applySampler),
-    InstanceMethod("acceptSamplerToken", &SessionContext::acceptSamplerToken),
-    InstanceMethod("cloneSampler", &SessionContext::cloneSampler),
-    InstanceMethod("freeSamplerHandle", &SessionContext::freeSamplerHandle),
-
-    // ===== ATOMIC DECODE+CAPTURE =====
-    InstanceMethod("decodeAndCapture", &SessionContext::decodeAndCapture),
-
     // ===== HELPERS =====
     InstanceMethod("formatChat", &SessionContext::formatChat),
     InstanceMethod("parseChatOutput", &SessionContext::parseChatOutput),
@@ -889,16 +807,6 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     // ===== METRICS API =====
     InstanceMethod("modelSurprisal", &SessionContext::modelSurprisal),
     InstanceMethod("modelEntropy", &SessionContext::modelEntropy),
-    InstanceMethod("createPerplexityTracker", &SessionContext::createPerplexityTracker),
-    InstanceMethod("addSurprisal", &SessionContext::addSurprisal),
-    InstanceMethod("getPerplexity", &SessionContext::getPerplexity),
-    InstanceMethod("clonePerplexityTracker", &SessionContext::clonePerplexityTracker),
-    InstanceMethod("resetPerplexityTracker", &SessionContext::resetPerplexityTracker),
-    InstanceMethod("getPerplexityCount", &SessionContext::getPerplexityCount),
-    InstanceMethod("freePerplexityTracker", &SessionContext::freePerplexityTracker),
-
-    // ===== NATIVE REFERENCE IMPLEMENTATIONS =====
-    InstanceMethod("greedySample", &SessionContext::greedySample),
 
     // ===== LIFECYCLE =====
     InstanceMethod("dispose", &SessionContext::dispose),
@@ -923,6 +831,8 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     InstanceMethod("_branchSamplerChainReseed", &SessionContext::_branchSamplerChainReseed),
     InstanceMethod("_branchSteer", &SessionContext::_branchSteer),
     InstanceMethod("_branchClearSteer", &SessionContext::_branchClearSteer),
+    InstanceMethod("_branchSetSamplerParams", &SessionContext::_branchSetSamplerParams),
+    InstanceMethod("_branchSetGrammar", &SessionContext::_branchSetGrammar),
 
     // ===== STORE API (internal, wrapped by lib/BranchStore.js) =====
     InstanceMethod("_storeCommit", &SessionContext::_storeCommit),
@@ -954,26 +864,6 @@ SessionContext::SessionContext(const Napi::CallbackInfo& info)
 
 SessionContext::~SessionContext() {
   if (!_disposed) {
-    // Free handle-based grammar samplers first
-    for (auto& [handle, sampler] : _samplerHandles) {
-      if (sampler) {
-        llama_sampler_free(sampler);
-      }
-    }
-    _samplerHandles.clear();
-
-    // Free handle-based perplexity trackers
-    for (auto& [napiHandle, pplHandle] : _perplexityHandles) {
-      lloyal::metrics::free_perplexity(pplHandle);
-    }
-    _perplexityHandles.clear();
-
-    // Free persistent sampler chain (pattern from branch.hpp)
-    if (_samplerChain) {
-      lloyal::sampler::free_chain(_samplerChain);
-      _samplerChain = nullptr;
-    }
-
     // Free context (depends on model)
     if (_context) {
       llama_free(_context);
@@ -1073,43 +963,6 @@ Napi::Value SessionContext::getLogits(const Napi::CallbackInfo& info) {
   return Napi::Float32Array::New(env, n_vocab, buffer, 0);
 }
 
-Napi::Value SessionContext::decode(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 2 || !info[0].IsArray() || !info[1].IsNumber()) {
-    throw Napi::TypeError::New(env, "Expected (tokens: number[], position: number[, seqId: number])");
-  }
-
-  // Revoke any active logits buffer before decode
-  invalidateLogits();
-
-  // Extract tokens
-  Napi::Array jsTokens = info[0].As<Napi::Array>();
-  std::vector<llama_token> tokens;
-  tokens.reserve(jsTokens.Length());
-  for (uint32_t i = 0; i < jsTokens.Length(); i++) {
-    Napi::Value val = jsTokens[i];
-    if (!val.IsNumber()) {
-      throw Napi::TypeError::New(env, "Token array must contain only numbers");
-    }
-    tokens.push_back(static_cast<llama_token>(val.As<Napi::Number>().Int32Value()));
-  }
-
-  int32_t position = info[1].As<Napi::Number>().Int32Value();
-
-  // Extract optional seqId (default 0 for backward compatibility)
-  llama_seq_id seqId = 0;
-  if (info.Length() >= 3 && info[2].IsNumber()) {
-    seqId = static_cast<llama_seq_id>(info[2].As<Napi::Number>().Int32Value());
-  }
-
-  // Run async
-  auto* worker = new DecodeWorker(env, _context, tokens, position, seqId, _nBatch);
-  worker->Queue();
-  return worker->GetPromise();
-}
-
 Napi::Value SessionContext::tokenize(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
@@ -1250,20 +1103,6 @@ Napi::Value SessionContext::modelEntropy(const Napi::CallbackInfo& info) {
   return Napi::Number::New(env, static_cast<double>(entropy));
 }
 
-Napi::Value SessionContext::greedySample(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (!_context) {
-    throw Napi::Error::New(env, "Context not initialized");
-  }
-
-  // Use liblloyal greedy sampler with model overload
-  llama_token token = lloyal::sampler::greedy(_context, _model.get());
-
-  return Napi::Number::New(env, static_cast<double>(token));
-}
-
 Napi::Value SessionContext::tokenToText(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
@@ -1488,69 +1327,6 @@ Napi::Value SessionContext::kvCacheSize(const Napi::CallbackInfo& info) {
   return Napi::Number::New(env, static_cast<double>(max_pos));
 }
 
-Napi::Value SessionContext::sample(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (!_context) {
-    throw Napi::Error::New(env, "Context not initialized");
-  }
-
-  llama_token next_token;
-
-  // Use greedy if no params, otherwise use persistent sampler chain
-  // Pattern from branch.hpp: create chain once, reuse across samples, call accept() after
-  if (info.Length() == 0 || !info[0].IsObject()) {
-    // No params - use greedy sampling (stateless, no chain needed)
-    next_token = lloyal::sampler::greedy(_context, _model.get());
-  } else {
-    // Use adapter to convert JS params → liblloyal-compatible structure
-    LloyalSamplingParams params = adaptSamplingParamsFromJS(info[0].As<Napi::Object>());
-
-    // Create or rebuild sampler chain if params changed
-    // Pattern from branch.hpp: persistent chain enables repeat penalty tracking
-    if (!_samplerChain || params != _samplerParams) {
-      if (_samplerChain) {
-        lloyal::sampler::free_chain(_samplerChain);
-      }
-      _samplerChain = lloyal::sampler::create_chain(params);
-      _samplerParams = params;
-    }
-
-    // Get logits and build candidate array (pattern from branch.hpp::sample)
-    const int n_vocab = lloyal::tokenizer::vocab_size(_model.get());
-    float* logits = lloyal::logits::get(_context, -1);
-
-    std::vector<llama_token_data> candidates(n_vocab);
-    for (int i = 0; i < n_vocab; i++) {
-      candidates[i] = llama_token_data{static_cast<llama_token>(i), logits[i], 0.0f};
-    }
-
-    llama_token_data_array cur_p = {
-      candidates.data(),
-      static_cast<size_t>(n_vocab),
-      -1,    // selected
-      false  // sorted
-    };
-
-    // Apply persistent sampler chain (includes penalties, filters, temp, dist)
-    lloyal::sampler::apply(_samplerChain, &cur_p);
-
-    if (cur_p.selected == -1) {
-      throw Napi::Error::New(env, "Sampling failed - no token selected");
-    }
-
-    next_token = cur_p.data[cur_p.selected].id;
-
-    // Update penalty history in persistent chain (KEY CHANGE from old stateless approach)
-    // This enables repeat penalty to track ALL tokens across the generation,
-    // not just what's visible in the current KV cache window after clearAndReseed()
-    lloyal::sampler::accept(_samplerChain, next_token);
-  }
-
-  return Napi::Number::New(env, static_cast<double>(next_token));
-}
-
 Napi::Value SessionContext::dispose(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
 
@@ -1558,26 +1334,6 @@ Napi::Value SessionContext::dispose(const Napi::CallbackInfo& info) {
     // Revoke any active logits buffer before disposing
     invalidateLogits();
 
-    // Free handle-based grammar samplers
-    for (auto& [handle, sampler] : _samplerHandles) {
-      if (sampler) {
-        llama_sampler_free(sampler);
-      }
-    }
-    _samplerHandles.clear();
-
-    // Free handle-based perplexity trackers
-    for (auto& [napiHandle, pplHandle] : _perplexityHandles) {
-      lloyal::metrics::free_perplexity(pplHandle);
-    }
-    _perplexityHandles.clear();
-
-    // Free persistent sampler chain (pattern from branch.hpp)
-    if (_samplerChain) {
-      lloyal::sampler::free_chain(_samplerChain);
-      _samplerChain = nullptr;
-    }
-
     // Drain branch store while context is still alive
     _branchStore.drain();
 
@@ -1648,358 +1404,6 @@ Napi::Value SessionContext::kvSeqPosMax(const Napi::CallbackInfo& info) {
   return Napi::Number::New(env, static_cast<double>(pos));
 }
 
-// ===== HANDLE-BASED GRAMMAR =====
-
-Napi::Value SessionContext::createSampler(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 1 || !info[0].IsString()) {
-    throw Napi::TypeError::New(env, "Expected (grammarStr: string)");
-  }
-
-  std::string grammarStr = info[0].As<Napi::String>().Utf8Value();
-  llama_sampler* sampler = lloyal::grammar::init_sampler(_model.get(), grammarStr);
-
-  if (!sampler) {
-    throw Napi::Error::New(env, "Failed to create grammar sampler");
-  }
-
-  int32_t handle = _nextSamplerHandle++;
-  _samplerHandles[handle] = sampler;
-
-  return Napi::Number::New(env, static_cast<double>(handle));
-}
-
-Napi::Value SessionContext::applySampler(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 2) {
-    throw Napi::TypeError::New(env, "Expected (handle, logitsBuffer)");
-  }
-
-  int32_t handle = static_cast<int32_t>(info[0].As<Napi::Number>().Int32Value());
-
-  auto it = _samplerHandles.find(handle);
-  if (it == _samplerHandles.end()) {
-    throw Napi::Error::New(env, "Invalid sampler handle");
-  }
-
-  // Get logits buffer
-  Napi::ArrayBuffer buffer;
-  if (info[1].IsArrayBuffer()) {
-    buffer = info[1].As<Napi::ArrayBuffer>();
-  } else if (info[1].IsTypedArray()) {
-    buffer = info[1].As<Napi::TypedArray>().ArrayBuffer();
-  } else {
-    throw Napi::TypeError::New(env, "Expected ArrayBuffer or TypedArray");
-  }
-
-  float* logits = static_cast<float*>(buffer.Data());
-  int n_vocab = lloyal::tokenizer::vocab_size(_model.get());
-
-  // Build candidates array
-  std::vector<llama_token_data> candidates(n_vocab);
-  for (int i = 0; i < n_vocab; i++) {
-    candidates[i] = llama_token_data{static_cast<llama_token>(i), logits[i], 0.0f};
-  }
-
-  llama_token_data_array arr = {candidates.data(), static_cast<size_t>(n_vocab), -1, false};
-
-  // Apply grammar (modifies candidates)
-  llama_sampler_apply(it->second, &arr);
-
-  // Write back to buffer
-  for (int i = 0; i < n_vocab; i++) {
-    logits[i] = candidates[i].logit;
-  }
-
-  return env.Undefined();
-}
-
-Napi::Value SessionContext::acceptSamplerToken(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 2) {
-    throw Napi::TypeError::New(env, "Expected (handle, tokenId)");
-  }
-
-  int32_t handle = static_cast<int32_t>(info[0].As<Napi::Number>().Int32Value());
-  llama_token token = static_cast<llama_token>(info[1].As<Napi::Number>().Int32Value());
-
-  auto it = _samplerHandles.find(handle);
-  if (it == _samplerHandles.end()) {
-    throw Napi::Error::New(env, "Invalid sampler handle");
-  }
-
-  try {
-    llama_sampler_accept(it->second, token);
-  } catch (const std::exception& e) {
-    throw Napi::Error::New(env, e.what());
-  }
-  return env.Undefined();
-}
-
-Napi::Value SessionContext::cloneSampler(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 1) {
-    throw Napi::TypeError::New(env, "Expected (handle)");
-  }
-
-  int32_t handle = static_cast<int32_t>(info[0].As<Napi::Number>().Int32Value());
-
-  auto it = _samplerHandles.find(handle);
-  if (it == _samplerHandles.end()) {
-    throw Napi::Error::New(env, "Invalid sampler handle");
-  }
-
-  llama_sampler* cloned = lloyal::grammar::clone_sampler(it->second);
-  if (!cloned) {
-    throw Napi::Error::New(env, "Failed to clone sampler");
-  }
-
-  int32_t newHandle = _nextSamplerHandle++;
-  _samplerHandles[newHandle] = cloned;
-
-  return Napi::Number::New(env, static_cast<double>(newHandle));
-}
-
-Napi::Value SessionContext::freeSamplerHandle(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 1) {
-    throw Napi::TypeError::New(env, "Expected (handle)");
-  }
-
-  int32_t handle = static_cast<int32_t>(info[0].As<Napi::Number>().Int32Value());
-
-  auto it = _samplerHandles.find(handle);
-  if (it != _samplerHandles.end()) {
-    llama_sampler_free(it->second);
-    _samplerHandles.erase(it);
-  }
-
-  return env.Undefined();
-}
-
-// ===== PERPLEXITY TRACKING =====
-
-Napi::Value SessionContext::createPerplexityTracker(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  // Create new perplexity tracker via metrics.hpp
-  lloyal::metrics::PerplexityHandle handle = lloyal::metrics::create_perplexity();
-
-  // Generate N-API handle
-  int32_t napiHandle = _nextPerplexityHandle++;
-  _perplexityHandles[napiHandle] = handle;
-
-  return Napi::Number::New(env, static_cast<double>(napiHandle));
-}
-
-Napi::Value SessionContext::addSurprisal(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  // Argument validation
-  if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsNumber()) {
-    throw Napi::TypeError::New(env, "Expected (handle: number, surprisal: number)");
-  }
-
-  int32_t napiHandle = info[0].As<Napi::Number>().Int32Value();
-  double surprisal = info[1].As<Napi::Number>().DoubleValue();
-
-  // Lookup handle
-  auto it = _perplexityHandles.find(napiHandle);
-  if (it == _perplexityHandles.end()) {
-    throw Napi::Error::New(env, "Invalid perplexity tracker handle");
-  }
-
-  // Add surprisal to tracker
-  lloyal::metrics::add_surprisal(it->second, static_cast<float>(surprisal));
-
-  return env.Undefined();
-}
-
-Napi::Value SessionContext::getPerplexity(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  // Argument validation
-  if (info.Length() < 1 || !info[0].IsNumber()) {
-    throw Napi::TypeError::New(env, "Expected handle: number");
-  }
-
-  int32_t napiHandle = info[0].As<Napi::Number>().Int32Value();
-
-  // Lookup handle
-  auto it = _perplexityHandles.find(napiHandle);
-  if (it == _perplexityHandles.end()) {
-    throw Napi::Error::New(env, "Invalid perplexity tracker handle");
-  }
-
-  // Get perplexity value
-  float ppl = lloyal::metrics::get_ppl(it->second);
-
-  return Napi::Number::New(env, static_cast<double>(ppl));
-}
-
-Napi::Value SessionContext::clonePerplexityTracker(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  // Argument validation
-  if (info.Length() < 1 || !info[0].IsNumber()) {
-    throw Napi::TypeError::New(env, "Expected handle: number");
-  }
-
-  int32_t sourceHandle = info[0].As<Napi::Number>().Int32Value();
-
-  // Lookup source handle
-  auto it = _perplexityHandles.find(sourceHandle);
-  if (it == _perplexityHandles.end()) {
-    throw Napi::Error::New(env, "Invalid source perplexity tracker handle");
-  }
-
-  // Clone via metrics.hpp
-  lloyal::metrics::PerplexityHandle clonedHandle =
-      lloyal::metrics::clone_perplexity(it->second);
-
-  // Generate new N-API handle
-  int32_t newNapiHandle = _nextPerplexityHandle++;
-  _perplexityHandles[newNapiHandle] = clonedHandle;
-
-  return Napi::Number::New(env, static_cast<double>(newNapiHandle));
-}
-
-Napi::Value SessionContext::resetPerplexityTracker(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  // Argument validation
-  if (info.Length() < 1 || !info[0].IsNumber()) {
-    throw Napi::TypeError::New(env, "Expected handle: number");
-  }
-
-  int32_t napiHandle = info[0].As<Napi::Number>().Int32Value();
-
-  // Lookup handle
-  auto it = _perplexityHandles.find(napiHandle);
-  if (it == _perplexityHandles.end()) {
-    throw Napi::Error::New(env, "Invalid perplexity tracker handle");
-  }
-
-  // Reset tracker
-  lloyal::metrics::reset_perplexity(it->second);
-
-  return env.Undefined();
-}
-
-Napi::Value SessionContext::getPerplexityCount(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  // Argument validation
-  if (info.Length() < 1 || !info[0].IsNumber()) {
-    throw Napi::TypeError::New(env, "Expected handle: number");
-  }
-
-  int32_t napiHandle = info[0].As<Napi::Number>().Int32Value();
-
-  // Lookup handle
-  auto it = _perplexityHandles.find(napiHandle);
-  if (it == _perplexityHandles.end()) {
-    throw Napi::Error::New(env, "Invalid perplexity tracker handle");
-  }
-
-  // Get token count
-  int count = lloyal::metrics::get_count(it->second);
-
-  return Napi::Number::New(env, static_cast<double>(count));
-}
-
-Napi::Value SessionContext::freePerplexityTracker(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  // Argument validation
-  if (info.Length() < 1 || !info[0].IsNumber()) {
-    throw Napi::TypeError::New(env, "Expected handle: number");
-  }
-
-  int32_t napiHandle = info[0].As<Napi::Number>().Int32Value();
-
-  // Lookup and remove handle
-  auto it = _perplexityHandles.find(napiHandle);
-  if (it == _perplexityHandles.end()) {
-    throw Napi::Error::New(env, "Invalid perplexity tracker handle");
-  }
-
-  // Free via metrics.hpp
-  lloyal::metrics::free_perplexity(it->second);
-
-  // Remove from map
-  _perplexityHandles.erase(it);
-
-  return env.Undefined();
-}
-
-// ===== ATOMIC DECODE+CAPTURE =====
-
-Napi::Value SessionContext::decodeAndCapture(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 4) {
-    throw Napi::TypeError::New(env, "Expected (tokens, position, seqId, destBuffer)");
-  }
-
-  // Parse tokens
-  Napi::Array tokensArray = info[0].As<Napi::Array>();
-  std::vector<llama_token> tokens(tokensArray.Length());
-  for (uint32_t i = 0; i < tokensArray.Length(); i++) {
-    tokens[i] = static_cast<llama_token>(tokensArray.Get(i).As<Napi::Number>().Int32Value());
-  }
-
-  int32_t position = info[1].As<Napi::Number>().Int32Value();
-  llama_seq_id seqId = toSeqId(info[2].As<Napi::Number>().DoubleValue());
-
-  // Get dest buffer
-  Napi::ArrayBuffer destBuffer;
-  if (info[3].IsArrayBuffer()) {
-    destBuffer = info[3].As<Napi::ArrayBuffer>();
-  } else if (info[3].IsTypedArray()) {
-    destBuffer = info[3].As<Napi::TypedArray>().ArrayBuffer();
-  } else {
-    throw Napi::TypeError::New(env, "destBuffer must be ArrayBuffer or TypedArray");
-  }
-
-  float* dest = static_cast<float*>(destBuffer.Data());
-  int n_vocab = lloyal::tokenizer::vocab_size(_model.get());
-
-  // Main-thread work: invalidate logits views (touches Napi objects)
-  invalidateLogits();
-  _decodeStepId++;
-
-  // Pin the JS ArrayBuffer to prevent GC during worker Execute()
-  auto bufRef = Napi::Reference<Napi::ArrayBuffer>::New(destBuffer, 1);
-
-  auto* worker = new DecodeAndCaptureWorker(
-    env, _context, std::move(tokens), position, seqId, _nBatch,
-    dest, n_vocab, std::move(bufRef));
-  worker->Queue();
-  return worker->GetPromise();
-}
-
-// ===== HELPER METHODS =====
-// Pattern matches HybridSessionContext.cpp:103-106, 365-379
-
 Napi::Value SessionContext::getMemorySize(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
@@ -2658,10 +2062,11 @@ Napi::Value SessionContext::_branchSamplerChainReseed(const Napi::CallbackInfo&
     throw Napi::Error::New(env, "_branchSamplerChainReseed: invalid handle");
   }
 
-  // Only reseed stochastic chains (has_dist_sampler=true)
+  // Only reseed stochastic chains (has_dist=true)
   // Reseeding greedy chains would corrupt them
-  if (state->sampler_chain && state->has_dist_sampler) {
-    lloyal::sampler::reseed_chain(state->sampler_chain, seed);
+  if (state->sampler_chain != 0 && _branchStore.sampler_has_dist(state->sampler_chain)) {
+    llama_sampler* chain = _branchStore.get_sampler_chain(state->sampler_chain);
+    if (chain) lloyal::sampler::reseed_chain(chain, seed);
   }
 
   return env.Undefined();
@@ -2736,6 +2141,48 @@ Napi::Value SessionContext::_branchClearSteer(const Napi::CallbackInfo& info) {
   return env.Undefined();
 }
 
+// Forwards new sampler params for one branch to lloyal::branch::set_sampler_params.
+Napi::Value SessionContext::_branchSetSamplerParams(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  // As<Napi::Number>() does not type-check by default, so reject a non-numeric
+  // handle here with a message that names this method's expected arguments.
+  if (info.Length() < 2 || !info[0].IsNumber()) {
+    throw Napi::Error::New(env, "_branchSetSamplerParams requires (handle, params)");
+  }
+  auto handle = static_cast<lloyal::branch::BranchHandle>(info[0].As<Napi::Number>().Uint32Value());
+
+  // A non-object second argument leaves params default-constructed.
+  LloyalSamplingParams params;
+  if (info[1].IsObject()) params = adaptSamplingParamsFromJS(info[1].As<Napi::Object>());
+
+  lloyal::branch::set_sampler_params(handle, params, _branchStore);
+  return env.Undefined();
+}
+
+// Forwards a grammar string for one branch to lloyal::branch::set_grammar.
+// Args: handle (number), grammarStr (string; an empty string is passed through
+// unchanged).
+Napi::Value SessionContext::_branchSetGrammar(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  // As<>() does not type-check by default; validate here so bad arguments get
+  // an error naming this method instead of a generic N-API conversion failure.
+  if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsString()) {
+    throw Napi::Error::New(env, "_branchSetGrammar requires (handle, grammarStr)");
+  }
+
+  auto handle = static_cast<lloyal::branch::BranchHandle>(info[0].As<Napi::Number>().Uint32Value());
+  std::string grammar_str = info[1].As<Napi::String>().Utf8Value();
+
+  // c_str() already yields "" for an empty string, so no special case is needed.
+  lloyal::branch::set_grammar(handle, _model.get(), grammar_str.c_str(), _branchStore);
+
+  return env.Undefined();
+}
+
 // ===== STORE API =====
 
 Napi::Value SessionContext::_storeCommit(const Napi::CallbackInfo& info) {
diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp
index 769c210..3ab159d 100644
--- a/src/SessionContext.hpp
+++ b/src/SessionContext.hpp
@@ -9,7 +9,6 @@
 #include <memory>
 #include <optional>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 namespace liblloyal_node {
@@ -86,16 +85,6 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
    */
   Napi::Value getLogits(const Napi::CallbackInfo& info);
 
-  /**
-   * Decode tokens through model
-   * Args: tokens (number[]), position (number), seqId? (number, default 0)
-   * Returns: Promise<void>
-   *
-   * The seqId parameter specifies which KV cache sequence to update.
-   * Use different seqIds for independent parallel sequences.
-   */
-  Napi::Value decode(const Napi::CallbackInfo& info);
-
   /**
    * Tokenize text to token IDs
    * Args: text (string)
@@ -150,21 +139,6 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
    */
   Napi::Value kvCacheSize(const Napi::CallbackInfo& info);
 
-  // ===== NATIVE REFERENCE IMPLEMENTATIONS =====
-
-  /**
-   * Native greedy sampling (for validation)
-   * Returns: number (token ID)
-   */
-  Napi::Value greedySample(const Napi::CallbackInfo& info);
-
-  /**
-   * Native sampling with full parameters (for benchmarking)
-   * Args: params (optional object with temperature, topK, topP, etc.)
-   * Returns: number (token ID)
-   */
-  Napi::Value sample(const Napi::CallbackInfo& info);
-
   // ===== LIFECYCLE =====
 
   /**
@@ -212,49 +186,6 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
    */
   Napi::Value kvSeqPosMax(const Napi::CallbackInfo& info);
 
-  // ===== HANDLE-BASED GRAMMAR =====
-
-  /**
-   * Create a new grammar sampler, returns handle
-   * Args: grammarStr (string)
-   * Returns: number (handle)
-   */
-  Napi::Value createSampler(const Napi::CallbackInfo& info);
-
-  /**
-   * Apply grammar constraints to logits buffer
-   * Args: handle (number), logitsBuffer (ArrayBuffer)
-   */
-  Napi::Value applySampler(const Napi::CallbackInfo& info);
-
-  /**
-   * Accept token to advance grammar parser state
-   * Args: handle (number), tokenId (number)
-   */
-  Napi::Value acceptSamplerToken(const Napi::CallbackInfo& info);
-
-  /**
-   * Clone a grammar sampler
-   * Args: handle (number)
-   * Returns: number (new handle)
-   */
-  Napi::Value cloneSampler(const Napi::CallbackInfo& info);
-
-  /**
-   * Free a grammar sampler
-   * Args: handle (number)
-   */
-  Napi::Value freeSamplerHandle(const Napi::CallbackInfo& info);
-
-  // ===== ATOMIC DECODE+CAPTURE =====
-
-  /**
-   * Decode tokens and capture logits into a JS ArrayBuffer
-   * Args: tokens (number[]), position (number), seqId (number), destBuffer (ArrayBuffer)
-   * Returns: Promise<void>
-   */
-  Napi::Value decodeAndCapture(const Napi::CallbackInfo& info);
-
   /**
    * Write KV cache state + tokens to a file for disk persistence
    * Args: sequenceId (number), filepath (string), tokens (number[])
@@ -320,50 +251,6 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
    */
   Napi::Value modelEntropy(const Napi::CallbackInfo& info);
 
-  /**
-   * Create a new perplexity tracker
-   * Returns: number (handle)
-   */
-  Napi::Value createPerplexityTracker(const Napi::CallbackInfo& info);
-
-  /**
-   * Add surprisal value to tracker
-   * Args: handle (number), surprisal (number)
-   */
-  Napi::Value addSurprisal(const Napi::CallbackInfo& info);
-
-  /**
-   * Get current perplexity value
-   * Args: handle (number)
-   * Returns: number (perplexity)
-   */
-  Napi::Value getPerplexity(const Napi::CallbackInfo& info);
-
-  /**
-   * Clone perplexity tracker
-   * Args: sourceHandle (number)
-   * Returns: number (new handle)
-   */
-  Napi::Value clonePerplexityTracker(const Napi::CallbackInfo& info);
-
-  /**
-   * Reset tracker to initial state
-   * Args: handle (number)
-   */
-  Napi::Value resetPerplexityTracker(const Napi::CallbackInfo& info);
-
-  /**
-   * Get number of tokens tracked
-   * Args: handle (number)
-   * Returns: number (count)
-   */
-  Napi::Value getPerplexityCount(const Napi::CallbackInfo& info);
-
-  /**
-   * Free perplexity tracker resources
-   * Args: handle (number)
-   */
-  Napi::Value freePerplexityTracker(const Napi::CallbackInfo& info);
 
   // ===== BRANCH API (internal, wrapped by lib/Branch.ts) =====
 
@@ -386,6 +273,8 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   Napi::Value _branchSamplerChainReseed(const Napi::CallbackInfo& info);
   Napi::Value _branchSteer(const Napi::CallbackInfo& info);
   Napi::Value _branchClearSteer(const Napi::CallbackInfo& info);
+  Napi::Value _branchSetSamplerParams(const Napi::CallbackInfo& info);
+  Napi::Value _branchSetGrammar(const Napi::CallbackInfo& info);
 
   // ===== STORE API (internal, wrapped by lib/BranchStore.js) =====
 
@@ -402,21 +291,6 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   bool _disposed = false;
   int32_t _nBatch = lloyal::defaults::N_BATCH_INIT;
 
-  // Persistent sampling chain (for repeat penalty tracking across tokens)
-  // Pattern from branch.hpp: create once via sampler::create_chain(), reuse across samples.
-  // Penalty sampler's history is updated via sampler::accept() after each sample.
-  // This enables proper repeat penalty tracking across long generations and clearAndReseed().
-  llama_sampler* _samplerChain = nullptr;
-  LloyalSamplingParams _samplerParams;  // Track current params to detect changes
-
-  // ===== HANDLE-BASED GRAMMAR =====
-  std::unordered_map<int32_t, llama_sampler*> _samplerHandles;
-  int32_t _nextSamplerHandle = 1;
-
-  // ===== HANDLE-BASED PERPLEXITY TRACKING =====
-  std::unordered_map<int32_t, lloyal::metrics::PerplexityHandle> _perplexityHandles;
-  int32_t _nextPerplexityHandle = 1;
-
   // ===== BRANCH STORE =====
   lloyal::branch::BranchStore _branchStore{16};  // capacity 16
 
diff --git a/test/integration.js b/test/integration.js
index 9018194..e276b2c 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -86,42 +86,37 @@ async function testCoreAPI(ctx) {
   const tokenText = ctx.tokenToText(tokens[0]);
   assert(typeof tokenText === 'string', `tokenToText(${tokens[0]}) → "${tokenText}"`);
 
-  // decode + getLogits
-  await ctx.decode(tokens, 0);
-  const logits = ctx.getLogits();
-  assert(logits instanceof Float32Array, `getLogits() → Float32Array(${logits.length})`);
-  assert(logits.length === ctx.vocabSize, `logits.length === vocabSize (${ctx.vocabSize})`);
+  // Branch-based prefill + getLogits
+  const branch = Branch.create(ctx, 0, { temperature: 0 });
+  await branch.prefill(tokens);
+
+  const branchLogits = branch.getLogits();
+  assert(branchLogits instanceof Float32Array, `branch.getLogits() → Float32Array(${branchLogits.length})`);
+  assert(branchLogits.length === ctx.vocabSize, `branchLogits.length === vocabSize (${ctx.vocabSize})`);
 
   // Validate logits are not garbage
   let hasNonZero = false, hasNaN = false;
-  for (let i = 0; i < logits.length; i++) {
-    if (logits[i] !== 0.0) hasNonZero = true;
-    if (isNaN(logits[i])) hasNaN = true;
+  for (let i = 0; i < branchLogits.length; i++) {
+    if (branchLogits[i] !== 0.0) hasNonZero = true;
+    if (isNaN(branchLogits[i])) hasNaN = true;
   }
-  assert(hasNonZero && !hasNaN, 'logits valid (non-zero, no NaN)');
-
-  // modelEntropy
-  const entropy = ctx.modelEntropy();
-  assert(isFinite(entropy) && entropy >= 0, `modelEntropy() → ${entropy.toFixed(4)} nats`);
+  assert(hasNonZero && !hasNaN, 'branch logits valid (non-zero, no NaN)');
 
-  // greedySample
-  const greedy = ctx.greedySample();
-  assert(greedy >= 0 && greedy < ctx.vocabSize, `greedySample() → ${greedy}`);
+  // modelEntropy with branch logits
+  const entropy = ctx.modelEntropy('nats', branchLogits);
+  assert(isFinite(entropy) && entropy >= 0, `modelEntropy(branchLogits) → ${entropy.toFixed(4)} nats`);
 
-  // sample with params
-  const sampled = ctx.sample({ temperature: 0 });
-  assert(sampled === greedy, `sample({temp:0}) === greedySample() (${sampled})`);
+  // Branch greedy sampling (temperature: 0)
+  const greedy = branch.sample();
+  assert(greedy >= 0 && greedy < ctx.vocabSize, `branch.sample() greedy → ${greedy}`);
 
   // isStopToken - EOS should be a stop token
   const eos = ctx.getEogToken();
   assert(ctx.isStopToken(eos), `isStopToken(EOS=${eos}) → true`);
 
-  // Logits memoization
-  const logits1 = ctx.getLogits();
-  const logits2 = ctx.getLogits();
-  assert(logits1[0] === logits2[0], 'getLogits() memoized (same step = same buffer)');
-
-  // withLogits helper
+  // withLogits helper (context-level logits)
+  // Note: getLogits() reads from the shared context buffer, which is populated
+  // by branch decode operations
   const maxLogit = withLogits(ctx, (l) => {
     let max = l[0];
     for (let i = 1; i < l.length; i++) if (l[i] > max) max = l[i];
@@ -136,6 +131,8 @@ async function testCoreAPI(ctx) {
     asyncRejected = true;
   }
   assert(asyncRejected, 'withLogits() rejects async callbacks');
+
+  await branch.prune();
 }
 
 // ═══════════════════════════════════════════════════════════════════════════
@@ -147,11 +144,13 @@ async function testKVCache(ctx) {
 
   await ctx.kvCacheClear();
   const tokens = await ctx.tokenize("Test prompt");
-  await ctx.decode(tokens, 0);
+  const branch = Branch.create(ctx, 0, { temperature: 0 });
+  await branch.prefill(tokens);
 
   const sizeBefore = ctx.kvCacheSize();
-  assert(sizeBefore >= 0, `kvCacheSize() after decode → ${sizeBefore}`);
+  assert(sizeBefore >= 0, `kvCacheSize() after prefill → ${sizeBefore}`);
 
+  await branch.prune();
   await ctx.kvCacheClear();
   const sizeAfter = ctx.kvCacheSize();
   assert(sizeAfter === -1, `kvCacheClear() → size=${sizeAfter} (empty)`);
@@ -172,21 +171,25 @@ async function testMultiSequence() {
   });
 
   try {
+    // Use a branch to prefill tokens (populates KV on its seq_id)
     const tokens = await ctx.tokenize("The quick brown fox");
-    await ctx.decode(tokens, 0, 0);
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(tokens);
 
-    const seq0Pos = ctx.kvSeqPosMax(0);
-    assert(seq0Pos >= 0, `kvSeqPosMax(0) → ${seq0Pos}`);
+    // Branch allocates a seq_id — check its KV is populated
+    const branchPos = branch.position;
+    assert(branchPos === tokens.length, `branch position → ${branchPos}`);
 
-    const seq1Before = ctx.kvSeqPosMax(1);
-    assert(seq1Before === -1, `kvSeqPosMax(1) before copy → ${seq1Before} (empty)`);
+    // Fork creates a new sequence with copied KV
+    const forked = await branch.fork();
+    assert(forked.position === branchPos, `forked position matches parent → ${forked.position}`);
 
-    ctx.kvSeqCopy(0, 1);
-    const seq1After = ctx.kvSeqPosMax(1);
-    assert(seq1After === seq0Pos, `kvSeqCopy(0,1) → seq1 pos=${seq1After}`);
+    // Raw KV seq ops still work for advanced use
+    const seq1Before = ctx.kvSeqPosMax(3);  // unused seq_id
+    assert(seq1Before === -1, `kvSeqPosMax(unused) → ${seq1Before} (empty)`);
 
-    const seq0After = ctx.kvSeqPosMax(0);
-    assert(seq0After === seq0Pos, `seq0 unchanged after copy → ${seq0After}`);
+    await forked.prune();
+    await branch.prune();
   } finally {
     ctx.dispose();
   }
@@ -202,48 +205,22 @@ async function testGrammar() {
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: CTX_SIZE,
-    nThreads: 4
+    nThreads: 4,
+    nSeqMax: 4
   });
 
   try {
     const grammar = `root ::= "{" ws "}" ws
 ws ::= [ \\t\\n]*`;
 
-    // Handle-based API
-    const handle = ctx.createSampler(grammar);
-    assert(typeof handle === 'number' && handle > 0, `createSampler() → handle=${handle}`);
-
-    const cloned = ctx.cloneSampler(handle);
-    assert(cloned !== handle, `cloneSampler() → new handle=${cloned}`);
-
-    const testLogits = new Float32Array(ctx.vocabSize).fill(0.5);
-    ctx.applySampler(handle, testLogits);
-
-    let masked = 0, validToken = -1;
-    for (let i = 0; i < testLogits.length; i++) {
-      if (testLogits[i] < -1e30) masked++;
-      else if (validToken === -1) validToken = i;
-    }
-    assert(masked > 0 && validToken >= 0, `applySampler() masked ${masked} tokens`);
-
-    ctx.acceptSamplerToken(handle, validToken);
-    ok(`acceptSamplerToken(${validToken})`);
-
-    ctx.freeSamplerHandle(handle);
-    ctx.freeSamplerHandle(cloned);
-    ok('freeSamplerHandle() both handles');
-
     // Branch API with grammar
-    await ctx.kvCacheClear();
     const prompt = await ctx.tokenize("Output: ");
-    await ctx.decode(prompt, 0, 0);
-
-    const branch = Branch.create(ctx, prompt.length, { temperature: 0 }, undefined, grammar);
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar);
+    await branch.prefill(prompt);
 
     const output = [];
     for (let i = 0; i < 10; i++) {
-      const { token, text, isStop } = branch.produce();
+      const { token, text, isStop } = await branch.produce();
       if (isStop) break;
       await branch.commit(token);
       output.push(text);
@@ -251,6 +228,32 @@ ws ::= [ \\t\\n]*`;
 
     const result = output.join('');
     assert(/^\{\s*\}\s*$/.test(result), `Branch+grammar → "${result}"`);
+
+    // Grammar is cloned on fork — independent parser states
+    await ctx.kvCacheClear();
+    const prompt2 = await ctx.tokenize("Output: ");
+    const root = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar);
+    await root.prefill(prompt2);
+
+    const childA = await root.fork();
+    const childB = await root.fork();
+
+    // Both children should produce grammar-valid output independently
+    const outA = [], outB = [];
+    for (let i = 0; i < 10; i++) {
+      const pA = await childA.produce();
+      if (!pA.isStop) { await childA.commit(pA.token); outA.push(pA.text); }
+      const pB = await childB.produce();
+      if (!pB.isStop) { await childB.commit(pB.token); outB.push(pB.text); }
+    }
+
+    const resultA = outA.join(''), resultB = outB.join('');
+    assert(/^\{\s*\}\s*$/.test(resultA), `Fork A grammar → "${resultA}"`);
+    assert(/^\{\s*\}\s*$/.test(resultB), `Fork B grammar → "${resultB}"`);
+
+    await childA.prune();
+    await childB.prune();
+    await root.prune();
     await branch.prune();
   } finally {
     ctx.dispose();
@@ -266,45 +269,27 @@ async function testMetrics(ctx) {
 
   await ctx.kvCacheClear();
   const tokens = await ctx.tokenize("Hello");
-  await ctx.decode(tokens, 0);
+  const branch = Branch.create(ctx, 0, { temperature: 0 });
+  await branch.prefill(tokens);
 
-  const token1 = ctx.greedySample();
-  const surprisal = ctx.modelSurprisal(token1, "nats");
-  assert(surprisal >= 0, `modelSurprisal() → ${surprisal.toFixed(2)} nats`);
+  // modelSurprisal with branch logits
+  const token1 = branch.sample();
+  const branchLogits = branch.getLogits();
+  const surprisal = ctx.modelSurprisal(token1, "nats", branchLogits);
+  assert(surprisal >= 0, `modelSurprisal(branchLogits) → ${surprisal.toFixed(2)} nats`);
 
-  const surprisalBits = ctx.modelSurprisal(token1, "bits");
+  const surprisalBits = ctx.modelSurprisal(token1, "bits", branchLogits);
   assert(Math.abs(surprisalBits - surprisal / Math.log(2)) < 0.01, 'bits = nats / ln(2)');
 
-  const tracker = ctx.createPerplexityTracker();
-  assert(tracker > 0, `createPerplexityTracker() → ${tracker}`);
-
-  ctx.addSurprisal(tracker, surprisal);
-  await ctx.decode([token1], tokens.length);
-  ctx.addSurprisal(tracker, ctx.modelSurprisal(ctx.greedySample()));
-
-  const count = ctx.getPerplexityCount(tracker);
-  assert(count === 2, `getPerplexityCount() → ${count}`);
-
-  const ppl = ctx.getPerplexity(tracker);
-  assert(ppl >= 1.0, `getPerplexity() → ${ppl.toFixed(2)}`);
-
-  const clonedTracker = ctx.clonePerplexityTracker(tracker);
-  assert(clonedTracker !== tracker, `clonePerplexityTracker() → ${clonedTracker}`);
-
-  ctx.resetPerplexityTracker(clonedTracker);
-  assert(ctx.getPerplexityCount(clonedTracker) === 0, 'resetPerplexityTracker() → count=0');
+  // Branch perplexity — built-in, accumulates through commit()
+  await branch.commit(token1);
+  const { token: token2 } = await branch.produce();
+  await branch.commit(token2);
 
-  ctx.freePerplexityTracker(tracker);
-  ctx.freePerplexityTracker(clonedTracker);
-  ok('freePerplexityTracker() both');
+  const ppl = branch.perplexity;
+  assert(isFinite(ppl) && ppl >= 1.0, `branch.perplexity → ${ppl.toFixed(2)}`);
 
-  let threwOnInvalid = false;
-  try {
-    ctx.getPerplexity(tracker);
-  } catch {
-    threwOnInvalid = true;
-  }
-  assert(threwOnInvalid, 'Invalid handle throws');
+  await branch.prune();
 }
 
 // ═══════════════════════════════════════════════════════════════════════════
@@ -332,15 +317,13 @@ async function testBranchPrefill() {
     const messages = [{ role: 'user', content: turns[0] }];
     const { prompt } = await ctx.formatChat(JSON.stringify(messages));
     const promptToks = await ctx.tokenize(prompt);
-    await ctx.decode(promptToks, 0, 0);
-
-    const branch = Branch.create(ctx, promptToks.length, { temperature: 0 });
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(promptToks);
 
     // Turn 1
     const gen1 = [];
     for (let i = 0; i < GEN_TOKENS; i++) {
-      const { token, isStop } = branch.produce();
+      const { token, isStop } = await branch.produce();
       if (isStop) break;
       await branch.commit(token);
       gen1.push(token);
@@ -371,7 +354,7 @@ async function testBranchPrefill() {
 
       const gen = [];
       for (let i = 0; i < GEN_TOKENS; i++) {
-        const { token, isStop } = branch.produce();
+        const { token, isStop } = await branch.produce();
         if (isStop) break;
         await branch.commit(token);
         gen.push(token);
@@ -411,7 +394,7 @@ async function testWarmMultiTurnRecall() {
     async function generate(branch) {
       const gen = [];
       for (;;) {
-        const { token, isStop } = branch.produce();
+        const { token, isStop } = await branch.produce();
         if (isStop) break;
         await branch.commit(token);
         gen.push(token);
@@ -434,10 +417,8 @@ async function testWarmMultiTurnRecall() {
     const msgs1 = [{ role: 'user', content: 'Hi, my name is Lloyal' }];
     const { prompt, format, reasoningFormat } = await ctx.formatChat(JSON.stringify(msgs1), {});
     const promptToks = await ctx.tokenize(prompt);
-    await ctx.decode(promptToks, 0, 0);
-
-    const branch = Branch.create(ctx, promptToks.length, { temperature: 0 });
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(promptToks);
 
     // Helper: parse output and check content (not reasoning) for a term
     function checkRecall(rawText, term) {
@@ -528,7 +509,7 @@ async function testWarmSemanticRecall() {
 
         const gen = [];
         for (let i = 0; i < GEN_TOKENS; i++) {
-          const { token, isStop } = branch.produce();
+          const { token, isStop } = await branch.produce();
           if (isStop) break;
           await branch.commit(token);
           gen.push(token);
@@ -542,15 +523,13 @@ async function testWarmSemanticRecall() {
       messages.push({ role: 'user', content: 'Remember this: my dog is named Max.' });
       const { prompt } = await ctx.formatChat(JSON.stringify(messages));
       const promptToks = await ctx.tokenize(prompt);
-      await ctx.decode(promptToks, 0, 0);
-
-      branch = Branch.create(ctx, promptToks.length, { temperature: 0 });
-      branch.captureLogits();
+      branch = Branch.create(ctx, 0, { temperature: 0 });
+      await branch.prefill(promptToks);
 
       // Generate turn 1 response
       const gen = [];
       for (let i = 0; i < GEN_TOKENS; i++) {
-        const { token, isStop } = branch.produce();
+        const { token, isStop } = await branch.produce();
         if (isStop) break;
         await branch.commit(token);
         gen.push(token);
@@ -625,11 +604,8 @@ async function testBranchSteer() {
 
   try {
     const tokens = await ctx.tokenize("The quick brown");
-    await ctx.decode(tokens, 0, 0);
-
-    // Use greedy sampling for deterministic tests
-    const branch = Branch.create(ctx, tokens.length, { temperature: 0 });
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(tokens);
 
     // Get the greedy token (what would be sampled without steer)
     const greedyToken = branch.sample();
@@ -669,10 +645,8 @@ async function testBranchSteer() {
 
     // Test fork invariant: steer is NOT cloned on fork
     const tokens2 = await ctx.tokenize("Hello world");
-    await ctx.decode(tokens2, 0, 0);
-
-    const parent = Branch.create(ctx, tokens2.length, { temperature: 0 });
-    parent.captureLogits();
+    const parent = Branch.create(ctx, 0, { temperature: 0 });
+    await parent.prefill(tokens2);
 
     const parentGreedy = parent.sample();
 
@@ -732,17 +706,15 @@ async function testNBatchAblation() {
       const messages = [{ role: 'user', content: "Hello, how are you today?" }];
       const { prompt } = await ctx.formatChat(JSON.stringify(messages));
       const promptToks = await ctx.tokenize(prompt);
-      await ctx.decode(promptToks, 0, 0);
-
-      const branch = Branch.create(ctx, promptToks.length, { temperature: 0 }, nBatch);
-      branch.captureLogits();
+      const branch = Branch.create(ctx, 0, { temperature: 0 }, nBatch);
+      await branch.prefill(promptToks);
 
       const followUp = await ctx.tokenize(" What else?");
       await branch.prefill(followUp);
 
       const gen = [];
       for (let i = 0; i < 5; i++) {
-        const { token, isStop } = branch.produce();
+        const { token, isStop } = await branch.produce();
         if (isStop) break;
         await branch.commit(token);
         gen.push(token);
@@ -820,15 +792,18 @@ async function testDeterminism() {
       const messages = [{ role: 'user', content: prompt }];
       const { prompt: formatted } = await ctx.formatChat(JSON.stringify(messages));
       const tokens = await ctx.tokenize(formatted);
-      await ctx.decode(tokens, 0);
+
+      const branch = Branch.create(ctx, 0, { temperature: 0 });
+      await branch.prefill(tokens);
 
       const gen = [];
       for (let i = 0; i < 20; i++) {
-        const token = ctx.sample({ temperature: 0 });
-        if (ctx.isStopToken(token)) break;
+        const { token, isStop } = await branch.produce();
+        if (isStop) break;
+        await branch.commit(token);
         gen.push(token);
-        await ctx.decode([token], tokens.length + i);
       }
+      await branch.prune();
       return gen.join(',');
     } finally {
       ctx.dispose();
@@ -915,11 +890,11 @@ async function testEmbeddings() {
 }
 
 // ═══════════════════════════════════════════════════════════════════════════
-// ATOMIC DECODE AND CAPTURE
+// BRANCH PREFILL + GET LOGITS (replaces testDecodeAndCapture)
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testDecodeAndCapture() {
-  console.log('\n--- decodeAndCapture ---');
+async function testBranchPrefillAndLogits() {
+  console.log('\n--- Branch prefill + getLogits ---');
 
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
@@ -929,23 +904,23 @@ async function testDecodeAndCapture() {
 
   try {
     const tokens = await ctx.tokenize("Hello");
-    const buffer = new Float32Array(ctx.vocabSize);
-
-    await ctx.decodeAndCapture(tokens, 0, 0, buffer);
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(tokens);
 
+    const logits = branch.getLogits();
     let valid = false;
-    for (let i = 0; i < buffer.length; i++) {
-      if (buffer[i] !== 0 && !isNaN(buffer[i])) valid = true;
+    for (let i = 0; i < logits.length; i++) {
+      if (logits[i] !== 0 && !isNaN(logits[i])) valid = true;
     }
-    assert(valid, `decodeAndCapture() filled buffer with valid logits`);
-
-    // Verify it's a copy
-    const orig = buffer[0];
-    buffer[0] = -999;
-    const ctxLogits = ctx.getLogits();
-    const isCopy = ctxLogits[0] !== -999;
-    buffer[0] = orig;
-    assert(isCopy, 'Captured buffer is independent copy');
+    assert(valid, `branch.prefill() + getLogits() → valid logits`);
+
+    // Branch logits are an independent copy
+    const orig = logits[0];
+    logits[0] = -999;
+    const logits2 = branch.getLogits();
+    assert(logits2[0] !== -999, 'branch.getLogits() returns independent copy');
+
+    await branch.prune();
   } finally {
     ctx.dispose();
   }
@@ -1040,16 +1015,15 @@ async function testBranchStore() {
     // Tests: batched generation loop, perplexity accumulation through accept_token,
     // Branch.perplexity accessor after store ops, reseedSampler diversity.
     {
-      await ctx.decode(promptToks, 0, 0);
-      const root = Branch.create(ctx, promptToks.length, { temperature: 0.8 });
-      root.captureLogits();
+      const root = Branch.create(ctx, 0, { temperature: 0.8 });
+      await root.prefill(promptToks);
       const branches = [root, await root.fork(), await root.fork()];
       branches[1].reseedSampler(42);
       branches[2].reseedSampler(99);
 
       for (let step = 0; step < 10; step++) {
-        const live = branches.map(b => [b, b.produce()])
-          .filter(([, p]) => !p.isStop);
+        const produced = await Promise.all(branches.map(async b => [b, await b.produce()]));
+        const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
         await store.commit(live.map(([b, p]) => [b, p.token]));
       }
@@ -1071,9 +1045,8 @@ async function testBranchStore() {
     // generating with store.commit(). This is the persistence/replay pattern.
     // Tests: prefill→commit lifecycle, metrics across phase transition, getLogits().
     {
-      await ctx.decode(promptToks, 0, 0);
-      const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 });
-      b1.captureLogits();
+      const b1 = Branch.create(ctx, 0, { temperature: 0 });
+      await b1.prefill(promptToks);
       const b2 = await b1.fork();
 
       // Phase 1: Rehydrate from "saved" histories
@@ -1095,8 +1068,8 @@ async function testBranchStore() {
       // Phase 2: Generate continuations
       const gen1 = [], gen2 = [];
       for (let i = 0; i < 5; i++) {
-        const live = [[b1, b1.produce()], [b2, b2.produce()]]
-          .filter(([, p]) => !p.isStop);
+        const produced = [[b1, await b1.produce()], [b2, await b2.produce()]];
+        const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
         await store.commit(live.map(([b, p]) => [b, p.token]));
         for (const [b, p] of live) {
@@ -1121,9 +1094,8 @@ async function testBranchStore() {
     // Verifies Branch.getLogits() returns a Float32Array consumable by the
     // existing metrics API. This tests the JS API surface of the new exposure.
     {
-      await ctx.decode(promptToks, 0, 0);
-      const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 });
-      b1.captureLogits();
+      const b1 = Branch.create(ctx, 0, { temperature: 0 });
+      await b1.prefill(promptToks);
 
       const logits = b1.getLogits();
       assert(logits instanceof Float32Array,
@@ -1134,16 +1106,11 @@ async function testBranchStore() {
       // Feed branch logits into ctx.modelEntropy() — proves the returned
       // buffer is a valid logits distribution consumable by metrics API
       const entropyFromBranch = ctx.modelEntropy("nats", logits);
-      const entropyFromCtx = ctx.modelEntropy("nats");
       assert(isFinite(entropyFromBranch) && entropyFromBranch > 0,
         `getLogits→modelEntropy: ${entropyFromBranch.toFixed(4)} nats`);
 
-      // Branch logits (captured from same decode) should match context logits
-      assert(Math.abs(entropyFromBranch - entropyFromCtx) < 1e-4,
-        `getLogits→modelEntropy: branch=${entropyFromBranch.toFixed(4)} ≈ ctx=${entropyFromCtx.toFixed(4)}`);
-
       // After store.commit, logits change — getLogits() reflects new state
-      const p = b1.produce();
+      const p = await b1.produce();
       assert(!p.isStop, `getLogits: produce() should not hit EOG on first token`);
       await store.commit([[b1, p.token]]);
       const logitsAfter = b1.getLogits();
@@ -1159,15 +1126,14 @@ async function testBranchStore() {
     // Tests: produce() reads from branch snapshot, store.commit() advances state,
     // produce() on next iteration reads from updated snapshot.
     {
-      await ctx.decode(promptToks, 0, 0);
-      const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 });
-      b1.captureLogits();
+      const b1 = Branch.create(ctx, 0, { temperature: 0 });
+      await b1.prefill(promptToks);
       const b2 = await b1.fork();
 
       const output = [];
       for (let i = 0; i < 5; i++) {
         // Inspect with produce() — does NOT advance state
-        const p1 = b1.produce(), p2 = b2.produce();
+        const p1 = await b1.produce(), p2 = await b2.produce();
 
         // Can inspect text and isStop before committing
         assert(typeof p1.text === 'string' && typeof p2.text === 'string',
@@ -1192,15 +1158,14 @@ async function testBranchStore() {
     // Tests: both paths write to the same branch state correctly, no corruption when
     // alternating between decode::one and decode::each on the same sequence.
     {
-      await ctx.decode(promptToks, 0, 0);
-      const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 });
-      b1.captureLogits();
+      const b1 = Branch.create(ctx, 0, { temperature: 0 });
+      await b1.prefill(promptToks);
       const b2 = await b1.fork();
 
       // Step 1-3: single-branch commit (decode::one path)
       for (let i = 0; i < 3; i++) {
-        const live = [[b1, b1.produce()], [b2, b2.produce()]]
-          .filter(([, p]) => !p.isStop);
+        const produced = [[b1, await b1.produce()], [b2, await b2.produce()]];
+        const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
         for (const [b, p] of live) await b.commit(p.token);
       }
@@ -1208,8 +1173,8 @@ async function testBranchStore() {
 
       // Step 4-6: batched commit (decode::each path)
       for (let i = 0; i < 3; i++) {
-        const live = [[b1, b1.produce()], [b2, b2.produce()]]
-          .filter(([, p]) => !p.isStop);
+        const produced = [[b1, await b1.produce()], [b2, await b2.produce()]];
+        const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
         await store.commit(live.map(([b, p]) => [b, p.token]));
       }
@@ -1219,8 +1184,8 @@ async function testBranchStore() {
 
       // Step 7-9: back to single-branch commit
       for (let i = 0; i < 3; i++) {
-        const live = [[b1, b1.produce()], [b2, b2.produce()]]
-          .filter(([, p]) => !p.isStop);
+        const produced = [[b1, await b1.produce()], [b2, await b2.produce()]];
+        const live = produced.filter(([, p]) => !p.isStop);
         if (!live.length) break;
         for (const [b, p] of live) await b.commit(p.token);
       }
@@ -1237,9 +1202,8 @@ async function testBranchStore() {
     // Tests: per-branch EOG filtering, store.commit with shrinking branch set,
     // surviving branch generates correct output after sibling stops.
     {
-      await ctx.decode(promptToks, 0, 0);
-      const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 });
-      b1.captureLogits();
+      const b1 = Branch.create(ctx, 0, { temperature: 0 });
+      await b1.prefill(promptToks);
       const b2 = await b1.fork();
 
       const eog = ctx.getEogToken();
@@ -1253,8 +1217,8 @@ async function testBranchStore() {
         }
 
         const pairs = [
-          ...(!stopped[0] ? [[b1, b1.produce()]] : []),
-          ...(!stopped[1] ? [[b2, b2.produce()]] : []),
+          ...(!stopped[0] ? [[b1, await b1.produce()]] : []),
+          ...(!stopped[1] ? [[b2, await b2.produce()]] : []),
         ];
 
         const live = pairs.filter(([, p]) => !p.isStop);
@@ -1313,13 +1277,11 @@ async function testPplSanity() {
     const messages = [{ role: 'user', content: 'Tell me about the weather.' }];
     const { prompt } = await ctx.formatChat(JSON.stringify(messages));
     const promptToks = await ctx.tokenize(prompt);
-    await ctx.decode(promptToks, 0, 0);
-
-    const branch = Branch.create(ctx, promptToks.length, { temperature: 0 });
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(promptToks);
 
     for (let i = 0; i < 10; i++) {
-      const { token, isStop } = branch.produce();
+      const { token, isStop } = await branch.produce();
       if (isStop) break;
       await branch.commit(token);
     }
@@ -1356,10 +1318,8 @@ async function testCommitRollback() {
 
   try {
     const promptToks = await ctx.tokenize("Hi");
-    await ctx.decode(promptToks, 0, 0);
-
-    const root = Branch.create(ctx, promptToks.length, { temperature: 1.0 });
-    root.captureLogits();
+    const root = Branch.create(ctx, 0, { temperature: 1.0 });
+    await root.prefill(promptToks);
     const branches = [root];
     for (let i = 1; i < 8; i++) {
       const b = await root.fork();
@@ -1375,9 +1335,8 @@ async function testCommitRollback() {
     let successfulRounds = 0;
     let failedRound = false;
     for (let round = 0; round < 50; round++) {
-      const live = branches
-        .map(b => [b, b.produce()])
-        .filter(([, p]) => !p.isStop);
+      const produced = await Promise.all(branches.map(async b => [b, await b.produce()]));
+      const live = produced.filter(([, p]) => !p.isStop);
       if (!live.length) break;
 
       // Snapshot PPL before this round
@@ -1435,13 +1394,11 @@ async function testAsyncRejection() {
 
   try {
     const tokens = await ctx.tokenize("Hello world");
-    await ctx.decode(tokens, 0, 0);
-
-    const branch = Branch.create(ctx, tokens.length, { temperature: 0 });
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(tokens);
 
     // Generate one token to prove branch works
-    const { token, isStop } = branch.produce();
+    const { token, isStop } = await branch.produce();
     assert(!isStop, 'rejection: initial produce succeeds');
     await branch.commit(token);
     const posAfterCommit = branch.position;
@@ -1460,14 +1417,23 @@ async function testAsyncRejection() {
     }
     assert(threwOnCommit, 'rejection: commit on disposed branch throws');
 
-    // produce() on disposed branch
+    // produce() on disposed branch — async version rejects
     let threwOnProduce = false;
     try {
-      branch.produce();
+      await branch.produce();
     } catch (e) {
       threwOnProduce = true;
     }
-    assert(threwOnProduce, 'rejection: produce on disposed branch throws');
+    assert(threwOnProduce, 'rejection: produce on disposed branch rejects');
+
+    // produceSync() on disposed branch — throws synchronously
+    let threwOnProduceSync = false;
+    try {
+      branch.produceSync();
+    } catch (e) {
+      threwOnProduceSync = true;
+    }
+    assert(threwOnProduceSync, 'rejection: produceSync on disposed branch throws');
 
     // fork() on disposed branch
     let threwOnFork = false;
@@ -1508,10 +1474,8 @@ async function testEmptyInputEdgeCases() {
 
   try {
     const tokens = await ctx.tokenize("Hello world");
-    await ctx.decode(tokens, 0, 0);
-
-    const branch = Branch.create(ctx, tokens.length, { temperature: 0 });
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(tokens);
     const store = new BranchStore(ctx);
 
     const posBefore = branch.position;
@@ -1532,7 +1496,7 @@ async function testEmptyInputEdgeCases() {
     ok('branch.prefill([]) resolves');
 
     // Verify branch still works after empty operations
-    const { token, isStop } = branch.produce();
+    const { token, isStop } = await branch.produce();
     assert(!isStop, 'empty edge: produce still works after empty ops');
     await branch.commit(token);
     assert(branch.position === posBefore + 1, 'empty edge: commit advances position after empty ops');
@@ -1572,21 +1536,14 @@ async function testJsonSchemaToGrammar() {
       `jsonSchemaToGrammar: returned ${grammar.length}-char grammar`);
     assert(grammar.includes('root'), 'jsonSchemaToGrammar: grammar contains "root" rule');
 
-    // Use the grammar with createSampler to prove it's valid GBNF
-    const handle = ctx.createSampler(grammar);
-    assert(handle > 0, `jsonSchemaToGrammar: createSampler accepted grammar (handle=${handle})`);
-
-    // Generate tokens with grammar constraint
-    await ctx.kvCacheClear();
+    // Use the grammar with Branch.create to prove it's valid GBNF
     const prompt = await ctx.tokenize("Output JSON: ");
-    await ctx.decode(prompt, 0, 0);
-
-    const branch = Branch.create(ctx, prompt.length, { temperature: 0 }, undefined, grammar);
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar);
+    await branch.prefill(prompt);
 
     const output = [];
     for (let i = 0; i < 50; i++) {
-      const { token, text, isStop } = branch.produce();
+      const { token, text, isStop } = await branch.produce();
       if (isStop) break;
       await branch.commit(token);
       output.push(text);
@@ -1609,7 +1566,6 @@ async function testJsonSchemaToGrammar() {
     }
 
     await branch.prune();
-    ctx.freeSamplerHandle(handle);
 
     // Error path: invalid JSON → promise rejects
     let rejected = false;
@@ -1641,13 +1597,11 @@ async function testDisposedDuringAsync() {
 
   try {
     const tokens = await ctx.tokenize("Test prompt");
-    await ctx.decode(tokens, 0, 0);
-
-    const branch = Branch.create(ctx, tokens.length, { temperature: 0 });
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(tokens);
 
     // Generate one token so branch has state
-    const { token } = branch.produce();
+    const { token } = await branch.produce();
     await branch.commit(token);
 
     // Call prune() — DO NOT await yet
@@ -1656,14 +1610,14 @@ async function testDisposedDuringAsync() {
     // Immediately (before microtask resolves) check disposed
     assert(branch.disposed, 'disposed-during: _disposed is true synchronously after prune() call');
 
-    // produce() should throw synchronously
+    // produceSync() should throw synchronously
     let threwProduce = false;
     try {
-      branch.produce();
+      branch.produceSync();
     } catch {
       threwProduce = true;
     }
-    assert(threwProduce, 'disposed-during: produce() throws before prune promise resolves');
+    assert(threwProduce, 'disposed-during: produceSync() throws before prune promise resolves');
 
     // commit() should throw synchronously (the _ensureNotDisposed guard)
     let threwCommit = false;
@@ -1702,11 +1656,10 @@ async function testAsyncIterator() {
 
   try {
     const prompt = await ctx.tokenize("The quick brown fox");
-    await ctx.decode(prompt, 0, 0);
 
     // Generate to EOG via for-await
-    const branch = Branch.create(ctx, prompt.length, { temperature: 0 });
-    branch.captureLogits();
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(prompt);
 
     const tokens = [];
     for await (const { token, text } of branch) {
@@ -1729,13 +1682,11 @@ async function testAsyncIterator() {
 
     // Compare: iterator output matches produce/commit output (deterministic, temp=0)
     await ctx.kvCacheClear();
-    await ctx.decode(prompt, 0, 0);
-
-    const branchManual = Branch.create(ctx, prompt.length, { temperature: 0 });
-    branchManual.captureLogits();
+    const branchManual = Branch.create(ctx, 0, { temperature: 0 });
+    await branchManual.prefill(prompt);
     const manualTokens = [];
     for (let i = 0; i < 10; i++) {
-      const { token, isStop } = branchManual.produce();
+      const { token, isStop } = await branchManual.produce();
       if (isStop) break;
       await branchManual.commit(token);
       manualTokens.push(token);
@@ -1751,6 +1702,119 @@ async function testAsyncIterator() {
   }
 }
 
+// ═══════════════════════════════════════════════════════════════════════════
+// HOT-SWAP TESTS (setSamplerParams / setGrammar)
+// ═══════════════════════════════════════════════════════════════════════════
+
+async function testSetSamplerParams() {
+  console.log('\n--- setSamplerParams ---');
+
+  const ctx = await addon.createContext({
+    modelPath: MODEL_PATH,
+    nCtx: CTX_SIZE,
+    nThreads: 4,
+  });
+
+  try {
+    const prompt = await ctx.tokenize("The capital of France is");
+
+    // Greedy baseline
+    const greedy = Branch.create(ctx, 0, { temperature: 0, topK: 0, topP: 1.0, minP: 0 });
+    await greedy.prefill(prompt);
+    const greedyTok = greedy.sample();
+    assert(greedyTok >= 0, `setSamplerParams: greedy token valid (${greedyTok})`);
+
+    // Switch to stochastic — at high temp, should eventually diverge
+    greedy.setSamplerParams({ temperature: 1.5, seed: 42, topK: 0, topP: 1.0, minP: 0 });
+    let diverged = false;
+    for (let i = 0; i < 20; i++) {
+      if (greedy.sample() !== greedyTok) { diverged = true; break; }
+    }
+    assert(diverged, 'setSamplerParams: stochastic diverges from greedy');
+
+    // Switch back to greedy — should be deterministic again
+    greedy.setSamplerParams({ temperature: 0, topK: 0, topP: 1.0, minP: 0 });
+    const tok2 = greedy.sample();
+    const tok3 = greedy.sample();
+    assert(tok2 === tok3, `setSamplerParams: greedy restored (${tok2} === ${tok3})`);
+
+    await greedy.prune();
+
+    // Memoization: identical params should not rebuild
+    await ctx.kvCacheClear();
+    const branch = Branch.create(ctx, 0, { temperature: 0.8, seed: 100 });
+    await branch.prefill(prompt);
+    branch.setSamplerParams({ temperature: 0.8, seed: 100 });  // Same — should be no-op
+    assert(!branch.disposed, 'setSamplerParams: memoized no-op does not dispose');
+
+    await branch.prune();
+  } finally {
+    ctx.dispose();
+  }
+}
+
+async function testSetGrammar() {
+  console.log('\n--- setGrammar ---');
+
+  const ctx = await addon.createContext({
+    modelPath: MODEL_PATH,
+    nCtx: CTX_SIZE,
+    nThreads: 4,
+    nSeqMax: 4,
+  });
+
+  try {
+    const grammar = `root ::= "{" ws "}" ws
+ws ::= [ \\t\\n]*`;
+
+    // Hot-swap: create without grammar, then add one
+    const prompt = await ctx.tokenize("Output: ");
+    const branch = Branch.create(ctx, 0, { temperature: 0 });
+    await branch.prefill(prompt);
+
+    branch.setGrammar(grammar);
+    const output = [];
+    for (let i = 0; i < 10; i++) {
+      const { token, text, isStop } = await branch.produce();
+      if (isStop) break;
+      await branch.commit(token);
+      output.push(text);
+    }
+    const result = output.join('');
+    assert(/^\{\s*\}\s*$/.test(result), `setGrammar: hot-swap constrains → "${result}"`);
+
+    // Remove grammar
+    branch.setGrammar('');
+    // Should no longer be constrained (just verify it doesn't throw)
+    const { token } = await branch.produce();
+    assert(typeof token === 'number', 'setGrammar: removal works, sample succeeds');
+
+    await branch.prune();
+
+    // Hot-swap + fork: grammar cloned to child
+    await ctx.kvCacheClear();
+    const root = Branch.create(ctx, 0, { temperature: 0 });
+    await root.prefill(prompt);
+    root.setGrammar(grammar);
+
+    const child = await root.fork();
+    const childOut = [];
+    for (let i = 0; i < 10; i++) {
+      const p = await child.produce();
+      if (p.isStop) break;
+      await child.commit(p.token);
+      childOut.push(p.text);
+    }
+    const childResult = childOut.join('');
+    assert(/^\{\s*\}\s*$/.test(childResult), `setGrammar: fork inherits grammar → "${childResult}"`);
+
+    await child.prune();
+    await root.prune();
+  } finally {
+    ctx.dispose();
+  }
+}
+
 // ═══════════════════════════════════════════════════════════════════════════
 // MAIN
 // ═══════════════════════════════════════════════════════════════════════════
@@ -1783,7 +1847,7 @@ async function main() {
     await testBranchSteer();
     await testNBatchAblation();
     await testDeterminism();
-    await testDecodeAndCapture();
+    await testBranchPrefillAndLogits();
     await testBranchStore();
     await testPplSanity();
     await testCommitRollback();
@@ -1792,6 +1856,8 @@ async function main() {
     await testJsonSchemaToGrammar();
     await testDisposedDuringAsync();
     await testAsyncIterator();
+    await testSetSamplerParams();
+    await testSetGrammar();
     await testEmbeddings();
 
     // Summary

From bdf1b40819bbe71a88ac0f3631a3abb658964a2d Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Fri, 20 Feb 2026 14:47:27 +1100
Subject: [PATCH 2/3] refactor(branch): streamline decode surface across N-API
 and JS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove captureLogits() and decodeAndCaptureOne() from JS; no callers
remain once streaming-tsampler is migrated (last item below).
Remove _branchCaptureLogits, _branchDecodeAndCaptureOne N-API bindings.
Rename _branchDecodeAndCaptureBatch → _branchPrefill through all layers.
Migrate streaming-tsampler from decodeAndCaptureOne to commit().
---
 examples/streaming/streaming-tsampler.mjs |  4 +-
 lib/Branch.js                             | 44 +++--------
 lib/index.d.ts                            | 33 ++------
 liblloyal                                 |  2 +-
 src/SessionContext.cpp                    | 93 +++--------------------
 src/SessionContext.hpp                    |  4 +-
 test/integration.js                       |  4 +-
 7 files changed, 32 insertions(+), 152 deletions(-)

diff --git a/examples/streaming/streaming-tsampler.mjs b/examples/streaming/streaming-tsampler.mjs
index 0d698bf..96cc40e 100644
--- a/examples/streaming/streaming-tsampler.mjs
+++ b/examples/streaming/streaming-tsampler.mjs
@@ -6,7 +6,7 @@
  * - TypeScript sampling via tsampler (TTA pattern)
  * - N-gram tracking to detect sequence repetition
  * - Logit steering to prevent repeated sequences
- * - Branch API for KV management (prefill/decodeAndCaptureOne)
+ * - Branch API for KV management (prefill/commit)
  * - KV cache clear + re-prefill for infinite context
  *
  * The key insight: llama.cpp's token-level penalties degrade prose quality.
@@ -262,7 +262,7 @@ Begin:
 
     // Store and advance KV (no sampler accept — we're using tsampler externally)
     allTokens.push(token);
-    await branch.decodeAndCaptureOne(token);
+    await branch.commit(token);
 
     // Cache full? Reseed at boundary
     if (branch.position >= nCtx) {
diff --git a/lib/Branch.js b/lib/Branch.js
index 08fe4bf..c3b7a7c 100644
--- a/lib/Branch.js
+++ b/lib/Branch.js
@@ -22,7 +22,7 @@
  * @example Best-of-N with perplexity selection
  * ```js
  * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 });
- * root.captureLogits();
+ * await root.prefill(tokens);
  *
  * const results = [];
  * for (let i = 0; i < 5; i++) {
@@ -53,14 +53,14 @@ class Branch {
    * Create a root branch at the given position
    *
    * The branch takes ownership of the sequence and creates its own sampler
-   * chain from the provided params. Call captureLogits() after prefill to
-   * freeze the logit distribution before forking.
+   * chain from the provided params. Call prefill() to decode prompt tokens
+   * and capture the logit distribution before forking.
    *
    * @param {SessionContext} ctx - SessionContext to create branch on
    * @param {number} position - Starting position (typically prompt token count)
    * @param {SamplingParams} [params] - Sampling parameters (temperature, topP, etc.)
    * @param {number} [nBatch] - Per-branch batch size override (defaults to context nBatch).
-   *   Controls chunk size for prefill() (decode_and_capture_batch). Has no effect on
+   *   Controls chunk size for prefill(). Has no effect on
    *   single-token commit() which uses a zero-allocation fast path. Useful for tuning
    *   memory/throughput tradeoff on bulk token decode — e.g. smaller nBatch for cheap
    *   exploration branches, larger for the trunk.
@@ -92,25 +92,13 @@ class Branch {
     return new Branch(this._ctx, newHandle);
   }
 
-  /**
-   * Freeze the current logit distribution into this branch
-   *
-   * Logits are ephemeral — they're overwritten on the next decode() call.
-   * Capturing preserves them so this branch (and any forks from it) can
-   * sample from the same distribution. Essential before fork().
-   */
-  captureLogits() {
-    this._ensureNotDisposed();
-    this._ctx._branchCaptureLogits(this._handle);
-  }
-
   /**
    * Get a copy of this branch's captured logits snapshot
    *
    * Returns n_vocab floats — the raw logit distribution from the last
-   * decode_and_capture or captureLogits() call. Use for distributional
-   * analysis (KL divergence, entropy, top-k overlap) without crossing
-   * the sampling chain.
+   * prefill() or commit() call. Use for distributional analysis
+   * (KL divergence, entropy, top-k overlap) without crossing the
+   * sampling chain.
    *
    * @returns {Float32Array} Copy of the logits snapshot (n_vocab elements)
    * @throws {Error} If no logits have been captured yet
@@ -120,20 +108,6 @@ class Branch {
     return this._ctx._branchGetLogits(this._handle);
   }
 
-  /**
-   * Single-token forward pass with logit snapshot
-   *
-   * Runs one decode step (writing the token's KV entries), advances position,
-   * and captures the resulting logits for the next sample() call.
-   *
-   * @param {number} token - Token to decode
-   * @returns {Promise<void>}
-   */
-  async decodeAndCaptureOne(token) {
-    this._ensureNotDisposed();
-    await this._ctx._branchDecodeAndCaptureOne(this._handle, token);
-  }
-
   /**
    * Bulk-decode tokens into the branch's KV cache and capture logits
    *
@@ -158,7 +132,7 @@ class Branch {
    */
   async prefill(tokens) {
     this._ensureNotDisposed();
-    await this._ctx._branchDecodeAndCaptureBatch(this._handle, tokens);
+    await this._ctx._branchPrefill(this._handle, tokens);
   }
 
   /**
@@ -226,7 +200,7 @@ class Branch {
    * @example
    * ```js
    * const root = Branch.create(ctx, pos, { temperature: 0.9 });
-   * root.captureLogits();
+   * await root.prefill(promptTokens);
    *
    * // Fork and reseed for diversity
    * const branches = [];
diff --git a/lib/index.d.ts b/lib/index.d.ts
index 0b1adcf..07721a8 100644
--- a/lib/index.d.ts
+++ b/lib/index.d.ts
@@ -1382,14 +1382,8 @@ export interface SessionContext {
   /** @internal Fork a branch to a new sequence */
   _branchFork(handle: number): number;
 
-  /** @internal Capture logits into branch's snapshot */
-  _branchCaptureLogits(handle: number): void;
-
-  /** @internal Decode a single token and capture logits */
-  _branchDecodeAndCaptureOne(handle: number, token: number): Promise<void>;
-
   /** @internal Decode multiple tokens in n_batch-sized chunks and capture logits */
-  _branchDecodeAndCaptureBatch(handle: number, tokens: number[]): Promise<void>;
+  _branchPrefill(handle: number, tokens: number[]): Promise<void>;
 
   /** @internal Sample next token from branch's logits snapshot */
   _branchSample(handle: number): number;
@@ -1629,7 +1623,7 @@ export interface Produced {
  * @example Best-of-N with perplexity selection
  * ```typescript
  * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 });
- * root.captureLogits();
+ * await root.prefill(tokens);
  *
  * const results = [];
  * for (let i = 0; i < 5; i++) {
@@ -1651,8 +1645,8 @@ export class Branch {
    * Create a root branch at the given position
    *
    * The branch takes ownership of the sequence and creates its own sampler
-   * chain from the provided params. Call captureLogits() after prefill to
-   * freeze the logit distribution before forking.
+   * chain from the provided params. Call prefill() to decode prompt tokens
+   * and capture the logit distribution before forking.
    *
    * @param ctx SessionContext to create branch on
    * @param position Starting position (typically prompt token count)
@@ -1681,14 +1675,11 @@ export class Branch {
    */
   fork(): Promise<Branch>;
 
-  /** Freeze the current logit distribution into this branch. Essential before fork(). */
-  captureLogits(): void;
-
   /**
    * Get a copy of this branch's captured logits snapshot.
    *
    * Returns n_vocab floats — the raw logit distribution from the last
-   * decode_and_capture or captureLogits() call.
+   * prefill() or commit() call.
    *
    * Unlike {@link SessionContext.getLogits} (zero-copy view into shared
    * model memory, invalidated by next decode), this returns an independent
@@ -1701,20 +1692,6 @@ export class Branch {
    */
   getLogits(): Float32Array;
 
-  /**
-   * Single-token forward pass with logit snapshot
-   *
-   * Runs one decode step (writing the token's KV entries), advances position,
-   * and captures the resulting logits for the next sample()/produce() call.
-   *
-   * Lower-level than {@link commit} — does NOT accept into the sampler penalty
-   * window. Use commit() for normal generation; use this when you need decode +
-   * capture without repeat-penalty tracking.
-   *
-   * @param token Token to decode
-   */
-  decodeAndCaptureOne(token: number): Promise<void>;
-
   /**
    * Bulk-decode tokens into the branch's KV cache and capture logits.
    *
diff --git a/liblloyal b/liblloyal
index 557c4ef..b0a30f6 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit 557c4ef6c7f88824c6fdbc029ad9e9b8bea4f73d
+Subproject commit b0a30f6bf9ad313fcb3a4d03fb58cc3b34934f7f
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index b812a76..baf945b 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -556,50 +556,21 @@ class FormatChatWorker : public Napi::AsyncWorker {
 // ===== BRANCH / STORE / DECODE ASYNC WORKERS =====
 
 /**
- * AsyncWorker for single-token branch decode + logits capture
- * Wraps lloyal::branch::decode_and_capture_one on libuv pool thread
+ * AsyncWorker for bulk branch decode + logits capture (prompt prefill)
+ * Wraps lloyal::branch::prefill on libuv pool thread
  */
-class BranchDecodeAndCaptureOneWorker : public Napi::AsyncWorker {
+class BranchPrefillWorker : public Napi::AsyncWorker {
 public:
-  BranchDecodeAndCaptureOneWorker(Napi::Env env,
-                                   lloyal::branch::BranchStore& store,
-                                   lloyal::branch::BranchHandle handle,
-                                   llama_token token)
-    : AsyncWorker(env), _deferred(env), _store(store), _handle(handle), _token(token) {}
-
-  void Execute() override {
-    try {
-      lloyal::branch::decode_and_capture_one(_handle, _token, _store);
-    } catch (const std::exception& e) { SetError(e.what()); }
-  }
-
-  void OnOK() override { _deferred.Resolve(Env().Undefined()); }
-  void OnError(const Napi::Error& err) override { _deferred.Reject(err.Value()); }
-  Napi::Promise GetPromise() { return _deferred.Promise(); }
-
-private:
-  Napi::Promise::Deferred _deferred;
-  lloyal::branch::BranchStore& _store;
-  lloyal::branch::BranchHandle _handle;
-  llama_token _token;
-};
-
-/**
- * AsyncWorker for bulk branch decode + logits capture
- * Wraps lloyal::branch::decode_and_capture_batch on libuv pool thread
- */
-class BranchDecodeAndCaptureBatchWorker : public Napi::AsyncWorker {
-public:
-  BranchDecodeAndCaptureBatchWorker(Napi::Env env,
-                                     lloyal::branch::BranchStore& store,
-                                     lloyal::branch::BranchHandle handle,
-                                     std::vector<llama_token> tokens)
+  BranchPrefillWorker(Napi::Env env,
+                      lloyal::branch::BranchStore& store,
+                      lloyal::branch::BranchHandle handle,
+                      std::vector<llama_token> tokens)
     : AsyncWorker(env), _deferred(env), _store(store), _handle(handle),
       _tokens(std::move(tokens)) {}
 
   void Execute() override {
     try {
-      lloyal::branch::decode_and_capture_batch(_handle, _tokens.data(), _tokens.size(), _store);
+      lloyal::branch::prefill(_handle, _tokens.data(), _tokens.size(), _store);
     } catch (const std::exception& e) { SetError(e.what()); }
   }
 
@@ -814,9 +785,7 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     // ===== BRANCH API (internal, wrapped by lib/Branch.ts) =====
     InstanceMethod("_branchCreate", &SessionContext::_branchCreate),
     InstanceMethod("_branchFork", &SessionContext::_branchFork),
-    InstanceMethod("_branchCaptureLogits", &SessionContext::_branchCaptureLogits),
-    InstanceMethod("_branchDecodeAndCaptureOne", &SessionContext::_branchDecodeAndCaptureOne),
-    InstanceMethod("_branchDecodeAndCaptureBatch", &SessionContext::_branchDecodeAndCaptureBatch),
+    InstanceMethod("_branchPrefill", &SessionContext::_branchPrefill),
     InstanceMethod("_branchSample", &SessionContext::_branchSample),
     InstanceMethod("_branchAccept", &SessionContext::_branchAccept),
     InstanceMethod("_branchGetPosition", &SessionContext::_branchGetPosition),
@@ -1881,52 +1850,14 @@ Napi::Value SessionContext::_branchFork(const Napi::CallbackInfo& info) {
   return Napi::Number::New(env, newHandle);
 }
 
-Napi::Value SessionContext::_branchCaptureLogits(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 1) {
-    throw Napi::Error::New(env, "_branchCaptureLogits requires (handle)");
-  }
-
-  auto handle = static_cast<lloyal::branch::BranchHandle>(info[0].As<Napi::Number>().Uint32Value());
-  lloyal::branch::capture_logits(handle, _branchStore);
-
-  return env.Undefined();
-}
-
-Napi::Value SessionContext::_branchDecodeAndCaptureOne(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  if (info.Length() < 2) {
-    throw Napi::Error::New(env, "_branchDecodeAndCaptureOne requires (handle, token)");
-  }
-
-  auto handle = static_cast<lloyal::branch::BranchHandle>(info[0].As<Napi::Number>().Uint32Value());
-  auto token = static_cast<llama_token>(info[1].As<Napi::Number>().Int32Value());
-
-  auto* worker = new BranchDecodeAndCaptureOneWorker(env, _branchStore, handle, token);
-  worker->Queue();
-  return worker->GetPromise();
-}
-
 // Bulk-decode tokens into a branch's KV cache and capture final logits.
-//
-// tokens.size() is the total token count (n_tokens).  The branch's n_batch
-// (set at Branch.create via the nBatch parameter, stored on BranchState)
-// controls the chunk size — decode_and_capture_batch passes both to
-// decoder::decode_tokens which loops: min(n_tokens - processed, n_batch)
-// tokens per llama_decode call.
-//
-// Does NOT accept tokens into the sampler's penalty window.
 // Wrapped by Branch.prefill() on the JS side.
-Napi::Value SessionContext::_branchDecodeAndCaptureBatch(const Napi::CallbackInfo& info) {
+Napi::Value SessionContext::_branchPrefill(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
 
   if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsArray()) {
-    throw Napi::Error::New(env, "_branchDecodeAndCaptureBatch requires (handle, tokens[])");
+    throw Napi::Error::New(env, "_branchPrefill requires (handle, tokens[])");
   }
 
   auto handle = static_cast<lloyal::branch::BranchHandle>(info[0].As<Napi::Number>().Uint32Value());
@@ -1944,7 +1875,7 @@ Napi::Value SessionContext::_branchDecodeAndCaptureBatch(const Napi::CallbackInf
     return deferred.Promise();
   }
 
-  auto* worker = new BranchDecodeAndCaptureBatchWorker(env, _branchStore, handle, std::move(tokens));
+  auto* worker = new BranchPrefillWorker(env, _branchStore, handle, std::move(tokens));
   worker->Queue();
   return worker->GetPromise();
 }
diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp
index 3ab159d..254065e 100644
--- a/src/SessionContext.hpp
+++ b/src/SessionContext.hpp
@@ -256,9 +256,7 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
 
   Napi::Value _branchCreate(const Napi::CallbackInfo& info);
   Napi::Value _branchFork(const Napi::CallbackInfo& info);
-  Napi::Value _branchCaptureLogits(const Napi::CallbackInfo& info);
-  Napi::Value _branchDecodeAndCaptureOne(const Napi::CallbackInfo& info);
-  Napi::Value _branchDecodeAndCaptureBatch(const Napi::CallbackInfo& info);
+  Napi::Value _branchPrefill(const Napi::CallbackInfo& info);
   Napi::Value _branchSample(const Napi::CallbackInfo& info);
   Napi::Value _branchAccept(const Napi::CallbackInfo& info);
   Napi::Value _branchGetPosition(const Napi::CallbackInfo& info);
diff --git a/test/integration.js b/test/integration.js
index e276b2c..ef52b37 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -1444,10 +1444,10 @@ async function testAsyncRejection() {
     }
     assert(threwOnFork, 'rejection: fork on disposed branch throws');
 
-    // Native AsyncWorker rejection: call _branchDecodeAndCaptureOne with invalid handle (0)
+    // Native AsyncWorker rejection: call _branchPrefill with invalid handle (0)
     let nativeRejected = false;
     try {
-      await ctx._branchDecodeAndCaptureOne(0, token);
+      await ctx._branchPrefill(0, [token]);
     } catch (e) {
       nativeRejected = true;
       assert(e instanceof Error, `rejection: native rejection is Error: ${e.constructor.name}`);

From d7b7ed98f53824e48c402aaf349e9086d1f38d77 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Fri, 20 Feb 2026 14:49:48 +1100
Subject: [PATCH 3/3] chore(deps): lock file update

---
 package-lock.json | 218 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 174 insertions(+), 44 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index c0dc766..4af2f6d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -51,29 +51,6 @@
         "@shikijs/vscode-textmate": "^10.0.2"
       }
     },
-    "node_modules/@isaacs/balanced-match": {
-      "version": "4.0.1",
-      "resolved": "https://registry.npmjs.org/@isaacs/balanced-match/-/balanced-match-4.0.1.tgz",
-      "integrity": "sha512-yzMTt9lEb8Gv7zRioUilSglI0c0smZ9k5D65677DLWLtWJaXIS3CqcGyUFByYKlnUj6TkjLVs54fBl6+TiGQDQ==",
-      "dev": true,
-      "license": "MIT",
-      "engines": {
-        "node": "20 || >=22"
-      }
-    },
-    "node_modules/@isaacs/brace-expansion": {
-      "version": "5.0.1",
-      "resolved": "https://registry.npmjs.org/@isaacs/brace-expansion/-/brace-expansion-5.0.1.tgz",
-      "integrity": "sha512-WMz71T1JS624nWj2n2fnYAuPovhv7EUhk69R6i9dsVyzxt5eM3bjwvgk9L+APE1TRscGysAVMANkB0jh0LQZrQ==",
-      "dev": true,
-      "license": "MIT",
-      "dependencies": {
-        "@isaacs/balanced-match": "^4.0.1"
-      },
-      "engines": {
-        "node": "20 || >=22"
-      }
-    },
     "node_modules/@isaacs/cliui": {
       "version": "8.0.2",
       "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz",
@@ -106,43 +83,173 @@
       }
     },
     "node_modules/@lloyal-labs/lloyal.node-darwin-arm64": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-arm64/-/lloyal.node-darwin-arm64-1.5.0.tgz",
+      "integrity": "sha512-TZlQhkt14RQLmhCPGgu2WtZ/gC8Z0tvzu/gVUcNqsBTwepaUlyAdAuZCNnlpjGCwSV/XTvlZQMKfNKDyXoMQbQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-darwin-x64": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-x64/-/lloyal.node-darwin-x64-1.5.0.tgz",
+      "integrity": "sha512-0KJmT3vbrPm8HojFfu+tn433gTVF/x2vHdzi+kRGSvbI81pjzadd/pW4Qweo5NmgSfAEp2a1FTT4gXdLfAfxwQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64/-/lloyal.node-linux-arm64-1.5.0.tgz",
+      "integrity": "sha512-QZPknVyCNXBF+Ed+YNBcAQgNquznegy8Q3A//Il4NsXeLrH4ZW0orygV7/sTC9z0eUKi2EFBwOVnhNbMsv66Cw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64-cuda": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64-cuda/-/lloyal.node-linux-arm64-cuda-1.5.0.tgz",
+      "integrity": "sha512-aQr2MK2V2PgTY8msfrADqQY4Ymgn7sFddLnSmLNj/8poBqT+Tj8AQXoErcbXZZ1nzrGXpk3WUpF91dcRk/X/CQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64-vulkan": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64-vulkan/-/lloyal.node-linux-arm64-vulkan-1.5.0.tgz",
+      "integrity": "sha512-dxp7lpelVd7cDV+nCwRB6F3iQ7JV6Pyh5BNZwEMXpmiaBlC/1TycOXgGrdZFnwTy/Tu/GYzKYOMu4b1ferQZ8Q==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64/-/lloyal.node-linux-x64-1.5.0.tgz",
+      "integrity": "sha512-YVlvw1YNLTDMthHhJAi2MRkjMvQsbHgyzzHu0JRuqkmq0slLis/hL1V6qYFA1W0Y5lxJ/jnHagKkDg3ibtCDcA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64-cuda": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64-cuda/-/lloyal.node-linux-x64-cuda-1.5.0.tgz",
+      "integrity": "sha512-7tYHwe8wZ1U5LsJ153OR9tzYa1tio9doNBYhdJ+qKalOkRSdFSOBG67mVaYShWF9yt14GScoM3OuA1OrgJHK5Q==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64-vulkan": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64-vulkan/-/lloyal.node-linux-x64-vulkan-1.5.0.tgz",
+      "integrity": "sha512-vhwzN+xXdN3CDTqdgY5/awv0FxRXJtFlrwpQNH2UDS2IZucPuR/kFa27F0LLPHxTnvbeVnlBkHPg8KUsvrWRMQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-arm64": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-arm64/-/lloyal.node-win32-arm64-1.5.0.tgz",
+      "integrity": "sha512-eAopZwOQEKO5433KC3o3cG2lMno7hSDJ3ZcwedgNofbk47Jcg+IWUS6bJWKKEzlrxumzOdGCnD+Tm71M/6cFyQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-arm64-vulkan": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-arm64-vulkan/-/lloyal.node-win32-arm64-vulkan-1.5.0.tgz",
+      "integrity": "sha512-Dm/nCj+ygSpwHyqi+9h6hxtrQ2J1Jq9AFVPwycFUA5EHKItsMlBLuY2P7Hw1i1cjuwhBPW+xyE8Q7b4R5mrV3g==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64/-/lloyal.node-win32-x64-1.5.0.tgz",
+      "integrity": "sha512-ccmXLIgtmtGkfe7LK5Vb5dyj7//9umCFAtci750tUGNLL/n+t2Yw2SnyxLwQF8e5GKg+ASfT9yQ3fejP2gu0ag==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64-cuda": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64-cuda/-/lloyal.node-win32-x64-cuda-1.5.0.tgz",
+      "integrity": "sha512-udd/dMPq6Bedoekrch3DHqN+KX5u93Essknd2g8kwgqrEv49Petoky4SBdAEDRNcj4ckTch3IwC/NDQmvvOBLQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64-vulkan": {
-      "optional": true
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64-vulkan/-/lloyal.node-win32-x64-vulkan-1.5.0.tgz",
+      "integrity": "sha512-tCmiI/zvNwN48WxvuEOQLcdJECq6+Ae4lKwQHSeXCmgggob4wp51xGnKFrk2l/YNwFIS6p3Bw5pZA9gJ3SJMMQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/tsampler": {
       "version": "0.2.0",
@@ -689,21 +796,44 @@
       "license": "MIT"
     },
     "node_modules/minimatch": {
-      "version": "10.1.1",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.1.1.tgz",
-      "integrity": "sha512-enIvLvRAFZYXJzkCYG5RKmPfrFArdLv+R+lbQ53BmIMLIry74bjKzX6iHAm8WYamJkhSSEabrWN5D97XnKObjQ==",
+      "version": "10.2.2",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.2.tgz",
+      "integrity": "sha512-+G4CpNBxa5MprY+04MbgOw1v7So6n5JY166pFi9KfYwT78fxScCeSNQSNzp6dpPSW2rONOps6Ocam1wFhCgoVw==",
       "dev": true,
       "license": "BlueOak-1.0.0",
       "dependencies": {
-        "@isaacs/brace-expansion": "^5.0.0"
+        "brace-expansion": "^5.0.2"
       },
       "engines": {
-        "node": "20 || >=22"
+        "node": "18 || 20 || >=22"
       },
       "funding": {
         "url": "https://github.com/sponsors/isaacs"
       }
     },
+    "node_modules/minimatch/node_modules/balanced-match": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-4.0.3.tgz",
+      "integrity": "sha512-1pHv8LX9CpKut1Zp4EXey7Z8OfH11ONNH6Dhi2WDUt31VVZFXZzKwXcysBgqSumFCmR+0dqjMK5v5JiFHzi0+g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": "20 || >=22"
+      }
+    },
+    "node_modules/minimatch/node_modules/brace-expansion": {
+      "version": "5.0.2",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.2.tgz",
+      "integrity": "sha512-Pdk8c9poy+YhOgVWw1JNN22/HcivgKWwpxKq04M/jTmHyCZn12WPJebZxdjSa5TmBqISrUSgNYU3eRORljfCCw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "balanced-match": "^4.0.2"
+      },
+      "engines": {
+        "node": "20 || >=22"
+      }
+    },
     "node_modules/minimist": {
       "version": "1.2.8",
       "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
@@ -1021,9 +1151,9 @@
       }
     },
     "node_modules/typedoc": {
-      "version": "0.28.16",
-      "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.28.16.tgz",
-      "integrity": "sha512-x4xW77QC3i5DUFMBp0qjukOTnr/sSg+oEs86nB3LjDslvAmwe/PUGDWbe3GrIqt59oTqoXK5GRK9tAa0sYMiog==",
+      "version": "0.28.17",
+      "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.28.17.tgz",
+      "integrity": "sha512-ZkJ2G7mZrbxrKxinTQMjFqsCoYY6a5Luwv2GKbTnBCEgV2ihYm5CflA9JnJAwH0pZWavqfYxmDkFHPt4yx2oDQ==",
       "dev": true,
       "license": "Apache-2.0",
       "dependencies": {