12 changes: 6 additions & 6 deletions README.md
@@ -22,10 +22,9 @@ const store = new BranchStore(ctx);

// Shared prompt: "Explain quantum entanglement"
const prompt = await ctx.tokenize("Explain quantum entanglement");
await ctx.decode(prompt, 0, 0);

const root = Branch.create(ctx, prompt.length, { temperature: 0.8 });
root.captureLogits();
const root = Branch.create(ctx, 0, { temperature: 0.8 });
await root.prefill(prompt);

// Fork 4 branches — each gets a different reasoning prefix
const analogy = await root.fork();
@@ -206,11 +205,12 @@ For fine-grained control without Branch:

```javascript
const grammar = await ctx.jsonSchemaToGrammar(schema);
const handle = ctx.createSampler(grammar);
// Pull loop — consumer controls pace, can branch at any point
const branch = Branch.create(ctx, 0, params, undefined, grammar);
await branch.prefill(promptTokens);
// Grammar state cloned automatically on fork()
```

See [`examples/grammar/`](./examples/grammar/) for the full pull loop pattern.
See [`examples/grammar/`](./examples/grammar/) for the full branch fork pattern.

---

25 changes: 11 additions & 14 deletions examples/best-of-n/best-of-n.mjs
@@ -11,10 +11,10 @@
* See: Stiennon et al. 2020 "Learning to summarize from human feedback"
*
* KEY IMPLEMENTATION DETAIL:
* Uses the Branch API for parallel generation. After prefilling the prompt,
* we create a root branch and call captureLogits(). When forking to multiple
* candidates, each fork inherits the root's logits snapshot, ensuring all
* candidates start from the same probability distribution.
* Uses the Branch API for parallel generation. The root branch prefills the
* prompt and captures logits. When forking to multiple candidates, each fork
* inherits the root's logits snapshot, ensuring all candidates start from
* the same probability distribution.
*
* Usage:
* node best-of-n.mjs [model-path] # Human-readable output
@@ -92,21 +92,18 @@ async function main() {
console.log(`\nPrompt: "${userPrompt}"`);
}

// Prefill prompt
// Prefill prompt via root branch
const promptTokens = await ctx.tokenize(prompt);
await ctx.decode(promptTokens, 0, 0);

if (!jsonlMode) {
console.log(`\nPrefill complete. Prompt length: ${promptTokens.length} tokens`);
}

// CRITICAL: Create root branch IMMEDIATELY after prefill to capture logits
// The root branch stores a snapshot of the logits for fork operations
const root = Branch.create(ctx, promptTokens.length, {
const root = Branch.create(ctx, 0, {
temperature: HIGH_TEMP,
topP: 0.95,
});
root.captureLogits();
await root.prefill(promptTokens);

if (!jsonlMode) {
console.log(`\nPrefill complete. Prompt length: ${promptTokens.length} tokens`);
}

// === Baseline: Single generation with low temperature ===
if (!jsonlMode) {
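The logits-snapshot behavior described in the header comment above can be modeled with a toy class (`ToyBranch` here is illustrative only; the real Branch manages native buffers):

```javascript
// Toy model of fork-time logits inheritance: each fork copies the parent's
// snapshot, so all candidates start from the same distribution, and later
// mutations to one fork cannot leak into its siblings.
class ToyBranch {
  constructor(logits) { this.logits = logits; }
  fork() { return new ToyBranch(Float32Array.from(this.logits)); }
}

const root = new ToyBranch(Float32Array.of(0.1, 0.7, 0.2));
const candidates = [root.fork(), root.fork(), root.fork()];

candidates[0].logits[1] = 0; // one candidate diverges...
// ...but root and the other candidates still hold the original snapshot
```

This is why the new code can drop the explicit `captureLogits()` call: the snapshot travels with the root branch and is copied as part of `fork()`.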
40 changes: 18 additions & 22 deletions examples/chat/README.md
@@ -14,37 +14,33 @@ npm run example -- /path/to/model.gguf # custom model
- `/clear` - Reset conversation and clear terminal
- `/quit` - Exit

## The Pattern: Sync Produce, Async Commit
## The Pattern: Branch Produce/Commit

```javascript
// Sync generator - all operations are synchronous
function* produceTokens(ctx, params) {
while (true) {
const tokenId = ctx.sample(params); // sync
if (ctx.isStopToken(tokenId)) return; // sync
const text = ctx.tokenToText(tokenId); // sync
yield { text, tokenId };
}
}
// Create branch and prefill prompt
const branch = Branch.create(ctx, 0, { temperature: 0.7 });
await branch.prefill(promptTokens);

// Usage - async commit is explicit in caller's loop
for (const { text, tokenId } of produceTokens(ctx, params)) {
// Async iterator - commit-before-yield
for await (const { token, text } of branch) {
process.stdout.write(text);
await ctx.decode([tokenId], position); // async commit to KV
position += 1;
}
await branch.prune();
```

**Key insight:** Token production is synchronous. Only the KV cache commit (`decode`) is async. This separation makes the control flow explicit.
**Key insight:** The async iterator handles produce/commit internally. Each yielded token is already committed to KV. Breaking out is clean — no orphaned state.
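The commit-before-yield contract can be shown with a toy stand-in for the branch iterator (`ToyBranch` below is illustrative only, not the library's class):

```javascript
// Toy sketch of commit-before-yield: the iterator commits each token to
// the (mock) KV store *before* yielding it, so a consumer that breaks out
// mid-loop never leaves an uncommitted token behind.
class ToyBranch {
  constructor(tokens) {
    this.tokens = tokens; // tokens the "model" will emit
    this.kv = [];         // stand-in for committed KV entries
  }
  async *[Symbol.asyncIterator]() {
    for (const token of this.tokens) {
      this.kv.push(token); // commit first...
      yield { token };     // ...then hand it to the consumer
    }
  }
}

const branch = new ToyBranch([10, 20, 30, 40]);
const seen = [];
for await (const { token } of branch) {
  seen.push(token);
  if (token === 20) break; // clean early exit
}
// Everything the consumer saw is already committed: seen mirrors branch.kv
```

Breaking at token 20 leaves both `seen` and the mock KV store at `[10, 20]`, which is the "no orphaned state" property the real iterator guarantees.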

## API Reference

| Method | Sync/Async | Purpose |
|--------|------------|---------|
| `sample(params)` | sync | Sample next token from logits |
| `isStopToken(id)` | sync | Check if token ends generation |
| `tokenToText(id)` | sync | Convert token ID to text |
| `decode(tokens, pos)` | async | Commit tokens to KV cache |
| `tokenize(text)` | async | Convert text to token IDs |
| `formatChat(json)` | async | Apply chat template |
| `kvCacheClear()` | async | Reset KV cache |
| `Branch.create(ctx, pos, params)` | sync | Create a branch for generation |
| `branch.prefill(tokens)` | async | Feed tokens into branch's KV cache |
| `branch.produce()` | async | Sample next token (no KV write) |
| `branch.commit(token)` | async | Accept + decode into KV |
| `branch.prune()` | async | Discard branch and its KV entries |
| `ctx.isStopToken(id)` | sync | Check if token ends generation |
| `ctx.tokenToText(id)` | sync | Convert token ID to text |
| `ctx.tokenize(text)` | async | Convert text to token IDs |
| `ctx.formatChat(json)` | async | Apply chat template |
| `ctx.kvCacheClear()` | async | Reset KV cache |
7 changes: 3 additions & 4 deletions examples/chat/chat.mjs
@@ -82,16 +82,15 @@ async function main() {
messages.push({ role: "user", content: trimmed });

if (!branch) {
// === COLD (position === 0): full format → tokenize with BOS → decode ===
// === COLD (position === 0): full format → tokenize with BOS → prefill ===
fmt = await ctx.formatChat(JSON.stringify(messages));
const tokens = await ctx.tokenize(fmt.prompt);
await ctx.decode(tokens, 0, 0);
branch = Branch.create(ctx, tokens.length, {
branch = Branch.create(ctx, 0, {
temperature: 0.7,
topK: 40,
topP: 0.9,
});
branch.captureLogits();
await branch.prefill(tokens);
} else {
// === WARM (position > 0): format only the new message ===
fmt = await ctx.formatChat(
39 changes: 21 additions & 18 deletions examples/entropy/entropy.mjs
@@ -10,6 +10,8 @@
* - EDT formula: T = T₀ · N^(θ/Entropy)
* - Side-by-side comparison with fixed temperature
* - Different prompt types: factual, creative, mixed
* - Branch API for token generation (produce/commit loop)
 *
* Usage:
* node entropy.mjs [model-path] # Human-readable output
@@ -18,7 +20,7 @@

import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import { createContext } from '../../lib/index.js';
import { createContext, Branch } from '../../lib/index.js';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const DEFAULT_MODEL = path.resolve(
@@ -53,47 +55,48 @@ function edtTemperature(entropy) {

/**
* Generate with a specific sampling strategy
*
* Uses Branch API with per-token setSamplerParams() for EDT adaptation.
* Each token gets a temperature computed from the current logit entropy.
*/
async function generate(ctx, prompt, strategy, strategyName, maxTokens = 50) {
const messages = [{ role: 'user', content: prompt }];
const { prompt: formatted } = await ctx.formatChat(JSON.stringify(messages));

const tokens = await ctx.tokenize(formatted);
await ctx.decode(tokens, 0, 0);

const baseTemp = strategy === 'edt' ? 0.8 : strategy;
const branch = Branch.create(ctx, 0, { temperature: baseTemp, topP: 0.9 });
await branch.prefill(tokens);

const output = [];
const temps = [];
const entropies = [];
let pos = tokens.length;

for (let i = 0; i < maxTokens; i++) {
const entropy = ctx.modelEntropy('nats');
const branchLogits = branch.getLogits();
const entropy = ctx.modelEntropy('nats', branchLogits);
entropies.push(entropy);

let temp;
if (strategy === 'edt') {
temp = edtTemperature(entropy);
} else {
temp = strategy; // Fixed temperature
}
const temp = strategy === 'edt' ? edtTemperature(entropy) : strategy;
temps.push(temp);

const token = ctx.sample({ temperature: temp });
if (ctx.isStopToken(token)) break;
if (strategy === 'edt') branch.setSamplerParams({ temperature: temp, topP: 0.9 });

const { token, isStop } = await branch.produce();
if (isStop) break;

const text = ctx.tokenToText(token);
emit('token', { strategy: strategyName, token, text, entropy, temp });

output.push(token);
await ctx.decode([token], pos++, 0);
await branch.commit(token);
}

// Clear KV cache for next run
await ctx.kvCacheClear();
await branch.prune();

const text = await ctx.detokenize(output);
const avgEntropy = entropies.reduce((a, b) => a + b, 0) / entropies.length;
const avgTemp = temps.reduce((a, b) => a + b, 0) / temps.length;
const avgEntropy = entropies.length > 0 ? entropies.reduce((a, b) => a + b, 0) / entropies.length : 0;
const avgTemp = temps.length > 0 ? temps.reduce((a, b) => a + b, 0) / temps.length : 0;

return { text, avgEntropy, avgTemp, tokenCount: output.length, temps, entropies };
}
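The EDT formula the entropy example's header cites (T = T₀ · N^(θ/Entropy)) can be sketched on its own; the constants below (T₀ = 0.8, N = 0.8, θ = 0.5) are illustrative, not necessarily the values entropy.mjs uses:

```javascript
// Standalone sketch of entropy-dependent temperature (EDT):
//   T = T0 * N^(theta / H)
// With N in (0, 1], high entropy H drives the exponent toward 0 and T
// toward T0; low entropy drives T toward 0 (near-greedy sampling).
const T0 = 0.8;    // base temperature (illustrative)
const N = 0.8;     // scaling base in (0, 1] (illustrative)
const THETA = 0.5; // sensitivity (illustrative)

function edtTemperature(entropy) {
  const h = Math.max(entropy, 1e-6); // guard against divide-by-zero
  return T0 * N ** (THETA / h);
}
```

Under these constants, a confident step (entropy 0.1 nats) samples near-greedily, while an uncertain step (entropy 2 nats) stays close to the base temperature.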
122 changes: 37 additions & 85 deletions examples/grammar/README.md
@@ -1,6 +1,6 @@
# Grammar-Constrained Generation with Pull Loop
# Grammar-Constrained Generation with Branch Forking

Demonstrates generator-based token streaming with grammar constraints and forkable state.
Demonstrates grammar-constrained generation using the Branch API with automatic grammar cloning on fork.

## Run It

@@ -17,109 +17,61 @@ Generating until "city" field...
"age": 30,
"city":

Saving KV cache and grammar state at branch point...

Exploring 3 city branches:
Forking into 3 branches at branch point...

[NYC branch]: { "name": "John Doe", "age": 30, "city": "Seattle" }
[LA branch]: { "name": "John Doe", "age": 30, "city": "Chicago" }
[Chicago branch]: { "name": "John Doe", "age": 30, "city": "LA" }
```

## The Pull Loop Pattern
## The Branch Fork Pattern

This example uses a **pull loop** via JS generators. The consumer requests tokens one at a time and decides when to stop:
Grammar state is integrated into the branch and cloned automatically on fork:

```javascript
function* tokenGenerator(ctx, grammarHandle, maxTokens = 100) {
for (let i = 0; i < maxTokens; i++) {
const logits = ctx.getLogits();
ctx.applySampler(grammarHandle, logits);

const token = ctx.sample({ temperature: 0.7 });
if (ctx.isStopToken(token)) return;

ctx.acceptSamplerToken(grammarHandle, token);

// Yield control back to caller
yield { token, text: ctx.tokenToText(token) };
}
// Create root branch with grammar constraint
const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema));
const root = Branch.create(ctx, 0, params, undefined, grammar);
await root.prefill(promptTokens);

// Generate until branch point
for (let i = 0; i < 100; i++) {
const { token, text, isStop } = await root.produce();
if (isStop) break;
await root.commit(token);
if (accumulated.includes('"city"')) break;
}
```

Consumer decides when to continue:
// Fork — grammar state cloned automatically
for (const city of cities) {
const child = await root.fork();
child.reseedSampler(seed++);

```javascript
for (const { token, text } of gen) {
accumulated += text;
await ctx.decode([token], pos++, 0);

// Stop at decision point - generator pauses here
if (accumulated.includes('"city"')) {
break; // Generator stays paused, state preserved
for await (const { text } of child) {
// Each branch generates independently with its own grammar state
}
await child.prune();
}
await root.prune();
```

## Why Pull Loop Here?
## Why Branch Fork Here?

For this branching use case, pull made the code simpler:

```javascript
// Stop when we see the branch point - just break
for (const { token, text } of gen) {
accumulated += text;
if (accumulated.includes('"city"')) break;
}
// Generator paused mid-iteration, grammar state intact
// Now save and branch
```
For grammar-constrained branching, fork handles everything atomically:
- **KV cache**: Shared prefix, divergent-only storage per branch
- **Grammar state**: Parser position cloned automatically
- **Sampler chain**: Penalties and PRNG cloned and reseeded

With a push loop you'd need callbacks or flags to signal "stop here" - doable, but the control flow is inverted. Pull keeps the branching logic linear and readable.

## Branching Pattern

1. **Generate** until decision point (pull loop pauses naturally)
2. **Save** both KV cache and grammar state
3. **Fork** for each branch exploration
4. **Restore** and continue independently

```javascript
// Pause at branch point
if (accumulated.includes('"city"')) break;

// Save state
const kvSnapshot = await ctx.kvCacheSave(0);
const grammarSnapshot = ctx.cloneSampler(grammarHandle);

// Explore branches
for (const branch of branches) {
await ctx.kvCacheLoad(0, kvSnapshot);
const branchGrammar = ctx.cloneSampler(grammarSnapshot);

// Each branch continues independently
for (const { token, text } of tokenGenerator(ctx, branchGrammar)) {
// ...
}
}
```
No manual KV save/load or grammar cloning needed — `fork()` is a single operation.
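Why the automatic clone matters can be shown with a toy parser state (`ToyGrammarBranch` is illustrative, not the library's implementation):

```javascript
// Toy model of grammar-state cloning on fork: each fork gets its own copy
// of the accepted-token history, so branches advance their parsers
// independently while the parent stays parked at the branch point.
class ToyGrammarBranch {
  constructor(accepted = []) { this.accepted = accepted; }
  fork() { return new ToyGrammarBranch([...this.accepted]); } // clone state
  commit(token) { this.accepted.push(token); }
}

const root = new ToyGrammarBranch();
root.commit('"city":');
const a = root.fork();
const b = root.fork();
a.commit('"Seattle"');
b.commit('"Chicago"');
// root still sits at the branch point; a and b diverged independently
```

If `fork()` shared the `accepted` array instead of copying it, one branch's commits would corrupt its siblings' parser position, which is exactly the failure mode the old manual `cloneSampler` dance guarded against.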

## Key APIs

| Method | Description |
|--------|-------------|
| `getLogits()` | Get logits buffer (modified in-place by applySampler) |
| `applySampler(handle, logits)` | Apply grammar constraints to logits |
| `sample()` | Sample from modified logits |
| `acceptSamplerToken(handle, id)` | Advance grammar parser state |
| `createSampler(grammar)` | Create grammar handle |
| `cloneSampler(handle)` | Clone grammar state for branching |
| `kvCacheSave(seq)` / `kvCacheLoad(seq, buf)` | Snapshot/restore KV state |

## Grammar + KV Travel Together

For valid branching, fork **both**:
- **KV cache**: Model's memory of what it has seen
- **Grammar state**: Parser's position in the grammar

Missing either causes invalid completions or grammar errors.
| `Branch.create(ctx, pos, params, nBatch, grammar)` | Create branch with grammar constraint |
| `branch.fork()` | Clone branch: KV prefix + grammar + sampler |
| `branch.reseedSampler(seed)` | Diversify forked branch's PRNG |
| `branch.produce()` | Sample grammar-valid token |
| `branch.commit(token)` | Advance grammar + KV state |
| `branch.prune()` | Clean up branch resources |
| `ctx.jsonSchemaToGrammar(json)` | Convert JSON schema to GBNF grammar |