From 3952867063ff89406a9e55d3888ed4f5c06a8d6a Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Fri, 20 Feb 2026 13:04:39 +1100 Subject: [PATCH 1/3] feat(api): remove flat ctx inference path, make branch API the only path --- README.md | 12 +- examples/best-of-n/best-of-n.mjs | 25 +- examples/chat/README.md | 40 +- examples/chat/chat.mjs | 7 +- examples/entropy/entropy.mjs | 39 +- examples/grammar/README.md | 122 ++-- examples/grammar/grammar.mjs | 96 +-- examples/speculative/README.md | 96 +-- examples/speculative/speculative.mjs | 16 +- examples/streaming/streaming-summary.mjs | 56 +- examples/streaming/streaming-tsampler.mjs | 50 +- examples/streaming/streaming.mjs | 51 +- lib/Branch.js | 52 +- lib/index.d.ts | 650 ++++----------------- lib/index.js | 19 +- liblloyal | 2 +- src/SessionContext.cpp | 679 ++-------------------- src/SessionContext.hpp | 130 +---- test/integration.js | 554 ++++++++++-------- 19 files changed, 825 insertions(+), 1871 deletions(-) diff --git a/README.md b/README.md index eca0b26..49fb41d 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,9 @@ const store = new BranchStore(ctx); // Shared prompt: "Explain quantum entanglement" const prompt = await ctx.tokenize("Explain quantum entanglement"); -await ctx.decode(prompt, 0, 0); -const root = Branch.create(ctx, prompt.length, { temperature: 0.8 }); -root.captureLogits(); +const root = Branch.create(ctx, 0, { temperature: 0.8 }); +await root.prefill(prompt); // Fork 4 branches — each gets a different reasoning prefix const analogy = await root.fork(); @@ -206,11 +205,12 @@ For fine-grained control without Branch: ```javascript const grammar = await ctx.jsonSchemaToGrammar(schema); -const handle = ctx.createSampler(grammar); -// Pull loop — consumer controls pace, can branch at any point +const branch = Branch.create(ctx, 0, params, undefined, grammar); +await branch.prefill(promptTokens); +// Grammar state cloned automatically on fork() ``` -See [`examples/grammar/`](./examples/grammar/) for the full pull loop pattern. +See [`examples/grammar/`](./examples/grammar/) for the full branch fork pattern. --- diff --git a/examples/best-of-n/best-of-n.mjs b/examples/best-of-n/best-of-n.mjs index 1519eba..22c9328 100644 --- a/examples/best-of-n/best-of-n.mjs +++ b/examples/best-of-n/best-of-n.mjs @@ -11,10 +11,10 @@ * See: Stiennon et al. 2020 "Learning to summarize from human feedback" * * KEY IMPLEMENTATION DETAIL: - * Uses the Branch API for parallel generation. After prefilling the prompt, - * we create a root branch and call captureLogits(). When forking to multiple - * candidates, each fork inherits the root's logits snapshot, ensuring all - * candidates start from the same probability distribution. + * Uses the Branch API for parallel generation. The root branch prefills the + * prompt and captures logits. When forking to multiple candidates, each fork + * inherits the root's logits snapshot, ensuring all candidates start from + * the same probability distribution. * * Usage: * node best-of-n.mjs [model-path] # Human-readable output @@ -92,21 +92,18 @@ async function main() { console.log(`\nPrompt: "${userPrompt}"`); } - // Prefill prompt + // Prefill prompt via root branch const promptTokens = await ctx.tokenize(prompt); - await ctx.decode(promptTokens, 0, 0); - if (!jsonlMode) { - console.log(`\nPrefill complete. Prompt length: ${promptTokens.length} tokens`); - } - - // CRITICAL: Create root branch IMMEDIATELY after prefill to capture logits - // The root branch stores a snapshot of the logits for fork operations - const root = Branch.create(ctx, promptTokens.length, { + const root = Branch.create(ctx, 0, { temperature: HIGH_TEMP, topP: 0.95, }); - root.captureLogits(); + await root.prefill(promptTokens); + + if (!jsonlMode) { + console.log(`\nPrefill complete. Prompt length: ${promptTokens.length} tokens`); + } // === Baseline: Single generation with low temperature === if (!jsonlMode) { diff --git a/examples/chat/README.md b/examples/chat/README.md index f7140de..04fcedd 100644 --- a/examples/chat/README.md +++ b/examples/chat/README.md @@ -14,37 +14,33 @@ npm run example -- /path/to/model.gguf # custom model - `/clear` - Reset conversation and clear terminal - `/quit` - Exit -## The Pattern: Sync Produce, Async Commit +## The Pattern: Branch Produce/Commit ```javascript -// Sync generator - all operations are synchronous -function* produceTokens(ctx, params) { - while (true) { - const tokenId = ctx.sample(params); // sync - if (ctx.isStopToken(tokenId)) return; // sync - const text = ctx.tokenToText(tokenId); // sync - yield { text, tokenId }; - } -} +// Create branch and prefill prompt +const branch = Branch.create(ctx, 0, { temperature: 0.7 }); +await branch.prefill(promptTokens); -// Usage - async commit is explicit in caller's loop -for (const { text, tokenId } of produceTokens(ctx, params)) { +// Async iterator - commit-before-yield +for await (const { token, text } of branch) { process.stdout.write(text); - await ctx.decode([tokenId], position); // async commit to KV - position += 1; } +await branch.prune(); ``` -**Key insight:** Token production is synchronous. Only the KV cache commit (`decode`) is async. This separation makes the control flow explicit. +**Key insight:** The async iterator handles produce/commit internally. Each yielded token is already committed to KV. Breaking out is clean — no orphaned state. ## API Reference | Method | Sync/Async | Purpose | |--------|------------|---------| -| `sample(params)` | sync | Sample next token from logits | -| `isStopToken(id)` | sync | Check if token ends generation | -| `tokenToText(id)` | sync | Convert token ID to text | -| `decode(tokens, pos)` | async | Commit tokens to KV cache | -| `tokenize(text)` | async | Convert text to token IDs | -| `formatChat(json)` | async | Apply chat template | -| `kvCacheClear()` | async | Reset KV cache | +| `Branch.create(ctx, pos, params)` | sync | Create a branch for generation | +| `branch.prefill(tokens)` | async | Feed tokens into branch's KV cache | +| `branch.produce()` | async | Sample next token (no KV write) | +| `branch.commit(token)` | async | Accept + decode into KV | +| `branch.prune()` | async | Discard branch and its KV entries | +| `ctx.isStopToken(id)` | sync | Check if token ends generation | +| `ctx.tokenToText(id)` | sync | Convert token ID to text | +| `ctx.tokenize(text)` | async | Convert text to token IDs | +| `ctx.formatChat(json)` | async | Apply chat template | +| `ctx.kvCacheClear()` | async | Reset KV cache | diff --git a/examples/chat/chat.mjs b/examples/chat/chat.mjs index 8f058dd..4ec2ea0 100644 --- a/examples/chat/chat.mjs +++ b/examples/chat/chat.mjs @@ -82,16 +82,15 @@ async function main() { messages.push({ role: "user", content: trimmed }); if (!branch) { - // === COLD (position === 0): full format → tokenize with BOS → decode === + // === COLD (position === 0): full format → tokenize with BOS → prefill === fmt = await ctx.formatChat(JSON.stringify(messages)); const tokens = await ctx.tokenize(fmt.prompt); - await ctx.decode(tokens, 0, 0); - branch = Branch.create(ctx, tokens.length, { + branch = Branch.create(ctx, 0, { temperature: 0.7, topK: 40, topP: 0.9, }); - branch.captureLogits(); + await branch.prefill(tokens); } else { // === WARM (position > 0): format only the new message === fmt = await ctx.formatChat( diff --git a/examples/entropy/entropy.mjs b/examples/entropy/entropy.mjs index c9204fe..6453e22 100644 --- a/examples/entropy/entropy.mjs +++ b/examples/entropy/entropy.mjs @@ -10,6 +10,8 @@ * - EDT formula: T = T₀ · N^(θ/Entropy) * - Side-by-side comparison with fixed temperature * - Different prompt types: factual, creative, mixed + * - Branch API for token generation (produce/commit loop) + * * * Usage: * node entropy.mjs [model-path] # Human-readable output @@ -18,7 +20,7 @@ import * as path from 'node:path'; import { fileURLToPath } from 'node:url'; -import { createContext } from '../../lib/index.js'; +import { createContext, Branch } from '../../lib/index.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const DEFAULT_MODEL = path.resolve( @@ -53,47 +55,48 @@ function edtTemperature(entropy) { /** * Generate with a specific sampling strategy + * + * Uses Branch API with per-token setSamplerParams() for EDT adaptation. + * Each token gets a temperature computed from the current logit entropy. */ async function generate(ctx, prompt, strategy, strategyName, maxTokens = 50) { const messages = [{ role: 'user', content: prompt }]; const { prompt: formatted } = await ctx.formatChat(JSON.stringify(messages)); - const tokens = await ctx.tokenize(formatted); - await ctx.decode(tokens, 0, 0); + + const baseTemp = strategy === 'edt' ? 0.8 : strategy; + const branch = Branch.create(ctx, 0, { temperature: baseTemp, topP: 0.9 }); + await branch.prefill(tokens); const output = []; const temps = []; const entropies = []; - let pos = tokens.length; for (let i = 0; i < maxTokens; i++) { - const entropy = ctx.modelEntropy('nats'); + const branchLogits = branch.getLogits(); + const entropy = ctx.modelEntropy('nats', branchLogits); entropies.push(entropy); - let temp; - if (strategy === 'edt') { - temp = edtTemperature(entropy); - } else { - temp = strategy; // Fixed temperature - } + const temp = strategy === 'edt' ? edtTemperature(entropy) : strategy; temps.push(temp); - const token = ctx.sample({ temperature: temp }); - if (ctx.isStopToken(token)) break; + if (strategy === 'edt') branch.setSamplerParams({ temperature: temp, topP: 0.9 }); + + const { token, isStop } = await branch.produce(); + if (isStop) break; const text = ctx.tokenToText(token); emit('token', { strategy: strategyName, token, text, entropy, temp }); output.push(token); - await ctx.decode([token], pos++, 0); + await branch.commit(token); } - // Clear KV cache for next run - await ctx.kvCacheClear(); + await branch.prune(); const text = await ctx.detokenize(output); - const avgEntropy = entropies.reduce((a, b) => a + b, 0) / entropies.length; - const avgTemp = temps.reduce((a, b) => a + b, 0) / temps.length; + const avgEntropy = entropies.length > 0 ? entropies.reduce((a, b) => a + b, 0) / entropies.length : 0; + const avgTemp = temps.length > 0 ? temps.reduce((a, b) => a + b, 0) / temps.length : 0; return { text, avgEntropy, avgTemp, tokenCount: output.length, temps, entropies }; } diff --git a/examples/grammar/README.md b/examples/grammar/README.md index 4326511..57ac23a 100644 --- a/examples/grammar/README.md +++ b/examples/grammar/README.md @@ -1,6 +1,6 @@ -# Grammar-Constrained Generation with Pull Loop +# Grammar-Constrained Generation with Branch Forking -Demonstrates generator-based token streaming with grammar constraints and forkable state. +Demonstrates grammar-constrained generation using the Branch API with automatic grammar cloning on fork. ## Run It @@ -17,109 +17,61 @@ Generating until "city" field... "age": 30, "city": -Saving KV cache and grammar state at branch point... - -Exploring 3 city branches: +Forking into 3 branches at branch point... [NYC branch]: { "name": "John Doe", "age": 30, "city": "Seattle" } [LA branch]: { "name": "John Doe", "age": 30, "city": "Chicago" } [Chicago branch]: { "name": "John Doe", "age": 30, "city": "LA" } ``` -## The Pull Loop Pattern +## The Branch Fork Pattern -This example uses a **pull loop** via JS generators. The consumer requests tokens one at a time and decides when to stop: +Grammar state is integrated into the branch and cloned automatically on fork: ```javascript -function* tokenGenerator(ctx, grammarHandle, maxTokens = 100) { - for (let i = 0; i < maxTokens; i++) { - const logits = ctx.getLogits(); - ctx.applySampler(grammarHandle, logits); - - const token = ctx.sample({ temperature: 0.7 }); - if (ctx.isStopToken(token)) return; - - ctx.acceptSamplerToken(grammarHandle, token); - - // Yield control back to caller - yield { token, text: ctx.tokenToText(token) }; - } +// Create root branch with grammar constraint +const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema)); +const root = Branch.create(ctx, 0, params, undefined, grammar); +await root.prefill(promptTokens); + +// Generate until branch point +for (let i = 0; i < 100; i++) { + const { token, text, isStop } = await root.produce(); + if (isStop) break; + await root.commit(token); + if (accumulated.includes('"city"')) break; } -``` -Consumer decides when to continue: +// Fork — grammar state cloned automatically +for (const city of cities) { + const child = await root.fork(); + child.reseedSampler(seed++); -```javascript -for (const { token, text } of gen) { - accumulated += text; - await ctx.decode([token], pos++, 0); - - // Stop at decision point - generator pauses here - if (accumulated.includes('"city"')) { - break; // Generator stays paused, state preserved + for await (const { text } of child) { + // Each branch generates independently with its own grammar state } + await child.prune(); } +await root.prune(); ``` -## Why Pull Loop Here? +## Why Branch Fork Here? -For this branching use case, pull made the code simpler: - -```javascript -// Stop when we see the branch point - just break -for (const { token, text } of gen) { - accumulated += text; - if (accumulated.includes('"city"')) break; -} -// Generator paused mid-iteration, grammar state intact -// Now save and branch -``` +For grammar-constrained branching, fork handles everything atomically: +- **KV cache**: Shared prefix, divergent-only storage per branch +- **Grammar state**: Parser position cloned automatically +- **Sampler chain**: Penalties and PRNG cloned and reseeded -With a push loop you'd need callbacks or flags to signal "stop here" - doable, but the control flow is inverted. Pull keeps the branching logic linear and readable. - -## Branching Pattern - -1. **Generate** until decision point (pull loop pauses naturally) -2. **Save** both KV cache and grammar state -3. **Fork** for each branch exploration -4. **Restore** and continue independently - -```javascript -// Pause at branch point -if (accumulated.includes('"city"')) break; - -// Save state -const kvSnapshot = await ctx.kvCacheSave(0); -const grammarSnapshot = ctx.cloneSampler(grammarHandle); - -// Explore branches -for (const branch of branches) { - await ctx.kvCacheLoad(0, kvSnapshot); - const branchGrammar = ctx.cloneSampler(grammarSnapshot); - - // Each branch continues independently - for (const { token, text } of tokenGenerator(ctx, branchGrammar)) { - // ... - } -} -``` +No manual KV save/load or grammar cloning needed — `fork()` is a single operation. ## Key APIs | Method | Description | |--------|-------------| -| `getLogits()` | Get logits buffer (modified in-place by applySampler) | -| `applySampler(handle, logits)` | Apply grammar constraints to logits | -| `sample()` | Sample from modified logits | -| `acceptSamplerToken(handle, id)` | Advance grammar parser state | -| `createSampler(grammar)` | Create grammar handle | -| `cloneSampler(handle)` | Clone grammar state for branching | -| `kvCacheSave(seq)` / `kvCacheLoad(seq, buf)` | Snapshot/restore KV state | - -## Grammar + KV Travel Together - -For valid branching, fork **both**: -- **KV cache**: Model's memory of what it has seen -- **Grammar state**: Parser's position in the grammar - -Missing either causes invalid completions or grammar errors. +| `Branch.create(ctx, pos, params, nBatch, grammar)` | Create branch with grammar constraint | +| `branch.fork()` | Clone branch: KV prefix + grammar + sampler | +| `branch.reseedSampler(seed)` | Diversify forked branch's PRNG | +| `branch.produce()` | Sample grammar-valid token | +| `branch.commit(token)` | Advance grammar + KV state | +| `branch.prune()` | Clean up branch resources | +| `ctx.jsonSchemaToGrammar(json)` | Convert JSON schema to GBNF grammar | diff --git a/examples/grammar/grammar.mjs b/examples/grammar/grammar.mjs index 1d7463b..6f96f2c 100644 --- a/examples/grammar/grammar.mjs +++ b/examples/grammar/grammar.mjs @@ -2,8 +2,9 @@ /** * Grammar-constrained generation with forkable state * - * Uses JS generators for backpressure - generation pauses at each yield, - * allowing precise control over when to branch. + * Uses Branch API for grammar-constrained generation with tree branching. + * Grammar state is automatically cloned on fork(), so each branch can + * diverge independently while maintaining valid JSON output. * * Usage: * node grammar.mjs [model-path] # Human-readable output @@ -12,7 +13,7 @@ import * as path from 'node:path'; import { fileURLToPath } from 'node:url'; -import { createContext } from '../../lib/index.js'; +import { createContext, Branch } from '../../lib/index.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const DEFAULT_MODEL = path.resolve( @@ -32,27 +33,6 @@ function emit(event, data) { } } -/** - * Generator that yields tokens one at a time - * Caller controls pace via next() - natural backpressure - */ -function* tokenGenerator(ctx, grammarHandle, maxTokens = 100) { - for (let i = 0; i < maxTokens; i++) { - // Apply grammar constraints to context logits - const logits = ctx.getLogits(); - ctx.applySampler(grammarHandle, logits); - - const token = ctx.sample(); - if (ctx.isStopToken(token)) return; - - // Advance grammar state - ctx.acceptSamplerToken(grammarHandle, token); - - // Yield token and text - caller decides when to continue - yield { token, text: ctx.tokenToText(token) }; - } -} - async function main() { if (!jsonlMode) { console.log(`Loading model: ${path.basename(modelPath)}`); @@ -89,16 +69,16 @@ async function main() { console.log(grammar.slice(0, 200) + '...\n'); } - const grammarHandle = ctx.createSampler(grammar); - const prompt = 'Generate a person as JSON:\n'; if (!jsonlMode) { console.log(`Prompt: "${prompt}"`); } const tokens = await ctx.tokenize(prompt); - await ctx.decode(tokens, 0, 0); - let pos = tokens.length; + + // Root branch with grammar constraint — grammar state cloned automatically on fork() + const root = Branch.create(ctx, 0, { temperature: 0.7, topP: 0.9 }, undefined, grammar); + await root.prefill(tokens); // ===== PHASE 1: Generate until we see "city" key ===== if (!jsonlMode) { @@ -106,19 +86,19 @@ async function main() { process.stdout.write(' '); } - const gen = tokenGenerator(ctx, grammarHandle); - const collectedTokens = []; let accumulated = ''; - for (const { token, text } of gen) { - collectedTokens.push(token); + for (let i = 0; i < 100; i++) { + const { token, text, isStop } = await root.produce(); + if (isStop) break; + accumulated += text; if (!jsonlMode) { process.stdout.write(text); } emit('token', { phase: 'prefix', token, text }); - await ctx.decode([token], pos++, 0); + await root.commit(token); // Stop when we see "city": - we want to branch here if (accumulated.includes('"city"')) { @@ -129,55 +109,46 @@ async function main() { console.log('\n'); } - // ===== PHASE 2: Save state for branching ===== - if (!jsonlMode) { - console.log('Saving KV cache and grammar state at branch point...'); - } - const kvSnapshot = await ctx.kvCacheSave(0); - const grammarSnapshot = ctx.cloneSampler(grammarHandle); - const branchPos = pos; - - emit('branch_point', { prefix: accumulated, position: branchPos }); - - // ===== PHASE 3: Complete with different cities ===== + // ===== PHASE 2: Fork and complete with different branches ===== const cities = ['NYC', 'LA', 'Chicago']; if (!jsonlMode) { - console.log(`\nExploring ${cities.length} city branches:\n`); + console.log(`Forking into ${cities.length} branches at branch point...\n`); } - const branches = []; - for (const city of cities) { - // Restore KV cache - await ctx.kvCacheLoad(0, kvSnapshot); - pos = branchPos; + emit('branch_point', { prefix: accumulated, position: root.position }); - // Fresh grammar clone for this branch - const branchGrammar = ctx.cloneSampler(grammarSnapshot); + const results = []; + for (const city of cities) { + const child = await root.fork(); + child.reseedSampler(results.length + 42); - // Generate completion for this branch - const branchGen = tokenGenerator(ctx, branchGrammar, 30); let branchText = ''; + for (let i = 0; i < 30; i++) { + const { token, text, isStop } = await child.produce(); + if (isStop) break; - for (const { token, text } of branchGen) { branchText += text; emit('token', { phase: 'branch', city, token, text }); - await ctx.decode([token], pos++, 0); + + await child.commit(token); } const fullOutput = accumulated + branchText; - branches.push({ city, output: fullOutput }); + results.push({ city, output: fullOutput }); if (!jsonlMode) { console.log(` [${city} branch]: ${fullOutput}`); } emit('branch_complete', { city, output: fullOutput }); - ctx.freeSamplerHandle(branchGrammar); + await child.prune(); } + await root.prune(); + // Validate JSON outputs let validJsonCount = 0; - for (const b of branches) { + for (const b of results) { try { JSON.parse(b.output); validJsonCount++; @@ -187,14 +158,11 @@ async function main() { } emit('complete', { - branchCount: branches.length, + branchCount: results.length, validJsonCount, - branches: branches.map(b => ({ city: b.city, output: b.output })), + branches: results.map(b => ({ city: b.city, output: b.output })), }); - // Cleanup - ctx.freeSamplerHandle(grammarHandle); - ctx.freeSamplerHandle(grammarSnapshot); ctx.dispose(); if (!jsonlMode) { diff --git a/examples/speculative/README.md b/examples/speculative/README.md index 27e5537..7433c77 100644 --- a/examples/speculative/README.md +++ b/examples/speculative/README.md @@ -1,6 +1,6 @@ -# Speculative Decoding with Forkable KV State +# Speculative Decoding with Branch API -Demonstrates the KV cache primitives needed for speculative decoding: draft, fork, verify, accept/reject. +Demonstrates speculative decoding using the Branch primitive: fork a draft, verify, accept/reject, sample bonus token. ## Run It @@ -31,56 +31,74 @@ Statistics | Phase | What Happens | |-------|--------------| -| **1. DRAFT** | Generate N tokens greedily (fast, low quality ok) | -| **2. FORK** | `kvSeqCopy(0, 1)` - copy KV state for verification | -| **3. VERIFY** | Run target model on all N tokens (one batch) | -| **4. ACCEPT** | Keep tokens where target agrees with draft | -| **5. BONUS** | Sample one token from target at rejection point | -| **6. CLEANUP** | `kvCacheRemove()` rejected tokens, repeat | +| **1. MAIN** | Create main branch tracking committed state | +| **2. FORK** | Fork draft branch (shares KV prefix with main) | +| **3. DRAFT** | produce/commit N tokens on draft branch | +| **4. VERIFY** | Check draft confidence (entropy threshold) | +| **5. PRUNE** | Remove draft branch (cleans up divergent KV) | +| **6. ACCEPT** | Commit accepted tokens to main branch | +| **7. BONUS** | Sample one token from main at rejection point | -## Key Pattern: Accept/Reject with KV Cleanup +## Key Pattern: Fork/Draft/Verify with Branch API ```javascript -// Draft N tokens on seq 0 -for (let i = 0; i < N; i++) { - const token = ctx.sample({ temperature: 0.0 }); - await ctx.decode([token], pos++, 0); - drafts.push(token); -} - -// Fork for verification -ctx.kvSeqCopy(0, 1); - -// Verify (compare draft vs target distributions) -const acceptedCount = verify(drafts); - -// Remove rejected tokens from KV cache -if (acceptedCount < drafts.length) { - const rejectPos = startPos + acceptedCount; - await ctx.kvCacheRemove(0, rejectPos, -1); // Critical! - - // Sample bonus token from target at rejection point - const bonus = ctx.sample({ temperature: 0.7 }); - await ctx.decode([bonus], rejectPos, 0); +// Main branch tracks committed state +const main = Branch.create(ctx, 0, { temperature: 0.7 }); +await main.prefill(promptTokens); + +while (output.length < maxTokens) { + // Fork draft from main — shares KV prefix + const draft = await main.fork(); + draft.reseedSampler(iteration); + + // Draft N tokens + const drafts = []; + for (let i = 0; i < N; i++) { + const entropy = ctx.modelEntropy('nats', draft.getLogits()); + const { token, text, isStop } = draft.produceSync(); + if (isStop) break; + drafts.push({ token, text, entropy }); + await draft.commit(token); + } + + // Verify and prune draft + const acceptedCount = verify(drafts); + await draft.prune(); + + // Commit accepted tokens to main + for (const d of drafts.slice(0, acceptedCount)) { + await main.commit(d.token); + } + + // Bonus token from main at rejection point + if (acceptedCount < drafts.length) { + const { token } = main.produceSync(); + await main.commit(token); + } } +await main.prune(); ``` -## Why Fork Before Verify? +## Why Branch API? -In real speculative decoding with two models: -- Draft model: small, fast, generates candidates -- Target model: large, slow, verifies quality +The produce/commit separation is what makes speculative decoding natural: -The fork lets you run the target model on seq 1 while keeping the draft state on seq 0. After verification, you collapse to the accepted prefix. +- **produce()** samples without writing to KV — inspect before deciding +- **commit()** accepts + decodes — advance state only for accepted tokens +- **fork()** shares KV prefix — draft branch doesn't duplicate the prompt +- **prune()** removes divergent KV — clean rejection without manual bookkeeping ## Key APIs | Method | Description | |--------|-------------| -| `kvSeqCopy(src, dst)` | Fork KV cache (O(1) tag copy) | -| `kvCacheRemove(seq, start, end)` | Remove token range from cache | -| `modelEntropy('nats')` | Check draft confidence | -| `nSeqMax` | Context option for multi-sequence | +| `Branch.create(ctx, pos, params)` | Create branch at position | +| `branch.fork()` | Fork: shared KV prefix + cloned sampler | +| `branch.produce()` | Sample without KV write | +| `branch.commit(token)` | Accept + decode into KV | +| `branch.prune()` | Remove divergent KV entries | +| `branch.reseedSampler(seed)` | Diversify forked branch | +| `ctx.modelEntropy('nats', logits)` | Check draft confidence | ## Accept Rate diff --git a/examples/speculative/speculative.mjs b/examples/speculative/speculative.mjs index ae709c2..cf927d7 100644 --- a/examples/speculative/speculative.mjs +++ b/examples/speculative/speculative.mjs @@ -104,15 +104,13 @@ async function main() { console.log(`\nPrompt: "${prompt}"`); } - // Prefill prompt + // Prefill prompt via main branch const promptTokens = await ctx.tokenize(prompt); - await ctx.decode(promptTokens, 0, 0); - // Create main branch — tracks committed state - const main = Branch.create(ctx, promptTokens.length, { + const main = Branch.create(ctx, 0, { temperature: 0.7, // For bonus token sampling }); - main.captureLogits(); + await main.prefill(promptTokens); const output = []; let totalDrafted = 0; @@ -138,11 +136,11 @@ async function main() { const drafts = []; for (let i = 0; i < DRAFT_COUNT && output.length + drafts.length < GENERATION_LENGTH; i++) { - // Get entropy BEFORE sampling (from current logits) - const entropy = ctx.modelEntropy('nats'); + // Get entropy BEFORE sampling (from draft branch's logits snapshot) + const entropy = ctx.modelEntropy('nats', draft.getLogits()); // produce() samples from captured logits (no KV write yet) - const { token, text, isStop } = draft.produce(); + const { token, text, isStop } = draft.produceSync(); if (isStop) break; @@ -191,7 +189,7 @@ async function main() { const rejected = drafts.slice(acceptedCount); if (rejected.length > 0) { // produce() samples from main's current logits (at rejection point) - const { token: bonusToken, text: bonusText, isStop } = main.produce(); + const { token: bonusToken, text: bonusText, isStop } = main.produceSync(); if (!isStop) { await main.commit(bonusToken); diff --git a/examples/streaming/streaming-summary.mjs b/examples/streaming/streaming-summary.mjs index 6de9f56..769b86c 100644 --- a/examples/streaming/streaming-summary.mjs +++ b/examples/streaming/streaming-summary.mjs @@ -13,6 +13,7 @@ * - Sidecar mode: optional slim-summarize model for summarization (--sidecar) * - Outline detection with structural progress tracking * - Pattern matching (not instruction following) to guide continuation + * - Branch API for generation (produce/commit loop) * * After reseed, KV cache contains: [progress][tail] * - progress = minimal anchor + checklist of done/current sections + summary @@ -25,7 +26,7 @@ import * as fs from 'node:fs'; import * as path from 'node:path'; import { fileURLToPath } from 'node:url'; -import { createContext } from '../../lib/index.js'; +import { createContext, Branch } from '../../lib/index.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const DEFAULT_MODEL = path.resolve( @@ -111,16 +112,17 @@ async function generateSummary(summaryCtx, text, options = {}) { } await summaryCtx.kvCacheClear(); - await summaryCtx.decode(tokens, 0, 0); + const branch = Branch.create(summaryCtx, 0, { temperature: 0.3 }); + await branch.prefill(tokens); let response = ''; - let pos = tokens.length; for (let i = 0; i < maxTokens; i++) { - const token = summaryCtx.sample({ temperature: 0.3 }); - if (summaryCtx.isStopToken(token)) break; - response += summaryCtx.tokenToText(token); - await summaryCtx.decode([token], pos++, 0); + const { token, text: t, isStop } = await branch.produce(); + if (isStop) break; + response += t; + await branch.commit(token); } + await branch.prune(); // Only parse slim-summarize Python-style list format return format === 'slim-summarize' @@ -293,7 +295,9 @@ Begin: ? MAX_SINK_TOKENS : MAX_SINK_TOKENS - (anchorTokens?.length || 0); - await ctx.decode(promptTokens, 0, 0); + const samplingParams = { temperature: 0.8, topP: 0.9 }; + let branch = Branch.create(ctx, 0, samplingParams); + await branch.prefill(promptTokens); if (!jsonlMode) { console.log(`\nContext size: ${nCtx}`); @@ -305,8 +309,8 @@ Begin: } const allTokens = [...promptTokens]; - const tracker = ctx.createPerplexityTracker(); - let cachePos = promptTokens.length; + // Manual PPL tracking (persists across branch reseeds) + let nllSum = 0, nllCount = 0; let reseedCount = 0; let currentSegmentText = ''; let allGeneratedText = ''; @@ -314,12 +318,9 @@ Begin: let pendingSummaryTokens = []; for (let t = 0; t < TARGET_TOKENS; t++) { - const token = ctx.sample({ - temperature: 0.8, - topP: 0.9, - }); + const { token, isStop } = await branch.produce(); - if (ctx.isStopToken(token)) { + if (isStop) { if (!jsonlMode) { console.log('\n[EOS token reached]'); } @@ -327,8 +328,10 @@ Begin: break; } - const surprisal = ctx.modelSurprisal(token); - ctx.addSurprisal(tracker, surprisal); + const branchLogits = branch.getLogits(); + const surprisal = ctx.modelSurprisal(token, 'nats', branchLogits); + nllSum += Math.max(0, surprisal); + nllCount++; const text = ctx.tokenToText(token); if (!jsonlMode) { @@ -339,10 +342,10 @@ Begin: currentSegmentText += text; allGeneratedText += text; allTokens.push(token); - await ctx.decode([token], cachePos++, 0); + await branch.commit(token); // Cache full? Reseed with dynamic sinks - if (cachePos >= nCtx) { + if (branch.position >= nCtx) { // Estimate evicted portion of current segment only const tailCharsEstimate = TAIL_SIZE * 4; const evictedFromSegment = currentSegmentText.length > tailCharsEstimate @@ -481,11 +484,16 @@ Begin: } const tail = allTokens.slice(-TAIL_SIZE); - await ctx.clearAndReseed(sinks, tail); - cachePos = sinks.length + TAIL_SIZE; + + // Destroy current branch, clear KV, create fresh branch with re-prefill + await branch.prune(); + await ctx.kvCacheClear(); + branch = Branch.create(ctx, 0, samplingParams); + await branch.prefill([...sinks, ...tail]); + reseedCount++; - const ppl = ctx.getPerplexity(tracker); + const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1; emit('reseed', { count: reseedCount, tokenIndex: t + 1, @@ -509,8 +517,8 @@ Begin: } } - const finalPpl = ctx.getPerplexity(tracker); - ctx.freePerplexityTracker(tracker); + const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1; + await branch.prune(); const generatedTokens = allTokens.length - promptTokens.length; const finalChain = summaries.join('\n'); diff --git a/examples/streaming/streaming-tsampler.mjs b/examples/streaming/streaming-tsampler.mjs index 4fd3207..0d698bf 100644 --- a/examples/streaming/streaming-tsampler.mjs +++ b/examples/streaming/streaming-tsampler.mjs @@ -6,7 +6,8 @@ * - TypeScript sampling via tsampler (TTA pattern) * - N-gram tracking to detect sequence repetition * - Logit steering to prevent repeated sequences - * - clearAndReseed() for infinite context + * - Branch API for KV management (prefill/decodeAndCaptureOne) + * - KV cache clear + re-prefill for infinite context * * The key insight: llama.cpp's token-level penalties degrade prose quality. * Instead, we track N-grams at the app level and steer away from repeats. @@ -18,7 +19,7 @@ import * as path from 'node:path'; import { fileURLToPath } from 'node:url'; -import { createContext } from '../../lib/index.js'; +import { createContext, Branch } from '../../lib/index.js'; // Import tsampler from npm package import { @@ -172,7 +173,6 @@ Begin: } const promptTokens = await ctx.tokenize(prompt); - await ctx.decode(promptTokens, 0, 0); // Track all generated tokens const allTokens = [...promptTokens]; @@ -201,15 +201,19 @@ Begin: process.stdout.write(prompt); } - const tracker = ctx.createPerplexityTracker(); - let cachePos = promptTokens.length; + // Branch used purely for KV management — sampling done externally via tsampler + let branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(promptTokens); + + // Manual PPL tracking (persists across branch reseeds) + let nllSum = 0, nllCount = 0; let reseedCount = 0; let blockedCount = 0; for (let t = 0; t < TARGET_TOKENS; t++) { - // Get logits from native layer - const logitsBuffer = ctx.getLogits(); - const logits = new Float32Array(logitsBuffer); + // Get logits from branch snapshot + const originalLogits = branch.getLogits(); + const logits = new Float32Array(originalLogits); // N-gram deduplication: Check if we're about to repeat a sequence const blockedToken = ngramTracker.getBlockedToken(); @@ -244,9 +248,10 @@ Begin: // tokenHistory.accept(token); // Disabled - matching baseline ngramTracker.accept(token); - // Track surprisal - const surprisal = ctx.modelSurprisal(token); - ctx.addSurprisal(tracker, surprisal); + // Track surprisal from original (unmodified) logits + const surprisal = ctx.modelSurprisal(token, 'nats', originalLogits); + nllSum += Math.max(0, surprisal); + nllCount++; // Output token const text = ctx.tokenToText(token); @@ -255,18 +260,23 @@ Begin: } emit('token', { index: t, token, text, surprisal, blocked: wasBlocked }); - // Store and decode + // Store and advance KV (no sampler accept — we're using tsampler externally) allTokens.push(token); - await ctx.decode([token], cachePos++, 0); + await branch.decodeAndCaptureOne(token); // Cache full? Reseed at boundary - if (cachePos >= nCtx) { + if (branch.position >= nCtx) { const tail = allTokens.slice(-TAIL_SIZE); - await ctx.clearAndReseed(sinks, tail); - cachePos = sinks.length + TAIL_SIZE; + + // Destroy current branch, clear KV, create fresh branch with re-prefill + await branch.prune(); + await ctx.kvCacheClear(); + branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill([...sinks, ...tail]); + reseedCount++; - const ppl = ctx.getPerplexity(tracker); + const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1; const stats = ngramTracker.stats(); emit('reseed', { count: reseedCount, tokenIndex: t + 1, ppl, blockedCount, uniqueNgrams: stats.uniqueNgrams }); @@ -277,15 +287,15 @@ Begin: } // Progress every 1000 tokens - if ((t + 1) % 1000 === 0 && cachePos < nCtx && !jsonlMode) { + if ((t + 1) % 1000 === 0 && branch.position < nCtx && !jsonlMode) { const stats = ngramTracker.stats(); console.log(`\n [${t + 1}/${TARGET_TOKENS} | Blocked repeats: ${blockedCount} | Unique ${NGRAM_SIZE}-grams: ${stats.uniqueNgrams}]`); } } - const finalPpl = ctx.getPerplexity(tracker); + const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1; const finalStats = ngramTracker.stats(); - ctx.freePerplexityTracker(tracker); + await branch.prune(); const generatedTokens = allTokens.length - promptTokens.length; emit('complete', { diff --git a/examples/streaming/streaming.mjs b/examples/streaming/streaming.mjs index 4bc52e8..96e4e20 100644 --- a/examples/streaming/streaming.mjs +++ b/examples/streaming/streaming.mjs @@ -8,15 +8,16 @@ * * This example demonstrates: * - Generating tokens beyond context window limit - * - clearAndReseed() for cache-local position reindexing + * - KV cache clear + re-prefill for cache-local position reindexing * - Per-token perplexity measurement across reseeds + * - Branch API for generation (produce/commit loop) * * Parameters from BlinkKV paper: 2048 context, 4 sinks, 256 tail */ import * as path from 'node:path'; import { fileURLToPath } from 'node:url'; -import { createContext } from '../../lib/index.js'; +import { createContext, Branch } from '../../lib/index.js'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const DEFAULT_MODEL = path.resolve( @@ -80,7 +81,6 @@ Begin: } const promptTokens = await ctx.tokenize(prompt); - await ctx.decode(promptTokens, 0, 0); // Track all generated tokens (needed for reseeding) const allTokens = [...promptTokens]; @@ -97,22 +97,22 @@ Begin: process.stdout.write(prompt); } - const tracker = ctx.createPerplexityTracker(); - let cachePos = promptTokens.length; + const samplingParams = { temperature: 0.8, topP: 0.9 }; + let branch = Branch.create(ctx, 0, samplingParams); + await branch.prefill(promptTokens); + + // Manual PPL tracking (persists across branch reseeds) + let nllSum = 0, nllCount = 0; let reseedCount = 0; for (let t = 0; t < TARGET_TOKENS; t++) { - // Sample next token // NOTE: Token-level repeat penalties are NOT used for long-form generation. // llama.cpp's penalty system penalizes individual tokens (not sequences), // which degrades prose quality over 100+ tokens as common words accumulate // in the penalty buffer. For sequence-level deduplication, use N-gram // tracking with logit steering (TTA pattern) instead. - const token = ctx.sample({ - temperature: 0.8, - topP: 0.9, - }); - if (ctx.isStopToken(token)) { + const { token, isStop } = await branch.produce(); + if (isStop) { if (!jsonlMode) { console.log('\n[EOS token reached]'); } @@ -120,9 +120,11 @@ Begin: break; } - // Track surprisal - const surprisal = ctx.modelSurprisal(token); - ctx.addSurprisal(tracker, surprisal); + // Track surprisal from the logits used by produce() + const branchLogits = branch.getLogits(); + const surprisal = ctx.modelSurprisal(token, 'nats', branchLogits); + nllSum += Math.max(0, surprisal); + nllCount++; // Output token const text = ctx.tokenToText(token); @@ -131,18 +133,23 @@ Begin: } emit('token', { index: t, token, text, surprisal }); - // Store token and decode + // Store token and commit (decode + capture new logits) allTokens.push(token); - await ctx.decode([token], cachePos++, 0); + await branch.commit(token); // Cache full? Reseed at boundary - if (cachePos >= nCtx) { + if (branch.position >= nCtx) { const tail = allTokens.slice(-TAIL_SIZE); - await ctx.clearAndReseed(sinks, tail); - cachePos = sinks.length + TAIL_SIZE; + + // Destroy current branch, clear KV, create fresh branch with re-prefill + await branch.prune(); + await ctx.kvCacheClear(); + branch = Branch.create(ctx, 0, samplingParams); + await branch.prefill([...sinks, ...tail]); + reseedCount++; - const ppl = ctx.getPerplexity(tracker); + const ppl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1; emit('reseed', { count: reseedCount, tokenIndex: t + 1, ppl }); if (!jsonlMode) { @@ -156,8 +163,8 @@ Begin: } } - const finalPpl = ctx.getPerplexity(tracker); - ctx.freePerplexityTracker(tracker); + const finalPpl = nllCount > 0 ? Math.exp(nllSum / nllCount) : 1; + await branch.prune(); const generatedTokens = allTokens.length - promptTokens.length; emit('complete', { generatedTokens, reseeds: reseedCount, finalPpl }); diff --git a/lib/Branch.js b/lib/Branch.js index 941937a..08fe4bf 100644 --- a/lib/Branch.js +++ b/lib/Branch.js @@ -151,7 +151,7 @@ class Branch { * this for external tokens (user input between turns), not model-generated * tokens. For model output, use commit() which does accept + decode. * - * This is the branch-level equivalent of ctx.decode(). + * The primary way to feed tokens into a branch's KV cache. * * @param {number[]} tokens - Token IDs to decode * @returns {Promise} @@ -263,7 +263,7 @@ class Branch { * const blocked = computeNgramBlocks(generatedText); * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity }))); * - * const { token } = branch.produce(); // Blocked tokens won't be sampled + * const { token } = await branch.produce(); // Blocked tokens won't be sampled * await branch.commit(token); * * branch.clearSteer(); // Reset for next iteration @@ -285,15 +285,57 @@ class Branch { } /** - * Sample the next token without advancing state + * Replace the sampler chain with new parameters (memoized) + * + * If the new params match the current chain's params, this is a no-op. + * Otherwise the old chain is freed and a new one is created. + * + * @param {SamplingParams} params - New sampling parameters + */ + setSamplerParams(params) { + this._ensureNotDisposed(); + this._ctx._branchSetSamplerParams(this._handle, params); + } + + /** + * Replace or remove the grammar constraint + * + * Pass a GBNF grammar string to constrain generation, or empty string / null + * to remove the constraint. The grammar state is cloned on fork(). + * + * @param {string} [grammarStr] - GBNF grammar string, or empty/null to remove + */ + setGrammar(grammarStr) { + this._ensureNotDisposed(); + this._ctx._branchSetGrammar(this._handle, grammarStr || ''); + } + + /** + * Sample the next token without advancing state (async) * * No KV write, no position update. Inspect the result before deciding * to commit() — this separation is what enables speculative verification * and conditional branching. * + * Async contract: local branches resolve immediately; cloud branches + * may perform an HTTP round-trip. Use produceSync() when you know the + * branch is local and want zero-overhead sampling. + * + * @returns {Promise<{ token: number, text: string, isStop: boolean }>} + */ + async produce() { + return this.produceSync(); + } + + /** + * Sample the next token without advancing state (sync) + * + * Same as produce() but synchronous. Use when you know the branch is + * local and want to avoid the microtick overhead of a promise. + * * @returns {{ token: number, text: string, isStop: boolean }} */ - produce() { + produceSync() { this._ensureNotDisposed(); const token = this.sample(); return { @@ -383,7 +425,7 @@ class Branch { */ async *[Symbol.asyncIterator]() { while (!this._disposed) { - const { token, text, isStop } = this.produce(); + const { token, text, isStop } = await this.produce(); if (isStop) return; await this.commit(token); yield { token, text }; diff --git a/lib/index.d.ts b/lib/index.d.ts index b7d6711..0b1adcf 100644 --- a/lib/index.d.ts +++ b/lib/index.d.ts @@ -273,8 +273,6 @@ export interface FormatChatOptions { /** * Explicit GBNF grammar string for constrained generation. * Mutually exclusive with `jsonSchema`. - * - * @see {@link SessionContext.createSampler} */ grammar?: string; @@ -550,7 +548,7 @@ export interface AdvancedSamplingParams { * * Configures the sampler chain — a pipeline of composable filters and * transforms applied to raw logits before token selection. The chain is - * built once at branch/context creation and persists across decode steps + * built once at branch creation and persists across decode steps * (penalty state accumulates, PRNG advances). * * **Chain order**: penalties → top_k → typical_p → top_p → min_p → @@ -603,24 +601,29 @@ export interface SamplingParams { * Inference context — the runtime surface for a loaded model * * A SessionContext owns a llama_context (KV cache + compute graph) bound to a - * shared model. All inference flows through this interface: tokenization, - * forward passes, logit access, sampling, KV cache management, chat template - * formatting, and embedding extraction. + * shared model. It provides tokenization, logit access, KV cache management, + * chat template formatting, and embedding extraction. + * + * **All generation flows through {@link Branch}.** Create a branch at position 0, + * prefill prompt tokens, then use the produce/commit loop or async iterator: * - * The core generation loop is three steps, repeated: - * 1. **decode()** — Feed tokens through the transformer, populating KV cache. - * 2. **getLogits()** — Zero-copy view into the model's output distribution. - * 3. **sample()** — Select the next token via the configured sampler chain. + * ```typescript + * const branch = Branch.create(ctx, 0, { temperature: 0.7 }); + * await branch.prefill(promptTokens); + * for await (const { token, text } of branch) { + * process.stdout.write(text); + * } + * ``` * * For tree-structured generation (best-of-N, beam search, speculative - * decoding), use the {@link Branch} and {@link BranchStore} APIs instead — - * they manage per-branch KV sequences, sampler chains, and logits snapshots - * with O(1) GPU dispatches via batched decode. + * decoding), use {@link Branch.fork} and {@link BranchStore} — they manage + * per-branch KV sequences, sampler chains, and logits snapshots with O(1) + * GPU dispatches via batched decode. * * **Logits lifetime**: `getLogits()` returns a zero-copy Float32Array wrapping * llama.cpp's internal buffer. It is invalidated (ArrayBuffer detached) on - * the next `decode()`, `encode()`, or `dispose()`. Use {@link withLogits} for - * safe scoped access. + * the next `encode()` or `dispose()`. Use {@link withLogits} for safe scoped + * access. For branch-level logits, use {@link Branch.getLogits} instead. * * **KV cache**: Supports multi-sequence operation (`nSeqMax > 1`), per-sequence * copy/clear/eviction, file-based persistence, and context compression via @@ -636,54 +639,17 @@ export interface SamplingParams { * @category Core */ export interface SessionContext { - // ===== THE GENERATION LOOP ===== - - /** - * STEP 1: Process tokens through the model (forward pass) - * - * This feeds tokens through the transformer and updates the KV cache. - * After decoding, the model has "read" this text and is ready to predict. - * - * Think of this as: "the model reads your prompt" - * - * Why async? Model inference takes time (~45ms per token) - * Why position? Model needs to know where in conversation this text appears - * - * Cost: ~45ms per token (generation), ~120ms for 50 tokens (prompt) - * - * @param tokens Token IDs from tokenize() - * @param position Where these tokens start in the sequence - * @param seqId Sequence ID (default: 0) - * @example - * ```typescript - * const tokens = await ctx.tokenize("Hello world"); - * await ctx.decode(tokens, 0); - * let position = tokens.length; - * - * // Generate next token - * await ctx.decode([nextToken], position++); - * - * // Multi-sequence: decode to different sequences - * await ctx.decode(tokens, 0, 0); // Sequence 0 - * await ctx.decode(tokens, 0, 1); // Sequence 1 - * ``` - */ - decode(tokens: number[], position: number, seqId?: number): Promise; /** - * STEP 2: Get logits (zero-copy view into model memory) + * Get logits (zero-copy view into model memory) * * Returns unnormalized scores for every possible next token. * Higher score = model thinks this token is more likely. * The returned Float32Array wraps llama.cpp's internal buffer directly - * (zero-copy). It is mutable — you can write to it for custom sampling - * (e.g., setting banned tokens to -Infinity before calling sample()). - * - * Memoized per decode step: calling getLogits() twice between decodes - * returns the same Float32Array backed by the same ArrayBuffer. + * (zero-copy). * * LIFETIME CONSTRAINTS: - * - Valid ONLY until the next decode(), encode(), or dispose() call + * - Valid ONLY until the next encode() or dispose() call * - The ArrayBuffer is detached on invalidation — accessing a stale * buffer throws a TypeError * - DO NOT retain references across async boundaries @@ -697,60 +663,9 @@ export interface SessionContext { * Cost: ~0.5ms (zero-copy pointer, no data copied) * * @returns Float32Array of unnormalized logits (vocabSize elements) - * @example - * ```typescript - * await ctx.decode(tokens, 0); - * - * // Read logits for analysis - * const logits = ctx.getLogits(); - * const entropy = ctx.modelEntropy("bits", logits); - * - * // Or modify in-place for custom sampling - * logits[BANNED_TOKEN] = -Infinity; - * const token = ctx.sample({ temperature: 0.7 }); - * - * // Next decode invalidates the buffer - * await ctx.decode([token], position++); - * // logits is now DETACHED - do not access! - * ``` */ getLogits(): Float32Array; - /** - * STEP 3: Sample a token from logits - * - * Converts raw logits into a token decision using: - * - Temperature: controls randomness - * - Top-K/Top-P: filters unlikely tokens - * - Repeat/frequency/presence penalties (tracked across calls) - * - * NOTE: Grammar constraints are NOT applied by sample(). For grammar- - * constrained generation, use the handle-based API (createSampler / - * applySampler) or the Branch API which integrates grammar natively. - * - * This is where generation strategy happens. - * - * Cost: ~0.1ms (native sampling) - * - * @param params Sampling strategy (greedy if omitted) - * @returns Selected token ID - * @example - * ```typescript - * // Greedy (always pick most likely) - * const token = ctx.sample(); - * - * // Creative generation - * const token = ctx.sample({ temperature: 0.9 }); - * - * // Constrained to valid JSON (handle-based API) - * const grammarHandle = ctx.createSampler(grammar); - * ctx.applySampler(grammarHandle, ctx.getLogits()); - * const token = ctx.sample({ temperature: 0.7 }); - * ctx.acceptSamplerToken(grammarHandle, token); - * ``` - */ - sample(params?: SamplingParams): number; - /** * Convert token ID to text piece * @@ -762,20 +677,8 @@ export interface SessionContext { * * Cost: ~0.05ms * - * @param token Token ID from sample() + * @param token Token ID * @returns Text string for this token - * @example - * ```typescript - * while (true) { - * const token = ctx.sample({ temperature: 0.8 }); - * if (ctx.isStopToken(token)) break; - * - * const text = ctx.tokenToText(token); - * process.stdout.write(text); // Stream to output - * - * await ctx.decode([token], position++); - * } - * ``` */ tokenToText(token: number): string; @@ -795,14 +698,6 @@ export interface SessionContext { * Cost: <0.01ms (fast vocabulary lookup) * * @param token Token ID to check - * @example - * ```typescript - * const token = ctx.sample(); - * if (ctx.isStopToken(token)) { - * console.log('Generation complete'); - * break; - * } - * ``` */ isStopToken(token: number): boolean; @@ -910,14 +805,6 @@ export interface SessionContext { * * @param sequenceId Sequence ID (defaults to 0 for single conversation) * @returns Highest position index, or -1 if empty - * @example - * ```typescript - * const tokens = await ctx.tokenize("Hello world"); - * await ctx.decode(tokens, 0); - * - * const maxPos = ctx.kvCacheSize(0); - * console.log(`${maxPos + 1} tokens in cache`); - * ``` */ kvCacheSize(sequenceId?: number): number; @@ -938,18 +825,6 @@ export interface SessionContext { * @param sequenceId Sequence ID (use 0 for single sequence) * @param start Start position (inclusive) * @param end End position (exclusive), -1 = to end - * @example - * ```typescript - * // Remove old tokens to stay under context limit - * const currentLength = ctx.kvCacheSize(0); - * if (currentLength > 2000) { - * // Remove oldest 500 tokens - * await ctx.kvCacheRemove(0, 0, 500); - * - * // THEN decode new tokens - * await ctx.decode(newTokens, currentLength - 500); - * } - * ``` */ kvCacheRemove(sequenceId: number, start: number, end: number): Promise; @@ -968,17 +843,6 @@ export interface SessionContext { * * @param sequenceId Sequence ID (use 0 for single sequence) * @returns Serialized state buffer - * @example - * ```typescript - * // Save state before risky operation - * const snapshot = await ctx.kvCacheSave(0); - * - * // Try something - * await ctx.decode(riskyTokens, position); - * - * // Didn't work - restore previous state - * await ctx.kvCacheLoad(0, snapshot); - * ``` */ kvCacheSave(sequenceId?: number): Promise; @@ -1014,14 +878,6 @@ export interface SessionContext { * * Cost: ~1ms * - * @example - * ```typescript - * // Start fresh conversation - * await ctx.kvCacheClear(); - * - * const tokens = await ctx.tokenize("New conversation"); - * await ctx.decode(tokens, 0); - * ``` */ kvCacheClear(): Promise; @@ -1113,19 +969,6 @@ export interface SessionContext { * @param dstSeqId Destination sequence to copy to * @param p0 Start position (must be 0, default: 0) * @param p1 End position (must be -1 for full copy, default: -1) - * @example - * ```typescript - * // Decode shared prompt to seq 0 - * await ctx.decode(promptTokens, 0); - * - * // Fork to seq 1 and seq 2 (metadata-only, instant) - * ctx.kvSeqCopy(0, 1); - * ctx.kvSeqCopy(0, 2); - * - * // Divergent generation — only new tokens allocate KV entries - * await ctx.decode([tokenA], position, 1); - * await ctx.decode([tokenB], position, 2); - * ``` */ kvSeqCopy(srcSeqId: number, dstSeqId: number, p0?: number, p1?: number): void; @@ -1162,91 +1005,6 @@ export interface SessionContext { */ kvSeqPosMax(seqId: number): number; - // ===== HANDLE-BASED GRAMMAR ===== - - /** - * Create a new grammar sampler (returns handle) - * - * Creates an independent grammar sampler instance with its own state. - * Returns a handle that can be used with applySampler/acceptSamplerToken. - * Multiple handles can coexist with independent parser states. - * - * Cost: ~0.1-1ms depending on grammar complexity - * - * @param grammarStr GBNF grammar string - * @returns Handle to the created sampler - * @example - * ```typescript - * const grammarHandle = ctx.createSampler(jsonGrammar); - * - * // Apply grammar constraints to logits - * ctx.applySampler(grammarHandle, logitsBuffer); - * ctx.acceptSamplerToken(grammarHandle, token); - * - * // Create independent copy with same grammar - * const clonedHandle = ctx.cloneSampler(grammarHandle); - * - * // Cleanup when done - * ctx.freeSamplerHandle(grammarHandle); - * ctx.freeSamplerHandle(clonedHandle); - * ``` - */ - createSampler(grammarStr: string): number; - - /** - * Apply grammar constraints using handle-based sampler - * - * Masks invalid tokens with -Infinity based on parser state. - * Modifies the logits buffer in-place. - * - * @param handle Sampler handle from createSampler() - * @param logitsBuffer ArrayBuffer or TypedArray containing logits - */ - applySampler(handle: number, logitsBuffer: ArrayBuffer | Float32Array): void; - - /** - * Accept token to advance grammar parser state (handle-based) - * - * Must be called after sampling to advance the grammar parser. - * - * @param handle Sampler handle from createSampler() - * @param tokenId Token that was sampled - */ - acceptSamplerToken(handle: number, tokenId: number): void; - - /** - * Clone a grammar sampler - * - * Creates a copy of the sampler with identical parser state. - * Both handles can then be used independently with their own state. - * - * @param handle Sampler handle to clone - * @returns New handle to cloned sampler - * @example - * ```typescript - * const original = ctx.createSampler(jsonGrammar); - * ctx.acceptSamplerToken(original, openBrace); - * - * // Clone preserves parser state (already accepted openBrace) - * const copy = ctx.cloneSampler(original); - * - * // Both can now continue independently - * ctx.acceptSamplerToken(original, tokenA); - * ctx.acceptSamplerToken(copy, tokenB); - * ``` - */ - cloneSampler(handle: number): number; - - /** - * Free a grammar sampler handle - * - * Releases memory for the specified sampler. - * Handle becomes invalid after this call. - * - * @param handle Sampler handle to free - */ - freeSamplerHandle(handle: number): void; - // ===== METRICS API ===== /** @@ -1256,30 +1014,18 @@ export interface SessionContext { * - Low surprisal: Model expected this token (high probability) * - High surprisal: Model didn't expect this token (low probability) * - * Call after decode() to compute surprisal for any token based on - * the current logits distribution, or pass captured logits for - * offline computation (e.g., best-of-n scoring from prefill logits). + * Pass captured logits (e.g., from {@link Branch.getLogits}) for + * offline computation, or omit to use the current context logits. * * @param pickedTokenId - Token ID to compute surprisal for * @param base - Logarithm base: "nats" (default) or "bits" * @param logits - Optional Float32Array of logits (uses current context logits if omitted) * @returns Surprisal value in specified base * - * @example Current context logits (default) - * ```typescript - * await ctx.decode(tokens, position); - * const token = ctx.sample(); - * const surprisal = ctx.modelSurprisal(token, "bits"); - * console.log(`Model surprise: ${surprisal.toFixed(2)} bits`); - * ``` - * - * @example Captured/arbitrary logits (for best-of-n, verification, etc.) + * @example With branch logits * ```typescript - * // Capture logits after prefill - * const capturedLogits = new Float32Array(ctx.getLogits()); - * - * // Later: compute surprisal from captured logits - * const surprisal = ctx.modelSurprisal(token, "nats", capturedLogits); + * const { token } = await branch.produce(); + * const surprisal = ctx.modelSurprisal(token, "bits", branch.getLogits()); * ``` * * COST: O(n_vocab) - softmax normalization required @@ -1293,181 +1039,22 @@ export interface SessionContext { * - Low entropy: Model is confident (peaked distribution) * - High entropy: Model is uncertain (flat distribution) * - * Call after decode() to analyze the current prediction distribution, - * or pass captured logits for offline analysis. + * Pass captured logits (e.g., from {@link Branch.getLogits}) for + * offline analysis, or omit to use the current context logits. * * @param base - Logarithm base: "nats" (default) or "bits" * @param logits - Optional Float32Array of logits (uses current context logits if omitted) * @returns Entropy value in specified base * - * @example Current context logits (default) + * @example With branch logits * ```typescript - * await ctx.decode(tokens, position); - * const entropy = ctx.modelEntropy("bits"); - * if (entropy > 5.0) { - * console.log("Model is very uncertain - consider adjusting parameters"); - * } - * ``` - * - * @example Captured/arbitrary logits - * ```typescript - * const capturedLogits = new Float32Array(ctx.getLogits()); - * const entropy = ctx.modelEntropy("nats", capturedLogits); + * const entropy = ctx.modelEntropy("bits", branch.getLogits()); * ``` * * COST: O(n_vocab) - must sum over all token probabilities */ modelEntropy(base?: 'nats' | 'bits', logits?: Float32Array): number; - /** - * Create a new perplexity tracker. - * - * @returns Integer handle to the tracker - * - * @example - * ```typescript - * const tracker = ctx.createPerplexityTracker(); - * - * // Add surprisals during generation - * for (let i = 0; i < tokens.length; i++) { - * const surprisal = ctx.modelSurprisal(tokens[i]); - * ctx.addSurprisal(tracker, surprisal); - * } - * - * const ppl = ctx.getPerplexity(tracker); - * console.log(`Sequence perplexity: ${ppl.toFixed(2)}`); - * - * ctx.freePerplexityTracker(tracker); - * ``` - */ - createPerplexityTracker(): number; - - /** - * Add a surprisal value to the rolling tracker. - * - * @param handle - Tracker handle from createPerplexityTracker() - * @param surprisal - Surprisal value (from modelSurprisal or computed) - * - * @example - * ```typescript - * const surprisal = ctx.modelSurprisal(tokenId, "nats"); - * ctx.addSurprisal(tracker, surprisal); - * ``` - * - * COST: O(1) - numerically stable accumulation - * THREAD-SAFETY: Not thread-safe (handle is session-local) - */ - addSurprisal(handle: number, surprisal: number): void; - - /** - * Get current perplexity value. - * - * @param handle - Tracker handle - * @returns Perplexity = exp(average_surprisal_in_nats) - * - * @example - * ```typescript - * const ppl = ctx.getPerplexity(tracker); - * console.log(`Current PPL: ${ppl.toFixed(2)}`); - * ``` - * - * FORMULA: PPL = exp(sum_surprisals / count) - * RANGE: [1, ∞) where 1 = perfect prediction - */ - getPerplexity(handle: number): number; - - /** - * Clone a perplexity tracker (for fork/branch scenarios). - * - * @param sourceHandle - Handle to clone from - * @returns New handle with same accumulated state - * - * @example - * ```typescript - * // Branch A and B start from same base perplexity - * const baseTracker = ctx.createPerplexityTracker(); - * // ... accumulate base surprisals ... - * - * const branchA = ctx.clonePerplexityTracker(baseTracker); - * const branchB = ctx.clonePerplexityTracker(baseTracker); - * - * // Branch A and B now track independently - * ctx.addSurprisal(branchA, surprisalA); - * ctx.addSurprisal(branchB, surprisalB); - * ``` - */ - clonePerplexityTracker(sourceHandle: number): number; - - /** - * Reset tracker to initial state (count=0, sum=0). - * - * @param handle - Tracker handle to reset - * - * @example - * ```typescript - * // Reuse tracker for multiple sequences - * const tracker = ctx.createPerplexityTracker(); - * - * for (const sequence of sequences) { - * ctx.resetPerplexityTracker(tracker); - * // ... process sequence ... - * const ppl = ctx.getPerplexity(tracker); - * } - * ``` - */ - resetPerplexityTracker(handle: number): void; - - /** - * Get number of tokens tracked. - * - * @param handle - Tracker handle - * @returns Number of surprisal values added - */ - getPerplexityCount(handle: number): number; - - /** - * Free perplexity tracker resources. - * - * @param handle - Tracker handle to free - * - * NOTE: Auto-freed in dispose() if not manually freed - */ - freePerplexityTracker(handle: number): void; - - // ===== ATOMIC DECODE+CAPTURE ===== - - /** - * Decode tokens and capture logits atomically - * - * Performs decode and logits capture as a single atomic operation, - * ensuring the captured logits correspond exactly to the decoded tokens. - * - * Use this instead of separate decode() + getLogits() calls when - * you need guaranteed consistency between decode and logits capture. - * - * @param tokens Token IDs to decode - * @param position Start position in sequence - * @param seqId Sequence ID - * @param destBuffer Pre-allocated buffer to receive logits (vocabSize floats) - * @example - * ```typescript - * // Pre-allocate buffer (reuse across calls) - * const logitsBuffer = new Float32Array(ctx.vocabSize); - * - * // Atomic decode + capture - * await ctx.decodeAndCapture([token], position, seqId, logitsBuffer); - * - * // Safe to process logitsBuffer - it's an independent copy - * const nextToken = sampleFromLogits(logitsBuffer); - * ``` - */ - decodeAndCapture( - tokens: number[], - position: number, - seqId: number, - destBuffer: ArrayBuffer | Float32Array - ): Promise; - // ===== KV CACHE FILE PERSISTENCE ===== /** @@ -1528,26 +1115,8 @@ export interface SessionContext { * ])); * * const tokens = await ctx.tokenize(result.prompt); - * await ctx.decode(tokens, 0); - * ``` - * - * @example With tools - * ```typescript - * const tools = [{ type: 'function', function: { - * name: 'get_weather', description: 'Get weather', - * parameters: { type: 'object', properties: { location: { type: 'string' } } } - * }}]; - * const result = await ctx.formatChat(JSON.stringify(messages), { - * tools: JSON.stringify(tools), - * toolChoice: 'auto' - * }); - * // result.grammar contains GBNF for constrained tool call generation - * // result.format identifies the chat format for output parsing - * ``` - * - * @example Backward compatible (string as second arg) - * ```typescript - * const result = await ctx.formatChat(messagesJson, templateOverrideString); + * const branch = Branch.create(ctx, 0, { temperature: 0.7 }); + * await branch.prefill(tokens); * ``` */ formatChat( @@ -1601,12 +1170,11 @@ export interface SessionContext { * messages.push({ role: 'user', content: userContent }); * * if (!branch) { - * // Cold path: format full conversation, tokenize with BOS, decode all + * // Cold path: format full conversation, tokenize with BOS, prefill * fmt = await ctx.formatChat(JSON.stringify(messages)); * const tokens = await ctx.tokenize(fmt.prompt); - * await ctx.decode(tokens, 0, 0); - * branch = Branch.create(ctx, tokens.length, { temperature: 0.7 }); - * branch.captureLogits(); + * branch = Branch.create(ctx, 0, { temperature: 0.7 }); + * await branch.prefill(tokens); * } else { * // Warm path: string-diff for delta tokens * const { prompt: full } = await ctx.formatChat(JSON.stringify(messages)); @@ -1621,7 +1189,7 @@ export interface SessionContext { * // Generate * let rawOutput = ''; * while (true) { - * const { token, text, isStop } = branch.produce(); + * const { token, text, isStop } = await branch.produce(); * if (isStop) break; * rawOutput += text; * await branch.commit(token); @@ -1653,7 +1221,7 @@ export interface SessionContext { * Convert JSON schema to GBNF grammar * * Generates grammar string for constrained JSON generation. - * Use with createSampler() for grammar-constrained generation. + * Use with {@link Branch.create} grammar parameter for constrained generation. * * Cost: ~1-10ms depending on schema complexity * @@ -1671,7 +1239,7 @@ export interface SessionContext { * }; * * const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema)); - * const handle = ctx.createSampler(grammar); + * const branch = Branch.create(ctx, 0, params, undefined, grammar); * ``` */ jsonSchemaToGrammar(schemaJson: string): Promise; @@ -1779,18 +1347,6 @@ export interface SessionContext { */ hasPooling(): boolean; - // ===== NATIVE REFERENCE IMPLEMENTATIONS ===== - - /** - * Sample greedily from current logits - * - * Selects token with highest logit value (deterministic). - * Equivalent to sample() with temperature=0. - * - * @returns Token ID with highest probability - */ - greedySample(): number; - // ===== PROPERTIES ===== /** @@ -1877,6 +1433,12 @@ export interface SessionContext { /** @internal Clear all dynamic logit biases from a branch */ _branchClearSteer(handle: number): void; + /** @internal Replace sampler chain with new parameters (memoized) */ + _branchSetSamplerParams(handle: number, params: SamplingParams): void; + + /** @internal Replace or remove grammar constraint */ + _branchSetGrammar(handle: number, grammarStr: string): void; + // ===== STORE API (internal, wrapped by BranchStore) ===== /** @internal Batched accept + decode_each + capture for N branches */ @@ -1905,7 +1467,6 @@ export interface SessionContext { * - KV cache: `nCtx * 2 * nLayers * dHead` bytes per KV type (fp16 default). * For a 7B model with `nCtx: 4096`, expect ~1-2 GB of KV memory. * - Compute scratch: temporary buffers for the forward pass, sized to `nBatch`. - * - Sampler state: penalty tracking window, PRNG state. * * **Model sharing:** If two contexts use the same `modelPath`, the model * weights are loaded once and shared. Only the KV cache and compute buffers @@ -1926,8 +1487,9 @@ export interface SessionContext { * * try { * const tokens = await ctx.tokenize("Hello"); - * await ctx.decode(tokens, 0); - * const token = ctx.sample({ temperature: 0.7 }); + * const branch = Branch.create(ctx, 0, { temperature: 0.7 }); + * await branch.prefill(tokens); + * for await (const { text } of branch) process.stdout.write(text); * } finally { * ctx.dispose(); * } @@ -2015,56 +1577,16 @@ export function loadBinary(variant?: GpuVariant): { * The callback MUST NOT: * - Store the logits reference * - Return a Promise (will throw) - * - Call decode() (would invalidate logits) * * This prevents common bugs where logits become invalid due to * async operations between access and usage. * - * How it works: - * - Memoization: Multiple getLogits() calls in same step return same buffer - * - Revocation: Next decode() invalidates previous buffer - * * @template T Return type of the callback * @param ctx The session context * @param fn Synchronous callback that uses logits - must not return a Promise * @returns The result from the callback * @throws Error if callback returns a Promise (async usage not allowed) * - * @example Safe synchronous usage - * ```typescript - * // Compute entropy synchronously - * const entropy = withLogits(ctx, (logits) => { - * let maxLogit = logits[0]; - * for (let i = 1; i < logits.length; i++) { - * if (logits[i] > maxLogit) maxLogit = logits[i]; - * } - * - * let sumExp = 0; - * for (let i = 0; i < logits.length; i++) { - * sumExp += Math.exp(logits[i] - maxLogit); - * } - * - * let entropy = 0; - * for (let i = 0; i < logits.length; i++) { - * const p = Math.exp(logits[i] - maxLogit) / sumExp; - * if (p > 0) entropy -= p * Math.log(p); - * } - * return entropy; - * }); - * - * // Now safe to decode (previous logits buffer is revoked) - * await ctx.decode([nextToken], position++); - * ``` - * - * @example Error: async callback - * ```typescript - * // This will throw! - * withLogits(ctx, async (logits) => { - * await something(); // NOT ALLOWED - * return logits[0]; - * }); - * ``` - * * @category Core */ export function withLogits( @@ -2208,7 +1730,7 @@ export class Branch { * tokens (user input between turns), not model-generated tokens. * For model output, use `commit()` which does accept + decode. * - * Branch-level equivalent of `ctx.decode()`. + * The primary way to feed tokens into a branch's KV cache. * * @param tokens - Token IDs to decode */ @@ -2281,7 +1803,7 @@ export class Branch { * // Block those tokens for this sample only * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity }))); * - * const { token } = branch.produce(); // Blocked tokens won't be sampled + * const { token } = await branch.produce(); // Blocked tokens won't be sampled * await branch.commit(token); * * // Clear for next iteration (recompute based on new history) @@ -2300,7 +1822,7 @@ export class Branch { * // Penalize sibling choices to encourage diversity * beam.branch.steer(siblingTokens.map(t => ({ token: t, bias: -2.0 }))); * - * const { token } = beam.branch.produce(); + * const { token } = await beam.branch.produce(); * await beam.branch.commit(token); * beam.lastToken = token; * beam.branch.clearSteer(); @@ -2332,7 +1854,7 @@ export class Branch { * const blocked = computeConstraints(generatedTokens); * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity }))); * - * const { token, isStop } = branch.produce(); + * const { token, isStop } = await branch.produce(); * if (isStop) break; * * await branch.commit(token); @@ -2343,8 +1865,60 @@ export class Branch { */ clearSteer(): void; - /** Sample next token without advancing state. Inspect before committing. */ - produce(): Produced; + /** + * Replace the sampler chain with new parameters (memoized) + * + * If the new params match the current chain's params, this is a no-op. + * Otherwise the old chain is freed and a new one is created. Use for + * Entropy-Driven Temperature (EDT) and other adaptive sampling strategies + * that adjust parameters per-step. + * + * @param params - New sampling parameters + * + * @example Entropy-Driven Temperature + * ```typescript + * const entropy = ctx.modelEntropy('nats', branch.getLogits()); + * branch.setSamplerParams({ temperature: edtTemperature(entropy) }); + * const { token } = await branch.produce(); + * await branch.commit(token); + * ``` + */ + setSamplerParams(params: SamplingParams): void; + + /** + * Replace or remove the grammar constraint + * + * Pass a GBNF grammar string to constrain generation. Pass empty string + * or undefined to remove the constraint. The grammar state is cloned on + * fork(), so sibling branches can diverge independently after hot-swap. + * + * @param grammarStr - GBNF grammar string, or empty/undefined to remove + * + * @example Hot-swap grammar mid-generation + * ```typescript + * // Start unconstrained, then switch to JSON after detecting tool call + * branch.setGrammar(jsonGrammar); + * const { token } = await branch.produce(); + * ``` + */ + setGrammar(grammarStr?: string): void; + + /** + * Sample next token without advancing state (async) + * + * Async contract: local branches resolve immediately; cloud branches + * may perform an HTTP round-trip. Use {@link produceSync} when you know + * the branch is local and want zero-overhead sampling. + */ + produce(): Promise; + + /** + * Sample next token without advancing state (sync) + * + * Same as {@link produce} but synchronous. Use when you know the branch + * is local and want to avoid the microtick overhead of a promise. + */ + produceSync(): Produced; /** * Accept and decode — update branch state, then write token to KV @@ -2428,9 +2002,9 @@ export class Branch { * Packs N tokens into a single batch via `decode_each` (one row per sequence, * all at their respective positions). Single `llama_decode()` call. Logits * captured per-branch at batch index `i`. O(N) total work, O(1) GPU - * dispatches, O(1) amortized dispatch overhead per branch. Post-decode, - * accepts each token into its branch's repeat-penalty window. Decode-first - * ordering ensures sampler state stays consistent if decode throws. + * dispatches, O(1) amortized dispatch overhead per branch. Accept-first + * ordering with rollback: accepts each token into its branch's repeat-penalty + * window before decode, restores from clones if decode throws. * * **prefill()** — Bulk token injection. Each branch contributes a * variable-length token array. Uses a two-pass bin-packing algorithm: @@ -2459,7 +2033,7 @@ export class Branch { * @example 32-branch generation step — one GPU dispatch * ```typescript * const store = new BranchStore(ctx); - * const entries = branches.map(b => [b, b.produce().token] as [Branch, number]); + * const entries = await Promise.all(branches.map(async b => [b, (await b.produce()).token] as [Branch, number])); * await store.commit(entries); // 32 tokens, 1 llama_decode() * ``` * @@ -2470,8 +2044,8 @@ export class Branch { * for (const _ of [1, 2, 3]) branches.push(await root.fork()); * * for (let step = 0; step < 50; step++) { - * const live = branches.map(b => [b, b.produce()] as const) - * .filter(([, p]) => !p.isStop); + * const produced = await Promise.all(branches.map(async b => [b, await b.produce()] as const)); + * const live = produced.filter(([, p]) => !p.isStop); * if (!live.length) break; * await store.commit(live.map(([b, p]) => [b, p.token])); * } diff --git a/lib/index.js b/lib/index.js index 3541a22..928c3f7 100644 --- a/lib/index.js +++ b/lib/index.js @@ -17,18 +17,13 @@ * // Tokenize * const tokens = await ctx.tokenize("Hello world"); * - * // Decode - * await ctx.decode(tokens, 0); - * - * // Safe logits access (Runtime Borrow Checker pattern) - * const entropy = withLogits(ctx, (logits) => { - * // logits is valid here - use synchronously only! - * return myComputeEntropy(logits); - * }); - * - * // Or with native reference implementations (for testing) - * const entropy = ctx.modelEntropy(); - * const token = ctx.greedySample(); + * // Generate via Branch API + * const branch = Branch.create(ctx, 0, { temperature: 0.7 }); + * await branch.prefill(tokens); + * for await (const { text } of branch) { + * process.stdout.write(text); + * } + * await branch.prune(); * * // Cleanup * ctx.dispose(); diff --git a/liblloyal b/liblloyal index 4c932ea..557c4ef 160000 --- a/liblloyal +++ b/liblloyal @@ -1 +1 @@ -Subproject commit 4c932ea0b74d5dd8392458f12027c5c55875f2a3 +Subproject commit 557c4ef6c7f88824c6fdbc029ad9e9b8bea4f73d diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp index 0fb6696..b812a76 100644 --- a/src/SessionContext.cpp +++ b/src/SessionContext.cpp @@ -1,7 +1,6 @@ #include "SessionContext.hpp" #include "BackendManager.hpp" #include "FileSystem.h" -#include #include #include #include @@ -409,39 +408,6 @@ class TokenizeWorker : public Napi::AsyncWorker { /** * AsyncWorker for decode operation */ -class DecodeWorker : public Napi::AsyncWorker { -public: - DecodeWorker(Napi::Env env, llama_context* ctx, const std::vector& tokens, - int32_t pos, llama_seq_id seqId, int32_t nBatch) - : AsyncWorker(env), _deferred(env), _ctx(ctx), _tokens(tokens), _pos(pos), _seqId(seqId), _nBatch(nBatch) {} - - void Execute() override { - try { - lloyal::decode::many(_ctx, _tokens, _pos, _nBatch, _seqId); - } catch (const std::exception& e) { - SetError(e.what()); - } - } - - void OnOK() override { - _deferred.Resolve(Env().Undefined()); - } - - void OnError(const Napi::Error& err) override { - _deferred.Reject(err.Value()); - } - - Napi::Promise GetPromise() { return _deferred.Promise(); } - -private: - Napi::Promise::Deferred _deferred; - llama_context* _ctx; - std::vector _tokens; - int32_t _pos; - llama_seq_id _seqId; - int32_t _nBatch; -}; - /** * AsyncWorker for encode operation (embedding extraction) * Unlike DecodeWorker, marks ALL tokens with logits=true @@ -664,14 +630,16 @@ class StoreCommitWorker : public Napi::AsyncWorker { // RAII snapshot of accept-mutable state. Destructor frees anything still // owned, so partial clones from a throwing OOM don't leak. struct Snapshot { - llama_sampler* sampler = nullptr; - llama_sampler* grammar = nullptr; - lloyal::metrics::BranchMetricsHandle metrics = 0; + lloyal::branch::SamplerChainHandle sampler = 0; + lloyal::branch::GrammarHandle grammar = 0; + lloyal::branch::MetricsHandle metrics = 0; + lloyal::branch::BranchStore* store = nullptr; ~Snapshot() { - if (sampler) lloyal::sampler::free_chain(sampler); - if (grammar) lloyal::grammar::free_sampler(grammar); - if (metrics) lloyal::metrics::free_branch_metrics(metrics); + if (!store) return; + if (sampler) store->free_sampler(sampler); + if (grammar) store->free_grammar(grammar); + if (metrics) store->free_metrics(metrics); } void restore_into(lloyal::branch::BranchState& st) { @@ -691,12 +659,13 @@ class StoreCommitWorker : public Napi::AsyncWorker { if (!st) throw std::runtime_error("StoreCommitWorker: invalid handle"); auto s = std::make_unique(); - s->sampler = st->sampler_chain - ? lloyal::sampler::clone_chain(st->sampler_chain) : nullptr; - s->grammar = st->grammar - ? lloyal::grammar::clone_sampler(st->grammar) : nullptr; + s->store = &_store; + s->sampler = st->sampler_chain != 0 + ? _store.clone_sampler(st->sampler_chain) : 0; + s->grammar = st->grammar != 0 + ? _store.clone_grammar(st->grammar) : 0; s->metrics = st->metrics != 0 - ? lloyal::metrics::clone_branch_metrics(st->metrics) : 0; + ? _store.clone_metrics(st->metrics) : 0; snaps[i] = std::move(s); } @@ -768,45 +737,6 @@ class StorePrefillWorker : public Napi::AsyncWorker { std::vector> _tokenStorage; }; -/** - * AsyncWorker for decode + logits capture into a JS ArrayBuffer - * Pins the dest ArrayBuffer via Napi::Reference to prevent GC during Execute() - */ -class DecodeAndCaptureWorker : public Napi::AsyncWorker { -public: - DecodeAndCaptureWorker(Napi::Env env, llama_context* ctx, - std::vector tokens, - int32_t pos, llama_seq_id seqId, int32_t nBatch, - float* dest, int nVocab, - Napi::Reference bufRef) - : AsyncWorker(env), _deferred(env), _ctx(ctx), _tokens(std::move(tokens)), - _pos(pos), _seqId(seqId), _nBatch(nBatch), _dest(dest), _nVocab(nVocab), - _bufRef(std::move(bufRef)) {} - - void Execute() override { - try { - lloyal::decode::many(_ctx, _tokens, _pos, _nBatch, _seqId); - float* logits = lloyal::logits::get(_ctx, -1); - std::memcpy(_dest, logits, _nVocab * sizeof(float)); - } catch (const std::exception& e) { SetError(e.what()); } - } - - void OnOK() override { _deferred.Resolve(Env().Undefined()); } - void OnError(const Napi::Error& err) override { _deferred.Reject(err.Value()); } - Napi::Promise GetPromise() { return _deferred.Promise(); } - -private: - Napi::Promise::Deferred _deferred; - llama_context* _ctx; - std::vector _tokens; - int32_t _pos; - llama_seq_id _seqId; - int32_t _nBatch; - float* _dest; - int _nVocab; - Napi::Reference _bufRef; // prevent GC of dest buffer -}; - /** * AsyncWorker for JSON schema → GBNF grammar conversion * Pure CPU, no shared state — cleanest worker @@ -836,10 +766,8 @@ class JsonSchemaToGrammarWorker : public Napi::AsyncWorker { Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) { Napi::Function func = DefineClass(env, "SessionContext", { - // ===== THE GENERATION LOOP ===== - InstanceMethod("decode", &SessionContext::decode), + // ===== CORE ===== InstanceMethod("getLogits", &SessionContext::getLogits), - InstanceMethod("sample", &SessionContext::sample), InstanceMethod("tokenToText", &SessionContext::tokenToText), InstanceMethod("isStopToken", &SessionContext::isStopToken), InstanceMethod("getEogToken", &SessionContext::getEogToken), @@ -864,16 +792,6 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) { InstanceMethod("kvSeqKeep", &SessionContext::kvSeqKeep), InstanceMethod("kvSeqPosMax", &SessionContext::kvSeqPosMax), - // ===== HANDLE-BASED GRAMMAR ===== - InstanceMethod("createSampler", &SessionContext::createSampler), - InstanceMethod("applySampler", &SessionContext::applySampler), - InstanceMethod("acceptSamplerToken", &SessionContext::acceptSamplerToken), - InstanceMethod("cloneSampler", &SessionContext::cloneSampler), - InstanceMethod("freeSamplerHandle", &SessionContext::freeSamplerHandle), - - // ===== ATOMIC DECODE+CAPTURE ===== - InstanceMethod("decodeAndCapture", &SessionContext::decodeAndCapture), - // ===== HELPERS ===== InstanceMethod("formatChat", &SessionContext::formatChat), InstanceMethod("parseChatOutput", &SessionContext::parseChatOutput), @@ -889,16 +807,6 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) { // ===== METRICS API ===== InstanceMethod("modelSurprisal", &SessionContext::modelSurprisal), InstanceMethod("modelEntropy", &SessionContext::modelEntropy), - InstanceMethod("createPerplexityTracker", &SessionContext::createPerplexityTracker), - InstanceMethod("addSurprisal", &SessionContext::addSurprisal), - InstanceMethod("getPerplexity", &SessionContext::getPerplexity), - InstanceMethod("clonePerplexityTracker", &SessionContext::clonePerplexityTracker), - InstanceMethod("resetPerplexityTracker", &SessionContext::resetPerplexityTracker), - InstanceMethod("getPerplexityCount", &SessionContext::getPerplexityCount), - InstanceMethod("freePerplexityTracker", &SessionContext::freePerplexityTracker), - - // ===== NATIVE REFERENCE IMPLEMENTATIONS ===== - InstanceMethod("greedySample", &SessionContext::greedySample), // ===== LIFECYCLE ===== InstanceMethod("dispose", &SessionContext::dispose), @@ -923,6 +831,8 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) { InstanceMethod("_branchSamplerChainReseed", &SessionContext::_branchSamplerChainReseed), InstanceMethod("_branchSteer", &SessionContext::_branchSteer), InstanceMethod("_branchClearSteer", &SessionContext::_branchClearSteer), + InstanceMethod("_branchSetSamplerParams", &SessionContext::_branchSetSamplerParams), + InstanceMethod("_branchSetGrammar", &SessionContext::_branchSetGrammar), // ===== STORE API (internal, wrapped by lib/BranchStore.js) ===== InstanceMethod("_storeCommit", &SessionContext::_storeCommit), @@ -954,26 +864,6 @@ SessionContext::SessionContext(const Napi::CallbackInfo& info) SessionContext::~SessionContext() { if (!_disposed) { - // Free handle-based grammar samplers first - for (auto& [handle, sampler] : _samplerHandles) { - if (sampler) { - llama_sampler_free(sampler); - } - } - _samplerHandles.clear(); - - // Free handle-based perplexity trackers - for (auto& [napiHandle, pplHandle] : _perplexityHandles) { - lloyal::metrics::free_perplexity(pplHandle); - } - _perplexityHandles.clear(); - - // Free persistent sampler chain (pattern from branch.hpp) - if (_samplerChain) { - lloyal::sampler::free_chain(_samplerChain); - _samplerChain = nullptr; - } - // Free context (depends on model) if (_context) { llama_free(_context); @@ -1073,43 +963,6 @@ Napi::Value SessionContext::getLogits(const Napi::CallbackInfo& info) { return Napi::Float32Array::New(env, n_vocab, buffer, 0); } -Napi::Value SessionContext::decode(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 2 || !info[0].IsArray() || !info[1].IsNumber()) { - throw Napi::TypeError::New(env, "Expected (tokens: number[], position: number[, seqId: number])"); - } - - // Revoke any active logits buffer before decode - invalidateLogits(); - - // Extract tokens - Napi::Array jsTokens = info[0].As(); - std::vector tokens; - tokens.reserve(jsTokens.Length()); - for (uint32_t i = 0; i < jsTokens.Length(); i++) { - Napi::Value val = jsTokens[i]; - if (!val.IsNumber()) { - throw Napi::TypeError::New(env, "Token array must contain only numbers"); - } - tokens.push_back(static_cast(val.As().Int32Value())); - } - - int32_t position = info[1].As().Int32Value(); - - // Extract optional seqId (default 0 for backward compatibility) - llama_seq_id seqId = 0; - if (info.Length() >= 3 && info[2].IsNumber()) { - seqId = static_cast(info[2].As().Int32Value()); - } - - // Run async - auto* worker = new DecodeWorker(env, _context, tokens, position, seqId, _nBatch); - worker->Queue(); - return worker->GetPromise(); -} - Napi::Value SessionContext::tokenize(const Napi::CallbackInfo& info) { Napi::Env env = info.Env(); ensureNotDisposed(); @@ -1250,20 +1103,6 @@ Napi::Value SessionContext::modelEntropy(const Napi::CallbackInfo& info) { return Napi::Number::New(env, static_cast(entropy)); } -Napi::Value SessionContext::greedySample(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (!_context) { - throw Napi::Error::New(env, "Context not initialized"); - } - - // Use liblloyal greedy sampler with model overload - llama_token token = lloyal::sampler::greedy(_context, _model.get()); - - return Napi::Number::New(env, static_cast(token)); -} - Napi::Value SessionContext::tokenToText(const Napi::CallbackInfo& info) { Napi::Env env = info.Env(); ensureNotDisposed(); @@ -1488,69 +1327,6 @@ Napi::Value SessionContext::kvCacheSize(const Napi::CallbackInfo& info) { return Napi::Number::New(env, static_cast(max_pos)); } -Napi::Value SessionContext::sample(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (!_context) { - throw Napi::Error::New(env, "Context not initialized"); - } - - llama_token next_token; - - // Use greedy if no params, otherwise use persistent sampler chain - // Pattern from branch.hpp: create chain once, reuse across samples, call accept() after - if (info.Length() == 0 || !info[0].IsObject()) { - // No params - use greedy sampling (stateless, no chain needed) - next_token = lloyal::sampler::greedy(_context, _model.get()); - } else { - // Use adapter to convert JS params → liblloyal-compatible structure - LloyalSamplingParams params = adaptSamplingParamsFromJS(info[0].As()); - - // Create or rebuild sampler chain if params changed - // Pattern from branch.hpp: persistent chain enables repeat penalty tracking - if (!_samplerChain || params != _samplerParams) { - if (_samplerChain) { - lloyal::sampler::free_chain(_samplerChain); - } - _samplerChain = lloyal::sampler::create_chain(params); - _samplerParams = params; - } - - // Get logits and build candidate array (pattern from branch.hpp::sample) - const int n_vocab = lloyal::tokenizer::vocab_size(_model.get()); - float* logits = lloyal::logits::get(_context, -1); - - std::vector candidates(n_vocab); - for (int i = 0; i < n_vocab; i++) { - candidates[i] = llama_token_data{static_cast(i), logits[i], 0.0f}; - } - - llama_token_data_array cur_p = { - candidates.data(), - static_cast(n_vocab), - -1, // selected - false // sorted - }; - - // Apply persistent sampler chain (includes penalties, filters, temp, dist) - lloyal::sampler::apply(_samplerChain, &cur_p); - - if (cur_p.selected == -1) { - throw Napi::Error::New(env, "Sampling failed - no token selected"); - } - - next_token = cur_p.data[cur_p.selected].id; - - // Update penalty history in persistent chain (KEY CHANGE from old stateless approach) - // This enables repeat penalty to track ALL tokens across the generation, - // not just what's visible in the current KV cache window after clearAndReseed() - lloyal::sampler::accept(_samplerChain, next_token); - } - - return Napi::Number::New(env, static_cast(next_token)); -} - Napi::Value SessionContext::dispose(const Napi::CallbackInfo& info) { Napi::Env env = info.Env(); @@ -1558,26 +1334,6 @@ Napi::Value SessionContext::dispose(const Napi::CallbackInfo& info) { // Revoke any active logits buffer before disposing invalidateLogits(); - // Free handle-based grammar samplers - for (auto& [handle, sampler] : _samplerHandles) { - if (sampler) { - llama_sampler_free(sampler); - } - } - _samplerHandles.clear(); - - // Free handle-based perplexity trackers - for (auto& [napiHandle, pplHandle] : _perplexityHandles) { - lloyal::metrics::free_perplexity(pplHandle); - } - _perplexityHandles.clear(); - - // Free persistent sampler chain (pattern from branch.hpp) - if (_samplerChain) { - lloyal::sampler::free_chain(_samplerChain); - _samplerChain = nullptr; - } - // Drain branch store while context is still alive _branchStore.drain(); @@ -1648,358 +1404,6 @@ Napi::Value SessionContext::kvSeqPosMax(const Napi::CallbackInfo& info) { return Napi::Number::New(env, static_cast(pos)); } -// ===== HANDLE-BASED GRAMMAR ===== - -Napi::Value SessionContext::createSampler(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 1 || !info[0].IsString()) { - throw Napi::TypeError::New(env, "Expected (grammarStr: string)"); - } - - std::string grammarStr = info[0].As().Utf8Value(); - llama_sampler* sampler = lloyal::grammar::init_sampler(_model.get(), grammarStr); - - if (!sampler) { - throw Napi::Error::New(env, "Failed to create grammar sampler"); - } - - int32_t handle = _nextSamplerHandle++; - _samplerHandles[handle] = sampler; - - return Napi::Number::New(env, static_cast(handle)); -} - -Napi::Value SessionContext::applySampler(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 2) { - throw Napi::TypeError::New(env, "Expected (handle, logitsBuffer)"); - } - - int32_t handle = static_cast(info[0].As().Int32Value()); - - auto it = _samplerHandles.find(handle); - if (it == _samplerHandles.end()) { - throw Napi::Error::New(env, "Invalid sampler handle"); - } - - // Get logits buffer - Napi::ArrayBuffer buffer; - if (info[1].IsArrayBuffer()) { - buffer = info[1].As(); - } else if (info[1].IsTypedArray()) { - buffer = info[1].As().ArrayBuffer(); - } else { - throw Napi::TypeError::New(env, "Expected ArrayBuffer or TypedArray"); - } - - float* logits = static_cast(buffer.Data()); - int n_vocab = lloyal::tokenizer::vocab_size(_model.get()); - - // Build candidates array - std::vector candidates(n_vocab); - for (int i = 0; i < n_vocab; i++) { - candidates[i] = llama_token_data{static_cast(i), logits[i], 0.0f}; - } - - llama_token_data_array arr = {candidates.data(), static_cast(n_vocab), -1, false}; - - // Apply grammar (modifies candidates) - llama_sampler_apply(it->second, &arr); - - // Write back to buffer - for (int i = 0; i < n_vocab; i++) { - logits[i] = candidates[i].logit; - } - - return env.Undefined(); -} - -Napi::Value SessionContext::acceptSamplerToken(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 2) { - throw Napi::TypeError::New(env, "Expected (handle, tokenId)"); - } - - int32_t handle = static_cast(info[0].As().Int32Value()); - llama_token token = static_cast(info[1].As().Int32Value()); - - auto it = _samplerHandles.find(handle); - if (it == _samplerHandles.end()) { - throw Napi::Error::New(env, "Invalid sampler handle"); - } - - try { - llama_sampler_accept(it->second, token); - } catch (const std::exception& e) { - throw Napi::Error::New(env, e.what()); - } - return env.Undefined(); -} - -Napi::Value SessionContext::cloneSampler(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 1) { - throw Napi::TypeError::New(env, "Expected (handle)"); - } - - int32_t handle = static_cast(info[0].As().Int32Value()); - - auto it = _samplerHandles.find(handle); - if (it == _samplerHandles.end()) { - throw Napi::Error::New(env, "Invalid sampler handle"); - } - - llama_sampler* cloned = lloyal::grammar::clone_sampler(it->second); - if (!cloned) { - throw Napi::Error::New(env, "Failed to clone sampler"); - } - - int32_t newHandle = _nextSamplerHandle++; - _samplerHandles[newHandle] = cloned; - - return Napi::Number::New(env, static_cast(newHandle)); -} - -Napi::Value SessionContext::freeSamplerHandle(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 1) { - throw Napi::TypeError::New(env, "Expected (handle)"); - } - - int32_t handle = static_cast(info[0].As().Int32Value()); - - auto it = _samplerHandles.find(handle); - if (it != _samplerHandles.end()) { - llama_sampler_free(it->second); - _samplerHandles.erase(it); - } - - return env.Undefined(); -} - -// ===== PERPLEXITY TRACKING ===== - -Napi::Value SessionContext::createPerplexityTracker(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - // Create new perplexity tracker via metrics.hpp - lloyal::metrics::PerplexityHandle handle = lloyal::metrics::create_perplexity(); - - // Generate N-API handle - int32_t napiHandle = _nextPerplexityHandle++; - _perplexityHandles[napiHandle] = handle; - - return Napi::Number::New(env, static_cast(napiHandle)); -} - -Napi::Value SessionContext::addSurprisal(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - // Argument validation - if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsNumber()) { - throw Napi::TypeError::New(env, "Expected (handle: number, surprisal: number)"); - } - - int32_t napiHandle = info[0].As().Int32Value(); - double surprisal = info[1].As().DoubleValue(); - - // Lookup handle - auto it = _perplexityHandles.find(napiHandle); - if (it == _perplexityHandles.end()) { - throw Napi::Error::New(env, "Invalid perplexity tracker handle"); - } - - // Add surprisal to tracker - lloyal::metrics::add_surprisal(it->second, static_cast(surprisal)); - - return env.Undefined(); -} - -Napi::Value SessionContext::getPerplexity(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - // Argument validation - if (info.Length() < 1 || !info[0].IsNumber()) { - throw Napi::TypeError::New(env, "Expected handle: number"); - } - - int32_t napiHandle = info[0].As().Int32Value(); - - // Lookup handle - auto it = _perplexityHandles.find(napiHandle); - if (it == _perplexityHandles.end()) { - throw Napi::Error::New(env, "Invalid perplexity tracker handle"); - } - - // Get perplexity value - float ppl = lloyal::metrics::get_ppl(it->second); - - return Napi::Number::New(env, static_cast(ppl)); -} - -Napi::Value SessionContext::clonePerplexityTracker(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - // Argument validation - if (info.Length() < 1 || !info[0].IsNumber()) { - throw Napi::TypeError::New(env, "Expected handle: number"); - } - - int32_t sourceHandle = info[0].As().Int32Value(); - - // Lookup source handle - auto it = _perplexityHandles.find(sourceHandle); - if (it == _perplexityHandles.end()) { - throw Napi::Error::New(env, "Invalid source perplexity tracker handle"); - } - - // Clone via metrics.hpp - lloyal::metrics::PerplexityHandle clonedHandle = - lloyal::metrics::clone_perplexity(it->second); - - // Generate new N-API handle - int32_t newNapiHandle = _nextPerplexityHandle++; - _perplexityHandles[newNapiHandle] = clonedHandle; - - return Napi::Number::New(env, static_cast(newNapiHandle)); -} - -Napi::Value SessionContext::resetPerplexityTracker(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - // Argument validation - if (info.Length() < 1 || !info[0].IsNumber()) { - throw Napi::TypeError::New(env, "Expected handle: number"); - } - - int32_t napiHandle = info[0].As().Int32Value(); - - // Lookup handle - auto it = _perplexityHandles.find(napiHandle); - if (it == _perplexityHandles.end()) { - throw Napi::Error::New(env, "Invalid perplexity tracker handle"); - } - - // Reset tracker - lloyal::metrics::reset_perplexity(it->second); - - return env.Undefined(); -} - -Napi::Value SessionContext::getPerplexityCount(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - // Argument validation - if (info.Length() < 1 || !info[0].IsNumber()) { - throw Napi::TypeError::New(env, "Expected handle: number"); - } - - int32_t napiHandle = info[0].As().Int32Value(); - - // Lookup handle - auto it = _perplexityHandles.find(napiHandle); - if (it == _perplexityHandles.end()) { - throw Napi::Error::New(env, "Invalid perplexity tracker handle"); - } - - // Get token count - int count = lloyal::metrics::get_count(it->second); - - return Napi::Number::New(env, static_cast(count)); -} - -Napi::Value SessionContext::freePerplexityTracker(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - // Argument validation - if (info.Length() < 1 || !info[0].IsNumber()) { - throw Napi::TypeError::New(env, "Expected handle: number"); - } - - int32_t napiHandle = info[0].As().Int32Value(); - - // Lookup and remove handle - auto it = _perplexityHandles.find(napiHandle); - if (it == _perplexityHandles.end()) { - throw Napi::Error::New(env, "Invalid perplexity tracker handle"); - } - - // Free via metrics.hpp - lloyal::metrics::free_perplexity(it->second); - - // Remove from map - _perplexityHandles.erase(it); - - return env.Undefined(); -} - -// ===== ATOMIC DECODE+CAPTURE ===== - -Napi::Value SessionContext::decodeAndCapture(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 4) { - throw Napi::TypeError::New(env, "Expected (tokens, position, seqId, destBuffer)"); - } - - // Parse tokens - Napi::Array tokensArray = info[0].As(); - std::vector tokens(tokensArray.Length()); - for (uint32_t i = 0; i < tokensArray.Length(); i++) { - tokens[i] = static_cast(tokensArray.Get(i).As().Int32Value()); - } - - int32_t position = info[1].As().Int32Value(); - llama_seq_id seqId = toSeqId(info[2].As().DoubleValue()); - - // Get dest buffer - Napi::ArrayBuffer destBuffer; - if (info[3].IsArrayBuffer()) { - destBuffer = info[3].As(); - } else if (info[3].IsTypedArray()) { - destBuffer = info[3].As().ArrayBuffer(); - } else { - throw Napi::TypeError::New(env, "destBuffer must be ArrayBuffer or TypedArray"); - } - - float* dest = static_cast(destBuffer.Data()); - int n_vocab = lloyal::tokenizer::vocab_size(_model.get()); - - // Main-thread work: invalidate logits views (touches Napi objects) - invalidateLogits(); - _decodeStepId++; - - // Pin the JS ArrayBuffer to prevent GC during worker Execute() - auto bufRef = Napi::Reference::New(destBuffer, 1); - - auto* worker = new DecodeAndCaptureWorker( - env, _context, std::move(tokens), position, seqId, _nBatch, - dest, n_vocab, std::move(bufRef)); - worker->Queue(); - return worker->GetPromise(); -} - -// ===== HELPER METHODS ===== -// Pattern matches HybridSessionContext.cpp:103-106, 365-379 - Napi::Value SessionContext::getMemorySize(const Napi::CallbackInfo& info) { Napi::Env env = info.Env(); ensureNotDisposed(); @@ -2658,10 +2062,11 @@ Napi::Value SessionContext::_branchSamplerChainReseed(const Napi::CallbackInfo& throw Napi::Error::New(env, "_branchSamplerChainReseed: invalid handle"); } - // Only reseed stochastic chains (has_dist_sampler=true) + // Only reseed stochastic chains (has_dist=true) // Reseeding greedy chains would corrupt them - if (state->sampler_chain && state->has_dist_sampler) { - lloyal::sampler::reseed_chain(state->sampler_chain, seed); + if (state->sampler_chain != 0 && _branchStore.sampler_has_dist(state->sampler_chain)) { + llama_sampler* chain = _branchStore.get_sampler_chain(state->sampler_chain); + if (chain) lloyal::sampler::reseed_chain(chain, seed); } return env.Undefined(); @@ -2736,6 +2141,48 @@ Napi::Value SessionContext::_branchClearSteer(const Napi::CallbackInfo& info) { return env.Undefined(); } +Napi::Value SessionContext::_branchSetSamplerParams(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + if (info.Length() < 2) { + throw Napi::Error::New(env, "_branchSetSamplerParams requires (handle, params)"); + } + + auto handle = static_cast(info[0].As().Uint32Value()); + + LloyalSamplingParams params; + if (info[1].IsObject()) { + params = adaptSamplingParamsFromJS(info[1].As()); + } + + lloyal::branch::set_sampler_params(handle, params, _branchStore); + + return env.Undefined(); +} + +Napi::Value SessionContext::_branchSetGrammar(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + if (info.Length() < 2) { + throw Napi::Error::New(env, "_branchSetGrammar requires (handle, grammarStr)"); + } + + auto handle = static_cast(info[0].As().Uint32Value()); + + std::string grammar_str = info[1].As().Utf8Value(); + + lloyal::branch::set_grammar( + handle, + _model.get(), + grammar_str.empty() ? "" : grammar_str.c_str(), + _branchStore + ); + + return env.Undefined(); +} + // ===== STORE API ===== Napi::Value SessionContext::_storeCommit(const Napi::CallbackInfo& info) { diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp index 769c210..3ab159d 100644 --- a/src/SessionContext.hpp +++ b/src/SessionContext.hpp @@ -9,7 +9,6 @@ #include #include #include -#include #include namespace liblloyal_node { @@ -86,16 +85,6 @@ class SessionContext : public Napi::ObjectWrap { */ Napi::Value getLogits(const Napi::CallbackInfo& info); - /** - * Decode tokens through model - * Args: tokens (number[]), position (number), seqId? (number, default 0) - * Returns: Promise - * - * The seqId parameter specifies which KV cache sequence to update. - * Use different seqIds for independent parallel sequences. - */ - Napi::Value decode(const Napi::CallbackInfo& info); - /** * Tokenize text to token IDs * Args: text (string) @@ -150,21 +139,6 @@ class SessionContext : public Napi::ObjectWrap { */ Napi::Value kvCacheSize(const Napi::CallbackInfo& info); - // ===== NATIVE REFERENCE IMPLEMENTATIONS ===== - - /** - * Native greedy sampling (for validation) - * Returns: number (token ID) - */ - Napi::Value greedySample(const Napi::CallbackInfo& info); - - /** - * Native sampling with full parameters (for benchmarking) - * Args: params (optional object with temperature, topK, topP, etc.) - * Returns: number (token ID) - */ - Napi::Value sample(const Napi::CallbackInfo& info); - // ===== LIFECYCLE ===== /** @@ -212,49 +186,6 @@ class SessionContext : public Napi::ObjectWrap { */ Napi::Value kvSeqPosMax(const Napi::CallbackInfo& info); - // ===== HANDLE-BASED GRAMMAR ===== - - /** - * Create a new grammar sampler, returns handle - * Args: grammarStr (string) - * Returns: number (handle) - */ - Napi::Value createSampler(const Napi::CallbackInfo& info); - - /** - * Apply grammar constraints to logits buffer - * Args: handle (number), logitsBuffer (ArrayBuffer) - */ - Napi::Value applySampler(const Napi::CallbackInfo& info); - - /** - * Accept token to advance grammar parser state - * Args: handle (number), tokenId (number) - */ - Napi::Value acceptSamplerToken(const Napi::CallbackInfo& info); - - /** - * Clone a grammar sampler - * Args: handle (number) - * Returns: number (new handle) - */ - Napi::Value cloneSampler(const Napi::CallbackInfo& info); - - /** - * Free a grammar sampler - * Args: handle (number) - */ - Napi::Value freeSamplerHandle(const Napi::CallbackInfo& info); - - // ===== ATOMIC DECODE+CAPTURE ===== - - /** - * Decode tokens and capture logits into a JS ArrayBuffer - * Args: tokens (number[]), position (number), seqId (number), destBuffer (ArrayBuffer) - * Returns: Promise - */ - Napi::Value decodeAndCapture(const Napi::CallbackInfo& info); - /** * Write KV cache state + tokens to a file for disk persistence * Args: sequenceId (number), filepath (string), tokens (number[]) @@ -320,50 +251,6 @@ class SessionContext : public Napi::ObjectWrap { */ Napi::Value modelEntropy(const Napi::CallbackInfo& info); - /** - * Create a new perplexity tracker - * Returns: number (handle) - */ - Napi::Value createPerplexityTracker(const Napi::CallbackInfo& info); - - /** - * Add surprisal value to tracker - * Args: handle (number), surprisal (number) - */ - Napi::Value addSurprisal(const Napi::CallbackInfo& info); - - /** - * Get current perplexity value - * Args: handle (number) - * Returns: number (perplexity) - */ - Napi::Value getPerplexity(const Napi::CallbackInfo& info); - - /** - * Clone perplexity tracker - * Args: sourceHandle (number) - * Returns: number (new handle) - */ - Napi::Value clonePerplexityTracker(const Napi::CallbackInfo& info); - - /** - * Reset tracker to initial state - * Args: handle (number) - */ - Napi::Value resetPerplexityTracker(const Napi::CallbackInfo& info); - - /** - * Get number of tokens tracked - * Args: handle (number) - * Returns: number (count) - */ - Napi::Value getPerplexityCount(const Napi::CallbackInfo& info); - - /** - * Free perplexity tracker resources - * Args: handle (number) - */ - Napi::Value freePerplexityTracker(const Napi::CallbackInfo& info); // ===== BRANCH API (internal, wrapped by lib/Branch.ts) ===== @@ -386,6 +273,8 @@ class SessionContext : public Napi::ObjectWrap { Napi::Value _branchSamplerChainReseed(const Napi::CallbackInfo& info); Napi::Value _branchSteer(const Napi::CallbackInfo& info); Napi::Value _branchClearSteer(const Napi::CallbackInfo& info); + Napi::Value _branchSetSamplerParams(const Napi::CallbackInfo& info); + Napi::Value _branchSetGrammar(const Napi::CallbackInfo& info); // ===== STORE API (internal, wrapped by lib/BranchStore.js) ===== @@ -402,21 +291,6 @@ class SessionContext : public Napi::ObjectWrap { bool _disposed = false; int32_t _nBatch = lloyal::defaults::N_BATCH_INIT; - // Persistent sampling chain (for repeat penalty tracking across tokens) - // Pattern from branch.hpp: create once via sampler::create_chain(), reuse across samples. - // Penalty sampler's history is updated via sampler::accept() after each sample. - // This enables proper repeat penalty tracking across long generations and clearAndReseed(). - llama_sampler* _samplerChain = nullptr; - LloyalSamplingParams _samplerParams; // Track current params to detect changes - - // ===== HANDLE-BASED GRAMMAR ===== - std::unordered_map _samplerHandles; - int32_t _nextSamplerHandle = 1; - - // ===== HANDLE-BASED PERPLEXITY TRACKING ===== - std::unordered_map _perplexityHandles; - int32_t _nextPerplexityHandle = 1; - // ===== BRANCH STORE ===== lloyal::branch::BranchStore _branchStore{16}; // capacity 16 diff --git a/test/integration.js b/test/integration.js index 9018194..e276b2c 100644 --- a/test/integration.js +++ b/test/integration.js @@ -86,42 +86,37 @@ async function testCoreAPI(ctx) { const tokenText = ctx.tokenToText(tokens[0]); assert(typeof tokenText === 'string', `tokenToText(${tokens[0]}) → "${tokenText}"`); - // decode + getLogits - await ctx.decode(tokens, 0); - const logits = ctx.getLogits(); - assert(logits instanceof Float32Array, `getLogits() → Float32Array(${logits.length})`); - assert(logits.length === ctx.vocabSize, `logits.length === vocabSize (${ctx.vocabSize})`); + // Branch-based prefill + getLogits + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); + + const branchLogits = branch.getLogits(); + assert(branchLogits instanceof Float32Array, `branch.getLogits() → Float32Array(${branchLogits.length})`); + assert(branchLogits.length === ctx.vocabSize, `branchLogits.length === vocabSize (${ctx.vocabSize})`); // Validate logits are not garbage let hasNonZero = false, hasNaN = false; - for (let i = 0; i < logits.length; i++) { - if (logits[i] !== 0.0) hasNonZero = true; - if (isNaN(logits[i])) hasNaN = true; + for (let i = 0; i < branchLogits.length; i++) { + if (branchLogits[i] !== 0.0) hasNonZero = true; + if (isNaN(branchLogits[i])) hasNaN = true; } - assert(hasNonZero && !hasNaN, 'logits valid (non-zero, no NaN)'); - - // modelEntropy - const entropy = ctx.modelEntropy(); - assert(isFinite(entropy) && entropy >= 0, `modelEntropy() → ${entropy.toFixed(4)} nats`); + assert(hasNonZero && !hasNaN, 'branch logits valid (non-zero, no NaN)'); - // greedySample - const greedy = ctx.greedySample(); - assert(greedy >= 0 && greedy < ctx.vocabSize, `greedySample() → ${greedy}`); + // modelEntropy with branch logits + const entropy = ctx.modelEntropy('nats', branchLogits); + assert(isFinite(entropy) && entropy >= 0, `modelEntropy(branchLogits) → ${entropy.toFixed(4)} nats`); - // sample with params - const sampled = ctx.sample({ temperature: 0 }); - assert(sampled === greedy, `sample({temp:0}) === greedySample() (${sampled})`); + // Branch greedy sampling (temperature: 0) + const greedy = branch.sample(); + assert(greedy >= 0 && greedy < ctx.vocabSize, `branch.sample() greedy → ${greedy}`); // isStopToken - EOS should be a stop token const eos = ctx.getEogToken(); assert(ctx.isStopToken(eos), `isStopToken(EOS=${eos}) → true`); - // Logits memoization - const logits1 = ctx.getLogits(); - const logits2 = ctx.getLogits(); - assert(logits1[0] === logits2[0], 'getLogits() memoized (same step = same buffer)'); - - // withLogits helper + // withLogits helper (context-level logits) + // Note: getLogits() reads from the shared context buffer, which is populated + // by branch decode operations const maxLogit = withLogits(ctx, (l) => { let max = l[0]; for (let i = 1; i < l.length; i++) if (l[i] > max) max = l[i]; @@ -136,6 +131,8 @@ async function testCoreAPI(ctx) { asyncRejected = true; } assert(asyncRejected, 'withLogits() rejects async callbacks'); + + await branch.prune(); } // ═══════════════════════════════════════════════════════════════════════════ @@ -147,11 +144,13 @@ async function testKVCache(ctx) { await ctx.kvCacheClear(); const tokens = await ctx.tokenize("Test prompt"); - await ctx.decode(tokens, 0); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); const sizeBefore = ctx.kvCacheSize(); - assert(sizeBefore >= 0, `kvCacheSize() after decode → ${sizeBefore}`); + assert(sizeBefore >= 0, `kvCacheSize() after prefill → ${sizeBefore}`); + await branch.prune(); await ctx.kvCacheClear(); const sizeAfter = ctx.kvCacheSize(); assert(sizeAfter === -1, `kvCacheClear() → size=${sizeAfter} (empty)`); @@ -172,21 +171,25 @@ async function testMultiSequence() { }); try { + // Use a branch to prefill tokens (populates KV on its seq_id) const tokens = await ctx.tokenize("The quick brown fox"); - await ctx.decode(tokens, 0, 0); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); - const seq0Pos = ctx.kvSeqPosMax(0); - assert(seq0Pos >= 0, `kvSeqPosMax(0) → ${seq0Pos}`); + // Branch allocates a seq_id — check its KV is populated + const branchPos = branch.position; + assert(branchPos === tokens.length, `branch position → ${branchPos}`); - const seq1Before = ctx.kvSeqPosMax(1); - assert(seq1Before === -1, `kvSeqPosMax(1) before copy → ${seq1Before} (empty)`); + // Fork creates a new sequence with copied KV + const forked = await branch.fork(); + assert(forked.position === branchPos, `forked position matches parent → ${forked.position}`); - ctx.kvSeqCopy(0, 1); - const seq1After = ctx.kvSeqPosMax(1); - assert(seq1After === seq0Pos, `kvSeqCopy(0,1) → seq1 pos=${seq1After}`); + // Raw KV seq ops still work for advanced use + const seq1Before = ctx.kvSeqPosMax(3); // unused seq_id + assert(seq1Before === -1, `kvSeqPosMax(unused) → ${seq1Before} (empty)`); - const seq0After = ctx.kvSeqPosMax(0); - assert(seq0After === seq0Pos, `seq0 unchanged after copy → ${seq0After}`); + await forked.prune(); + await branch.prune(); } finally { ctx.dispose(); } @@ -202,48 +205,22 @@ async function testGrammar() { const ctx = await addon.createContext({ modelPath: MODEL_PATH, nCtx: CTX_SIZE, - nThreads: 4 + nThreads: 4, + nSeqMax: 4 }); try { const grammar = `root ::= "{" ws "}" ws ws ::= [ \\t\\n]*`; - // Handle-based API - const handle = ctx.createSampler(grammar); - assert(typeof handle === 'number' && handle > 0, `createSampler() → handle=${handle}`); - - const cloned = ctx.cloneSampler(handle); - assert(cloned !== handle, `cloneSampler() → new handle=${cloned}`); - - const testLogits = new Float32Array(ctx.vocabSize).fill(0.5); - ctx.applySampler(handle, testLogits); - - let masked = 0, validToken = -1; - for (let i = 0; i < testLogits.length; i++) { - if (testLogits[i] < -1e30) masked++; - else if (validToken === -1) validToken = i; - } - assert(masked > 0 && validToken >= 0, `applySampler() masked ${masked} tokens`); - - ctx.acceptSamplerToken(handle, validToken); - ok(`acceptSamplerToken(${validToken})`); - - ctx.freeSamplerHandle(handle); - ctx.freeSamplerHandle(cloned); - ok('freeSamplerHandle() both handles'); - // Branch API with grammar - await ctx.kvCacheClear(); const prompt = await ctx.tokenize("Output: "); - await ctx.decode(prompt, 0, 0); - - const branch = Branch.create(ctx, prompt.length, { temperature: 0 }, undefined, grammar); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar); + await branch.prefill(prompt); const output = []; for (let i = 0; i < 10; i++) { - const { token, text, isStop } = branch.produce(); + const { token, text, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); output.push(text); @@ -251,6 +228,32 @@ ws ::= [ \\t\\n]*`; const result = output.join(''); assert(/^\{\s*\}\s*$/.test(result), `Branch+grammar → "${result}"`); + + // Grammar is cloned on fork — independent parser states + await ctx.kvCacheClear(); + const prompt2 = await ctx.tokenize("Output: "); + const root = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar); + await root.prefill(prompt2); + + const childA = await root.fork(); + const childB = await root.fork(); + + // Both children should produce grammar-valid output independently + const outA = [], outB = []; + for (let i = 0; i < 10; i++) { + const pA = await childA.produce(); + if (!pA.isStop) { await childA.commit(pA.token); outA.push(pA.text); } + const pB = await childB.produce(); + if (!pB.isStop) { await childB.commit(pB.token); outB.push(pB.text); } + } + + const resultA = outA.join(''), resultB = outB.join(''); + assert(/^\{\s*\}\s*$/.test(resultA), `Fork A grammar → "${resultA}"`); + assert(/^\{\s*\}\s*$/.test(resultB), `Fork B grammar → "${resultB}"`); + + await childA.prune(); + await childB.prune(); + await root.prune(); await branch.prune(); } finally { ctx.dispose(); @@ -266,45 +269,27 @@ async function testMetrics(ctx) { await ctx.kvCacheClear(); const tokens = await ctx.tokenize("Hello"); - await ctx.decode(tokens, 0); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); - const token1 = ctx.greedySample(); - const surprisal = ctx.modelSurprisal(token1, "nats"); - assert(surprisal >= 0, `modelSurprisal() → ${surprisal.toFixed(2)} nats`); + // modelSurprisal with branch logits + const token1 = branch.sample(); + const branchLogits = branch.getLogits(); + const surprisal = ctx.modelSurprisal(token1, "nats", branchLogits); + assert(surprisal >= 0, `modelSurprisal(branchLogits) → ${surprisal.toFixed(2)} nats`); - const surprisalBits = ctx.modelSurprisal(token1, "bits"); + const surprisalBits = ctx.modelSurprisal(token1, "bits", branchLogits); assert(Math.abs(surprisalBits - surprisal / Math.log(2)) < 0.01, 'bits = nats / ln(2)'); - const tracker = ctx.createPerplexityTracker(); - assert(tracker > 0, `createPerplexityTracker() → ${tracker}`); - - ctx.addSurprisal(tracker, surprisal); - await ctx.decode([token1], tokens.length); - ctx.addSurprisal(tracker, ctx.modelSurprisal(ctx.greedySample())); - - const count = ctx.getPerplexityCount(tracker); - assert(count === 2, `getPerplexityCount() → ${count}`); - - const ppl = ctx.getPerplexity(tracker); - assert(ppl >= 1.0, `getPerplexity() → ${ppl.toFixed(2)}`); - - const clonedTracker = ctx.clonePerplexityTracker(tracker); - assert(clonedTracker !== tracker, `clonePerplexityTracker() → ${clonedTracker}`); - - ctx.resetPerplexityTracker(clonedTracker); - assert(ctx.getPerplexityCount(clonedTracker) === 0, 'resetPerplexityTracker() → count=0'); + // Branch perplexity — built-in, accumulates through commit() + await branch.commit(token1); + const { token: token2 } = await branch.produce(); + await branch.commit(token2); - ctx.freePerplexityTracker(tracker); - ctx.freePerplexityTracker(clonedTracker); - ok('freePerplexityTracker() both'); + const ppl = branch.perplexity; + assert(isFinite(ppl) && ppl >= 1.0, `branch.perplexity → ${ppl.toFixed(2)}`); - let threwOnInvalid = false; - try { - ctx.getPerplexity(tracker); - } catch { - threwOnInvalid = true; - } - assert(threwOnInvalid, 'Invalid handle throws'); + await branch.prune(); } // ═══════════════════════════════════════════════════════════════════════════ @@ -332,15 +317,13 @@ async function testBranchPrefill() { const messages = [{ role: 'user', content: turns[0] }]; const { prompt } = await ctx.formatChat(JSON.stringify(messages)); const promptToks = await ctx.tokenize(prompt); - await ctx.decode(promptToks, 0, 0); - - const branch = Branch.create(ctx, promptToks.length, { temperature: 0 }); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(promptToks); // Turn 1 const gen1 = []; for (let i = 0; i < GEN_TOKENS; i++) { - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); gen1.push(token); @@ -371,7 +354,7 @@ async function testBranchPrefill() { const gen = []; for (let i = 0; i < GEN_TOKENS; i++) { - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); gen.push(token); @@ -411,7 +394,7 @@ async function testWarmMultiTurnRecall() { async function generate(branch) { const gen = []; for (;;) { - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); gen.push(token); @@ -434,10 +417,8 @@ async function testWarmMultiTurnRecall() { const msgs1 = [{ role: 'user', content: 'Hi, my name is Lloyal' }]; const { prompt, format, reasoningFormat } = await ctx.formatChat(JSON.stringify(msgs1), {}); const promptToks = await ctx.tokenize(prompt); - await ctx.decode(promptToks, 0, 0); - - const branch = Branch.create(ctx, promptToks.length, { temperature: 0 }); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(promptToks); // Helper: parse output and check content (not reasoning) for a term function checkRecall(rawText, term) { @@ -528,7 +509,7 @@ async function testWarmSemanticRecall() { const gen = []; for (let i = 0; i < GEN_TOKENS; i++) { - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); gen.push(token); @@ -542,15 +523,13 @@ async function testWarmSemanticRecall() { messages.push({ role: 'user', content: 'Remember this: my dog is named Max.' }); const { prompt } = await ctx.formatChat(JSON.stringify(messages)); const promptToks = await ctx.tokenize(prompt); - await ctx.decode(promptToks, 0, 0); - - branch = Branch.create(ctx, promptToks.length, { temperature: 0 }); - branch.captureLogits(); + branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(promptToks); // Generate turn 1 response const gen = []; for (let i = 0; i < GEN_TOKENS; i++) { - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); gen.push(token); @@ -625,11 +604,8 @@ async function testBranchSteer() { try { const tokens = await ctx.tokenize("The quick brown"); - await ctx.decode(tokens, 0, 0); - - // Use greedy sampling for deterministic tests - const branch = Branch.create(ctx, tokens.length, { temperature: 0 }); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); // Get the greedy token (what would be sampled without steer) const greedyToken = branch.sample(); @@ -669,10 +645,8 @@ async function testBranchSteer() { // Test fork invariant: steer is NOT cloned on fork const tokens2 = await ctx.tokenize("Hello world"); - await ctx.decode(tokens2, 0, 0); - - const parent = Branch.create(ctx, tokens2.length, { temperature: 0 }); - parent.captureLogits(); + const parent = Branch.create(ctx, 0, { temperature: 0 }); + await parent.prefill(tokens2); const parentGreedy = parent.sample(); @@ -732,17 +706,15 @@ async function testNBatchAblation() { const messages = [{ role: 'user', content: "Hello, how are you today?" }]; const { prompt } = await ctx.formatChat(JSON.stringify(messages)); const promptToks = await ctx.tokenize(prompt); - await ctx.decode(promptToks, 0, 0); - - const branch = Branch.create(ctx, promptToks.length, { temperature: 0 }, nBatch); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }, nBatch); + await branch.prefill(promptToks); const followUp = await ctx.tokenize(" What else?"); await branch.prefill(followUp); const gen = []; for (let i = 0; i < 5; i++) { - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); gen.push(token); @@ -820,15 +792,18 @@ async function testDeterminism() { const messages = [{ role: 'user', content: prompt }]; const { prompt: formatted } = await ctx.formatChat(JSON.stringify(messages)); const tokens = await ctx.tokenize(formatted); - await ctx.decode(tokens, 0); + + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); const gen = []; for (let i = 0; i < 20; i++) { - const token = ctx.sample({ temperature: 0 }); - if (ctx.isStopToken(token)) break; + const { token, isStop } = await branch.produce(); + if (isStop) break; + await branch.commit(token); gen.push(token); - await ctx.decode([token], tokens.length + i); } + await branch.prune(); return gen.join(','); } finally { ctx.dispose(); @@ -915,11 +890,11 @@ async function testEmbeddings() { } // ═══════════════════════════════════════════════════════════════════════════ -// ATOMIC DECODE AND CAPTURE +// BRANCH PREFILL + GET LOGITS (replaces testDecodeAndCapture) // ═══════════════════════════════════════════════════════════════════════════ -async function testDecodeAndCapture() { - console.log('\n--- decodeAndCapture ---'); +async function testBranchPrefillAndLogits() { + console.log('\n--- Branch prefill + getLogits ---'); const ctx = await addon.createContext({ modelPath: MODEL_PATH, @@ -929,23 +904,23 @@ async function testDecodeAndCapture() { try { const tokens = await ctx.tokenize("Hello"); - const buffer = new Float32Array(ctx.vocabSize); - - await ctx.decodeAndCapture(tokens, 0, 0, buffer); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); + const logits = branch.getLogits(); let valid = false; - for (let i = 0; i < buffer.length; i++) { - if (buffer[i] !== 0 && !isNaN(buffer[i])) valid = true; + for (let i = 0; i < logits.length; i++) { + if (logits[i] !== 0 && !isNaN(logits[i])) valid = true; } - assert(valid, `decodeAndCapture() filled buffer with valid logits`); - - // Verify it's a copy - const orig = buffer[0]; - buffer[0] = -999; - const ctxLogits = ctx.getLogits(); - const isCopy = ctxLogits[0] !== -999; - buffer[0] = orig; - assert(isCopy, 'Captured buffer is independent copy'); + assert(valid, `branch.prefill() + getLogits() → valid logits`); + + // Branch logits are an independent copy + const orig = logits[0]; + logits[0] = -999; + const logits2 = branch.getLogits(); + assert(logits2[0] !== -999, 'branch.getLogits() returns independent copy'); + + await branch.prune(); } finally { ctx.dispose(); } @@ -1040,16 +1015,15 @@ async function testBranchStore() { // Tests: batched generation loop, perplexity accumulation through accept_token, // Branch.perplexity accessor after store ops, reseedSampler diversity. { - await ctx.decode(promptToks, 0, 0); - const root = Branch.create(ctx, promptToks.length, { temperature: 0.8 }); - root.captureLogits(); + const root = Branch.create(ctx, 0, { temperature: 0.8 }); + await root.prefill(promptToks); const branches = [root, await root.fork(), await root.fork()]; branches[1].reseedSampler(42); branches[2].reseedSampler(99); for (let step = 0; step < 10; step++) { - const live = branches.map(b => [b, b.produce()]) - .filter(([, p]) => !p.isStop); + const produced = await Promise.all(branches.map(async b => [b, await b.produce()])); + const live = produced.filter(([, p]) => !p.isStop); if (!live.length) break; await store.commit(live.map(([b, p]) => [b, p.token])); } @@ -1071,9 +1045,8 @@ async function testBranchStore() { // generating with store.commit(). This is the persistence/replay pattern. // Tests: prefill→commit lifecycle, metrics across phase transition, getLogits(). { - await ctx.decode(promptToks, 0, 0); - const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 }); - b1.captureLogits(); + const b1 = Branch.create(ctx, 0, { temperature: 0 }); + await b1.prefill(promptToks); const b2 = await b1.fork(); // Phase 1: Rehydrate from "saved" histories @@ -1095,8 +1068,8 @@ async function testBranchStore() { // Phase 2: Generate continuations const gen1 = [], gen2 = []; for (let i = 0; i < 5; i++) { - const live = [[b1, b1.produce()], [b2, b2.produce()]] - .filter(([, p]) => !p.isStop); + const produced = [[b1, await b1.produce()], [b2, await b2.produce()]]; + const live = produced.filter(([, p]) => !p.isStop); if (!live.length) break; await store.commit(live.map(([b, p]) => [b, p.token])); for (const [b, p] of live) { @@ -1121,9 +1094,8 @@ async function testBranchStore() { // Verifies Branch.getLogits() returns a Float32Array consumable by the // existing metrics API. This tests the JS API surface of the new exposure. { - await ctx.decode(promptToks, 0, 0); - const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 }); - b1.captureLogits(); + const b1 = Branch.create(ctx, 0, { temperature: 0 }); + await b1.prefill(promptToks); const logits = b1.getLogits(); assert(logits instanceof Float32Array, @@ -1134,16 +1106,11 @@ async function testBranchStore() { // Feed branch logits into ctx.modelEntropy() — proves the returned // buffer is a valid logits distribution consumable by metrics API const entropyFromBranch = ctx.modelEntropy("nats", logits); - const entropyFromCtx = ctx.modelEntropy("nats"); assert(isFinite(entropyFromBranch) && entropyFromBranch > 0, `getLogits→modelEntropy: ${entropyFromBranch.toFixed(4)} nats`); - // Branch logits (captured from same decode) should match context logits - assert(Math.abs(entropyFromBranch - entropyFromCtx) < 1e-4, - `getLogits→modelEntropy: branch=${entropyFromBranch.toFixed(4)} ≈ ctx=${entropyFromCtx.toFixed(4)}`); - // After store.commit, logits change — getLogits() reflects new state - const p = b1.produce(); + const p = await b1.produce(); assert(!p.isStop, `getLogits: produce() should not hit EOG on first token`); await store.commit([[b1, p.token]]); const logitsAfter = b1.getLogits(); @@ -1159,15 +1126,14 @@ async function testBranchStore() { // Tests: produce() reads from branch snapshot, store.commit() advances state, // produce() on next iteration reads from updated snapshot. { - await ctx.decode(promptToks, 0, 0); - const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 }); - b1.captureLogits(); + const b1 = Branch.create(ctx, 0, { temperature: 0 }); + await b1.prefill(promptToks); const b2 = await b1.fork(); const output = []; for (let i = 0; i < 5; i++) { // Inspect with produce() — does NOT advance state - const p1 = b1.produce(), p2 = b2.produce(); + const p1 = await b1.produce(), p2 = await b2.produce(); // Can inspect text and isStop before committing assert(typeof p1.text === 'string' && typeof p2.text === 'string', @@ -1192,15 +1158,14 @@ async function testBranchStore() { // Tests: both paths write to the same branch state correctly, no corruption when // alternating between decode::one and decode::each on the same sequence. { - await ctx.decode(promptToks, 0, 0); - const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 }); - b1.captureLogits(); + const b1 = Branch.create(ctx, 0, { temperature: 0 }); + await b1.prefill(promptToks); const b2 = await b1.fork(); // Step 1-3: single-branch commit (decode::one path) for (let i = 0; i < 3; i++) { - const live = [[b1, b1.produce()], [b2, b2.produce()]] - .filter(([, p]) => !p.isStop); + const produced = [[b1, await b1.produce()], [b2, await b2.produce()]]; + const live = produced.filter(([, p]) => !p.isStop); if (!live.length) break; for (const [b, p] of live) await b.commit(p.token); } @@ -1208,8 +1173,8 @@ async function testBranchStore() { // Step 4-6: batched commit (decode::each path) for (let i = 0; i < 3; i++) { - const live = [[b1, b1.produce()], [b2, b2.produce()]] - .filter(([, p]) => !p.isStop); + const produced = [[b1, await b1.produce()], [b2, await b2.produce()]]; + const live = produced.filter(([, p]) => !p.isStop); if (!live.length) break; await store.commit(live.map(([b, p]) => [b, p.token])); } @@ -1219,8 +1184,8 @@ async function testBranchStore() { // Step 7-9: back to single-branch commit for (let i = 0; i < 3; i++) { - const live = [[b1, b1.produce()], [b2, b2.produce()]] - .filter(([, p]) => !p.isStop); + const produced = [[b1, await b1.produce()], [b2, await b2.produce()]]; + const live = produced.filter(([, p]) => !p.isStop); if (!live.length) break; for (const [b, p] of live) await b.commit(p.token); } @@ -1237,9 +1202,8 @@ async function testBranchStore() { // Tests: per-branch EOG filtering, store.commit with shrinking branch set, // surviving branch generates correct output after sibling stops. { - await ctx.decode(promptToks, 0, 0); - const b1 = Branch.create(ctx, promptToks.length, { temperature: 0 }); - b1.captureLogits(); + const b1 = Branch.create(ctx, 0, { temperature: 0 }); + await b1.prefill(promptToks); const b2 = await b1.fork(); const eog = ctx.getEogToken(); @@ -1253,8 +1217,8 @@ async function testBranchStore() { } const pairs = [ - ...(!stopped[0] ? [[b1, b1.produce()]] : []), - ...(!stopped[1] ? [[b2, b2.produce()]] : []), + ...(!stopped[0] ? [[b1, await b1.produce()]] : []), + ...(!stopped[1] ? [[b2, await b2.produce()]] : []), ]; const live = pairs.filter(([, p]) => !p.isStop); @@ -1313,13 +1277,11 @@ async function testPplSanity() { const messages = [{ role: 'user', content: 'Tell me about the weather.' }]; const { prompt } = await ctx.formatChat(JSON.stringify(messages)); const promptToks = await ctx.tokenize(prompt); - await ctx.decode(promptToks, 0, 0); - - const branch = Branch.create(ctx, promptToks.length, { temperature: 0 }); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(promptToks); for (let i = 0; i < 10; i++) { - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); } @@ -1356,10 +1318,8 @@ async function testCommitRollback() { try { const promptToks = await ctx.tokenize("Hi"); - await ctx.decode(promptToks, 0, 0); - - const root = Branch.create(ctx, promptToks.length, { temperature: 1.0 }); - root.captureLogits(); + const root = Branch.create(ctx, 0, { temperature: 1.0 }); + await root.prefill(promptToks); const branches = [root]; for (let i = 1; i < 8; i++) { const b = await root.fork(); @@ -1375,9 +1335,8 @@ async function testCommitRollback() { let successfulRounds = 0; let failedRound = false; for (let round = 0; round < 50; round++) { - const live = branches - .map(b => [b, b.produce()]) - .filter(([, p]) => !p.isStop); + const produced = await Promise.all(branches.map(async b => [b, await b.produce()])); + const live = produced.filter(([, p]) => !p.isStop); if (!live.length) break; // Snapshot PPL before this round @@ -1435,13 +1394,11 @@ async function testAsyncRejection() { try { const tokens = await ctx.tokenize("Hello world"); - await ctx.decode(tokens, 0, 0); - - const branch = Branch.create(ctx, tokens.length, { temperature: 0 }); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); // Generate one token to prove branch works - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); assert(!isStop, 'rejection: initial produce succeeds'); await branch.commit(token); const posAfterCommit = branch.position; @@ -1460,14 +1417,23 @@ async function testAsyncRejection() { } assert(threwOnCommit, 'rejection: commit on disposed branch throws'); - // produce() on disposed branch + // produce() on disposed branch — async version rejects let threwOnProduce = false; try { - branch.produce(); + await branch.produce(); } catch (e) { threwOnProduce = true; } - assert(threwOnProduce, 'rejection: produce on disposed branch throws'); + assert(threwOnProduce, 'rejection: produce on disposed branch rejects'); + + // produceSync() on disposed branch — throws synchronously + let threwOnProduceSync = false; + try { + branch.produceSync(); + } catch (e) { + threwOnProduceSync = true; + } + assert(threwOnProduceSync, 'rejection: produceSync on disposed branch throws'); // fork() on disposed branch let threwOnFork = false; @@ -1508,10 +1474,8 @@ async function testEmptyInputEdgeCases() { try { const tokens = await ctx.tokenize("Hello world"); - await ctx.decode(tokens, 0, 0); - - const branch = Branch.create(ctx, tokens.length, { temperature: 0 }); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); const store = new BranchStore(ctx); const posBefore = branch.position; @@ -1532,7 +1496,7 @@ async function testEmptyInputEdgeCases() { ok('branch.prefill([]) resolves'); // Verify branch still works after empty operations - const { token, isStop } = branch.produce(); + const { token, isStop } = await branch.produce(); assert(!isStop, 'empty edge: produce still works after empty ops'); await branch.commit(token); assert(branch.position === posBefore + 1, 'empty edge: commit advances position after empty ops'); @@ -1572,21 +1536,14 @@ async function testJsonSchemaToGrammar() { `jsonSchemaToGrammar: returned ${grammar.length}-char grammar`); assert(grammar.includes('root'), 'jsonSchemaToGrammar: grammar contains "root" rule'); - // Use the grammar with createSampler to prove it's valid GBNF - const handle = ctx.createSampler(grammar); - assert(handle > 0, `jsonSchemaToGrammar: createSampler accepted grammar (handle=${handle})`); - - // Generate tokens with grammar constraint - await ctx.kvCacheClear(); + // Use the grammar with Branch.create to prove it's valid GBNF const prompt = await ctx.tokenize("Output JSON: "); - await ctx.decode(prompt, 0, 0); - - const branch = Branch.create(ctx, prompt.length, { temperature: 0 }, undefined, grammar); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }, undefined, grammar); + await branch.prefill(prompt); const output = []; for (let i = 0; i < 50; i++) { - const { token, text, isStop } = branch.produce(); + const { token, text, isStop } = await branch.produce(); if (isStop) break; await branch.commit(token); output.push(text); @@ -1609,7 +1566,6 @@ async function testJsonSchemaToGrammar() { } await branch.prune(); - ctx.freeSamplerHandle(handle); // Error path: invalid JSON → promise rejects let rejected = false; @@ -1641,13 +1597,11 @@ async function testDisposedDuringAsync() { try { const tokens = await ctx.tokenize("Test prompt"); - await ctx.decode(tokens, 0, 0); - - const branch = Branch.create(ctx, tokens.length, { temperature: 0 }); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(tokens); // Generate one token so branch has state - const { token } = branch.produce(); + const { token } = await branch.produce(); await branch.commit(token); // Call prune() — DO NOT await yet @@ -1656,14 +1610,14 @@ async function testDisposedDuringAsync() { // Immediately (before microtask resolves) check disposed assert(branch.disposed, 'disposed-during: _disposed is true synchronously after prune() call'); - // produce() should throw synchronously + // produceSync() should throw synchronously let threwProduce = false; try { - branch.produce(); + branch.produceSync(); } catch { threwProduce = true; } - assert(threwProduce, 'disposed-during: produce() throws before prune promise resolves'); + assert(threwProduce, 'disposed-during: produceSync() throws before prune promise resolves'); // commit() should throw synchronously (the _ensureNotDisposed guard) let threwCommit = false; @@ -1702,11 +1656,10 @@ async function testAsyncIterator() { try { const prompt = await ctx.tokenize("The quick brown fox"); - await ctx.decode(prompt, 0, 0); // Generate to EOG via for-await - const branch = Branch.create(ctx, prompt.length, { temperature: 0 }); - branch.captureLogits(); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(prompt); const tokens = []; for await (const { token, text } of branch) { @@ -1729,13 +1682,11 @@ async function testAsyncIterator() { // Compare: iterator output matches produce/commit output (deterministic, temp=0) await ctx.kvCacheClear(); - await ctx.decode(prompt, 0, 0); - - const branchManual = Branch.create(ctx, prompt.length, { temperature: 0 }); - branchManual.captureLogits(); + const branchManual = Branch.create(ctx, 0, { temperature: 0 }); + await branchManual.prefill(prompt); const manualTokens = []; for (let i = 0; i < 10; i++) { - const { token, isStop } = branchManual.produce(); + const { token, isStop } = await branchManual.produce(); if (isStop) break; await branchManual.commit(token); manualTokens.push(token); @@ -1751,6 +1702,119 @@ async function testAsyncIterator() { } } +// ═══════════════════════════════════════════════════════════════════════════ +// HOT-SWAP TESTS (setSamplerParams / setGrammar) +// ═══════════════════════════════════════════════════════════════════════════ + +async function testSetSamplerParams() { + console.log('\n--- setSamplerParams ---'); + + const ctx = await addon.createContext({ + modelPath: MODEL_PATH, + nCtx: CTX_SIZE, + nThreads: 4, + }); + + try { + const prompt = await ctx.tokenize("The capital of France is"); + + // Greedy baseline + const greedy = Branch.create(ctx, 0, { temperature: 0, topK: 0, topP: 1.0, minP: 0 }); + await greedy.prefill(prompt); + const greedyTok = greedy.sample(); + assert(greedyTok >= 0, `setSamplerParams: greedy token valid (${greedyTok})`); + + // Switch to stochastic — at high temp, should eventually diverge + greedy.setSamplerParams({ temperature: 1.5, seed: 42, topK: 0, topP: 1.0, minP: 0 }); + let diverged = false; + for (let i = 0; i < 20; i++) { + if (greedy.sample() !== greedyTok) { diverged = true; break; } + } + assert(diverged, 'setSamplerParams: stochastic diverges from greedy'); + + // Switch back to greedy — should be deterministic again + greedy.setSamplerParams({ temperature: 0, topK: 0, topP: 1.0, minP: 0 }); + const tok2 = greedy.sample(); + const tok3 = greedy.sample(); + assert(tok2 === tok3, `setSamplerParams: greedy restored (${tok2} === ${tok3})`); + + await greedy.prune(); + + // Memoization: identical params should not rebuild + await ctx.kvCacheClear(); + const branch = Branch.create(ctx, 0, { temperature: 0.8, seed: 100 }); + await branch.prefill(prompt); + branch.setSamplerParams({ temperature: 0.8, seed: 100 }); // Same — should be no-op + assert(!branch.disposed, 'setSamplerParams: memoized no-op does not dispose'); + + await branch.prune(); + } finally { + ctx.dispose(); + } +} + +async function testSetGrammar() { + console.log('\n--- setGrammar ---'); + + const ctx = await addon.createContext({ + modelPath: MODEL_PATH, + nCtx: CTX_SIZE, + nThreads: 4, + nSeqMax: 4, + }); + + try { + const grammar = `root ::= "{" ws "}" ws +ws ::= [ \\t\\n]*`; + + // Hot-swap: create without grammar, then add one + const prompt = await ctx.tokenize("Output: "); + const branch = Branch.create(ctx, 0, { temperature: 0 }); + await branch.prefill(prompt); + + branch.setGrammar(grammar); + const output = []; + for (let i = 0; i < 10; i++) { + const { token, text, isStop } = await branch.produce(); + if (isStop) break; + await branch.commit(token); + output.push(text); + } + const result = output.join(''); + assert(/^\{\s*\}\s*$/.test(result), `setGrammar: hot-swap constrains → "${result}"`); + + // Remove grammar + branch.setGrammar(''); + // Should no longer be constrained (just verify it doesn't throw) + const { token } = await branch.produce(); + assert(typeof token === 'number', 'setGrammar: removal works, sample succeeds'); + + await branch.prune(); + + // Hot-swap + fork: grammar cloned to child + await ctx.kvCacheClear(); + const root = Branch.create(ctx, 0, { temperature: 0 }); + await root.prefill(prompt); + root.setGrammar(grammar); + + const child = await root.fork(); + const childOut = []; + for (let i = 0; i < 10; i++) { + const p = await child.produce(); + if (p.isStop) break; + await child.commit(p.token); + childOut.push(p.text); + } + const childResult = childOut.join(''); + assert(/^\{\s*\}\s*$/.test(childResult), `setGrammar: fork inherits grammar → "${childResult}"`); + + await child.prune(); + await root.prune(); + } finally { + ctx.dispose(); + } +} + // ═══════════════════════════════════════════════════════════════════════════ // MAIN // ═══════════════════════════════════════════════════════════════════════════ @@ -1783,7 +1847,7 @@ async function main() { await testBranchSteer(); await testNBatchAblation(); await testDeterminism(); - await testDecodeAndCapture(); + await testBranchPrefillAndLogits(); await testBranchStore(); await testPplSanity(); await testCommitRollback(); @@ -1792,6 +1856,8 @@ async function main() { await testJsonSchemaToGrammar(); await testDisposedDuringAsync(); await testAsyncIterator(); + await testSetSamplerParams(); + await testSetGrammar(); await testEmbeddings(); // Summary From bdf1b40819bbe71a88ac0f3631a3abb658964a2d Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Fri, 20 Feb 2026 14:47:27 +1100 Subject: [PATCH 2/3] refactor(branch): streamline decode surface across N-API and JS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove captureLogits(), decodeAndCaptureOne() from JS (zero callers). Remove _branchCaptureLogits, _branchDecodeAndCaptureOne N-API bindings. Rename _branchDecodeAndCaptureBatch → _branchPrefill through all layers. Migrate streaming-tsampler from decodeAndCaptureOne to commit(). --- examples/streaming/streaming-tsampler.mjs | 4 +- lib/Branch.js | 44 +++-------- lib/index.d.ts | 33 ++------ liblloyal | 2 +- src/SessionContext.cpp | 93 +++-------------------- src/SessionContext.hpp | 4 +- test/integration.js | 4 +- 7 files changed, 32 insertions(+), 152 deletions(-) diff --git a/examples/streaming/streaming-tsampler.mjs b/examples/streaming/streaming-tsampler.mjs index 0d698bf..96cc40e 100644 --- a/examples/streaming/streaming-tsampler.mjs +++ b/examples/streaming/streaming-tsampler.mjs @@ -6,7 +6,7 @@ * - TypeScript sampling via tsampler (TTA pattern) * - N-gram tracking to detect sequence repetition * - Logit steering to prevent repeated sequences - * - Branch API for KV management (prefill/decodeAndCaptureOne) + * - Branch API for KV management (prefill/commit) * - KV cache clear + re-prefill for infinite context * * The key insight: llama.cpp's token-level penalties degrade prose quality. @@ -262,7 +262,7 @@ Begin: // Store and advance KV (no sampler accept — we're using tsampler externally) allTokens.push(token); - await branch.decodeAndCaptureOne(token); + await branch.commit(token); // Cache full? Reseed at boundary if (branch.position >= nCtx) { diff --git a/lib/Branch.js b/lib/Branch.js index 08fe4bf..c3b7a7c 100644 --- a/lib/Branch.js +++ b/lib/Branch.js @@ -22,7 +22,7 @@ * @example Best-of-N with perplexity selection * ```js * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 }); - * root.captureLogits(); + * await root.prefill(tokens); * * const results = []; * for (let i = 0; i < 5; i++) { @@ -53,14 +53,14 @@ class Branch { * Create a root branch at the given position * * The branch takes ownership of the sequence and creates its own sampler - * chain from the provided params. Call captureLogits() after prefill to - * freeze the logit distribution before forking. + * chain from the provided params. Call prefill() to decode prompt tokens + * and capture the logit distribution before forking. * * @param {SessionContext} ctx - SessionContext to create branch on * @param {number} position - Starting position (typically prompt token count) * @param {SamplingParams} [params] - Sampling parameters (temperature, topP, etc.) * @param {number} [nBatch] - Per-branch batch size override (defaults to context nBatch). - * Controls chunk size for prefill() (decode_and_capture_batch). Has no effect on + * Controls chunk size for prefill(). Has no effect on * single-token commit() which uses a zero-allocation fast path. Useful for tuning * memory/throughput tradeoff on bulk token decode — e.g. smaller nBatch for cheap * exploration branches, larger for the trunk. @@ -92,25 +92,13 @@ class Branch { return new Branch(this._ctx, newHandle); } - /** - * Freeze the current logit distribution into this branch - * - * Logits are ephemeral — they're overwritten on the next decode() call. - * Capturing preserves them so this branch (and any forks from it) can - * sample from the same distribution. Essential before fork(). - */ - captureLogits() { - this._ensureNotDisposed(); - this._ctx._branchCaptureLogits(this._handle); - } - /** * Get a copy of this branch's captured logits snapshot * * Returns n_vocab floats — the raw logit distribution from the last - * decode_and_capture or captureLogits() call. Use for distributional - * analysis (KL divergence, entropy, top-k overlap) without crossing - * the sampling chain. + * prefill() or commit() call. Use for distributional analysis + * (KL divergence, entropy, top-k overlap) without crossing the + * sampling chain. * * @returns {Float32Array} Copy of the logits snapshot (n_vocab elements) * @throws {Error} If no logits have been captured yet @@ -120,20 +108,6 @@ class Branch { return this._ctx._branchGetLogits(this._handle); } - /** - * Single-token forward pass with logit snapshot - * - * Runs one decode step (writing the token's KV entries), advances position, - * and captures the resulting logits for the next sample() call. - * - * @param {number} token - Token to decode - * @returns {Promise} - */ - async decodeAndCaptureOne(token) { - this._ensureNotDisposed(); - await this._ctx._branchDecodeAndCaptureOne(this._handle, token); - } - /** * Bulk-decode tokens into the branch's KV cache and capture logits * @@ -158,7 +132,7 @@ class Branch { */ async prefill(tokens) { this._ensureNotDisposed(); - await this._ctx._branchDecodeAndCaptureBatch(this._handle, tokens); + await this._ctx._branchPrefill(this._handle, tokens); } /** @@ -226,7 +200,7 @@ class Branch { * @example * ```js * const root = Branch.create(ctx, pos, { temperature: 0.9 }); - * root.captureLogits(); + * await root.prefill(promptTokens); * * // Fork and reseed for diversity * const branches = []; diff --git a/lib/index.d.ts b/lib/index.d.ts index 0b1adcf..07721a8 100644 --- a/lib/index.d.ts +++ b/lib/index.d.ts @@ -1382,14 +1382,8 @@ export interface SessionContext { /** @internal Fork a branch to a new sequence */ _branchFork(handle: number): number; - /** @internal Capture logits into branch's snapshot */ - _branchCaptureLogits(handle: number): void; - - /** @internal Decode a single token and capture logits */ - _branchDecodeAndCaptureOne(handle: number, token: number): Promise; - /** @internal Decode multiple tokens in n_batch-sized chunks and capture logits */ - _branchDecodeAndCaptureBatch(handle: number, tokens: number[]): Promise; + _branchPrefill(handle: number, tokens: number[]): Promise; /** @internal Sample next token from branch's logits snapshot */ _branchSample(handle: number): number; @@ -1629,7 +1623,7 @@ export interface Produced { * @example Best-of-N with perplexity selection * ```typescript * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 }); - * root.captureLogits(); + * await root.prefill(tokens); * * const results = []; * for (let i = 0; i < 5; i++) { @@ -1651,8 +1645,8 @@ export class Branch { * Create a root branch at the given position * * The branch takes ownership of the sequence and creates its own sampler - * chain from the provided params. Call captureLogits() after prefill to - * freeze the logit distribution before forking. + * chain from the provided params. Call prefill() to decode prompt tokens + * and capture the logit distribution before forking. * * @param ctx SessionContext to create branch on * @param position Starting position (typically prompt token count) @@ -1681,14 +1675,11 @@ export class Branch { */ fork(): Promise; - /** Freeze the current logit distribution into this branch. Essential before fork(). */ - captureLogits(): void; - /** * Get a copy of this branch's captured logits snapshot. * * Returns n_vocab floats — the raw logit distribution from the last - * decode_and_capture or captureLogits() call. + * prefill() or commit() call. * * Unlike {@link SessionContext.getLogits} (zero-copy view into shared * model memory, invalidated by next decode), this returns an independent @@ -1701,20 +1692,6 @@ export class Branch { */ getLogits(): Float32Array; - /** - * Single-token forward pass with logit snapshot - * - * Runs one decode step (writing the token's KV entries), advances position, - * and captures the resulting logits for the next sample()/produce() call. - * - * Lower-level than {@link commit} — does NOT accept into the sampler penalty - * window. Use commit() for normal generation; use this when you need decode + - * capture without repeat-penalty tracking. - * - * @param token Token to decode - */ - decodeAndCaptureOne(token: number): Promise; - /** * Bulk-decode tokens into the branch's KV cache and capture logits. * diff --git a/liblloyal b/liblloyal index 557c4ef..b0a30f6 160000 --- a/liblloyal +++ b/liblloyal @@ -1 +1 @@ -Subproject commit 557c4ef6c7f88824c6fdbc029ad9e9b8bea4f73d +Subproject commit b0a30f6bf9ad313fcb3a4d03fb58cc3b34934f7f diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp index b812a76..baf945b 100644 --- a/src/SessionContext.cpp +++ b/src/SessionContext.cpp @@ -556,50 +556,21 @@ class FormatChatWorker : public Napi::AsyncWorker { // ===== BRANCH / STORE / DECODE ASYNC WORKERS ===== /** - * AsyncWorker for single-token branch decode + logits capture - * Wraps lloyal::branch::decode_and_capture_one on libuv pool thread + * AsyncWorker for bulk branch decode + logits capture (prompt injection) + * Wraps lloyal::branch::prefill on libuv pool thread */ -class BranchDecodeAndCaptureOneWorker : public Napi::AsyncWorker { +class BranchPrefillWorker : public Napi::AsyncWorker { public: - BranchDecodeAndCaptureOneWorker(Napi::Env env, - lloyal::branch::BranchStore& store, - lloyal::branch::BranchHandle handle, - llama_token token) - : AsyncWorker(env), _deferred(env), _store(store), _handle(handle), _token(token) {} - - void Execute() override { - try { - lloyal::branch::decode_and_capture_one(_handle, _token, _store); - } catch (const std::exception& e) { SetError(e.what()); } - } - - void OnOK() override { _deferred.Resolve(Env().Undefined()); } - void OnError(const Napi::Error& err) override { _deferred.Reject(err.Value()); } - Napi::Promise GetPromise() { return _deferred.Promise(); } - -private: - Napi::Promise::Deferred _deferred; - lloyal::branch::BranchStore& _store; - lloyal::branch::BranchHandle _handle; - llama_token _token; -}; - -/** - * AsyncWorker for bulk branch decode + logits capture - * Wraps lloyal::branch::decode_and_capture_batch on libuv pool thread - */ -class BranchDecodeAndCaptureBatchWorker : public Napi::AsyncWorker { -public: - BranchDecodeAndCaptureBatchWorker(Napi::Env env, - lloyal::branch::BranchStore& store, - lloyal::branch::BranchHandle handle, - std::vector tokens) + BranchPrefillWorker(Napi::Env env, + lloyal::branch::BranchStore& store, + lloyal::branch::BranchHandle handle, + std::vector tokens) : AsyncWorker(env), _deferred(env), _store(store), _handle(handle), _tokens(std::move(tokens)) {} void Execute() override { try { - lloyal::branch::decode_and_capture_batch(_handle, _tokens.data(), _tokens.size(), _store); + lloyal::branch::prefill(_handle, _tokens.data(), _tokens.size(), _store); } catch (const std::exception& e) { SetError(e.what()); } } @@ -814,9 +785,7 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) { // ===== BRANCH API (internal, wrapped by lib/Branch.ts) ===== InstanceMethod("_branchCreate", &SessionContext::_branchCreate), InstanceMethod("_branchFork", &SessionContext::_branchFork), - InstanceMethod("_branchCaptureLogits", &SessionContext::_branchCaptureLogits), - InstanceMethod("_branchDecodeAndCaptureOne", &SessionContext::_branchDecodeAndCaptureOne), - InstanceMethod("_branchDecodeAndCaptureBatch", &SessionContext::_branchDecodeAndCaptureBatch), + InstanceMethod("_branchPrefill", &SessionContext::_branchPrefill), InstanceMethod("_branchSample", &SessionContext::_branchSample), InstanceMethod("_branchAccept", &SessionContext::_branchAccept), InstanceMethod("_branchGetPosition", &SessionContext::_branchGetPosition), @@ -1881,52 +1850,14 @@ Napi::Value SessionContext::_branchFork(const Napi::CallbackInfo& info) { return Napi::Number::New(env, newHandle); } -Napi::Value SessionContext::_branchCaptureLogits(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 1) { - throw Napi::Error::New(env, "_branchCaptureLogits requires (handle)"); - } - - auto handle = static_cast(info[0].As().Uint32Value()); - lloyal::branch::capture_logits(handle, _branchStore); - - return env.Undefined(); -} - -Napi::Value SessionContext::_branchDecodeAndCaptureOne(const Napi::CallbackInfo& info) { - Napi::Env env = info.Env(); - ensureNotDisposed(); - - if (info.Length() < 2) { - throw Napi::Error::New(env, "_branchDecodeAndCaptureOne requires (handle, token)"); - } - - auto handle = static_cast(info[0].As().Uint32Value()); - auto token = static_cast(info[1].As().Int32Value()); - - auto* worker = new BranchDecodeAndCaptureOneWorker(env, _branchStore, handle, token); - worker->Queue(); - return worker->GetPromise(); -} - // Bulk-decode tokens into a branch's KV cache and capture final logits. -// -// tokens.size() is the total token count (n_tokens). The branch's n_batch -// (set at Branch.create via the nBatch parameter, stored on BranchState) -// controls the chunk size — decode_and_capture_batch passes both to -// decoder::decode_tokens which loops: min(n_tokens - processed, n_batch) -// tokens per llama_decode call. -// -// Does NOT accept tokens into the sampler's penalty window. // Wrapped by Branch.prefill() on the JS side. -Napi::Value SessionContext::_branchDecodeAndCaptureBatch(const Napi::CallbackInfo& info) { +Napi::Value SessionContext::_branchPrefill(const Napi::CallbackInfo& info) { Napi::Env env = info.Env(); ensureNotDisposed(); if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsArray()) { - throw Napi::Error::New(env, "_branchDecodeAndCaptureBatch requires (handle, tokens[])"); + throw Napi::Error::New(env, "_branchPrefill requires (handle, tokens[])"); } auto handle = static_cast(info[0].As().Uint32Value()); @@ -1944,7 +1875,7 @@ Napi::Value SessionContext::_branchDecodeAndCaptureBatch(const Napi::CallbackInf return deferred.Promise(); } - auto* worker = new BranchDecodeAndCaptureBatchWorker(env, _branchStore, handle, std::move(tokens)); + auto* worker = new BranchPrefillWorker(env, _branchStore, handle, std::move(tokens)); worker->Queue(); return worker->GetPromise(); } diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp index 3ab159d..254065e 100644 --- a/src/SessionContext.hpp +++ b/src/SessionContext.hpp @@ -256,9 +256,7 @@ class SessionContext : public Napi::ObjectWrap { Napi::Value _branchCreate(const Napi::CallbackInfo& info); Napi::Value _branchFork(const Napi::CallbackInfo& info); - Napi::Value _branchCaptureLogits(const Napi::CallbackInfo& info); - Napi::Value _branchDecodeAndCaptureOne(const Napi::CallbackInfo& info); - Napi::Value _branchDecodeAndCaptureBatch(const Napi::CallbackInfo& info); + Napi::Value _branchPrefill(const Napi::CallbackInfo& info); Napi::Value _branchSample(const Napi::CallbackInfo& info); Napi::Value _branchAccept(const Napi::CallbackInfo& info); Napi::Value _branchGetPosition(const Napi::CallbackInfo& info); diff --git a/test/integration.js b/test/integration.js index e276b2c..ef52b37 100644 --- a/test/integration.js +++ b/test/integration.js @@ -1444,10 +1444,10 @@ async function testAsyncRejection() { } assert(threwOnFork, 'rejection: fork on disposed branch throws'); - // Native AsyncWorker rejection: call _branchDecodeAndCaptureOne with invalid handle (0) + // Native AsyncWorker rejection: call _branchPrefill with invalid handle (0) let nativeRejected = false; try { - await ctx._branchDecodeAndCaptureOne(0, token); + await ctx._branchPrefill(0, [token]); } catch (e) { nativeRejected = true; assert(e instanceof Error, `rejection: native rejection is Error: ${e.constructor.name}`); From d7b7ed98f53824e48c402aaf349e9086d1f38d77 Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Fri, 20 Feb 2026 14:49:48 +1100 Subject: [PATCH 3/3] chore(deps): lock file update --- package-lock.json | 218 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 174 insertions(+), 44 deletions(-) diff --git a/package-lock.json b/package-lock.json index c0dc766..4af2f6d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -51,29 +51,6 @@ "@shikijs/vscode-textmate": "^10.0.2" } }, - "node_modules/@isaacs/balanced-match": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/balanced-match/-/balanced-match-4.0.1.tgz", - "integrity": "sha512-yzMTt9lEb8Gv7zRioUilSglI0c0smZ9k5D65677DLWLtWJaXIS3CqcGyUFByYKlnUj6TkjLVs54fBl6+TiGQDQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": "20 || >=22" - } - }, - "node_modules/@isaacs/brace-expansion": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/brace-expansion/-/brace-expansion-5.0.1.tgz", - "integrity": "sha512-WMz71T1JS624nWj2n2fnYAuPovhv7EUhk69R6i9dsVyzxt5eM3bjwvgk9L+APE1TRscGysAVMANkB0jh0LQZrQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@isaacs/balanced-match": "^4.0.1" - }, - "engines": { - "node": "20 || >=22" - } - }, "node_modules/@isaacs/cliui": { "version": "8.0.2", "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-8.0.2.tgz", @@ -106,43 +83,173 @@ } }, "node_modules/@lloyal-labs/lloyal.node-darwin-arm64": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-arm64/-/lloyal.node-darwin-arm64-1.5.0.tgz", + "integrity": "sha512-TZlQhkt14RQLmhCPGgu2WtZ/gC8Z0tvzu/gVUcNqsBTwepaUlyAdAuZCNnlpjGCwSV/XTvlZQMKfNKDyXoMQbQ==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ] }, "node_modules/@lloyal-labs/lloyal.node-darwin-x64": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-x64/-/lloyal.node-darwin-x64-1.5.0.tgz", + "integrity": "sha512-0KJmT3vbrPm8HojFfu+tn433gTVF/x2vHdzi+kRGSvbI81pjzadd/pW4Qweo5NmgSfAEp2a1FTT4gXdLfAfxwQ==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "darwin" + ] }, "node_modules/@lloyal-labs/lloyal.node-linux-arm64": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64/-/lloyal.node-linux-arm64-1.5.0.tgz", + "integrity": "sha512-QZPknVyCNXBF+Ed+YNBcAQgNquznegy8Q3A//Il4NsXeLrH4ZW0orygV7/sTC9z0eUKi2EFBwOVnhNbMsv66Cw==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] }, "node_modules/@lloyal-labs/lloyal.node-linux-arm64-cuda": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64-cuda/-/lloyal.node-linux-arm64-cuda-1.5.0.tgz", + "integrity": "sha512-aQr2MK2V2PgTY8msfrADqQY4Ymgn7sFddLnSmLNj/8poBqT+Tj8AQXoErcbXZZ1nzrGXpk3WUpF91dcRk/X/CQ==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] }, "node_modules/@lloyal-labs/lloyal.node-linux-arm64-vulkan": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64-vulkan/-/lloyal.node-linux-arm64-vulkan-1.5.0.tgz", + "integrity": "sha512-dxp7lpelVd7cDV+nCwRB6F3iQ7JV6Pyh5BNZwEMXpmiaBlC/1TycOXgGrdZFnwTy/Tu/GYzKYOMu4b1ferQZ8Q==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] }, "node_modules/@lloyal-labs/lloyal.node-linux-x64": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64/-/lloyal.node-linux-x64-1.5.0.tgz", + "integrity": "sha512-YVlvw1YNLTDMthHhJAi2MRkjMvQsbHgyzzHu0JRuqkmq0slLis/hL1V6qYFA1W0Y5lxJ/jnHagKkDg3ibtCDcA==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] }, "node_modules/@lloyal-labs/lloyal.node-linux-x64-cuda": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64-cuda/-/lloyal.node-linux-x64-cuda-1.5.0.tgz", + "integrity": "sha512-7tYHwe8wZ1U5LsJ153OR9tzYa1tio9doNBYhdJ+qKalOkRSdFSOBG67mVaYShWF9yt14GScoM3OuA1OrgJHK5Q==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] }, "node_modules/@lloyal-labs/lloyal.node-linux-x64-vulkan": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64-vulkan/-/lloyal.node-linux-x64-vulkan-1.5.0.tgz", + "integrity": "sha512-vhwzN+xXdN3CDTqdgY5/awv0FxRXJtFlrwpQNH2UDS2IZucPuR/kFa27F0LLPHxTnvbeVnlBkHPg8KUsvrWRMQ==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ] }, "node_modules/@lloyal-labs/lloyal.node-win32-arm64": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-arm64/-/lloyal.node-win32-arm64-1.5.0.tgz", + "integrity": "sha512-eAopZwOQEKO5433KC3o3cG2lMno7hSDJ3ZcwedgNofbk47Jcg+IWUS6bJWKKEzlrxumzOdGCnD+Tm71M/6cFyQ==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "win32" + ] }, "node_modules/@lloyal-labs/lloyal.node-win32-arm64-vulkan": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-arm64-vulkan/-/lloyal.node-win32-arm64-vulkan-1.5.0.tgz", + "integrity": "sha512-Dm/nCj+ygSpwHyqi+9h6hxtrQ2J1Jq9AFVPwycFUA5EHKItsMlBLuY2P7Hw1i1cjuwhBPW+xyE8Q7b4R5mrV3g==", + "cpu": [ + "arm64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "win32" + ] }, "node_modules/@lloyal-labs/lloyal.node-win32-x64": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64/-/lloyal.node-win32-x64-1.5.0.tgz", + "integrity": "sha512-ccmXLIgtmtGkfe7LK5Vb5dyj7//9umCFAtci750tUGNLL/n+t2Yw2SnyxLwQF8e5GKg+ASfT9yQ3fejP2gu0ag==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "win32" + ] }, "node_modules/@lloyal-labs/lloyal.node-win32-x64-cuda": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64-cuda/-/lloyal.node-win32-x64-cuda-1.5.0.tgz", + "integrity": "sha512-udd/dMPq6Bedoekrch3DHqN+KX5u93Essknd2g8kwgqrEv49Petoky4SBdAEDRNcj4ckTch3IwC/NDQmvvOBLQ==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "win32" + ] }, "node_modules/@lloyal-labs/lloyal.node-win32-x64-vulkan": { - "optional": true + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64-vulkan/-/lloyal.node-win32-x64-vulkan-1.5.0.tgz", + "integrity": "sha512-tCmiI/zvNwN48WxvuEOQLcdJECq6+Ae4lKwQHSeXCmgggob4wp51xGnKFrk2l/YNwFIS6p3Bw5pZA9gJ3SJMMQ==", + "cpu": [ + "x64" + ], + "license": "Apache-2.0", + "optional": true, + "os": [ + "win32" + ] }, "node_modules/@lloyal-labs/tsampler": { "version": "0.2.0", @@ -689,21 +796,44 @@ "license": "MIT" }, "node_modules/minimatch": { - "version": "10.1.1", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.1.1.tgz", - "integrity": "sha512-enIvLvRAFZYXJzkCYG5RKmPfrFArdLv+R+lbQ53BmIMLIry74bjKzX6iHAm8WYamJkhSSEabrWN5D97XnKObjQ==", + "version": "10.2.2", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.2.tgz", + "integrity": "sha512-+G4CpNBxa5MprY+04MbgOw1v7So6n5JY166pFi9KfYwT78fxScCeSNQSNzp6dpPSW2rONOps6Ocam1wFhCgoVw==", "dev": true, "license": "BlueOak-1.0.0", "dependencies": { - "@isaacs/brace-expansion": "^5.0.0" + "brace-expansion": "^5.0.2" }, "engines": { - "node": "20 || >=22" + "node": "18 || 20 || >=22" }, "funding": { "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/minimatch/node_modules/balanced-match": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-4.0.3.tgz", + "integrity": "sha512-1pHv8LX9CpKut1Zp4EXey7Z8OfH11ONNH6Dhi2WDUt31VVZFXZzKwXcysBgqSumFCmR+0dqjMK5v5JiFHzi0+g==", + "dev": true, + "license": "MIT", + "engines": { + "node": "20 || >=22" + } + }, + "node_modules/minimatch/node_modules/brace-expansion": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.2.tgz", + "integrity": "sha512-Pdk8c9poy+YhOgVWw1JNN22/HcivgKWwpxKq04M/jTmHyCZn12WPJebZxdjSa5TmBqISrUSgNYU3eRORljfCCw==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^4.0.2" + }, + "engines": { + "node": "20 || >=22" + } + }, "node_modules/minimist": { "version": "1.2.8", "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", @@ -1021,9 +1151,9 @@ } }, "node_modules/typedoc": { - "version": "0.28.16", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.28.16.tgz", - "integrity": "sha512-x4xW77QC3i5DUFMBp0qjukOTnr/sSg+oEs86nB3LjDslvAmwe/PUGDWbe3GrIqt59oTqoXK5GRK9tAa0sYMiog==", + "version": "0.28.17", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.28.17.tgz", + "integrity": "sha512-ZkJ2G7mZrbxrKxinTQMjFqsCoYY6a5Luwv2GKbTnBCEgV2ihYm5CflA9JnJAwH0pZWavqfYxmDkFHPt4yx2oDQ==", "dev": true, "license": "Apache-2.0", "dependencies": {