diff --git a/README.md b/README.md index a1783d3..35df831 100644 --- a/README.md +++ b/README.md @@ -6,67 +6,59 @@ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) [![llama.cpp](https://img.shields.io/badge/llama.cpp-b8087-green.svg)](https://github.com/ggml-org/llama.cpp/releases/tag/b8087) -**Covalent Inference for Node.js** +**Native backend for the lloyal inference platform.** -Composable inference primitives for forkable decode state, shared-prefix KV branching, and continuous tree batching. Branches share a KV prefix while keeping independent machinery — sampler chain, grammar, logits snapshot, perplexity tracker — for controlled divergence at decode time. `BranchStore` packs tokens from N branches (each at a different position, different seq_id, each needing independent logits captured) into a single `llama_batch` and dispatches once. `kv::tenancy` manages seq_id leases automatically — acquired on `create()`/`fork()`, evicted on `prune()`, rebuilt on `retainOnly()`. +Prebuilt llama.cpp binaries for 13 platform/GPU combinations, exposing a `SessionContext` that powers the [`@lloyal-labs/sdk`](https://github.com/lloyal-ai/sdk) inference primitives (Branch, BranchStore, Session, Rerank) and [`@lloyal-labs/lloyal-agents`](https://github.com/lloyal-ai/sdk/tree/main/packages/agents) multi-agent framework. Built on [liblloyal](https://github.com/lloyal-ai/liblloyal), a header-only C++20 inference kernel for llama.cpp. -Built on [liblloyal](https://github.com/lloyal-ai/liblloyal), a header-only C++20 inference kernel for llama.cpp. +All SDK and agent exports are re-exported from this package for convenience — `import { Branch, runAgents } from "@lloyal-labs/lloyal.node"` works out of the box. -## The Branch API +## Install + +```bash +npm install @lloyal-labs/lloyal.node +``` + +Prebuilt binaries for 13 platform/GPU combinations. GPU selection at runtime, not install time. 
+ +| Platform | Arch | Acceleration | +| -------- | ----- | ------------------- | +| macOS | arm64 | Metal | +| macOS | x64 | CPU | +| Linux | x64 | CPU / CUDA / Vulkan | +| Linux | arm64 | CPU / CUDA / Vulkan | +| Windows | x64 | CPU / CUDA / Vulkan | +| Windows | arm64 | CPU / Vulkan | + +## Quick Start ```javascript -import { createContext, Branch, BranchStore } from "@lloyal-labs/lloyal.node"; +import { createContext } from "@lloyal-labs/lloyal.node"; +import { Branch, BranchStore } from "@lloyal-labs/sdk"; -const ctx = await createContext({ modelPath: "./model.gguf", nSeqMax: 6 }); +const ctx = await createContext({ modelPath: "./model.gguf", nSeqMax: 4 }); const store = new BranchStore(ctx); -// Shared prompt: "Explain quantum entanglement" -const prompt = await ctx.tokenize("Explain quantum entanglement"); - const root = Branch.create(ctx, 0, { temperature: 0.8 }); -await root.prefill(prompt); - -// Fork 4 branches — each gets a different reasoning prefix -const analogy = await root.fork(); -const formal = await root.fork(); -const socratic = await root.fork(); -const visual = await root.fork(); - -// Scatter-prefill: inject divergent prefixes in one batched dispatch -// 4 branches × variable lengths → auto bin-packed into minimal GPU calls -await store.prefill([ - [analogy, await ctx.tokenize("Think of it like two coins...")], // 12 tokens - [formal, await ctx.tokenize("In quantum mechanics, the...")], // 8 tokens - [socratic, await ctx.tokenize("What happens when you measure...")], // 10 tokens - [visual, await ctx.tokenize("Imagine two particles...")], // 7 tokens -]); - -// Generate — all 4 in lockstep, 1 GPU call per step -const branches = [analogy, formal, socratic, visual]; +await root.prefill(await ctx.tokenize("Explain quantum entanglement")); + +// Fork and generate — all branches in lockstep, 1 GPU call per step +const branches = await Promise.all([root.fork(), root.fork(), root.fork()]); for (;;) { - const live = branches.filter(b => 
!b.disposed); + const live = branches.filter((b) => !b.disposed); if (!live.length) break; - const produced = live.map(b => ({ b, ...b.produce() })); - - // Prune branches that hit stop tokens - for (const p of produced.filter(p => p.isStop)) await p.b.prune(); - - // Commit survivors — accept + decode in one GPU dispatch + const produced = live.map((b) => ({ b, ...b.produce() })); + for (const p of produced.filter((p) => p.isStop)) await p.b.prune(); const items = produced - .filter(p => !p.isStop) - .map(p => { p.b.accept(p.token); return [p.b, p.token]; }); + .filter((p) => !p.isStop) + .map((p) => { + p.b.accept(p.token); + return [p.b, p.token]; + }); await store.commit(items); } - -// Winner takes all — one seq_keep pass, losers vaporized -const winner = branches - .filter(b => !b.disposed) - .reduce((a, b) => (a.perplexity < b.perplexity ? a : b)); -await store.retainOnly(winner); -// store.available === nSeqMax - 1 — all leases recovered ``` -Or for single-branch generation, Branch is an async iterable — generate until EOG: +Or for single-branch generation, Branch is an async iterable: ```javascript for await (const { token, text } of branch) { @@ -74,162 +66,130 @@ for await (const { token, text } of branch) { } ``` -## Continuous Tree Batching +See [`@lloyal-labs/sdk`](https://github.com/lloyal-ai/sdk) for the full Branch API, continuous tree batching, KV tenancy, and topology documentation. -Tree search with N branches means N calls to `llama_decode()` — each paying GPU dispatch overhead, memory barriers, and PCIe round-trips. `BranchStore` eliminates this: tokens from N branches — each at a different position, different seq_id, each needing independent logits captured — are packed into a single `llama_batch` and dispatched once. N branches, 1 GPU call. +### Without the SDK -Two packing strategies for different access patterns: +`createContext` returns a `SessionContext` — the native interface to llama.cpp. 
You can use it directly without the SDK's Branch/BranchStore layer: ```javascript -// commit: 1 token per branch — one GPU dispatch for N branches -await store.commit([[branch1, tok1], [branch2, tok2], [branch3, tok3]]); - -// prefill: variable tokens per branch — asymmetric injection -await store.prefill([ - [branchA, systemTokens], // 200 tokens - [branchB, queryTokens], // 12 tokens - [branchC, docTokens], // 800 tokens -]); -// Greedy bin-packed into ceil(total / nBatch) dispatches +import { createContext } from "@lloyal-labs/lloyal.node"; + +const ctx = await createContext({ modelPath: "./model.gguf", nSeqMax: 4 }); + +// Chat templates — model-agnostic formatting + tool calling +const { prompt, grammar, format } = await ctx.formatChat(messages, { + addGenerationPrompt: true, + tools: [{ type: "function", function: { name: "search", parameters: schema } }], +}); +const { content, toolCalls } = await ctx.parseChatOutput(output, format); + +// Branch primitives — what the SDK's Branch class wraps +const handle = ctx._branchCreate(0, samplerParams); +await ctx._branchPrefill(handle, tokens); +const token = ctx._branchSample(handle); +const text = ctx.tokenToText(token); +const isStop = ctx.isStopToken(token); +ctx._branchAccept(handle, token); +const logits = ctx._branchGetLogits(handle); // Float32Array(vocabSize) +const entropy = ctx._branchModelEntropy(handle); +const child = ctx._branchFork(handle); + +// Store primitives — what the SDK's BranchStore wraps +await ctx._storeCommit([handle1, handle2], [tok1, tok2]); // N branches, 1 GPU call +await ctx._storePrefill([handle], [tokens]); +await ctx._storeRetainOnly(winner); +const available = ctx._storeAvailable(); + +// KV cache — snapshot, copy, persist +await ctx.kvSeqCopy(0, 1); // share prefix across sequences +await ctx.kvCacheSave(); // snapshot for rollback +await ctx.kvCacheLoad(); // restore checkpoint +await ctx.kvCacheWriteFile("cache.bin"); // persist to disk + +// Embeddings +const embeddings = 
await ctx.encode("query text"); +const dim = ctx.getEmbeddingDimension(); + +// Grammar + tokenizer +const schemaGrammar = await ctx.jsonSchemaToGrammar(schema); +const helloTokens = await ctx.tokenize("Hello world"); +const sep = await ctx.getTurnSeparator(); ``` -## KV Tenancy +## What This Package Provides -Two resources, two scales. Slots (65K) are how many branches can *exist* — cheap CPU state. Leases (`nSeqMax`) are how many can *decode* — scarce KV cache residency. Tenancy manages the scarce resource automatically: leases are acquired on `create()`/`fork()`, evicted on `prune()`, rebuilt on `retainOnly()`. No manual seq_id tracking, ever. +**Native-only** (not in SDK): -```javascript -store.available; // leases remaining — use for width/depth budget -await store.retainOnly(winner); // nuclear: 1 seq_keep, rebuild vacancy -``` +- `createContext(options)` — load a GGUF model, return a `SessionContext` +- `loadBinary(options?)` — explicit GPU variant selection with automatic fallback +- Prebuilt binaries for 13 platform/GPU combinations -The turn lifecycle: search is surgical (N × `prune()`), promotion is nuclear (1 × `retainOnly()`). Per turn, fork → expand → evaluate → prune losers → repeat. Between turns, promote winner → tree is gone → next turn starts fresh. +**Re-exported from [`@lloyal-labs/sdk`](https://github.com/lloyal-ai/sdk):** -## Topology +- `Branch`, `BranchStore`, `Session`, `Rerank` +- Per-token metrics: `modelEntropy()`, `modelSurprisal()`, `samplingPerplexity` +- Chat formatting: `formatChat()`, `parseChatOutput()` +- Grammar: `jsonSchemaToGrammar()`, `setGrammar()` -Parent/child edges are always-on. Simple chat → best-of-N → deep search is one continuum. +**Re-exported from [`@lloyal-labs/lloyal-agents`](https://github.com/lloyal-ai/sdk/tree/main/packages/agents):** -```javascript -branch.parent; // handle or null if root -branch.children; // child handles -branch.isLeaf; // no children? -branch.isActive; // holds a KV lease? 
-``` +- `runAgents`, `useAgentPool`, `generate`, `diverge`, `createToolkit` +- Structured concurrency DAG via Effection generators +- In-loop orchestration: agents as branches of a single running process -| Method | FK analogy | Behavior | -|--------|-----------|----------| -| `prune()` | RESTRICT | Throws if children exist | -| `pruneSubtree()` | CASCADE | Iterative post-order traversal | +## GPU Variant Selection ---- +```javascript +import { loadBinary, createContext } from "@lloyal-labs/lloyal.node"; -## Install +// Automatic — uses Metal on macOS, CPU elsewhere +const ctx = await createContext({ modelPath: "./model.gguf" }); -```bash -npm install @lloyal-labs/lloyal.node +// Explicit CUDA +const binding = loadBinary({ gpuVariant: "cuda" }); +const cudaCtx = await binding.createContext({ modelPath: "./model.gguf" }); +// Falls back to CPU with a warning if the CUDA runtime is not available ``` -Prebuilt binaries for 13 platform/GPU combinations. GPU selection at runtime, not install time. - -| Platform | Arch | Acceleration | -| -------- | ----- | ------------------- | -| macOS | arm64 | Metal | -| macOS | x64 | CPU | -| Linux | x64 | CPU / CUDA / Vulkan | -| Linux | arm64 | CPU / CUDA / Vulkan | -| Windows | x64 | CPU / CUDA / Vulkan | -| Windows | arm64 | CPU / Vulkan | - -CI integration testing (real inference): - -| Architecture | Test Model | Template | -| ------------ | -------------- | -------- | -| Llama | Llama 3.2 1B | llama3 | -| Phi | Phi 3.5 Mini | phi3 | -| Qwen | Qwen 3 1.7B | chatml | -| Gemma | Gemma 3 1B | gemma | -| SmolLM | SmolLM2 1.7B | chatml | -| Ministral | Ministral 3B | mistral | - -See [distribution.md](docs/distribution.md) for details. 
- ---- - ## Examples -| Example | Pattern | -| ----------------------------------------- | -------------------------------------------------------------------------- | -| [`best-of-n/`](./examples/best-of-n/) | Branch API: fork, produce/commit, perplexity selection | -| [`speculative/`](./examples/speculative/) | Branch API: draft/verify, fork/prune, bonus token sampling | -| [`streaming/`](./examples/streaming/) | Infinite context via BlinkKV reseeding with sidecar summarization | -| [`entropy/`](./examples/entropy/) | `modelEntropy()` mid-generation as control signal | -| [`grammar/`](./examples/grammar/) | Pull loop with generators, JSON schema constraints, KV + grammar branching | -| [`chat/`](./examples/chat/) | Interactive streaming chat | -| [`embed/`](./examples/embed/) | Text embeddings extraction | +| Example | Pattern | +| --------------------------------- | ------------------------------------------------- | +| [`entropy/`](./examples/entropy/) | `modelEntropy()` mid-generation as control signal | +| [`chat/`](./examples/chat/) | Interactive streaming chat | +| [`embed/`](./examples/embed/) | Text embeddings extraction | ```bash -node examples/best-of-n/best-of-n.mjs -node examples/speculative/speculative.mjs -``` - -Each example has a README explaining the pattern. - ---- - -## Other Patterns - -### Entropy as Control Signal - -Model uncertainty mid-generation enables dynamic behavior: - -```javascript -const entropy = ctx.modelEntropy("bits"); - -if (entropy > 4.0) { - // High uncertainty — model is guessing - // Trigger retrieval, reduce temperature, or branch -} +npx tsx examples/best-of-n/best-of-n.ts +npx tsx examples/chat/chat.ts ./model.gguf ``` -See [`examples/entropy/`](./examples/entropy/) for entropy-triggered sampling strategies. 
+## CI Testing -### Low-Level KV Operations +Integration tests run real inference across architectures: -For fine-grained control without Branch: +| Architecture | Test Model | Template | +| ------------ | ------------ | -------- | +| Llama | Llama 3.2 1B | llama3 | +| Phi | Phi 3.5 Mini | phi3 | +| Qwen | Qwen 3 1.7B | chatml | +| Gemma | Gemma 3 1B | gemma | +| SmolLM | SmolLM2 1.7B | chatml | +| Ministral | Ministral 3B | mistral | -| Approach | Method | Use Case | -| -------------------- | --------------------------------- | -------------------------------------------- | -| **Sequence copy** | `kvSeqCopy(src, dst)` | Share prefix across sequences | -| **Snapshot/restore** | `kvCacheSave()` / `kvCacheLoad()` | Sequential exploration, return to checkpoint | - -### Grammar-Constrained Generation - -```javascript -const grammar = await ctx.jsonSchemaToGrammar(schema); -const branch = Branch.create(ctx, 0, params, undefined, grammar); -await branch.prefill(promptTokens); -// Grammar state cloned automatically on fork() -``` - -See [`examples/grammar/`](./examples/grammar/) for the full branch fork pattern. - ---- - -## API Reference - -Full API documentation: **[lloyal-ai.github.io/lloyal.node](https://lloyal-ai.github.io/lloyal.node/)** - -Generated from [`src/index.ts`](./src/index.ts) with TypeDoc. - ---- +See [distribution.md](docs/distribution.md) for details. 
## Ecosystem -| Package | Runtime | Description | -| ------------------------------------------------------- | ------------ | --------------------------------- | -| [liblloyal](https://github.com/lloyal-ai/liblloyal) | C++ | Header-only inference kernel | -| **lloyal.node** | Node.js | This package | -| [nitro-llama](https://github.com/lloyal-ai/nitro-llama) | React Native | Mobile bindings via Nitro Modules | -| [tsampler](https://github.com/lloyal-ai/tsampler) | TypeScript | Reference sampler implementation | +| Package | Description | +| ------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------- | +| [`@lloyal-labs/sdk`](https://github.com/lloyal-ai/sdk) | Backend-agnostic inference primitives (Branch, BranchStore, Session, Rerank) | +| [`@lloyal-labs/lloyal-agents`](https://github.com/lloyal-ai/sdk/tree/main/packages/agents) | Multi-agent framework — in-loop orchestration via structured concurrency | +| [liblloyal](https://github.com/lloyal-ai/liblloyal) | Header-only C++20 inference kernel for llama.cpp | +| **lloyal.node** | This package — native backend + prebuilt binaries | +| [nitro-llama](https://github.com/lloyal-ai/nitro-llama) | React Native backend via Nitro Modules | +| [tsampler](https://github.com/lloyal-ai/tsampler) | Reference sampler implementation | ## Contributing diff --git a/examples/deep-research/agreement.ts b/examples/deep-research/agreement.ts deleted file mode 100644 index 58380e1..0000000 --- a/examples/deep-research/agreement.ts +++ /dev/null @@ -1,142 +0,0 @@ -/** - * Per-section agreement analysis via bigram Jaccard similarity. - * - * Pure string math — no model calls. Used by the verify phase to quantify - * where N diverge attempts agree (confident) vs disagree (hallucination risk). - */ - -export interface SectionAgreement { - label: string; // section header or "¶1", "¶2", etc. 
- score: number; // 0–1 average pairwise bigram Jaccard -} - -export interface AgreementResult { - overall: number; // mean of section scores - sections: SectionAgreement[]; // per-section breakdown -} - -// ── Internals ───────────────────────────────────────────────────── - -interface Section { - key: string; // normalized header for matching, or positional index - label: string; // display label - body: string; // section text -} - -const HEADER_RE = /^#{1,4}\s+/m; - -function normalizeKey(header: string): string { - return header.toLowerCase().replace(/[^\w\s]/g, '').trim(); -} - -function extractSections(text: string): Section[] { - const hasHeaders = HEADER_RE.test(text); - - if (hasHeaders) { - const parts = text.split(/^(#{1,4}\s+.+)$/m).filter(Boolean); - const sections: Section[] = []; - for (let i = 0; i < parts.length; i++) { - const match = parts[i].match(/^#{1,4}\s+(.+)$/); - if (match) { - const header = match[1].trim(); - const body = (parts[i + 1] ?? '').trim(); - sections.push({ key: normalizeKey(header), label: header, body }); - i++; // skip body part - } - } - return sections.length ? sections : paragraphSections(text); - } - - return paragraphSections(text); -} - -function paragraphSections(text: string): Section[] { - return text.split(/\n{2,}/) - .map(p => p.trim()) - .filter(Boolean) - .map((body, i) => ({ key: String(i), label: `¶${i + 1}`, body })); -} - -function wordBigrams(text: string): Set { - const words = text.split(/\s+/).filter(Boolean); - const bigrams = new Set(); - for (let i = 0; i < words.length - 1; i++) { - bigrams.add(`${words[i]} ${words[i + 1]}`); - } - return bigrams; -} - -function jaccard(a: Set, b: Set): number { - if (a.size === 0 && b.size === 0) return 1; - let intersection = 0; - const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a]; - for (const x of smaller) if (larger.has(x)) intersection++; - const union = a.size + b.size - intersection; - return union === 0 ? 
1 : intersection / union; -} - -function averagePairwiseJaccard(texts: string[]): number { - if (texts.length < 2) return 1; - const bigramSets = texts.map(wordBigrams); - let sum = 0; - let pairs = 0; - for (let i = 0; i < bigramSets.length; i++) { - for (let j = i + 1; j < bigramSets.length; j++) { - sum += jaccard(bigramSets[i], bigramSets[j]); - pairs++; - } - } - return sum / pairs; -} - -// ── Public API ──────────────────────────────────────────────────── - -export function computeAgreement(outputs: string[]): AgreementResult { - if (outputs.length < 2) return { overall: 1, sections: [] }; - - const allSections = outputs.map(extractSections); - const hasHeaders = allSections.some(ss => ss.length > 0 && ss[0].key !== '0'); - - if (hasHeaders) { - // Collect all unique section keys across attempts - const keySet = new Map(); // key → label (first seen) - for (const ss of allSections) { - for (const s of ss) { - if (!keySet.has(s.key)) keySet.set(s.key, s.label); - } - } - - const sections: SectionAgreement[] = [...keySet.entries()].map(([key, label]) => { - const bodies = allSections - .map(ss => ss.find(s => s.key === key)?.body) - .filter((b): b is string => b != null && b.length > 0); - // Sections present in only one attempt get score 0 - const score = bodies.length < 2 ? 0 : averagePairwiseJaccard(bodies); - return { label, score }; - }); - - const overall = sections.length - ? sections.reduce((s, x) => s + x.score, 0) / sections.length - : 0; - - return { overall, sections }; - } - - // Positional matching for headerless content - const maxSections = Math.max(...allSections.map(ss => ss.length)); - const sections: SectionAgreement[] = []; - - for (let i = 0; i < maxSections; i++) { - const bodies = allSections - .map(ss => ss[i]?.body) - .filter((b): b is string => b != null && b.length > 0); - const score = bodies.length < 2 ? 0 : averagePairwiseJaccard(bodies); - sections.push({ label: `¶${i + 1}`, score }); - } - - const overall = sections.length - ? 
sections.reduce((s, x) => s + x.score, 0) / sections.length - : 0; - - return { overall, sections }; -} diff --git a/examples/deep-research/harness.ts b/examples/deep-research/harness.ts deleted file mode 100644 index 8b6f687..0000000 --- a/examples/deep-research/harness.ts +++ /dev/null @@ -1,415 +0,0 @@ -import * as fs from 'node:fs'; -import * as path from 'node:path'; -import { call, scoped } from 'effection'; -import type { Operation, Channel } from 'effection'; -import { Branch, Session } from '../../dist'; -import type { SessionContext } from '../../dist'; -import { - Ctx, - generate, useAgentPool, runAgents, diverge, withSharedRoot, -} from '../../dist/agents'; -import type { Tool, AgentPoolResult, DivergeResult } from '../../dist/agents'; -import type { WorkflowEvent, OpTiming } from './tui'; -import { computeAgreement } from './agreement'; -import { reportTool } from './tools'; - -/** Load a task prompt file. Convention: system prompt above `---`, user content below. */ -function loadTask(name: string): { system: string; user: string } { - const raw = fs.readFileSync(path.resolve(__dirname, `tasks/${name}.md`), 'utf8').trim(); - const sep = raw.indexOf('\n---\n'); - if (sep === -1) return { system: raw, user: '' }; - return { system: raw.slice(0, sep).trim(), user: raw.slice(sep + 5).trim() }; -} - -const PLAN = loadTask('plan'); -const RESEARCH = loadTask('research'); -const VERIFY = loadTask('verify'); -const EVAL = loadTask('eval'); -const REPORT = loadTask('report'); - -// ── Options ────────────────────────────────────────────────────── - -export interface WorkflowOpts { - session: Session; - toolMap: Map; - toolsJson: string; - agentCount: number; - verifyCount: number; - maxTurns: number; - trace: boolean; - events: Channel; -} - -// ── Agent task builder ─────────────────────────────────────────── - -function agentTasks(questions: string[], toolsJson: string, parent: Branch, seed?: number) { - return questions.map((q, i) => ({ - systemPrompt: 
RESEARCH.system, - content: q, - tools: toolsJson, - parent, - seed: seed != null ? seed + i : undefined, - })); -} - -const reportOnlyTools = JSON.stringify([reportTool.schema]); - -function* reportPass( - pool: AgentPoolResult, - opts: WorkflowOpts, -): Operation { - const hardCut = pool.agents.filter(a => !a.findings && !a.branch.disposed); - if (hardCut.length === 0) return; - - // Free KV from successful agents before spawning reporters - for (const a of pool.agents) { - if (a.findings && !a.branch.disposed) a.branch.pruneSync(); - } - - const reporters = yield* runAgents({ - tasks: hardCut.map(a => ({ - systemPrompt: REPORT.system, - content: REPORT.user, - tools: reportOnlyTools, - parent: a.branch, - })), - tools: new Map([['report', reportTool]]), - terminalTool: 'report', - trace: opts.trace, - pressure: { softLimit: 200, hardLimit: 64 }, - }); - - hardCut.forEach((a, i) => { - if (reporters.agents[i]?.findings) a.findings = reporters.agents[i].findings; - }); -} - -// ── Operations ─────────────────────────────────────────────────── - -function* plan(query: string, opts: WorkflowOpts): Operation<{ questions: string[]; tokenCount: number; timeMs: number }> { - const ctx: SessionContext = yield* Ctx.expect(); - const t = performance.now(); - - const schema = { - type: 'object', - properties: { - questions: { - type: 'array', - items: { type: 'string' }, - minItems: 2, - maxItems: opts.agentCount, - }, - }, - required: ['questions'], - }; - const grammar: string = yield* call(() => ctx.jsonSchemaToGrammar(JSON.stringify(schema))); - - const userContent = PLAN.user - .replace('{{count}}', String(opts.agentCount)) - .replace('{{query}}', query); - - const messages = [ - { role: 'system', content: PLAN.system }, - { role: 'user', content: userContent }, - ]; - const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages))); - - let output: string; - let tokenCount: number; - - const parent = opts.session.trunk ?? 
undefined; - if (parent) { - const lead: Branch = yield* call(() => parent.fork()); - try { - lead.setGrammar(grammar); - const sep = ctx.getTurnSeparator(); - const delta: number[] = yield* call(() => ctx.tokenize(prompt, false)); - yield* call(() => lead.prefill([...sep, ...delta])); - - ({ output, tokenCount } = yield* call(async () => { - let o = ''; - let tc = 0; - for await (const { text } of lead) { o += text; tc++; } - return { output: o, tokenCount: tc }; - })); - } finally { - yield* call(() => lead.prune()); - } - } else { - const result = yield* generate({ prompt, grammar, params: { temperature: 0.3 } }); - output = result.output; - tokenCount = result.tokenCount; - } - - let questions: string[]; - try { - questions = JSON.parse(output).questions.slice(0, opts.agentCount); - if (!questions.length) throw new Error('empty'); - } catch { - questions = Array.from({ length: opts.agentCount }, (_, i) => `${query} (aspect ${i + 1})`); - } - - const timeMs = performance.now() - t; - yield* opts.events.send({ type: 'plan', questions, tokenCount, timeMs }); - return { questions, tokenCount, timeMs }; -} - -function* research( - questions: string[], - opts: WorkflowOpts, -): Operation<{ pool: AgentPoolResult; sharedPrefixLength: number; timeMs: number }> { - yield* opts.events.send({ type: 'research:start', agentCount: questions.length }); - const t = performance.now(); - - const { result: pool, prefixLen: sharedPrefixLength } = yield* withSharedRoot( - { systemPrompt: RESEARCH.system, tools: opts.toolsJson }, - function*(root, prefixLen) { - const pool = yield* useAgentPool({ - tasks: agentTasks(questions, opts.toolsJson, root), - tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace, - terminalTool: 'report', - pressure: { softLimit: 2048 }, - }); - - yield* reportPass(pool, opts); - return { result: pool, prefixLen }; - }, - ); - - const timeMs = performance.now() - t; - yield* opts.events.send({ type: 'research:done', pool, timeMs }); - return { 
pool, sharedPrefixLength, timeMs }; -} - -function* warmResearch( - questions: string[], - opts: WorkflowOpts, -): Operation<{ pool: AgentPoolResult; timeMs: number }> { - yield* opts.events.send({ type: 'research:start', agentCount: questions.length }); - const t = performance.now(); - - const pool = yield* scoped(function*() { - const pool = yield* useAgentPool({ - tasks: agentTasks(questions, opts.toolsJson, opts.session.trunk!, Date.now()), - tools: opts.toolMap, maxTurns: opts.maxTurns, trace: opts.trace, - terminalTool: 'report', - pressure: { softLimit: 1024 }, - }); - - yield* reportPass(pool, opts); - return pool; - }); - - const timeMs = performance.now() - t; - yield* opts.events.send({ type: 'research:done', pool, timeMs }); - return { pool, timeMs }; -} - -function* verify( - pool: AgentPoolResult, - questions: string[], - query: string, - opts: WorkflowOpts, -): Operation<{ result: DivergeResult; timeMs: number }> { - const ctx: SessionContext = yield* Ctx.expect(); - const findingsText = pool.agents - .map((a, i) => `Q: ${questions[i]}\nA: ${(a.findings || '').trim()}`) - .join('\n\n'); - - const userContent = VERIFY.user - .replace('{{findings}}', findingsText) - .replace('{{query}}', query); - - const messages = [ - { role: 'system', content: VERIFY.system }, - { role: 'user', content: userContent }, - ]; - const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages))); - - yield* opts.events.send({ type: 'verify:start', count: opts.verifyCount }); - const t = performance.now(); - const result = yield* diverge({ - prompt, - attempts: opts.verifyCount, - params: { temperature: 0.7 }, - }); - const timeMs = performance.now() - t; - const agreement = computeAgreement(result.attempts.map(a => a.output)); - yield* opts.events.send({ type: 'verify:agreement', result: agreement }); - yield* opts.events.send({ type: 'verify:done', result, timeMs }); - return { result, timeMs }; -} - -function* evaluate( - verifyResult: 
DivergeResult, - opts: WorkflowOpts, -): Operation<{ converged: boolean | null; tokenCount: number; timeMs: number }> { - const ctx: SessionContext = yield* Ctx.expect(); - - const responsesText = verifyResult.attempts - .map((a, i) => `Response ${i + 1}: ${a.output.trim()}`) - .join('\n\n'); - - const userContent = EVAL.user.replace('{{responses}}', responsesText); - - const messages = [ - { role: 'system', content: EVAL.system }, - { role: 'user', content: userContent }, - ]; - - const evalSchema = { - type: 'object', - properties: { converged: { type: 'boolean' } }, - required: ['converged'], - }; - const grammar: string = yield* call(() => ctx.jsonSchemaToGrammar(JSON.stringify(evalSchema))); - const { prompt }: { prompt: string } = yield* call(() => ctx.formatChat(JSON.stringify(messages))); - - const t = performance.now(); - const result = yield* generate({ - prompt, - grammar, - params: { temperature: 0 }, - parse: (output: string) => { - try { return JSON.parse(output).converged as boolean; } - catch { return null; } - }, - }); - const timeMs = performance.now() - t; - yield* opts.events.send({ type: 'eval:done', converged: result.parsed as boolean | null, tokenCount: result.tokenCount, timeMs }); - return { converged: result.parsed as boolean | null, tokenCount: result.tokenCount, timeMs }; -} - -function* answer(verifyResult: DivergeResult, opts: WorkflowOpts): Operation { - yield* opts.events.send({ type: 'answer', text: verifyResult.bestOutput }); -} - -function* promote(verifyResult: DivergeResult, opts: WorkflowOpts): Operation { - yield* call(() => opts.session.promote(verifyResult.best)); -} - -function* respond( - pool: AgentPoolResult, - query: string, - opts: WorkflowOpts, -): Operation<{ tokenCount: number; timeMs: number }> { - const agentFindings = pool.agents - .map((a: { findings: string | null }, i: number) => - a.findings ? 
`[Agent ${i}] ${a.findings.trim()}` : null) - .filter(Boolean) - .join('\n\n'); - - yield* call(() => opts.session.prefillUser(agentFindings - ? `Research findings:\n${agentFindings}\n\nUser question: ${query}\n\nAnswer based on the research findings above.` - : query)); - - yield* opts.events.send({ type: 'response:start' }); - const t = performance.now(); - let tokenCount = 0; - const trunk = opts.session.trunk!; - for (;;) { - const { token, text, isStop } = trunk.produceSync(); - if (isStop) break; - yield* call(() => trunk.commit(token)); - tokenCount++; - yield* opts.events.send({ type: 'response:text', text }); - } - const timeMs = performance.now() - t; - yield* opts.events.send({ type: 'response:done' }); - return { tokenCount, timeMs }; -} - -function* summarize( - timings: OpTiming[], - opts: WorkflowOpts, - extra?: { kvLine?: string }, -): Operation { - const ctx: SessionContext = yield* Ctx.expect(); - const p = ctx._storeKvPressure(); - const ctxTotal = p.nCtx || 1; - yield* opts.events.send({ - type: 'stats', timings, - kvLine: extra?.kvLine, - ctxPct: Math.round(100 * p.cellsUsed / ctxTotal), - ctxPos: p.cellsUsed, - ctxTotal, - }); -} - -// ── Workflow compositions ──────────────────────────────────────── - -function* coldQuery(query: string, opts: WorkflowOpts): Operation { - const t0 = performance.now(); - - const p = yield* plan(query, opts); - const r = yield* research(p.questions, opts); - const v = yield* verify(r.pool, p.questions, query, opts); - const e = yield* evaluate(v.result, opts); - yield* answer(v.result, opts); - yield* promote(v.result, opts); - - const timings: OpTiming[] = [ - { label: 'Plan', tokens: p.tokenCount, detail: '', timeMs: p.timeMs }, - { - label: 'Research', tokens: r.pool.totalTokens, - detail: `(${r.pool.agents.map(a => a.tokenCount).join(' + ')}) ${r.pool.totalToolCalls} tools`, - timeMs: r.timeMs, - }, - { - label: 'Verify', tokens: v.result.totalTokens, - detail: `(${v.result.attempts.map(a => 
a.tokenCount).join(' + ')})`, - timeMs: v.timeMs, - }, - { label: 'Eval', tokens: e.tokenCount, detail: `converged: ${e.converged ? 'yes' : 'no'}`, timeMs: e.timeMs }, - ]; - - const kvSaved = r.sharedPrefixLength * (p.questions.length - 1) - + v.result.prefixLength * (v.result.attempts.length - 1); - const kvLine = `KV shared ${r.sharedPrefixLength} \u00d7 ${p.questions.length - 1} + ${v.result.prefixLength} \u00d7 ${v.result.attempts.length - 1} = ${kvSaved.toLocaleString()} tok saved`; - - yield* summarize(timings, opts, { kvLine }); - - yield* opts.events.send({ - type: 'complete', - data: { - planTokens: p.tokenCount, - agentTokens: r.pool.totalTokens, researchSteps: r.pool.steps, - agentPpl: r.pool.agents.map(a => a.ppl), - verifyTokens: v.result.totalTokens, verifySteps: v.result.steps, - evalTokens: e.tokenCount, converged: e.converged, - totalToolCalls: r.pool.totalToolCalls, - prefixTokens: v.result.prefixLength, - sharedPrefixTokens: r.sharedPrefixLength, - agentCount: p.questions.length, attemptCount: v.result.attempts.length, - wallTimeMs: Math.round(performance.now() - t0), - planMs: Math.round(p.timeMs), researchMs: Math.round(r.timeMs), - verifyMs: Math.round(v.timeMs), evalMs: Math.round(e.timeMs), - ...r.pool.counters, - }, - }); -} - -function* warmQuery(query: string, opts: WorkflowOpts): Operation { - const p = yield* plan(query, opts); - const r = yield* warmResearch(p.questions, opts); - const resp = yield* respond(r.pool, query, opts); - - const timings: OpTiming[] = [ - { label: 'Plan', tokens: p.tokenCount, detail: '', timeMs: p.timeMs }, - { - label: 'Research', tokens: r.pool.totalTokens, - detail: `(${r.pool.agents.map(a => a.tokenCount).join(' + ')}) ${r.pool.totalToolCalls} tools`, - timeMs: r.timeMs, - }, - { label: 'Response', tokens: resp.tokenCount, detail: '', timeMs: resp.timeMs }, - ]; - - yield* summarize(timings, opts); -} - -// ── Entry point ────────────────────────────────────────────────── - -export function* 
handleQuery(query: string, opts: WorkflowOpts): Operation { - yield* opts.events.send({ type: 'query', query, warm: !!opts.session.trunk }); - yield* (opts.session.trunk ? warmQuery : coldQuery)(query, opts); -} diff --git a/examples/deep-research/main.ts b/examples/deep-research/main.ts deleted file mode 100644 index edeaf10..0000000 --- a/examples/deep-research/main.ts +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env node -/** - * Deep Research — CLI entry point - * - * Wiring only: setup, TUI subscriber, REPL. - * Orchestration lives in harness.ts. Presentation lives in tui.ts. - * - * Usage: - * npx tsx examples/deep-research/main.ts [model-path] --corpus [--query ] [options] - */ - -import * as fs from "node:fs"; -import * as path from "node:path"; -import * as readline from "node:readline"; -import { - main, - ensure, - createSignal, - spawn, - each, - call, - action, -} from "effection"; -import { createContext } from "../../dist"; -import type { SessionContext } from "../../dist"; -import { initAgents } from "../../dist/agents"; -import { c, log, setJsonlMode, setVerboseMode, fmtSize, createView } from "./tui"; -import type { WorkflowEvent } from "./tui"; -import { loadResources, chunkResources } from "./resources/files"; -import { createReranker } from "./reranker"; -import { createTools } from "./tools"; -import { handleQuery } from "./harness"; -import type { WorkflowOpts } from "./harness"; - -// ── CLI args ───────────────────────────────────────────────────── - -const DEFAULT_MODEL = path.resolve( - __dirname, - "../../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf", -); -const DEFAULT_RERANKER = path.resolve( - __dirname, - "../../models/qwen3-reranker-0.6b-q4_k_m.gguf", -); - -const args = process.argv.slice(2); -const jsonlMode = args.includes("--jsonl"); -const verbose = args.includes("--verbose"); -const trace = args.includes("--trace"); - -function argVal(flag: string): string | null { - const i = args.indexOf(flag); - return i !== -1 ? 
args[i + 1] : null; -} -const flagIndices = new Set( - ["--reranker", "--corpus", "--query"].flatMap((f) => { - const i = args.indexOf(f); - return i !== -1 ? [i, i + 1] : []; - }), -); - -const rerankModelPath = argVal("--reranker") || DEFAULT_RERANKER; -const corpusDir = argVal("--corpus"); -const initialQuery = argVal("--query"); -const modelPath = - args.find((a, i) => !a.startsWith("--") && !flagIndices.has(i)) || - DEFAULT_MODEL; - -if (!corpusDir) { - process.stdout.write( - `Usage: npx tsx examples/deep-research/main.ts [model-path] --corpus [--query ] [--reranker ]\nMissing: --corpus\n`, - ); - process.exit(1); -} - -if (jsonlMode) setJsonlMode(true); -if (verbose) setVerboseMode(true); -if (!verbose && !jsonlMode && !trace) { - try { - fs.closeSync(2); - fs.openSync(process.platform === "win32" ? "\\\\.\\NUL" : "/dev/null", "w"); - } catch { - /* non-fatal */ - } -} - -const AGENT_COUNT = 3; -const VERIFY_COUNT = 3; -const MAX_TOOL_TURNS = 20; - -// ── Main ───────────────────────────────────────────────────────── - -main(function* () { - const resources = loadResources(corpusDir!); - const chunks = chunkResources(resources); - - const modelName = path.basename(modelPath).replace(/-Q\w+\.gguf$/, ""); - const rerankName = path - .basename(rerankModelPath) - .replace(/-q\w+\.gguf$/i, ""); - - log(); - log( - `${c.bold} Deep Research${c.reset} ${c.dim}\u2014 Structured Concurrency Runtime${c.reset}`, - ); - log(); - log( - ` ${c.green}\u25cf${c.reset} Loading ${c.bold}${modelName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(modelPath).size)}, KV: Q4_0)${c.reset}`, - ); - - const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || "16384", 10); - const ctx: SessionContext = yield* call(() => - createContext({ - modelPath, - nCtx, - nSeqMax: Math.max(AGENT_COUNT, VERIFY_COUNT) * 2 + 1, - typeK: "q4_0", - typeV: "q4_0", - }), - ); - - log( - ` ${c.green}\u25cf${c.reset} Loading ${c.bold}${rerankName}${c.reset} ${c.dim}(${fmtSize(fs.statSync(rerankModelPath).size)}, 
reranker)${c.reset}`, - ); - - const reranker = yield* call(() => - createReranker(rerankModelPath, { nSeqMax: 8, nCtx: 4096 }), - ); - yield* ensure(() => { - reranker.dispose(); - }); - yield* call(() => reranker.tokenizeChunks(chunks)); - - const corpusIsFile = - resources.length === 1 && fs.statSync(corpusDir!).isFile(); - const corpusLabel = corpusIsFile - ? path.basename(corpusDir!) - : `${path.basename(corpusDir!)}/ \u2014 ${resources.length} files`; - log( - ` ${c.dim} Corpus: ${corpusLabel} \u2192 ${chunks.length} chunks${c.reset}`, - ); - - const { toolMap, toolsJson } = createTools({ resources, chunks, reranker }); - const { session, events } = yield* initAgents(ctx); - - // View subscriber — all presentation lives here - const view = createView({ - model: path.basename(modelPath), - reranker: path.basename(rerankModelPath), - agentCount: AGENT_COUNT, - verifyCount: VERIFY_COUNT, - chunkCount: chunks.length, - }); - yield* spawn(function* () { - yield* view.subscribe(events); - }); - - const harnessOpts: WorkflowOpts = { - session, - toolMap, - toolsJson, - events, - agentCount: AGENT_COUNT, - verifyCount: VERIFY_COUNT, - maxTurns: MAX_TOOL_TURNS, - trace, - }; - - // Initial query - if (initialQuery) { - yield* handleQuery(initialQuery, harnessOpts); - if (jsonlMode) return; - } - - // REPL — Signal bridges readline into Effection scope - log( - ` ${c.dim}${session.trunk ? 
"Ask a follow-up question" : "Enter your research question"} or /quit to exit${c.reset}`, - ); - log(); - - const inputSignal = createSignal(); - const rl = readline.createInterface({ - input: process.stdin, - output: process.stdout, - }); - rl.setPrompt(` ${c.dim}>${c.reset} `); - - yield* spawn(function* () { - yield* action((resolve) => { - rl.on("line", (line: string) => inputSignal.send(line.trim())); - rl.on("close", () => { - inputSignal.close(); - resolve(); - }); - return () => rl.close(); - }); - }); - - rl.prompt(); - for (const input of yield* each(inputSignal)) { - if (!input || input === "/quit") break; - try { - yield* handleQuery(input, harnessOpts); - } catch (err) { - log(` ${c.red}Error: ${(err as Error).message}${c.reset}`); - } - yield* each.next(); - try { - rl.prompt(); - } catch { - break; - } - } -}).catch((err: unknown) => { - process.stdout.write( - `Error: ${(err as Error).message}\n${(err as Error).stack}\n`, - ); - process.exit(1); -}); diff --git a/examples/deep-research/reranker.ts b/examples/deep-research/reranker.ts deleted file mode 100644 index 118e17e..0000000 --- a/examples/deep-research/reranker.ts +++ /dev/null @@ -1,59 +0,0 @@ -import { Rerank } from "../../dist"; -import type { Chunk } from "./resources/types"; -import type { Reranker, ScoredResult } from "./tools/types"; - -export async function createReranker( - modelPath: string, - opts?: { nSeqMax?: number; nCtx?: number }, -): Promise { - const rerank = await Rerank.create({ modelPath, ...opts }); - - return { - score(query: string, chunks: Chunk[]): AsyncIterable { - const inner = rerank.score( - query, - chunks.map((c) => c.tokens), - 10, - ); - return { - [Symbol.asyncIterator](): AsyncIterator { - const it = inner[Symbol.asyncIterator](); - return { - async next(): Promise> { - const { value, done } = await it.next(); - if (done) - return { - value: undefined as unknown as ScoredResult, - done: true, - }; - return { - value: { - filled: value.filled, - total: 
value.total, - results: value.results.map((r) => ({ - file: chunks[r.index].resource, - heading: chunks[r.index].heading, - score: r.score, - startLine: chunks[r.index].startLine, - endLine: chunks[r.index].endLine, - })), - }, - done: false, - }; - }, - }; - }, - }; - }, - - async tokenizeChunks(chunks: Chunk[]): Promise { - for (const chunk of chunks) { - chunk.tokens = await rerank.tokenize(chunk.text); - } - }, - - dispose() { - rerank.dispose(); - }, - }; -} diff --git a/examples/deep-research/resources/files.ts b/examples/deep-research/resources/files.ts deleted file mode 100644 index 4004374..0000000 --- a/examples/deep-research/resources/files.ts +++ /dev/null @@ -1,73 +0,0 @@ -import * as fs from 'node:fs'; -import * as path from 'node:path'; -import { loadBinary } from '../../../dist'; -import type { Resource, Chunk } from './types'; - -interface Section { heading: string; level: number; startLine: number; endLine: number } -const { parseMarkdown } = loadBinary() as unknown as { parseMarkdown(text: string): Section[] }; - -export function loadResources(dir: string): Resource[] { - if (!fs.existsSync(dir)) { - process.stdout.write(`Error: corpus not found: ${dir}\n`); - process.exit(1); - } - const stat = fs.statSync(dir); - if (stat.isFile()) { - return [{ name: path.basename(dir), content: fs.readFileSync(dir, 'utf8') }]; - } - const files = fs.readdirSync(dir).filter((f) => f.endsWith('.md')); - if (!files.length) { - process.stdout.write(`Error: no .md files in: ${dir}\n`); - process.exit(1); - } - return files.map((f) => ({ - name: f, - content: fs.readFileSync(path.join(dir, f), 'utf8'), - })); -} - -/** Split plain text into chunks on blank-line paragraph boundaries */ -function chunkByParagraph(res: Resource): Chunk[] { - const lines = res.content.split('\n'); - const chunks: Chunk[] = []; - let start = 0; - for (let i = 0; i <= lines.length; i++) { - const blank = i === lines.length || !lines[i].trim(); - if (blank && i > start) { - const text = 
lines.slice(start, i).join('\n').trim(); - if (text) { - chunks.push({ - resource: res.name, - heading: text.slice(0, 60).replace(/\n/g, ' ') + (text.length > 60 ? '…' : ''), - text, tokens: [], - startLine: start + 1, - endLine: i, - }); - } - } - if (blank) start = i + 1; - } - return chunks; -} - -export function chunkResources(resources: Resource[]): Chunk[] { - const out: Chunk[] = []; - for (const res of resources) { - const sections = parseMarkdown(res.content); - // Single section covering the whole file = no headings found → paragraph split - if (sections.length <= 1 && res.content.split('\n').length > 10) { - out.push(...chunkByParagraph(res)); - continue; - } - const lines = res.content.split('\n'); - for (const sec of sections) { - const text = lines.slice(sec.startLine - 1, sec.endLine).join('\n').trim(); - if (!text) continue; - out.push({ - resource: res.name, heading: sec.heading || res.name, text, tokens: [], - startLine: sec.startLine, endLine: sec.endLine, - }); - } - } - return out; -} diff --git a/examples/deep-research/resources/types.ts b/examples/deep-research/resources/types.ts deleted file mode 100644 index 17242b1..0000000 --- a/examples/deep-research/resources/types.ts +++ /dev/null @@ -1,10 +0,0 @@ -export interface Resource { name: string; content: string } - -export interface Chunk { - resource: string; - heading: string; - text: string; - tokens: number[]; - startLine: number; - endLine: number; -} diff --git a/examples/deep-research/tasks/eval.md b/examples/deep-research/tasks/eval.md deleted file mode 100644 index d555374..0000000 --- a/examples/deep-research/tasks/eval.md +++ /dev/null @@ -1,5 +0,0 @@ -You are a consistency checker. Compare the responses and determine if they convey the same core meaning. Output JSON only. ---- -Do these responses agree on the key points? 
- -{{responses}} diff --git a/examples/deep-research/tasks/plan.md b/examples/deep-research/tasks/plan.md deleted file mode 100644 index 05bba9a..0000000 --- a/examples/deep-research/tasks/plan.md +++ /dev/null @@ -1,3 +0,0 @@ -You break research queries into sub-questions. Output JSON only. ---- -Break this into {{count}} independent sub-questions for parallel research: "{{query}}" diff --git a/examples/deep-research/tasks/report.md b/examples/deep-research/tasks/report.md deleted file mode 100644 index 189a41b..0000000 --- a/examples/deep-research/tasks/report.md +++ /dev/null @@ -1,3 +0,0 @@ -You are a research reporter. Call the report tool with a concise summary (under 200 words) of the key findings from the research above. Focus on the most important discoveries and conclusions. ---- -Report your findings. diff --git a/examples/deep-research/tasks/research.md b/examples/deep-research/tasks/research.md deleted file mode 100644 index 60b25c2..0000000 --- a/examples/deep-research/tasks/research.md +++ /dev/null @@ -1,12 +0,0 @@ -You are a research assistant analyzing a knowledge base. Your tools: -- **grep**: regex pattern matching — use for precise, exhaustive retrieval -- **search**: semantic relevance ranking — use to discover related content -- **read_file**: read specific line ranges — use to verify and get context -- **report**: submit your final findings with evidence - -Process — follow every step in order: -1. Grep with short, simple patterns first. Use single keywords or two-word phrases — never combine multiple clauses with `.*`. Run multiple greps if needed. -2. Use search to discover content that grep may miss (different phrasing, synonyms). -3. Read every matching line with read_file to verify in context. Do not rely on grep/search summaries alone. -4. Grep again with a different pattern targeting what you have NOT yet found. This is a completeness check, not confirmation of existing results. -5. 
Report with line numbers and direct quotes as evidence. State what you found and what you checked. diff --git a/examples/deep-research/tasks/verify.md b/examples/deep-research/tasks/verify.md deleted file mode 100644 index 0713358..0000000 --- a/examples/deep-research/tasks/verify.md +++ /dev/null @@ -1,7 +0,0 @@ -Synthesize the research findings into a coherent, concise summary. ---- -Research findings: - -{{findings}} - -Synthesize these into a brief summary answering: "{{query}}" diff --git a/examples/deep-research/tools/grep.ts b/examples/deep-research/tools/grep.ts deleted file mode 100644 index bc3ae5f..0000000 --- a/examples/deep-research/tools/grep.ts +++ /dev/null @@ -1,67 +0,0 @@ -import { Tool } from '../../../dist/agents'; -import type { JsonSchema } from '../../../dist/agents'; -import type { Resource } from '../resources/types'; - -export class GrepTool extends Tool<{ pattern: string; ignoreCase?: boolean }> { - readonly name = 'grep'; - readonly description = 'Search the entire corpus for a regex pattern. Returns every matching line with line numbers and total match count. Complements search() which ranks by relevance — grep scans exhaustively.'; - readonly parameters: JsonSchema = { - type: 'object', - properties: { - pattern: { type: 'string', description: 'Regex pattern (e.g. "\\bshor\\b" for whole-word, "hidden_secret" for literal)' }, - ignoreCase: { type: 'boolean', description: 'Case-insensitive matching (default: true)' }, - }, - required: ['pattern'], - }; - - private _resources: Resource[]; - - constructor(resources: Resource[]) { - super(); - this._resources = resources; - } - - async execute(args: { pattern: string; ignoreCase?: boolean }): Promise { - const pattern = args.pattern?.trim(); - if (!pattern) return { error: 'pattern must not be empty' }; - const flags = (args.ignoreCase === false) ? 
'g' : 'gi'; - let re: RegExp; - try { re = new RegExp(pattern, flags); } - catch { return { error: `Invalid regex: ${pattern}` }; } - - const matches: { file: string; line: number; text: string }[] = []; - let totalMatches = 0; - - for (const res of this._resources) { - const lines = res.content.split('\n'); - for (let i = 0; i < lines.length; i++) { - const hits = lines[i].match(re); - if (hits) { - totalMatches += hits.length; - const raw = lines[i].trim(); - let text: string; - if (raw.length <= 200) { - text = raw; - } else { - const idx = raw.search(re); - const start = Math.max(0, idx - 40); - const end = Math.min(raw.length, start + 200); - text = (start > 0 ? '\u2026' : '') + raw.slice(start, end) + (end < raw.length ? '\u2026' : ''); - } - matches.push({ file: res.name, line: i + 1, text }); - } - } - } - - if (totalMatches === 0) { - return { - totalMatches: 0, matchingLines: 0, matches: [], - note: 'Zero matches does NOT mean the topic is absent \u2014 only that this exact pattern was not found. 
Try search() for semantic matching or a broader/simpler regex.', - }; - } - - const limit = 50; - const truncated = matches.length > limit; - return { totalMatches, matchingLines: matches.length, truncated, matches: matches.slice(0, limit) }; - } -} diff --git a/examples/deep-research/tools/index.ts b/examples/deep-research/tools/index.ts deleted file mode 100644 index 2145f44..0000000 --- a/examples/deep-research/tools/index.ts +++ /dev/null @@ -1,23 +0,0 @@ -import { createToolkit } from '../../../dist/agents'; -import type { Toolkit } from '../../../dist/agents'; -import type { Resource, Chunk } from '../resources/types'; -import type { Reranker } from './types'; -import { SearchTool } from './search'; -import { ReadFileTool } from './read-file'; -import { GrepTool } from './grep'; -import { ReportTool } from './report'; - -export const reportTool = new ReportTool(); - -export function createTools(opts: { - resources: Resource[]; - chunks: Chunk[]; - reranker: Reranker; -}): Toolkit { - return createToolkit([ - new SearchTool(opts.chunks, opts.reranker), - new ReadFileTool(opts.resources), - new GrepTool(opts.resources), - reportTool, - ]); -} diff --git a/examples/deep-research/tools/read-file.ts b/examples/deep-research/tools/read-file.ts deleted file mode 100644 index 164a5c5..0000000 --- a/examples/deep-research/tools/read-file.ts +++ /dev/null @@ -1,41 +0,0 @@ -import { Tool } from '../../../dist/agents'; -import type { JsonSchema } from '../../../dist/agents'; -import type { Resource } from '../resources/types'; - -export class ReadFileTool extends Tool<{ filename: string; startLine?: number; endLine?: number }> { - readonly name = 'read_file'; - readonly description = 'Read content from a file at specific line ranges. 
Use startLine/endLine from search results.'; - readonly parameters: JsonSchema; - - private _resources: Resource[]; - - constructor(resources: Resource[]) { - super(); - this._resources = resources; - this.parameters = { - type: 'object', - properties: { - filename: { - type: 'string', - description: 'Filename from search results', - enum: resources.map(r => r.name), - }, - startLine: { type: 'number', description: 'Start line (1-indexed, from search results)' }, - endLine: { type: 'number', description: 'End line (1-indexed, from search results)' }, - }, - required: ['filename'], - }; - } - - async execute(args: { filename: string; startLine?: number; endLine?: number } & Record): Promise { - const filename = args.filename || (args.path as string) || ''; - const file = this._resources.find(r => r.name === filename); - if (!file) { - return { error: `File not found: ${filename}. Available: ${this._resources.map(r => r.name).join(', ')}` }; - } - const lines = file.content.split('\n'); - const s = Math.max(0, (args.startLine ?? 1) - 1); - const e = Math.min(lines.length, args.endLine ?? Math.min(100, lines.length)); - return { file: file.name, content: lines.slice(s, e).join('\n') }; - } -} diff --git a/examples/deep-research/tools/report.ts b/examples/deep-research/tools/report.ts deleted file mode 100644 index 97f061a..0000000 --- a/examples/deep-research/tools/report.ts +++ /dev/null @@ -1,14 +0,0 @@ -import { Tool } from '../../../dist/agents'; -import type { JsonSchema } from '../../../dist/agents'; - -export class ReportTool extends Tool<{ findings: string }> { - readonly name = 'report'; - readonly description = 'Submit your final research findings. 
Call this when you have gathered enough information to answer the question.'; - readonly parameters: JsonSchema = { - type: 'object', - properties: { findings: { type: 'string', description: 'Your research findings and answer' } }, - required: ['findings'], - }; - - async execute(): Promise { return {}; } -} diff --git a/examples/deep-research/tools/search.ts b/examples/deep-research/tools/search.ts deleted file mode 100644 index 034bc55..0000000 --- a/examples/deep-research/tools/search.ts +++ /dev/null @@ -1,34 +0,0 @@ -import { Tool } from '../../../dist/agents'; -import type { JsonSchema, ToolContext } from '../../../dist/agents'; -import type { Chunk } from '../resources/types'; -import type { Reranker } from './types'; - -export class SearchTool extends Tool<{ query: string }> { - readonly name = 'search'; - readonly description = 'Search the knowledge base. Returns sections ranked by relevance with line ranges for read_file.'; - readonly parameters: JsonSchema = { - type: 'object', - properties: { query: { type: 'string', description: 'Search query' } }, - required: ['query'], - }; - - private _chunks: Chunk[]; - private _reranker: Reranker; - - constructor(chunks: Chunk[], reranker: Reranker) { - super(); - this._chunks = chunks; - this._reranker = reranker; - } - - async execute(args: { query: string }, context?: ToolContext): Promise { - const query = args.query?.trim(); - if (!query) return { error: 'query must not be empty' }; - let last; - for await (const { results, filled, total } of this._reranker.score(query, this._chunks)) { - if (context?.onProgress) context.onProgress({ filled, total }); - last = results; - } - return last; - } -} diff --git a/examples/deep-research/tools/types.ts b/examples/deep-research/tools/types.ts deleted file mode 100644 index 3f0012a..0000000 --- a/examples/deep-research/tools/types.ts +++ /dev/null @@ -1,21 +0,0 @@ -import type { Chunk } from '../resources/types'; - -export interface ScoredChunk { - file: string; - 
heading: string; - score: number; - startLine: number; - endLine: number; -} - -export interface ScoredResult { - results: ScoredChunk[]; - filled: number; - total: number; -} - -export interface Reranker { - score(query: string, chunks: Chunk[]): AsyncIterable; - tokenizeChunks(chunks: Chunk[]): Promise; - dispose(): void; -} diff --git a/examples/deep-research/tui.ts b/examples/deep-research/tui.ts deleted file mode 100644 index f720095..0000000 --- a/examples/deep-research/tui.ts +++ /dev/null @@ -1,500 +0,0 @@ -import * as fs from 'node:fs'; -import { each } from 'effection'; -import type { Channel, Operation } from 'effection'; -import type { AgentEvent, AgentPoolResult, DivergeResult } from '../../dist/agents'; -import type { AgreementResult } from './agreement'; - -// ── Event types ────────────────────────────────────────────────── - -export interface OpTiming { - label: string; - tokens: number; - detail: string; - timeMs: number; -} - -export type StepEvent = - | { type: 'query'; query: string; warm: boolean } - | { type: 'plan'; questions: string[]; tokenCount: number; timeMs: number } - | { type: 'research:start'; agentCount: number } - | { type: 'research:done'; pool: AgentPoolResult; timeMs: number } - | { type: 'verify:start'; count: number } - | { type: 'verify:done'; result: DivergeResult; timeMs: number } - | { type: 'verify:agreement'; result: AgreementResult } - | { type: 'eval:done'; converged: boolean | null; tokenCount: number; timeMs: number } - | { type: 'answer'; text: string } - | { type: 'response:start' } - | { type: 'response:text'; text: string } - | { type: 'response:done' } - | { type: 'stats'; timings: OpTiming[]; kvLine?: string; ctxPct: number; ctxPos: number; ctxTotal: number } - | { type: 'complete'; data: Record }; - -export type WorkflowEvent = AgentEvent | StepEvent; - -// ── Mode + color ───────────────────────────────────────────────── - -let _jsonlMode = false; -let _verboseMode = false; - -export function 
setJsonlMode(on: boolean): void { _jsonlMode = on; } -export function setVerboseMode(on: boolean): void { _verboseMode = on; } - -const isTTY = process.stdout.isTTY; - -export const c = isTTY ? { - bold: '\x1b[1m', dim: '\x1b[2m', reset: '\x1b[0m', - green: '\x1b[32m', cyan: '\x1b[36m', yellow: '\x1b[33m', red: '\x1b[31m', -} : { bold: '', dim: '', reset: '', green: '', cyan: '', yellow: '', red: '' }; - -// ── Primitives ─────────────────────────────────────────────────── - -let _statusText = ''; - -function status(text: string): void { - if (_jsonlMode || !isTTY) return; - _statusText = text; - process.stdout.write('\r\x1b[K' + text); -} - -function statusClear(): void { - if (!_statusText) return; - _statusText = ''; - process.stdout.write('\r\x1b[K'); -} - -export const log = (...a: unknown[]): void => { - if (_jsonlMode) return; - statusClear(); - console.log(...a); -}; - -function emit(event: string, data: Record): void { - if (_jsonlMode) console.log(JSON.stringify({ event, ...data })); -} - -export const fmtSize = (bytes: number): string => bytes > 1e9 - ? 
(bytes / 1e9).toFixed(1) + ' GB' - : (bytes / 1e6).toFixed(0) + ' MB'; - -const pad = (s: unknown, n: number): string => String(s).padStart(n); - -// ── View state + handler type ──────────────────────────────────── - -interface ViewState { - agentLabel: Map; - nextLabel: number; - agentText: Map; - agentStatus: Map; - agentParent: Map; // childId → parentId (sub-agent tracking) - traceQuery: string; -} - -type ViewHandler = (ev: WorkflowEvent) => void; - -function isSubAgent(state: ViewState, agentId: number): boolean { - return state.agentParent.has(agentId); -} - -function parentLabel(state: ViewState, agentId: number): string { - return label(state, state.agentParent.get(agentId)!); -} - -function label(state: ViewState, agentId: number): string { - let l = state.agentLabel.get(agentId); - if (!l) { l = `A${state.nextLabel++}`; state.agentLabel.set(agentId, l); } - return l; -} - -function resetLabels(state: ViewState): void { - state.nextLabel = 0; - state.agentLabel.clear(); - state.agentStatus.clear(); - state.agentText.clear(); - state.agentParent.clear(); -} - -function renderStatus(state: ViewState): void { - const active = [...state.agentStatus.entries()] - .filter(([id, s]) => s.state !== 'done' && !isSubAgent(state, id)); - if (active.length === 0) return; - - const generating = active.filter(([, s]) => s.state === 'gen'); - if (generating.length === 1 && active.length === 1) { - const [id] = generating[0]; - const raw = (state.agentText.get(id) ?? '').replace(/\n/g, ' ').trimStart(); - const cols = process.stdout.columns || 80; - const maxLen = cols - 12; - const text = raw.length > maxLen ? raw.slice(raw.length - maxLen) : raw; - status(` ${c.dim}\u25c6${c.reset} ${c.yellow}${label(state, id)}${c.reset} ${text}`); - return; - } - - const parts = active.map(([id, s]) => { - const lbl = `${c.yellow}${label(state, id)}${c.reset}`; - if (s.state === 'gen') return `${lbl}: ${s.tokenCount} tok`; - const detail = s.detail ? 
` ${s.detail}` : ''; - return `${lbl}: ${c.cyan}${s.state}${c.reset}${detail}`; - }); - status(` ${c.dim}\u25c6${c.reset} ${parts.join(' ')}`); -} - -// ── View handlers ──────────────────────────────────────────────── - -function queryHandler(state: ViewState, opts: ViewOpts): ViewHandler { - return (ev) => { - if (ev.type !== 'query') return; - state.traceQuery = ev.query; - if (!ev.warm) { - emit('start', { - model: opts.model, reranker: opts.reranker, query: ev.query, - agentCount: opts.agentCount, verifyCount: opts.verifyCount, chunks: opts.chunkCount, - }); - log(); - log(` ${c.dim}Query${c.reset}`); - log(` ${c.bold}${ev.query}${c.reset}`); - } - }; -} - -function planHandler(): ViewHandler { - return (ev) => { - if (ev.type !== 'plan') return; - emit('plan', { questions: ev.questions, planTokens: ev.tokenCount }); - log(`\n ${c.green}\u25cf${c.reset} ${c.bold}Plan${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`); - ev.questions.forEach((q: string, i: number) => log(` ${c.dim}${i + 1}.${c.reset} ${q}`)); - }; -} - -function agentHandler(state: ViewState): ViewHandler { - return (ev) => { - switch (ev.type) { - case 'agent:spawn': { - // If parent is a known labeled agent, this is a sub-agent - if (state.agentLabel.has(ev.parentAgentId)) { - state.agentParent.set(ev.agentId, ev.parentAgentId); - } - break; - } - case 'agent:produce': { - const sub = isSubAgent(state, ev.agentId); - state.agentText.set(ev.agentId, (state.agentText.get(ev.agentId) ?? 
'') + ev.text); - state.agentStatus.set(ev.agentId, { state: 'gen', tokenCount: ev.tokenCount, detail: '' }); - if (sub) break; // sub-agents: skip verbose/status output - if (_verboseMode) { - const lbl = label(state, ev.agentId); - if (ev.tokenCount === 1) { - statusClear(); - process.stdout.write(`\n ${c.dim}───${c.reset} ${c.yellow}${lbl}${c.reset} ${c.dim}tokens${c.reset} ${c.dim}───${c.reset}\n `); - } - process.stdout.write(ev.text); - } else { - renderStatus(state); - } - break; - } - case 'agent:tool_call': { - const sub = isSubAgent(state, ev.agentId); - if (_verboseMode && !sub) process.stdout.write('\n'); - state.agentText.delete(ev.agentId); - state.agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: '' }); - emit('tool_call', { agentId: ev.agentId, toolName: ev.tool, arguments: ev.args }); - let toolArgs: Record; - try { toolArgs = JSON.parse(ev.args); } catch { toolArgs = {}; } - const argSummary = ev.tool === 'search' - ? `"${toolArgs.query || ''}"` - : ev.tool === 'grep' - ? `/${toolArgs.pattern || ''}/` - : ev.tool === 'report' ? '' - : `${toolArgs.filename}` + (toolArgs.startLine ? ` L${toolArgs.startLine}-${toolArgs.endLine}` : ''); - if (sub) { - const plbl = `${c.yellow}${parentLabel(state, ev.agentId)}${c.reset}`; - log(` ${c.dim}\u2502${c.reset} ${c.dim}\u2514${c.reset} ${plbl} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`); - } else { - log(` ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.cyan}${ev.tool}${c.reset}${argSummary ? `(${argSummary})` : ''}`); - } - break; - } - case 'agent:tool_result': { - emit('tool_result', { - agentId: ev.agentId, toolName: ev.tool, - result: ev.result.length > 200 ? ev.result.slice(0, 200) + '...' 
: ev.result, - }); - let preview = ''; - if (ev.tool === 'read_file') { - try { - const firstLine = (JSON.parse(ev.result) as { content: string }).content.split('\n').find((l: string) => l.trim()); - if (firstLine) preview = ` \u00b7 ${firstLine.trim().slice(0, 60)}${firstLine.trim().length > 60 ? '\u2026' : ''}`; - } catch { /* non-fatal */ } - } else if (ev.tool === 'search') { - try { - const top = (JSON.parse(ev.result) as { heading: string }[])[0]; - if (top?.heading) preview = ` \u00b7 ${top.heading}`; - } catch { /* non-fatal */ } - } else if (ev.tool === 'grep') { - try { - const r = JSON.parse(ev.result) as { totalMatches: number; matchingLines: number }; - preview = ` \u00b7 ${r.totalMatches} matches in ${r.matchingLines} lines`; - } catch { /* non-fatal */ } - } - if (isSubAgent(state, ev.agentId)) { - const plbl = `${c.yellow}${parentLabel(state, ev.agentId)}${c.reset}`; - log(` ${c.dim}\u2502${c.reset} ${c.dim}\u2514${c.reset} ${plbl} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`); - } else { - log(` ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, ev.agentId)}${c.reset} ${c.dim}\u2190 ${ev.tool} ${ev.result.length}b${preview}${c.reset}`); - } - break; - } - case 'agent:tool_progress': { - state.agentStatus.set(ev.agentId, { state: ev.tool, tokenCount: 0, detail: `${ev.filled}/${ev.total}` }); - renderStatus(state); - break; - } - case 'agent:report': { - state.agentStatus.set(ev.agentId, { state: 'done', tokenCount: 0, detail: '' }); - const sub = isSubAgent(state, ev.agentId); - const cols = process.stdout.columns || 80; - const displayLabel = sub ? parentLabel(state, ev.agentId) : label(state, ev.agentId); - const lbl = `${c.yellow}${displayLabel}${c.reset}`; - const indent = sub ? ` ${c.dim}\u2502${c.reset} ` : ' '; - const prefix = `${indent}${c.dim}\u2502${c.reset} `; - const wrap = cols - (sub ? 
11 : 8); - - log(`${indent}${c.dim}\u2502${c.reset}`); - log(`${indent}${c.dim}\u251c\u2500\u2500${c.reset} ${lbl} ${c.bold}findings${c.reset}`); - - for (const para of ev.findings.split('\n')) { - if (!para.trim()) { log(prefix); continue; } - const words = para.split(/\s+/); - let line = ''; - for (const word of words) { - if (line && line.length + 1 + word.length > wrap) { - log(`${prefix}${c.dim}${line}${c.reset}`); - line = word; - } else { - line = line ? `${line} ${word}` : word; - } - } - if (line) log(`${prefix}${c.dim}${line}${c.reset}`); - } - log(`${indent}${c.dim}\u2502${c.reset}`); - break; - } - case 'agent:done': - if (_verboseMode && !isSubAgent(state, ev.agentId)) process.stdout.write('\n'); - break; - } - }; -} - -function researchSummaryHandler(state: ViewState): ViewHandler { - function flushTrace(pool: AgentPoolResult): void { - if (!pool.agents.some(a => a.trace?.length)) return; - const filename = `trace-${Date.now()}.json`; - fs.writeFileSync(filename, JSON.stringify({ - query: state.traceQuery, - timestamp: new Date().toISOString(), - agents: pool.agents.map(a => ({ - agentId: a.agentId, label: label(state, a.agentId), - ppl: a.ppl, samplingPpl: a.samplingPpl, - tokenCount: a.tokenCount, toolCallCount: a.toolCallCount, - findings: a.findings, trace: a.trace ?? [], - })), - }, null, 2)); - log(` ${c.dim}Trace written to ${filename}${c.reset}`); - } - - return (ev) => { - switch (ev.type) { - case 'research:start': { - log(`\n ${c.green}\u25cf${c.reset} ${c.bold}Research${c.reset} ${c.dim}${ev.agentCount} agents${c.reset}`); - resetLabels(state); - break; - } - case 'research:done': { - statusClear(); - ev.pool.agents.forEach((a, i) => { - const tree = i === ev.pool.agents.length - 1 ? '\u2514' : '\u251c'; - emit('agent_done', { - index: i, findings: (a.findings || '').slice(0, 500), - toolCalls: a.toolCallCount, tokenCount: a.tokenCount, - ppl: a.ppl, samplingPpl: a.samplingPpl, - }); - const raw = (state.agentText.get(a.agentId) ?? 
'').replace(/\n/g, ' ').trim(); - if (raw) log(` ${c.dim}\u251c${c.reset} ${c.yellow}${label(state, a.agentId)}${c.reset} ${c.dim}\u25b8 ${raw.slice(0, 120)}${raw.length > 120 ? '\u2026' : ''}${c.reset}`); - const pplStr = Number.isFinite(a.ppl) ? ` \u00b7 ppl ${a.ppl.toFixed(2)}` : ''; - log(` ${c.dim}${tree}${c.reset} ${c.yellow}${label(state, a.agentId)}${c.reset} ${c.green}done${c.reset} ${c.dim}${a.tokenCount} tok \u00b7 ${a.toolCallCount} tools${pplStr}${c.reset}`); - }); - log(` ${c.dim}${ev.pool.totalTokens} tok \u00b7 ${ev.pool.totalToolCalls} tools \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`); - flushTrace(ev.pool); - break; - } - } - }; -} - -function verifyHandler(): ViewHandler { - let pendingAgreement: AgreementResult | null = null; - - return (ev) => { - switch (ev.type) { - case 'verify:start': { - log(`\n ${c.green}\u25cf${c.reset} ${c.bold}Verify${c.reset} ${c.dim}${ev.count} attempts${c.reset}`); - pendingAgreement = null; - break; - } - case 'verify:agreement': { - pendingAgreement = ev.result; - emit('verify_agreement', { - overall: ev.result.overall, - sections: ev.result.sections.map(s => ({ label: s.label, score: s.score })), - }); - break; - } - case 'verify:done': { - ev.result.attempts.forEach((a, i) => { - const tree = i === ev.result.attempts.length - 1 - ? (pendingAgreement ? 
'\u251c' : '\u2514') - : '\u251c'; - emit('attempt_done', { index: i, output: a.output.trim().slice(0, 500), tokenCount: a.tokenCount, ppl: a.ppl }); - log(` ${c.dim}${tree} ${a.tokenCount} tok \u00b7 ppl ${a.ppl.toFixed(2)}${c.reset}`); - }); - if (pendingAgreement && pendingAgreement.sections.length > 0) { - const pct = Math.round(pendingAgreement.overall * 100); - log(` ${c.dim}\u251c${c.reset} Agreement: ${c.bold}${pct}%${c.reset}`); - const sorted = [...pendingAgreement.sections].sort((a, b) => b.score - a.score); - const show = sorted.slice(0, 5); - const maxLabelLen = Math.max(...show.map(s => s.label.length)); - show.forEach((s, i) => { - const tree = i === show.length - 1 && sorted.length <= 5 ? '\u2514' : '\u251c'; - const filled = Math.round(s.score * 10); - const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(10 - filled); - const sPct = pad(Math.round(s.score * 100), 3); - const label = `"${s.label}"`.padEnd(maxLabelLen + 2); - log(` ${c.dim}${tree}${c.reset} ${c.dim}${label}${c.reset} ${sPct}% ${bar}`); - }); - if (sorted.length > 5) { - log(` ${c.dim}\u2514 \u2026 ${sorted.length - 5} more${c.reset}`); - } - } - log(` ${c.dim}${ev.result.totalTokens} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`); - pendingAgreement = null; - break; - } - } - }; -} - -function evalHandler(): ViewHandler { - return (ev) => { - if (ev.type !== 'eval:done') return; - emit('convergence', { converged: ev.converged, evalTokens: ev.tokenCount }); - const verdict = ev.converged === true ? `${c.green}yes${c.reset}` - : ev.converged === false ? 
`${c.red}no${c.reset}` - : `${c.yellow}unknown${c.reset}`; - log(`\n ${c.green}\u25cf${c.reset} ${c.bold}Eval${c.reset} ${c.dim}${ev.tokenCount} tok \u00b7 ${(ev.timeMs / 1000).toFixed(1)}s${c.reset}`); - log(` Converged: ${verdict}`); - }; -} - -function answerHandler(): ViewHandler { - return (ev) => { - if (ev.type !== 'answer') return; - log(`\n ${c.dim}${'\u2500'.repeat(58)}${c.reset}\n`); - const prose = ev.text.trim() - .replace(/\*\*(.+?)\*\*/g, `${c.bold}$1${c.reset}`) - .split('\n').map((l: string) => ` ${l}`).join('\n'); - log(prose); - }; -} - -function responseHandler(): ViewHandler { - return (ev) => { - switch (ev.type) { - case 'response:start': - process.stdout.write(` ${c.dim}<${c.reset} `); - break; - case 'response:text': - process.stdout.write(ev.text); - break; - case 'response:done': - console.log('\n'); - break; - } - }; -} - -function statsHandler(): ViewHandler { - return (ev) => { - if (ev.type !== 'stats') return; - const { timings, kvLine, ctxPct, ctxPos, ctxTotal } = ev; - const totalTokens = timings.reduce((s, p) => s + p.tokens, 0); - const totalMs = timings.reduce((s, p) => s + p.timeMs, 0); - - log(`\n ${c.dim}${'\u2501'.repeat(58)}${c.reset}`); - for (const p of timings) { - const left = `${p.label.padEnd(10)} ${pad(p.tokens, 5)} tok`; - const detail = p.detail ? ` ${p.detail}` : ''; - const right = p.timeMs > 0 ? 
`${pad((p.timeMs / 1000).toFixed(1), 6)}s` : ''; - log(` ${c.dim}${left}${detail}${' '.repeat(Math.max(1, 58 - left.length - detail.length - right.length))}${right}${c.reset}`); - } - log(` ${c.dim}${'\u2501'.repeat(58)}${c.reset}`); - log(` ${c.bold}Total${c.reset} ${c.bold}${pad(totalTokens, 5)}${c.reset} tok ${c.bold}${pad((totalMs / 1000).toFixed(1), 6)}s${c.reset}`); - if (kvLine) log(` ${c.dim}${kvLine}${c.reset}`); - if (ctxPct != null && ctxPos != null && ctxTotal != null) { - const ctxStr = `ctx: ${ctxPct}% (${ctxPos.toLocaleString()}/${ctxTotal.toLocaleString()})`; - log(` ${c.dim}${'\u2501'.repeat(58)}${c.reset}`); - log(` ${c.dim}${' '.repeat(58 - ctxStr.length)}${ctxStr}${c.reset}`); - } - log(); - }; -} - -function completeHandler(): ViewHandler { - return (ev) => { - if (ev.type !== 'complete') return; - emit('complete', ev.data); - }; -} - -// ── createView — composable view factory ───────────────────────── - -export interface ViewOpts { - model: string; - reranker: string; - agentCount: number; - verifyCount: number; - chunkCount: number; -} - -export function createView(opts: ViewOpts) { - const state: ViewState = { - agentLabel: new Map(), - nextLabel: 0, - agentText: new Map(), - agentStatus: new Map(), - agentParent: new Map(), - traceQuery: '', - }; - - const handlers: ViewHandler[] = [ - queryHandler(state, opts), - planHandler(), - agentHandler(state), - researchSummaryHandler(state), - verifyHandler(), - evalHandler(), - answerHandler(), - responseHandler(), - statsHandler(), - completeHandler(), - ]; - - return { - *subscribe(events: Channel): Operation { - for (const ev of yield* each(events)) { - for (const h of handlers) h(ev); - yield* each.next(); - } - }, - }; -} diff --git a/package-lock.json b/package-lock.json index e618748..cdf46c5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,8 +9,9 @@ "version": "1.6.0", "license": "Apache-2.0", "dependencies": { + "@lloyal-labs/lloyal-agents": "^1.0.0", + "@lloyal-labs/sdk": 
"^1.0.0", "@lloyal-labs/tsampler": "^0.2.0", - "effection": "^4.0.2", "node-addon-api": "^8.5.0" }, "devDependencies": { @@ -528,6 +529,16 @@ "node": ">=18.0.0" } }, + "node_modules/@lloyal-labs/lloyal-agents": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal-agents/-/lloyal-agents-1.0.0.tgz", + "integrity": "sha512-wBoUH8xxhV+qvfvlfFvqMETuOggVA8o3qIfN2c9ffyu22+lWSUKESto++2OqzROYlig7YQBPop7+ar+o4yjN/w==", + "license": "Apache-2.0", + "dependencies": { + "@lloyal-labs/sdk": "*", + "effection": "^4.0.2" + } + }, "node_modules/@lloyal-labs/lloyal.node-darwin-arm64": { "version": "1.6.0", "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-arm64/-/lloyal.node-darwin-arm64-1.6.0.tgz", @@ -697,6 +708,12 @@ "win32" ] }, + "node_modules/@lloyal-labs/sdk": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@lloyal-labs/sdk/-/sdk-1.0.0.tgz", + "integrity": "sha512-Xu0LeSAgc+V4jeOXbtl+cgBHZE1LX31q0k0eQWJ8FX3LECYKIYv97SW9dkYNsxoOC8BJWQYPAtOyJbScgE6rzw==", + "license": "Apache-2.0" + }, "node_modules/@lloyal-labs/tsampler": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/@lloyal-labs/tsampler/-/tsampler-0.2.0.tgz", diff --git a/package.json b/package.json index 8cd4138..0d86e96 100644 --- a/package.json +++ b/package.json @@ -21,7 +21,6 @@ "docs": "npx typedoc", "test": "npm run test:integration", "test:integration": "npx tsx test/integration.ts", - "test:agents": "npx tsx test/agents.ts", "test:examples": "npx tsx test/examples.ts", "sync:llama-cpp": "node scripts/sync-llama-cpp.js", "example": "npx tsx examples/chat/chat.ts" @@ -47,8 +46,9 @@ }, "homepage": "https://github.com/lloyal-ai/lloyal.node#readme", "dependencies": { + "@lloyal-labs/lloyal-agents": "^1.0.0", + "@lloyal-labs/sdk": "^1.0.0", "@lloyal-labs/tsampler": "^0.2.0", - "effection": "^4.0.2", "node-addon-api": "^8.5.0" }, "devDependencies": { diff --git a/src/Branch.ts b/src/Branch.ts deleted file mode 100644 index 
e44300e..0000000 --- a/src/Branch.ts +++ /dev/null @@ -1,650 +0,0 @@ -import type { SessionContext, SamplingParams, Produced, GrammarTrigger } from './types'; -import { GrammarTriggerType } from './types'; - -/** - * Forkable inference handle for covalent generation - * - * A Branch owns everything needed for independent generation: a KV cache - * sequence, sampler chain, logits snapshot, and perplexity tracker. - * - * Forking is cheap — the KV prefix is shared in memory (metadata-only operation under unified KV — - * no KV tensor buffers are copied), so sibling branches read from the same physical KV entries. - * Only tokens decoded after the fork point are exclusive to each branch. - * - * Branches form trees, not just flat lists. Fork from root for best-of-N, - * fork from children for tree search/beam search, fork from a draft for speculative - * decoding. - * - * The produce/commit protocol separates sampling from state advancement: - * produce() samples without writing to KV, letting you inspect the result - * before deciding to commit(). - * - * @example Best-of-N with perplexity selection - * ```typescript - * const root = Branch.create(ctx, tokens.length, { temperature: 0.8 }); - * await root.prefill(tokens); - * - * const results = []; - * for (let i = 0; i < 5; i++) { - * const branch = await root.fork(); - * branch.reseedSampler(1000 + i); - * const tokens = []; - * for await (const { token } of branch) tokens.push(token); - * results.push({ branch, tokens, ppl: branch.perplexity }); - * } - * - * const best = results.reduce((a, b) => a.ppl < b.ppl ? 
a : b); - * for (const r of results) { if (r !== best) await r.branch.prune(); } - * ``` - * - * @category Branching - */ -export class Branch { - private _ctx: SessionContext; - private _handle: number; - private _disposed: boolean; - - constructor(ctx: SessionContext, handle: number) { - this._ctx = ctx; - this._handle = handle; - this._disposed = false; - } - - /** - * Create a root branch at the given position - * - * The branch takes ownership of the sequence and creates its own sampler - * chain from the provided params. Call prefill() to decode prompt tokens - * and capture the logit distribution before forking. - * - * @param ctx - SessionContext to create branch on - * @param position - Starting position (typically prompt token count) - * @param params - Sampling parameters (temperature, topP, etc.) - * @param nBatch - Per-branch batch size override (defaults to context nBatch). - * Controls chunk size for prefill(). Has no effect on - * single-token commit() which uses a zero-allocation fast path. - * @param grammar - GBNF grammar string for constrained generation. - * When provided, sample() returns only grammar-valid tokens. The grammar state - * is cloned on fork(), so sibling branches can diverge independently. - * @returns New Branch instance - */ - static create( - ctx: SessionContext, - position: number, - params?: SamplingParams, - nBatch?: number, - grammar?: string - ): Branch { - const handle = ctx._branchCreate(position, params, nBatch, grammar); - return new Branch(ctx, handle); - } - - /** - * Fork this branch to a new sequence (async) - * - * Async contract: local branches resolve immediately; cloud branches - * may perform an HTTP round-trip. Use {@link forkSync} when you know - * the branch is local and want zero-overhead forking. 
- * - * @returns New forked Branch - */ - async fork(): Promise { - return this.forkSync(); - } - - /** - * Fork this branch to a new sequence (sync) - * - * The child shares the parent's KV prefix in memory (metadata-only under unified KV, no KV buffer copy). - * Logits, sampler state, and perplexity tracker are cloned so the child - * can diverge independently. Fork from any branch — root or intermediate — - * to build arbitrarily deep trees. - * - * Call reseedSampler() on each child for stochastic diversity. - * - * @returns New forked Branch - */ - forkSync(): Branch { - this._ensureNotDisposed(); - const newHandle = this._ctx._branchFork(this._handle); - return new Branch(this._ctx, newHandle); - } - - /** - * Get a copy of this branch's captured logits snapshot. - * - * Returns n_vocab floats — the raw logit distribution from the last - * prefill() or commit() call. - * - * Returns an independent copy of the branch's internal snapshot. - * The returned Float32Array is safe to hold across async boundaries - * and is not affected by subsequent decode operations. - * - * @returns Independent copy of the logits snapshot (n_vocab elements) - * @throws If no logits have been captured yet - */ - getLogits(): Float32Array { - this._ensureNotDisposed(); - return this._ctx._branchGetLogits(this._handle); - } - - /** - * Bulk-decode tokens into the branch's KV cache and capture logits. - * - * `tokens.length` is the total count to process; the branch's `nBatch` - * (set at `Branch.create`) controls how many are sent per `llama_decode` - * call. E.g. 500 tokens with `nBatch=64` → 8 calls (7×64 + 1×52). - * - * Advances `position` by `tokens.length`. Stores final logits into the - * branch's internal snapshot — the next `produce()`/`sample()` reads - * from it. - * - * Does NOT accept tokens into the repeat-penalty window — for external - * tokens (user input between turns), not model-generated tokens. - * For model output, use `commit()` which does accept + decode. 
- * - * The primary way to feed tokens into a branch's KV cache. - * - * @param tokens - Token IDs to decode - */ - async prefill(tokens: number[]): Promise { - this._ensureNotDisposed(); - await this._ctx._branchPrefill(this._handle, tokens); - } - - /** - * Sample next token from branch's logits snapshot - * - * Applies the branch's full sampler chain (top-k, top-p, temperature, - * repeat/presence penalties) to the captured logits. - * - * @returns Sampled token ID - */ - sample(): number { - this._ensureNotDisposed(); - return this._ctx._branchSample(this._handle); - } - - /** - * Record token in the sampler's repeat/presence penalty window - * - * @param token - Token to accept - */ - accept(token: number): void { - this._ensureNotDisposed(); - this._ctx._branchAccept(this._handle, token); - } - - /** - * Discard this branch (async) - * - * Async contract: local branches resolve immediately; cloud branches - * may perform an HTTP round-trip. Use {@link pruneSync} when you know - * the branch is local. - * - * RESTRICT mode: throws if children exist. Use {@link pruneSubtree} to - * cascade-delete an entire subtree. - */ - async prune(): Promise { - this.pruneSync(); - } - - /** - * Discard this branch — remove its divergent KV entries and free the handle (sync) - * - * Only removes KV entries divergent from the shared prefix; sibling branches - * are unaffected. The disposed flag is set synchronously — any call to - * produce(), commit(), etc. after prune() will throw immediately. - * - * RESTRICT mode: throws if children exist. Use {@link pruneSubtreeSync} to - * cascade-delete an entire subtree. - */ - pruneSync(): void { - if (this._disposed) return; - const kids = this.children; - if (kids.length > 0) { - throw new Error( - `Branch.prune(): branch ${this._handle} has ${kids.length} active child(ren) ` + - `[${kids.join(', ')}]. 
Prune children first or use pruneSubtree().`, - ); - } - this._ctx._branchPrune(this._handle); - this._disposed = true; - } - - /** - * Discard this branch and all its descendants (async) - * - * Async contract: local branches resolve immediately; cloud branches - * may perform an HTTP round-trip. Use {@link pruneSubtreeSync} when you know - * the branch is local. - */ - async pruneSubtree(): Promise { - this.pruneSubtreeSync(); - } - - /** - * Discard this branch and all its descendants — CASCADE delete (sync) - * - * Iterative post-order traversal: prunes children first, then this branch. - * Use when tearing down an entire subtree (e.g. abandoned search path). - * Sets disposed synchronously. - */ - pruneSubtreeSync(): void { - if (this._disposed) return; - this._ctx._branchPruneSubtree(this._handle); - this._disposed = true; - } - - /** - * Reseed the sampler's PRNG for diversity after fork() - * - * CRITICAL for parallel generation: Without reseeding, all forked branches - * produce identical outputs because they share the same PRNG state. - * - * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged. - * - * @param seed - New seed for the PRNG - */ - reseedSampler(seed: number): void { - this._ensureNotDisposed(); - this._ctx._branchSamplerChainReseed(this._handle, seed); - } - - /** - * Apply dynamic logit adjustments for this branch only - * - * Unlike `logit_bias` in sampling params (which is cloned on fork), steer biases - * are NOT inherited by child branches. Each branch manages its own steer state - * independently. This makes steer ideal for path-dependent constraints. 
- * - * **Use cases:** - * - **tsampler**: Block tokens that would create repeated N-grams based on - * this branch's specific generation history - * - **Diverse beam search**: Penalize tokens already chosen by sibling beams - * to encourage output diversity across the beam - * - **Dynamic constraints**: Apply token restrictions that change per-step - * - * **Sampling order:** Grammar → Logit Bias → Steer → Sampler Chain - * - * @param biases - Array of token adjustments. Use `-Infinity` to completely - * block a token, positive values to boost probability, negative to reduce. - * - * @example Block tokens for N-gram deduplication (tsampler pattern) - * ```ts - * // Compute which tokens would create repeated 4-grams - * const blocked = computeNgramBlocks(generatedTokens, n=4); - * - * // Block those tokens for this sample only - * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity }))); - * - * const { token } = await branch.produce(); // Blocked tokens won't be sampled - * await branch.commit(token); - * - * // Clear for next iteration (recompute based on new history) - * branch.clearSteer(); - * ``` - * - * @example Diverse beam search - * ```ts - * // Each beam penalizes tokens chosen by siblings this step - * for (const beam of beams) { - * // Collect tokens chosen by other beams - * const siblingTokens = beams - * .filter(b => b !== beam && b.lastToken !== undefined) - * .map(b => b.lastToken); - * - * // Penalize sibling choices to encourage diversity - * beam.branch.steer(siblingTokens.map(t => ({ token: t, bias: -2.0 }))); - * - * const { token } = await beam.branch.produce(); - * await beam.branch.commit(token); - * beam.lastToken = token; - * beam.branch.clearSteer(); - * } - * ``` - * - * @example Boost specific tokens - * ```ts - * // Boost "yes" and "no" tokens for a yes/no question - * branch.steer([ - * { token: yesTokenId, bias: 5.0 }, - * { token: noTokenId, bias: 5.0 } - * ]); - * ``` - */ - steer(biases: Array<{ token: number; bias: 
number }>): void { - this._ensureNotDisposed(); - this._ctx._branchSteer(this._handle, biases); - } - - /** - * Clear all steer biases from this branch - * - * Removes any dynamic logit adjustments set by `steer()`. Call this after - * each generation step if your steer constraints are computed per-step - * (e.g., N-gram blocking where the blocked set changes as text grows). - * - * @example Per-step steer pattern - * ```ts - * for (let i = 0; i < maxTokens; i++) { - * // Compute constraints based on current state - * const blocked = computeConstraints(generatedTokens); - * branch.steer(blocked.map(t => ({ token: t, bias: -Infinity }))); - * - * const { token, isStop } = await branch.produce(); - * if (isStop) break; - * - * await branch.commit(token); - * branch.clearSteer(); // Reset for next iteration - * generatedTokens.push(token); - * } - * ``` - */ - clearSteer(): void { - this._ensureNotDisposed(); - this._ctx._branchClearSteer(this._handle); - } - - /** - * Replace the sampler chain with new parameters (memoized) - * - * If the new params match the current chain's params, this is a no-op. - * Otherwise the old chain is freed and a new one is created. Use for - * Entropy-Driven Temperature (EDT) and other adaptive sampling strategies - * that adjust parameters per-step. - * - * @param params - New sampling parameters - * - * @example Entropy-Driven Temperature - * ```typescript - * const entropy = branch.modelEntropy('nats'); - * branch.setSamplerParams({ temperature: edtTemperature(entropy) }); - * const { token } = await branch.produce(); - * await branch.commit(token); - * ``` - */ - setSamplerParams(params: SamplingParams): void { - this._ensureNotDisposed(); - this._ctx._branchSetSamplerParams(this._handle, params); - } - - /** - * Replace or remove the grammar constraint - * - * Pass a GBNF grammar string to constrain generation. Pass empty string - * or undefined to remove the constraint. 
The grammar state is cloned on - * fork(), so sibling branches can diverge independently after hot-swap. - * - * @param grammarStr - GBNF grammar string, or empty/undefined to remove - * - * @example Hot-swap grammar mid-generation - * ```typescript - * // Start unconstrained, then switch to JSON after detecting tool call - * branch.setGrammar(jsonGrammar); - * const { token } = await branch.produce(); - * ``` - */ - setGrammar(grammarStr?: string): void { - this._ensureNotDisposed(); - this._ctx._branchSetGrammar(this._handle, grammarStr || ''); - } - - /** - * Set lazy grammar — unconstrained until trigger, then grammar-constrained - * - * Generation runs freely until a trigger pattern or token fires, at which - * point the grammar activates and constrains subsequent tokens. Used for - * tool-call generation: model writes freely until ``, then - * grammar forces valid XML structure. - * - * The grammar state is cloned on fork(), so sibling branches can diverge - * independently. Call again after a tool result prefill to reset. - * - * @param grammar - GBNF grammar string - * @param triggers - Trigger conditions from formatChat().grammarTriggers - */ - setGrammarLazy(grammar: string, triggers: GrammarTrigger[]): void { - this._ensureNotDisposed(); - const escapeRegex = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); - const patterns: string[] = []; - const tokens: number[] = []; - for (const t of triggers) { - switch (t.type) { - case GrammarTriggerType.WORD: - patterns.push(escapeRegex(t.value)); - break; - case GrammarTriggerType.PATTERN: - patterns.push(t.value); - break; - case GrammarTriggerType.PATTERN_FULL: { - const p = t.value; - patterns.push((p[0] !== '^' ? '^' : '') + p + (p[p.length - 1] !== '$' ? 
'$' : '')); - break; - } - case GrammarTriggerType.TOKEN: - tokens.push(t.token); - break; - } - } - this._ctx._branchSetGrammarLazy(this._handle, grammar, patterns, tokens); - } - - /** - * Sample next token without advancing state (async) - * - * Async contract: local branches resolve immediately; cloud branches - * may perform an HTTP round-trip. Use {@link produceSync} when you know - * the branch is local and want zero-overhead sampling. - */ - async produce(): Promise { - return this.produceSync(); - } - - /** - * Sample next token without advancing state (sync) - * - * Same as {@link produce} but synchronous. Use when you know the branch - * is local and want to avoid the microtick overhead of a promise. - */ - produceSync(): Produced { - this._ensureNotDisposed(); - const token = this.sample(); - return { - token, - text: this._ctx.tokenToText(token), - isStop: this._ctx.isStopToken(token), - }; - } - - /** - * Accept and decode — update branch state, then write token to KV - * - * Accepts the token into the sampler penalty window (for correct PPL - * measurement), then decodes (writing to KV cache via AsyncWorker on - * the libuv thread pool) and captures the resulting logits for the next - * produce() call. Accept-first ordering with rollback: if decode throws, - * sampler/grammar/metrics are restored from clones. - * - * @param token Token to commit (from produce()) - */ - async commit(token: number): Promise { - this._ensureNotDisposed(); - await this._ctx._storeCommit([this._handle], [token]); - } - - // ===== METRICS ===== - - /** - * Compute entropy of the branch's logits distribution - * - * Measures model uncertainty from the branch's captured logits snapshot: - * - Low entropy: Model is confident (peaked distribution) - * - High entropy: Model is uncertain (flat distribution) - * - * Operates directly on `state->logits_snapshot` — no JS round-trip. 
- * - * @param base - Logarithm base: "nats" (default) or "bits" - * @returns Entropy value in specified base - * - * COST: O(n_vocab) - must sum over all token probabilities - */ - modelEntropy(base: 'nats' | 'bits' = 'nats'): number { - this._ensureNotDisposed(); - return this._ctx._branchModelEntropy(this._handle, base); - } - - /** - * Compute surprisal (negative log-likelihood) for a specific token - * - * Measures how "surprising" the model finds the given token from - * the branch's captured logits snapshot: - * - Low surprisal: Model expected this token (high probability) - * - High surprisal: Model didn't expect this token (low probability) - * - * Operates directly on `state->logits_snapshot` — no JS round-trip. - * - * @param token - Token ID to compute surprisal for - * @param base - Logarithm base: "nats" (default) or "bits" - * @returns Surprisal value in specified base - * - * COST: O(n_vocab) - softmax normalization required - */ - modelSurprisal(token: number, base: 'nats' | 'bits' = 'nats'): number { - this._ensureNotDisposed(); - return this._ctx._branchModelSurprisal(this._handle, token, base); - } - - /** - * Sampling-level perplexity (from filtered distribution) - * - * Returns perplexity from the distribution actually sampled from - * (after top-k/p/temp/penalties). Useful for policy priors and - * monitoring sampler chain impact. - * - * Compare with {@link perplexity} which is model-level (raw logits). - */ - get samplingPerplexity(): number { - this._ensureNotDisposed(); - return this._ctx._branchGetSamplingPerplexity(this._handle); - } - - /** - * Set static logit biases on this branch - * - * Unlike {@link steer} (which is NOT inherited on fork), logit biases - * ARE cloned when forking. Use for persistent constraints that should - * propagate to child branches. - * - * Applied during sample() in order: Grammar -> Logit Bias -> Steer -> Sampler Chain - * - * @param biases - Array of token adjustments. 
Use `-Infinity` to block, - * positive to boost, negative to reduce. - */ - setLogitBias(biases: Array<{ token: number; bias: number }>): void { - this._ensureNotDisposed(); - this._ctx._branchSetLogitBias(this._handle, biases); - } - - /** - * Clear all static logit biases from this branch - */ - clearLogitBias(): void { - this._ensureNotDisposed(); - this._ctx._branchClearLogitBias(this._handle); - } - - // ===== ACCESSORS ===== - - /** Branch's current position (number of tokens decoded) */ - get position(): number { - this._ensureNotDisposed(); - return this._ctx._branchGetPosition(this._handle); - } - - /** Branch's perplexity (exp of mean surprisal) */ - get perplexity(): number { - this._ensureNotDisposed(); - return this._ctx._branchGetPerplexity(this._handle); - } - - /** Internal handle (for debugging) */ - get handle(): number { - return this._handle; - } - - /** Whether this branch has been disposed */ - get disposed(): boolean { - return this._disposed; - } - - /** Parent branch handle, or null if root */ - get parent(): number | null { - this._ensureNotDisposed(); - const h = this._ctx._branchParent(this._handle); - return h === 0 ? null : h; - } - - /** Child branch handles */ - get children(): number[] { - this._ensureNotDisposed(); - return this._ctx._branchChildren(this._handle); - } - - /** True if this branch has no children */ - get isLeaf(): boolean { - this._ensureNotDisposed(); - return this._ctx._branchIsLeaf(this._handle); - } - - /** True if this branch holds a KV lease */ - get isActive(): boolean { - this._ensureNotDisposed(); - return this._ctx._branchIsActive(this._handle); - } - - // ===== ASYNC ITERATION ===== - - /** - * Async iterator — generate tokens until EOG - * - * Commit-before-yield semantics: every yielded token is already written - * to KV and accepted into the sampler. Breaking out of the loop is clean — - * no orphaned uncommitted tokens, perplexity reflects all yielded tokens. 
- * - * For inspect-before-commit (speculative decoding, tree search), use - * the {@link produce}/{@link commit} protocol directly. - * - * @example Generate to completion - * ```typescript - * for await (const { token, text } of branch) { - * process.stdout.write(text); - * } - * ``` - * - * @example Generate with consumer-side bound - * ```typescript - * const tokens = []; - * for await (const { token } of branch) { - * tokens.push(token); - * if (tokens.length >= limit) break; - * } - * ``` - */ - async *[Symbol.asyncIterator](): AsyncIterableIterator<{ token: number; text: string }> { - while (!this._disposed) { - const { token, text, isStop } = await this.produce(); - if (isStop) return; - await this.commit(token); - yield { token, text }; - } - } - - // ===== INTERNAL ===== - - private _ensureNotDisposed(): void { - if (this._disposed) { - throw new Error('Branch has been disposed'); - } - } -} diff --git a/src/BranchStore.ts b/src/BranchStore.ts deleted file mode 100644 index c4813b9..0000000 --- a/src/BranchStore.ts +++ /dev/null @@ -1,155 +0,0 @@ -import type { Branch } from './Branch'; -import type { SessionContext } from './types'; - -/** - * High-throughput multi-branch decode operations - * - * The naive approach to N-branch generation is N sequential llama_decode() - * calls — each paying full GPU kernel launch overhead, memory barrier, and - * PCIe round-trip. BranchStore eliminates this by packing all branches into - * a single llama_batch and dispatching once: O(1) GPU round-trips regardless - * of branch count. The GPU parallelizes across sequences within the batch, - * so N branches approach the wall-time cost of 1. - * - * Two operations, two packing strategies: - * - * **commit()** — Generation step. Each branch contributes exactly 1 token. - * Packs N tokens into a single batch via `decode_each` (one row per sequence, - * all at their respective positions). Single `llama_decode()` call. Logits - * captured per-branch at batch index `i`. 
O(N) total work, O(1) GPU - * dispatches, O(1) amortized dispatch overhead per branch. Accept-first - * ordering with rollback: accepts each token into its branch's repeat-penalty - * window before decode, restores from clones if decode throws. - * - * **prefill()** — Bulk token injection. Each branch contributes a - * variable-length token array. Uses a two-pass bin-packing algorithm: - * - * - *Pass 1 (planning)*: Greedy first-fit packs items into chunks ≤ nBatch. - * Items larger than nBatch get a dedicated chunk and fall through to - * decode_many's internal auto-chunking (ceil(nTokens / nBatch) calls). - * - *Pass 2 (dispatch)*: Normal chunks dispatch via `decode_scatter` (one - * `llama_decode` per chunk). Logits are indexed by flattened cursor - * position: for item k in a chunk, logits live at `cursor + nTokens[k] - 1`. - * - * For T total tokens across N branches with batch capacity B: - * - Best case (T ≤ B): 1 GPU dispatch, all branches in one batch. - * - Worst case: ceil(T / B) dispatches. Each dispatch is fully packed. - * - Amortized per-token GPU overhead: O(1/B) — vanishes as batch fills. - * - * Does NOT accept tokens into the sampler penalty window — use for - * external/replayed tokens where repeat-penalty tracking is unwanted. - * For model-generated tokens, use {@link commit} instead. - * - * Both methods take `[branch, token(s)]` tuples — the branch-to-token - * binding is structural, not positional. After either call, each branch's - * logits snapshot is updated with the output distribution from its decoded - * token(s), ready for the next `produce()`/`sample()` call. 
- * - * @example 32-branch generation step — one GPU dispatch - * ```typescript - * const store = new BranchStore(ctx); - * const entries = await Promise.all(branches.map(async b => [b, (await b.produce()).token] as [Branch, number])); - * await store.commit(entries); // 32 tokens, 1 llama_decode() - * ``` - * - * @example Best-of-N with batched commit - * ```typescript - * const store = new BranchStore(ctx); - * const branches = []; - * for (const _ of [1, 2, 3]) branches.push(await root.fork()); - * - * for (let step = 0; step < 50; step++) { - * const produced = await Promise.all(branches.map(async b => [b, await b.produce()] as const)); - * const live = produced.filter(([, p]) => !p.isStop); - * if (!live.length) break; - * await store.commit(live.map(([b, p]) => [b, p.token])); - * } - * ``` - * - * @example Asymmetric prefill — variable-length injections, auto-chunked - * ```typescript - * await store.prefill([ - * [branchA, systemPromptTokens], // 200 tokens - * [branchB, shortQueryTokens], // 12 tokens - * [branchC, longDocumentTokens], // 800 tokens - * ]); - * // Bin-packed into ceil(1012 / nBatch) GPU dispatches - * ``` - * - * @category Branching - */ -export class BranchStore { - private _ctx: SessionContext; - - constructor(ctx: SessionContext) { - this._ctx = ctx; - } - - /** - * Batched single-token commit for model-generated tokens - * - * Each tuple `[branch, token]` binds one token to one branch. - * Accepts each token into its branch's repeat-penalty window (for correct - * PPL measurement), then decodes all N tokens in a single llama_decode() - * call via decode_each and captures logits per-branch. Accept-first - * ordering with rollback: if decode throws, sampler/grammar/metrics are - * restored from clones taken before the accept. 
- * - * @param entries - Array of `[branch, token]` tuples (branches must not be disposed) - * @throws If any branch is disposed - */ - async commit(entries: [Branch, number][]): Promise { - const handles: number[] = []; - const tokens: number[] = []; - for (const [branch, token] of entries) { - if (branch.disposed) throw new Error('BranchStore.commit: branch is disposed'); - handles.push(branch.handle); - tokens.push(token); - } - await this._ctx._storeCommit(handles, tokens); - } - - /** - * Batched variable-length prefill for external tokens - * - * Each tuple `[branch, tokens]` binds a token array to one branch. - * Each branch can receive a different number of tokens — decode_scatter - * handles variable-length runs and auto-chunks to fit nBatch. - * - * Does NOT call accept_token — use for external/replayed tokens where - * repeat-penalty tracking is unwanted. For model-generated tokens, - * use {@link commit} instead. - * - * @param entries - Array of `[branch, tokens]` tuples (branches must not be disposed) - * @throws If any branch is disposed - */ - async prefill(entries: [Branch, number[]][]): Promise { - const handles: number[] = []; - const tokenArrays: number[][] = []; - for (const [branch, tokens] of entries) { - if (branch.disposed) throw new Error('BranchStore.prefill: branch is disposed'); - handles.push(branch.handle); - tokenArrays.push(tokens); - } - await this._ctx._storePrefill(handles, tokenArrays); - } - - /** - * Retain only the winner branch — evict all other leases and free their slots. - * - * Nuclear operation: calls `kv::seq_keep` on the winner's seq_id (stripping all - * other sequences from KV cache in a single pass), then frees all loser slots - * and rebuilds the vacancy list. The winner's topology is reset (no parent, no children). 
- * - * @param winner - The branch to keep (must not be disposed, must hold a lease) - * @throws If winner is disposed or has no lease - */ - async retainOnly(winner: Branch): Promise { - if (winner.disposed) throw new Error('BranchStore.retainOnly: winner is disposed'); - this._ctx._storeRetainOnly(winner.handle); - } - - get available(): number { - return this._ctx._storeAvailable(); - } -} diff --git a/src/Rerank.ts b/src/Rerank.ts deleted file mode 100644 index 0771fef..0000000 --- a/src/Rerank.ts +++ /dev/null @@ -1,268 +0,0 @@ -import { createContext } from './index.js'; -import type { SessionContext, RerankOptions, RerankResult, RerankProgress } from './types'; - -const SYSTEM_PROMPT = - 'Judge whether the Document meets the requirements based on the Query ' + - 'and the Instruct provided. Note that the answer can only be "yes" or "no".'; - -const USER_PREFIX = - ': Given a web search query, retrieve relevant passages that answer the query\n\n' + - ': '; - -interface ScoringRequest { - tokenArrays: number[][]; - cursor: number; - scores: number[]; - filled: number; - topK: number | undefined; - total: number; - push: (progress: RerankProgress) => void; - finish: () => void; - error: (err: Error) => void; -} - -/** Simple async channel — _drain pushes, consumer pulls via for-await */ -function channel(): { - push: (value: T) => void; - finish: () => void; - error: (err: Error) => void; - iterable: AsyncIterable; -} { - const buffer: T[] = []; - let done = false; - let err: Error | null = null; - let notify: (() => void) | null = null; - - const wait = () => new Promise((r) => { notify = r; }); - - return { - push(value: T) { - buffer.push(value); - notify?.(); - notify = null; - }, - finish() { - done = true; - notify?.(); - notify = null; - }, - error(e: Error) { - err = e; - notify?.(); - notify = null; - }, - iterable: { - [Symbol.asyncIterator](): AsyncIterator { - return { - async next(): Promise> { - while (buffer.length === 0 && !done && !err) await 
wait(); - if (err) throw err; - if (buffer.length > 0) return { value: buffer.shift()!, done: false }; - return { value: undefined as unknown as T, done: true }; - }, - }; - }, - }, - }; -} - -export class Rerank { - private _ctx: SessionContext; - private _nSeqMax: number; - private _nCtx: number; - private _yesId: number; - private _noId: number; - private _prefixTokens: number[]; - private _midTokens: number[]; - private _suffixTokens: number[]; - private _pending: ScoringRequest[] = []; - private _draining = false; - private _disposed = false; - - private constructor( - ctx: SessionContext, - nSeqMax: number, - nCtx: number, - yesId: number, - noId: number, - prefixTokens: number[], - midTokens: number[], - suffixTokens: number[], - ) { - this._ctx = ctx; - this._nSeqMax = nSeqMax; - this._nCtx = nCtx; - this._yesId = yesId; - this._noId = noId; - this._prefixTokens = prefixTokens; - this._midTokens = midTokens; - this._suffixTokens = suffixTokens; - } - - static async create(options: RerankOptions): Promise { - const nSeqMax = options.nSeqMax ?? 8; - const nCtx = options.nCtx ?? 4096; - const ctx = await createContext({ - modelPath: options.modelPath, - nCtx, - nSeqMax, - typeK: options.typeK ?? 'q4_0', - typeV: options.typeV ?? 
'q4_0', - }); - - const [yesId] = await ctx.tokenize('yes', false); - const [noId] = await ctx.tokenize('no', false); - - const SENTINEL_Q = '\x00QUERY\x00'; - const SENTINEL_D = '\x00DOC\x00'; - const probe = await ctx.formatChat(JSON.stringify([ - { role: 'system', content: SYSTEM_PROMPT }, - { role: 'user', content: `${USER_PREFIX}${SENTINEL_Q}\n\n: ${SENTINEL_D}` }, - ]), { addGenerationPrompt: true, enableThinking: false }); - - const p = probe.prompt; - const qi = p.indexOf(SENTINEL_Q); - const di = p.indexOf(SENTINEL_D); - const prefixTokens = await ctx.tokenize(p.slice(0, qi), true); - const midTokens = await ctx.tokenize(p.slice(qi + SENTINEL_Q.length, di), false); - const suffixTokens = await ctx.tokenize(p.slice(di + SENTINEL_D.length), false); - - return new Rerank(ctx, nSeqMax, nCtx, yesId, noId, prefixTokens, midTokens, suffixTokens); - } - - score(query: string, documents: number[][], topK?: number): AsyncIterable { - if (this._disposed) throw new Error('Rerank disposed'); - - const self = this; - const ch = channel(); - - (async () => { - try { - const queryTokens = await self._ctx.tokenize(query, false); - const shared = [...self._prefixTokens, ...queryTokens, ...self._midTokens]; - const maxDoc = Math.floor(self._nCtx / self._nSeqMax) - shared.length - self._suffixTokens.length; - - const tokenArrays = documents.map((doc) => { - const trimmed = doc.length > maxDoc ? doc.slice(0, maxDoc) : doc; - return [...shared, ...trimmed, ...self._suffixTokens]; - }); - - self._enqueue(tokenArrays, topK, ch.push, ch.finish, ch.error); - } catch (err) { - ch.error(err instanceof Error ? 
err : new Error(String(err))); - } - })(); - - return ch.iterable; - } - - async tokenize(text: string): Promise { - return this._ctx.tokenize(text, false); - } - - dispose(): void { - this._disposed = true; - const err = new Error('Rerank disposed'); - for (const req of this._pending) req.error(err); - this._pending.length = 0; - this._ctx.dispose(); - } - - // ── Queue internals ────────────────────────────────────────── - - private _sortResults(scores: number[], topK: number | undefined): RerankResult[] { - const sorted = scores - .map((score, index) => ({ score: Math.round(score * 1000) / 1000, index })) - .sort((a, b) => b.score - a.score); - return topK != null ? sorted.slice(0, topK) : sorted; - } - - private _enqueue( - tokenArrays: number[][], - topK: number | undefined, - push: (progress: RerankProgress) => void, - finish: () => void, - error: (err: Error) => void, - ): void { - this._pending.push({ - tokenArrays, cursor: 0, - scores: new Array(tokenArrays.length), - filled: 0, - topK, - total: tokenArrays.length, - push, finish, error, - }); - this._drain(); - } - - private _fillGroup(): { reqIdx: number; promptIdx: number; tokens: number[] }[] { - const group: { reqIdx: number; promptIdx: number; tokens: number[] }[] = []; - let added = true; - while (group.length < this._nSeqMax && added) { - added = false; - for (let r = 0; r < this._pending.length && group.length < this._nSeqMax; r++) { - const req = this._pending[r]; - if (req.cursor < req.tokenArrays.length) { - group.push({ reqIdx: r, promptIdx: req.cursor, tokens: req.tokenArrays[req.cursor] }); - req.cursor++; - added = true; - } - } - } - return group; - } - - private async _drain(): Promise { - if (this._draining) return; - this._draining = true; - - try { - while (this._pending.length > 0) { - const group = this._fillGroup(); - if (group.length === 0) break; - - let logits: Float32Array[]; - try { - logits = await this._ctx._scoreGroup(group.map((g) => g.tokens)); - } catch (err) { - const 
error = err instanceof Error ? err : new Error(String(err)); - for (const req of this._pending) req.error(error); - this._pending.length = 0; - return; - } - - // Track which requests got new scores this group - const touched = new Set(); - for (let i = 0; i < group.length; i++) { - const req = this._pending[group[i].reqIdx]; - req.scores[group[i].promptIdx] = this._rerankScore(logits[i]); - req.filled++; - touched.add(group[i].reqIdx); - } - - // Push progress for each request that advanced, finish completed ones - for (let r = this._pending.length - 1; r >= 0; r--) { - const req = this._pending[r]; - if (!touched.has(r)) continue; - - const results = this._sortResults(req.scores, req.topK); - req.push({ filled: req.filled, total: req.total, results }); - - if (req.filled === req.total) { - req.finish(); - this._pending.splice(r, 1); - } - } - } - } finally { - this._draining = false; - } - } - - private _rerankScore(logits: Float32Array): number { - const max = Math.max(logits[this._yesId], logits[this._noId]); - const yesExp = Math.exp(logits[this._yesId] - max); - const noExp = Math.exp(logits[this._noId] - max); - return yesExp / (yesExp + noExp); - } -} diff --git a/src/Session.ts b/src/Session.ts deleted file mode 100644 index 4ce87fb..0000000 --- a/src/Session.ts +++ /dev/null @@ -1,99 +0,0 @@ -import type { Branch } from './Branch'; -import type { BranchStore } from './BranchStore'; -import type { SessionContext } from './types'; -import { buildUserDelta, buildToolResultDelta } from './agents/deltas'; - -/** - * Session - Trunk lifecycle + conversation delta helpers - * - * Owns the current "trunk" branch and provides promote() to crown a winner, - * plus delta helpers that centralize the sep + formatChat + tokenize + prefill - * pattern for injecting new turns into an ongoing conversation. - * - * Session does NOT own the SessionContext or BranchStore — the consumer - * creates those and passes them in. dispose() prunes trunk only. 
- * - * @example - * ```typescript - * const session = new Session({ ctx, store }); - * session.trunk = initialBranch; - * - * // After verification, promote the best attempt - * await session.promote(bestAttempt.branch); - * - * // Inject a user turn and generate - * await session.prefillUser('What about X?'); - * for await (const { text } of session.trunk) { - * process.stdout.write(text); - * } - * - * // Cleanup - * await session.dispose(); - * ctx.dispose(); - * ``` - * - * @category Branching - */ -export class Session { - private _ctx: SessionContext; - private _store: BranchStore; - private _trunk: Branch | null; - - constructor({ ctx, store }: { ctx: SessionContext; store: BranchStore }) { - this._ctx = ctx; - this._store = store; - this._trunk = null; - } - - /** Current trunk branch */ - get trunk(): Branch | null { - return this._trunk; - } - - /** Assign initial trunk (no promote) */ - set trunk(branch: Branch | null) { - this._trunk = branch; - } - - /** - * Promote a winner to trunk — retainOnly + reassign - * - * Safe even if winner is the only branch (resets topology, no-op on KV). 
- */ - async promote(winner: Branch): Promise { - await this._store.retainOnly(winner); - this._trunk = winner; - } - - /** - * Dispose trunk only — consumer owns ctx and other resources - */ - async dispose(): Promise { - if (this._trunk && !this._trunk.disposed) { - await this._trunk.prune(); - } - this._trunk = null; - } - - /** - * Prefill a user turn into trunk - * - * @param content - User message content - * @param opts - Optional tools JSON string - */ - async prefillUser(content: string, opts: { tools?: string } = {}): Promise { - const tokens = buildUserDelta(this._ctx, content, opts); - await this._trunk!.prefill(tokens); - } - - /** - * Prefill a tool result turn into trunk - * - * @param resultStr - JSON-stringified tool result - * @param callId - Tool call ID - */ - async prefillToolResult(resultStr: string, callId: string): Promise { - const tokens = buildToolResultDelta(this._ctx, resultStr, callId); - await this._trunk!.prefill(tokens); - } -} diff --git a/src/agents/Tool.ts b/src/agents/Tool.ts deleted file mode 100644 index 20c34d8..0000000 --- a/src/agents/Tool.ts +++ /dev/null @@ -1,76 +0,0 @@ -import type { JsonSchema, ToolSchema, ToolContext } from './types'; - -/** - * Abstract base class for tools usable by agents in the runtime - * - * Subclass to define tools that agents can invoke during generation. - * Implement `name`, `description`, `parameters`, and `execute()`. The - * {@link schema} getter auto-generates the OpenAI-compatible function - * schema expected by `formatChat()`. - * - * Pass tool instances to {@link createToolkit} to build the `toolMap` - * and `toolsJson` pair consumed by {@link useAgentPool} and - * {@link runAgents}. 
- * - * @example Search tool - * ```typescript - * class SearchTool extends Tool<{ query: string; topK?: number }> { - * readonly name = 'search'; - * readonly description = 'Search the corpus for relevant passages'; - * readonly parameters = { - * type: 'object', - * properties: { - * query: { type: 'string', description: 'Search query' }, - * topK: { type: 'number', description: 'Number of results' }, - * }, - * required: ['query'], - * }; - * - * async execute(args: { query: string; topK?: number }, ctx?: ToolContext) { - * const results = await this.reranker.rank(args.query, args.topK ?? 5); - * return { results }; - * } - * } - * ``` - * - * @category Agents - */ -export abstract class Tool> { - /** Tool name — used as the function identifier in tool calls */ - abstract readonly name: string; - /** Human-readable description shown to the model */ - abstract readonly description: string; - /** JSON Schema describing the tool's expected arguments */ - abstract readonly parameters: JsonSchema; - - /** - * Execute the tool with parsed arguments - * - * Called by the agent pool when the model emits a tool call matching - * this tool's name. The return value is JSON-serialized and prefilled - * back into the agent's context as a tool result. - * - * @param args - Parsed arguments from the model's tool call - * @param context - Execution context with progress reporting callback - * @returns Tool result (will be JSON-serialized) - */ - abstract execute(args: TArgs, context?: ToolContext): Promise; - - /** - * OpenAI-compatible function tool schema - * - * Auto-generated from `name`, `description`, and `parameters`. - * Used by {@link createToolkit} to build the JSON string passed - * to `formatChat()`. 
- */ - get schema(): ToolSchema { - return { - type: 'function', - function: { - name: this.name, - description: this.description, - parameters: this.parameters, - }, - }; - } -} diff --git a/src/agents/agent-pool.ts b/src/agents/agent-pool.ts deleted file mode 100644 index 7ff9316..0000000 --- a/src/agents/agent-pool.ts +++ /dev/null @@ -1,586 +0,0 @@ -import { resource, call, action, ensure, useScope, createSignal, spawn, each } from 'effection'; -import type { Operation, Scope, Channel } from 'effection'; -import type { Branch } from '../Branch'; -import { CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, GrammarTriggerType, type GrammarTrigger, type ParsedToolCall, type SessionContext } from '../types'; -import type { BranchStore } from '../BranchStore'; -import { Ctx, Store, Events } from './context'; -import { buildToolResultDelta } from './deltas'; -import type { - TraceToken, - PressureThresholds, - AgentTaskSpec, - AgentPoolOptions, - AgentPoolResult, - AgentEvent, -} from './types'; - -// ── Internal agent state machine ─────────────────────────────── -// generating → awaiting_tool → generating (tool result prefilled) -// generating → done (stop + no tool call, or report) -// awaiting_tool → done (tool error) - -type AgentInternalState = 'generating' | 'awaiting_tool' | 'done'; - -interface AgentInternal { - id: number; // = branch.handle - parentId: number; // = parent.handle - branch: Branch; - state: AgentInternalState; - fmt: { - format: number; - reasoningFormat: number; - thinkingForcedOpen: boolean; - parser: string; - grammar: string; - grammarLazy: boolean; - grammarTriggers: GrammarTrigger[]; - }; - rawOutput: string; - tokenCount: number; - toolCallCount: number; - turns: number; - findings: string | null; - traceBuffer: TraceToken[]; -} - -interface SettledTool { - agentId: number; - prefillTokens: number[]; - toolName: string; -} - -/** - * Immutable KV budget snapshot for one tick of the agent loop - * - * Created from 
`SessionContext._storeKvPressure()` which returns - * `{ nCtx, cellsUsed, remaining }` where `remaining = nCtx - cellsUsed`. - * `cellsUsed` is a monotonic counter in `BranchStore` — it increments on - * every `decode_each` / `decode_scatter` but does **not** decrement on - * individual branch prune (only resets on bulk ops like `retainOnly` and - * `drain`). This means `remaining` is a conservative lower bound that - * becomes increasingly pessimistic as branches are pruned mid-run. - * - * Two thresholds partition `remaining` into three zones: - * - * ``` - * ┌──────────────────────────────────────────────────────┐ - * │ nCtx │ - * │ ┌──────────┬───────────────────┬──────────────────┐ │ - * │ │cellsUsed │ headroom > 0 │ softLimit │ │ - * │ │ (in use) │ (new work OK) │ (reserved) │ │ - * │ └──────────┴───────────────────┴──────────────────┘ │ - * │ ◄── remaining ──► │ │ - * │ │ │ - * │ headroom = remaining - softLimit │ - * │ critical = remaining < hardLimit │ - * └──────────────────────────────────────────────────────┘ - * ``` - * - * - **headroom > 0** — room for new work (tool results, generation) - * - **headroom ≤ 0** — over budget. SETTLE rejects tool results, PRODUCE - * hard-cuts non-terminal tool calls. Terminal tools still pass. - * - **critical** — remaining below hardLimit. Agents killed before - * `produceSync()` to prevent llama_decode crashes. - * - * @category Agents - */ -export class ContextPressure { - /** Default softLimit: 1024 tokens reserved for downstream work */ - static readonly DEFAULT_SOFT_LIMIT = 1024; - /** Default hardLimit: 128 tokens crash-prevention floor */ - static readonly DEFAULT_HARD_LIMIT = 128; - - /** - * KV slots remaining (`nCtx - cellsUsed`). - * Infinity when nCtx ≤ 0 (no context limit). - * Conservative: may undercount actual free space when branches have been - * pruned, since `cellsUsed` is monotonic. 
- */ - readonly remaining: number; - /** Remaining KV floor — tokens reserved for downstream work */ - readonly softLimit: number; - /** Crash-prevention floor — agents killed when remaining drops below */ - readonly hardLimit: number; - - constructor(ctx: SessionContext, opts?: PressureThresholds) { - const p = ctx._storeKvPressure(); - this.remaining = p.nCtx <= 0 ? Infinity : p.remaining; - this.softLimit = opts?.softLimit ?? ContextPressure.DEFAULT_SOFT_LIMIT; - this.hardLimit = opts?.hardLimit ?? ContextPressure.DEFAULT_HARD_LIMIT; - } - - /** - * Tokens available for new work: `remaining - softLimit`. - * Positive means room to accept tool results or continue generating. - * Negative means over budget — SETTLE rejects, PRODUCE hard-cuts. - */ - get headroom(): number { return this.remaining - this.softLimit; } - - /** `remaining < hardLimit` — agent must not call `produceSync()`. */ - get critical(): boolean { return this.remaining < this.hardLimit; } - - /** Can `tokenCount` tokens fit while staying above softLimit? */ - canFit(tokenCount: number): boolean { return tokenCount <= this.headroom; } -} - -/** - * Fork an agent from a parent branch with its own system prompt and task. - * - * Generator — uses sync native calls so Effection sees everything. - * On scope exit (error, cancellation), `ensure()` prunes the branch - * automatically — the orphaned-branch leak is structurally impossible. - */ -function* setupAgent( - parent: Branch, - task: AgentTaskSpec, - ctx: SessionContext, -): Operation<{ agent: AgentInternal; suffixTokens: number[] }> { - const messages = [ - { role: 'system', content: task.systemPrompt }, - { role: 'user', content: task.content }, - ]; - const fmtOpts = task.tools ? 
{ tools: task.tools } : {}; - const fmt = ctx.formatChatSync(JSON.stringify(messages), fmtOpts); - if (task.tools && (fmt.format === CHAT_FORMAT_CONTENT_ONLY || fmt.format === CHAT_FORMAT_GENERIC)) { - // Error before fork — no branch to clean up - throw new Error('Model does not support tool calling. Please use a model with native tool support (e.g. Qwen3, Llama 3.x, Mistral).'); - } - const branch = parent.forkSync(); - yield* ensure(() => { if (!branch.disposed) branch.pruneSync(); }); - const sep = ctx.getTurnSeparator(); - const suffixTokens = [...sep, ...ctx.tokenizeSync(fmt.prompt, false)]; - if (task.seed != null) branch.reseedSampler(task.seed); - - return { - agent: { - id: branch.handle, - parentId: parent.handle, - branch, - state: 'generating', - fmt: { - format: fmt.format, - reasoningFormat: fmt.reasoningFormat, - thinkingForcedOpen: fmt.thinkingForcedOpen, - parser: fmt.parser, - grammar: fmt.grammar, - grammarLazy: fmt.grammarLazy, - grammarTriggers: fmt.grammarTriggers, - }, - rawOutput: '', - tokenCount: 0, - toolCallCount: 0, - turns: 0, - findings: null, - traceBuffer: [], - }, - suffixTokens, - }; -} - -/** - * Concurrent agent generation loop as an Effection resource - * - * Runs N agents in parallel using a three-phase tick loop over shared - * {@link BranchStore} infrastructure. Each agent forks from a parent - * branch, generates tokens, invokes tools, and reports findings. - * - * **Three-phase tick loop:** - * 1. **PRODUCE** — sample all active agents via `produceSync()` (no async gap) - * 2. **COMMIT** — single GPU call via `store.commit()` for all produced tokens - * 3. **SETTLE** — drain settled tool results, batch prefill, reset grammars - * - * Tool dispatch uses `scope.run()` for eager start — tool executions run as - * children of the agent pool scope and are cancelled if the scope exits. 
- * - * **Resource semantics:** `provide()` suspends after all agents complete, - * keeping branches alive so the caller can fork from them (e.g. for - * verification). Branches are pruned when the scope exits — each branch's - * `ensure()` from `setupAgent` handles cleanup automatically. - * - * For automatic branch cleanup on return, use {@link runAgents} instead. - * - * @param opts - Pool configuration: tasks, tools, sampling params, max turns - * @returns Agent pool result with per-agent findings and aggregate statistics - * - * @example Shared root with agent pool - * ```typescript - * const pool = yield* withSharedRoot( - * { systemPrompt: RESEARCH_PROMPT, tools: toolsJson }, - * function*(root) { - * return yield* useAgentPool({ - * tasks: questions.map(q => ({ - * systemPrompt: RESEARCH_PROMPT, - * content: q, - * tools: toolsJson, - * parent: root, - * })), - * tools: toolMap, - * maxTurns: 6, - * }); - * }, - * ); - * ``` - * - * @category Agents - */ -export function useAgentPool(opts: AgentPoolOptions): Operation { - return resource(function*(provide) { - const ctx: SessionContext = yield* Ctx.expect(); - const store: BranchStore = yield* Store.expect(); - const events: Channel = yield* Events.expect(); - const scope: Scope = yield* useScope(); - - // Bridge for onProgress callbacks — Signal is correct here (external callback). - // A spawned forwarder drains the bridge into the Channel with proper scope context. - const progressBridge = createSignal(); - yield* spawn(function*() { - for (const ev of yield* each(progressBridge)) { - yield* events.send(ev); - yield* each.next(); - } - }); - const { tasks, tools, maxTurns = 100, terminalTool, trace = false, pressure: pressureOpts } = opts; - - // Whether the pool's tool registry contains tools besides the terminal tool. - // When false, agents are allowed to call the terminal tool as their first - // action (e.g. reporter sub-agents that only have `report()`). 
When true, - // the first tool call must be a non-terminal tool to prevent agents from - // immediately reporting without doing any work. - // - // IMPORTANT: this checks the pool's `tools` registry, not individual task - // schemas (`task.tools`). A reporter pool must pass only the terminal tool - // in its registry — passing the full tool map makes this flag true and - // traps reporters in an infinite rejection loop. - const hasNonTerminalTools = terminalTool ? [...tools.keys()].some(k => k !== terminalTool) : tools.size > 0; - - // ── Setup: fork branches, collect suffix tokens ────────── - // setupAgent is now a generator — each branch registers its own ensure() - // for cleanup. No manual try/finally needed here. - const agents: AgentInternal[] = []; - const prefillSetup: [Branch, number[]][] = []; - - for (const task of tasks) { - const parent = task.parent; - if (!parent) throw new Error('useAgentPool: each task must have a parent branch'); - - const { agent, suffixTokens } = yield* setupAgent(parent, task, ctx); - agents.push(agent); - prefillSetup.push([agent.branch, suffixTokens]); - } - - // Batch prefill all agent suffixes — pressure-gated. - // Each suffix is the full formatted chat (system prompt + tools JSON + - // user message + generation prompt), tokenized via formatChatSync(). - // Suffix cost is model-dependent: ~250-400 tokens per agent depending - // on chat template verbosity and tool schema size. 
- const initPressure = new ContextPressure(ctx, pressureOpts); - const totalSuffix = prefillSetup.reduce((s, [, t]) => s + t.length, 0); - if (!initPressure.canFit(totalSuffix)) { - // Not enough room — drop agents from the end until it fits - while (prefillSetup.length > 0) { - const needed = prefillSetup.reduce((s, [, t]) => s + t.length, 0); - if (initPressure.canFit(needed)) break; - prefillSetup.pop(); - const dropped = agents.pop()!; - dropped.state = 'done'; - } - } - if (prefillSetup.length > 0) { - yield* call(() => store.prefill(prefillSetup)); - } - - // Emit spawn events — TUI uses parentAgentId to detect sub-agents - for (const a of agents) { - yield* events.send({ type: 'agent:spawn', agentId: a.id, parentAgentId: a.parentId }); - } - - // ── Lazy grammar setup ─────────────────────────────────── - const applyLazyGrammar = (a: AgentInternal): void => { - if (a.fmt.grammar && a.fmt.grammarLazy && a.fmt.grammarTriggers.length > 0) { - const triggers = a.fmt.grammarTriggers.map(t => { - if (t.type === GrammarTriggerType.WORD) { - const nlIdx = t.value.indexOf('\n'); - if (nlIdx >= 0 && nlIdx < t.value.length - 1) { - return { ...t, value: t.value.slice(0, nlIdx + 1) }; - } - } - return t; - }); - a.branch.setGrammarLazy(a.fmt.grammar, triggers); - } - }; - for (const a of agents) applyLazyGrammar(a); - - // ── Tool dispatch coordination ─────────────────────────── - // Plain JS buffer: spawned tool tasks push synchronously on completion. - // SETTLE drains with splice(0). Safe because generators are synchronous - // between yields — spawns can only push at yield points (during COMMIT's - // yield* call()), and SETTLE runs after COMMIT in the same tick. 
- const settledBuffer: SettledTool[] = []; - const agentById = new Map(agents.map(a => [a.id, a])); - - // Track pending tool count for idle detection - let pendingToolCount = 0; - - // Resolve function for idle wake — set when all agents stall - let wakeIdle: (() => void) | null = null; - - let steps = 0; - let totalToolCalls = 0; - const counters = { - warmPrefillCalls: 0, - warmPrefillBranches: 0, - stalledTicks: 0, - maxConcurrentTools: 0, - idleTicks: 0, - }; - - function* dispatchTool(agent: AgentInternal, tc: ParsedToolCall): Operation { - let toolArgs: Record; - try { toolArgs = JSON.parse(tc.arguments); } catch { toolArgs = {}; } - const callId = tc.id || `call_${agent.toolCallCount}`; - - agent.toolCallCount++; - totalToolCalls++; - agent.turns++; - agent.state = 'awaiting_tool'; - - yield* events.send({ type: 'agent:tool_call', agentId: agent.id, tool: tc.name, args: tc.arguments }); - - const tool = tools.get(tc.name); - pendingToolCount++; - counters.maxConcurrentTools = Math.max(counters.maxConcurrentTools, pendingToolCount); - - // scope.run() — eager start, child of agent pool scope, cancelled if scope exits. - // spawn() is lazy (Operation), but we're in a generator — scope.run() is eager. - scope.run(function*() { - try { - const toolContext = { - onProgress: (p: { filled: number; total: number }) => { - // Signal bridge — onProgress is an external callback, Signal.send() is correct here. - progressBridge.send({ type: 'agent:tool_progress', agentId: agent.id, tool: tc.name, filled: p.filled, total: p.total }); - }, - }; - - const result: unknown = yield* call(() => - tool ? 
tool.execute(toolArgs, toolContext) : Promise.resolve({ error: `Unknown tool: ${tc.name}` }) - ); - const resultStr = JSON.stringify(result); - yield* events.send({ type: 'agent:tool_result', agentId: agent.id, tool: tc.name, result: resultStr }); - - const prefillTokens = buildToolResultDelta(ctx, resultStr, callId); - settledBuffer.push({ agentId: agent.id, prefillTokens, toolName: tc.name }); - } catch (err) { - agent.state = 'done'; - agent.findings = `Tool error: ${(err as Error).message}`; - } finally { - pendingToolCount--; - if (wakeIdle) { wakeIdle(); wakeIdle = null; } - } - }); - } - - // ── Three-phase tick loop ──────────────────────────────── - for (;;) { - // -- Phase 1: PRODUCE -- sample from active agents - const pressure = new ContextPressure(ctx, pressureOpts); - - if (trace && (pressure.critical || pressure.headroom < 0)) { - const p = ctx._storeKvPressure(); - try { process.stderr.write(`[PRODUCE] ${pressure.critical ? 'CRITICAL' : 'SOFT_LIMIT'} remaining=${p.remaining} headroom=${pressure.headroom} cellsUsed=${p.cellsUsed} nCtx=${p.nCtx}\n`); } catch {} - } - - const entries: [Branch, number][] = []; - for (const a of agents) { - if (a.state !== 'generating') continue; - - if (pressure.critical) { - a.state = 'done'; - yield* events.send({ type: 'agent:done', agentId: a.id }); - continue; - } - - const { token, text, isStop } = a.branch.produceSync(); - if (isStop) { - const parsed = ctx.parseChatOutput(a.rawOutput, a.fmt.format, { - reasoningFormat: a.fmt.reasoningFormat, - thinkingForcedOpen: a.fmt.thinkingForcedOpen, - parser: a.fmt.parser, - }); - - const tc = parsed.toolCalls[0]; - if (!tc) { - a.state = 'done'; - if (!a.findings && a.toolCallCount > 0 && parsed.content) { - a.findings = parsed.content; - yield* events.send({ type: 'agent:report', agentId: a.id, findings: a.findings }); - } - yield* events.send({ type: 'agent:done', agentId: a.id }); - continue; - } - - // Over budget: deny non-terminal tool calls when the agent has - // 
exceeded maxTurns or KV headroom is negative. Terminal tools - // (e.g. `report()`) are always allowed through — an agent that has - // done research and wants to report should never be blocked by - // pressure, since the report call itself consumes minimal KV. - const overBudget = (a.turns >= maxTurns || pressure.headroom < 0) - && (!terminalTool || tc.name !== terminalTool); - - if (overBudget) { - a.state = 'done'; - yield* events.send({ type: 'agent:done', agentId: a.id }); - continue; - } - - // Terminal tool — intercept, extract findings, mark done. - if (terminalTool && tc.name === terminalTool) { - if (a.toolCallCount === 0 && hasNonTerminalTools) { - const callId = tc.id || `call_${a.toolCallCount}`; - const errorMsg = 'You must perform research before reporting. Call at least one tool first.'; - a.turns++; - a.state = 'awaiting_tool'; - pendingToolCount++; - scope.run(function*() { - try { - const prefillTokens = buildToolResultDelta(ctx, JSON.stringify({ error: errorMsg }), callId); - settledBuffer.push({ agentId: a.id, prefillTokens, toolName: tc.name }); - } finally { - pendingToolCount--; - if (wakeIdle) { wakeIdle(); wakeIdle = null; } - } - }); - a.rawOutput = ''; - continue; - } - try { a.findings = JSON.parse(tc.arguments).findings; } catch { a.findings = tc.arguments; } - a.state = 'done'; - a.toolCallCount++; - totalToolCalls++; - yield* events.send({ type: 'agent:tool_call', agentId: a.id, tool: tc.name, args: tc.arguments }); - yield* events.send({ type: 'agent:report', agentId: a.id, findings: a.findings! 
}); - yield* events.send({ type: 'agent:done', agentId: a.id }); - continue; - } - - // Fire-and-forget — dispatch tool without blocking the decode loop - yield* dispatchTool(a, tc); - a.rawOutput = ''; - continue; - } - - entries.push([a.branch, token]); - a.rawOutput += text; - a.tokenCount++; - if (trace) { - const entropy = a.branch.modelEntropy(); - const surprisal = a.branch.modelSurprisal(token); - a.traceBuffer.push({ text, entropy, surprisal }); - yield* events.send({ - type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount, - entropy, surprisal, - }); - } else { - yield* events.send({ type: 'agent:produce', agentId: a.id, text, tokenCount: a.tokenCount }); - } - } - - // -- Phase 2: COMMIT -- batch-decode produced tokens - if (entries.length > 0) { - yield* call(() => store.commit(entries)); - steps++; - } - - // -- Phase 3: SETTLE -- drain settled tool buffer, batch prefill - const settled = settledBuffer.splice(0); - if (settled.length > 0) { - // Fresh snapshot — Phase 2 commits may have advanced positions - const settlePressure = new ContextPressure(ctx, pressureOpts); - let headroom = settlePressure.headroom; - - if (trace) { - const p = ctx._storeKvPressure(); - const items = settled.map(s => `${s.toolName}:${s.prefillTokens.length}`).join(', '); - try { process.stderr.write(`[SETTLE] remaining=${p.remaining} headroom=${headroom} cellsUsed=${p.cellsUsed} nCtx=${p.nCtx} items=[${items}]\n`); } catch {} - } - - const prefillPairs: [Branch, number[]][] = []; - const settledAgents: AgentInternal[] = []; - - for (const item of settled) { - const a = agentById.get(item.agentId); - if (!a || a.state === 'done') continue; - - if (item.prefillTokens.length > headroom) { - if (trace) { - try { process.stderr.write(`[SETTLE] REJECT ${item.toolName}:${item.prefillTokens.length} > headroom=${headroom}\n`); } catch {} - } - a.state = 'done'; - yield* events.send({ type: 'agent:done', agentId: a.id }); - continue; - } - - prefillPairs.push([a.branch, 
item.prefillTokens]); - settledAgents.push(a); - headroom -= item.prefillTokens.length; - } - - if (prefillPairs.length > 0) { - if (trace) { - const totalPrefill = prefillPairs.reduce((s, [, t]) => s + t.length, 0); - try { process.stderr.write(`[SETTLE] PREFILL ${prefillPairs.length} branches, ${totalPrefill} tokens, headroom_after=${headroom}\n`); } catch {} - } - yield* call(() => store.prefill(prefillPairs)); - counters.warmPrefillCalls++; - counters.warmPrefillBranches += prefillPairs.length; - - // Only NOW transition state + reset grammar - for (const a of settledAgents) { - a.state = 'generating'; - a.rawOutput = ''; - applyLazyGrammar(a); - } - } - } - - // -- Termination + idle yield - const allDone = agents.every(a => a.state === 'done') && pendingToolCount === 0; - if (allDone) break; - - if (entries.length === 0 && pendingToolCount > 0) { - counters.stalledTicks++; - if (settled.length === 0) { - // Nothing produced, nothing settled — yield until a tool resolves - yield* action((resolve) => { - wakeIdle = resolve; - return () => { wakeIdle = null; }; - }); - counters.idleTicks++; - } - } - } - - // ── Provide result — suspends, branches stay alive ─────── - // Branch cleanup is handled by each branch's ensure() from setupAgent — - // when this resource's scope exits, all ensure() callbacks fire. - const result: AgentPoolResult = { - agents: agents.map(a => ({ - agentId: a.id, - parentAgentId: a.parentId, - branch: a.branch, - findings: a.findings, - toolCallCount: a.toolCallCount, - tokenCount: a.tokenCount, - ppl: a.branch.perplexity, - samplingPpl: a.branch.samplingPerplexity, - trace: trace ? 
a.traceBuffer : undefined, - })), - totalTokens: agents.reduce((s, a) => s + a.tokenCount, 0), - totalToolCalls, - steps, - counters, - }; - - yield* provide(result); - }); -} diff --git a/src/agents/context.ts b/src/agents/context.ts deleted file mode 100644 index 3fc593a..0000000 --- a/src/agents/context.ts +++ /dev/null @@ -1,36 +0,0 @@ -import { createContext } from 'effection'; -import type { SessionContext } from '../types'; -import type { BranchStore } from '../BranchStore'; -import type { Channel } from 'effection'; -import type { AgentEvent } from './types'; - -/** - * Effection context holding the active {@link SessionContext} - * - * Set by {@link initAgents} in the caller's scope. All agent operations - * (`generate`, `diverge`, `useAgentPool`, `withSharedRoot`) read from this - * context via `yield* Ctx.expect()`. - * - * @category Agents - */ -export const Ctx = createContext('lloyal.ctx'); - -/** - * Effection context holding the active {@link BranchStore} - * - * Set by {@link initAgents}. Used by {@link diverge} and {@link useAgentPool} - * for batched commit/prefill across multiple branches. - * - * @category Agents - */ -export const Store = createContext('lloyal.store'); - -/** - * Effection context holding the agent event channel - * - * Set by {@link initAgents}. {@link useAgentPool} emits {@link AgentEvent} - * values through this channel via `yield* channel.send()`. - * - * @category Agents - */ -export const Events = createContext>('lloyal.events'); diff --git a/src/agents/deltas.ts b/src/agents/deltas.ts deleted file mode 100644 index baf12d0..0000000 --- a/src/agents/deltas.ts +++ /dev/null @@ -1,63 +0,0 @@ -import type { SessionContext } from '../types'; - -/** - * Build a token delta for a user turn - * - * Composes `getTurnSeparator()` + `formatChatSync()` + `tokenizeSync()` into a - * single token array suitable for `branch.prefill()`. Usable with any - * branch — not tied to {@link Session}'s trunk. 
- * - * This is the canonical way to build a user-turn delta for warm prefill - * in multi-turn conversations. - * - * @param ctx - Active session context - * @param content - User message content - * @param opts - Optional tools JSON for tool-aware formatting - * @returns Token array ready for `branch.prefill()` - * - * @category Agents - */ -export function buildUserDelta( - ctx: SessionContext, - content: string, - opts: { tools?: string } = {} -): number[] { - const sep = ctx.getTurnSeparator(); - const fmtOpts = opts.tools ? { tools: opts.tools } : {}; - const { prompt } = ctx.formatChatSync( - JSON.stringify([{ role: 'system', content: '' }, { role: 'user', content }]), - fmtOpts - ); - const delta = ctx.tokenizeSync(prompt, false); - return [...sep, ...delta]; -} - -/** - * Build a token delta for a tool result turn - * - * Composes `getTurnSeparator()` + `formatChatSync()` + `tokenizeSync()` into a - * single token array suitable for `branch.prefill()`. Used by - * {@link useAgentPool} to inject tool results back into agent context. 
- * - * @param ctx - Active session context - * @param resultStr - JSON-serialized tool result - * @param callId - Tool call identifier from the model's parsed output - * @returns Token array ready for `branch.prefill()` - * - * @category Agents - */ -export function buildToolResultDelta( - ctx: SessionContext, - resultStr: string, - callId: string -): number[] { - const sep = ctx.getTurnSeparator(); - const { prompt } = ctx.formatChatSync( - JSON.stringify([ - { role: 'system', content: '' }, - { role: 'tool', content: resultStr, tool_call_id: callId }, - ]) - ); - const delta = ctx.tokenizeSync(prompt, false); - return [...sep, ...delta]; -} diff --git a/src/agents/diverge.ts b/src/agents/diverge.ts deleted file mode 100644 index ed1be3e..0000000 --- a/src/agents/diverge.ts +++ /dev/null @@ -1,145 +0,0 @@ -import { call, ensure } from 'effection'; -import type { Operation } from 'effection'; -import { Branch } from '../Branch'; -import { Ctx, Store } from './context'; -import { ContextPressure } from './agent-pool'; -import type { DivergeOptions, DivergeResult, DivergeAttempt } from './types'; - -/** - * Multi-branch perplexity selection as an Effection operation - * - * Forks N branches from a parent (or a fresh root), generates to EOG via - * batched {@link BranchStore.commit}, then selects the lowest-perplexity - * attempt. Loser branches are pruned; the caller receives the best branch - * still alive. - * - * When `opts.parent` is provided, the parent branch is NOT pruned — it's - * owned by the calling scope. Only the forked attempt branches (losers) - * are pruned. The caller owns the winning branch's lifecycle, typically - * via {@link Session.promote}. - * - * Cleanup is structured: each forked branch registers an `ensure()` callback - * that prunes it on scope exit. Winners are marked disposed-safe (already - * pruned or ownership transferred) before the ensure fires. 
- * - * @param opts - Diverge options specifying parent or prompt, attempt count, - * and sampling parameters - * @returns Result containing the best branch, all attempt outputs, and - * aggregate statistics - * - * @example Verify with perplexity selection - * ```typescript - * const verified = yield* diverge({ - * prompt: verifyPrompt, - * attempts: 3, - * params: { temperature: 0.7 }, - * }); - * // verified.best is the lowest-perplexity branch, still alive - * yield* call(() => session.promote(verified.best)); - * ``` - * - * @category Agents - */ -export function* diverge(opts: DivergeOptions): Operation { - const ctx = yield* Ctx.expect(); - const store = yield* Store.expect(); - - // If parent provided, fork from it. Otherwise create a fresh root. - let root: Branch; - let ownRoot = false; - let prefixLength: number; - - if (opts.parent) { - root = opts.parent; - prefixLength = root.position; - } else { - if (!opts.prompt) throw new Error('diverge() requires either opts.parent or opts.prompt'); - const tokens = ctx.tokenizeSync(opts.prompt); - root = Branch.create(ctx, 0, opts.params ?? 
{}); - yield* call(() => root.prefill(tokens)); - prefixLength = tokens.length; - ownRoot = true; - // If we created the root, ensure it's cleaned up - yield* ensure(() => { - if (ownRoot && !root.disposed) { - try { root.pruneSync(); } catch { /* children may remain */ } - } - }); - } - - const live: { branch: Branch; output: string; done: boolean; tokenCount: number; ppl: number }[] = []; - - for (let i = 0; i < opts.attempts; i++) { - const branch = root.forkSync(); - // Each forked branch gets its own ensure() for structured cleanup - yield* ensure(() => { - if (!branch.disposed) { - try { branch.pruneSync(); } catch { /* already gone */ } - } - }); - branch.reseedSampler(2000 + i); - live.push({ branch, output: '', done: false, tokenCount: 0, ppl: Infinity }); - } - - // Batched generation — produceSync/commit loop - let steps = 0; - for (;;) { - const pressure = new ContextPressure(ctx); - if (pressure.critical) { - for (const a of live) { if (!a.done) a.done = true; } - break; - } - - const entries: [Branch, number][] = []; - for (const a of live) { - if (a.done) continue; - const { token, text, isStop } = a.branch.produceSync(); - if (isStop) { - const p = a.branch.perplexity; - a.ppl = Number.isFinite(p) ? p : Infinity; - a.done = true; - continue; - } - entries.push([a.branch, token]); - a.output += text; - a.tokenCount++; - } - if (entries.length === 0) break; - yield* call(() => store.commit(entries)); - steps++; - } - - // Select by lowest perplexity (most coherent) - const bestIdx = live.reduce((bi, a, i) => a.ppl <= live[bi].ppl ? i : bi, 0); - - // Prune losers now — winner stays alive as caller's result. - // ensure() will be a no-op for these since they're already disposed. - for (let i = 0; i < live.length; i++) { - if (i !== bestIdx && !live[i].branch.disposed) { - live[i].branch.pruneSync(); - } - } - - // If we created root and it's no longer needed, prune it now. 
- // (ensure() will be a no-op since it checks disposed) - if (ownRoot && !root.disposed && root.children.length === 0) { - root.pruneSync(); - } - - const totalTokens = live.reduce((s, a) => s + a.tokenCount, 0); - const attempts: DivergeAttempt[] = live.map(a => ({ - branch: a.branch, - output: a.output, - tokenCount: a.tokenCount, - ppl: a.ppl, - })); - - return { - best: live[bestIdx].branch, - bestOutput: live[bestIdx].output, - attempts, - totalTokens, - steps, - prefixLength, - }; -} diff --git a/src/agents/generate.ts b/src/agents/generate.ts deleted file mode 100644 index 4a37c66..0000000 --- a/src/agents/generate.ts +++ /dev/null @@ -1,59 +0,0 @@ -import { call } from 'effection'; -import type { Operation } from 'effection'; -import { Branch } from '../Branch'; -import { Ctx } from './context'; -import type { GenerateOptions, GenerateResult } from './types'; - -/** - * Single-branch grammar-constrained generation as an Effection operation - * - * Creates a fresh branch at position 0, prefills the prompt, generates - * to EOG, and prunes the branch. Uses {@link Branch}'s async iterator - * — single-branch generation doesn't need batched commit. - * - * The branch is always cleaned up via try/finally, even on error or - * scope cancellation. - * - * @param opts - Generation options (prompt, grammar, params, parse) - * @returns Generated text, token count, and optionally parsed result - * - * @example Grammar-constrained JSON generation - * ```typescript - * const plan = yield* generate({ - * prompt: planPrompt, - * grammar: planGrammar, - * params: { temperature: 0.3 }, - * parse: output => JSON.parse(output), - * }); - * console.log(plan.parsed); // typed result from parse() - * ``` - * - * @category Agents - */ -export function* generate(opts: GenerateOptions): Operation> { - const ctx = yield* Ctx.expect(); - - const samplerParams = opts.params ?? 
{}; - const branch = Branch.create(ctx, 0, samplerParams, undefined, opts.grammar); - - try { - const tokens = ctx.tokenizeSync(opts.prompt); - yield* call(() => branch.prefill(tokens)); - - // Consume async iterator inside call() — generators can't use for-await - const { output, tokenCount } = yield* call(async () => { - let output = ''; - let tokenCount = 0; - for await (const { text } of branch) { - output += text; - tokenCount++; - } - return { output, tokenCount }; - }); - - const parsed = opts.parse ? opts.parse(output) as T : undefined; - return { output, tokenCount, parsed }; - } finally { - if (!branch.disposed) branch.pruneSync(); - } -} diff --git a/src/agents/index.ts b/src/agents/index.ts deleted file mode 100644 index 6d5b889..0000000 --- a/src/agents/index.ts +++ /dev/null @@ -1,32 +0,0 @@ -export { Ctx, Store, Events } from './context'; -export { Tool } from './Tool'; -export { buildUserDelta, buildToolResultDelta } from './deltas'; -export { generate } from './generate'; -export { diverge } from './diverge'; -export { useAgentPool, ContextPressure } from './agent-pool'; -export { runAgents } from './run-agents'; -export { createToolkit } from './toolkit'; -export { initAgents } from './init'; -export { withSharedRoot } from './shared-root'; - -export type { Toolkit } from './toolkit'; -export type { AgentHandle } from './init'; -export type { SharedRootOptions } from './shared-root'; - -export type { - TraceToken, - JsonSchema, - ToolSchema, - ToolContext, - PressureThresholds, - AgentTaskSpec, - AgentPoolOptions, - AgentResult, - AgentPoolResult, - GenerateOptions, - GenerateResult, - DivergeOptions, - DivergeAttempt, - DivergeResult, - AgentEvent, -} from './types'; diff --git a/src/agents/init.ts b/src/agents/init.ts deleted file mode 100644 index d7ebbd6..0000000 --- a/src/agents/init.ts +++ /dev/null @@ -1,78 +0,0 @@ -import { ensure, createChannel, call } from 'effection'; -import type { Operation, Channel } from 'effection'; -import { 
BranchStore } from '../BranchStore'; -import { Session } from '../Session'; -import type { SessionContext } from '../types'; -import { Ctx, Store, Events } from './context'; -import type { AgentEvent } from './types'; - -/** - * Handle returned by {@link initAgents} containing all agent resources - * - * @category Agents - */ -export interface AgentHandle { - /** The session context (model, tokenizer, KV cache) */ - ctx: SessionContext; - /** Branch store for batched commit/prefill across branches */ - store: BranchStore; - /** Session managing conversation trunk and branch lifecycle */ - session: Session; - /** Channel for subscribing to agent events */ - events: Channel; -} - -/** - * Bootstrap the agent infrastructure and register structured cleanup - * - * Creates {@link BranchStore}, {@link Session}, and an event channel, then - * sets all three Effection contexts ({@link Ctx}, {@link Store}, - * {@link Events}) in the caller's scope. Cleanup runs on scope exit - * (Ctrl-C, error, normal completion) via `ensure()`. - * - * Context values are set in the caller's scope — visible to all subsequent - * operations. This is why `initAgents` uses `ensure()` rather than - * `resource()`: a resource creates a child scope where `Ctx.set()` would - * be invisible to sibling operations. - * - * The caller creates the {@link SessionContext} (model path, nCtx, KV types - * are harness-specific decisions) and passes it in. - * - * @param ctx - Session context created via `createContext()` - * @returns Agent handle with session, store, and event channel - * - * @example Canonical bootstrap - * ```typescript - * main(function*() { - * const ctx = yield* call(() => createContext({ - * modelPath, nCtx: 16384, - * nSeqMax: 4, typeK: 'q4_0', typeV: 'q4_0', - * })); - * - * const { session, events } = yield* initAgents(ctx); - * // Ctx, Store, Events are now set — generate(), diverge(), - * // useAgentPool() will find them automatically. - * // Cleanup runs on scope exit. 
- * }); - * ``` - * - * @category Agents - */ -export function* initAgents( - ctx: SessionContext, -): Operation> { - const store = new BranchStore(ctx); - const session = new Session({ ctx, store }); - const events: Channel = createChannel(); - - yield* Ctx.set(ctx); - yield* Store.set(store); - yield* Events.set(events as unknown as Channel); - - yield* ensure(function*() { - yield* call(() => session.dispose()); - ctx.dispose(); - }); - - return { ctx, store, session, events }; -} diff --git a/src/agents/run-agents.ts b/src/agents/run-agents.ts deleted file mode 100644 index b2c71dc..0000000 --- a/src/agents/run-agents.ts +++ /dev/null @@ -1,45 +0,0 @@ -import { scoped } from 'effection'; -import type { Operation } from 'effection'; -import { useAgentPool } from './agent-pool'; -import type { AgentPoolOptions, AgentPoolResult } from './types'; - -/** - * Run an agent pool with automatic branch cleanup on return - * - * Wraps {@link useAgentPool} in `scoped()` — agent branches are pruned - * when the scope exits, before this operation returns. Use this when you - * don't need to fork from agent branches after the pool completes. - * - * For multi-level tree topology (forking from agent branches for - * verification or follow-up), use {@link useAgentPool} directly within - * your own scope management. 
- * - * @param opts - Pool configuration: tasks, tools, sampling params, max turns - * @returns Agent pool result (branches already pruned) - * - * @example Research agents with shared root - * ```typescript - * const pool = yield* withSharedRoot( - * { systemPrompt: RESEARCH_PROMPT, tools: toolsJson }, - * function*(root, prefixLen) { - * return yield* runAgents({ - * tasks: questions.map(q => ({ - * systemPrompt: RESEARCH_PROMPT, - * content: q, - * tools: toolsJson, - * parent: root, - * })), - * tools: toolMap, - * maxTurns: 6, - * }); - * }, - * ); - * ``` - * - * @category Agents - */ -export function* runAgents(opts: AgentPoolOptions): Operation { - return yield* scoped(function*() { - return yield* useAgentPool(opts); - }); -} diff --git a/src/agents/shared-root.ts b/src/agents/shared-root.ts deleted file mode 100644 index 101958a..0000000 --- a/src/agents/shared-root.ts +++ /dev/null @@ -1,80 +0,0 @@ -import { call } from 'effection'; -import type { Operation } from 'effection'; -import { Branch } from '../Branch'; -import type { SessionContext } from '../types'; -import { Ctx } from './context'; -import type { SamplingParams } from './types'; - -/** - * Configuration for {@link withSharedRoot} - * - * @category Agents - */ -export interface SharedRootOptions { - /** System prompt to tokenize and prefill into the shared root */ - systemPrompt: string; - /** JSON-serialized tool schemas for tool-aware prompt formatting */ - tools?: string; - /** Sampling parameters for the root branch */ - params?: SamplingParams; -} - -/** - * Scoped shared root branch with guaranteed cleanup - * - * Creates a root branch, prefills the system prompt, and passes it to - * the body function. The root is pruned via try/finally when the body - * returns or throws, regardless of whether children still exist. - * - * Use this for the cold-path pattern where multiple agents share a - * tokenized system prompt prefix. 
The `sharedPrefixLength` passed to - * the body enables KV savings calculation. - * - * @param opts - System prompt, tools, and sampling parameters - * @param body - Operation that receives the root branch and prefix length. - * Typically calls {@link runAgents} or {@link useAgentPool} inside. - * @returns The body's return value - * - * @example Cold-path research with shared prefix - * ```typescript - * const { result, prefixLen } = yield* withSharedRoot( - * { systemPrompt: RESEARCH_PROMPT, tools: toolsJson }, - * function*(root, prefixLen) { - * const result = yield* runAgents({ - * tasks: questions.map(q => ({ - * systemPrompt: RESEARCH_PROMPT, - * content: q, - * tools: toolsJson, - * parent: root, - * })), - * tools: toolMap, - * }); - * return { result, prefixLen }; - * }, - * ); - * ``` - * - * @category Agents - */ -export function* withSharedRoot( - opts: SharedRootOptions, - body: (root: Branch, sharedPrefixLength: number) => Operation, -): Operation { - const ctx: SessionContext = yield* Ctx.expect(); - - const messages = [{ role: 'system', content: opts.systemPrompt }]; - const fmtOpts = opts.tools - ? { tools: opts.tools, addGenerationPrompt: false } - : { addGenerationPrompt: false }; - const fmt = ctx.formatChatSync(JSON.stringify(messages), fmtOpts); - const sharedTokens = ctx.tokenizeSync(fmt.prompt); - - const root = Branch.create(ctx, 0, opts.params ?? { temperature: 0.5 }); - yield* call(() => root.prefill(sharedTokens)); - - try { - return yield* body(root, sharedTokens.length); - } finally { - if (!root.disposed) root.pruneSubtreeSync(); - } -} diff --git a/src/agents/toolkit.ts b/src/agents/toolkit.ts deleted file mode 100644 index 86bcf0c..0000000 --- a/src/agents/toolkit.ts +++ /dev/null @@ -1,44 +0,0 @@ -import type { Tool } from './Tool'; - -/** - * Aggregated tool registry for agent pool consumption - * - * Contains the `toolMap` for dispatch and `toolsJson` for prompt - * formatting. Created by {@link createToolkit}. 
- * - * @category Agents - */ -export interface Toolkit { - /** Name-to-instance map used by {@link useAgentPool} for tool dispatch */ - toolMap: Map; - /** JSON-serialized tool schemas passed to `formatChat()` via task specs */ - toolsJson: string; -} - -/** - * Aggregate an array of {@link Tool} instances into a toolkit - * - * Builds both the dispatch map and the JSON schema string from the - * tool array. Pass the result directly to {@link AgentPoolOptions} - * and {@link AgentTaskSpec}. - * - * @param tools - Tool instances to aggregate - * @returns Toolkit with `toolMap` and `toolsJson` - * - * @example - * ```typescript - * const { toolMap, toolsJson } = createToolkit([ - * new SearchTool(chunks, reranker), - * new ReadFileTool(resources), - * new GrepTool(resources), - * ]); - * ``` - * - * @category Agents - */ -export function createToolkit(tools: Tool[]): Toolkit { - return { - toolMap: new Map(tools.map(t => [t.name, t])), - toolsJson: JSON.stringify(tools.map(t => t.schema)), - }; -} diff --git a/src/agents/types.ts b/src/agents/types.ts deleted file mode 100644 index df8c468..0000000 --- a/src/agents/types.ts +++ /dev/null @@ -1,378 +0,0 @@ -import type { Branch } from '../Branch'; -import type { SessionContext } from '../types'; - -// ── Tool base class types ────────────────────────────────────── - -/** - * JSON Schema definition for tool parameter validation - * - * Describes the shape of arguments a {@link Tool} accepts. Passed to the - * model via `formatChat()` so it can generate valid tool-call arguments. - * - * @category Agents - */ -export interface JsonSchema { - /** JSON Schema type (e.g. `"object"`, `"string"`, `"array"`) */ - type: string; - /** Property definitions when `type` is `"object"` */ - properties?: Record; - /** Required property names when `type` is `"object"` */ - required?: string[]; - /** Additional schema constraints (minItems, enum, etc.) 
*/ - [key: string]: unknown; -} - -/** - * OpenAI-compatible function tool schema - * - * The wrapper format expected by `formatChat()` when passing tools to the - * model. {@link Tool.schema} generates this automatically from the tool's - * `name`, `description`, and `parameters`. - * - * @category Agents - */ -export interface ToolSchema { - /** Always `"function"` for function-calling tools */ - type: 'function'; - /** Function definition containing name, description, and parameter schema */ - function: { - /** Tool name — used as the function identifier in tool calls */ - name: string; - /** Human-readable description shown to the model */ - description: string; - /** JSON Schema describing the tool's arguments */ - parameters: JsonSchema; - }; -} - -/** - * Execution context passed to {@link Tool.execute} - * - * Provides callbacks for reporting progress during long-running tool - * operations (e.g. reranker scoring chunks). - * - * @category Agents - */ -export interface ToolContext { - /** Progress callback for long-running operations */ - onProgress?: (p: { filled: number; total: number }) => void; -} - -// ── Trace types ─────────────────────────────────────────────── - -/** - * Per-token trace entry captured when {@link AgentPoolOptions.trace} is true - * - * Each entry corresponds to one sampled token and the distribution state - * at the moment it was drawn. Available on {@link AgentResult.trace} after - * pool completion. 
- * - * @category Agents - */ -export interface TraceToken { - /** Decoded text for this token */ - text: string; - /** Shannon entropy of the full vocabulary distribution (bits, base-2) */ - entropy: number; - /** Surprisal of the chosen token: -log2(p) */ - surprisal: number; -} - -// ── Agent pool types ─────────────────────────────────────────── - -/** - * Task specification for a single agent in {@link useAgentPool} - * - * Each task defines the agent's system prompt, user content, available - * tools, and parent branch to fork from. The parent branch determines - * the agent's KV prefix — fork from a shared root to amortize system - * prompt tokenization across agents. - * - * @category Agents - */ -export interface AgentTaskSpec { - /** System prompt defining the agent's role and behavior */ - systemPrompt: string; - /** User message content — the agent's specific sub-question or task */ - content: string; - /** JSON-serialized tool schemas (from {@link createToolkit}) */ - tools?: string; - /** PRNG seed for sampler diversity — pass different seeds per agent */ - seed?: number; - /** Parent branch to fork from (required by {@link useAgentPool}) */ - parent?: Branch; -} - -/** - * Sampling parameters for generation - * - * Controls the sampler chain applied during token generation. Passed to - * {@link Branch.create}, {@link generate}, {@link diverge}, and agent - * pool tasks. 
- * - * @category Agents - */ -export interface SamplingParams { - /** Temperature for softmax scaling (0 = greedy, higher = more random) */ - temperature?: number; - /** Nucleus sampling threshold — cumulative probability cutoff */ - topP?: number; - /** Top-K sampling — keep only the K most likely tokens */ - topK?: number; - /** Minimum probability threshold relative to the most likely token */ - minP?: number; - /** Additional sampler-specific parameters */ - [key: string]: unknown; -} - -/** - * KV pressure thresholds controlling agent shutdown under context exhaustion - * - * Two thresholds govern what happens as remaining KV shrinks: - * - * **softLimit** (default 1024) — remaining KV floor for new work. - * Enforced at three points: - * - **SETTLE**: tool results that would cross this floor are rejected and - * the agent is marked done. This is the primary enforcement point — tool - * results (search results, etc.) are the largest KV consumers. - * - **PRODUCE (stop-token boundary)**: agents that want a non-terminal tool - * call are hard-cut. Terminal tools (e.g. `report()`) still pass. - * - **INIT prefill**: agents that don't fit above this floor are dropped. - * - * Set to account for downstream pool needs (reporters, verification). - * - * **hardLimit** (default 128) — crash-prevention floor. - * When remaining drops below this, agents are killed immediately before - * `produceSync()`. Prevents `llama_decode` "no memory slot" failures. - * Pure safety net — should never be the primary budget control. - * - * @category Agents - */ -export interface PressureThresholds { - /** - * Remaining KV floor for new work (tokens). When remaining drops below - * this, SETTLE rejects tool results, PRODUCE hard-cuts non-terminal tool - * calls, and INIT drops agents that don't fit. - * - * Set to account for downstream pool needs (reporters, verification). - * Default: 1024 - */ - softLimit?: number; - /** - * Crash-prevention floor (tokens). 
When remaining drops below this, - * agents are killed immediately before `produceSync()`. Prevents - * `llama_decode` "no memory slot for batch" failures. - * Default: 128 - */ - hardLimit?: number; -} - -/** - * Configuration for {@link useAgentPool} and {@link runAgents} - * - * @category Agents - */ -export interface AgentPoolOptions { - /** Agent task specifications — one per concurrent agent */ - tasks: AgentTaskSpec[]; - /** - * Tool registry mapping tool names to {@link Tool} instances. - * - * This is the **execution registry** — it determines which tools can be - * dispatched at runtime. It is distinct from the per-task `task.tools` - * JSON schema that tells the model which tools are available. - * - * The registry also controls {@link AgentPoolOptions.terminalTool | terminalTool} - * gating: if the registry contains only the terminal tool, agents are - * allowed to call it as their first action (e.g. reporter sub-agents). - * If the registry contains other tools, the first call must be - * non-terminal to prevent agents from reporting without doing work. - */ - tools: Map; - /** Sampling parameters applied to all agents */ - params?: SamplingParams; - /** Maximum tool-call turns per agent before forced termination */ - maxTurns?: number; - /** Tool name that signals agent completion. When the model calls this tool, - * findings are extracted from arguments and the agent is marked done. - * The tool is intercepted — never dispatched to execute(). If omitted, - * agents complete only via stop token or hard-cut. */ - terminalTool?: string; - /** Enable per-token entropy/surprisal on `agent:produce` events */ - trace?: boolean; - /** KV pressure thresholds — tune per pool. Reporter pools typically use - * lower thresholds than research pools since they complete in a single - * terminal tool call. See {@link PressureThresholds} for tuning guidance. 
*/ - pressure?: PressureThresholds; -} - -/** - * Result for a single completed agent - * - * @category Agents - */ -export interface AgentResult { - /** Stable agent identifier (branch handle at creation time) */ - agentId: number; - /** Parent branch handle — shared root for top-level agents, parent agentId for sub-agents */ - parentAgentId: number; - /** The agent's branch — still alive when returned from {@link useAgentPool} */ - branch: Branch; - /** Agent's research findings (from terminal tool or final output), or null */ - findings: string | null; - /** Number of tool calls the agent made */ - toolCallCount: number; - /** Total tokens generated by this agent */ - tokenCount: number; - /** Model-level perplexity at completion (exp of mean NLL from raw logits) */ - ppl: number; - /** Sampling-level perplexity at completion (from filtered distribution) */ - samplingPpl: number; - /** Per-token trace data (present only when {@link AgentPoolOptions.trace} is true) */ - trace?: TraceToken[]; -} - -/** - * Aggregate result from a completed agent pool run - * - * Returned by both {@link useAgentPool} and {@link runAgents}. Contains - * per-agent results plus aggregate statistics for display and telemetry. 
- * - * @category Agents - */ -export interface AgentPoolResult { - /** Per-agent results in task order */ - agents: AgentResult[]; - /** Sum of all agent token counts */ - totalTokens: number; - /** Sum of all agent tool calls */ - totalToolCalls: number; - /** Number of batched commit steps in the tick loop */ - steps: number; - /** Internal performance counters for telemetry */ - counters: { - /** Number of batch prefill calls for tool result injection */ - warmPrefillCalls: number; - /** Total branches across all warm prefill batches */ - warmPrefillBranches: number; - /** Ticks where no agent was generating (all awaiting tools) */ - stalledTicks: number; - /** Peak concurrent tool executions */ - maxConcurrentTools: number; - /** Ticks spent idle-waiting via action() */ - idleTicks: number; - }; -} - -// ── Generate types ───────────────────────────────────────────── - -/** - * Options for single-branch {@link generate} - * - * @category Agents - */ -export interface GenerateOptions { - /** Pre-formatted prompt string (from `formatChat()` + `tokenize()`) */ - prompt: string; - /** GBNF grammar string for constrained generation */ - grammar?: string; - /** Sampling parameters */ - params?: SamplingParams; - /** Optional parser applied to the raw output string */ - parse?: (output: string) => unknown; -} - -/** - * Result from single-branch {@link generate} - * - * @category Agents - */ -export interface GenerateResult { - /** Raw generated text */ - output: string; - /** Number of tokens generated */ - tokenCount: number; - /** Parsed output (present only when `parse` was provided in options) */ - parsed?: T; -} - -// ── Diverge types ────────────────────────────────────────────── - -/** - * Options for multi-branch {@link diverge} - * - * Either `parent` or `prompt` must be provided. When `parent` is given, - * branches fork from it and no new root is created. When only `prompt` - * is given, a fresh root is created, prefilled, and cleaned up on error. 
- * - * @category Agents - */ -export interface DivergeOptions { - /** Pre-formatted prompt for creating a fresh root (mutually exclusive with parent) */ - prompt?: string; - /** Number of parallel generation attempts */ - attempts: number; - /** Parent branch to fork from (mutually exclusive with prompt) */ - parent?: Branch; - /** Sampling parameters for all attempts */ - params?: SamplingParams; -} - -/** - * Single attempt result from {@link diverge} - * - * @category Agents - */ -export interface DivergeAttempt { - /** The attempt's branch (only the best branch survives after diverge) */ - branch: Branch; - /** Generated text for this attempt */ - output: string; - /** Number of tokens generated */ - tokenCount: number; - /** Model perplexity — lower indicates more coherent generation */ - ppl: number; -} - -/** - * Aggregate result from {@link diverge} - * - * The `best` branch is still alive; all other attempt branches have been - * pruned. The caller owns cleanup — typically via {@link Session.promote} - * to make the best branch the new conversation trunk. - * - * @category Agents - */ -export interface DivergeResult { - /** Lowest-perplexity branch — still alive, caller owns cleanup */ - best: Branch; - /** Text output from the best attempt */ - bestOutput: string; - /** All attempts (losers already pruned, branches disposed) */ - attempts: DivergeAttempt[]; - /** Sum of all attempt token counts */ - totalTokens: number; - /** Number of batched commit steps */ - steps: number; - /** Shared prefix length in tokens (for KV savings calculation) */ - prefixLength: number; -} - -// ── Runtime events ───────────────────────────────────────────── - -/** - * Events emitted by the runtime during agent pool execution - * - * Subscribe to these via the `events` channel from {@link initAgents}. - * Harnesses can extend this union with phase-level events for display. 
- * - * @category Agents - */ -export type AgentEvent = - | { type: 'agent:spawn'; agentId: number; parentAgentId: number } - | { type: 'agent:produce'; agentId: number; text: string; tokenCount: number; entropy?: number; surprisal?: number } - | { type: 'agent:tool_call'; agentId: number; tool: string; args: string } - | { type: 'agent:tool_result'; agentId: number; tool: string; result: string } - | { type: 'agent:tool_progress'; agentId: number; tool: string; filled: number; total: number } - | { type: 'agent:report'; agentId: number; findings: string } - | { type: 'agent:done'; agentId: number }; diff --git a/src/index.ts b/src/index.ts index 1ee3f6d..abc300d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -30,18 +30,15 @@ */ import type { - ContextOptions, GpuVariant, LoadOptions, NativeBinding, - SessionContext, } from './types'; -import { Branch } from './Branch'; -import { BranchStore } from './BranchStore'; -import { Session } from './Session'; -import { buildUserDelta, buildToolResultDelta } from './agents/deltas'; -import { Rerank } from './Rerank'; +import type { + ContextOptions, + SessionContext, +} from '@lloyal-labs/sdk'; /** * Platform package naming: @lloyal-labs/lloyal.node-{platform}-{arch}[-{gpu}] @@ -251,10 +248,34 @@ export const createContext = async ( return binary.createContext(options); }; -// ── Layer 1: Substrate (unchanged) ────────────────────────────── -export { Branch, BranchStore, Session, buildUserDelta, buildToolResultDelta, Rerank }; +// ── Re-export from @lloyal-labs/sdk ────────────────────────────── +export { Branch, BranchStore, Session, Rerank, buildUserDelta, buildToolResultDelta } from '@lloyal-labs/sdk'; + +export { PoolingType, CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, ReasoningFormat, GrammarTriggerType } from '@lloyal-labs/sdk'; +export type { ChatFormat } from '@lloyal-labs/sdk'; +export type { + ContextOptions, + FormatChatOptions, + GrammarTrigger, + FormattedChatResult, + ParseChatOutputOptions, + 
ParsedToolCall, + ParseChatOutputResult, + PenaltyParams, + MirostatParams, + DryParams, + XtcParams, + AdvancedSamplingParams, + SamplingParams, + SessionContext, + Produced, + RerankOptions, + RerankResult, + RerankProgress, + KvCacheType, +} from '@lloyal-labs/sdk'; -// ── Layer 2: Agents (structured concurrency) ──────────────────── +// ── Re-export from @lloyal-labs/lloyal-agents ──────────────────── export { Ctx, Store, Events, Tool, @@ -265,7 +286,7 @@ export { createToolkit, initAgents, withSharedRoot, -} from './agents/index'; +} from '@lloyal-labs/lloyal-agents'; export type { Toolkit, @@ -284,32 +305,7 @@ export type { DivergeAttempt, DivergeResult, AgentEvent, -} from './agents/index'; +} from '@lloyal-labs/lloyal-agents'; -// ── Enums + types from types.ts ───────────────────────────────── -export { PoolingType, CHAT_FORMAT_CONTENT_ONLY, CHAT_FORMAT_GENERIC, ReasoningFormat, GrammarTriggerType } from './types'; -export type { ChatFormat } from './types'; -export type { - GpuVariant, - KvCacheType, - LoadOptions, - ContextOptions, - FormatChatOptions, - GrammarTrigger, - FormattedChatResult, - ParseChatOutputOptions, - ParsedToolCall, - ParseChatOutputResult, - PenaltyParams, - MirostatParams, - DryParams, - XtcParams, - AdvancedSamplingParams, - SamplingParams, - SessionContext, - Produced, - RerankOptions, - RerankResult, - RerankProgress, - NativeBinding, -} from './types'; +// ── Native-only types (stay in lloyal.node) ────────────────────── +export type { GpuVariant, LoadOptions, NativeBinding } from './types'; diff --git a/src/types.ts b/src/types.ts index aa97a19..69ac7a1 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,23 +1,12 @@ /** - * liblloyal-node TypeScript Definitions + * liblloyal-node — native-only type definitions * - * N-API bindings for liblloyal - Node.js native addon for llama.cpp inference - * - * @categoryDescription Core - * Entry points, context lifecycle, and the main inference interface. 
- * - * @categoryDescription Sampling - * Sampler chain configuration — temperature, penalties, nucleus sampling, and advanced filters. - * - * @categoryDescription Chat - * Chat template formatting, output parsing, tool calls, and reasoning extraction. - * - * @categoryDescription Branching - * Parallel and tree-structured generation with batched GPU dispatch. + * Types specific to the Node.js native addon (binary loading, GPU variant + * selection). All inference primitives and shared types are in + * {@link @lloyal-labs/sdk | @lloyal-labs/sdk}. */ -import type { Branch } from './Branch'; -import type { BranchStore } from './BranchStore'; +import type { ContextOptions, SessionContext } from '@lloyal-labs/sdk'; /** * GPU variant for binary loading @@ -34,16 +23,6 @@ import type { BranchStore } from './BranchStore'; */ export type GpuVariant = 'default' | 'cuda' | 'vulkan'; -/** - * Supported KV cache quantization types - * - * Matches llama.cpp CLI `-ctk` / `-ctv` flags. - * Lower precision = less GPU memory, slight quality tradeoff. - * - * @category Core - */ -export type KvCacheType = 'f32' | 'f16' | 'bf16' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1'; - /** * Options for binary loading * @@ -62,1474 +41,10 @@ export interface LoadOptions { * * If the requested variant is unavailable (missing runtime libraries), * automatically falls back to CPU with a console warning. 
- * - * @example - * ```typescript - * // Request CUDA with automatic fallback to CPU - * const ctx = await createContext( - * { modelPath: './model.gguf' }, - * { gpuVariant: 'cuda' } - * ); - * ``` */ gpuVariant?: GpuVariant; } -/** - * Pooling type for embedding extraction - * - * @category Core - */ -export enum PoolingType { - /** No pooling - raw per-token embeddings */ - NONE = 0, - /** Mean pooling - average of all token embeddings */ - MEAN = 1, - /** CLS pooling - use first token embedding */ - CLS = 2, - /** Last token pooling - use last token embedding */ - LAST = 3, - /** Rank pooling - classification head output for reranking models */ - RANK = 4, -} - -/** - * Chat format detected by the template engine - * - * Identifies how the model formats tool calls, reasoning blocks, and content. - * Opaque chat format identifier returned by - * {@link SessionContext.formatChat | formatChat()} and consumed by - * {@link SessionContext.parseChatOutput | parseChatOutput()}. - * - * Maps 1:1 to llama.cpp's `common_chat_format` enum (30+ values). - * Treat as an opaque number — pass through, don't switch on it. - * - * @category Chat - */ -export type ChatFormat = number; - -/** Model template has no tool/structured-output support. */ -export const CHAT_FORMAT_CONTENT_ONLY: ChatFormat = 0; - -/** llama.cpp's generic JSON fallback — imposes format the model wasn't trained on. */ -export const CHAT_FORMAT_GENERIC: ChatFormat = 1; - -/** - * Reasoning/thinking block format - * - * Controls how `` blocks are handled during formatting and parsing. 
- * - * @see {@link FormatChatOptions.reasoningFormat} for input-side usage - * @see {@link ParseChatOutputOptions.reasoningFormat} for output-side usage - * - * @category Chat - */ -export enum ReasoningFormat { - /** No reasoning extraction (default) */ - NONE = 0, - /** Auto-detect reasoning format from model template */ - AUTO = 1, - /** DeepSeek legacy format (`...` in content) */ - DEEPSEEK_LEGACY = 2, - /** DeepSeek format (structured reasoning extraction) */ - DEEPSEEK = 3, -} - -/** - * Grammar trigger type - * - * Determines how lazy grammar activation is triggered during generation. - * - * @see {@link GrammarTrigger} - * @see {@link FormattedChatResult.grammarTriggers} - * - * @category Chat - */ -export enum GrammarTriggerType { - /** Trigger on a specific token ID */ - TOKEN = 0, - /** Trigger on a word boundary match */ - WORD = 1, - /** Trigger on a regex pattern match */ - PATTERN = 2, - /** Trigger on a full-string regex pattern match */ - PATTERN_FULL = 3, -} - -/** - * Configuration for context creation - * - * Controls the resource envelope for inference: context window size (`nCtx`), - * batch throughput (`nBatch`), compute parallelism (`nThreads`), and - * multi-sequence capacity (`nSeqMax`). These map directly to - * `llama_context_params` and are fixed for the context's lifetime. - * - * Key tradeoffs: - * - **nCtx**: Larger = longer conversations, but linear KV memory growth. - * - **nBatch**: Larger = faster prompt prefill (more tokens per GPU dispatch), - * but higher peak memory. Also sets the bin-packing capacity for - * {@link BranchStore.prefill}. - * - **nSeqMax**: Set ≥ your max concurrent branch count + 1 (root sequence). - * Each sequence shares the same KV cache memory pool — cost is metadata only - * under unified KV, not a per-sequence memory multiplier. 
- * - * @category Core - */ -export interface ContextOptions { - /** Path to .gguf model file */ - modelPath: string; - - /** Context size (default: 2048) */ - nCtx?: number; - - /** Number of threads (default: 4) */ - nThreads?: number; - - /** - * Batch size for token processing - * - * Controls how many tokens are processed per llama_decode call. - * Higher values improve throughput for prompt prefill at the cost of memory. - * Also sets llama_context_params.n_batch and n_ubatch at context creation. - * Default: 512 - */ - nBatch?: number; - - /** - * Enable embedding extraction mode - * - * When true, context is optimized for embedding extraction. - * Use with encode() and getEmbeddings() methods. - * Default: false (text generation mode) - */ - embeddings?: boolean; - - /** - * Pooling type for embedding extraction - * - * Only relevant when embeddings=true. - * Default: MEAN for embedding contexts, NONE otherwise - */ - poolingType?: PoolingType; - - /** - * Maximum number of sequences for multi-sequence support - * - * Set > 1 to enable multiple independent KV cache sequences. - * Useful for parallel decoding or conversation branching. - * Default: 1 (single sequence) - */ - nSeqMax?: number; - - /** - * KV cache data type for keys - * - * Quantize the key cache to reduce GPU memory. For a Q4_K_M model, - * F16 cache wastes precision — Q8_0 halves memory with minimal quality loss. - * - * Memory at nCtx=8192 (Qwen3-4B, 36 layers, 8 KV heads, 128 dim): - * f16: 1152 MB q8_0: ~576 MB q4_0: ~288 MB - * - * Default: 'f16' - */ - typeK?: KvCacheType; - - /** - * KV cache data type for values - * - * Same options as typeK. V cache is slightly more quality-sensitive than K. - * Default: 'f16' - */ - typeV?: KvCacheType; -} - -/** - * Options for chat template formatting - * - * Controls format-awareness fields passed to the chat template engine. - * All fields are optional -- sensible defaults are used when omitted. 
- * - * @example With tools and reasoning - * ```typescript - * const result = await ctx.formatChat(messagesJson, { - * tools: JSON.stringify(tools), - * toolChoice: 'auto', - * reasoningFormat: 'auto', - * }); - * ``` - * - * @category Chat - */ -export interface FormatChatOptions { - /** Custom Jinja2 template override (bypasses model's built-in template) */ - templateOverride?: string; - - /** - * JSON array of OpenAI-format tool definitions - * - * @example - * ```typescript - * const tools = [{ type: 'function', function: { - * name: 'get_weather', - * description: 'Get current weather', - * parameters: { type: 'object', properties: { location: { type: 'string' } } } - * }}]; - * options.tools = JSON.stringify(tools); - * ``` - */ - tools?: string; - - /** Tool choice strategy (default: "auto") */ - toolChoice?: 'auto' | 'required' | 'none'; - - /** Allow parallel tool calls (default: false) */ - parallelToolCalls?: boolean; - - /** - * Reasoning format (default: "none") - * - * Controls `` block handling in the template. - * Use "auto" to let the model's template decide. - */ - reasoningFormat?: 'none' | 'auto' | 'deepseek' | 'deepseek_legacy'; - - /** Enable `` blocks (default: true). Pairs with reasoningFormat. */ - enableThinking?: boolean; - - /** - * JSON schema for constrained output. Converted to GBNF grammar internally. - * Mutually exclusive with `grammar`. - * - * @see {@link SessionContext.jsonSchemaToGrammar} - */ - jsonSchema?: string; - - /** - * Explicit GBNF grammar string for constrained generation. - * Mutually exclusive with `jsonSchema`. - */ - grammar?: string; - - /** - * Append assistant prompt prefix (default: true). - * Set false when formatting partial conversations or for - * non-generation use cases like template validation. - */ - addGenerationPrompt?: boolean; -} - -/** - * Grammar trigger from format-aware chat template - * - * Defines conditions for lazy grammar activation. 
When `grammarLazy` is true - * in {@link FormattedChatResult}, generation runs unconstrained until one of - * these triggers fires, at which point the grammar is activated. - * - * @category Chat - */ -export interface GrammarTrigger { - /** Trigger type */ - type: GrammarTriggerType; - /** Trigger value (token text, word, or regex pattern depending on type) */ - value: string; - /** Token ID (for TOKEN-type triggers, -1 when not applicable) */ - token: number; -} - -/** - * Result from chat template formatting - * - * Includes format-awareness fields for proper output parsing. - * Pass `format` and `reasoningFormat` directly to - * {@link SessionContext.parseChatOutput | parseChatOutput()} to decode - * the model's response. - * - * @example Roundtrip: format -> generate -> parse - * ```typescript - * const fmt = await ctx.formatChat(messagesJson, { tools: toolsJson }); - * // ... generate tokens using fmt.prompt and fmt.grammar ... - * const parsed = ctx.parseChatOutput(output, fmt.format, { - * reasoningFormat: fmt.reasoningFormat, - * thinkingForcedOpen: fmt.thinkingForcedOpen, - * parser: fmt.parser, - * }); - * ``` - * - * @see {@link SessionContext.parseChatOutput} - * - * @category Chat - */ -export interface FormattedChatResult { - /** Formatted prompt string ready for tokenization */ - prompt: string; - /** Additional stop strings from the template */ - stopTokens: string[]; - - /** - * Detected chat format (pass to parseChatOutput) - * @see {@link SessionContext.parseChatOutput} - */ - format: ChatFormat; - - /** Grammar string for constrained generation (empty if no tools/schema) */ - grammar: string; - /** Whether grammar should be applied lazily (only after triggers fire) */ - grammarLazy: boolean; - /** Whether the thinking tag was forced open by the template */ - thinkingForcedOpen: boolean; - - /** - * Reasoning format (pass to parseChatOutput options) - * @see {@link ParseChatOutputOptions.reasoningFormat} - */ - reasoningFormat: ReasoningFormat; 
- - /** PEG parser definition for PEG format models (pass to parseChatOutput options) */ - parser: string; - /** Grammar trigger conditions for lazy grammar activation */ - grammarTriggers: GrammarTrigger[]; - /** Token strings preserved from grammar masking */ - preservedTokens: string[]; -} - -/** - * Options for parsing chat output - * - * All fields are optional. For correct parsing, pass through the corresponding - * fields from {@link FormattedChatResult}. - * - * @see {@link FormattedChatResult} - * - * @category Chat - */ -export interface ParseChatOutputOptions { - /** - * Reasoning format (from {@link FormattedChatResult.reasoningFormat}) - */ - reasoningFormat?: ReasoningFormat; - - /** - * True if output is incomplete (streaming). - * When true, the parser tolerates unterminated tool calls and open - * thinking blocks, returning partial content as-is rather than - * treating them as parse errors. - */ - isPartial?: boolean; - - /** Whether thinking tag was forced open (from {@link FormattedChatResult.thinkingForcedOpen}) */ - thinkingForcedOpen?: boolean; - - /** PEG parser definition for PEG format models (from {@link FormattedChatResult.parser}) */ - parser?: string; -} - -/** - * A tool call extracted from model output - * - * @example - * ```typescript - * for (const tc of result.toolCalls) { - * const args = JSON.parse(tc.arguments); - * await executeTool(tc.name, args); - * } - * ``` - * - * @category Chat - */ -export interface ParsedToolCall { - /** Tool/function name */ - name: string; - /** JSON string of arguments */ - arguments: string; - /** Tool call ID (may be empty depending on model format) */ - id: string; -} - -/** - * Result from parsing chat output - * - * @example - * ```typescript - * const result = ctx.parseChatOutput(output, fmt.format); - * if (result.toolCalls.length > 0) { - * for (const tc of result.toolCalls) { - * const args = JSON.parse(tc.arguments); - * await executeTool(tc.name, args); - * } - * } else { - * 
console.log(result.content); - * } - * ``` - * - * @category Chat - */ -export interface ParseChatOutputResult { - /** Main response text */ - content: string; - /** - * Extracted thinking/reasoning content (empty string if none). - * For thinking models (e.g. Qwen3), this contains the text inside - * `...` blocks. Store as `reasoning_content` in your - * messages array so formatChat() can reconstruct the template correctly - * on subsequent turns. - */ - reasoningContent: string; - /** Extracted tool calls (empty array if none) */ - toolCalls: ParsedToolCall[]; -} - -/** - * Penalty parameters for repetition control - * - * @category Sampling - */ -export interface PenaltyParams { - /** Repetition penalty (1.0 = disabled, >1.0 = penalize repeats) */ - repeat?: number; - - /** Frequency penalty (0.0 = disabled) */ - frequency?: number; - - /** Presence penalty (0.0 = disabled) */ - presence?: number; - - /** Tokens to consider for penalties (-1 = context size) */ - lastN?: number; -} - -/** - * Mirostat sampling configuration - * - * Mirostat dynamically adjusts sampling to maintain target perplexity, - * preventing both repetition and incoherence. Useful for long-form generation - * where temperature alone produces inconsistent quality. - * - * Use Mirostat v2 (mode: 2) for most cases - it's more stable than v1. - * - * @category Sampling - */ -export interface MirostatParams { - /** Mirostat mode (0 = disabled, 1 = v1, 2 = v2). Recommended: 2 */ - mode?: number; - - /** Target entropy (perplexity = exp(tau)). Default: 5.0. Lower = more focused */ - tau?: number; - - /** Learning rate for entropy adjustment. Default: 0.1. Higher = faster adaptation */ - eta?: number; -} - -/** - * DRY (Don't Repeat Yourself) sampling parameters - * - * Penalizes repetition of token sequences, more sophisticated than - * simple repetition penalty. Useful for reducing loops and redundancy - * in generated text. 
- * - * @category Sampling - */ -export interface DryParams { - /** Penalty strength (0.0 = disabled, higher = stronger penalty) */ - multiplier?: number; - - /** Base penalty value (typically 1.75) */ - base?: number; - - /** Minimum sequence length to trigger penalty (typically 2) */ - allowedLength?: number; - - /** Number of recent tokens to scan for repetitions */ - penaltyLastN?: number; -} - -/** - * XTC (eXclude Top Choices) sampler parameters - * - * Excludes very high probability tokens to increase output diversity. - * Useful when model is overly confident and produces repetitive text. - * - * @category Sampling - */ -export interface XtcParams { - /** Probability of applying XTC (0.0 = disabled, 1.0 = always). Typical: 0.1 */ - probability?: number; - - /** Confidence threshold above which tokens are excluded. Typical: 0.1 */ - threshold?: number; -} - -/** - * Advanced sampling parameters - * - * @category Sampling - */ -export interface AdvancedSamplingParams { - /** Locally typical sampling (1.0 = disabled) */ - typicalP?: number; - - /** Mirostat sampling configuration */ - mirostat?: MirostatParams; - - /** DRY (Don't Repeat Yourself) sampling */ - dry?: DryParams; - - /** XTC sampler */ - xtc?: XtcParams; -} - -/** - * Sampling parameters for token generation - * - * Configures the sampler chain — a pipeline of composable filters and - * transforms applied to raw logits before token selection. The chain is - * built once at branch creation and persists across decode steps - * (penalty state accumulates, PRNG advances). - * - * **Chain order**: penalties → top_k → typical_p → top_p → min_p → - * temperature → dist (stochastic) or greedy (temperature ≤ 0). - * - * For tree search, each {@link Branch} owns an independent clone of the - * chain. `reseedSampler()` replaces the terminal dist sampler's PRNG seed - * so forked branches diverge. Greedy chains (temperature ≤ 0) are - * deterministic and unaffected by reseeding. 
- * - * Common presets: - * - Factual/Precise: `{ temperature: 0.1 }` - * - Balanced: `{ temperature: 0.7 }` - * - Creative: `{ temperature: 1.0 }` - * - Deterministic greedy: `{ temperature: 0, topK: 0, topP: 1.0, minP: 0 }` - * - * @category Sampling - */ -export interface SamplingParams { - // ===== COMMON CONTROLS ===== - - /** Randomness (0.0 = always most likely, 2.0 = very random) */ - temperature?: number; - - /** Only consider top K most likely tokens (0 = disabled) */ - topK?: number; - - /** Nucleus sampling threshold (1.0 = disabled) */ - topP?: number; - - /** Minimum probability threshold */ - minP?: number; - - /** Random seed for reproducible generation (-1 = random) */ - seed?: number; - - /** GBNF grammar string for constrained generation */ - grammar?: string; - - // ===== GROUPED CONTROLS ===== - - /** Penalty parameters for repetition control */ - penalties?: PenaltyParams; - - /** Advanced sampling parameters */ - advanced?: AdvancedSamplingParams; -} - -/** - * Inference context — the runtime surface for a loaded model - * - * A SessionContext owns a llama_context (KV cache + compute graph) bound to a - * shared model. It provides tokenization, logit access, KV cache management, - * chat template formatting, and embedding extraction. - * - * **All generation flows through {@link Branch}.** Create a branch at position 0, - * prefill prompt tokens, then use the produce/commit loop or async iterator: - * - * ```typescript - * const branch = Branch.create(ctx, 0, { temperature: 0.7 }); - * await branch.prefill(promptTokens); - * for await (const { token, text } of branch) { - * process.stdout.write(text); - * } - * ``` - * - * For tree-structured generation (best-of-N, beam search, speculative - * decoding), use {@link Branch.fork} and {@link BranchStore} — they manage - * per-branch KV sequences, sampler chains, and logits snapshots with O(1) - * GPU dispatches via batched decode. 
- * - * **Logits**: For branch-level logits, use {@link Branch.getLogits} which - * returns an independent copy of the branch's snapshot. For metrics, use - * {@link Branch.modelEntropy} and {@link Branch.modelSurprisal} which - * operate directly on the branch's logits without JS round-trips. - * - * **KV cache**: Supports multi-sequence operation (`nSeqMax > 1`), per-sequence - * copy/clear/eviction, file-based persistence, and context compression via - * `clearAndReseed()`. - * - * **Chat templates**: `formatChat()` and `parseChatOutput()` handle the full - * round-trip of chat formatting, including tool calls, reasoning blocks, and - * grammar-constrained generation — using the model's native Jinja template. - * - * Use {@link createContext} to initialize, and `dispose()` when done to free - * GPU/CPU memory. - * - * @category Core - */ -export interface SessionContext { - - /** - * Convert token ID to text piece - * - * Fast synchronous lookup in vocabulary table. - * Call this on each generated token for streaming display. - * - * Optimized for per-token conversion during generation. - * For batch conversion of many tokens, use detokenize() instead. - * - * Cost: ~0.05ms - * - * @param token Token ID - * @returns Text string for this token - */ - tokenToText(token: number): string; - - /** - * Check if token is a model stop token - * - * Returns true for built-in end-of-generation tokens: - * - (Llama 2) - * - <|endoftext|> (GPT) - * - <|eot_id|> (Llama 3) - * - Model-specific EOS tokens - * - * Note: This checks vocabulary stop tokens, not custom stop sequences. - * For custom stops (e.g., "\n\n", "###"), compare generated text - * against your stop strings in application code. - * - * Cost: <0.01ms (fast vocabulary lookup) - * - * @param token Token ID to check - */ - isStopToken(token: number): boolean; - - /** - * Get the model's end-of-generation token ID - * - * Returns the EOT token (e.g. <|im_end|> for ChatML), falling back - * to EOS (e.g. 
) for Zephyr-style models. This is the inverse - * of isStopToken() — "what IS the stop token?" vs "is this a stop token?" - * - * Use case: warm multi-turn continuation prepends this token to close - * the previous assistant turn before injecting new user content. - * - * @returns Token ID (integer) - * @throws If model has neither EOT nor EOS token - */ - getEogToken(): number; - - /** - * Get the model's turn separator token IDs - * - * Returns the tokens that close an assistant turn and transition to the - * next message, as determined by the model's chat template. Computed once - * per model, cached. - * - * For ChatML templates: [im_end_id, newline_id] (e.g., [2, 198]) - * For Llama 3 templates: [eot_id] (e.g., [128009]) - * - * Use case: warm multi-turn prefill to achieve exact parity with cold path. - * - * @returns Array of token IDs (cached after first call) - * - * @example - * ```typescript - * const separator = ctx.getTurnSeparator(); - * console.log(separator.map(t => ctx.tokenToText(t)).join('')); // "<|im_end|>\n" - * - * // Warm prefill with exact cold/warm parity - * const deltaTokens = await ctx.tokenize(deltaPrompt, false); - * await branch.prefill([...separator, ...deltaTokens]); - * ``` - */ - getTurnSeparator(): number[]; - - // ===== PROMPT PREPARATION ===== - - /** - * Tokenize text into model's vocabulary - * - * Converts human text → token IDs for decode(). - * Same text always produces same tokens for a given model. - * - * Cost: ~1ms per 100 characters - * - * @param text Text to tokenize - * @param addSpecial Whether to add special tokens (BOS/EOS). Defaults to - * model metadata setting (typically true). Pass false for mid-sequence - * tokenization (e.g., warm multi-turn continuation deltas). 
- * @returns Array of token IDs - * @example - * ```typescript - * // Full sequence (default — includes BOS) - * const tokens = await ctx.tokenize("Hello world"); - * - * // Mid-sequence delta (no BOS) - * const delta = await ctx.tokenize("continuation text", false); - * ``` - */ - tokenize(text: string, addSpecial?: boolean): Promise; - - /** - * Tokenize text into model's vocabulary (sync — inline on main thread) - * - * Same as {@link tokenize} but synchronous. Use from Effection generators - * to avoid `yield* call()` overhead for CPU-only work. - * - * @param text Text to tokenize - * @param addSpecial Whether to add special tokens (BOS/EOS). Defaults to - * model metadata setting (typically true). Pass false for mid-sequence - * tokenization. - * @returns Array of token IDs - */ - tokenizeSync(text: string, addSpecial?: boolean): number[]; - - /** - * Detokenize array of tokens back to text - * - * Inverse of tokenize(). Use for reconstructing complete text - * from token sequences (e.g., after KV cache operations). - * - * Optimized for batch conversion of many tokens. - * For single-token conversion during generation, use tokenToText(). - * - * Cost: ~1ms per 100 tokens - * - * @param tokens Array of token IDs - * @returns Complete text representation - * @example - * ```typescript - * const tokens = [15496, 1917]; // "Hello world" - * const text = await ctx.detokenize(tokens); - * console.log(text); // "Hello world" - * ``` - */ - detokenize(tokens: number[]): Promise; - - // ===== KV CACHE MANAGEMENT ===== - - /** - * Get max position in the KV cache for a sequence - * - * Returns the highest position index in the specified sequence, - * or -1 if the sequence is empty. This is the same value as - * {@link kvSeqPosMax}. To get the token count, add 1. - * - * Think of this as: "How much has the model read so far?" 
- * - * Cost: <0.01ms (fast sync operation - safe to call frequently) - * - * @param sequenceId Sequence ID (defaults to 0 for single conversation) - * @returns Highest position index, or -1 if empty - */ - kvCacheSize(sequenceId?: number): number; - - /** - * Remove token range from KV cache - * - * Deletes tokens from model's memory. Use cases: - * - Removing old context when hitting limit (sliding window) - * - Implementing conversation pruning - * - Forgetting specific messages - * - Preparing for injection of new context - * - * CRITICAL: Call BEFORE next decode(), not after! - * The model needs to know about the removal before processing new tokens. - * - * Cost: ~1-5ms depending on range - * - * @param sequenceId Sequence ID (use 0 for single sequence) - * @param start Start position (inclusive) - * @param end End position (exclusive), -1 = to end - */ - kvCacheRemove(sequenceId: number, start: number, end: number): Promise; - - /** - * Snapshot KV cache state for branching/undo - * - * Serializes entire model state to Buffer. - * Restore later with kvCacheLoad() for: - * - Conversation branching ("what if I said X instead?") - * - Undo/redo functionality - * - Checkpointing long conversations - * - * Size: ~500MB-2GB depending on context length and model - * - * Cost: ~100-500ms depending on cache size - * - * @param sequenceId Sequence ID (use 0 for single sequence) - * @returns Serialized state buffer - */ - kvCacheSave(sequenceId?: number): Promise; - - /** - * Restore KV cache from previous snapshot - * - * Loads saved model state. Context returns to exact state - * when snapshot was taken. - * - * Cost: ~100-500ms depending on snapshot size - * - * @param sequenceId Sequence ID (use 0 for single sequence) - * @param state Buffer from kvCacheSave() - * @example - * ```typescript - * const snapshot = await ctx.kvCacheSave(0); - * - * // ... many operations later ... 
- * - * // Restore to saved state - * await ctx.kvCacheLoad(0, snapshot); - * ``` - */ - kvCacheLoad(sequenceId: number, state: Buffer): Promise; - - /** - * Clear all KV cache (fresh start) - * - * Removes all cached tokens. Model returns to initial state - * as if no text has been processed. - * - * Use when starting a completely new conversation. - * - * Cost: ~1ms - */ - kvCacheClear(): Promise; - - /** - * Blink KV — cache-local reconstruction for bounded-memory streaming - * - * Implements the [Blink KV](https://github.com/lloyal-ai/blink-kv/blob/main/blink_kv.pdf) - * protocol (Naqvi, 2026): when the KV cache fills, clear it entirely and - * re-decode retained tokens at contiguous positions `[0, 1, ..., N-1]`. - * This achieves cache-local position IDs — the operative requirement for - * stable bounded-memory streaming — without backend-specific knowledge of - * key storage format. Works on post-RoPE engines (where StreamingLLM's - * pos-shift is unavailable) and any backend exposing `clear()` + `decode()`. - * - * **Why not naive eviction?** Selective eviction (`kvCacheRemove`) preserves - * original position IDs, which grow without bound. Across 5 architectures, - * naive eviction produces PPL spanning 3 orders of magnitude — ranging from - * 1.15x baseline (Llama, lucky config) to 198x (Phi, sinks present). - * Under Blink KV reconstruction, all 5 converge to 3-16% of baseline. - * - * **Sinks are optional.** Under reconstruction, the 0+N (sinkless) config - * matches 4+N (with sinks) within <2% across all tested architectures. - * Pass an empty sinks array if you don't need them. - * - * **Algorithm:** - * 1. Clear entire KV cache (zero fragmentation) - * 2. Re-decode `sinks` at position 0 (optional attention anchors) - * 3. Re-decode `tail` at position `sinks.length` (recent context) - * - * **Cost:** Re-decodes `sinks.length + tail.length` tokens. 
At per-boundary - * trigger (reconstruct when cache reaches `nCtx`), amortized cost is - * O(cacheSize / interval) decode ops per token — ~0.14 at typical settings. - * - * @param sinks First N tokens from conversation start (typically 4, or empty). - * Must be the same tokens every reseed — reusing different tokens degrades - * any attention-sink patterns the model may have learned for early positions. - * @param tail Recent M tokens to preserve (typically 252-1020) - * @returns Promise that resolves when reconstruction completes. - * Next decode continues at position `sinks.length + tail.length`. - * - * @example Per-boundary reconstruction - * ```typescript - * // Capture sinks once at conversation start - * const SINKS = allTokens.slice(0, 4); - * - * // On cache fill: compress to 512 tokens (4 sinks + 508 tail) - * if (position >= ctx.nCtx) { - * const tail = allTokens.slice(-508); - * await ctx.clearAndReseed(SINKS, tail); - * position = 512; // sinks.length + tail.length - * } - * ``` - * - * @example Sinkless reconstruction (equally effective) - * ```typescript - * const tail = allTokens.slice(-256); - * await ctx.clearAndReseed([], tail); // No sinks needed - * position = 256; - * ``` - * - * @see [Blink KV paper](https://github.com/lloyal-ai/blink-kv/blob/main/blink_kv.pdf) - */ - clearAndReseed(sinks: number[], tail: number[]): Promise; - - // ===== KV SEQUENCE OPERATIONS ===== - - /** - * Fork a KV cache sequence — the primitive behind {@link Branch.fork} - * - * Copies all KV cache entries from `srcSeqId` to `dstSeqId`. Under - * llama.cpp's unified KV cache, this is a **metadata-only operation** — - * no key/value tensors are copied. Both sequences reference the same - * physical KV entries for the shared prefix; only tokens decoded after - * the fork point allocate new storage. 
This is what makes tree-structured - * generation (best-of-N, beam search, speculative decoding) memory-efficient: - * N branches sharing a 1000-token prefix cost ~1000 KV entries, not N*1000. - * - * The higher-level {@link Branch.fork} wraps this and additionally clones - * the sampler chain, grammar state, logits snapshot, and perplexity tracker. - * Use `kvSeqCopy` directly when you need raw sequence management without - * the Branch abstraction. - * - * NOTE: Only full-sequence copies are supported. The p0/p1 parameters - * must use default values (0 and -1). - * - * Cost: O(1) metadata — no tensor copy under unified KV - * - * @param srcSeqId Source sequence to copy from - * @param dstSeqId Destination sequence to copy to - * @param p0 Start position (must be 0, default: 0) - * @param p1 End position (must be -1 for full copy, default: -1) - */ - kvSeqCopy(srcSeqId: number, dstSeqId: number, p0?: number, p1?: number): void; - - /** - * Keep only specified sequence, remove all others - * - * Removes all sequences except the one specified. - * For complete cleanup of unwanted sequences, consider using - * kvCacheRemove(seqId, 0, -1) on each sequence instead. - * - * @param seqId Sequence ID to keep - */ - kvSeqKeep(seqId: number): void; - - /** - * Get max position in sequence - * - * Returns the highest position index in the specified sequence, - * or -1 if the sequence is empty. - * - * Cost: <0.01ms (fast sync operation) - * - * @param seqId Sequence ID to query - * @returns Max position index, or -1 if empty - * @example - * ```typescript - * const pos = ctx.kvSeqPosMax(0); - * if (pos === -1) { - * console.log('Sequence is empty'); - * } else { - * console.log(`Sequence has ${pos + 1} tokens`); - * } - * ``` - */ - kvSeqPosMax(seqId: number): number; - - // ===== KV CACHE FILE PERSISTENCE ===== - - /** - * Write KV cache state + tokens to file - * - * Persists KV cache state for later restoration. - * Useful for checkpointing long conversations. 
- * - * @param sequenceId Sequence ID to save - * @param filepath Path to save file - * @param tokens Tokens that were decoded into this sequence - * @returns Promise resolving to bytes written - */ - kvCacheWriteFile( - sequenceId: number, - filepath: string, - tokens: number[] - ): Promise; - - /** - * Read KV cache state + tokens from file - * - * Restores KV cache state from a previous kvCacheWriteFile call. - * - * @param sequenceId Sequence ID to restore to - * @param filepath Path to saved file - * @returns Promise resolving to tokens and bytes read - */ - kvCacheReadFile( - sequenceId: number, - filepath: string - ): Promise<{ tokens: number[]; bytesRead: number }>; - - // ===== HELPERS ===== - - /** - * Format messages using model's chat template - * - * Converts [{role, content}] -> formatted prompt string with full format awareness. - * Uses model's built-in template (ChatML, Llama, Mistral, etc.). - * - * The returned `format` and `reasoningFormat` fields should be passed to - * `parseChatOutput()` after generation to correctly decode the response. - * - * Cost: ~1-5ms depending on message count - * - * @param messagesJson JSON string containing array of messages - * @param options Formatting options (tools, reasoning, grammar, etc.) - * @returns Formatted prompt with format-awareness metadata - * - * @see {@link parseChatOutput} - * - * @example Basic usage - * ```typescript - * const result = await ctx.formatChat(JSON.stringify([ - * { role: "system", content: "You are a helpful assistant" }, - * { role: "user", content: "Hello!" } - * ])); - * - * const tokens = await ctx.tokenize(result.prompt); - * const branch = Branch.create(ctx, 0, { temperature: 0.7 }); - * await branch.prefill(tokens); - * ``` - */ - formatChat( - messagesJson: string, - options?: FormatChatOptions | string - ): Promise; - - /** - * Format messages using model's chat template (sync — inline on main thread) - * - * Same as {@link formatChat} but synchronous. 
Use from Effection generators - * to avoid `yield* call()` overhead for CPU-only work. - * - * @param messagesJson JSON string containing array of messages - * @param options Formatting options (tools, reasoning, grammar, etc.) - * @returns Formatted prompt with format-awareness metadata - */ - formatChatSync( - messagesJson: string, - options?: FormatChatOptions | string - ): FormattedChatResult; - - /** - * Parse model output into structured content - * - * Extracts plain text, reasoning/thinking blocks, and tool calls from - * raw model output. Uses the format detected by {@link formatChat} to apply - * the correct parser for the model's output format. - * - * Cost: <0.1ms (synchronous string parsing, no I/O) - * - * @param output Raw model output text - * @param format Chat format enum (from {@link FormattedChatResult.format}) - * @param options Optional parsing parameters - * @returns Parsed content with tool calls and reasoning - * - * @see {@link formatChat} - * - * @example Basic parsing - * ```typescript - * const fmt = await ctx.formatChat(JSON.stringify(messages), { tools: toolsJson }); - * // ... generate tokens ... - * const parsed = ctx.parseChatOutput(generatedText, fmt.format, { - * reasoningFormat: fmt.reasoningFormat, - * thinkingForcedOpen: fmt.thinkingForcedOpen, - * parser: fmt.parser - * }); - * if (parsed.toolCalls.length > 0) { - * // Handle tool calls - * } - * ``` - * - * @example Multi-turn warm continuation with reasoning models - * ```typescript - * // parseChatOutput separates ... blocks into reasoningContent. - * // This is REQUIRED for correct warm continuation on thinking models (e.g. Qwen3): - * // if raw output containing tags is stored as content, re-formatting - * // the conversation produces different tokens, breaking cold/warm parity. 
- * - * const messages: Array<{role: string; content: string; reasoning_content?: string}> = []; - * const sep = ctx.getTurnSeparator(); - * let branch: Branch | null = null; - * let fmt: FormattedChatResult; - * - * async function handleTurn(userContent: string) { - * messages.push({ role: 'user', content: userContent }); - * - * if (!branch) { - * // Cold path: format full conversation, tokenize with BOS, prefill - * fmt = await ctx.formatChat(JSON.stringify(messages)); - * const tokens = await ctx.tokenize(fmt.prompt); - * branch = Branch.create(ctx, 0, { temperature: 0.7 }); - * await branch.prefill(tokens); - * } else { - * // Warm path: string-diff for delta tokens - * const { prompt: full } = await ctx.formatChat(JSON.stringify(messages)); - * const { prompt: prefix } = await ctx.formatChat( - * JSON.stringify(messages.slice(0, -1)), - * { addGenerationPrompt: false } - * ); - * const delta = await ctx.tokenize(full.substring(prefix.length), false); - * await branch.prefill([...sep, ...delta]); - * } - * - * // Generate - * let rawOutput = ''; - * while (true) { - * const { token, text, isStop } = await branch.produce(); - * if (isStop) break; - * rawOutput += text; - * await branch.commit(token); - * } - * - * // Parse output: separates reasoning from content - * const parsed = ctx.parseChatOutput(rawOutput, fmt.format, { - * reasoningFormat: fmt.reasoningFormat, - * thinkingForcedOpen: fmt.thinkingForcedOpen, - * parser: fmt.parser - * }); - * - * // Store parsed fields — formatChat reconstructs thinking blocks correctly - * messages.push({ - * role: 'assistant', - * content: parsed.content, - * reasoning_content: parsed.reasoningContent || undefined - * }); - * } - * ``` - */ - parseChatOutput( - output: string, - format: ChatFormat, - options?: ParseChatOutputOptions - ): ParseChatOutputResult; - - /** - * Convert JSON schema to GBNF grammar - * - * Generates grammar string for constrained JSON generation. 
- * Use with {@link Branch.create} grammar parameter for constrained generation. - * - * Cost: ~1-10ms depending on schema complexity - * - * @param schemaJson JSON schema string - * @returns GBNF grammar string - * @example - * ```typescript - * const schema = { - * type: "object", - * properties: { - * name: { type: "string" }, - * age: { type: "number" } - * }, - * required: ["name"] - * }; - * - * const grammar = await ctx.jsonSchemaToGrammar(JSON.stringify(schema)); - * const branch = Branch.create(ctx, 0, params, undefined, grammar); - * ``` - */ - jsonSchemaToGrammar(schemaJson: string): Promise; - - /** - * Convert JSON schema to GBNF grammar (sync — inline on main thread) - * - * Same as {@link jsonSchemaToGrammar} but synchronous. Use from Effection - * generators to avoid `yield* call()` overhead for CPU-only work. - * - * @param schemaJson JSON schema string - * @returns GBNF grammar string - */ - jsonSchemaToGrammarSync(schemaJson: string): string; - - /** - * Validate chat template syntax - * - * Checks if template string is valid before using. - * - * Cost: ~0.1-1ms - * - * @param templateString Template string to validate - * @returns True if template syntax is valid - */ - validateChatTemplate(templateString: string): Promise; - - // ===== EMBEDDING EXTRACTION ===== - - /** - * Encode tokens for embedding extraction - * - * Unlike decode(), this marks ALL tokens with logits=true which is - * required for embedding extraction. Use with embeddings=true context. - * - * Workflow: - * 1. Create context with { embeddings: true, poolingType: PoolingType.MEAN } - * 2. Tokenize your text - * 3. Clear KV cache (important between different texts!) - * 4. Call encode() with tokens - * 5. 
Call getEmbeddings() to get the vector - * - * Cost: ~5-50ms depending on text length and model - * - * @param tokens Token IDs from tokenize() - * @example - * ```typescript - * // Create embedding context - * const ctx = await createContext({ - * modelPath: './nomic-embed.gguf', - * embeddings: true, - * poolingType: PoolingType.MEAN - * }); - * - * // Get embedding for text - * const tokens = await ctx.tokenize("Hello world"); - * await ctx.kvCacheClear(); // Important between texts! - * await ctx.encode(tokens); - * const embedding = ctx.getEmbeddings(); - * ``` - */ - encode(tokens: number[]): Promise; - - /** - * Get embedding vector from context (after encode) - * - * Returns the embedding vector for the encoded text. - * Call after encode() to extract embeddings. - * - * The vector dimension depends on the model (e.g., 768 for nomic-embed). - * Use getEmbeddingDimension() to get the size. - * - * Cost: ~0.5ms (extraction from model state) - * - * @param normalize Apply L2 normalization (default: true for cosine similarity) - * @returns Float32Array of embedding values - * @example - * ```typescript - * await ctx.encode(tokens); - * - * // Get L2-normalized embedding (for cosine similarity) - * const embedding = ctx.getEmbeddings(); - * - * // Or raw embedding without normalization - * const rawEmbedding = ctx.getEmbeddings(false); - * ``` - */ - getEmbeddings(normalize?: boolean): Float32Array; - - /** - * Get embedding dimension for model - * - * Returns the size of embedding vectors this model produces. - * Common values: 768 (BERT-like), 1024, 2048, 4096. 
- * - * Cost: <0.01ms (fast model property lookup) - * - * @returns Embedding dimension - * @example - * ```typescript - * const dim = ctx.getEmbeddingDimension(); - * console.log(`Model produces ${dim}-dimensional embeddings`); - * ``` - */ - getEmbeddingDimension(): number; - - /** - * Check if context has pooling enabled - * - * Returns true if context was created with embeddings=true and - * a pooling type other than NONE. - * - * Cost: <0.01ms - * - * @returns True if pooling is enabled - */ - hasPooling(): boolean; - - // ===== PROPERTIES ===== - - /** - * Model vocabulary size (number of possible tokens) - * - * This is the length of the logits array from Branch.getLogits(). - */ - readonly vocabSize: number; - - /** - * Memory used by this context (bytes) - * - * Reports native memory for monitoring. - * Includes model weights, KV cache, and context state. - */ - readonly memorySize: number; - - // ===== LIFECYCLE ===== - - /** - * Free native resources - * - * Call when done with context to release model and KV cache memory. - * Context becomes unusable after disposal. 
- */ - dispose(): void; - - // ===== BRANCH API (internal, wrapped by Branch class) ===== - - /** @internal */ - _branchCreate(position: number, params?: SamplingParams, nBatch?: number, grammar?: string): number; - - /** @internal */ - _branchFork(handle: number): number; - - /** @internal */ - _branchPrefill(handle: number, tokens: number[]): Promise; - - /** @internal */ - _branchSample(handle: number): number; - - /** @internal */ - _branchAccept(handle: number, token: number): void; - - /** @internal */ - _branchGetPosition(handle: number): number; - - /** @internal */ - _branchGetPerplexity(handle: number): number; - - /** @internal */ - _branchGetLogits(handle: number): Float32Array; - - /** @internal */ - _branchPrune(handle: number): void; - - /** @internal */ - _branchPruneSubtree(handle: number): void; - - /** @internal */ - _branchParent(handle: number): number; - - /** @internal */ - _branchChildren(handle: number): number[]; - - /** @internal */ - _branchIsLeaf(handle: number): boolean; - - /** @internal */ - _branchIsActive(handle: number): boolean; - - /** @internal */ - _branchSamplerChainReseed(handle: number, seed: number): void; - - /** @internal */ - _branchSteer(handle: number, biases: Array<{ token: number; bias: number }>): void; - - /** @internal */ - _branchClearSteer(handle: number): void; - - /** @internal */ - _branchSetSamplerParams(handle: number, params: SamplingParams): void; - - /** @internal */ - _branchSetGrammar(handle: number, grammarStr: string): void; - - /** @internal */ - _branchSetGrammarLazy(handle: number, grammar: string, patterns: string[], tokens: number[]): void; - - /** @internal */ - _branchModelEntropy(handle: number, base?: string): number; - - /** @internal */ - _branchModelSurprisal(handle: number, token: number, base?: string): number; - - /** @internal */ - _branchGetSamplingPerplexity(handle: number): number; - - /** @internal */ - _branchSetLogitBias(handle: number, biases: Array<{ token: number; bias: 
number }>): void; - - /** @internal */ - _branchClearLogitBias(handle: number): void; - - // ===== STORE API (internal, wrapped by BranchStore) ===== - - /** @internal */ - _storeCommit(handles: number[], tokens: number[]): Promise; - - /** @internal */ - _storePrefill(handles: number[], tokenArrays: number[][]): Promise; - - /** @internal */ - _storeRetainOnly(handle: number): void; - - /** @internal */ - _storeAvailable(): number; - - /** KV cache pressure snapshot from native BranchStore. - * cells_used is a monotonic counter reset on drain/retainOnly. */ - _storeKvPressure(): { nCtx: number; cellsUsed: number; remaining: number }; - - // ===== SCORING API ===== - - /** @internal — processes ≤ n_seq_max prompts in a single group */ - _scoreGroup(tokenArrays: number[][]): Promise; -} - -/** - * Result from Branch.produce() - * - * @category Branching - */ -export interface Produced { - /** Sampled token ID */ - token: number; - /** Text representation of the token */ - text: string; - /** Whether this is a stop token (EOS) */ - isStop: boolean; -} - -// AgentTask, AgentState, RunAgentsOptions, RunAgentsResult removed — -// superseded by src/runtime/ (useAgentPool, AgentTaskSpec, AgentPoolResult) - -/** - * Options for Rerank context creation - * @category Core - */ -export interface RerankOptions { - /** Path to reranker .gguf model */ - modelPath: string; - /** Max prompts per GPU dispatch (default: 8) */ - nSeqMax?: number; - /** Context window size (default: 4096) */ - nCtx?: number; - /** KV cache key quantization (default: 'q4_0') */ - typeK?: KvCacheType; - /** KV cache value quantization (default: 'q4_0') */ - typeV?: KvCacheType; -} - -/** - * A single rerank result — score for one document - * @category Core - */ -export interface RerankResult { - /** Relevance probability (0–1) */ - score: number; - /** Original index in the input array */ - index: number; -} - -/** - * Progress yielded by Rerank.score() after each scoring group completes - * @category 
Core - */ -export interface RerankProgress { - /** Number of documents scored so far */ - filled: number; - /** Total documents to score */ - total: number; - /** Sorted results — partial until filled === total */ - results: RerankResult[]; -} - /** * Native binding interface — what loadBinary() returns * diff --git a/test/agents.ts b/test/agents.ts deleted file mode 100644 index 5e5dfd8..0000000 --- a/test/agents.ts +++ /dev/null @@ -1,272 +0,0 @@ -/** - * Structured concurrency tests for the agent system - * - * Verifies Effection v4 SC guarantees: branch cleanup on all exit paths, - * scope teardown ordering, ensure() lifecycle. - * - * Usage: - * npm run test:agents - * LLAMA_TEST_MODEL=models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf npm run test:agents - */ - -import * as path from 'node:path'; -import * as fs from 'node:fs'; -import { run, call, spawn, ensure, each } from 'effection'; -import { loadBinary, Branch } from '../dist/index.js'; -import type { SessionContext, NativeBinding } from '../dist/index.js'; -import { - initAgents, runAgents, withSharedRoot, Tool, -} from '../dist/agents/index.js'; -import type { AgentPoolResult, JsonSchema } from '../dist/agents/index.js'; - -const MODEL_PATH: string = process.env.LLAMA_TEST_MODEL - ? 
path.resolve(process.env.LLAMA_TEST_MODEL) - : path.join(__dirname, '../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'); - -const CTX_SIZE = 2048; - -if (!fs.existsSync(MODEL_PATH)) { - console.error('Test model not found:', MODEL_PATH); - process.exit(1); -} - -console.log('=== lloyal.node SC Agent Tests ===\n'); -console.log(`Model: ${path.basename(MODEL_PATH)}`); -console.log(`Size: ${(fs.statSync(MODEL_PATH).size / 1024 / 1024).toFixed(1)} MB\n`); - -let addon: NativeBinding; -try { - addon = require('../build/Release/lloyal.node') as NativeBinding; -} catch { - addon = loadBinary(); -} - -let passed = 0; -let failed = 0; - -function ok(msg: string): void { - passed++; - console.log(` [PASS] ${msg}`); -} - -function fail(msg: string): void { - failed++; - console.log(` [FAIL] ${msg}`); -} - -function assert(condition: boolean, msg: string): void { - if (condition) ok(msg); - else { fail(msg); throw new Error(msg); } -} - -// ── Test tools ──────────────────────────────────────────────────── - -class ThrowingTool extends Tool> { - readonly name = 'explode'; - readonly description = 'A tool that always throws'; - readonly parameters: JsonSchema = { - type: 'object', - properties: { input: { type: 'string' } }, - }; - async execute(): Promise { - throw new Error('intentional_tool_error'); - } -} - -// ── Helpers ──────────────────────────────────────────────────────── - -async function createTestContext(): Promise { - return addon.createContext({ - modelPath: MODEL_PATH, - nCtx: CTX_SIZE, - nThreads: 4, - nSeqMax: 4, - typeK: 'f16', - typeV: 'f16', - }); -} - -function makeTasks(parent: Branch, count: number) { - return Array.from({ length: count }, (_, i) => ({ - systemPrompt: 'You are a test agent.', - content: `Test task ${i}`, - parent, - })); -} - -/** Bootstrap agent infra via initAgents + drain events to prevent backpressure */ -function* setupTest(ctx: SessionContext) { - const { events } = yield* initAgents(ctx); - yield* spawn(function*() { - for (const _ev 
of yield* each(events)) { - yield* each.next(); - } - }); -} - -// ═══════════════════════════════════════════════════════════════════ -// TEST 1: ensure() cleanup — runs on scope exit regardless of how -// ═══════════════════════════════════════════════════════════════════ - -async function testEnsureCleanup(): Promise { - console.log('\n--- ensure() cleanup: runs on normal exit and on error ---'); - - // Test A: ensure runs on normal exit - let cleanupRanNormal = false; - await run(function*() { - yield* ensure(() => { cleanupRanNormal = true; }); - }); - assert(cleanupRanNormal, 'ensure() ran on normal scope exit'); - - // Test B: ensure runs on error exit - let cleanupRanError = false; - try { - await run(function*() { - yield* ensure(() => { cleanupRanError = true; }); - throw new Error('intentional_test_error'); - }); - } catch { - // expected - } - assert(cleanupRanError, 'ensure() ran on error scope exit'); -} - -// ═══════════════════════════════════════════════════════════════════ -// TEST 2: Normal lifecycle — branches pruned after runAgents returns -// ═══════════════════════════════════════════════════════════════════ - -async function testNormalLifecycle(): Promise { - console.log('\n--- Normal lifecycle: branches pruned after runAgents ---'); - - await run(function*() { - const ctx: SessionContext = yield* call(() => createTestContext()); - yield* setupTest(ctx); - - yield* withSharedRoot( - { systemPrompt: 'You are a test agent.' 
}, - function*(root, prefixLen) { - assert(prefixLen > 0, `shared prefix has tokens (${prefixLen})`); - - const pool: AgentPoolResult = yield* runAgents({ - tasks: makeTasks(root, 2), - tools: new Map(), - maxTurns: 1, - }); - - assert(pool.agents.length === 2, 'pool has 2 agents'); - assert(root.children.length === 0, 'agent branches pruned before body returns'); - - return pool; - }, - ); - - ok('withSharedRoot completed without error'); - }); -} - -// ═══════════════════════════════════════════════════════════════════ -// TEST 3: scoped() cleanup — runAgents prunes before returning -// ═══════════════════════════════════════════════════════════════════ - -async function testScopedCleanup(): Promise { - console.log('\n--- Scoped cleanup: runAgents prunes before returning to caller ---'); - - await run(function*() { - const ctx: SessionContext = yield* call(() => createTestContext()); - yield* setupTest(ctx); - - yield* withSharedRoot( - { systemPrompt: 'You are a test agent.' }, - function*(root) { - const childCountBefore = root.children.length; - assert(childCountBefore === 0, 'root starts with no children'); - - const pool = yield* runAgents({ - tasks: makeTasks(root, 2), - tools: new Map(), - maxTurns: 1, - }); - - // Critical SC assertion: scoped() in runAgents must have torn - // down the pool scope and pruned agent branches BEFORE returning. 
- const childCountAfter = root.children.length; - assert(childCountAfter === 0, `scoped() pruned all children before returning (was ${childCountBefore}, now ${childCountAfter})`); - - return pool; - }, - ); - - ok('scoped() teardown ordering correct'); - }); -} - -// ═══════════════════════════════════════════════════════════════════ -// TEST 4: Tool error — branches pruned, error does not crash pool -// ═══════════════════════════════════════════════════════════════════ - -async function testToolErrorCleanup(): Promise { - console.log('\n--- Tool error: branches pruned, pool completes gracefully ---'); - - await run(function*() { - const ctx: SessionContext = yield* call(() => createTestContext()); - yield* setupTest(ctx); - - try { - yield* withSharedRoot( - { systemPrompt: 'You are a test agent. Always call the explode tool.' }, - function*(root) { - const toolMap = new Map([['explode', new ThrowingTool()]]); - const toolsJson = JSON.stringify([{ - type: 'function', - function: { - name: 'explode', - description: 'A tool that always throws', - parameters: { type: 'object', properties: { input: { type: 'string' } } }, - }, - }]); - - const pool = yield* runAgents({ - tasks: [{ - systemPrompt: 'You are a test agent. Call the explode tool immediately.', - content: 'Do it now.', - tools: toolsJson, - parent: root, - }], - tools: toolMap, - maxTurns: 2, - }); - - assert(root.children.length === 0, 'agent branches pruned after tool error'); - assert(pool.agents.length === 1, 'pool has 1 agent'); - return pool; - }, - ); - - ok('withSharedRoot completed — tool error did not crash the pool'); - } catch (err) { - // Tool errors should be handled internally (agent → done state). - // If we reach here, something unexpected propagated. 
- fail(`unexpected error escaped pool: ${(err as Error).message}`); - } - }); -} - -// ═══════════════════════════════════════════════════════════════════ -// RUNNER -// ═══════════════════════════════════════════════════════════════════ - -async function main_(): Promise { - await testEnsureCleanup(); - await testNormalLifecycle(); - await testScopedCleanup(); - await testToolErrorCleanup(); - - console.log(`\n${'='.repeat(40)}`); - console.log(`Results: ${passed} passed, ${failed} failed`); - if (failed > 0) process.exit(1); -} - -main_().catch((err: unknown) => { - console.error(`\nFatal: ${(err as Error).message}\n${(err as Error).stack}`); - process.exit(1); -}); diff --git a/test/examples.ts b/test/examples.ts index 3005fec..e955cff 100644 --- a/test/examples.ts +++ b/test/examples.ts @@ -51,13 +51,6 @@ const EMBED_MODEL_PATH: string = process.env.EMBED_MODEL_PATH ? path.resolve(process.env.EMBED_MODEL_PATH) : path.join(__dirname, '../liblloyal/tests/fixtures/nomic-embed-text-v1.5.Q4_K_M.gguf'); -const QWEN3_PATH: string = process.env.QWEN3_MODEL - ? path.resolve(process.env.QWEN3_MODEL) - : path.join(__dirname, '../models/Qwen3-4B-Instruct-2507-Q4_K_M.gguf'); - -const RERANKER_PATH: string = process.env.RERANKER_MODEL - ? 
path.resolve(process.env.RERANKER_MODEL) - : path.join(__dirname, '../models/qwen3-reranker-0.6b-q4_k_m.gguf'); if (!fs.existsSync(MODEL_PATH)) { @@ -185,48 +178,6 @@ const EXAMPLES: Record = { }, }, - 'deep-research': { - path: 'deep-research/deep-research.ts', - timeout: 300000, - modelPath: QWEN3_PATH, - extraArgs: [ - '--reranker', RERANKER_PATH, - '--corpus', process.env.DEEP_RESEARCH_CORPUS || '', - '--query', process.env.DEEP_RESEARCH_QUERY || '', - ], - skip: !fs.existsSync(QWEN3_PATH) || !fs.existsSync(RERANKER_PATH) - || !process.env.DEEP_RESEARCH_CORPUS || !process.env.DEEP_RESEARCH_QUERY, - skipReason: 'Requires QWEN3_MODEL, RERANKER_MODEL, DEEP_RESEARCH_CORPUS, and DEEP_RESEARCH_QUERY env vars', - validate(events: ExampleEvent[]): void { - const start: ExampleEvent | undefined = events.find(e => e.event === 'start'); - assert(start, 'should have start event'); - assert(start.agentCount === 3, 'should have 3 agents'); - assert(start.chunks > 0, 'should have corpus chunks'); - - const plan: ExampleEvent | undefined = events.find(e => e.event === 'plan'); - assert(plan, 'should have plan event'); - assert(plan.questions.length >= 2, 'should plan at least 2 sub-questions'); - - const researchStart: ExampleEvent | undefined = events.find(e => e.event === 'research_start'); - assert(researchStart, 'should have research_start event'); - assert(researchStart.sharedPrefixTokens > 0, 'should have shared prefix'); - - const toolCalls: ExampleEvent[] = events.filter(e => e.event === 'tool_call'); - assert(toolCalls.length > 0, 'should make at least one tool call'); - - const agentsDone: ExampleEvent[] = events.filter(e => e.event === 'agent_done'); - assert(agentsDone.length === 3, 'all 3 agents should finish'); - for (const a of agentsDone) { - assert(a.tokenCount > 0, `agent ${a.index} should generate tokens`); - } - - const complete: ExampleEvent | undefined = events.find(e => e.event === 'complete'); - assert(complete, 'should have complete event'); - 
assert(complete.totalToolCalls > 0, 'should have tool calls'); - assert(complete.wallTimeMs > 0, 'should have wall time'); - assert(complete.converged !== undefined, 'should have convergence result'); - }, - }, }; async function runTest(name: string, config: ExampleConfig): Promise { diff --git a/test/integration.ts b/test/integration.ts index 42b042c..77eca9c 100644 --- a/test/integration.ts +++ b/test/integration.ts @@ -17,7 +17,7 @@ import * as path from 'node:path'; import * as fs from 'node:fs'; -import { loadBinary, Branch, BranchStore, Rerank } from '../dist/index.js'; +import { loadBinary, createContext, Branch, BranchStore, Rerank } from '../dist/index.js'; import type { SessionContext, NativeBinding, FormattedChatResult, Produced } from '../dist/index.js'; const MODEL_PATH: string = process.env.LLAMA_TEST_MODEL @@ -1908,7 +1908,14 @@ async function testRerank(): Promise { console.log('\n--- Rerank ---'); console.log(` Model: ${path.basename(RERANK_MODEL_PATH)}`); - const rerank = await Rerank.create({ modelPath: RERANK_MODEL_PATH }); + const rerankCtx = await createContext({ + modelPath: RERANK_MODEL_PATH, + nCtx: 4096, + nSeqMax: 8, + typeK: 'q4_0', + typeV: 'q4_0', + }); + const rerank = await Rerank.create(rerankCtx, { nSeqMax: 8, nCtx: 4096 }); try { // Tokenize documents @@ -1987,7 +1994,14 @@ async function testRerankLargeCorpus(): Promise { console.log(` Model: ${path.basename(RERANK_MODEL_PATH)}`); // n_seq_max=8 so 20 documents requires 3 groups (8+8+4) - const rerank = await Rerank.create({ modelPath: RERANK_MODEL_PATH, nSeqMax: 8 }); + const rerankCtx = await createContext({ + modelPath: RERANK_MODEL_PATH, + nCtx: 4096, + nSeqMax: 8, + typeK: 'q4_0', + typeV: 'q4_0', + }); + const rerank = await Rerank.create(rerankCtx, { nSeqMax: 8, nCtx: 4096 }); try { const query = 'What is the capital of France?'; @@ -2065,7 +2079,14 @@ async function testRerankConcurrent(): Promise { console.log('\n--- Rerank Concurrent ---'); - const rerank = await 
Rerank.create({ modelPath: RERANK_MODEL_PATH, nSeqMax: 4 }); + const rerankCtx = await createContext({ + modelPath: RERANK_MODEL_PATH, + nCtx: 4096, + nSeqMax: 4, + typeK: 'q4_0', + typeV: 'q4_0', + }); + const rerank = await Rerank.create(rerankCtx, { nSeqMax: 4, nCtx: 4096 }); try { const docs = [