8 changes: 3 additions & 5 deletions .github/workflows/tests.yml
@@ -92,6 +92,7 @@ jobs:
if: runner.os != 'Windows'
run: npm run test:e2e
timeout-minutes: 5
# Note: E2E tests include embedding suite which skips gracefully if no embedding model

- name: Display build info
if: always()
@@ -167,10 +168,7 @@ jobs:
exit 1
fi

if grep -q "package/vendor/liblloyal/tests/lib" package-contents.txt; then
echo "❌ ERROR: tests/lib/ prebuilt binaries should not be in package!"
exit 1
fi
# Note: vendor/liblloyal/tests/ is no longer vendored (only headers needed)

echo "✅ Package contents verified!"

@@ -197,6 +195,6 @@ jobs:
echo "✓ Node.js 18, 20, 22 compatibility verified"
echo "✓ npm package contents verified"
echo "✓ API tests passed (11 tests)"
echo "✓ E2E tests passed (4 validation tests)"
echo "✓ E2E tests passed (4 text generation + 8 embedding tests)"
echo ""
echo "Phase 1 (Build from Source) Status: ${{ needs.test-vendored-sources.result }}"
3 changes: 2 additions & 1 deletion .gitignore
@@ -6,7 +6,6 @@ prebuilds/

# Vendor build artifacts (generated during npm install)
vendor/llama.cpp/build-*/
vendor/liblloyal/tests/lib/

# Dependencies
node_modules/
@@ -26,3 +25,5 @@ yarn-error.log*
# OS
.DS_Store
Thumbs.db

.tmp
15 changes: 13 additions & 2 deletions README.md
@@ -7,6 +7,7 @@ Thin N-API wrapper over [liblloyal](https://github.com/lloyal-ai/liblloyal) for
- **Prebuilt Binaries**: Install in <1 minute on 7 common platforms (macOS, Linux, Windows)
- **Raw & Thin**: Direct access to llama.cpp primitives via liblloyal
- **Zero-Copy Logits**: `getLogits()` returns Float32Array pointing to llama.cpp memory
- **Embeddings**: Extract L2-normalized embeddings with configurable pooling (MEAN, CLS, LAST)
- **GPU Acceleration**: Metal (macOS), CUDA, and Vulkan support with dedicated prebuilts
- **BYO llama.cpp**: Swap `libllama.dylib` for custom builds (dynamic linking)
- **Native Reference**: Includes native entropy/greedy implementations for testing
@@ -318,6 +319,8 @@ Creates a new inference context.
- `modelPath: string` - Path to .gguf model file (required)
- `nCtx?: number` - Context size (default: 2048)
- `nThreads?: number` - Number of threads (default: 4)
- `embeddings?: boolean` - Enable embedding mode (default: false)
- `poolingType?: number` - Pooling type: 0=NONE, 1=MEAN, 2=CLS, 3=LAST (default: model's default)

**Returns:** `Promise<SessionContext>`
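
As a hedged sketch of how these options combine for embedding extraction — the model path below is a placeholder, and `poolingType: 1` (MEAN) follows the option list above:

```javascript
// Sketch: options for an embedding-mode context.
// The model path is a placeholder, not a real file.
const embeddingOptions = {
  modelPath: '/path/to/embedding-model.gguf',
  nCtx: 2048,       // context size
  nThreads: 4,      // worker threads
  embeddings: true, // enable embedding extraction
  poolingType: 1,   // 1 = MEAN pooling
};

// const ctx = await createContext(embeddingOptions);
```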

@@ -330,6 +333,14 @@ Creates a new inference context.
- **`tokenize(text: string): Promise<number[]>`** - Tokenize text to token IDs
- **`detokenize(tokens: number[]): Promise<string>`** - Detokenize tokens to text

#### Embeddings

- **`encode(tokens: number[]): Promise<void>`** - Encode tokens for embedding extraction
- **`getEmbeddings(normalize?: boolean): Float32Array`** - Get embeddings (optionally L2-normalized)
- **`hasPooling(): boolean`** - Check if context has pooling enabled
- **`getEmbeddingDimension(): number`** - Get embedding vector dimension
- **`kvCacheClear(): Promise<void>`** - Clear KV cache (call between texts for embeddings)
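
Because `getEmbeddings(true)` returns L2-normalized vectors, cosine similarity between two embeddings reduces to a plain dot product. A minimal sketch of that math in plain JavaScript (no model required; the helpers here are illustrative, not part of the API):

```javascript
// Cosine similarity of two L2-normalized vectors is just their dot product.
function dot(a, b) {
  let sum = 0;
  for (let i = 0; i < a.length; i++) sum += a[i] * b[i];
  return sum;
}

// L2-normalize a raw vector (what normalize=true does for you).
function l2Normalize(v) {
  const norm = Math.sqrt(v.reduce((s, x) => s + x * x, 0));
  return v.map((x) => x / norm);
}

const a = l2Normalize([3, 4]); // -> [0.6, 0.8]
const b = l2Normalize([4, 3]); // -> [0.8, 0.6]
dot(a, a); // 1 (identical vectors)
dot(a, b); // 0.96
```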

#### Native References (for testing)

- **`computeEntropy(): number`** - Native entropy computation (nats)
@@ -418,9 +429,9 @@ npm run test:e2e # Correctness and determinism validation
### Tests

- **`test/api.js`**: API functionality tests and performance benchmarks
- **`test/e2e.js`**: End-to-end validation with deterministic output checks
- **`test/e2e.js`**: End-to-end validation (text generation + embeddings)

Tests use SmolLM2-1.7B-Instruct with chat templates to simulate real-world usage patterns.
Tests use SmolLM2-1.7B-Instruct for text generation and nomic-embed-text for embeddings. Embedding tests skip gracefully if no embedding model is available.

## Distribution & Releases

50 changes: 50 additions & 0 deletions examples/chat/README.md
@@ -0,0 +1,50 @@
# Simple Chat Example

A minimal chat example demonstrating the lloyal.node API.

## Usage

```bash
npm run example # uses default model
npm run example -- /path/to/model.gguf # custom model
```

## Commands

- `/clear` - Reset conversation and clear terminal
- `/quit` - Exit

## The Pattern: Sync Produce, Async Commit

```javascript
// Sync generator - all operations are synchronous
function* produceTokens(ctx, params) {
while (true) {
const tokenId = ctx.sample(params); // sync
if (ctx.isStopToken(tokenId)) return; // sync
const text = ctx.tokenToText(tokenId); // sync
yield { text, tokenId };
}
}

// Usage - async commit is explicit in caller's loop
for (const { text, tokenId } of produceTokens(ctx, params)) {
process.stdout.write(text);
await ctx.decode([tokenId], position); // async commit to KV
position += 1;
}
```

**Key insight:** Token production is synchronous. Only the KV cache commit (`decode`) is async. This separation makes the control flow explicit.

## API Reference

| Method | Sync/Async | Purpose |
|--------|------------|---------|
| `sample(params)` | sync | Sample next token from logits |
| `isStopToken(id)` | sync | Check if token ends generation |
| `tokenToText(id)` | sync | Convert token ID to text |
| `decode(tokens, pos)` | async | Commit tokens to KV cache |
| `tokenize(text)` | async | Convert text to token IDs |
| `formatChat(json)` | async | Apply chat template |
| `kvCacheClear()` | async | Reset KV cache |
137 changes: 137 additions & 0 deletions examples/chat/chat.mjs
@@ -0,0 +1,137 @@
#!/usr/bin/env node
/**
* Simple chat example using lloyal.node
*
* Usage:
* node chat.mjs /path/to/model.gguf
* node chat.mjs # uses default model path
*
* This example demonstrates:
* - Sync generator for token production (sample, check stop, convert to text)
* - Async commit via decode() to update KV cache
* - Clear separation: sync produce, async commit
*/

import * as readline from 'node:readline';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import { createContext } from '../../lib/index.js';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const DEFAULT_MODEL = path.resolve(
__dirname,
'../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
);

/**
* Sync generator - produces tokens until stop token.
* All operations are synchronous: sample, isStopToken, tokenToText.
*/
function* produceTokens(ctx, params) {
while (true) {
const tokenId = ctx.sample(params);
if (ctx.isStopToken(tokenId)) return;
const text = ctx.tokenToText(tokenId);
yield { text, tokenId };
}
}

async function main() {
const modelPath = process.argv[2] || DEFAULT_MODEL;

console.log(`Loading model: ${modelPath}`);
console.log('This may take a moment...\n');

const ctx = await createContext({
modelPath,
contextSize: 2048,
threads: 4,
});

console.log('Model loaded! Type your message and press Enter.');
console.log('Commands: /clear to reset, /quit to exit\n');

const messages = [];
let position = 0;
let lastPrompt = '';

const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
});

const askUser = () => rl.question('> ', handleInput);

async function handleInput(input) {
const trimmed = input.trim();

if (trimmed === '/quit' || trimmed === '/exit') {
console.log('Goodbye!');
ctx.dispose();
rl.close();
return;
}

if (trimmed === '/clear') {
await ctx.kvCacheClear();
messages.length = 0;
position = 0;
lastPrompt = '';
console.clear();
console.log('Conversation cleared.\n');
askUser();
return;
}

if (!trimmed) {
askUser();
return;
}

messages.push({ role: 'user', content: trimmed });

// Format with chat template
const { prompt: fullPrompt } = await ctx.formatChat(
JSON.stringify(messages)
);

// Prompt diffing - only tokenize new content
const newContent = fullPrompt.startsWith(lastPrompt)
? fullPrompt.slice(lastPrompt.length)
: fullPrompt;

const tokens = await ctx.tokenize(newContent);
await ctx.decode(tokens, position);
position += tokens.length;

// Generate: sync produce, async commit
process.stdout.write('< ');
let response = '';

for (const { text, tokenId } of produceTokens(ctx, {
temperature: 0.7,
topK: 40,
topP: 0.9,
})) {
process.stdout.write(text);
response += text;

await ctx.decode([tokenId], position); // async commit to KV
position += 1;
}

console.log('\n');

messages.push({ role: 'assistant', content: response.trim() });
lastPrompt = fullPrompt + response;

askUser();
}

askUser();
}

main().catch((err) => {
console.error('Error:', err.message);
process.exit(1);
});
3 changes: 0 additions & 3 deletions examples/cli/.gitignore

This file was deleted.
