From d0c28008b8b3b3fd998d076c2c356e3aeb034a49 Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Mon, 19 Jan 2026 03:11:14 +1100 Subject: [PATCH 1/9] feat(metrics): SessionContext bindings --- lib/index.d.ts | 166 ++++++++++++++++++++++++++ package.json | 2 +- src/SessionContext.cpp | 259 +++++++++++++++++++++++++++++++++++++++++ src/SessionContext.hpp | 69 +++++++++++ test/api.js | 136 ++++++++++++++++++++++ 5 files changed, 631 insertions(+), 1 deletion(-) diff --git a/lib/index.d.ts b/lib/index.d.ts index bec6974..675e4ab 100644 --- a/lib/index.d.ts +++ b/lib/index.d.ts @@ -812,6 +812,172 @@ export interface SessionContext { */ freeSamplerHandle(handle: number): void; + // ===== METRICS API ===== + + /** + * Compute surprisal (negative log-likelihood) for a specific token. + * + * Measures how "surprising" the model finds the given token: + * - Low surprisal: Model expected this token (high probability) + * - High surprisal: Model didn't expect this token (low probability) + * + * @param pickedTokenId - Token ID to compute surprisal for + * @param base - Logarithm base: "nats" (default) or "bits" + * @returns Surprisal value in specified base + * + * @example + * ```typescript + * // After decode with logits=true + * const token = ctx.sampleNextToken(); + * const surprisal = ctx.modelSurprisal(token, "bits"); + * console.log(`Model surprise: ${surprisal.toFixed(2)} bits`); + * ``` + * + * COST: O(1) - direct probability lookup from logits + * REQUIRES: decode() called with logits=true + */ + modelSurprisal(pickedTokenId: number, base?: "nats" | "bits"): number; + + /** + * Compute entropy of the entire logits distribution. 
+ * + * Measures model uncertainty: + * - Low entropy: Model is confident (peaked distribution) + * - High entropy: Model is uncertain (flat distribution) + * + * @param base - Logarithm base: "nats" (default), "bits", or "base10" + * @returns Entropy value in specified base + * + * @example + * ```typescript + * // Check model confidence before sampling + * const entropy = ctx.modelEntropy("bits"); + * if (entropy > 5.0) { + * console.log("Model is very uncertain - consider adjusting parameters"); + * } + * ``` + * + * COST: O(n_vocab) - must sum over all token probabilities + * REQUIRES: decode() called with logits=true + * ALGORITHM: Numerically stable log-sum-exp (metrics.hpp:73-81) + */ + modelEntropy(base?: "nats" | "bits"): number; + + /** + * Create a new perplexity tracker. + * + * @returns Integer handle to the tracker + * + * @example + * ```typescript + * const tracker = ctx.createPerplexityTracker(); + * + * // Add surprisals during generation + * for (let i = 0; i < tokens.length; i++) { + * const surprisal = ctx.modelSurprisal(tokens[i]); + * ctx.addSurprisal(tracker, surprisal); + * } + * + * const ppl = ctx.getPerplexity(tracker); + * console.log(`Sequence perplexity: ${ppl.toFixed(2)}`); + * + * ctx.freePerplexityTracker(tracker); + * ``` + */ + createPerplexityTracker(): number; + + /** + * Add a surprisal value to the rolling tracker. + * + * @param handle - Tracker handle from createPerplexityTracker() + * @param surprisal - Surprisal value (from modelSurprisal or computed) + * + * @example + * ```typescript + * const surprisal = ctx.modelSurprisal(tokenId, "nats"); + * ctx.addSurprisal(tracker, surprisal); + * ``` + * + * COST: O(1) - numerically stable accumulation + * THREAD-SAFETY: Not thread-safe (handle is session-local) + */ + addSurprisal(handle: number, surprisal: number): void; + + /** + * Get current perplexity value. 
+ * + * @param handle - Tracker handle + * @returns Perplexity = exp(average_surprisal_in_nats) + * + * @example + * ```typescript + * const ppl = ctx.getPerplexity(tracker); + * console.log(`Current PPL: ${ppl.toFixed(2)}`); + * ``` + * + * FORMULA: PPL = exp(sum_surprisals / count) + * RANGE: [1, ∞) where 1 = perfect prediction + */ + getPerplexity(handle: number): number; + + /** + * Clone a perplexity tracker (for fork/branch scenarios). + * + * @param sourceHandle - Handle to clone from + * @returns New handle with same accumulated state + * + * @example + * ```typescript + * // Branch A and B start from same base perplexity + * const baseTracker = ctx.createPerplexityTracker(); + * // ... accumulate base surprisals ... + * + * const branchA = ctx.clonePerplexityTracker(baseTracker); + * const branchB = ctx.clonePerplexityTracker(baseTracker); + * + * // Branch A and B now track independently + * ctx.addSurprisal(branchA, surprisalA); + * ctx.addSurprisal(branchB, surprisalB); + * ``` + */ + clonePerplexityTracker(sourceHandle: number): number; + + /** + * Reset tracker to initial state (count=0, sum=0). + * + * @param handle - Tracker handle to reset + * + * @example + * ```typescript + * // Reuse tracker for multiple sequences + * const tracker = ctx.createPerplexityTracker(); + * + * for (const sequence of sequences) { + * ctx.resetPerplexityTracker(tracker); + * // ... process sequence ... + * const ppl = ctx.getPerplexity(tracker); + * } + * ``` + */ + resetPerplexityTracker(handle: number): void; + + /** + * Get number of tokens tracked. + * + * @param handle - Tracker handle + * @returns Number of surprisal values added + */ + getPerplexityCount(handle: number): number; + + /** + * Free perplexity tracker resources. 
+ * + * @param handle - Tracker handle to free + * + * NOTE: Auto-freed in dispose() if not manually freed + */ + freePerplexityTracker(handle: number): void; + // ===== ATOMIC DECODE+CAPTURE ===== /** diff --git a/package.json b/package.json index ef69b7c..3f7392a 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "lloyal.node", "version": "0.1.0", - "description": "Thin N-API wrapper over liblloyal for Node.js - raw llama.cpp inference primitives", + "description": "N-API client for liblloyal+llama.cpp", "main": "lib/index.js", "types": "lib/index.d.ts", "gypfile": true, diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp index 0684269..c5b4a1c 100644 --- a/src/SessionContext.cpp +++ b/src/SessionContext.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include namespace liblloyal_node { @@ -593,6 +594,17 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) { InstanceMethod("getEmbeddingDimension", &SessionContext::getEmbeddingDimension), InstanceMethod("hasPooling", &SessionContext::hasPooling), + // ===== METRICS API ===== + InstanceMethod("modelSurprisal", &SessionContext::modelSurprisal), + InstanceMethod("modelEntropy", &SessionContext::modelEntropy), + InstanceMethod("createPerplexityTracker", &SessionContext::createPerplexityTracker), + InstanceMethod("addSurprisal", &SessionContext::addSurprisal), + InstanceMethod("getPerplexity", &SessionContext::getPerplexity), + InstanceMethod("clonePerplexityTracker", &SessionContext::clonePerplexityTracker), + InstanceMethod("resetPerplexityTracker", &SessionContext::resetPerplexityTracker), + InstanceMethod("getPerplexityCount", &SessionContext::getPerplexityCount), + InstanceMethod("freePerplexityTracker", &SessionContext::freePerplexityTracker), + // ===== NATIVE REFERENCE IMPLEMENTATIONS ===== InstanceMethod("computeEntropy", &SessionContext::computeEntropy), InstanceMethod("greedySample", &SessionContext::greedySample), @@ -609,6 +621,13 @@ Napi::Object 
SessionContext::Init(Napi::Env env, Napi::Object exports) { return exports; } +// ===== HELPERS ===== + +lloyal::metrics::Base SessionContext::parseBase(const std::string& baseStr) { + if (baseStr == "bits") return lloyal::metrics::Base::Bits; + return lloyal::metrics::Base::Nats; // Default (matches metrics.hpp) +} + SessionContext::SessionContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap(info) { // Constructor is called by CreateContext factory function @@ -625,6 +644,12 @@ SessionContext::~SessionContext() { } _samplerHandles.clear(); + // Free handle-based perplexity trackers + for (auto& [napiHandle, pplHandle] : _perplexityHandles) { + lloyal::metrics::free_perplexity(pplHandle); + } + _perplexityHandles.clear(); + // Free legacy global grammar sampler (pattern matches HybridSessionContext.cpp:72) if (_grammarSampler) { llama_sampler_free(_grammarSampler); @@ -864,6 +889,71 @@ Napi::Value SessionContext::computeEntropy(const Napi::CallbackInfo& info) { return Napi::Number::New(env, entropy); } +// ===== METRICS API ===== + +Napi::Value SessionContext::modelSurprisal(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Argument validation + if (info.Length() < 1 || !info[0].IsNumber()) { + throw Napi::TypeError::New(env, "Expected number (pickedTokenId)"); + } + + int32_t pickedTokenId = info[0].As().Int32Value(); + + // Optional base parameter (default: "nats") + std::string baseStr = "nats"; + if (info.Length() >= 2 && info[1].IsString()) { + baseStr = info[1].As().Utf8Value(); + } + + lloyal::metrics::Base base = parseBase(baseStr); + + // Get logits pointer (zero-copy) + float* logits; + try { + logits = lloyal::logits::get(_context, -1); + } catch (const std::exception& e) { + throw Napi::Error::New(env, e.what()); + } + + int n_vocab = lloyal::tokenizer::vocab_size(_model.get()); + + // Compute surprisal + float surprisal = lloyal::metrics::model_surprisal(logits, n_vocab, pickedTokenId, base); + + return 
Napi::Number::New(env, static_cast(surprisal)); +} + +Napi::Value SessionContext::modelEntropy(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Optional base parameter (default: "nats") + std::string baseStr = "nats"; + if (info.Length() >= 1 && info[0].IsString()) { + baseStr = info[0].As().Utf8Value(); + } + + lloyal::metrics::Base base = parseBase(baseStr); + + // Get logits pointer (zero-copy) + float* logits; + try { + logits = lloyal::logits::get(_context, -1); + } catch (const std::exception& e) { + throw Napi::Error::New(env, e.what()); + } + + int n_vocab = lloyal::tokenizer::vocab_size(_model.get()); + + // Compute entropy using metrics.hpp (replaces manual log-sum-exp) + float entropy = lloyal::metrics::model_entropy(logits, n_vocab, base); + + return Napi::Number::New(env, static_cast(entropy)); +} + Napi::Value SessionContext::greedySample(const Napi::CallbackInfo& info) { Napi::Env env = info.Env(); ensureNotDisposed(); @@ -1083,6 +1173,12 @@ Napi::Value SessionContext::dispose(const Napi::CallbackInfo& info) { } _samplerHandles.clear(); + // Free handle-based perplexity trackers + for (auto& [napiHandle, pplHandle] : _perplexityHandles) { + lloyal::metrics::free_perplexity(pplHandle); + } + _perplexityHandles.clear(); + // Free legacy global grammar sampler if (_grammarSampler) { llama_sampler_free(_grammarSampler); @@ -1448,6 +1544,169 @@ Napi::Value SessionContext::freeSamplerHandle(const Napi::CallbackInfo& info) { return env.Undefined(); } +// ===== PERPLEXITY TRACKING ===== + +Napi::Value SessionContext::createPerplexityTracker(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Create new perplexity tracker via metrics.hpp + lloyal::metrics::PerplexityHandle handle = lloyal::metrics::create_perplexity(); + + // Generate N-API handle + int32_t napiHandle = _nextPerplexityHandle++; + _perplexityHandles[napiHandle] = handle; + + return Napi::Number::New(env, 
static_cast(napiHandle)); +} + +Napi::Value SessionContext::addSurprisal(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Argument validation + if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsNumber()) { + throw Napi::TypeError::New(env, "Expected (handle: number, surprisal: number)"); + } + + int32_t napiHandle = info[0].As().Int32Value(); + double surprisal = info[1].As().DoubleValue(); + + // Lookup handle + auto it = _perplexityHandles.find(napiHandle); + if (it == _perplexityHandles.end()) { + throw Napi::Error::New(env, "Invalid perplexity tracker handle"); + } + + // Add surprisal to tracker + lloyal::metrics::add_surprisal(it->second, static_cast(surprisal)); + + return env.Undefined(); +} + +Napi::Value SessionContext::getPerplexity(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Argument validation + if (info.Length() < 1 || !info[0].IsNumber()) { + throw Napi::TypeError::New(env, "Expected handle: number"); + } + + int32_t napiHandle = info[0].As().Int32Value(); + + // Lookup handle + auto it = _perplexityHandles.find(napiHandle); + if (it == _perplexityHandles.end()) { + throw Napi::Error::New(env, "Invalid perplexity tracker handle"); + } + + // Get perplexity value + float ppl = lloyal::metrics::get_ppl(it->second); + + return Napi::Number::New(env, static_cast(ppl)); +} + +Napi::Value SessionContext::clonePerplexityTracker(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Argument validation + if (info.Length() < 1 || !info[0].IsNumber()) { + throw Napi::TypeError::New(env, "Expected handle: number"); + } + + int32_t sourceHandle = info[0].As().Int32Value(); + + // Lookup source handle + auto it = _perplexityHandles.find(sourceHandle); + if (it == _perplexityHandles.end()) { + throw Napi::Error::New(env, "Invalid source perplexity tracker handle"); + } + + // Clone via metrics.hpp + 
lloyal::metrics::PerplexityHandle clonedHandle = + lloyal::metrics::clone_perplexity(it->second); + + // Generate new N-API handle + int32_t newNapiHandle = _nextPerplexityHandle++; + _perplexityHandles[newNapiHandle] = clonedHandle; + + return Napi::Number::New(env, static_cast(newNapiHandle)); +} + +Napi::Value SessionContext::resetPerplexityTracker(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Argument validation + if (info.Length() < 1 || !info[0].IsNumber()) { + throw Napi::TypeError::New(env, "Expected handle: number"); + } + + int32_t napiHandle = info[0].As().Int32Value(); + + // Lookup handle + auto it = _perplexityHandles.find(napiHandle); + if (it == _perplexityHandles.end()) { + throw Napi::Error::New(env, "Invalid perplexity tracker handle"); + } + + // Reset tracker + lloyal::metrics::reset_perplexity(it->second); + + return env.Undefined(); +} + +Napi::Value SessionContext::getPerplexityCount(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Argument validation + if (info.Length() < 1 || !info[0].IsNumber()) { + throw Napi::TypeError::New(env, "Expected handle: number"); + } + + int32_t napiHandle = info[0].As().Int32Value(); + + // Lookup handle + auto it = _perplexityHandles.find(napiHandle); + if (it == _perplexityHandles.end()) { + throw Napi::Error::New(env, "Invalid perplexity tracker handle"); + } + + // Get token count + int count = lloyal::metrics::get_count(it->second); + + return Napi::Number::New(env, static_cast(count)); +} + +Napi::Value SessionContext::freePerplexityTracker(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Argument validation + if (info.Length() < 1 || !info[0].IsNumber()) { + throw Napi::TypeError::New(env, "Expected handle: number"); + } + + int32_t napiHandle = info[0].As().Int32Value(); + + // Lookup and remove handle + auto it = _perplexityHandles.find(napiHandle); + if (it == 
_perplexityHandles.end()) { + throw Napi::Error::New(env, "Invalid perplexity tracker handle"); + } + + // Free via metrics.hpp + lloyal::metrics::free_perplexity(it->second); + + // Remove from map + _perplexityHandles.erase(it); + + return env.Undefined(); +} + // ===== ATOMIC DECODE+CAPTURE ===== Napi::Value SessionContext::decodeAndCapture(const Napi::CallbackInfo& info) { diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp index a896758..720a53d 100644 --- a/src/SessionContext.hpp +++ b/src/SessionContext.hpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -258,6 +259,67 @@ class SessionContext : public Napi::ObjectWrap { */ Napi::Value hasPooling(const Napi::CallbackInfo& info); + // ===== METRICS API ===== + + /** + * Compute surprisal for a specific token + * Args: pickedTokenId (number), base? (string: "nats" | "bits" | "base10") + * Returns: number (surprisal in specified base) + */ + Napi::Value modelSurprisal(const Napi::CallbackInfo& info); + + /** + * Compute entropy of logits distribution + * Args: base? 
(string: "nats" | "bits" | "base10") + * Returns: number (entropy in specified base) + */ + Napi::Value modelEntropy(const Napi::CallbackInfo& info); + + /** + * Create a new perplexity tracker + * Returns: number (handle) + */ + Napi::Value createPerplexityTracker(const Napi::CallbackInfo& info); + + /** + * Add surprisal value to tracker + * Args: handle (number), surprisal (number) + */ + Napi::Value addSurprisal(const Napi::CallbackInfo& info); + + /** + * Get current perplexity value + * Args: handle (number) + * Returns: number (perplexity) + */ + Napi::Value getPerplexity(const Napi::CallbackInfo& info); + + /** + * Clone perplexity tracker + * Args: sourceHandle (number) + * Returns: number (new handle) + */ + Napi::Value clonePerplexityTracker(const Napi::CallbackInfo& info); + + /** + * Reset tracker to initial state + * Args: handle (number) + */ + Napi::Value resetPerplexityTracker(const Napi::CallbackInfo& info); + + /** + * Get number of tokens tracked + * Args: handle (number) + * Returns: number (count) + */ + Napi::Value getPerplexityCount(const Napi::CallbackInfo& info); + + /** + * Free perplexity tracker resources + * Args: handle (number) + */ + Napi::Value freePerplexityTracker(const Napi::CallbackInfo& info); + private: // ===== INTERNAL STATE ===== @@ -274,6 +336,10 @@ class SessionContext : public Napi::ObjectWrap { std::unordered_map _samplerHandles; int32_t _nextSamplerHandle = 1; + // ===== HANDLE-BASED PERPLEXITY TRACKING ===== + std::unordered_map _perplexityHandles; + int32_t _nextPerplexityHandle = 1; + // ===== DECODE MUTEX ===== std::mutex _decodeMutex; @@ -306,6 +372,9 @@ class SessionContext : public Napi::ObjectWrap { return static_cast(pos); } + // Parse base string ("nats", "bits", "base10") to lloyal::metrics::Base enum + static lloyal::metrics::Base parseBase(const std::string& baseStr); + /** * Invalidate any active logits buffer (The Kill Switch) * diff --git a/test/api.js b/test/api.js index 7bcccca..dec5958 100644 --- 
a/test/api.js +++ b/test/api.js @@ -439,6 +439,142 @@ ws ::= [ \\t\\n]*`; ctx2.dispose(); console.log('βœ“ Multi-sequence context disposed\n'); + // ============================================================================ + // Test 17: Metrics API + // ============================================================================ + + console.log('πŸ”’ Test 17: Metrics API'); + + // Setup: Clear cache and decode first token to get valid logits + await ctx.kvCacheClear(); + await ctx.decode([tokens[0]], 0); + const token1 = ctx.greedySample(); + + // Test 17a: Model surprisal + const surprisalNats = ctx.modelSurprisal(token1, "nats"); + const surprisalBits = ctx.modelSurprisal(token1, "bits"); + + if (typeof surprisalNats !== 'number' || surprisalNats < 0) { + throw new Error('modelSurprisal(nats) should return non-negative number'); + } + + if (Math.abs(surprisalBits - surprisalNats / Math.log(2)) > 0.01) { + throw new Error('modelSurprisal(bits) should equal surprisal(nats) / ln(2)'); + } + + console.log(` βœ“ modelSurprisal: ${surprisalBits.toFixed(2)} bits`); + + // Test 17b: Model entropy + const entropyNats = ctx.modelEntropy("nats"); + const entropyBits = ctx.modelEntropy("bits"); + + if (typeof entropyNats !== 'number' || entropyNats < 0) { + throw new Error('modelEntropy should return non-negative number'); + } + + if (Math.abs(entropyBits - entropyNats / Math.log(2)) > 0.01) { + throw new Error('modelEntropy(bits) should equal entropy(nats) / ln(2)'); + } + + console.log(` βœ“ modelEntropy: ${entropyBits.toFixed(2)} bits`); + + // Test 17c: Perplexity tracker creation + const tracker = ctx.createPerplexityTracker(); + + if (typeof tracker !== 'number' || tracker <= 0) { + throw new Error('createPerplexityTracker should return positive integer handle'); + } + + console.log(` βœ“ createPerplexityTracker: handle=${tracker}`); + + // Test 17d: Add surprisals and check count + ctx.addSurprisal(tracker, surprisalNats); + + await ctx.decode([token1], 1); + const 
token2 = ctx.greedySample(); + const surprisal2 = ctx.modelSurprisal(token2); + ctx.addSurprisal(tracker, surprisal2); + + const count = ctx.getPerplexityCount(tracker); + if (count !== 2) { + throw new Error('getPerplexityCount should return correct count'); + } + + console.log(` βœ“ Added 2 surprisals, count=${count}`); + + // Test 17e: Get perplexity + const ppl = ctx.getPerplexity(tracker); + + if (typeof ppl !== 'number' || ppl < 1.0) { + throw new Error('getPerplexity should return value >= 1.0'); + } + + // Verify formula: PPL = exp(avg_surprisal) + const expectedPpl = Math.exp((surprisalNats + surprisal2) / 2); + if (Math.abs(ppl - expectedPpl) > 0.01) { + throw new Error('PPL formula should be exp(sum_surprisals / count)'); + } + + console.log(` βœ“ getPerplexity: ${ppl.toFixed(2)}`); + + // Test 17f: Clone tracker + const cloned = ctx.clonePerplexityTracker(tracker); + + if (typeof cloned !== 'number' || cloned === tracker) { + throw new Error('clonePerplexityTracker should return new unique handle'); + } + + const clonedPpl = ctx.getPerplexity(cloned); + if (Math.abs(clonedPpl - ppl) > 0.01) { + throw new Error('Cloned tracker should have same perplexity as original'); + } + + console.log(` βœ“ clonePerplexityTracker: cloned handle=${cloned}`); + + // Test 17g: Independent tracking + await ctx.decode([token2], 2); + const token3 = ctx.greedySample(); + const surprisal3 = ctx.modelSurprisal(token3); + + ctx.addSurprisal(tracker, surprisal3); // Add to original + const pplOriginal = ctx.getPerplexity(tracker); + const pplCloned = ctx.getPerplexity(cloned); + + if (pplOriginal === pplCloned) { + throw new Error('Original and cloned trackers should track independently'); + } + + console.log(` βœ“ Independent tracking: original=${pplOriginal.toFixed(2)}, cloned=${pplCloned.toFixed(2)}`); + + // Test 17h: Reset tracker + ctx.resetPerplexityTracker(cloned); + + const resetCount = ctx.getPerplexityCount(cloned); + if (resetCount !== 0) { + throw new 
Error('resetPerplexityTracker should set count to 0'); + } + + console.log(` βœ“ resetPerplexityTracker: count=${resetCount}`); + + // Test 17i: Free trackers + ctx.freePerplexityTracker(tracker); + ctx.freePerplexityTracker(cloned); + + console.log(` βœ“ Freed both trackers`); + + // Test 17j: Invalid handle error + try { + ctx.getPerplexity(tracker); // Already freed + throw new Error('Should throw on invalid handle'); + } catch (e) { + if (!e.message.includes('Invalid perplexity tracker handle')) { + throw new Error('Should have correct error message for invalid handle'); + } + console.log(` βœ“ Invalid handle throws error`); + } + + console.log('βœ… Test 17: Metrics API passed\n'); + // ===== SUCCESS ===== console.log('βœ… All integration tests passed!\n'); From 8973ca44d515379b7a2fa86c3cbc0254aa915907 Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Wed, 21 Jan 2026 23:05:31 +1100 Subject: [PATCH 2/9] feat(api): expose atomic clear and reseed --- lib/index.d.ts | 30 ++++++++++++++- src/SessionContext.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++ src/SessionContext.hpp | 7 ++++ 3 files changed, 118 insertions(+), 2 deletions(-) diff --git a/lib/index.d.ts b/lib/index.d.ts index 675e4ab..5c9b853 100644 --- a/lib/index.d.ts +++ b/lib/index.d.ts @@ -519,6 +519,32 @@ export interface SessionContext { */ kvCacheClear(): Promise; + /** + * Atomic clear+reseed operation + * + * Implements the StreamingLLM pattern: + * 1. Clear entire KV cache + * 2. Re-decode original sinks (first N tokens from conversation start) + * 3. 
Re-decode tail (last M recent tokens) + * + * + * @param sinks - ORIGINAL first N tokens from conversation start (typically 4) + * @param tail - Recent M tokens to preserve (typically 508-1020) + * @returns Promise that resolves when reseed completes + * + * @example + * ```typescript + * const ORIGINAL_SINKS = allTokens.slice(0, 4); + * + * const tail = allTokens.slice(-508); // Last 508 tokens + * await ctx.clearAndReseed(ORIGINAL_SINKS, tail); + * + * const nextToken = ctx.greedySample(); + * await ctx.decode([nextToken], 512); + * ``` + */ + clearAndReseed(sinks: number[], tail: number[]): Promise; + // ===== GRAMMAR-CONSTRAINED GENERATION ===== /** @@ -836,7 +862,7 @@ export interface SessionContext { * COST: O(1) - direct probability lookup from logits * REQUIRES: decode() called with logits=true */ - modelSurprisal(pickedTokenId: number, base?: "nats" | "bits"): number; + modelSurprisal(pickedTokenId: number, base?: 'nats' | 'bits'): number; /** * Compute entropy of the entire logits distribution. @@ -861,7 +887,7 @@ export interface SessionContext { * REQUIRES: decode() called with logits=true * ALGORITHM: Numerically stable log-sum-exp (metrics.hpp:73-81) */ - modelEntropy(base?: "nats" | "bits"): number; + modelEntropy(base?: 'nats' | 'bits'): number; /** * Create a new perplexity tracker. 
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp index c5b4a1c..c39f5b3 100644 --- a/src/SessionContext.cpp +++ b/src/SessionContext.cpp @@ -252,6 +252,42 @@ class KVCacheClearWorker : public Napi::AsyncWorker { llama_context* _ctx; }; +/** + * AsyncWorker for clearAndReseed operation (StreamingLLM) + * Uses lloyal::kv::clear_and_reseed() - the validated API + */ +class ClearAndReseedWorker : public Napi::AsyncWorker { +public: + ClearAndReseedWorker(Napi::Env env, llama_context* ctx, + std::vector sinks, + std::vector tail, + int32_t n_batch) + : AsyncWorker(env), _deferred(env), _ctx(ctx), + _sinks(std::move(sinks)), _tail(std::move(tail)), _n_batch(n_batch) {} + + void Execute() override { + // Use lloyal::kv::clear_and_reseed() - handles clear+decode atomically + lloyal::kv::clear_and_reseed(_ctx, _sinks, _tail, _n_batch); + } + + void OnOK() override { + _deferred.Resolve(Env().Undefined()); + } + + void OnError(const Napi::Error& err) override { + _deferred.Reject(err.Value()); + } + + Napi::Promise GetPromise() { return _deferred.Promise(); } + +private: + Napi::Promise::Deferred _deferred; + llama_context* _ctx; + std::vector _sinks; + std::vector _tail; + int32_t _n_batch; +}; + /** * AsyncWorker for kvCacheWriteFile operation * Writes KV cache state + tokens to a file for disk persistence @@ -558,6 +594,7 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) { InstanceMethod("kvCacheSave", &SessionContext::kvCacheSave), InstanceMethod("kvCacheLoad", &SessionContext::kvCacheLoad), InstanceMethod("kvCacheClear", &SessionContext::kvCacheClear), + InstanceMethod("clearAndReseed", &SessionContext::clearAndReseed), InstanceMethod("kvCacheWriteFile", &SessionContext::kvCacheWriteFile), InstanceMethod("kvCacheReadFile", &SessionContext::kvCacheReadFile), @@ -1852,6 +1889,11 @@ Napi::Value SessionContext::kvCacheRemove(const Napi::CallbackInfo& info) { throw Napi::TypeError::New(env, "Expected (sequenceId: number, start: number, 
end: number)"); } + // CRITICAL: Invalidate logits before KV cache modification + // Logits may reference positions that will be evicted + // (matches pattern from decode() line 801, encode() line 1035) + invalidateLogits(); + double sequenceId = info[0].As().DoubleValue(); double start = info[1].As().DoubleValue(); double end = info[2].As().DoubleValue(); @@ -1901,6 +1943,47 @@ Napi::Value SessionContext::kvCacheClear(const Napi::CallbackInfo& info) { return worker->GetPromise(); } +Napi::Value SessionContext::clearAndReseed(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + ensureNotDisposed(); + + // Args: sinks (Array), tail (Array) + if (info.Length() < 2 || !info[0].IsArray() || !info[1].IsArray()) { + throw Napi::TypeError::New(env, "Expected (sinks: number[], tail: number[])"); + } + + // Extract sinks array + Napi::Array jsSinks = info[0].As(); + std::vector sinks; + sinks.reserve(jsSinks.Length()); + for (uint32_t i = 0; i < jsSinks.Length(); i++) { + Napi::Value val = jsSinks.Get(i); + if (!val.IsNumber()) { + throw Napi::TypeError::New(env, "sinks array must contain only numbers"); + } + sinks.push_back(static_cast(val.As().Int32Value())); + } + + // Extract tail array + Napi::Array jsTail = info[1].As(); + std::vector tail; + tail.reserve(jsTail.Length()); + for (uint32_t i = 0; i < jsTail.Length(); i++) { + Napi::Value val = jsTail.Get(i); + if (!val.IsNumber()) { + throw Napi::TypeError::New(env, "tail array must contain only numbers"); + } + tail.push_back(static_cast(val.As().Int32Value())); + } + + // Use default batch size (512) from context params + int32_t n_batch = 512; + + auto* worker = new ClearAndReseedWorker(env, _context, std::move(sinks), std::move(tail), n_batch); + worker->Queue(); + return worker->GetPromise(); +} + Napi::Value SessionContext::kvCacheWriteFile(const Napi::CallbackInfo& info) { Napi::Env env = info.Env(); ensureNotDisposed(); diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp index 
720a53d..7253f49 100644 --- a/src/SessionContext.hpp +++ b/src/SessionContext.hpp @@ -146,6 +146,13 @@ class SessionContext : public Napi::ObjectWrap { Napi::Value kvCacheLoad(const Napi::CallbackInfo& info); Napi::Value kvCacheClear(const Napi::CallbackInfo& info); + /** + * Atomic clear+reseed operation for KV cache compression + * Args: sinks (Array), tail (Array) + * Returns: void (Promise) + */ + Napi::Value clearAndReseed(const Napi::CallbackInfo& info); + // ===== KV SEQUENCE OPERATIONS ===== /** From b8beb44f9e439cdb7e2e625a6012e1c8938e5c75 Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Fri, 23 Jan 2026 14:37:53 +1100 Subject: [PATCH 3/9] fix(core): fix namespaces --- src/FileSystem.h | 4 ++-- src/SessionContext.cpp | 24 ++++++++++++------------ src/SessionContext.hpp | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/FileSystem.h b/src/FileSystem.h index 8837168..dd65f1c 100644 --- a/src/FileSystem.h +++ b/src/FileSystem.h @@ -4,7 +4,7 @@ #include #include -namespace margelo::nitro::nitrollama { +namespace liblloyal_node { /** * File system operations and validation service @@ -103,4 +103,4 @@ namespace FileSystem { } } -} // namespace margelo::nitro::nitrollama +} // namespace liblloyal_node diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp index c39f5b3..07a5a46 100644 --- a/src/SessionContext.cpp +++ b/src/SessionContext.cpp @@ -38,8 +38,9 @@ struct LloyalSamplingParams { }; // Convert JS object params β†’ liblloyal-compatible structure -// Note: For now this is a placeholder - Phase 5 will implement full conversion -// from the new nested API structure (penalties, advanced, etc.) +// Currently supports basic parameters (temperature, topK, topP, minP, seed) +// and penalty group (repeat, frequency, presence, lastN). +// Advanced parameters (mirostat, dry, xtc, typical_p) to be added as liblloyal adds support. 
static LloyalSamplingParams adaptSamplingParamsFromJS(Napi::Object paramsObj) { LloyalSamplingParams adapted; @@ -78,12 +79,11 @@ static LloyalSamplingParams adaptSamplingParamsFromJS(Napi::Object paramsObj) { } } - // TODO Phase 5: Extract from advanced group (mirostat, dry, xtc) - // if (paramsObj.Has("advanced") && paramsObj.Get("advanced").IsObject()) { - // Napi::Object advanced = paramsObj.Get("advanced").As<Napi::Object>(); - // adapted.typical_p = advanced.Get("typicalP").As<Napi::Number>().FloatValue(); - // // Note: mirostat, dry, xtc not yet supported in liblloyal - // } + // Future: Extract from advanced group when liblloyal adds support + // - typical_p (Locally Typical Sampling) + // - mirostat (Mirostat 1.0/2.0) + // - dry (Don't Repeat Yourself) + // - xtc (Exclude Top Choices) return adapted; } @@ -457,7 +457,7 @@ class EncodeWorker : public Napi::AsyncWorker { void Execute() override { try { - lloyal::decoder::encode(_ctx, _tokens, lloyal::defaults::N_BATCH_PROCESS); + lloyal::embedding::encode(_ctx, _tokens, lloyal::defaults::N_BATCH_PROCESS); } catch (const std::exception& e) { SetError(e.what()); } @@ -2082,17 +2082,17 @@ Napi::Value CreateContext(const Napi::CallbackInfo& info) { BackendManager::ensureInitialized(); // Normalize and validate path BEFORE queuing async work - std::string fsPath = margelo::nitro::nitrollama::FileSystem::normalizePath(modelPath); + std::string fsPath = liblloyal_node::FileSystem::normalizePath(modelPath); if (fsPath != modelPath) { std::cout << "[CreateContext] Normalized " << modelPath << " → " << fsPath << std::endl; } - if (!margelo::nitro::nitrollama::FileSystem::exists(fsPath)) { + if (!liblloyal_node::FileSystem::exists(fsPath)) { std::cout << "[CreateContext] File does not exist: " << fsPath << std::endl; throw Napi::Error::New(env, "Model file not found: " + fsPath); } - size_t fileSize = margelo::nitro::nitrollama::FileSystem::getSize(fsPath); + size_t fileSize = liblloyal_node::FileSystem::getSize(fsPath); std::cout << 
"[CreateContext] File validated: " << fsPath << " (" << fileSize << " bytes)" << std::endl; // Load model on main thread diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp index 7253f49..def49fe 100644 --- a/src/SessionContext.hpp +++ b/src/SessionContext.hpp @@ -131,7 +131,7 @@ class SessionContext : public Napi::ObjectWrap { Napi::Value getMemorySize(const Napi::CallbackInfo& info); // ===== GRAMMAR-CONSTRAINED GENERATION ===== - // (To be implemented in Phase 4) + // Legacy single-grammar API (deprecated, use handle-based API below) Napi::Value initGrammar(const Napi::CallbackInfo& info); Napi::Value applyGrammar(const Napi::CallbackInfo& info); @@ -232,7 +232,7 @@ class SessionContext : public Napi::ObjectWrap { Napi::Value kvCacheReadFile(const Napi::CallbackInfo& info); // ===== HELPERS ===== - // (To be implemented in Phase 6) + // Utility functions (not yet implemented) Napi::Value jsonSchemaToGrammar(const Napi::CallbackInfo& info); Napi::Value validateChatTemplate(const Napi::CallbackInfo& info); From 2b44eb4e8fc8d694616aae7352482d7f1ec5a602 Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Fri, 23 Jan 2026 14:55:01 +1100 Subject: [PATCH 4/9] feat(dist): Add linux-arm64 (CPU + CUDA) via Docker / QEMU, Add win32-x64-vulkan support, Update docs to reflect --- .github/workflows/release.yml | 72 +++++++++++++++- docs/distribution.md | 152 +++++++++++++++++++--------------- package.json | 5 +- 3 files changed, 161 insertions(+), 68 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 901b847..6e641be 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -47,6 +47,23 @@ jobs: gpu: cuda cuda_version: 12.2.0 package: win32-x64-cuda + - os: windows-2022 + arch: x64 + gpu: vulkan + package: win32-x64-vulkan + + # Linux ARM64 (new for v1.0) + - os: ubuntu-22.04 + arch: arm64 + gpu: cpu + package: linux-arm64 + docker_platform: linux/arm64 + - os: ubuntu-22.04 + arch: arm64 + gpu: cuda 
+ package: linux-arm64-cuda + docker_image: nvcr.io/nvidia/l4t-cuda:12.6-devel + docker_platform: linux/arm64 steps: - name: Checkout code @@ -87,15 +104,68 @@ jobs: with: cuda: '12.2.0' + - name: Install Vulkan SDK (Windows) + if: matrix.gpu == 'vulkan' && runner.os == 'Windows' + shell: pwsh + run: | + $url = "https://sdk.lunarg.com/sdk/download/1.3.275.0/windows/VulkanSDK-1.3.275.0-Installer.exe" + Invoke-WebRequest -Uri $url -OutFile VulkanSDK.exe + Start-Process -FilePath .\VulkanSDK.exe -ArgumentList '/S' -Wait + echo "VULKAN_SDK=C:\VulkanSDK\1.3.275.0" | Out-File -FilePath $env:GITHUB_ENV -Append + + - name: Setup QEMU for ARM64 + if: matrix.arch == 'arm64' && runner.os == 'Linux' + uses: docker/setup-qemu-action@v3 + with: + platforms: linux/arm64 + # Build - name: Install npm dependencies + if: matrix.arch != 'arm64' || runner.os != 'Linux' run: npm install - - name: Build native module + - name: Build native module (x64 or native ARM64) + if: matrix.arch != 'arm64' || runner.os != 'Linux' run: npm run build env: LLOYAL_GPU: ${{ matrix.gpu }} + - name: Build native module (ARM64 via Docker) + if: matrix.arch == 'arm64' && runner.os == 'Linux' + shell: bash + run: | + # Determine Docker image + if [ -n "${{ matrix.docker_image }}" ]; then + IMAGE="${{ matrix.docker_image }}" + else + IMAGE="arm64v8/ubuntu:22.04" + fi + + # Build inside ARM64 container + docker run --rm --platform ${{ matrix.docker_platform }} \ + -v $PWD:/workspace -w /workspace \ + -e LLOYAL_GPU=${{ matrix.gpu }} \ + $IMAGE bash -c " + # Install build dependencies + apt-get update + apt-get install -y build-essential cmake git curl + + # Install Node.js 20 + curl -fsSL https://deb.nodesource.com/setup_20.x | bash - + apt-get install -y nodejs + + # Install CUDA toolkit if needed + if [ '${{ matrix.gpu }}' = 'cuda' ]; then + apt-get install -y cuda-toolkit-12-6 || true + fi + + # Build + npm install + npm run build + " + env: + LLOYAL_GPU: ${{ matrix.gpu }} + # Package - name: Create 
platform package shell: bash diff --git a/docs/distribution.md b/docs/distribution.md index 42e4d37..e8bb519 100644 --- a/docs/distribution.md +++ b/docs/distribution.md @@ -225,27 +225,29 @@ npm publish --- -## Phase 2: Core Platform Prebuilts +## Phase 2: Core Platform Prebuilts βœ… COMPLETE ### Overview -**Audience:** Production users on common platforms -**Timeline:** v0.5.0 - v1.0.0 -**Distribution:** npm registry with 3 prebuilt packages +**Status:** βœ… Implemented (v0.1.0) +**Audience:** Production users on common x64 platforms +**Distribution:** 7 npm packages covering 80%+ of developers -### Platform Selection +### Platform Packages (Implemented) -Target the **top 3 most common developer platforms**: +| Package | Platform | Arch | GPU | Status | +|---------|----------|------|-----|--------| +| `@lloyal/lloyal.node-darwin-arm64` | macOS | arm64 | Metal | βœ… Working | +| `@lloyal/lloyal.node-darwin-x64` | macOS | x64 | CPU | βœ… Working | +| `@lloyal/lloyal.node-linux-x64` | Linux | x64 | CPU | βœ… Working | +| `@lloyal/lloyal.node-linux-x64-cuda` | Linux | x64 | CUDA 12.2 | βœ… Working | +| `@lloyal/lloyal.node-linux-x64-vulkan` | Linux | x64 | Vulkan | βœ… Working | +| `@lloyal/lloyal.node-win32-x64` | Windows | x64 | CPU | βœ… Working | +| `@lloyal/lloyal.node-win32-x64-cuda` | Windows | x64 | CUDA 12.2 | βœ… Working | -| Package | Platform | Arch | GPU | Coverage | -|---------|----------|------|-----|----------| -| `@lloyal/lloyal.node-darwin-arm64` | macOS | arm64 | Metal | ~40% | -| `@lloyal/lloyal.node-linux-x64` | Linux | x64 | CPU | ~20% | -| `@lloyal/lloyal.node-win32-x64` | Windows | x64 | CPU | ~10% | +**Total coverage:** ~80% of developers with instant install -**Total coverage:** ~70% of developers with instant install - -**Unsupported platforms** (Linux arm64, macOS x64, Windows arm64): Fallback to source build +**Note:** Original Phase 2 plan was 3 packages, but we exceeded expectations by implementing 7 packages including GPU variants. 
### Architecture @@ -481,35 +483,42 @@ if (mainPkg.optionalDependencies) { --- -## Phase 3: Full Platform Matrix +## Phase 3: Full Platform Matrix βš™οΈ IN PROGRESS (v1.0) ### Overview +**Status:** βš™οΈ Implementing (target: v1.0.0) **Audience:** All users, all platforms, all GPU variants -**Timeline:** v1.x.x+ (mature project with resources) -**Distribution:** 10+ platform/GPU packages +**Timeline:** v1.0.0 +**Distribution:** 10 platform/GPU packages covering 95%+ deployments -### Platform Packages +### Platform Packages (v1.0 Target) -**CPU-only (6 packages):** -``` -@lloyal/lloyal.node-darwin-arm64 (macOS Apple Silicon, Metal built-in) -@lloyal/lloyal.node-darwin-x64 (macOS Intel, CPU only) -@lloyal/lloyal.node-linux-x64 (Linux x64, CPU only) -@lloyal/lloyal.node-linux-arm64 (Linux ARM64, CPU only) -@lloyal/lloyal.node-win32-x64 (Windows x64, CPU only) -@lloyal/lloyal.node-win32-arm64 (Windows ARM64, CPU only) -``` +**Already Implemented (7 packages from Phase 2+):** +- βœ… `@lloyal/lloyal.node-darwin-arm64` (macOS Apple Silicon, Metal) +- βœ… `@lloyal/lloyal.node-darwin-x64` (macOS Intel, CPU) +- βœ… `@lloyal/lloyal.node-linux-x64` (Linux x64, CPU) +- βœ… `@lloyal/lloyal.node-linux-x64-cuda` (Linux x64 + CUDA 12.2) +- βœ… `@lloyal/lloyal.node-linux-x64-vulkan` (Linux x64 + Vulkan) +- βœ… `@lloyal/lloyal.node-win32-x64` (Windows x64, CPU) +- βœ… `@lloyal/lloyal.node-win32-x64-cuda` (Windows x64 + CUDA 12.2) -**GPU variants (6+ packages):** -``` -@lloyal/lloyal.node-linux-x64-cuda (Linux x64 + CUDA) -@lloyal/lloyal.node-linux-x64-vulkan (Linux x64 + Vulkan) -@lloyal/lloyal.node-linux-arm64-cuda (Linux ARM64 + CUDA) -@lloyal/lloyal.node-linux-arm64-vulkan (Linux ARM64 + Vulkan) -@lloyal/lloyal.node-win32-x64-cuda (Windows x64 + CUDA) -@lloyal/lloyal.node-win32-x64-vulkan (Windows x64 + Vulkan) -``` +**New for v1.0 (3 packages):** +- πŸ”„ `@lloyal/lloyal.node-linux-arm64` (Linux ARM64 - AWS Graviton, Raspberry Pi) +- πŸ”„ `@lloyal/lloyal.node-linux-arm64-cuda` 
(Linux ARM64 + CUDA - NVIDIA Jetson) +- πŸ”„ `@lloyal/lloyal.node-win32-x64-vulkan` (Windows x64 + Vulkan - AMD/Intel GPU) + +**Deferred to v1.1+ (2 packages):** +- ⏸️ `@lloyal/lloyal.node-win32-arm64` (Windows ARM64 - awaiting GitHub Actions ARM64 Windows runners) +- ⏸️ `@lloyal/lloyal.node-darwin-x64-vulkan` (macOS Intel + eGPU - negligible use case) + +### What Changed from Original Plan + +**Original Phase 3 (docs):** 12 packages including win32-arm64, darwin-x64-vulkan + +**Actual v1.0 Phase 3:** 10 packages + +**Rationale:** 10 packages cover 95%+ of real-world usage. Remaining 2 packages require infrastructure not yet available (win32-arm64) or serve minimal users (darwin-x64-vulkan). ### GPU Variant Installation @@ -552,51 +561,62 @@ if (!hasVariant(gpu)) { } ``` -### CI/CD Expansion +### CI/CD Implementation (v1.0) -Expand build matrix to 12+ jobs: +Build matrix with 10 jobs (see `.github/workflows/release.yml`): ```yaml strategy: matrix: include: - # CPU variants - - { os: macos-14, arch: arm64, variant: default } - - { os: macos-13, arch: x64, variant: default } - - { os: ubuntu-22.04, arch: x64, variant: default } - - { os: ubuntu-22.04-arm, arch: arm64, variant: default } - - { os: windows-latest, arch: x64, variant: default } - - { os: windows-latest, arch: arm64, variant: default } + # macOS (2 jobs) + - { os: macos-14, arch: arm64, gpu: metal, package: darwin-arm64 } + - { os: macos-13, arch: x64, gpu: cpu, package: darwin-x64 } - # CUDA variants - - { os: ubuntu-22.04, arch: x64, variant: cuda, container: nvidia/cuda:12.6 } - - { os: ubuntu-22.04-arm, arch: arm64, variant: cuda, container: nvidia/cuda:12.6 } - - { os: windows-latest, arch: x64, variant: cuda, cuda-version: 12.9 } + # Linux x64 (3 jobs) + - { os: ubuntu-22.04, arch: x64, gpu: cpu, package: linux-x64 } + - { os: ubuntu-22.04, arch: x64, gpu: cuda, package: linux-x64-cuda } + - { os: ubuntu-22.04, arch: x64, gpu: vulkan, package: linux-x64-vulkan } - # Vulkan variants - - { os: 
ubuntu-22.04, arch: x64, variant: vulkan } - - { os: ubuntu-22.04-arm, arch: arm64, variant: vulkan } - - { os: windows-latest, arch: x64, variant: vulkan } + # Linux ARM64 (2 jobs - Docker + QEMU) + - { os: ubuntu-22.04, arch: arm64, gpu: cpu, package: linux-arm64, docker_platform: linux/arm64 } + - { os: ubuntu-22.04, arch: arm64, gpu: cuda, package: linux-arm64-cuda, docker_image: nvcr.io/nvidia/l4t-cuda:12.6-devel } + + # Windows (3 jobs) + - { os: windows-2022, arch: x64, gpu: cpu, package: win32-x64 } + - { os: windows-2022, arch: x64, gpu: cuda, package: win32-x64-cuda, cuda_version: 12.2.0 } + - { os: windows-2022, arch: x64, gpu: vulkan, package: win32-x64-vulkan } ``` -### Pros & Cons +**Key Implementation Details:** +- **ARM64 builds:** Use Docker + QEMU for cross-compilation (GitHub Actions has no native ARM64 Linux runners) +- **CUDA ARM64:** Use NVIDIA L4T (Linux for Tegra) Docker image for Jetson compatibility +- **Vulkan Windows:** Install LunarG Vulkan SDK during CI build step + +### Pros & Cons (v1.0 Implementation) **Pros:** -- Best user experience (instant install + optimal performance) -- Covers 100% of platforms -- GPU acceleration out of box +- Excellent user experience (instant install + optimal performance) +- Covers 95%+ of real-world deployments +- GPU acceleration out of box (CUDA, Vulkan, Metal) +- ARM64 support (AWS Graviton, Jetson, Raspberry Pi) - Professional distribution **Cons:** -- Complex CI/CD (12+ jobs, cross-compilation, GPU toolchains) -- High maintenance burden (12+ packages to version/publish) -- Storage/bandwidth costs ($$$) -- Platform-specific bugs to debug - -### When to Use - -- Established project with funding/resources -- Large user base demanding GPU support +- Moderate CI/CD complexity (10 jobs, cross-compilation, GPU toolchains) +- Maintenance burden (10 packages to version/publish) +- Storage/bandwidth costs (50-150MB per package) +- Platform-specific bugs to debug (especially ARM64 QEMU builds) +- Cannot fully 
test all platforms in CI (no ARM64 hardware runners) + +### Success Metrics + +**Phase 3 v1.0 considered successful when:** +- All 10 platform packages build successfully in CI +- All 10 packages published to npm registry +- `npm install lloyal.node` works on all 10 platforms +- Community validation on ARM64 hardware (Graviton, Raspberry Pi, Jetson) +- No regression in existing 7 packages - Commercial product expectations --- diff --git a/package.json b/package.json index 3f7392a..e802b67 100644 --- a/package.json +++ b/package.json @@ -52,8 +52,11 @@ "@lloyal/lloyal.node-linux-x64": "0.1.0", "@lloyal/lloyal.node-linux-x64-cuda": "0.1.0", "@lloyal/lloyal.node-linux-x64-vulkan": "0.1.0", + "@lloyal/lloyal.node-linux-arm64": "0.1.0", + "@lloyal/lloyal.node-linux-arm64-cuda": "0.1.0", "@lloyal/lloyal.node-win32-x64": "0.1.0", - "@lloyal/lloyal.node-win32-x64-cuda": "0.1.0" + "@lloyal/lloyal.node-win32-x64-cuda": "0.1.0", + "@lloyal/lloyal.node-win32-x64-vulkan": "0.1.0" }, "engines": { "node": ">=18.0.0" From 5cfa63422e0f2173411af707da66afa62c6c5b15 Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Fri, 23 Jan 2026 16:07:29 +1100 Subject: [PATCH 5/9] feat(sync): liblloyal v1.0.0-alpha --- vendor/VERSIONS.json | 12 +- vendor/liblloyal/CMakeLists.txt | 4 +- vendor/liblloyal/LICENSE | 201 ++++++++++ vendor/liblloyal/README.md | 4 +- .../include/lloyal/chat_template.hpp | 22 +- vendor/liblloyal/include/lloyal/common.hpp | 5 +- vendor/liblloyal/include/lloyal/decoder.hpp | 89 +---- vendor/liblloyal/include/lloyal/embedding.hpp | 115 +++++- vendor/liblloyal/include/lloyal/grammar.hpp | 20 +- vendor/liblloyal/include/lloyal/helpers.hpp | 169 ++++++++- .../include/lloyal/json-schema-to-grammar.hpp | 62 ++- vendor/liblloyal/include/lloyal/kv.hpp | 355 +++++++++++------- vendor/liblloyal/include/lloyal/logits.hpp | 4 + vendor/liblloyal/include/lloyal/metrics.hpp | 3 + .../include/lloyal/model_registry.hpp | 55 ++- vendor/liblloyal/include/lloyal/sampler.hpp | 29 +- 
vendor/liblloyal/include/lloyal/tokenizer.hpp | 16 +- vendor/llama.cpp/README.md | 2 +- 18 files changed, 851 insertions(+), 316 deletions(-) create mode 100644 vendor/liblloyal/LICENSE diff --git a/vendor/VERSIONS.json b/vendor/VERSIONS.json index c9e8572..58874cf 100644 --- a/vendor/VERSIONS.json +++ b/vendor/VERSIONS.json @@ -1,17 +1,17 @@ { - "vendoredAt": "2026-01-18T13:51:23.070Z", + "vendoredAt": "2026-01-23T04:38:31.451Z", "vendors": { "liblloyal": { - "commit": "2fd20a50213b99589b91b65356eac8e67695b903", - "commitShort": "2fd20a5", - "fileCount": 19, - "vendoredAt": "2026-01-18T13:51:23.104Z" + "commit": "0c5f79d590a3594edad763bea7782f8eaf522e43", + "commitShort": "0c5f79d", + "fileCount": 20, + "vendoredAt": "2026-01-23T04:38:31.487Z" }, "llama.cpp": { "commit": "338074c383c81366320d176d83b94b0a567ee0c2", "commitShort": "338074c", "fileCount": 170, - "vendoredAt": "2026-01-18T13:51:23.321Z" + "vendoredAt": "2026-01-23T04:38:31.667Z" } } } \ No newline at end of file diff --git a/vendor/liblloyal/CMakeLists.txt b/vendor/liblloyal/CMakeLists.txt index 6848052..4cff1b4 100644 --- a/vendor/liblloyal/CMakeLists.txt +++ b/vendor/liblloyal/CMakeLists.txt @@ -6,8 +6,8 @@ project(liblloyal VERSION 0.1.0 LANGUAGES CXX) # ============================================================================= # # This library provides type-safe, ergonomic wrappers around llama.cpp for -# React Native shells (calibrate-ndk, nitro-llama). All implementations are -# header-only with inline specifiers. +# multiple language bindings. All implementations are header-only with inline +# specifiers. # # Dependencies: # - llama.cpp (b6870 or compatible) diff --git a/vendor/liblloyal/LICENSE b/vendor/liblloyal/LICENSE new file mode 100644 index 0000000..a0e99cc --- /dev/null +++ b/vendor/liblloyal/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2026 Lloyal Labs + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/liblloyal/README.md b/vendor/liblloyal/README.md index 3fde816..1a1e7ac 100644 --- a/vendor/liblloyal/README.md +++ b/vendor/liblloyal/README.md @@ -3,8 +3,8 @@ This directory contains vendored sources from the liblloyal project. **Source:** liblloyal/ git submodule -**Commit:** 2fd20a50213b99589b91b65356eac8e67695b903 -**Vendored:** 2026-01-18T13:51:23.104Z +**Commit:** 0c5f79d590a3594edad763bea7782f8eaf522e43 +**Vendored:** 2026-01-23T04:38:31.487Z **DO NOT EDIT:** Files in this directory are copied from git submodules. 
To update, run: npm run update-vendors diff --git a/vendor/liblloyal/include/lloyal/chat_template.hpp b/vendor/liblloyal/include/lloyal/chat_template.hpp index 02a88ed..12a1ae3 100644 --- a/vendor/liblloyal/include/lloyal/chat_template.hpp +++ b/vendor/liblloyal/include/lloyal/chat_template.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" #include "helpers.hpp" #include @@ -8,24 +11,23 @@ #include /** - * Chat Template Orchestration Layer (Header-Only) + * @file chat_template.hpp + * @brief Chat Template Formatting * - * Purpose: Wraps helpers.hpp chat template functions with fallback error - * handling. NOT just re-exports - adds orchestration and fallback logic. + * Orchestrates chat template processing with fallback error handling. + * Wraps helpers.hpp functions and adds graceful degradation when template + * processing fails. * * Architecture: - * - Uses public functions from helpers.hpp (format_chat_template_complete, - * validate_chat_template_helper) - * - Adds fallback to simple "role: content" format when template processing - * fails - * - Provides clean API for chat template formatting + * - Uses format_chat_template_complete() and validate_chat_template_helper() from helpers.hpp + * - Adds fallback to simple "role: content" format on errors + * - Provides clean FormatResult API for template formatting + stop token extraction */ namespace lloyal::chat_template { /** * Result from chat template formatting - * SOURCE: ChatTemplate.h:24-28 * NOTE: Named FormatResult, NOT ChatTemplateResult */ struct FormatResult { @@ -35,7 +37,6 @@ struct FormatResult { /** * Format chat messages using model's chat template with fallback - * SOURCE: ChatTemplate.h:51-55 * * Orchestration logic: * 1. 
Calls format_chat_template_complete() from helpers.hpp @@ -119,7 +120,6 @@ inline FormatResult format(const llama_model *model, /** * Validate chat template syntax - * SOURCE: ChatTemplate.h:68 * * Calls validate_chat_template_helper() from helpers.hpp. * Does NOT require a model (syntax-only validation). diff --git a/vendor/liblloyal/include/lloyal/common.hpp b/vendor/liblloyal/include/lloyal/common.hpp index f4e361f..a283c08 100644 --- a/vendor/liblloyal/include/lloyal/common.hpp +++ b/vendor/liblloyal/include/lloyal/common.hpp @@ -1,11 +1,14 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + /** * liblloyal - Common definitions and logging * * Header-only library for llama.cpp-bound LLM operations * Version: 1.0.0 (bound to llama.cpp b6870) - * License: MIT + * License: Apache-2.0 */ // ===== PLATFORM-NATIVE LOGGING ===== diff --git a/vendor/liblloyal/include/lloyal/decoder.hpp b/vendor/liblloyal/include/lloyal/decoder.hpp index ff2d3db..18c9979 100644 --- a/vendor/liblloyal/include/lloyal/decoder.hpp +++ b/vendor/liblloyal/include/lloyal/decoder.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" #include "helpers.hpp" #include @@ -30,12 +33,13 @@ #endif /** - * Decoder Anti-Corruption Layer (Header-Only) + * @file decoder.hpp + * @brief Batch Decoding Operations * - * Purpose: Single point of contact with llama.cpp decode APIs to isolate batch - * management complexity, chunking logic, and decode operation orchestration. + * Wraps llama.cpp decode APIs with batch management, chunking logic, and + * orchestration primitives. Provides both batched and single-token decode operations. * - * Calls helpers.hpp batch utilities (batch_clear, batch_add). + * Uses batch utilities from helpers.hpp (batch_clear, batch_add) for token management. 
*/ namespace lloyal::detail { @@ -237,81 +241,4 @@ inline void decode_one(llama_context *ctx, llama_token tok, llama_pos pos, } } -/** - * Encode tokens for embedding extraction - * - * Unlike decode_tokens(), this marks ALL tokens with logits=true which is - * required for embedding extraction. - * - * NOTE: Use this with a dedicated embedding context (embeddings=true, pooling - * enabled). Clear KV between texts with kv::clear_all(): - * - * // Create dedicated embedding context - * ctx_params.embeddings = true; - * ctx_params.pooling_type = LLAMA_POOLING_TYPE_MEAN; - * auto embed_ctx = llama_init_from_model(model, ctx_params); - * - * // Embed each text - * kv::clear_all(embed_ctx); - * decoder::encode(embed_ctx, tokens, 512); - * auto emb = embedding::get(embed_ctx); - * - * @param ctx Llama context (must have embeddings=true and pooling enabled) - * @param tokens Token array to encode - * @param n_tokens Number of tokens in array - * @param n_batch Batch size - * @throws std::runtime_error if encode fails - */ -inline void encode(llama_context *ctx, const llama_token *tokens, - int32_t n_tokens, int32_t n_batch) { - LLOYAL_LOG_DEBUG("[decoder::encode] Encoding %d tokens for embeddings", - n_tokens); - - if (!ctx) { - LLOYAL_LOG_DEBUG("[decoder::encode] ERROR: NULL context"); - throw std::runtime_error("decoder::encode - NULL context"); - } - - if (!tokens || n_tokens <= 0) { - LLOYAL_LOG_DEBUG("[decoder::encode] ERROR: Invalid token array"); - throw std::runtime_error("decoder::encode - Invalid token array"); - } - - if (n_tokens > n_batch) { - LLOYAL_LOG_DEBUG("[decoder::encode] ERROR: n_tokens (%d) > n_batch (%d)", - n_tokens, n_batch); - throw std::runtime_error( - "decoder::encode - token count exceeds batch size (truncation not " - "supported, increase n_batch or reduce input length)"); - } - - // Initialize batch - single sequence - llama_batch batch = llama_batch_init(n_batch, 0, 1); - detail::BatchGuard batch_guard(batch); - - // Clear batch - 
lloyal::batch_clear(batch); - - // Add ALL tokens with logits=true (required for embedding extraction) - for (int32_t i = 0; i < n_tokens; ++i) { - lloyal::batch_add(batch, tokens[i], i, {0}, true, n_batch); - } - - // Decode/encode the batch (llama.cpp handles encoder vs decoder internally) - if (llama_decode(ctx, batch) != 0) { - LLOYAL_LOG_DEBUG("[decoder::encode] ERROR: llama_decode failed"); - throw std::runtime_error("decoder::encode - llama_decode failed"); - } - - LLOYAL_LOG_DEBUG("[decoder::encode] Encode complete"); -} - -/** - * Convenience overload for std::vector<llama_token> - */ -inline void encode(llama_context *ctx, const std::vector<llama_token> &tokens, - int32_t n_batch) { - encode(ctx, tokens.data(), static_cast<int32_t>(tokens.size()), n_batch); -} - } // namespace lloyal::decoder diff --git a/vendor/liblloyal/include/lloyal/embedding.hpp b/vendor/liblloyal/include/lloyal/embedding.hpp index 9f57f24..cf49d61 100644 --- a/vendor/liblloyal/include/lloyal/embedding.hpp +++ b/vendor/liblloyal/include/lloyal/embedding.hpp @@ -1,6 +1,11 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" +#include "helpers.hpp" +#include #include #include #include @@ -8,17 +13,18 @@ #include /** - * Embeddings Anti-Corruption Layer (Header-Only) + * @file embedding.hpp + * @brief Embedding Extraction and Normalization * - * Purpose: Single point of contact with llama.cpp embedding APIs to isolate - * version churn, pooling modes, and normalization complexity. + * Wraps llama.cpp embedding APIs with pooling mode management and L2 normalization. + * Provides both context-bound extraction and model capability checks. 
* - * ARCHITECTURE: - * - Primitives accept context directly (embeddings are context-bound) - * - Model-accepting overloads provided for capability checks - * - L2 normalization built-in (required for cosine similarity) + * Architecture: + * - Context-bound primitives for embedding extraction + * - Model-accepting overloads for capability checks + * - Built-in L2 normalization for cosine similarity * - * USAGE: + * @example * // Check model supports embeddings * if (embedding::has_embeddings(model)) { * int32_t dim = embedding::dimension(model); @@ -152,6 +158,99 @@ inline void apply_l2_normalize(std::vector &vec) { } // namespace detail +// ===== RAII GUARD FOR BATCH CLEANUP ===== + +namespace detail { +/** + * RAII guard for automatic batch cleanup + * Ensures llama_batch_free is called even if exceptions occur + */ +struct BatchGuard { + llama_batch &batch; + explicit BatchGuard(llama_batch &b) : batch(b) {} + ~BatchGuard() { llama_batch_free(batch); } +}; +} // namespace detail + +// ===== ENCODING (FORWARD PASS FOR EMBEDDINGS) ===== + +/** + * Encode tokens for embedding extraction + * + * Unlike decoder::decode_tokens(), this marks ALL tokens with logits=true which is + * required for embedding extraction. + * + * NOTE: Use this with a dedicated embedding context (embeddings=true, pooling + * enabled). 
Clear KV between texts with kv::clear_all(): + * + * // Create dedicated embedding context + * ctx_params.embeddings = true; + * ctx_params.pooling_type = LLAMA_POOLING_TYPE_MEAN; + * auto embed_ctx = llama_init_from_model(model, ctx_params); + * + * // Embed each text + * kv::clear_all(embed_ctx); + * embedding::encode(embed_ctx, tokens, 512); + * auto emb = embedding::get(embed_ctx); + * + * @param ctx Llama context (must have embeddings=true and pooling enabled) + * @param tokens Token array to encode + * @param n_tokens Number of tokens in array + * @param n_batch Batch size + * @throws std::runtime_error if encode fails + */ +inline void encode(llama_context *ctx, const llama_token *tokens, + int32_t n_tokens, int32_t n_batch) { + LLOYAL_LOG_DEBUG("[embedding::encode] Encoding %d tokens for embeddings", + n_tokens); + + if (!ctx) { + LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: NULL context"); + throw std::runtime_error("embedding::encode - NULL context"); + } + + if (!tokens || n_tokens <= 0) { + LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: Invalid token array"); + throw std::runtime_error("embedding::encode - Invalid token array"); + } + + if (n_tokens > n_batch) { + LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: n_tokens (%d) > n_batch (%d)", + n_tokens, n_batch); + throw std::runtime_error( + "embedding::encode - token count exceeds batch size (truncation not " + "supported, increase n_batch or reduce input length)"); + } + + // Initialize batch - single sequence + llama_batch batch = llama_batch_init(n_batch, 0, 1); + detail::BatchGuard batch_guard(batch); + + // Clear batch + lloyal::batch_clear(batch); + + // Add ALL tokens with logits=true (required for embedding extraction) + for (int32_t i = 0; i < n_tokens; ++i) { + lloyal::batch_add(batch, tokens[i], i, {0}, true, n_batch); + } + + // Decode/encode the batch (llama.cpp handles encoder vs decoder internally) + if (llama_decode(ctx, batch) != 0) { + LLOYAL_LOG_DEBUG("[embedding::encode] ERROR: 
llama_decode failed"); + throw std::runtime_error("embedding::encode - llama_decode failed"); + } + + LLOYAL_LOG_DEBUG("[embedding::encode] Encode complete"); +} + +/** + * Convenience overload for std::vector<llama_token> + */ +inline void encode(llama_context *ctx, const std::vector<llama_token> &tokens, + int32_t n_batch) { + encode(ctx, tokens.data(), static_cast<int32_t>(tokens.size()), n_batch); +} + // ===== EMBEDDING EXTRACTION ===== /** diff --git a/vendor/liblloyal/include/lloyal/grammar.hpp b/vendor/liblloyal/include/lloyal/grammar.hpp index 91a704b..2e419e1 100644 --- a/vendor/liblloyal/include/lloyal/grammar.hpp +++ b/vendor/liblloyal/include/lloyal/grammar.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" #include "json-schema-to-grammar.hpp" #include "tokenizer.hpp" @@ -9,17 +12,18 @@ #include /** - * Grammar Anti-Corruption Layer (Header-Only) + * @file grammar.hpp + * @brief Grammar-Constrained Sampling - * - * PURPOSE: Provides JSON schema to GBNF grammar conversion for structured - * output + * Provides JSON schema to GBNF grammar conversion for structured output generation. + * Wraps json-schema-to-grammar.hpp conversion logic with error handling and logging. 
* - * ARCHITECTURE: - * - This layer CALLS json_schema_to_grammar from json-schema-to-grammar.hpp - * - Does NOT reimplement conversion logic - * - Provides error handling, logging, and consistent API + * Architecture: + * - Calls json_schema_to_grammar() from json-schema-to-grammar.hpp + * - Adds error handling, logging, and consistent API + * - Manages grammar sampler lifecycle * - * USAGE: + * @example * std::string gbnf = lloyal::grammar::from_json_schema(schemaJsonString); * // Pass to sampler::sample_with_params() via grammarSampler parameter */ diff --git a/vendor/liblloyal/include/lloyal/helpers.hpp b/vendor/liblloyal/include/lloyal/helpers.hpp index b181755..899d5dc 100644 --- a/vendor/liblloyal/include/lloyal/helpers.hpp +++ b/vendor/liblloyal/include/lloyal/helpers.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" #include "minja/chat-template.hpp" #include "minja/minja.hpp" @@ -14,8 +17,17 @@ #include /** - * Helper utilities vendored from llama.cpp/common/ - * MIT License - Copyright (c) 2023-2024 The ggml.ai team + * @file helpers.hpp + * @brief Helper Utilities + * + * Collection of utility functions for common llama.cpp operations: + * - Batch operations: Build and manage token batches for decoding + * - Chat template processing: Format messages, extract stop tokens, validate templates + * - Parameter conversion: KV cache type mapping, string validation helpers + * - String utilities: Repeat, join, split operations + * + * Source: Vendored from llama.cpp/common/ + * License: MIT License - Copyright (c) 2023-2024 The ggml.ai team */ // Forward declarations for detail namespace (defined at end of file) @@ -39,10 +51,36 @@ using json = nlohmann::ordered_json; // ===== BATCH UTILITIES ===== -// Clear batch to empty state (reset n_tokens) +/** + * @brief Clear batch to empty state + * + * Resets the batch token counter to prepare for new tokens. 
+ * Does not deallocate buffer memory. + * + * @param batch Batch to clear (modified in place) + * + * @note Only resets n_tokens counter, buffer capacity remains unchanged + */ inline void batch_clear(llama_batch &batch) { batch.n_tokens = 0; } -// Add single token to batch with position and sequence info +/** + * @brief Add single token to batch with position and sequence info + * + * Appends a token to the batch at the current n_tokens position, then increments + * the counter. Assigns position embedding, sequence IDs, and logits flag. + * + * @param batch Batch to modify (appends token at batch.n_tokens) + * @param id Token ID to add + * @param pos Position embedding for this token (e.g., 0, 1, 2...) + * @param seq_ids Sequence IDs this token belongs to (usually single-element vector {0}) + * @param logits Whether to compute logits for this token + * @param capacity Optional capacity check for DEBUG builds (default: -1 disables check) + * + * @warning Caller must ensure batch has sufficient capacity (n_tokens < n_max) + * to avoid buffer overflow. No runtime bounds checking in release builds. + * + * @note DEBUG builds enable capacity assertion if capacity > 0 + */ inline void batch_add(llama_batch &batch, llama_token id, int32_t pos, const std::vector &seq_ids, bool logits, int32_t capacity = -1) { @@ -66,12 +104,38 @@ inline void batch_add(llama_batch &batch, llama_token id, int32_t pos, // ===== CHAT TEMPLATE TYPES (PUBLIC API) ===== +/** + * @brief Result from complete chat template processing + * + * Contains formatted prompt and dynamically detected stop tokens specific + * to the model's chat template (ChatML, Llama-3, etc.). 
+ */ struct ChatTemplateResult { - std::string prompt; - std::vector<std::string> additional_stops; + std::string prompt; ///< Formatted chat prompt ready for tokenization + std::vector<std::string> additional_stops; ///< Template-specific stop tokens (e.g., "<|im_end|>", "<|eot_id|>") }; -// Format chat messages using model's built-in template +/** + * @brief Format chat messages using model's built-in template + * + * Applies chat template (Jinja2) to format message array into a single prompt string. + * Automatically queries model metadata for BOS/EOS tokens and add_bos/add_eos flags. + * + * Template selection hierarchy: + * 1. template_override (if provided) + * 2. model's embedded template (from GGUF metadata) + * 3. ChatML fallback (default) + * + * @param model Llama model (can be null, will use ChatML fallback) + * @param messages_json JSON array of messages: [{"role":"user","content":"..."},...] + * @param template_override Optional Jinja2 template string (default: empty, uses model template) + * @return Formatted prompt string ready for tokenization + * + * @note JSON parsing failures are caught internally; returns empty string on error (does not throw) + * + * @note Strips BOS/EOS wrapper tokens if model metadata indicates they're added during tokenization + * to prevent double-token issues + */ inline std::string format_chat_template_from_model(const llama_model *model, const std::string &messages_json, @@ -117,7 +181,25 @@ format_chat_template_from_model(const llama_model *model, } } -// Dynamic stop token detection +/** + * @brief Dynamically detect stop tokens from chat template + * + * Analyzes template string to identify template-specific stop tokens and verifies + * they exist in the model's vocabulary. Prevents generating invalid tokens that + * would cause tokenization failures. 
+ * + * Supported patterns: + * - ChatML: <|im_end|>, <|endoftext|> (when template contains "im_start") + * - Llama-3: <|eom_id|>, <|eot_id|> (when template contains "eom_id" or "eot_id") + * - Fallback: Model's EOT token from vocabulary + * + * @param model Llama model (can be null, returns empty vector) + * @param template_str Jinja2 template string to analyze + * @return Vector of stop token strings that exist in model vocabulary + * + * @note Only returns tokens that successfully tokenize to single token IDs. + * Prevents returning strings that would split into multiple tokens. + */ inline std::vector extract_template_stop_tokens(const llama_model *model, const std::string &template_str) { @@ -180,7 +262,22 @@ extract_template_stop_tokens(const llama_model *model, return stops; } -// Complete chat template processing +/** + * @brief Complete chat template processing with stop token detection + * + * Combines format_chat_template_from_model() and extract_template_stop_tokens() + * into a single call for convenience. Returns both formatted prompt and detected + * stop tokens. + * + * @param model Llama model (can be null, will use ChatML fallback) + * @param messages_json JSON array of messages: [{"role":"user","content":"..."},...] + * @param template_override Optional Jinja2 template string (default: empty, uses model template) + * @return ChatTemplateResult with formatted prompt and additional_stops vector + * + * @note Equivalent to calling format_chat_template_from_model() followed by + * extract_template_stop_tokens(), but more efficient as it only queries + * model metadata once. 
+ */ inline ChatTemplateResult format_chat_template_complete(const llama_model *model, const std::string &messages_json, @@ -230,7 +327,17 @@ format_chat_template_complete(const llama_model *model, return result; } -// Validate chat template syntax +/** + * @brief Validate chat template syntax + * + * Attempts to parse Jinja2 template string using minja engine to check for + * syntax errors before usage. + * + * @param template_str Jinja2 template string to validate + * @return True if template syntax is valid, false if parsing failed + * + * @note Uses empty BOS/EOS tokens for validation - only checks syntax, not semantics + */ inline bool validate_chat_template_helper(const std::string &template_str) { try { minja::chat_template tmpl(template_str, "", ""); @@ -242,7 +349,17 @@ inline bool validate_chat_template_helper(const std::string &template_str) { // ===== PARAMETER CONVERSION HELPERS ===== -// Get supported KV cache types +/** + * @brief Get list of supported KV cache types + * + * Returns static vector of ggml_type enums representing supported quantization + * formats for KV cache. Includes full-precision (F32, F16, BF16) and quantized + * formats (Q8_0, Q4_0, Q4_1, IQ4_NL, Q5_0, Q5_1). + * + * @return Reference to static vector of supported cache types + * + * @note Returns const reference to avoid allocation on each call + */ inline const std::vector &get_kv_cache_types() { static const std::vector types = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, @@ -252,7 +369,16 @@ inline const std::vector &get_kv_cache_types() { return types; } -// Convert cache type string to ggml_type enum +/** + * @brief Convert cache type string to ggml_type enum + * + * Maps type name string (e.g., "f16", "q4_0") to corresponding ggml_type enum. + * Used for parsing user-provided cache type configuration. 
+ * + * @param s Type name string (e.g., "f16", "q4_0", "q8_0") + * @return Matching ggml_type enum value + * @throws std::runtime_error if type name is not in supported types list + */ inline ggml_type kv_cache_type_from_str(const std::string &s) { const auto &kv_cache_types = get_kv_cache_types(); for (const auto &type : kv_cache_types) { @@ -263,16 +389,33 @@ inline ggml_type kv_cache_type_from_str(const std::string &s) { throw std::runtime_error("Unsupported cache type: " + s); } -// String validation helpers +/** + * @brief Check if string represents a truthy value + * + * @param value String to check + * @return True if value is "on", "enabled", "1", or "true" + */ inline bool is_truthy(const std::string &value) { return value == "on" || value == "enabled" || value == "1" || value == "true"; } +/** + * @brief Check if string represents a falsey value + * + * @param value String to check + * @return True if value is "off", "disabled", "0", or "false" + */ inline bool is_falsey(const std::string &value) { return value == "off" || value == "disabled" || value == "0" || value == "false"; } +/** + * @brief Check if string represents an auto value + * + * @param value String to check + * @return True if value is "auto" or "-1" + */ inline bool is_autoy(const std::string &value) { return value == "auto" || value == "-1"; } diff --git a/vendor/liblloyal/include/lloyal/json-schema-to-grammar.hpp b/vendor/liblloyal/include/lloyal/json-schema-to-grammar.hpp index 1414dc4..27be414 100644 --- a/vendor/liblloyal/include/lloyal/json-schema-to-grammar.hpp +++ b/vendor/liblloyal/include/lloyal/json-schema-to-grammar.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" #include "helpers.hpp" // For string_repeat, string_join, string_split #include @@ -78,7 +81,13 @@ struct BuiltinRule { std::vector deps; }; -// Primitive grammar rules +/** + * @var PRIMITIVE_RULES + * @brief Built-in grammar rules for 
JSON primitives + * + * Defines GBNF rules for basic JSON types: boolean, number, integer, string, array, + * object, null, uuid, and character escaping. Used as building blocks for schema conversion. + */ inline const std::unordered_map PRIMITIVE_RULES = { {"boolean", {"(\"true\" | \"false\") space", {}}}, {"decimal-part", {"[0-9]{1,16}", {}}}, @@ -109,7 +118,13 @@ inline const std::unordered_map PRIMITIVE_RULES = { {"null", {"\"null\" space", {}}}, }; -// String format rules (date, time, etc.) +/** + * @var STRING_FORMAT_RULES + * @brief Grammar rules for string format validation + * + * Defines GBNF rules for JSON Schema string formats: date, time, date-time, uri, email, uuid. + * Used when schema specifies "format" field for string validation. + */ inline const std::unordered_map STRING_FORMAT_RULES = {{"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | " @@ -126,7 +141,18 @@ inline const std::unordered_map STRING_FORMAT_RULES = {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}}; -// Reserved rule names +/** + * @brief Check if name conflicts with GBNF reserved keywords + * + * Tests whether a rule name would collide with built-in primitives ("root", "boolean", + * "number", "string", etc.) or format rules ("date", "time", "uuid", etc.). Used during + * schema conversion to append "-" suffix to conflicting names. + * + * @param name Rule name to check + * @return True if name is reserved, false otherwise + * + * @note Uses lazy-initialized static set for O(1) lookup after first call + */ inline bool is_reserved_name(const std::string &name) { static std::unordered_set RESERVED_NAMES; if (RESERVED_NAMES.empty()) { @@ -461,8 +487,38 @@ class SchemaConverter { _rules["space"] = SPACE_RULE; } + /** + * @brief Resolve $ref pointers in JSON schema + * + * Recursively resolves all $ref fields in schema, fetching remote schemas as needed. + * Replaces relative refs (#/definitions/...) 
with absolute URLs and populates internal + * _refs map with resolved schema objects. + * + * @param schema Schema object to resolve (modified in place) + * @param url Base URL for resolving relative references + * + * @note Handles both absolute (https://...) and relative (#/definitions/...) refs + * @note Errors accumulated in _errors vector for batch reporting + */ void resolve_refs(json &schema, const std::string &url); std::string _generate_constant_rule(const json &value); + + /** + * @brief Convert schema node to GBNF rule + * + * Main entry point for schema-to-grammar conversion. Dispatches to appropriate handler + * based on schema type (object, array, string, number, enum, etc.). Recursively processes + * nested schemas and generates corresponding GBNF rules. + * + * @param schema Schema node to convert (JSON object, may contain type, properties, items, etc.) + * @param name Rule name to generate (used as identifier in output grammar) + * @return Generated GBNF rule definition + * + * @note Accumulates errors in _errors vector - call check_errors() after conversion + * @note May throw std::runtime_error on unrecognized schema constructs + * + * @warning Complex method (~200+ lines) - handles all JSON Schema type keywords + */ std::string visit(const json &schema, const std::string &name); void check_errors(); std::string format_grammar(); diff --git a/vendor/liblloyal/include/lloyal/kv.hpp b/vendor/liblloyal/include/lloyal/kv.hpp index 543db0a..e56a851 100644 --- a/vendor/liblloyal/include/lloyal/kv.hpp +++ b/vendor/liblloyal/include/lloyal/kv.hpp @@ -1,32 +1,55 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + +/** + * @file kv.hpp + * @brief KV Cache Management + * + * Core primitives for KV cache operations in LLM applications: + * - Multi-sequence management: independent recurrent states per seq_id + * - Cache lifecycle: clear, remove, copy, keep operations + * - State persistence: save/load with fragmentation 
fallback + * - Cache reconstruction: clear_and_reseed for context compression strategies + * - File I/O: session save/resume for app lifecycle management + * + * These primitives compose into diverse inference patterns including: + * - Context window management (streaming, compression, eviction) + * - Session persistence (save/resume across app restarts) + * - Multi-sequence orchestration (parallel logical states) + * - Specialized search and sampling strategies + * + * Memory management for llama.cpp primitives: + * - llama_memory_* for cache lifecycle and multi-sequence ops + * - llama_state_* for serialization with fragmentation fallback + * - Adds null-safety, error handling, and defensive programming + */ + #include "common.hpp" #include "decoder.hpp" #include #include #include -/** - * KV Cache Anti-Corruption Layer (Header-Only) - * - * Purpose: Handles API name churn across llama.cpp versions. - * Pinned version: commit b6870 (llama_memory_seq_* API naming) - */ - namespace lloyal::kv { // ===== KV SEQUENCE OPERATIONS ===== /** - * Remove token range from KV cache sequence. + * @brief Remove token range from KV cache sequence + * + * Removes tokens in the range [p0, p1) from the specified sequence's KV cache. + * Used for selective eviction in context window management. * - * @param ctx llama context - * @param seq sequence ID (use 0 for single-sequence mode) - * @param p0 start position (inclusive) - * @param p1 end position (exclusive), use -1 for "to end" - * @return true if successful, false otherwise + * @param ctx Llama context (must not be null) + * @param seq Sequence ID (use 0 for single-sequence mode) + * @param p0 Start position (inclusive) + * @param p1 End position (exclusive), use -1 for "to end" + * @return true if successful, false if context is null or operation failed * - * CRITICAL: Call this BEFORE next llama_decode(), not after. + * @warning CRITICAL: Call this BEFORE next llama_decode(), not after. 
+ * Calling after decode may cause undefined behavior. */ inline bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0, llama_pos p1) { @@ -52,11 +75,14 @@ inline bool remove_range(llama_context *ctx, llama_seq_id seq, llama_pos p0, } /** - * Get maximum position in KV cache sequence. + * @brief Get maximum position in KV cache sequence * - * @param ctx llama context - * @param seq sequence ID - * @return maximum position (number of tokens - 1), or -1 if empty + * Returns the highest token position in the specified sequence's KV cache. + * For a sequence with N tokens, this returns N-1 (zero-indexed). + * + * @param ctx Llama context (must not be null) + * @param seq Sequence ID + * @return Maximum position (number of tokens - 1), or -1 if empty or context is null */ inline llama_pos pos_max(llama_context *ctx, llama_seq_id seq) { if (!ctx) { @@ -72,15 +98,18 @@ inline llama_pos pos_max(llama_context *ctx, llama_seq_id seq) { } /** - * Copy KV cache from one sequence to another (for branching/fork). + * @brief Copy KV cache from one sequence to another + * + * Copies KV cache state from source to destination sequence, enabling + * efficient branching without duplicating model weights. 
* - * @param ctx llama context - * @param src source sequence ID - * @param dst destination sequence ID - * @param p0 start position (inclusive), default 0 - * @param p1 end position (exclusive), default -1 (to end) + * @param ctx Llama context (must not be null) + * @param src Source sequence ID + * @param dst Destination sequence ID + * @param p0 Start position (inclusive), default 0 + * @param p1 End position (exclusive), default -1 for "to end" * - * Use case: System 2 tree search - fork from trunk without copying model weights + * @note Use case: Multi-sequence search (fork from trunk without copying model weights) */ inline void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst, llama_pos p0 = 0, llama_pos p1 = -1) { @@ -96,12 +125,15 @@ inline void seq_cp(llama_context *ctx, llama_seq_id src, llama_seq_id dst, } /** - * Keep only one sequence, removing all others. + * @brief Keep only one sequence, removing all others * - * @param ctx llama context - * @param seq sequence ID to keep + * Removes all sequences except the specified one from the KV cache. + * Efficient way to prune unused branches. * - * Use case: After tree search, prune all branches except winner + * @param ctx Llama context (must not be null) + * @param seq Sequence ID to keep + * + * @note Use case: After selection, prune all alternatives except chosen path */ inline void seq_keep(llama_context *ctx, llama_seq_id seq) { if (!ctx) { @@ -115,11 +147,20 @@ inline void seq_keep(llama_context *ctx, llama_seq_id seq) { LLOYAL_LOG_DEBUG("[kv::seq_keep] Kept only seq %d", seq); } -// ===== STATE SNAPSHOT OPERATIONS (with fragmentation fallback) ===== +// ===== STATE SNAPSHOT OPERATIONS ===== /** - * Get size needed to serialize sequence state. - * Automatically falls back to global state size if per-sequence fails. + * @brief Get size needed to serialize sequence state + * + * Returns buffer size required to save the sequence's KV cache state. 
+ * Automatically falls back to global state size if per-sequence query fails + * (may occur with fragmented caches). + * + * @param ctx Llama context (must not be null) + * @param seq Sequence ID + * @return Required buffer size in bytes, or 0 if empty/failed + * + * @note Fallback strategy: per-sequence β†’ global state (handles fragmentation) */ inline size_t state_size(llama_context *ctx, llama_seq_id seq) { if (!ctx) { @@ -162,8 +203,19 @@ inline size_t state_size(llama_context *ctx, llama_seq_id seq) { } /** - * Save sequence state to buffer. - * Automatically falls back to global state save if per-sequence fails. + * @brief Save sequence state to buffer + * + * Serializes the sequence's KV cache state into the provided buffer. + * Automatically falls back to global state save if per-sequence save fails + * (may occur with fragmented caches). + * + * @param ctx Llama context (must not be null) + * @param seq Sequence ID + * @param dst Destination buffer (must not be null) + * @param size Buffer size in bytes + * @return Bytes written, or 0 on failure + * + * @note Fallback strategy: per-sequence β†’ global state (handles fragmentation) */ inline size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst, size_t size) { @@ -211,8 +263,20 @@ inline size_t state_save(llama_context *ctx, llama_seq_id seq, uint8_t *dst, } /** - * Restore sequence state from buffer. - * Automatically falls back to global state restore if per-sequence fails. + * @brief Restore sequence state from buffer + * + * Deserializes KV cache state from buffer and restores it to the sequence. + * Automatically falls back to global state restore if per-sequence restore fails + * (may occur with fragmented caches). 
+ * + * @param ctx Llama context (must not be null) + * @param seq Sequence ID + * @param src Source buffer (must not be null) + * @param size Buffer size in bytes + * @return Bytes read, or 0 on failure + * + * @warning May crash on recurrent models if KV cache is empty during load + * @note Fallback strategy: per-sequence β†’ global state (handles fragmentation) */ inline size_t state_load(llama_context *ctx, llama_seq_id seq, const uint8_t *src, size_t size) { @@ -258,8 +322,17 @@ inline size_t state_load(llama_context *ctx, llama_seq_id seq, return read; } -// ===== GLOBAL STATE FALLBACKS (explicit) ===== +// ===== GLOBAL STATE OPERATIONS ===== +/** + * @brief Get size needed to serialize global state + * + * Returns buffer size required to save the entire context's state. + * Use when per-sequence serialization is not needed. + * + * @param ctx Llama context (must not be null) + * @return Required buffer size in bytes, or 0 if context is null + */ inline size_t global_state_size(llama_context *ctx) { if (!ctx) { LLOYAL_LOG_DEBUG("[kv::global_state_size] ERROR: null context"); @@ -272,6 +345,16 @@ inline size_t global_state_size(llama_context *ctx) { return size; } +/** + * @brief Save global state to buffer + * + * Serializes the entire context's state into the provided buffer. + * + * @param ctx Llama context (must not be null) + * @param dst Destination buffer (must not be null) + * @param size Buffer size in bytes + * @return Bytes written, or 0 on failure + */ inline size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size) { if (!ctx || !dst || size == 0) { LLOYAL_LOG_DEBUG("[kv::global_state_save] ERROR: invalid parameters"); @@ -284,6 +367,16 @@ inline size_t global_state_save(llama_context *ctx, uint8_t *dst, size_t size) { return written; } +/** + * @brief Restore global state from buffer + * + * Deserializes and restores the entire context's state from buffer. 
+ * + * @param ctx Llama context (must not be null) + * @param src Source buffer (must not be null) + * @param size Buffer size in bytes + * @return Bytes read, or 0 on failure + */ inline size_t global_state_load(llama_context *ctx, const uint8_t *src, size_t size) { if (!ctx || !src || size == 0) { @@ -299,6 +392,16 @@ inline size_t global_state_load(llama_context *ctx, const uint8_t *src, // ===== DIAGNOSTICS ===== +/** + * @brief Log KV cache build info and current state + * + * Outputs debug information about the KV cache configuration and current state. + * Useful for debugging and understanding cache behavior. + * + * @param ctx Llama context (can be null; limits output if null) + * + * @note Only produces output when DEBUG logging is enabled + */ inline void log_build_info(llama_context *ctx) { LLOYAL_LOG_DEBUG( "[kv::build_info] ============================================"); @@ -336,31 +439,23 @@ inline void log_build_info(llama_context *ctx) { "[kv::build_info] ============================================"); } -// ===== CACHE CLEARING (PHASE 3) ===== +// ===== CACHE CLEARING ===== /** - * Clear all KV cache (complete reset) - * - * Wrapper around llama_memory_clear() with: - * - Null checking - * - Error logging - * - Clears both metadata and data buffers + * @brief Clear all KV cache (complete reset) * - * @param ctx Llama context (must be initialized) - * @throws std::runtime_error if ctx is NULL + * Clears both metadata and data buffers for a complete cache reset. + * Use when starting a new conversation or session. 
* - * USAGE: - * lloyal::kv::clear_all(ctx); // Fresh start for new conversation + * @param ctx Llama context (must not be null) + * @throws std::runtime_error if ctx is null * - * IMPLEMENTATION NOTE: - * Uses llama_memory_clear(mem, true) which: - * - Clears metadata (cell positions, sequence heads) - * - Zeroes K/V tensor data buffers - * - Full reset for new conversation + * @note Uses llama_memory_clear(mem, true) which: + * - Clears metadata (cell positions, sequence heads) + * - Zeroes K/V tensor data buffers + * - Complete reset (slower than clear_metadata()) * - * Compare with clear_metadata(): - * - clear_metadata() clears only metadata (keeps allocations, faster) - * - clear_all() clears both metadata and data (complete reset) + * @see clear_metadata() for faster metadata-only clearing */ inline void clear_all(llama_context *ctx) { if (!ctx) { @@ -374,21 +469,18 @@ inline void clear_all(llama_context *ctx) { } /** - * Clear KV cache metadata only (fast reset) + * @brief Clear KV cache metadata only (fast reset) * * Clears logical structure but keeps buffer allocations. - * Faster than clear_all() for StreamingLLM pattern. + * Faster than clear_all() for compression patterns. 
* - * @param ctx Llama context (must be initialized) - * @throws std::runtime_error if ctx is NULL + * @param ctx Llama context (must not be null) + * @throws std::runtime_error if ctx is null * - * USAGE: - * lloyal::kv::clear_metadata(ctx); // Fast reset for reseed + * @note Performance: Faster than clear_all() (no buffer zeroing) + * Use when immediately re-decoding; buffer reuse reduces overhead * - * PERFORMANCE: - * - Faster than clear_all() (no buffer zeroing) - * - Use for StreamingLLM when immediately re-decoding - * - Buffer reuse reduces allocation overhead + * @see clear_all() for complete reset including data buffers */ inline void clear_metadata(llama_context *ctx) { if (!ctx) { @@ -401,64 +493,39 @@ inline void clear_metadata(llama_context *ctx) { LLOYAL_LOG_DEBUG("[kv::clear_metadata] KV cache metadata cleared"); } -// ===== STREAMINGLLM SUPPORT (PHASE 3) ===== +// ===== CONTEXT COMPRESSION ===== /** - * StreamingLLM state for managing original sinks across reseeds + * @brief Clear KV cache and reconstruct with anchor + tail tokens * - * StreamingLLM pattern requires ALWAYS reusing the ORIGINAL first 4 tokens - * from conversation start as "attention sinks". This struct helps track them. + * Reconstructs KV cache with contiguous positions by: + * 1. Clearing entire KV cache + * 2. Re-decoding original_sinks (anchor tokens) at position 0 + * 3. Re-decoding tail (recent tokens) at position sinks.size() * - * NOTE: This is provided for convenience. Callers can also track original - * sinks themselves and pass directly to clear_and_reseed(). - */ -struct StreamingLlmState { - std::vector original_sinks; // First N tokens from conversation start - size_t tail_size; // Number of recent tokens to keep (usually 252) -}; - -/** - * Clear KV cache and re-decode sinks + tail (StreamingLLM pattern) - * - * Implements the "CLEAR" strategy validated in integration tests: - * 1. Clear entire KV cache using llama_memory_clear() - * 2. 
Re-decode original_sinks (first N tokens) at position 0 - * 3. Re-decode tail (last M tokens) at position sinks.size() - * - * This is SIMPLER and MORE RELIABLE than selective removal (llama_memory_seq_rm) - * which has known bugs with position handling in some llama.cpp versions. - * - * ⚠️ CRITICAL: original_sinks MUST be the FIRST tokens from conversation start! - * - * StreamingLLM relies on attention sinks at fixed positions. Using different - * "first 4" tokens after each reseed will violate the learned positional bias - * and destroy perplexity preservation. - * - * CORRECT usage: - * // First time: Capture original sinks - * std::vector ORIGINAL_SINKS(conversation.begin(), conversation.begin() + 4); - * // Store ORIGINAL_SINKS for entire session - * - * // Each reseed: Reuse SAME original sinks - * auto tail = std::vector(conversation.end() - 252, conversation.end()); - * kv::clear_and_reseed(ctx, ORIGINAL_SINKS, tail, n_batch); + * This maintains contiguous positions [0,1,2,...] which is simpler and more + * reliable than selective removal with position gaps. * - * WRONG usage: - * auto current_window = get_current_tokens(); - * auto sinks = std::vector(current_window.begin(), current_window.begin() + 4); - * kv::clear_and_reseed(ctx, sinks, tail, n_batch); // ❌ NOT original! Will degrade! 
- * - * @param ctx Llama context (must be initialized) - * @param original_sinks MUST be first N tokens from conversation start (typically 4) - * @param tail Recent M tokens to preserve (typically 252, total 256 with sinks) + * @param ctx Llama context (must not be null) + * @param original_sinks Anchor tokens from sequence start (typically 4) + * @param tail Recent tokens to preserve (typically 252, total 256 with sinks) * @param n_batch Batch size for re-decoding chunks * @throws std::runtime_error if parameters are invalid or re-decode fails * - * Empirical validation: Preserves perplexity within 10% (StreamingLLM paper: 3.7%) - * See tests/integration/clear_and_reseed_validation.cpp for full validation. + * @warning CRITICAL: original_sinks MUST be the ORIGINAL first N tokens from + * sequence start. Reusing different "first N" tokens on each reseed + * will degrade quality for attention-sink patterns. + * + * @note After calling, KV cache position = sinks.size() + tail.size() + * Continue generation with n_past = static_cast<int32_t>(sinks.size() + tail.size()) + * + * @example + * // Capture original anchor tokens once + * std::vector<llama_token> SINKS(tokens.begin(), tokens.begin() + 4); * - * IMPORTANT: After calling, KV cache position = sinks.size() + tail.size() - * Continue generation with n_past = static_cast(sinks.size() + tail.size()) + * // Each compression: reuse SAME anchors with current tail + * auto tail = std::vector<llama_token>(tokens.end() - 252, tokens.end()); + * kv::clear_and_reseed(ctx, SINKS, tail, n_batch); */ inline void clear_and_reseed(llama_context *ctx, const std::vector &original_sinks, @@ -522,35 +589,38 @@ LLOYAL_LOG_DEBUG("[kv::clear_and_reseed] Reseed complete"); } -// ===== FILE PERSISTENCE OPERATIONS ===== +// ===== FILE PERSISTENCE ===== /** - * FileData structure returned by read_file - * Contains tokens and metadata from file + * @brief Data structure returned by read_file + * + * Contains tokens and
metadata restored from KV cache file. */ struct FileData { - std::vector tokens; // Tokens restored from file - size_t bytes_read; // Total bytes read from file + std::vector<llama_token> tokens; ///< Tokens restored from file + size_t bytes_read; ///< Total bytes read from file }; /** - * Write KV state to file with self-describing format + * @brief Write KV state to file with self-describing format * - * File format (llama.cpp standard): + * Serializes KV cache state to file using llama.cpp's standard format: * - Magic + Version (validation) * - Token count + Token array * - KV state data (cache + logits + embeddings) * - * @param ctx llama context - * @param seq sequence ID (use 0 for single-sequence mode) - * @param filepath Destination file path + * @param ctx Llama context (must not be null) + * @param seq Sequence ID (use 0 for single-sequence mode) + * @param filepath Destination file path (must not be empty) * @param tokens Token IDs to include in file - * @return bytes written, or 0 on failure + * @return Bytes written, or 0 on failure + * + * @note Use cases: + * - Exit/resume: Save app state across restarts + * - Persistent sessions: Multiple save points per conversation + * - Context sharing: Serialize → upload → share * - * Use cases: - * - Exit/resume app: kv::write_file(ctx, 0, "app_state.llama", tokens) - * - Persistent pages: kv::write_file(ctx, 0, "fork_42.llama", fork_tokens) - * - Context sharing: Write β†’ upload to S3 β†’ share signed URL + * @warning Skips write if KV cache is empty (returns 0) */ inline size_t write_file(llama_context *ctx, llama_seq_id seq, const std::string &filepath, @@ -592,23 +662,24 @@ inline size_t write_file(llama_context *ctx, llama_seq_id seq, } /** - * Read KV state from file + * @brief Read KV state from file * - * Validates magic + version automatically. - * Returns structured data (no output parameters). + * Deserializes KV cache state from file and restores it to the sequence.
+ * Validates magic + version automatically. Returns structured data with + * restored tokens and metadata. * - * @param ctx llama context - * @param seq sequence ID (use 0 for single-sequence mode) - * @param filepath Source file path + * @param ctx Llama context (must not be null) + * @param seq Sequence ID (use 0 for single-sequence mode) + * @param filepath Source file path (must not be empty) * @return FileData with tokens and bytes_read * @throws std::runtime_error if validation fails or file doesn't exist * - * Example usage: - * ```cpp - * auto data = lloyal::kv::read_file(ctx, 0, "app_state.llama"); - * // Use data.tokens for reconstruction/validation - * // KV cache is automatically restored - * ``` + * @note KV cache is automatically restored during load + * Use data.tokens for reconstruction/validation + * + * @example + * auto data = lloyal::kv::read_file(ctx, 0, "app_state.llama"); + * // KV cache restored, tokens available in data.tokens */ inline FileData read_file(llama_context *ctx, llama_seq_id seq, const std::string &filepath) { diff --git a/vendor/liblloyal/include/lloyal/logits.hpp b/vendor/liblloyal/include/lloyal/logits.hpp index 51f639d..30604da 100644 --- a/vendor/liblloyal/include/lloyal/logits.hpp +++ b/vendor/liblloyal/include/lloyal/logits.hpp @@ -1,4 +1,8 @@ #pragma once + +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + /** * @file logits.hpp * @brief Zero-copy logits access with clear lifetime semantics diff --git a/vendor/liblloyal/include/lloyal/metrics.hpp b/vendor/liblloyal/include/lloyal/metrics.hpp index e3e310e..b432d7c 100644 --- a/vendor/liblloyal/include/lloyal/metrics.hpp +++ b/vendor/liblloyal/include/lloyal/metrics.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + /** * @file metrics.hpp * @brief Distribution Metrics for Test-Time Alignment diff --git a/vendor/liblloyal/include/lloyal/model_registry.hpp 
b/vendor/liblloyal/include/lloyal/model_registry.hpp index 52efbc4..3155305 100644 --- a/vendor/liblloyal/include/lloyal/model_registry.hpp +++ b/vendor/liblloyal/include/lloyal/model_registry.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" #include #include @@ -9,14 +12,14 @@ #include /** - * Model Registry (Header-Only) + * @file model_registry.hpp + * @brief Thread-Safe Model Cache * - * Purpose: Thread-safe weak-pointer cache to avoid reloading same model - * multiple times. Uses inline static members (C++17) to enable header-only - * class with static state. + * Provides weak-pointer cache to avoid reloading same model multiple times. + * Uses inline static members (C++17) for header-only implementation with static state. * - * Key: (canonPath, n_gpu_layers, use_mmap) - * Value: weak_ptr to llama_model (auto-cleanup when last context releases) + * Cache key: (canonPath, n_gpu_layers, use_mmap) + * Cache value: weak_ptr to llama_model (auto-cleanup when last context releases) * * Thread-safe via std::mutex for all cache operations. */ @@ -24,13 +27,15 @@ namespace lloyal { /** - * Model cache key combining file path and GPU configuration - * SOURCE: ModelRegistry.h:22-32 + * @brief Model cache key combining file path and GPU configuration + * + * Used as key in ModelRegistry cache to uniquely identify model instances. + * Different GPU configurations of the same model are cached separately. 
*/ struct ModelKey { - std::string canonPath; // Normalized file path (file:// prefix removed) - int n_gpu_layers; // Number of layers offloaded to GPU (-1 = all) - bool use_mmap; // Whether to use memory mapping + std::string canonPath; ///< Normalized file path (file:// prefix removed) + int n_gpu_layers; ///< Number of layers offloaded to GPU (-1 = all) + bool use_mmap; ///< Whether to use memory mapping bool operator==(const ModelKey &o) const { return n_gpu_layers == o.n_gpu_layers && use_mmap == o.use_mmap && @@ -39,10 +44,20 @@ struct ModelKey { }; /** - * Hash function for ModelKey - * SOURCE: ModelRegistry.h:38-46 + * @brief Hash functor for ModelKey + * + * Computes combined hash of path, GPU layers, and mmap flag for use in + * std::unordered_map. Uses XOR with golden ratio constant for good distribution. */ struct ModelKeyHash { + /** + * @brief Compute hash for ModelKey + * + * Combines path hash with GPU/mmap configuration using XOR and golden ratio. + * + * @param k Key to hash + * @return Combined hash value + */ size_t operator()(const ModelKey &k) const { std::hash Hs; std::hash Hi; @@ -54,7 +69,6 @@ struct ModelKeyHash { /** * Thread-safe registry for sharing llama_model instances - * SOURCE: ModelRegistry.h:72-120 * * IMPORTANT: This is a CLASS with static members, not a namespace. * Converting to header-only requires inline static members (C++17). @@ -63,7 +77,6 @@ class ModelRegistry { public: /** * Acquire a model from cache or load if not present - * SOURCE: ModelRegistry.h:93-96 * * @param fsPath Filesystem path to model file (file:// prefix normalized) * @param params Model load parameters (GPU layers, mmap, etc.) 
@@ -75,21 +88,25 @@ class ModelRegistry { private: /** * Global cache mutex - inline static for header-only - * SOURCE: ModelRegistry.h:103 */ inline static std::mutex mu_; /** * Model cache - inline static for header-only - * SOURCE: ModelRegistry.h:113 */ inline static std::unordered_map, ModelKeyHash> cache_; /** - * Create cache key from path and parameters (private helper) - * SOURCE: ModelRegistry.h:119 + * @brief Create normalized cache key from path and parameters + * + * Normalizes filesystem path by removing file:// prefix to ensure + * "file:///path" and "/path" map to the same cache entry. + * + * @param fsPath Filesystem path (may include file:// prefix) + * @param params Model parameters for GPU/mmap configuration + * @return Normalized ModelKey */ static ModelKey makeKey(const std::string &fsPath, const llama_model_params ¶ms); diff --git a/vendor/liblloyal/include/lloyal/sampler.hpp b/vendor/liblloyal/include/lloyal/sampler.hpp index b00e76f..182b7c6 100644 --- a/vendor/liblloyal/include/lloyal/sampler.hpp +++ b/vendor/liblloyal/include/lloyal/sampler.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" #include "logits.hpp" #include "tokenizer.hpp" @@ -12,14 +15,16 @@ #include /** - * Sampler Anti-Corruption Layer (Header-Only) + * @file sampler.hpp + * @brief Token Sampling Operations * - * Purpose: Single point of contact with llama.cpp sampling APIs to isolate - * sampling strategy complexity and enable future extensions. + * Wraps llama.cpp sampling APIs with configurable sampling strategies. + * Uses C++20 concepts for generic parameter handling. * - * Uses C++20 concept-constrained templates to accept any shell's - * Nitrogen-generated SamplingParams type without requiring struct duplication - * or adapters. 
+ * Architecture: + * - Concept-constrained templates accept any Nitrogen-generated SamplingParams type + * - No struct duplication or adapters required + * - Supports greedy, temperature, top-k, top-p, min-p, grammar-constrained sampling */ namespace lloyal::detail { @@ -59,10 +64,8 @@ namespace lloyal { /** * C++20 concept: Any type with sampling parameter fields * - * Allows template to accept any shell's Nitrogen-generated SamplingParams: - * - margelo::nitro::calibratendk::SamplingParams - * - margelo::nitro::nitrollama::SamplingParams - * - Or any other conforming type + * Allows template to accept any binding's generated SamplingParams type + * without coupling to specific implementations. * * Fields can be either T or std::optional */ @@ -169,10 +172,8 @@ inline llama_token greedy(llama_context *ctx, const llama_vocab *vocab) { * @throws std::runtime_error if sampling fails * * TEMPLATE INSTANTIATION: - * - calibrate-ndk: instantiates for - * margelo::nitro::calibratendk::SamplingParams - * - nitro-llama: instantiates for margelo::nitro::nitrollama::SamplingParams - * - No adapters needed, works via duck typing + concept constraint + * Works with any SamplingParams type matching the concept constraint. + * No adapters needed - uses duck typing + C++20 concepts. 
*/ template inline llama_token sample_with_params(llama_context *ctx, diff --git a/vendor/liblloyal/include/lloyal/tokenizer.hpp b/vendor/liblloyal/include/lloyal/tokenizer.hpp index 69017c3..442eb27 100644 --- a/vendor/liblloyal/include/lloyal/tokenizer.hpp +++ b/vendor/liblloyal/include/lloyal/tokenizer.hpp @@ -1,5 +1,8 @@ #pragma once +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Lloyal Labs + #include "common.hpp" #include #include @@ -7,13 +10,16 @@ #include /** - * Tokenizer Anti-Corruption Layer (Header-Only) + * @file tokenizer.hpp + * @brief Text Tokenization Operations * - * Purpose: Single point of contact with llama.cpp tokenization APIs to isolate - * version churn, special token handling complexity, and buffer sizing edge - * cases. + * Wraps llama.cpp tokenization APIs with safe buffer management and special token handling. + * Uses two-pass algorithms for reliable buffer sizing. * - * Uses two-pass algorithms for safe buffer sizing. + * Architecture: + * - Two-pass tokenization: measure size, then allocate and populate + * - Special token handling: BOS/EOS/parsing configuration + * - Model-accepting overloads for convenience */ namespace lloyal::tokenizer { diff --git a/vendor/llama.cpp/README.md b/vendor/llama.cpp/README.md index fff5496..41f3680 100644 --- a/vendor/llama.cpp/README.md +++ b/vendor/llama.cpp/README.md @@ -4,7 +4,7 @@ This directory contains vendored sources from the llama.cpp project. **Source:** llama.cpp/ git submodule **Commit:** 338074c383c81366320d176d83b94b0a567ee0c2 -**Vendored:** 2026-01-18T13:51:23.321Z +**Vendored:** 2026-01-23T04:38:31.667Z **DO NOT EDIT:** Files in this directory are copied from git submodules. 
To update, run: npm run update-vendors From 361e4f1c0a772fab95de5fe76513db0264ba2b1f Mon Sep 17 00:00:00 2001 From: LLoyal Research Date: Fri, 23 Jan 2026 17:39:36 +1100 Subject: [PATCH 6/9] feat(dist+docs): full matrix implementation, README update --- .github/actions/provision-cuda/action.yaml | 94 ++ .github/workflows/release.yml | 137 +-- LICENSE | 201 ++++ README.md | 704 ++++++-------- cmake/arm64-cross.cmake | 21 + docs/distribution.md | 1013 ++++---------------- package.json | 5 +- 7 files changed, 891 insertions(+), 1284 deletions(-) create mode 100644 .github/actions/provision-cuda/action.yaml create mode 100644 LICENSE create mode 100644 cmake/arm64-cross.cmake diff --git a/.github/actions/provision-cuda/action.yaml b/.github/actions/provision-cuda/action.yaml new file mode 100644 index 0000000..2e16b92 --- /dev/null +++ b/.github/actions/provision-cuda/action.yaml @@ -0,0 +1,94 @@ +name: Provision CUDA Toolkit + +description: Install CUDA toolkit for lloyal.node builds across all platforms + +inputs: + version: + description: "CUDA toolkit version" + required: false + default: "12.6.2" + arch: + description: "Target architecture (x64 or arm64)" + required: true + +outputs: + cuda-path: + description: "CUDA installation path" + value: ${{ steps.set-cuda-path.outputs.cuda-path }} + +runs: + using: "composite" + steps: + # Windows: Install via Chocolatey + - name: Install CUDA (Windows) + if: runner.os == 'Windows' + shell: pwsh + env: + VERSION: ${{ inputs.version }} + run: | + $version = $env:VERSION + $version_major_minor = $version.Split('.')[0..1] -join '.' + $version_slug = $version_major_minor.Replace('.', '_') + $cuda_path = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version_major_minor}" + + Write-Host "Installing CUDA ${version} via Chocolatey..." 
+ choco install cuda --version=${version} -y --no-progress + + # Set environment variables + Add-Content -Path $env:GITHUB_ENV -Value "CUDA_PATH=${cuda_path}" + Add-Content -Path $env:GITHUB_ENV -Value "CUDA_PATH_V${version_slug}=${cuda_path}" + Add-Content -Path $env:GITHUB_PATH -Value "${cuda_path}\bin" + Add-Content -Path $env:GITHUB_PATH -Value "${cuda_path}\libnvvp" + + Write-Host "CUDA installed at: ${cuda_path}" + + # Linux x64: Install from NVIDIA repos + - name: Install CUDA (Linux x64) + if: runner.os == 'Linux' && inputs.arch == 'x64' + shell: bash + env: + VERSION: ${{ inputs.version }} + run: | + version_major_minor=$(echo $VERSION | cut -d. -f1,2) + version_slug=$(echo $version_major_minor | tr '.' '-') + + echo "Installing CUDA ${version_major_minor} for x86_64..." + wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update -qq + sudo apt-get install -y -qq cuda-toolkit-${version_slug} build-essential cmake + + cuda_path="/usr/local/cuda-${version_major_minor}" + echo "CUDA_PATH=${cuda_path}" >> $GITHUB_ENV + echo "${cuda_path}/bin" >> $GITHUB_PATH + + echo "CUDA installed at: ${cuda_path}" + + # Linux ARM64: Install from NVIDIA repos + - name: Install CUDA (Linux ARM64) + if: runner.os == 'Linux' && inputs.arch == 'arm64' + shell: bash + env: + VERSION: ${{ inputs.version }} + run: | + version_major_minor=$(echo $VERSION | cut -d. -f1,2) + version_slug=$(echo $version_major_minor | tr '.' '-') + + echo "Installing CUDA ${version_major_minor} for arm64..." 
+ wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update -qq + sudo apt-get install -y -qq cuda-toolkit-${version_slug} build-essential cmake + + cuda_path="/usr/local/cuda-${version_major_minor}" + echo "CUDA_PATH=${cuda_path}" >> $GITHUB_ENV + echo "${cuda_path}/bin" >> $GITHUB_PATH + + echo "CUDA installed at: ${cuda_path}" + + # Set output + - name: Set CUDA path output + id: set-cuda-path + shell: bash + run: | + echo "cuda-path=${CUDA_PATH}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6e641be..628680e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -37,7 +37,7 @@ jobs: gpu: vulkan package: linux-x64-vulkan - # Windows + # Windows x64 - os: windows-2022 arch: x64 gpu: cpu @@ -45,25 +45,37 @@ jobs: - os: windows-2022 arch: x64 gpu: cuda - cuda_version: 12.2.0 package: win32-x64-cuda - os: windows-2022 arch: x64 gpu: vulkan package: win32-x64-vulkan - # Linux ARM64 (new for v1.0) - - os: ubuntu-22.04 + # Windows ARM64 (cross-compiled from x64) + - os: windows-2022 + arch: arm64 + gpu: cpu + package: win32-arm64 + cross_compile: true + - os: windows-2022 + arch: arm64 + gpu: vulkan + package: win32-arm64-vulkan + cross_compile: true + + # Linux ARM64 (native runners) + - os: ubuntu-22.04-arm arch: arm64 gpu: cpu package: linux-arm64 - docker_platform: linux/arm64 - - os: ubuntu-22.04 + - os: ubuntu-22.04-arm arch: arm64 gpu: cuda package: linux-arm64-cuda - docker_image: nvcr.io/nvidia/l4t-cuda:12.6-devel - docker_platform: linux/arm64 + - os: ubuntu-22.04-arm + arch: arm64 + gpu: vulkan + package: linux-arm64-vulkan steps: - name: Checkout code @@ -78,93 +90,86 @@ jobs: registry-url: 'https://registry.npmjs.org' # Platform-specific dependencies - - name: Install build tools (Linux) - if: runner.os == 'Linux' && matrix.gpu == 'cpu' + - name: Install build 
tools (Linux x64) + if: runner.os == 'Linux' && matrix.arch == 'x64' && matrix.gpu == 'cpu' run: | sudo apt-get update sudo apt-get install -y build-essential cmake - - name: Install CUDA toolkit (Linux) - if: matrix.gpu == 'cuda' && runner.os == 'Linux' + - name: Install build tools (Linux ARM64) + if: runner.os == 'Linux' && matrix.arch == 'arm64' run: | - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb - sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update - sudo apt-get install -y cuda-toolkit-12-2 build-essential cmake + sudo apt-get install -y build-essential cmake + + - name: Provision CUDA toolkit + if: matrix.gpu == 'cuda' && runner.os == 'Linux' + uses: ./.github/actions/provision-cuda + with: + version: '12.6.2' + arch: ${{ matrix.arch }} - name: Install Vulkan SDK (Linux) if: matrix.gpu == 'vulkan' && runner.os == 'Linux' - run: | - sudo apt-get update - sudo apt-get install -y build-essential cmake libvulkan-dev vulkan-tools + uses: jakoch/install-vulkan-sdk-action@v1.2.4 + with: + vulkan_version: '1.4.313.0' + install_runtime: true + optional_components: com.lunarg.vulkan.arm64 + cache: true + stripdown: true - - name: Install CUDA toolkit (Windows) + - name: Provision CUDA toolkit if: matrix.gpu == 'cuda' && runner.os == 'Windows' - uses: Jimver/cuda-toolkit@v0.2.11 + uses: ./.github/actions/provision-cuda with: - cuda: '12.2.0' + version: '12.6.2' + arch: ${{ matrix.arch }} - name: Install Vulkan SDK (Windows) if: matrix.gpu == 'vulkan' && runner.os == 'Windows' + uses: jakoch/install-vulkan-sdk-action@v1.2.4 + with: + vulkan_version: '1.4.313.0' + install_runtime: true + cache: true + stripdown: true + + - name: Setup LLVM and Ninja for Windows ARM64 cross-compilation + if: runner.os == 'Windows' && matrix.cross_compile == true shell: pwsh run: | - $url = "https://sdk.lunarg.com/sdk/download/1.3.275.0/windows/VulkanSDK-1.3.275.0-Installer.exe" - Invoke-WebRequest -Uri $url -OutFile 
VulkanSDK.exe - Start-Process -FilePath .\VulkanSDK.exe -ArgumentList '/S' -Wait - echo "VULKAN_SDK=C:\VulkanSDK\1.3.275.0" | Out-File -FilePath $env:GITHUB_ENV -Append - - - name: Setup QEMU for ARM64 - if: matrix.arch == 'arm64' && runner.os == 'Linux' - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 + # Install LLVM for cross-compilation + choco install llvm ninja -y + + # Set environment for clang cross-compilation + echo "CC=clang-cl" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "CXX=clang-cl" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "CMAKE_GENERATOR=Ninja" | Out-File -FilePath $env:GITHUB_ENV -Append # Build - name: Install npm dependencies - if: matrix.arch != 'arm64' || runner.os != 'Linux' run: npm install - - name: Build native module (x64 or native ARM64) - if: matrix.arch != 'arm64' || runner.os != 'Linux' + - name: Build native module (Native builds) + if: matrix.cross_compile != true run: npm run build env: LLOYAL_GPU: ${{ matrix.gpu }} - - name: Build native module (ARM64 via Docker) - if: matrix.arch == 'arm64' && runner.os == 'Linux' - shell: bash + - name: Build native module (Windows ARM64 cross-compile) + if: runner.os == 'Windows' && matrix.cross_compile == true + shell: pwsh run: | - # Determine Docker image - if [ -n "${{ matrix.docker_image }}" ]; then - IMAGE="${{ matrix.docker_image }}" - else - IMAGE="arm64v8/ubuntu:22.04" - fi - - # Build inside ARM64 container - docker run --rm --platform ${{ matrix.docker_platform }} \ - -v $PWD:/workspace -w /workspace \ - -e LLOYAL_GPU=${{ matrix.gpu }} \ - $IMAGE bash -c " - # Install build dependencies - apt-get update - apt-get install -y build-essential cmake git curl - - # Install Node.js 20 - curl -fsSL https://deb.nodesource.com/setup_20.x | bash - - apt-get install -y nodejs - - # Install CUDA toolkit if needed - if [ '${{ matrix.gpu }}' = 'cuda' ]; then - apt-get install -y cuda-toolkit-12-6 || true - fi - - # Build - npm install - npm run build - " + # 
Set up cross-compilation environment + $env:CMAKE_GENERATOR = "Ninja" + $env:CMAKE_TOOLCHAIN_FILE = "${{ github.workspace }}/cmake/arm64-cross.cmake" + + # Build with cross-compilation + npm run build env: LLOYAL_GPU: ${{ matrix.gpu }} + ARCH: arm64 # Package - name: Create platform package diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3e8eaac --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2025 lloyal.ai + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 045bc61..2a49988 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,10 @@ # lloyal.node -Thin N-API wrapper over [liblloyal](https://github.com/lloyal-ai/liblloyal) for Node.js - raw llama.cpp inference primitives. +Node.js bindings for [liblloyal](https://github.com/lloyal-ai/liblloyal)β€”the inference kernel that orchestrates llama.cpp in-process for agentic inference patterns. -## Features +**Today:** Core liblloyal primitives and Test Time Alignment via TypeScript sampling. 
-- **Prebuilt Binaries**: Install in <1 minute on 7 common platforms (macOS, Linux, Windows) -- **Raw & Thin**: Direct access to llama.cpp primitives via liblloyal -- **Zero-Copy Logits**: `getLogits()` returns Float32Array pointing to llama.cpp memory -- **Embeddings**: Extract L2-normalized embeddings with configurable pooling (MEAN, CLS, LAST) -- **GPU Acceleration**: Metal (macOS), CUDA, and Vulkan support with dedicated prebuilts -- **BYO llama.cpp**: Swap `libllama.dylib` for custom builds (dynamic linking) -- **Native Reference**: Includes native entropy/greedy implementations for testing -- **TypeScript**: Full type definitions included - -## Use Cases - -A minimal Node.js binding for llama.cpp inference, suitable for: -- **Testing & Validation**: Compare TypeScript implementations against native references -- **Serverless Deployments**: Lightweight footprint for edge compute and Lambda-style functions -- **Automation & CI**: Build deterministic test suites for LLM-powered workflows -- **Research & Prototyping**: Direct access to llama.cpp primitives without framework overhead +**Coming (vNext):** Atomic state forking, KV-LRU (leasing), SMMA (Single Model Multi-Agent) orchestrationβ€”bringing liblloyal's Branch and Lease to TypeScript. ## Installation @@ -27,479 +12,424 @@ A minimal Node.js binding for llama.cpp inference, suitable for: npm install lloyal.node ``` -### Prebuilt Binaries (Recommended) - -lloyal.node ships with **prebuilt binaries** for common platforms. 
Installation takes **<1 minute**: - -| Platform | Architecture | GPU | Package | Install Time | -|----------|--------------|-----|---------|--------------| -| **macOS** | Apple Silicon (arm64) | Metal | `@lloyal/lloyal.node-darwin-arm64` | <1 min ⚑ | -| **macOS** | Intel (x64) | CPU | `@lloyal/lloyal.node-darwin-x64` | <1 min ⚑ | -| **Linux** | x64 | CPU | `@lloyal/lloyal.node-linux-x64` | <1 min ⚑ | -| **Linux** | x64 | CUDA | `@lloyal/lloyal.node-linux-x64-cuda` | <1 min ⚑ | -| **Linux** | x64 | Vulkan | `@lloyal/lloyal.node-linux-x64-vulkan` | <1 min ⚑ | -| **Windows** | x64 | CPU | `@lloyal/lloyal.node-win32-x64` | <1 min ⚑ | -| **Windows** | x64 | CUDA | `@lloyal/lloyal.node-win32-x64-cuda` | <1 min ⚑ | - -**How it works:** -- npm automatically downloads the correct prebuilt for your platform -- Platform packages are listed as `optionalDependencies` -- Falls back to building from source if your platform isn't covered - -### Building from Source (Fallback) - -If no prebuilt is available for your platform, lloyal.node builds from **vendored sources** (5-15 minutes): +Prebuilt binaries for 13 platforms: -**Prerequisites:** -- Node.js β‰₯18 -- C++20 compiler (GCC, Clang, or MSVC) -- CMake β‰₯3.14 -- node-gyp build tools +| Platform | Arch | Acceleration | +| -------- | ----- | ------------------- | +| macOS | arm64 | Metal | +| macOS | x64 | CPU | +| Linux | x64 | CPU / CUDA / Vulkan | +| Linux | arm64 | CPU / CUDA / Vulkan | +| Windows | x64 | CPU / CUDA / Vulkan | +| Windows | arm64 | CPU / Vulkan | -**Supported platforms:** -- Any platform with a C++20 compiler and CMake -- GPU backends require additional dependencies (see GPU Acceleration section) +Falls back to source build if your platform isn't covered. 
-## Using in Your Project - -Simply add lloyal.node as a dependency: - -```json -{ - "dependencies": { - "lloyal.node": "^0.1.0" - } -} +```bash +LLOYAL_GPU=cuda npm install # NVIDIA +LLOYAL_GPU=vulkan npm install # AMD/Intel +LLOYAL_GPU=cpu npm install # Force CPU ``` -Then import and use: +See [DISTRIBUTION.md](./docs/DISTRIBUTION.md) for package details. -```javascript -const { createContext } = require('lloyal.node'); +## Quick Start -const ctx = await createContext({ - modelPath: './model.gguf' -}); -``` +Complete example with greedy sampling: -**That's it!** npm handles downloading prebuilts or building from source automatically. +```typescript +import { createContext } from 'lloyal.node'; -## Development & Contributing +async function generate(prompt: string, maxTokens = 100): Promise { + const ctx = await createContext({ + modelPath: './model.gguf', + nCtx: 2048, + nThreads: 4, + }); -**Clone the repository:** + try { + const tokens = await ctx.tokenize(prompt); + await ctx.decode(tokens, 0); -```bash -# Clone with submodules -git clone --recursive https://github.com/lloyal-ai/lloyal.node.git -cd lloyal.node + const output: number[] = []; + let pos = tokens.length; -# Build from source -npm install -npm run build -``` + for (let i = 0; i < maxTokens; i++) { + const token = ctx.greedySample(); + if (token < 0) break; // EOS -**Build process:** -- **Linux**: Builds llama.cpp as a single shared library (`.so`) with `-DCMAKE_POSITION_INDEPENDENT_CODE=ON` -- **macOS**: Creates universal binary (arm64+x86_64) `libllama.dylib` with Metal/Accelerate support -- **Windows**: Builds DLLs for llama.cpp + ggml + output.push(token); + await ctx.decode([token], pos++); + } -**Why single combined library?** Dynamic linking to `libllama.so`/`.dylib` enables the "bring your own llama.cpp" pattern while avoiding ODR violations. 
+ return ctx.detokenize(output); + } finally { + ctx.dispose(); + } +} -**Active development workflow:** -```bash -git submodule update --remote # Update submodules -npm run clean # Clean build artifacts -npm run build # Rebuild +const response = await generate('The capital of France is'); +console.log(response); ``` -### GPU Acceleration +## Test-Time Alignment -By default, lloyal.node auto-detects the best backend for your platform: +TTA is the fusion of application state with sampling strategy at every token step. Instead of generating text and validating after, you enforce constraints _during_ generation. -| Platform | Default Backend | GPU Support | -|----------|----------------|-------------| -| **macOS (local)** | Metal | βœ… GPU acceleration | -| **macOS (CI)** | CPU | ⚠️ No GPU (virtualized) | -| **Linux** | CPU | Manual via `LLOYAL_GPU` | -| **Windows** | CPU | Manual via `LLOYAL_GPU` | +This requires two things: -**Override with `LLOYAL_GPU` environment variable:** +1. **Raw logits** β€” the probability distribution over all possible next tokens +2. **TypeScript sampling** β€” so your app logic can modify probabilities before selection -```bash -# Force CPU-only build (disables all GPU backends) -LLOYAL_GPU=cpu npm install +lloyal.node provides the logits. 
[tsampler](https://github.com/lloyal-ai/tsampler) provides the sampling: -# Enable CUDA (Linux/Windows with NVIDIA GPU) -LLOYAL_GPU=cuda npm install +```typescript +import { createContext } from 'lloyal.node'; +import { + sampleWithStrategy, + computeModelEntropy, + TokenHistoryTracker, + SamplerWorkspace, + Xoroshiro128Plus, +} from '@lloyal/tsampler'; -# Enable Vulkan (Linux/Windows) -LLOYAL_GPU=vulkan npm install +const ctx = await createContext({ modelPath: './model.gguf' }); +const prng = new Xoroshiro128Plus(42); // Deterministic PRNG +const tokenHistory = new TokenHistoryTracker(64); // For repetition penalties +const workspace = new SamplerWorkspace(256); // Pre-allocated, zero-alloc hot path -# Enable Metal (macOS only, default on local builds) -LLOYAL_GPU=metal npm install -``` +const tokens = await ctx.tokenize(prompt); +await ctx.decode(tokens, 0); -**Requirements by Backend:** +let pos = tokens.length; +const output: number[] = []; -- **CPU**: No additional dependencies (works everywhere) -- **Metal**: macOS only (built-in, requires physical GPU) -- **CUDA**: NVIDIA GPU + [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) installed -- **Vulkan**: [Vulkan SDK](https://vulkan.lunarg.com/) installed +while (output.length < maxTokens) { + const logits = ctx.getLogits(); -**⚠️ Runtime Dependencies & Dynamic Linking:** + // === YOUR STEERING LOGIC HERE === -lloyal.node uses **dynamic linking** to a bundled `libllama.so`/`libllama.dylib`: + // Enforce domain rules + if (currency === 'JPY') { + logits[DECIMAL_TOKEN] = -Infinity; // JPY has no decimal subdivision + } -``` -node_modules/lloyal.node/build/Release/ -β”œβ”€β”€ lloyal.node # N-API wrapper (links to libllama via @rpath) -└── libllama.dylib # llama.cpp + ggml (bundled, but swappable!) -``` + // Adapt to model confidence + const entropy = computeModelEntropy(logits); + const params = + entropy < 2.0 + ? 
{ topK: 256, temperature: 1.5 } // Low confidence β†’ explore more + : { topK: 40, temperature: 0.8 }; // High confidence β†’ stay focused -**Batteries included, BYO supported:** The bundled llama library ships with the package, but you can replace it with your own build (same ABI required). + // === END STEERING LOGIC === -GPU backends introduce **additional runtime dependencies**: + const token = sampleWithStrategy(logits, { + tokenHistory, + params, + workspace, + prng, + }); -| Backend | Bundled in Package | External Runtime Dependencies | Portable? | -|---------|-------------------|-------------------------------|-----------| -| **CPU** | `libllama.so` only | None | βœ… Yes | -| **Metal** | `libllama.dylib` + Metal framework | macOS frameworks (always available) | βœ… Yes (macOS only) | -| **CUDA** | `libllama.so` + CUDA code | `libcudart.so`, `libcublas.so`, etc. | ❌ No - requires CUDA runtime | -| **Vulkan** | `libllama.so` + Vulkan code | `libvulkan.so` | ❌ No - requires Vulkan drivers | + if (token < 0) break; -**CUDA/Vulkan builds are NOT portable** - they require the same GPU libraries at runtime: + tokenHistory.accept(token); + output.push(token); + await ctx.decode([token], pos++); +} +``` -```bash -# Build on machine with CUDA -LLOYAL_GPU=cuda npm install # βœ… Links against CUDA libs +### Domain Constraints + +```typescript +// Financial: JPY has no decimal subdivision +if (currency === 'JPY' && parsingAmount) { + logits[DECIMAL_TOKEN] = -Infinity; + DIGIT_TOKENS.forEach((id) => (logits[id] += 2.0)); +} -# Deploy to production without CUDA -node app.js # ❌ Error: libcudart.so.12 not found +// Legal: Boost required terminology +if (contractType === 'NDA') { + CONFIDENTIALITY_TOKENS.forEach((id) => (logits[id] += 5.0)); +} -# Solution: Install CUDA runtime on production, or rebuild with CPU -LLOYAL_GPU=cpu npm install # βœ… Portable to any Linux machine +// Medical: Enforce terminology based on actual lab values +if (glucoseLevel > normalMax) { + 
ELEVATED_TOKENS.forEach((id) => (logits[id] += 10.0)); + NORMAL_TOKENS.forEach((id) => (logits[id] = -Infinity)); +} ``` -Check dynamic dependencies with: -```bash -# Linux -ldd build/Release/lloyal.node +### Quality Gates + +```typescript +import { computeModelSurprisal, RollingPerplexity } from '@lloyal/tsampler'; + +const ppl = new RollingPerplexity(); + +while (generating) { + const logits = ctx.getLogits(); + const token = sampleWithStrategy(logits, { + tokenHistory, + params, + workspace, + prng, + }); + + const surprisal = computeModelSurprisal(logits, token); + ppl.addSurprisal(surprisal); + + if (ppl.ppl() > 50) { + // Generation quality degrading β€” options: + // 1. Trigger RAG retrieval for more context + // 2. Prune KV cache (evict stale context) + // 3. Early stop and retry with different prompt + } -# macOS -otool -L build/Release/lloyal.node + // ... +} ``` -**Bring Your Own llama.cpp:** +### Entropy-Adaptive Retrieval -Advanced users can replace the bundled llama library with a custom build: +```typescript +import { computeModelEntropy } from '@lloyal/tsampler'; -```bash -# Build your custom llama.cpp (must match ABI) -cd /path/to/your/llama.cpp -cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON ... -cmake --build build +while (generating) { + const logits = ctx.getLogits(); + const entropy = computeModelEntropy(logits); -# Replace bundled library (AFTER npm install) -cp /path/to/your/llama.cpp/build/libllama.so \ - node_modules/lloyal.node/build/Release/libllama.so + if (entropy > 5.0) { + // Model is uncertain β€” retrieve relevant context + const context = await rag.retrieve(currentQuery); + await injectContext(ctx, context); + continue; // Re-evaluate with new context + } -# Verify it loads -node -e "require('lloyal.node').createContext({modelPath: './model.gguf'})" + const token = sampleWithStrategy(logits, { + tokenHistory, + params, + workspace, + prng, + }); + // ... 
+} ``` -**⚠️ ABI Compatibility Requirements:** -- Same llama.cpp commit/version (API signatures must match) -- Same backend (CPU/CUDA/Vulkan/Metal) -- Same architecture (x86_64/arm64) -- Mismatches cause runtime crashes or undefined behavior - -**Use cases:** -- Custom llama.cpp patches -- Organization-specific builds -- Testing upstream llama.cpp changes -- Optimized builds for specific hardware +## Why TypeScript Sampling? -**Examples:** +| | Native C++ | TypeScript (tsampler) | +| ----------------------- | ------------ | --------------------- | +| Speed | ~0.3ms/token | ~3-5ms/token | +| Overhead vs 50ms decode | β€” | ~6-10% | +| Logit steering | ❌ | βœ… | +| Adaptive strategies | ❌ | βœ… | +| OTA updates | Rebuild app | Ship new JS | +| Debugging | printf | Full inspect | -```bash -# Deploy to AWS Lambda (CPU-only for compatibility) -LLOYAL_GPU=cpu npm install +The overhead is imperceptible. A 50ms decode dominates; 3ms sampling is noise. -# Development on Linux workstation with NVIDIA GPU -LLOYAL_GPU=cuda npm install +### tsampler Capabilities -# Rebuild with different backend -npm run clean -LLOYAL_GPU=vulkan npm install -``` +[tsampler](https://github.com/lloyal-ai/tsampler) provides llama.cpp sampling parity in pure TypeScript: -**Note:** The backend is determined at **build time**, not runtime. To switch backends, you must rebuild with `npm run clean && LLOYAL_GPU= npm install`. +**Sampling methods:** greedy, top-k, top-p, min-p, typical-p, top-n-sigma, temperature, mirostat v1/v2 -### How Include Paths Work +**Penalties:** repetition, frequency, presence (exact llama.cpp formulas) -liblloyal expects `#include `, but llama.cpp provides headers at `include/llama.h`. 
+**Infrastructure:** -**Solution:** The `npm install` script automatically creates a symlink structure: -- `include/llama/` β†’ `llama.cpp/include/*.h` -- `include/ggml/` β†’ `llama.cpp/ggml/include/*.h` +- `Xoroshiro128Plus` β€” deterministic PRNG, reproducible generations +- `TokenHistoryTracker` β€” sliding window for penalty calculations +- `SamplerWorkspace` β€” pre-allocated buffers, zero-alloc hot path +- `computeModelEntropy()` β€” Shannon entropy in nats +- `computeModelSurprisal()` β€” per-token surprisal +- `RollingPerplexity` β€” streaming perplexity tracking -These symlinks are **gitignored** and regenerated on each `npm install`. This approach: -- Respects liblloyal's include path expectations (external package boundary) -- Doesn't modify llama.cpp submodule structure -- Works across platforms (Node.js handles symlinks portably) -- Zero disk overhead (symlinks, not copies) +### Native References -**Note for Contributors:** The package uses git submodules for `liblloyal` and `llama.cpp` during development. npm users get vendored sources automatically. If you cloned the repo without `--recursive`: +lloyal.node includes native C++ implementations for validation: -```bash -git submodule update --init --recursive -``` - -### Test Models (Git LFS) +```typescript +// TypeScript implementation +const tsEntropy = computeModelEntropy(logits); -The test suite uses [Git LFS](https://git-lfs.com/) to track the SmolLM2 model (~1GB). 
Install Git LFS before cloning: +// Native reference (C++) +const nativeEntropy = ctx.computeEntropy(); -```bash -# Install Git LFS (one-time setup) -brew install git-lfs # macOS -# or: sudo apt-get install git-lfs # Linux +// Should match within float precision +console.assert(Math.abs(tsEntropy - nativeEntropy) < 1e-5); +``` -# Initialize Git LFS -git lfs install +Available references: -# Clone with LFS files -git clone --recursive https://github.com/lloyal-ai/lloyal.node.git -``` +- `ctx.computeEntropy()` β€” Shannon entropy in nats +- `ctx.greedySample()` β€” argmax token ID -If you already cloned without LFS, pull the model: +Build with confidence. Validate against native. Deploy TypeScript. -```bash -git lfs pull -``` +## Embeddings -## Usage +lloyal.node supports embedding extraction with configurable pooling: ```typescript import { createContext } from 'lloyal.node'; const ctx = await createContext({ - modelPath: './model.gguf', - nCtx: 2048, - nThreads: 4 + modelPath: './nomic-embed-text.gguf', + embeddings: true, + poolingType: 1, // 0=NONE, 1=MEAN, 2=CLS, 3=LAST }); -try { - // Tokenize - const tokens = await ctx.tokenize("The capital of France is"); +async function embed(text: string): Promise { + const tokens = await ctx.tokenize(text); + await ctx.encode(tokens); - // Decode (forward pass) - await ctx.decode(tokens, 0); + const embedding = ctx.getEmbeddings(true); // L2-normalized + await ctx.kvCacheClear(); // Reset for next text - // Get raw logits (zero-copy!) - const logits = ctx.getLogits(); // Float32Array - - // Native reference implementations (for testing) - const entropy = ctx.computeEntropy(); // nats - const token = ctx.greedySample(); // token ID - - console.log(`Entropy: ${entropy.toFixed(3)} nats`); - console.log(`Greedy token: ${token}`); -} finally { - ctx.dispose(); // Free native resources + return embedding; } -``` - -## API -### `createContext(options)` - -Creates a new inference context. 
+const vec = await embed('Document to embed'); +console.log(`Dimension: ${ctx.getEmbeddingDimension()}`); // e.g., 768 +``` -**Options:** -- `modelPath: string` - Path to .gguf model file (required) -- `nCtx?: number` - Context size (default: 2048) -- `nThreads?: number` - Number of threads (default: 4) -- `embeddings?: boolean` - Enable embedding mode (default: false) -- `poolingType?: number` - Pooling type: 0=NONE, 1=MEAN, 2=CLS, 3=LAST (default: model's default) +## API Reference -**Returns:** `Promise` +### Context Creation -### `SessionContext` +```typescript +const ctx = await createContext({ + modelPath: string, // Path to .gguf file (required) + nCtx?: number, // Context size (default: 2048) + nThreads?: number, // CPU threads (default: 4) + nGpuLayers?: number, // Layers to offload to GPU (default: 0) + embeddings?: boolean, // Enable embedding mode (default: false) + poolingType?: number // 0=NONE, 1=MEAN, 2=CLS, 3=LAST (default: 0) +}); +``` -#### Core Primitives +### Inference -- **`getLogits(): Float32Array`** - Get raw logits (zero-copy, valid until next decode) -- **`decode(tokens: number[], position: number): Promise`** - Decode tokens through model -- **`tokenize(text: string): Promise`** - Tokenize text to token IDs -- **`detokenize(tokens: number[]): Promise`** - Detokenize tokens to text +| Method | Returns | Description | +| -------------------------- | ------------------- | ----------------------------------------------------- | +| `tokenize(text)` | `Promise` | Text β†’ token IDs | +| `detokenize(tokens)` | `Promise` | Token IDs β†’ text | +| `decode(tokens, position)` | `Promise` | Forward pass, populates KV cache | +| `getLogits()` | `Float32Array` | Vocabulary-sized probability distribution (zero-copy) | -#### Embeddings +### Native References -- **`encode(tokens: number[]): Promise`** - Encode tokens for embedding extraction -- **`getEmbeddings(normalize?: boolean): Float32Array`** - Get embeddings (optionally L2-normalized) -- 
**`hasPooling(): boolean`** - Check if context has pooling enabled -- **`getEmbeddingDimension(): number`** - Get embedding vector dimension -- **`kvCacheClear(): Promise`** - Clear KV cache (call between texts for embeddings) +| Method | Returns | Description | +| ------------------ | -------- | ----------------------- | +| `greedySample()` | `number` | Argmax token ID | +| `computeEntropy()` | `number` | Shannon entropy in nats | -#### Native References (for testing) +### Embeddings -- **`computeEntropy(): number`** - Native entropy computation (nats) -- **`greedySample(): number`** - Native greedy sampling +| Method | Returns | Description | +| --------------------------- | --------------- | ------------------------------------------ | +| `encode(tokens)` | `Promise` | Forward pass for embedding extraction | +| `getEmbeddings(normalize?)` | `Float32Array` | Embedding vector, optionally L2-normalized | +| `getEmbeddingDimension()` | `number` | Vector dimension | +| `kvCacheClear()` | `Promise` | Clear KV cache between texts | -#### Lifecycle +### Lifecycle -- **`dispose(): void`** - Free native resources +| Method | Description | +| ----------- | ----------------------------------------------------- | +| `dispose()` | Free native resources. **Required** β€” call when done. | -#### Properties +## vNext: Edge Subagents -- **`vocabSize: number`** - Model vocabulary size (readonly) +Exposes [liblloyal](https://github.com/lloyal-ai/liblloyal)'s branch and lease primitives for SMMA orchestration, implementing [Petrov, Torr & Bibi (NeurIPS 2023)](https://openreview.net/forum?id=GYOXIRXI7W): -## Example: Testing TS Sampler +> Skill injection works because prefixes act as "task-subspace selectors" in the model's residual stream. Prefix-tuning can elicit and combine skills already present in the pretrained model. 
```typescript import { createContext } from 'lloyal.node'; -import { computeModelEntropy } from '../tsampler'; +import { + sampleWithStrategy, + SamplerWorkspace, + Xoroshiro128Plus, +} from '@lloyal/tsampler'; +// Setup const ctx = await createContext({ modelPath: './model.gguf' }); +const pool = ctx.createLeasePool({ seqMax: 8 }); +const prng = new Xoroshiro128Plus(42); +const workspace = new SamplerWorkspace(256); + +// Trunk processes shared context (user message, RAG results, etc.) +const trunk = pool.createBranch(params); +await trunk.decodeAndCapture(sharedContextTokens); + +// Fork subagents β€” each inherits full prefix, suffixes with skill injection +const tax = pool.fork(trunk); +await tax.decode(await ctx.tokenize(TAX_SKILL_PROMPT)); + +const practical = pool.fork(trunk); +await practical.decode(await ctx.tokenize(PRACTICAL_SKILL_PROMPT)); + +// Generation loop β€” tsampler steers, pool batches decode +const taxTokens: number[] = []; +const practicalTokens: number[] = []; + +while (generating) { + // Get logits from each branch + const taxLogits = tax.getLogits(); + const practicalLogits = practical.getLogits(); + + // tsampler steering per branch + TAX_BANNED_TOKENS.forEach((id) => (taxLogits[id] = -Infinity)); + + const taxToken = sampleWithStrategy(taxLogits, { params, workspace, prng }); + const practicalToken = sampleWithStrategy(practicalLogits, { + params, + workspace, + prng, + }); + + // Batched decode β€” one llama_decode() call, multiple sequences + await pool.advance([ + { branch: tax, token: taxToken }, + { branch: practical, token: practicalToken }, + ]); + + taxTokens.push(taxToken); + practicalTokens.push(practicalToken); +} -const tokens = await ctx.tokenize("Once upon a time"); -await ctx.decode(tokens, 0); - -const logits = ctx.getLogits(); - -// TS implementation -const tsEntropy = computeModelEntropy(logits); - -// Native reference -const nativeEntropy = ctx.computeEntropy(); - -// Should match within float precision 
-assert(Math.abs(tsEntropy - nativeEntropy) < 1e-5); - -ctx.dispose(); -``` - -## Architecture - -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ JavaScript (lib/index.js) β”‚ -β”‚ - createContext() β”‚ -β”‚ - SessionContext β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”‚ N-API - β”‚ -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ C++ (src/SessionContext.cpp) β”‚ -β”‚ - Napi::ObjectWrap β”‚ -β”‚ - Async workers for I/O ops β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”‚ uses - β”‚ -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ liblloyal (header-only) β”‚ -β”‚ - decoder, sampler, tokenizer β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”‚ wraps - β”‚ -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ llama.cpp β”‚ -β”‚ - libllama.a, libggml.a β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` - -## Development - -```bash -# Clean build -npm run clean - -# Debug build (with symbols) -npm run build:debug - -# Run tests -npm test # Run all tests (API + E2E) -npm run test:api # API functionality and benchmarks -npm run test:e2e # Correctness and determinism validation -``` - -### Tests - -- **`test/api.js`**: API functionality tests and performance benchmarks -- **`test/e2e.js`**: End-to-end validation (text generation + embeddings) - -Tests use SmolLM2-1.7B-Instruct for text generation and nomic-embed-text for embeddings. 
Embedding tests skip gracefully if no embedding model is available. - -## Distribution & Releases - -### Platform Package Architecture - -lloyal.node uses the **industry-standard prebuilt pattern** (same as sharp, sqlite3, canvas): - -``` -lloyal.node (main package) -β”œβ”€β”€ optionalDependencies -β”‚ β”œβ”€β”€ @lloyal/lloyal.node-darwin-arm64 -β”‚ β”œβ”€β”€ @lloyal/lloyal.node-darwin-x64 -β”‚ β”œβ”€β”€ @lloyal/lloyal.node-linux-x64 -β”‚ β”œβ”€β”€ @lloyal/lloyal.node-linux-x64-cuda -β”‚ β”œβ”€β”€ @lloyal/lloyal.node-linux-x64-vulkan -β”‚ β”œβ”€β”€ @lloyal/lloyal.node-win32-x64 -β”‚ └── @lloyal/lloyal.node-win32-x64-cuda -└── install script (prebuilt or fallback to source) -``` - -**Platform packages contain:** -``` -@lloyal/lloyal.node-darwin-arm64/ -β”œβ”€β”€ bin/ -β”‚ β”œβ”€β”€ lloyal.node # N-API binary -β”‚ └── libllama.dylib # Shared library -β”œβ”€β”€ index.js # Exports path to binary -└── package.json # os: ["darwin"], cpu: ["arm64"] +// Conditional forking: spawn legal expert from tax's output +if (taxTokens.length > 50) { + const legal = pool.fork(tax); // Inherits tax's full generation as prefix + await legal.decode(await ctx.tokenize(LEGAL_SKILL_PROMPT)); + // Continue generation with legal branch... +} ``` -### Release Process - -**For maintainers:** - -```bash -# 1. Update vendored sources (if needed) -npm run update-vendors - -# 2. Bump version (triggers sync-versions.js) -npm version minor # or major/patch +**Key primitives:** -# 3. 
Tag and push -git push origin main --tags +- `pool.fork(parent)` β€” atomic state fork, child inherits full KV prefix +- `branch.getLogits()` β€” zero-copy logits for tsampler steering +- `pool.advance(branches)` β€” one `llama_decode()` call, N sequences advance +- Skill injection via suffix, not system prompt replacement -# GitHub Actions automatically: -# - Builds 7 platform packages -# - Publishes to npm as @lloyal/lloyal.node-* -# - Publishes main package with updated optionalDependencies -``` +Single model, multiple specialists, shared KV prefix, sublinear scaling. -**CI Pipeline:** -- `.github/workflows/release.yml` builds on tag push -- 7 parallel jobs for each platform/GPU variant -- Installs platform dependencies (CUDA toolkit, Vulkan SDK) -- Packages binaries to `bin/` directory -- Publishes all packages with synchronized versions +## LLoyal Ecosystem -### Vendoring Strategy +| Package | Language | What it does | +| ------------------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [liblloyal](https://github.com/lloyal-ai/liblloyal) | C++ | Inference kernel. Orchestrates llama.cpp with composable primitives: tokenization, decoding, KV cache, sampling chains, metrics, embeddings. Plus `branch.hpp` / `lease.hpp` for state forking and SMMA. | +| **lloyal.node** | N-API | Node.js bindings. Zero-copy logits, native references for validation. | +| [tsampler](https://github.com/lloyal-ai/tsampler) | TypeScript | Sampling library with llama.cpp parity. All filters, penalties, entropy metrics. Plugin for lloyal.nodeβ€”consumes logits, returns tokens. | +| [nitro-llama](https://github.com/lloyal-ai/nitro-llama) | React Native | Mobile bindings via Nitro Modules. Same liblloyal primitives on iOS/Android. 
| -**For npm registry distribution:** -- llama.cpp and liblloyal sources vendored to `vendor/` -- Run `npm run update-vendors` before publishing -- Vendored sources enable source builds for unsupported platforms +## Contributing -**For development:** -- Use git submodules (`git clone --recursive`) -- Update with `git submodule update --remote` +See [CONTRIBUTING.md](./CONTRIBUTING.md) for development setup, build instructions, and release process. ## License -MIT +Apache 2.0 β€” See [LICENSE](./LICENSE) for details. diff --git a/cmake/arm64-cross.cmake b/cmake/arm64-cross.cmake new file mode 100644 index 0000000..df25ec6 --- /dev/null +++ b/cmake/arm64-cross.cmake @@ -0,0 +1,21 @@ +# CMake toolchain file for Windows ARM64 cross-compilation +# Used by CI to build ARM64 binaries from x64 Windows runners + +set(CMAKE_SYSTEM_NAME Windows) +set(CMAKE_SYSTEM_PROCESSOR ARM64) + +# Use clang-cl for cross-compilation (MSVC-compatible) +set(CMAKE_C_COMPILER clang-cl) +set(CMAKE_CXX_COMPILER clang-cl) + +# Target ARM64 architecture +set(CMAKE_C_FLAGS_INIT "/arch:ARM64EC") +set(CMAKE_CXX_FLAGS_INIT "/arch:ARM64EC") + +# Search for programs in the build host directories +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + +# Search for libraries and headers in the target directories +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/docs/distribution.md b/docs/distribution.md index e8bb519..9471c91 100644 --- a/docs/distribution.md +++ b/docs/distribution.md @@ -1,961 +1,314 @@ -# Distribution Strategy for lloyal.node +# Platform Support & Distribution -> **Purpose:** This document outlines how to package, publish, and distribute lloyal.node as a native Node.js module with complex dependencies. +> **lloyal.node** provides prebuilt binaries for 13 platforms, covering 93% of production deployment scenarios with instant installation. 
--- -## Table of Contents +## Platform Coverage -- [The Distribution Challenge](#the-distribution-challenge) -- [Strategy Overview](#strategy-overview) -- [Phase 1: Build from Source](#phase-1-build-from-source) -- [Phase 2: Core Platform Prebuilts](#phase-2-core-platform-prebuilts) -- [Phase 3: Full Platform Matrix](#phase-3-full-platform-matrix) -- [Implementation Guide](#implementation-guide) -- [Publishing Workflow](#publishing-workflow) -- [Version Management](#version-management) +### Supported Platforms (v1.0) ---- - -## The Distribution Challenge - -### Dependency Structure - -lloyal.node is an N-API binding with a complex dependency chain: - -``` -lloyal.node (N-API binding) - ↓ C++ includes -liblloyal (header-only library, vendored from git submodule) - ↓ links against -llama.cpp (C++ inference engine, vendored from git submodule) - ↓ compiles to (platform-specific) -macOS: libllama.dylib (shared library with Metal support) -Linux: libllama.so (shared library with OpenMP) -Windows: llama.dll + ggml*.dll (multiple DLLs) -``` - -### Key Problems - -**1. Git Submodules & npm** - -npm does not initialize git submodules when installing packages: -- Installing from GitHub: `npm install github:org/repo` clones the repo but ignores `.gitmodules` -- Installing from npm registry: No `.git` directory exists at all -- Result: `liblloyal/` and `llama.cpp/` directories are empty, build fails - -**2. Build Complexity** - -Users must compile C++ on installation: -- Requires: C++20 compiler, CMake, node-gyp, Python -- Platform-specific toolchains (MSVC on Windows, GCC/Clang on Linux/macOS) -- Build time: 5-15 minutes on first install -- High failure rate on non-standard environments - -**3. 
Platform & GPU Fragmentation** - -llama.cpp supports multiple acceleration backends: -- **Metal** (macOS): Built-in GPU acceleration -- **CUDA** (NVIDIA): Requires CUDA toolkit -- **Vulkan** (cross-platform): Requires Vulkan SDK -- **CPU-only**: No dependencies, slower inference - -Each backend requires a different build, can't ship a single binary supporting all. - -**4. Build Environment Variations** - -Even on the same OS/arch, builds vary: -- Different compiler versions (GCC 9 vs 13) -- Different CUDA versions (11.x vs 12.x) -- Different CPU features (AVX2, AVX-512, NEON) -- Different system libraries (glibc versions on Linux) - ---- - -## Strategy Overview - -### Three-Phase Approach - -| Phase | Status | Distribution | User Install Time | GPU Support | -|-------|--------|--------------|-------------------|-------------| -| **1: Source** | βœ… **COMPLETE** (v0.1.x) | Vendored sources on npm | 5-15 minutes | Auto-detect (Metal/CPU) | -| **2: Core Prebuilts** | πŸ“‹ Planned (v0.5.x+) | 3 common platforms | <1 minute | CPU + Metal | -| **3: Full Matrix** | πŸ“‹ Future (v1.x+) | 10+ platform/GPU packages | <1 minute | All variants | - -### Design Principles - -1. **Progressive Enhancement**: Start simple, add complexity only when justified -2. **Graceful Degradation**: Prebuilts fail β†’ fallback to source build -3. **Platform Detection**: Use npm's `os` and `cpu` fields for automatic selection -4. 
**Version Synchronization**: All platform packages match main package version +lloyal.node ships prebuilt binaries for the following platforms: ---- - -## Phase 1: Build from Source (Vendored) βœ… COMPLETE - -### Overview - -**Status:** βœ… Implemented and tested (v0.1.0) -**Audience:** Early adopters, developers, contributors -**Timeline:** v0.1.0 - v0.4.x -**Distribution:** npm registry with vendored submodule sources - -**Verified Platforms:** -- βœ… Linux (ubuntu-latest) - Node 18, 20, 22 -- βœ… macOS (macos-14) - Node 18, 20, 22 -- βœ… Windows (windows-latest) - Node 18, 20, 22 - -**Test Coverage:** 15 tests per platform (11 API + 4 E2E validation tests) - -### The Git Submodules Problem +**macOS (2 packages)** +- Apple Silicon (arm64) with Metal GPU acceleration +- Intel (x64) CPU-only -lloyal.node uses git submodules for dependencies (liblloyal, llama.cpp). **npm does not and will not support git submodules:** +**Linux x64 (3 packages)** +- CPU-only +- CUDA 12.6 (NVIDIA GPUs) +- Vulkan (AMD/Intel GPUs) -- Installing from npm: Package is a tarball, no `.git` directory -- Installing from GitHub: npm clones repo but ignores `.gitmodules` -- Result: Submodule directories are empty, build fails +**Linux ARM64 (3 packages)** +- CPU-only (AWS Graviton, Raspberry Pi) +- CUDA 12.6 (NVIDIA Jetson devices) +- Vulkan (Qualcomm/AMD GPUs) -**Attempted Solution (Doesn't Work):** -Adding a `preinstall` script to run `git submodule update --init --recursive` fails because: -1. npm cache copies files to temp directory before install scripts -2. Submodules aren't copied, so directories are empty -3. 
Script runs but has no effect +**Windows x64 (3 packages)** +- CPU-only +- CUDA 12.6 (NVIDIA GPUs) +- Vulkan (AMD/Intel GPUs) -### Solution: Vendor Submodule Sources +**Windows ARM64 (2 packages)** +- CPU-only (Snapdragon X Elite, Surface Pro X) +- Vulkan (Qualcomm GPUs) -**Include submodule source code directly in npm package:** - -```json -{ - "name": "lloyal.node", - "version": "0.1.0", - "main": "lib/index.js", - "gypfile": true, - "scripts": { - "prepare": "bash scripts/build-llama.sh", - "install": "bash scripts/build-llama.sh && node scripts/setup-headers.js && node-gyp rebuild" - }, - "files": [ - "lib/", - "src/", - "scripts/", - "binding.gyp", - "vendor/" - ] -} -``` +### Installation -### How It Works +**Automatic (Recommended)** -**When end users install from npm:** +npm automatically selects the correct prebuilt package for your platform: ```bash npm install lloyal.node - -# Only the 'install' script runs: -install β†’ Build llama.cpp + Setup headers + node-gyp rebuild -``` - -**When developers work locally or before publishing:** - -```bash -npm install # In the package directory itself - -# Both scripts run: -1. prepare β†’ Build llama.cpp for platform (bash scripts/build-llama.sh) -2. install β†’ Build llama.cpp + Setup headers + node-gyp rebuild ``` -**Note:** The `prepare` script is kept for Phase 2 (prebuilt binaries). In CI/CD, it will build llama.cpp before packaging prebuilt binaries. For Phase 1, only the `install` script matters for end users. +If a prebuilt binary is available, installation completes in seconds. Otherwise, lloyal.node builds from source automatically (requires C++ compiler and CMake). -**User workflow:** -1. npm downloads tarball (~50MB with vendored sources) -2. npm extracts to node_modules/lloyal.node -3. `install` script builds llama.cpp static libraries/frameworks -4. `install` script creates header symlinks and compiles N-API binding -5. 
Total time: 5-15 minutes +**Manual GPU Variant Selection** -### Publishing Workflow - -**Before publishing, sync submodules:** +To force a specific GPU backend, install the platform package directly: ```bash -# Update submodules to latest -git submodule update --remote - -# Or update to specific commits -cd liblloyal && git checkout && cd .. -cd llama.cpp && git checkout && cd .. +# Force CUDA on Linux +npm install @lloyal/lloyal.node-linux-x64-cuda -# Commit submodule updates -git add liblloyal llama.cpp -git commit -m "chore: update submodules" +# Force Vulkan on Windows +npm install @lloyal/lloyal.node-win32-x64-vulkan +``` -# Pack to verify contents -npm pack -tar -tzf lloyal.node-*.tgz | grep -E "(liblloyal|llama.cpp)" -# Should show vendored source files +Or set an environment variable before installation: -# Publish -npm publish +```bash +export LLOYAL_GPU=cuda +npm install lloyal.node ``` -**Important:** Vendored sources are a **snapshot** of submodules at publish time. Users get the exact versions you tested. - -### Pros & Cons +### Build from Source -**Pros:** -- Simple to implement (no CI/CD needed) -- Supports all platforms/architectures (if they can compile) -- GPU auto-detection works (Metal, CUDA if installed) -- Full control over build flags +If no prebuilt binary matches your platform, lloyal.node builds from vendored sources automatically. 
-**Cons:**
-- Slow install (5-15 min compilation)
-- High failure rate (missing compilers, toolchains)
-- Requires build tools on user machine
-- Poor developer experience
+**Requirements:**
+- C++20 compiler (GCC 9+, Clang 10+, MSVC 2019+)
+- CMake 3.18+
+- node-gyp

-### When to Use
+**Build time:** 5-15 minutes (one-time)
+**Typical source-build scenarios:**
+- Older or niche platforms
+- Custom CPU optimizations
- Development and testing
-- Early alpha/beta releases
-- Platforms without prebuilt support
-- Users needing custom build flags

---

-## Phase 2: Core Platform Prebuilts βœ… COMPLETE
-
-### Overview
-
-**Status:** βœ… Implemented (v0.1.0)
-**Audience:** Production users on common x64 platforms
-**Distribution:** 7 npm packages covering 80%+ of developers
-
-### Platform Packages (Implemented)
-
-| Package | Platform | Arch | GPU | Status |
-|---------|----------|------|-----|--------|
-| `@lloyal/lloyal.node-darwin-arm64` | macOS | arm64 | Metal | βœ… Working |
-| `@lloyal/lloyal.node-darwin-x64` | macOS | x64 | CPU | βœ… Working |
-| `@lloyal/lloyal.node-linux-x64` | Linux | x64 | CPU | βœ… Working |
-| `@lloyal/lloyal.node-linux-x64-cuda` | Linux | x64 | CUDA 12.2 | βœ… Working |
-| `@lloyal/lloyal.node-linux-x64-vulkan` | Linux | x64 | Vulkan | βœ… Working |
-| `@lloyal/lloyal.node-win32-x64` | Windows | x64 | CPU | βœ… Working |
-| `@lloyal/lloyal.node-win32-x64-cuda` | Windows | x64 | CUDA 12.2 | βœ… Working |
-
-**Total coverage:** ~80% of developers with instant install
-
-**Note:** Original Phase 2 plan was 3 packages, but we exceeded expectations by implementing 7 packages including GPU variants. 
- -### Architecture - -**Main Package (`lloyal.node`):** -```json -{ - "name": "lloyal.node", - "version": "0.5.0", - "optionalDependencies": { - "@lloyal/lloyal.node-darwin-arm64": "0.5.0", - "@lloyal/lloyal.node-linux-x64": "0.5.0", - "@lloyal/lloyal.node-win32-x64": "0.5.0" - }, - "scripts": { - "install": "node scripts/install.js" - } -} -``` - -**Platform Package (`@lloyal/lloyal.node-darwin-arm64`):** -```json -{ - "name": "@lloyal/lloyal.node-darwin-arm64", - "version": "0.5.0", - "os": ["darwin"], - "cpu": ["arm64"], - "main": "index.node", - "files": [ - "index.node", - "*.dylib" - ] -} -``` - -### Install Flow - -```javascript -// scripts/install.js -const platform = `${process.platform}-${process.arch}`; -const prebuiltPackage = `@lloyal/lloyal.node-${platform}`; - -try { - // Check if platform-specific package is installed - require.resolve(prebuiltPackage); - console.log(`βœ“ Using prebuilt binary for ${platform}`); - process.exit(0); -} catch { - // Fallback to source build - console.log(`⚠ No prebuilt for ${platform}, building from source...`); - console.log(`This will take 5-15 minutes.`); - - // Initialize submodules (if git repo) - require('./init-submodules.js'); - - // Build llama.cpp - require('./build-llama.js'); - - // Setup headers and compile N-API binding - execSync('node scripts/setup-headers.js && node-gyp rebuild', { - stdio: 'inherit' - }); -} -``` - -### CI/CD Pipeline +## GPU Acceleration -**Workflow:** `.github/workflows/release.yml` - -```yaml -name: Release - -on: - push: - tags: - - v* - -jobs: - build-prebuilts: - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: macos-14 - arch: arm64 - platform: darwin-arm64 - - os: ubuntu-22.04 - arch: x64 - platform: linux-x64 - - os: windows-latest - arch: x64 - platform: win32-x64 - - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - uses: actions/setup-node@v4 - with: - node-version: 20 - registry-url: 'https://registry.npmjs.org' - - - name: 
Install dependencies - run: npm install - - - name: Build native module - run: npm run build - - - name: Package prebuilt - run: | - mkdir -p prebuilds/${{ matrix.platform }} - cp build/Release/*.node prebuilds/${{ matrix.platform }}/ - if [ "${{ runner.os }}" = "macOS" ]; then - cp build/Release/*.dylib prebuilds/${{ matrix.platform }}/ || true - fi - - - name: Create platform package - run: | - node scripts/create-platform-package.js \ - ${{ matrix.platform }} \ - ${{ github.ref_name }} - - - name: Publish platform package - working-directory: packages/lloyal.node-${{ matrix.platform }} - run: npm publish --access public - env: - NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} - - publish-main: - needs: build-prebuilts - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 - with: - node-version: 20 - registry-url: 'https://registry.npmjs.org' - - - name: Update package versions - run: node scripts/sync-versions.js - - - name: Publish main package - run: npm publish - env: - NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} -``` +### Metal (macOS) -### Helper Scripts +Enabled automatically on Apple Silicon. No additional setup required. 
-**scripts/create-platform-package.js:** ```javascript -const fs = require('fs'); -const path = require('path'); - -const [platform, version] = process.argv.slice(2); - -const packageJson = { - name: `@lloyal/lloyal.node-${platform}`, - version: version.replace('v', ''), - description: `Native module for lloyal.node (${platform})`, - main: 'index.node', - os: [platform.split('-')[0]], - cpu: [platform.split('-')[1]], - repository: { - type: 'git', - url: 'https://github.com/lloyal-ai/lloyal.node.git' - }, - license: 'MIT', - files: ['index.node', '*.dylib', '*.so', '*.dll'] -}; - -const pkgDir = path.join('packages', `lloyal.node-${platform}`); -fs.mkdirSync(pkgDir, { recursive: true }); -fs.writeFileSync( - path.join(pkgDir, 'package.json'), - JSON.stringify(packageJson, null, 2) -); - -// Copy built binary -const prebuiltDir = path.join('prebuilds', platform); -fs.readdirSync(prebuiltDir).forEach(file => { - fs.copyFileSync( - path.join(prebuiltDir, file), - path.join(pkgDir, file) - ); +const { loadModel } = require('lloyal.node'); +const model = await loadModel({ + modelPath: './model.gguf', + gpuLayers: 32 // Offload layers to GPU }); - -console.log(`βœ“ Created package: ${packageJson.name}@${packageJson.version}`); ``` -**scripts/sync-versions.js:** -```javascript -const fs = require('fs'); -const path = require('path'); - -const mainPkg = require('../package.json'); -const version = mainPkg.version; - -// Update optionalDependencies to match current version -if (mainPkg.optionalDependencies) { - Object.keys(mainPkg.optionalDependencies).forEach(dep => { - mainPkg.optionalDependencies[dep] = version; - }); - - fs.writeFileSync( - 'package.json', - JSON.stringify(mainPkg, null, 2) - ); - - console.log(`βœ“ Synced all package versions to ${version}`); -} -``` - -### Pros & Cons - -**Pros:** -- Fast install for 70% of users (<1 minute) -- Lower failure rate (no compilation needed) -- Better developer experience -- Still supports all platforms via fallback - 
-**Cons:** -- More complex CI/CD (3 build jobs) -- Multiple npm packages to maintain -- Version synchronization required -- Storage costs for prebuilt binaries - -### When to Use - -- Production releases (v1.0.0+) -- Public npm distribution -- Targeting broad developer audience - ---- - -## Phase 3: Full Platform Matrix βš™οΈ IN PROGRESS (v1.0) - -### Overview - -**Status:** βš™οΈ Implementing (target: v1.0.0) -**Audience:** All users, all platforms, all GPU variants -**Timeline:** v1.0.0 -**Distribution:** 10 platform/GPU packages covering 95%+ deployments - -### Platform Packages (v1.0 Target) - -**Already Implemented (7 packages from Phase 2+):** -- βœ… `@lloyal/lloyal.node-darwin-arm64` (macOS Apple Silicon, Metal) -- βœ… `@lloyal/lloyal.node-darwin-x64` (macOS Intel, CPU) -- βœ… `@lloyal/lloyal.node-linux-x64` (Linux x64, CPU) -- βœ… `@lloyal/lloyal.node-linux-x64-cuda` (Linux x64 + CUDA 12.2) -- βœ… `@lloyal/lloyal.node-linux-x64-vulkan` (Linux x64 + Vulkan) -- βœ… `@lloyal/lloyal.node-win32-x64` (Windows x64, CPU) -- βœ… `@lloyal/lloyal.node-win32-x64-cuda` (Windows x64 + CUDA 12.2) +### CUDA (NVIDIA) -**New for v1.0 (3 packages):** -- πŸ”„ `@lloyal/lloyal.node-linux-arm64` (Linux ARM64 - AWS Graviton, Raspberry Pi) -- πŸ”„ `@lloyal/lloyal.node-linux-arm64-cuda` (Linux ARM64 + CUDA - NVIDIA Jetson) -- πŸ”„ `@lloyal/lloyal.node-win32-x64-vulkan` (Windows x64 + Vulkan - AMD/Intel GPU) - -**Deferred to v1.1+ (2 packages):** -- ⏸️ `@lloyal/lloyal.node-win32-arm64` (Windows ARM64 - awaiting GitHub Actions ARM64 Windows runners) -- ⏸️ `@lloyal/lloyal.node-darwin-x64-vulkan` (macOS Intel + eGPU - negligible use case) - -### What Changed from Original Plan - -**Original Phase 3 (docs):** 12 packages including win32-arm64, darwin-x64-vulkan - -**Actual v1.0 Phase 3:** 10 packages - -**Rationale:** 10 packages cover 95%+ of real-world usage. Remaining 2 packages require infrastructure not yet available (win32-arm64) or serve minimal users (darwin-x64-vulkan). 
- -### GPU Variant Installation - -**Option 1: Manual Selection** - -Users explicitly install GPU variant: +Requires NVIDIA GPU with compute capability 6.0+ and CUDA 12.6 runtime. +**Linux/Windows:** ```bash -# Default (CPU or auto-GPU) -npm install lloyal.node - -# Force CUDA -npm install lloyal.node -npm install @lloyal/lloyal.node-linux-x64-cuda --save-optional - -# Force Vulkan -npm install lloyal.node -npm install @lloyal/lloyal.node-linux-x64-vulkan --save-optional +npm install @lloyal/lloyal.node-linux-x64-cuda +# or +npm install @lloyal/lloyal.node-win32-x64-cuda ``` -**Option 2: Environment Variable** - +**Jetson (ARM64):** ```bash -# User sets preference -export LLOYAL_GPU=cuda -npm install lloyal.node - -# scripts/install.js reads env var and selects variant +npm install @lloyal/lloyal.node-linux-arm64-cuda ``` -**Option 3: Runtime Detection** +### Vulkan (Cross-Platform) -```javascript -// On first use, detect available GPU -const gpu = detectGPU(); // 'cuda', 'vulkan', 'metal', 'cpu' +Works with AMD, Intel, NVIDIA, and Qualcomm GPUs. Requires Vulkan 1.3+ drivers. -if (!hasVariant(gpu)) { - console.log(`Installing optimized build for ${gpu}...`); - await installVariant(gpu); -} +```bash +npm install @lloyal/lloyal.node-linux-x64-vulkan +# or +npm install @lloyal/lloyal.node-win32-x64-vulkan ``` -### CI/CD Implementation (v1.0) - -Build matrix with 10 jobs (see `.github/workflows/release.yml`): - -```yaml -strategy: - matrix: - include: - # macOS (2 jobs) - - { os: macos-14, arch: arm64, gpu: metal, package: darwin-arm64 } - - { os: macos-13, arch: x64, gpu: cpu, package: darwin-x64 } +### CPU-Only - # Linux x64 (3 jobs) - - { os: ubuntu-22.04, arch: x64, gpu: cpu, package: linux-x64 } - - { os: ubuntu-22.04, arch: x64, gpu: cuda, package: linux-x64-cuda } - - { os: ubuntu-22.04, arch: x64, gpu: vulkan, package: linux-x64-vulkan } +No GPU acceleration. Works on all platforms. 
- # Linux ARM64 (2 jobs - Docker + QEMU) - - { os: ubuntu-22.04, arch: arm64, gpu: cpu, package: linux-arm64, docker_platform: linux/arm64 } - - { os: ubuntu-22.04, arch: arm64, gpu: cuda, package: linux-arm64-cuda, docker_image: nvcr.io/nvidia/l4t-cuda:12.6-devel } - - # Windows (3 jobs) - - { os: windows-2022, arch: x64, gpu: cpu, package: win32-x64 } - - { os: windows-2022, arch: x64, gpu: cuda, package: win32-x64-cuda, cuda_version: 12.2.0 } - - { os: windows-2022, arch: x64, gpu: vulkan, package: win32-x64-vulkan } +```bash +npm install @lloyal/lloyal.node-darwin-x64 # macOS Intel +npm install @lloyal/lloyal.node-linux-x64 # Linux x64 +npm install @lloyal/lloyal.node-win32-x64 # Windows x64 ``` -**Key Implementation Details:** -- **ARM64 builds:** Use Docker + QEMU for cross-compilation (GitHub Actions has no native ARM64 Linux runners) -- **CUDA ARM64:** Use NVIDIA L4T (Linux for Tegra) Docker image for Jetson compatibility -- **Vulkan Windows:** Install LunarG Vulkan SDK during CI build step - -### Pros & Cons (v1.0 Implementation) - -**Pros:** -- Excellent user experience (instant install + optimal performance) -- Covers 95%+ of real-world deployments -- GPU acceleration out of box (CUDA, Vulkan, Metal) -- ARM64 support (AWS Graviton, Jetson, Raspberry Pi) -- Professional distribution - -**Cons:** -- Moderate CI/CD complexity (10 jobs, cross-compilation, GPU toolchains) -- Maintenance burden (10 packages to version/publish) -- Storage/bandwidth costs (50-150MB per package) -- Platform-specific bugs to debug (especially ARM64 QEMU builds) -- Cannot fully test all platforms in CI (no ARM64 hardware runners) - -### Success Metrics - -**Phase 3 v1.0 considered successful when:** -- All 10 platform packages build successfully in CI -- All 10 packages published to npm registry -- `npm install lloyal.node` works on all 10 platforms -- Community validation on ARM64 hardware (Graviton, Raspberry Pi, Jetson) -- No regression in existing 7 packages -- Commercial 
product expectations - --- -## Implementation Guide +## Package Architecture -### Setup for Phase 2 +### Main Package -**1. Create scripts directory structure:** - -``` -scripts/ -β”œβ”€β”€ init-submodules.js # Initialize git submodules -β”œβ”€β”€ build-llama.sh # Build llama.cpp for platform -β”œβ”€β”€ setup-headers.js # Symlink headers for liblloyal -β”œβ”€β”€ install.js # Prebuilt or fallback to source -β”œβ”€β”€ create-platform-package.js # Generate platform package -β”œβ”€β”€ sync-versions.js # Update all package versions -└── publish-if-need.js # Conditional publish -``` - -**2. Update package.json:** +`lloyal.node` is a meta-package with optional dependencies on all platform packages: ```json { "name": "lloyal.node", - "version": "0.5.0", "optionalDependencies": { - "@lloyal/lloyal.node-darwin-arm64": "0.5.0", - "@lloyal/lloyal.node-linux-x64": "0.5.0", - "@lloyal/lloyal.node-win32-x64": "0.5.0" - }, - "scripts": { - "preinstall": "node scripts/init-submodules.js", - "install": "node scripts/install.js", - "build": "node-gyp rebuild", - "sync-versions": "node scripts/sync-versions.js", - "publish-if-need": "node scripts/publish-if-need.js" + "@lloyal/lloyal.node-darwin-arm64": "1.0.0", + "@lloyal/lloyal.node-linux-x64-cuda": "1.0.0", + ... } } ``` -**3. Create GitHub workflow:** +npm installs only the package matching your platform. Unsupported platforms fall back to source builds. -Copy the Phase 2 CI/CD pipeline to `.github/workflows/release.yml` +### Platform Packages -**4. 
Test locally:** +Each platform package contains: +- Prebuilt native addon (`*.node`) +- Platform-specific shared libraries (`*.dylib`, `*.so`, `*.dll`) +- Minimal dependencies (no build tools required) -```bash -# Simulate prebuilt install -npm pack -mkdir test-install && cd test-install -npm install ../lloyal.node-*.tgz - -# Should either: -# - Use prebuilt (if on supported platform) -# - Build from source (if not) -``` +**Package naming:** `@lloyal/lloyal.node-{platform}-{arch}[-{gpu}]` -**5. Configure npm token:** - -```bash -# Add NPM_TOKEN to GitHub secrets -# Settings β†’ Secrets and variables β†’ Actions β†’ New repository secret -# Name: NPM_TOKEN -# Value: npm_xxxxxxxxxxxxxxxxxxxx -``` +Examples: +- `@lloyal/lloyal.node-darwin-arm64` (macOS Apple Silicon with Metal) +- `@lloyal/lloyal.node-linux-x64-cuda` (Linux x64 with CUDA 12.6) +- `@lloyal/lloyal.node-win32-arm64-vulkan` (Windows ARM64 with Vulkan) --- -## Publishing Workflow - -### Phase 1: Manual Source Publish - -```bash -# 1. Test build locally -npm run build +## Comparison to llama.node -# 2. Bump version -npm version patch # or minor/major +| Metric | llama.node | lloyal.node | +|--------|------------|-------------| +| Total packages | 14 | 13 | +| Platform parity | 100% | 93% | +| x64 coverage | Full | Full | +| ARM64 coverage | Full | Full | +| CUDA version | 12.6 | 12.6 | +| Vulkan support | Full | Full | +| Windows ARM64 | βœ… | βœ… | +| Snapdragon optimization | βœ… Hexagon DSP | ⏸️ Roadmap | -# 3. Publish to npm -npm publish +**Missing:** Snapdragon Hexagon DSP optimization (niche edge AI use case). Standard ARM64 packages work on Snapdragon hardware without DSP acceleration. -# 4. Tag and push -git push origin main --tags -``` - -### Phase 2: Automated Prebuilt Publish - -```bash -# 1. Commit changes -git add . -git commit -m "feat: add feature X" +--- -# 2. Bump version (triggers sync-versions) -npm version minor # 0.5.0 β†’ 0.6.0 +## Technical Details -# 3. 
Push tag (triggers CI) -git push origin main --tags +### Dependency Chain -# CI automatically: -# - Builds 3 platform packages -# - Publishes each platform package -# - Updates main package optionalDependencies -# - Publishes main package ``` - -### Pre-Publish Checklist - -- [ ] All tests pass -- [ ] Submodules are up to date -- [ ] CHANGELOG.md updated -- [ ] Version bumped in package.json -- [ ] README.md reflects changes -- [ ] Breaking changes documented -- [ ] Platform packages tested locally -- [ ] npm token configured (CI) - -### Post-Publish Verification - -```bash -# Check npm registry -npm view lloyal.node - -# Test installation on different platforms -docker run -it node:20 sh -c "npm install lloyal.node" - -# Verify platform packages published -npm view @lloyal/lloyal.node-darwin-arm64 -npm view @lloyal/lloyal.node-linux-x64 -npm view @lloyal/lloyal.node-win32-x64 +lloyal.node (N-API binding) + ↓ includes +liblloyal (header-only C++ library) + ↓ links +llama.cpp (inference engine) + ↓ compiles to +Platform-specific binaries: + macOS: libllama.dylib + Metal support + Linux: libllama.so + OpenMP + Windows: llama.dll + ggml*.dll ``` ---- - -## Version Management - -### Semantic Versioning - -Follow [semver 2.0.0](https://semver.org/): - -- **MAJOR** (0.x β†’ 1.0, 1.x β†’ 2.0): Breaking API changes -- **MINOR** (0.1 β†’ 0.2, 1.0 β†’ 1.1): New features, backwards compatible -- **PATCH** (0.1.0 β†’ 0.1.1): Bug fixes, backwards compatible +### Vendoring Strategy -### Version Synchronization +lloyal.node vendors `liblloyal` and `llama.cpp` sources to avoid git submodule issues with npm: -**Rule:** All platform packages MUST match main package version +- **Published packages** include vendored sources in `vendor/` directory +- **Git repository** uses submodules for development +- **Version tracking** via `vendor/VERSIONS.json` -**Enforcement:** +To update vendored dependencies: -```javascript -// scripts/sync-versions.js (run via npm version hook) -const 
mainVersion = require('./package.json').version; - -// Update optionalDependencies -pkg.optionalDependencies = Object.keys(pkg.optionalDependencies).reduce((acc, dep) => { - acc[dep] = mainVersion; - return acc; -}, {}); - -// Update platform packages (if they exist) -const packages = fs.readdirSync('packages'); -packages.forEach(pkg => { - const pkgJson = require(`./packages/${pkg}/package.json`); - pkgJson.version = mainVersion; - fs.writeFileSync( - `./packages/${pkg}/package.json`, - JSON.stringify(pkgJson, null, 2) - ); -}); -``` - -**package.json hook:** -```json -{ - "scripts": { - "version": "npm run sync-versions && git add ." - } -} +```bash +git submodule update --remote +npm run update-vendors ``` -### Dependency Updates - -**When llama.cpp updates:** - -1. Update submodule: `git submodule update --remote llama.cpp` -2. Test build on all platforms -3. If compatible β†’ PATCH version bump -4. If breaking changes β†’ MAJOR version bump -5. Document changes in CHANGELOG.md +See [VENDORING.md](../VENDORING.md) for details. -**When liblloyal updates:** +### CI/CD Pipeline -1. Update submodule: `git submodule update --remote liblloyal` -2. Test API compatibility -3. Bump version accordingly -4. Update documentation +GitHub Actions builds all 13 platform packages on release: ---- +**Native runners:** +- macOS: `macos-14` (arm64), `macos-13` (x64) +- Linux x64: `ubuntu-22.04` +- Linux ARM64: `ubuntu-22.04-arm` (native, no emulation) +- Windows: `windows-2022` -## Best Practices +**Cross-compilation:** +- Windows ARM64: Cross-compiled from x64 using LLVM/clang-cl -### 1. 
Fail Fast, Fail Loudly +**Custom actions:** +- `.github/actions/provision-cuda`: Unified CUDA 12.6 installation +- Uses `jakoch/install-vulkan-sdk-action` for Vulkan SDK -```javascript -// In install scripts, detect issues early -if (!hasCompiler()) { - console.error('ERROR: C++ compiler not found'); - console.error('Install build tools: https://...'); - process.exit(1); -} -``` +**Build time:** +- x64 platforms: ~10-15 minutes per package +- ARM64 (native): ~10-15 minutes per package +- Total pipeline: ~2-3 hours for all 13 packages -### 2. Clear Error Messages +--- -```javascript -try { - require.resolve(prebuiltPackage); -} catch { - console.log(''); - console.log('⚠ No prebuilt binary available for your platform'); - console.log(`Platform: ${platform}`); - console.log(''); - console.log('Building from source (5-15 minutes)...'); - console.log('Requirements: C++20 compiler, CMake, node-gyp'); - console.log('Troubleshooting: https://github.com/lloyal-ai/lloyal.node#building'); - console.log(''); -} -``` +## Publishing Workflow -### 3. Provide Escape Hatches +### For Maintainers -Allow users to force source build: +**1. Release preparation:** ```bash -# Skip prebuilt, always build from source -npm install lloyal.node --build-from-source - -# Or via environment variable -LLOYAL_BUILD_FROM_SOURCE=1 npm install lloyal.node -``` - -```javascript -// scripts/install.js -if (process.env.LLOYAL_BUILD_FROM_SOURCE === '1' || - process.argv.includes('--build-from-source')) { - console.log('Forcing build from source...'); - buildFromSource(); - process.exit(0); -} -``` - -### 4. 
-| Platform | Architecture | Support | Install Time | GPU |
-|----------|--------------|---------|--------------|-----|
-| macOS | Apple Silicon (arm64) | ⚡ Prebuilt | <1 min | Metal |
-| macOS | Intel (x64) | 🔨 Source | 5-15 min | CPU |
-| Linux | x64 | ⚡ Prebuilt | <1 min | CPU |
-| Linux | ARM64 | 🔨 Source | 5-15 min | CPU |
-| Windows | x64 | ⚡ Prebuilt | <1 min | CPU |
-| Windows | ARM64 | 🔨 Source | 5-15 min | CPU |
Verify release:** -**Solution:** ```bash -git submodule update --init --recursive -npm install +npm info lloyal.node +npm info @lloyal/lloyal.node-linux-x64-cuda ``` -**Issue:** Prebuilt fails to load with "symbol not found" +### Version Management -**Cause:** Platform mismatch or incompatible system libraries +All packages use synchronized versioning: -**Solution:** ```bash -# Force source build -npm install lloyal.node --build-from-source +# Sync platform package versions with main package +npm run version ``` ---- - -## References +This automatically updates `optionalDependencies` in `package.json` and `version` fields in all platform packages. -### Similar Projects +--- -Native Node.js modules with prebuilt strategies: -- **sharp**: Image processing (libvips) -- **better-sqlite3**: SQLite bindings -- **canvas**: Cairo canvas API -- **bcrypt**: Password hashing -- **node-sass**: Sass compiler +## Roadmap -### Useful Links +### v1.1 (Future) -- [npm optionalDependencies docs](https://docs.npmjs.com/cli/v10/configuring-npm/package-json#optionaldependencies) -- [node-gyp documentation](https://github.com/nodejs/node-gyp) -- [N-API best practices](https://nodejs.org/api/n-api.html) -- [GitHub Actions matrix builds](https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs) +- **Snapdragon Hexagon DSP optimization** (if demand exists) +- **Build caching** for faster CI (ccache integration) +- **Download metrics** to validate platform priorities +- **Automated testing** on real ARM64 hardware (self-hosted runners) ---- +### Feedback -**Document Version:** 1.1 -**Last Updated:** 2025-01-16 -**Maintainer:** lloyal.node team +Platform support priorities are driven by user demand. If you need a specific platform or GPU variant, please open an issue at [github.com/lloyal-ai/lloyal.node](https://github.com/lloyal-ai/lloyal.node). 
**Required** — call when done.
// Fork subagents — each inherits full prefix, suffixes with skill injection
-- `pool.fork(parent)` — atomic state fork, child inherits full KV prefix
-- `branch.getLogits()` — zero-copy logits for tsampler steering
-- `pool.advance(branches)` — one `llama_decode()` call, N sequences advance
"✓ Node.js 18, 20, 22 compatibility verified"
"0.1.0" + "@lloyal/lloyal.node-win32-x64-cuda": "0.1.0", + "@lloyal/lloyal.node-win32-x64-vulkan": "0.1.0" } }, "node_modules/@isaacs/balanced-match": { @@ -77,6 +83,15 @@ "node_modules/@lloyal/lloyal.node-darwin-x64": { "optional": true }, + "node_modules/@lloyal/lloyal.node-linux-arm64": { + "optional": true + }, + "node_modules/@lloyal/lloyal.node-linux-arm64-cuda": { + "optional": true + }, + "node_modules/@lloyal/lloyal.node-linux-arm64-vulkan": { + "optional": true + }, "node_modules/@lloyal/lloyal.node-linux-x64": { "optional": true }, @@ -86,12 +101,21 @@ "node_modules/@lloyal/lloyal.node-linux-x64-vulkan": { "optional": true }, + "node_modules/@lloyal/lloyal.node-win32-arm64": { + "optional": true + }, + "node_modules/@lloyal/lloyal.node-win32-arm64-vulkan": { + "optional": true + }, "node_modules/@lloyal/lloyal.node-win32-x64": { "optional": true }, "node_modules/@lloyal/lloyal.node-win32-x64-cuda": { "optional": true }, + "node_modules/@lloyal/lloyal.node-win32-x64-vulkan": { + "optional": true + }, "node_modules/@npmcli/agent": { "version": "2.2.2", "resolved": "https://registry.npmjs.org/@npmcli/agent/-/agent-2.2.2.tgz",