From 0ec8f04539563db365abfae18361d9e5b1051e06 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Wed, 11 Feb 2026 02:29:07 +1100
Subject: [PATCH 01/13] feat(chat): new chat api

---
 .gitignore             |   2 +-
 README.md              |   2 +-
 examples/chat/chat.mjs |  44 +++---
 lib/index.d.ts         | 347 ++++++++++++++++++++++++++++++++++++++++-
 liblloyal              |   2 +-
 src/SessionContext.cpp | 176 +++++++++++++++++----
 src/SessionContext.hpp |   1 +
 test/integration.js    | 313 +++++++++++++++++++++++++++++++++++++
 test/matrix.json       |   8 +-
 9 files changed, 829 insertions(+), 66 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1a11391..95b20dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,7 @@ models/
 
 # Generated documentation
 docs/api/
-
+docs/_internal
 # Vendor build artifacts (generated during npm install)
 vendor/llama.cpp/build-*/
 
diff --git a/README.md b/README.md
index 8698b36..a4bcca3 100644
--- a/README.md
+++ b/README.md
@@ -82,7 +82,7 @@ CI integration testing (real inference):
 | Qwen         | Qwen 3 1.7B    | chatml   |
 | Gemma        | Gemma 3 1B     | gemma    |
 | SmolLM       | SmolLM2 1.7B   | chatml   |
-| TinyLlama    | TinyLlama 1.1B | zephyr   |
+| Ministral    | Ministral 3B   | mistral  |
 
 See [distribution.md](docs/distribution.md) for details.
 
diff --git a/examples/chat/chat.mjs b/examples/chat/chat.mjs
index 396edb4..3256158 100644
--- a/examples/chat/chat.mjs
+++ b/examples/chat/chat.mjs
@@ -12,15 +12,15 @@
  * - Clear separation: sync produce, async commit
  */
 
-import * as readline from 'node:readline';
-import * as path from 'node:path';
-import { fileURLToPath } from 'node:url';
-import { createContext } from '../../lib/index.js';
+import * as readline from "node:readline";
+import * as path from "node:path";
+import { fileURLToPath } from "node:url";
+import { createContext } from "../../lib/index.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
   __dirname,
-  '../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf'
+  "../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf",
 );
 
 /**
@@ -40,7 +40,7 @@ async function main() {
   const modelPath = process.argv[2] || DEFAULT_MODEL;
 
   console.log(`Loading model: ${modelPath}`);
-  console.log('This may take a moment...\n');
+  console.log("This may take a moment...\n");
 
   const ctx = await createContext({
     modelPath,
@@ -48,37 +48,37 @@ async function main() {
     threads: 4,
   });
 
-  console.log('Model loaded! Type your message and press Enter.');
-  console.log('Commands: /clear to reset, /quit to exit\n');
+  console.log("Model loaded! Type your message and press Enter.");
+  console.log("Commands: /clear to reset, /quit to exit\n");
 
   const messages = [];
   let position = 0;
-  let lastPrompt = '';
+  let lastPrompt = "";
 
   const rl = readline.createInterface({
     input: process.stdin,
     output: process.stdout,
   });
 
-  const askUser = () => rl.question('> ', handleInput);
+  const askUser = () => rl.question("> ", handleInput);
 
   async function handleInput(input) {
     const trimmed = input.trim();
 
-    if (trimmed === '/quit' || trimmed === '/exit') {
-      console.log('Goodbye!');
+    if (trimmed === "/quit" || trimmed === "/exit") {
+      console.log("Goodbye!");
       ctx.dispose();
       rl.close();
       return;
     }
 
-    if (trimmed === '/clear') {
+    if (trimmed === "/clear") {
       await ctx.kvCacheClear();
       messages.length = 0;
       position = 0;
-      lastPrompt = '';
+      lastPrompt = "";
       console.clear();
-      console.log('Conversation cleared.\n');
+      console.log("Conversation cleared.\n");
       askUser();
       return;
     }
@@ -88,11 +88,11 @@ async function main() {
       return;
     }
 
-    messages.push({ role: 'user', content: trimmed });
+    messages.push({ role: "user", content: trimmed });
 
     // Format with chat template
     const { prompt: fullPrompt } = await ctx.formatChat(
-      JSON.stringify(messages)
+      JSON.stringify(messages),
     );
 
     // Prompt diffing - only tokenize new content
@@ -105,8 +105,8 @@ async function main() {
     position += tokens.length;
 
     // Generate: sync produce, async commit
-    process.stdout.write('< ');
-    let response = '';
+    process.stdout.write("< ");
+    let response = "";
 
     for (const { text, tokenId } of produceTokens(ctx, {
       temperature: 0.7,
@@ -120,9 +120,9 @@ async function main() {
       position += 1;
     }
 
-    console.log('\n');
+    console.log("\n");
 
-    messages.push({ role: 'assistant', content: response.trim() });
+    messages.push({ role: "assistant", content: response.trim() });
     lastPrompt = fullPrompt + response;
 
     askUser();
@@ -132,6 +132,6 @@ async function main() {
 }
 
 main().catch((err) => {
-  console.error('Error:', err.message);
+  console.error("Error:", err.message);
   process.exit(1);
 });
diff --git a/lib/index.d.ts b/lib/index.d.ts
index 8a3c797..a762547 100644
--- a/lib/index.d.ts
+++ b/lib/index.d.ts
@@ -110,12 +110,282 @@ export interface ContextOptions {
   nSeqMax?: number;
 }
 
+/**
+ * Chat format detected by the template engine
+ *
+ * Identifies how the model formats tool calls, reasoning blocks, and content.
+ * Returned by {@link SessionContext.formatChat | formatChat()} in
+ * {@link FormattedChatResult.format} and consumed by
+ * {@link SessionContext.parseChatOutput | parseChatOutput()}.
+ *
+ * You generally don't need to inspect these values directly --
+ * just pass them through from the formatChat result to parseChatOutput.
+ *
+ * Only commonly-used values are listed. The full set matches llama.cpp's
+ * `common_chat_format` enum (30+ formats).
+ */
+export enum ChatFormat {
+  /** Plain content, no special formatting */
+  CONTENT_ONLY = 0,
+  /** Generic tool call format */
+  GENERIC = 1,
+}
+
+/**
+ * Reasoning/thinking block format
+ *
+ * Numeric encoding of how `<think>` blocks are handled, as returned by formatChat and consumed by parseChatOutput.
+ *
+ * @see {@link FormatChatOptions.reasoningFormat} for the input side, which takes lowercase string names rather than this enum
+ * @see {@link ParseChatOutputOptions.reasoningFormat} for output-side usage
+ */
+export enum ReasoningFormat {
+  /** No reasoning extraction (default) */
+  NONE = 0,
+  /** Auto-detect reasoning format from model template */
+  AUTO = 1,
+  /** DeepSeek legacy format (`<think>...</think>` in content) */
+  DEEPSEEK_LEGACY = 2,
+  /** DeepSeek format (structured reasoning extraction) */
+  DEEPSEEK = 3,
+}
+
+/**
+ * Grammar trigger type
+ *
+ * Determines how lazy grammar activation is triggered during generation.
+ *
+ * @see {@link GrammarTrigger}
+ * @see {@link FormattedChatResult.grammarTriggers}
+ */
+export enum GrammarTriggerType {
+  /** Trigger on a specific token ID */
+  TOKEN = 0,
+  /** Trigger on a word boundary match */
+  WORD = 1,
+  /** Trigger on a regex pattern match */
+  PATTERN = 2,
+  /** Trigger on a full-string regex pattern match */
+  PATTERN_FULL = 3,
+}
+
+/**
+ * Options for chat template formatting
+ *
+ * Controls format-awareness fields passed to the chat template engine.
+ * All fields are optional -- sensible defaults are used when omitted.
+ *
+ * @example With tools and reasoning
+ * ```typescript
+ * const result = await ctx.formatChat(messagesJson, {
+ *   tools: JSON.stringify(tools),
+ *   toolChoice: 'auto',
+ *   reasoningFormat: 'auto',
+ * });
+ * ```
+ */
+export interface FormatChatOptions {
+  /** Custom Jinja2 template override (bypasses model's built-in template) */
+  templateOverride?: string;
+
+  /**
+   * JSON array of OpenAI-format tool definitions
+   *
+   * @example
+   * ```typescript
+   * const tools = [{ type: 'function', function: {
+   *   name: 'get_weather',
+   *   description: 'Get current weather',
+   *   parameters: { type: 'object', properties: { location: { type: 'string' } } }
+   * }}];
+   * options.tools = JSON.stringify(tools);
+   * ```
+   */
+  tools?: string;
+
+  /** Tool choice strategy (default: "auto") */
+  toolChoice?: 'auto' | 'required' | 'none';
+
+  /** Allow parallel tool calls (default: false) */
+  parallelToolCalls?: boolean;
+
+  /**
+   * Reasoning format (default: "none")
+   *
+   * Controls `<think>` block handling in the template.
+   * Use "auto" to let the model's template decide.
+   */
+  reasoningFormat?: 'none' | 'auto' | 'deepseek' | 'deepseek_legacy';
+
+  /** Enable `<think>` blocks (default: true). Pairs with reasoningFormat. */
+  enableThinking?: boolean;
+
+  /**
+   * JSON schema for constrained output. Converted to GBNF grammar internally.
+   * Mutually exclusive with `grammar`.
+   *
+   * @see {@link SessionContext.jsonSchemaToGrammar}
+   */
+  jsonSchema?: string;
+
+  /**
+   * Explicit GBNF grammar string for constrained generation.
+   * Mutually exclusive with `jsonSchema`.
+   *
+   * @see {@link SessionContext.createSampler}
+   */
+  grammar?: string;
+
+  /**
+   * Append assistant prompt prefix (default: true).
+   * Set false when formatting partial conversations or for
+   * non-generation use cases like template validation.
+   */
+  addGenerationPrompt?: boolean;
+}
+
+/**
+ * Grammar trigger from format-aware chat template
+ *
+ * Defines conditions for lazy grammar activation. When `grammarLazy` is true
+ * in {@link FormattedChatResult}, generation runs unconstrained until one of
+ * these triggers fires, at which point the grammar is activated.
+ */
+export interface GrammarTrigger {
+  /** Trigger type */
+  type: GrammarTriggerType;
+  /** Trigger value (token text, word, or regex pattern depending on type) */
+  value: string;
+  /** Token ID (for TOKEN-type triggers, -1 when not applicable) */
+  token: number;
+}
+
 /**
  * Result from chat template formatting
+ *
+ * Includes format-awareness fields for proper output parsing.
+ * Pass `format` and `reasoningFormat` directly to
+ * {@link SessionContext.parseChatOutput | parseChatOutput()} to decode
+ * the model's response.
+ *
+ * @example Roundtrip: format -> generate -> parse
+ * ```typescript
+ * const fmt = await ctx.formatChat(messagesJson, { tools: toolsJson });
+ * // ... generate tokens using fmt.prompt and fmt.grammar ...
+ * const parsed = ctx.parseChatOutput(output, fmt.format, {
+ *   reasoningFormat: fmt.reasoningFormat,
+ *   thinkingForcedOpen: fmt.thinkingForcedOpen,
+ *   parser: fmt.parser,
+ * });
+ * ```
+ *
+ * @see {@link SessionContext.parseChatOutput}
  */
 export interface FormattedChatResult {
+  /** Formatted prompt string ready for tokenization */
   prompt: string;
+  /** Additional stop strings from the template */
   stopTokens: string[];
+
+  /**
+   * Detected chat format (pass to parseChatOutput)
+   * @see {@link SessionContext.parseChatOutput}
+   */
+  format: ChatFormat;
+
+  /** Grammar string for constrained generation (empty if no tools/schema) */
+  grammar: string;
+  /** Whether grammar should be applied lazily (only after triggers fire) */
+  grammarLazy: boolean;
+  /** Whether the thinking tag was forced open by the template */
+  thinkingForcedOpen: boolean;
+
+  /**
+   * Reasoning format (pass to parseChatOutput options)
+   * @see {@link ParseChatOutputOptions.reasoningFormat}
+   */
+  reasoningFormat: ReasoningFormat;
+
+  /** PEG parser definition for PEG format models (pass to parseChatOutput options) */
+  parser: string;
+  /** Grammar trigger conditions for lazy grammar activation */
+  grammarTriggers: GrammarTrigger[];
+  /** Token strings preserved from grammar masking */
+  preservedTokens: string[];
+}
+
+/**
+ * Options for parsing chat output
+ *
+ * All fields are optional. For correct parsing, pass through the corresponding
+ * fields from {@link FormattedChatResult}.
+ *
+ * @see {@link FormattedChatResult}
+ */
+export interface ParseChatOutputOptions {
+  /**
+   * Reasoning format (from {@link FormattedChatResult.reasoningFormat})
+   */
+  reasoningFormat?: ReasoningFormat;
+
+  /**
+   * True if output is incomplete (streaming).
+   * When true, the parser tolerates unterminated tool calls and open
+   * thinking blocks, returning partial content as-is rather than
+   * treating them as parse errors.
+   */
+  isPartial?: boolean;
+
+  /** Whether thinking tag was forced open (from {@link FormattedChatResult.thinkingForcedOpen}) */
+  thinkingForcedOpen?: boolean;
+
+  /** PEG parser definition for PEG format models (from {@link FormattedChatResult.parser}) */
+  parser?: string;
+}
+
+/**
+ * A tool call extracted from model output
+ *
+ * @example
+ * ```typescript
+ * for (const tc of result.toolCalls) {
+ *   const args = JSON.parse(tc.arguments);
+ *   await executeTool(tc.name, args);
+ * }
+ * ```
+ */
+export interface ParsedToolCall {
+  /** Tool/function name */
+  name: string;
+  /** JSON string of arguments */
+  arguments: string;
+  /** Tool call ID (may be empty depending on model format) */
+  id: string;
+}
+
+/**
+ * Result from parsing chat output
+ *
+ * @example
+ * ```typescript
+ * const result = ctx.parseChatOutput(output, fmt.format);
+ * if (result.toolCalls.length > 0) {
+ *   for (const tc of result.toolCalls) {
+ *     const args = JSON.parse(tc.arguments);
+ *     await executeTool(tc.name, args);
+ *   }
+ * } else {
+ *   console.log(result.content);
+ * }
+ * ```
+ */
+export interface ParseChatOutputResult {
+  /** Main response text */
+  content: string;
+  /** Extracted thinking/reasoning content (empty if none) */
+  reasoningContent: string;
+  /** Extracted tool calls (empty array if none) */
+  toolCalls: ParsedToolCall[];
 }
 
 /**
@@ -1074,15 +1344,21 @@ export interface SessionContext {
   /**
    * Format messages using model's chat template
    *
-   * Converts [{role, content}] → formatted prompt string.
+   * Converts [{role, content}] → formatted prompt string with full format awareness.
    * Uses model's built-in template (ChatML, Llama, Mistral, etc.).
    *
+   * The returned `format` and `reasoningFormat` fields should be passed to
+   * `parseChatOutput()` after generation to correctly decode the response.
+   *
    * Cost: ~1-5ms depending on message count
    *
    * @param messagesJson JSON string containing array of messages
-   * @param templateOverride Optional custom template string
-   * @returns Formatted prompt and stop tokens from template
-   * @example
+   * @param options Formatting options (tools, reasoning, grammar, etc.)
+   * @returns Formatted prompt with format-awareness metadata
+   *
+   * @see {@link parseChatOutput}
+   *
+   * @example Basic usage
    * ```typescript
    * const result = await ctx.formatChat(JSON.stringify([
    *   { role: "system", content: "You are a helpful assistant" },
@@ -1092,12 +1368,67 @@ export interface SessionContext {
    * const tokens = await ctx.tokenize(result.prompt);
    * await ctx.decode(tokens, 0);
    * ```
+   *
+   * @example With tools
+   * ```typescript
+   * const tools = [{ type: 'function', function: {
+   *   name: 'get_weather', description: 'Get weather',
+   *   parameters: { type: 'object', properties: { location: { type: 'string' } } }
+   * }}];
+   * const result = await ctx.formatChat(JSON.stringify(messages), {
+   *   tools: JSON.stringify(tools),
+   *   toolChoice: 'auto'
+   * });
+   * // result.grammar contains GBNF for constrained tool call generation
+   * // result.format identifies the chat format for output parsing
+   * ```
+   *
+   * @example Backward compatible (string as second arg)
+   * ```typescript
+   * const result = await ctx.formatChat(messagesJson, templateOverrideString);
+   * ```
    */
   formatChat(
     messagesJson: string,
-    templateOverride?: string
+    options?: FormatChatOptions | string
   ): Promise<FormattedChatResult>;
 
+  /**
+   * Parse model output into structured content
+   *
+   * Extracts plain text, reasoning/thinking blocks, and tool calls from
+   * raw model output. Uses the format detected by {@link formatChat} to apply
+   * the correct parser for the model's output format.
+   *
+   * Cost: <0.1ms (synchronous string parsing, no I/O)
+   *
+   * @param output Raw model output text
+   * @param format Chat format enum (from {@link FormattedChatResult.format})
+   * @param options Optional parsing parameters; fields with an unexpected type are ignored
+   * @returns Parsed content with tool calls and reasoning
+   * @throws {TypeError} If `output` is not a string or `format` is not a number
+   * @see {@link formatChat}
+   *
+   * @example Basic parsing
+   * ```typescript
+   * const fmt = await ctx.formatChat(JSON.stringify(messages), { tools: toolsJson });
+   * // ... generate tokens ...
+   * const parsed = ctx.parseChatOutput(generatedText, fmt.format, {
+   *   reasoningFormat: fmt.reasoningFormat,
+   *   thinkingForcedOpen: fmt.thinkingForcedOpen,
+   *   parser: fmt.parser
+   * });
+   * if (parsed.toolCalls.length > 0) {
+   *   // Handle tool calls
+   * }
+   * ```
+   */
+  parseChatOutput(
+    output: string,
+    format: ChatFormat,
+    options?: ParseChatOutputOptions
+  ): ParseChatOutputResult;
+
   /**
    * Convert JSON schema to GBNF grammar
    *
@@ -1307,6 +1638,12 @@ export interface SessionContext {
 
   /** @internal Reseed branch sampler PRNG for diversity after fork */
   _branchSamplerChainReseed(handle: number, seed: number): void;
+
+  /** @internal Set dynamic logit biases for a branch */
+  _branchSteer(handle: number, biases: Array<{ token: number; bias: number }>): void;
+
+  /** @internal Clear all dynamic logit biases from a branch */
+  _branchClearSteer(handle: number): void;
 }
 
 /**
diff --git a/liblloyal b/liblloyal
index 9c8fc25..158bb8e 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit 9c8fc25bebcdc66a3ff74061b266cf24de51d81f
+Subproject commit 158bb8ed600121fe2b8f0ec1fd3646729258da0c
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index 1992caf..65bb323 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -6,7 +6,8 @@
 #include <lloyal/tokenizer.hpp>
 #include <lloyal/common.hpp>
 #include <lloyal/model_registry.hpp>
-#include <lloyal/chat_template.hpp>
+#include <lloyal/chat_in.hpp>
+#include <lloyal/chat_out.hpp>
 #include <lloyal/grammar.hpp>
 #include <lloyal/kv.hpp>
 #include <lloyal/embedding.hpp>
@@ -510,27 +511,21 @@ class DetokenizeWorker : public Napi::AsyncWorker {
 class FormatChatWorker : public Napi::AsyncWorker {
 public:
   FormatChatWorker(Napi::Env env, std::shared_ptr<llama_model> model,
-                   const std::string& messagesJson, const std::string& templateOverride)
-    : AsyncWorker(env), _deferred(env), _model(model),
-      _messagesJson(messagesJson), _templateOverride(templateOverride) {}
+                   const lloyal::chat_in::FormatInputs& inputs)
+    : AsyncWorker(env), _deferred(env), _model(model), _inputs(inputs) {}
 
   void Execute() override {
     try {
-      // Use lloyal::chat_template::format() from liblloyal
-      lloyal::chat_template::FormatResult result = lloyal::chat_template::format(
-        _model.get(),
-        _messagesJson,
-        _templateOverride
+      lloyal::chat_in::FormatResult result = lloyal::chat_in::format(
+        _model.get(), _inputs
       );
 
-      // Check if formatting failed completely
       if (result.prompt.empty()) {
         SetError("Chat template formatting failed");
         return;
       }
 
-      _resultPrompt = result.prompt;
-      _resultStopTokens = result.additional_stops;
+      _result = result;
     } catch (const std::exception& e) {
       SetError(e.what());
     }
@@ -539,17 +534,42 @@ class FormatChatWorker : public Napi::AsyncWorker {
   void OnOK() override {
     Napi::Env env = Env();
 
-    // Create result object { prompt: string, stopTokens: string[] }
     Napi::Object result = Napi::Object::New(env);
-    result.Set("prompt", Napi::String::New(env, _resultPrompt));
+    result.Set("prompt", Napi::String::New(env, _result.prompt));
 
-    // Convert stopTokens vector to JS array
-    Napi::Array stopTokens = Napi::Array::New(env, _resultStopTokens.size());
-    for (size_t i = 0; i < _resultStopTokens.size(); i++) {
-      stopTokens[i] = Napi::String::New(env, _resultStopTokens[i]);
+    // stopTokens (backward compat)
+    Napi::Array stopTokens = Napi::Array::New(env, _result.additional_stops.size());
+    for (size_t i = 0; i < _result.additional_stops.size(); i++) {
+      stopTokens[i] = Napi::String::New(env, _result.additional_stops[i]);
     }
     result.Set("stopTokens", stopTokens);
 
+    // Format awareness fields
+    result.Set("format", Napi::Number::New(env, static_cast<double>(_result.format)));
+    result.Set("grammar", Napi::String::New(env, _result.grammar));
+    result.Set("grammarLazy", Napi::Boolean::New(env, _result.grammar_lazy));
+    result.Set("thinkingForcedOpen", Napi::Boolean::New(env, _result.thinking_forced_open));
+    result.Set("reasoningFormat", Napi::Number::New(env, static_cast<double>(_result.reasoning_format)));
+    result.Set("parser", Napi::String::New(env, _result.parser));
+
+    // grammarTriggers: Array<{ type: number, value: string, token: number }>
+    Napi::Array triggers = Napi::Array::New(env, _result.grammar_triggers.size());
+    for (size_t i = 0; i < _result.grammar_triggers.size(); i++) {
+      Napi::Object trigger = Napi::Object::New(env);
+      trigger.Set("type", Napi::Number::New(env, static_cast<double>(_result.grammar_triggers[i].type)));
+      trigger.Set("value", Napi::String::New(env, _result.grammar_triggers[i].value));
+      trigger.Set("token", Napi::Number::New(env, static_cast<double>(_result.grammar_triggers[i].token)));
+      triggers[i] = trigger;
+    }
+    result.Set("grammarTriggers", triggers);
+
+    // preservedTokens: string[]
+    Napi::Array preserved = Napi::Array::New(env, _result.preserved_tokens.size());
+    for (size_t i = 0; i < _result.preserved_tokens.size(); i++) {
+      preserved[i] = Napi::String::New(env, _result.preserved_tokens[i]);
+    }
+    result.Set("preservedTokens", preserved);
+
     _deferred.Resolve(result);
   }
 
@@ -562,10 +582,8 @@ class FormatChatWorker : public Napi::AsyncWorker {
 private:
   Napi::Promise::Deferred _deferred;
   std::shared_ptr<llama_model> _model;
-  std::string _messagesJson;
-  std::string _templateOverride;
-  std::string _resultPrompt;
-  std::vector<std::string> _resultStopTokens;
+  lloyal::chat_in::FormatInputs _inputs;
+  lloyal::chat_in::FormatResult _result;
 };
 
 // ===== SESSIONCONTEXT IMPLEMENTATION =====
@@ -612,6 +630,7 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
 
     // ===== HELPERS =====
     InstanceMethod("formatChat", &SessionContext::formatChat),
+    InstanceMethod("parseChatOutput", &SessionContext::parseChatOutput),
     InstanceMethod("jsonSchemaToGrammar", &SessionContext::jsonSchemaToGrammar),
     InstanceMethod("validateChatTemplate", &SessionContext::validateChatTemplate),
 
@@ -1125,7 +1144,7 @@ Napi::Value SessionContext::getTurnSeparator(const Napi::CallbackInfo& info) {
 
   // Compute once, cache thereafter
   if (!_turnSeparatorCached) {
-    _turnSeparatorCache = lloyal::chat_template::get_turn_separator(_model.get());
+    _turnSeparatorCache = lloyal::chat_in::get_turn_separator(_model.get());
     _turnSeparatorCached = true;
   }
 
@@ -1141,18 +1160,51 @@ Napi::Value SessionContext::formatChat(const Napi::CallbackInfo& info) {
   ensureNotDisposed();
 
   if (info.Length() < 1 || !info[0].IsString()) {
-    throw Napi::TypeError::New(env, "Expected (messagesJson: string[, templateOverride: string])");
+    throw Napi::TypeError::New(env, "Expected (messagesJson: string[, options: object])");
   }
 
-  std::string messagesJson = info[0].As<Napi::String>().Utf8Value();
-  std::string templateOverride = "";
+  lloyal::chat_in::FormatInputs inputs;
+  inputs.messages_json = info[0].As<Napi::String>().Utf8Value();
 
-  if (info.Length() >= 2 && info[1].IsString()) {
-    templateOverride = info[1].As<Napi::String>().Utf8Value();
+  // Second argument: options object (or string for backward compat)
+  if (info.Length() >= 2) {
+    if (info[1].IsString()) {
+      // Backward compat: formatChat(messagesJson, templateOverride)
+      inputs.template_override = info[1].As<Napi::String>().Utf8Value();
+    } else if (info[1].IsObject()) {
+      Napi::Object opts = info[1].As<Napi::Object>();
+
+      if (opts.Has("templateOverride") && opts.Get("templateOverride").IsString()) {
+        inputs.template_override = opts.Get("templateOverride").As<Napi::String>().Utf8Value();
+      }
+      if (opts.Has("tools") && opts.Get("tools").IsString()) {
+        inputs.tools_json = opts.Get("tools").As<Napi::String>().Utf8Value();
+      }
+      if (opts.Has("toolChoice") && opts.Get("toolChoice").IsString()) {
+        inputs.tool_choice = opts.Get("toolChoice").As<Napi::String>().Utf8Value();
+      }
+      if (opts.Has("parallelToolCalls") && opts.Get("parallelToolCalls").IsBoolean()) {
+        inputs.parallel_tool_calls = opts.Get("parallelToolCalls").As<Napi::Boolean>().Value();
+      }
+      if (opts.Has("reasoningFormat") && opts.Get("reasoningFormat").IsString()) {
+        inputs.reasoning_format = opts.Get("reasoningFormat").As<Napi::String>().Utf8Value();
+      }
+      if (opts.Has("enableThinking") && opts.Get("enableThinking").IsBoolean()) {
+        inputs.enable_thinking = opts.Get("enableThinking").As<Napi::Boolean>().Value();
+      }
+      if (opts.Has("jsonSchema") && opts.Get("jsonSchema").IsString()) {
+        inputs.json_schema = opts.Get("jsonSchema").As<Napi::String>().Utf8Value();
+      }
+      if (opts.Has("grammar") && opts.Get("grammar").IsString()) {
+        inputs.grammar = opts.Get("grammar").As<Napi::String>().Utf8Value();
+      }
+      if (opts.Has("addGenerationPrompt") && opts.Get("addGenerationPrompt").IsBoolean()) {
+        inputs.add_generation_prompt = opts.Get("addGenerationPrompt").As<Napi::Boolean>().Value();
+      }
+    }
   }
 
-  // Run async
-  auto* worker = new FormatChatWorker(env, _model, messagesJson, templateOverride);
+  auto* worker = new FormatChatWorker(env, _model, inputs);
   worker->Queue();
   return worker->GetPromise();
 }
@@ -1742,9 +1794,9 @@ Napi::Value SessionContext::validateChatTemplate(const Napi::CallbackInfo& info)
       : AsyncWorker(env), _deferred(env), _templateString(templateStr) {}
 
     void Execute() override {
-      // Use lloyal::chat_template from liblloyal (handles error logging)
+      // Use lloyal::chat_in from liblloyal (handles error logging)
       // Pattern matches HybridSessionContext.cpp:365-372
-      _result = lloyal::chat_template::validate(_templateString);
+      _result = lloyal::chat_in::validate(_templateString);
     }
 
     void OnOK() override {
@@ -1768,6 +1820,66 @@ Napi::Value SessionContext::validateChatTemplate(const Napi::CallbackInfo& info)
   return worker->GetPromise();
 }
 
+// ===== CHAT OUTPUT PARSING =====
+
+// JS: parseChatOutput(output: string, format: number, options?: object). Returns
+// { content, reasoningContent, toolCalls: [{ name, arguments, id }] }; wrong-typed option fields are ignored.
+Napi::Value SessionContext::parseChatOutput(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+  if (info.Length() < 2 || !info[0].IsString() || !info[1].IsNumber()) {
+    throw Napi::TypeError::New(env, "Expected (output: string, format: number[, options: object])");
+  }
+
+  std::string output = info[0].As<Napi::String>().Utf8Value();
+  auto format = static_cast<common_chat_format>(info[1].As<Napi::Number>().Int32Value());
+
+  // Defaults used when options, or an individual field, is absent
+  auto reasoning_format = COMMON_REASONING_FORMAT_NONE;
+  bool is_partial = false;
+  bool thinking_forced_open = false;
+  std::string parser_data;
+
+  if (info.Length() >= 3 && info[2].IsObject()) {
+    Napi::Object opts = info[2].As<Napi::Object>();
+    if (opts.Has("reasoningFormat") && opts.Get("reasoningFormat").IsNumber()) {
+      reasoning_format = static_cast<common_reasoning_format>(opts.Get("reasoningFormat").As<Napi::Number>().Int32Value());
+    }
+    if (opts.Has("isPartial") && opts.Get("isPartial").IsBoolean()) {
+      is_partial = opts.Get("isPartial").As<Napi::Boolean>().Value();
+    }
+    if (opts.Has("thinkingForcedOpen") && opts.Get("thinkingForcedOpen").IsBoolean()) {
+      thinking_forced_open = opts.Get("thinkingForcedOpen").As<Napi::Boolean>().Value();
+    }
+    if (opts.Has("parser") && opts.Get("parser").IsString()) {
+      parser_data = opts.Get("parser").As<Napi::String>().Utf8Value();
+    }
+  }
+
+  // Surface C++ exceptions as JS errors (as FormatChatWorker::Execute does) rather than letting them escape.
+  auto result = [&] {
+    try {
+      return lloyal::chat_out::parse(output, format, reasoning_format, is_partial, thinking_forced_open, parser_data);
+    } catch (const std::exception& e) {
+      throw Napi::Error::New(env, e.what());
+    }
+  }();
+
+  Napi::Object obj = Napi::Object::New(env);
+  obj.Set("content", Napi::String::New(env, result.content));
+  obj.Set("reasoningContent", Napi::String::New(env, result.reasoning_content));
+  Napi::Array toolCalls = Napi::Array::New(env, result.tool_calls.size());
+  for (size_t i = 0; i < result.tool_calls.size(); i++) {
+    Napi::Object tc = Napi::Object::New(env);
+    tc.Set("name", Napi::String::New(env, result.tool_calls[i].name));
+    tc.Set("arguments", Napi::String::New(env, result.tool_calls[i].arguments));
+    tc.Set("id", Napi::String::New(env, result.tool_calls[i].id));
+    toolCalls[i] = tc;
+  }
+  obj.Set("toolCalls", toolCalls);
+  return obj;
+}
+
 // ===== KV CACHE OPERATIONS =====
 // Pattern matches HybridSessionContext.cpp:550-642
 
diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp
index f729ed2..b42ca1d 100644
--- a/src/SessionContext.hpp
+++ b/src/SessionContext.hpp
@@ -142,6 +142,7 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
    * Returns: Promise<{ prompt: string, stopTokens: string[] }>
    */
   Napi::Value formatChat(const Napi::CallbackInfo& info);
+  Napi::Value parseChatOutput(const Napi::CallbackInfo& info);
 
   /**
    * Get current KV cache position (number of tokens in cache)
diff --git a/test/integration.js b/test/integration.js
index b8d50e5..b02a222 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -381,6 +381,260 @@ async function testBranchPrefill() {
   }
 }
 
+// ═══════════════════════════════════════════════════════════════════════════
+// WARM vs COLD PARITY - Semantic proof that warm continuation == cold start
+// ═══════════════════════════════════════════════════════════════════════════
+
+async function testWarmColdParity() {
+  console.log('\n--- Warm vs Cold Parity ---');
+
+  const GEN_TOKENS = 10;
+  const userMessages = [
+    "What is the capital of France?",
+    " Tell me more about it."
+  ];
+
+  // === WARM PATH: decode turn 1, prefill turn 2 delta, generate ===
+  const warmCtx = await addon.createContext({
+    modelPath: MODEL_PATH,
+    nCtx: 2048,
+    nBatch: 512,
+    nThreads: 4
+  });
+
+  let assistantContent;
+  let warmGen2;
+
+  try {
+    // Turn 1: format, decode, generate
+    const msgs1 = [{ role: 'user', content: userMessages[0] }];
+    const { prompt: prompt1 } = await warmCtx.formatChat(JSON.stringify(msgs1));
+    const toks1 = await warmCtx.tokenize(prompt1);
+    await warmCtx.decode(toks1, 0, 0);
+
+    const branch = Branch.create(warmCtx, 0, toks1.length, { temperature: 0 });
+    branch.captureLogits();
+
+    const gen1 = [];
+    for (let i = 0; i < GEN_TOKENS; i++) {
+      const { token, isStop } = branch.produce();
+      if (isStop) break;
+      branch.commit(token);
+      gen1.push(token);
+    }
+
+    assistantContent = await warmCtx.detokenize(gen1);
+    const lastText = prompt1 + assistantContent;
+
+    // Turn 2: prefill delta, generate
+    const msgs2 = [
+      { role: 'user', content: userMessages[0] },
+      { role: 'assistant', content: assistantContent },
+      { role: 'user', content: userMessages[1] }
+    ];
+    const { prompt: fullPrompt2 } = await warmCtx.formatChat(JSON.stringify(msgs2));
+    const delta = fullPrompt2.slice(lastText.length);
+    const deltaToks = await warmCtx.tokenize(delta);
+    branch.prefill(deltaToks);
+
+    warmGen2 = [];
+    for (let i = 0; i < GEN_TOKENS; i++) {
+      const { token, isStop } = branch.produce();
+      if (isStop) break;
+      branch.commit(token);
+      warmGen2.push(token);
+    }
+
+    branch.prune();
+  } finally {
+    warmCtx.dispose();
+  }
+
+  // === COLD PATH: decode full 2-turn conversation from scratch, generate ===
+  const coldCtx = await addon.createContext({
+    modelPath: MODEL_PATH,
+    nCtx: 2048,
+    nBatch: 512,
+    nThreads: 4
+  });
+
+  let coldGen2;
+
+  try {
+    const msgs = [
+      { role: 'user', content: userMessages[0] },
+      { role: 'assistant', content: assistantContent },
+      { role: 'user', content: userMessages[1] }
+    ];
+    const { prompt: coldPrompt } = await coldCtx.formatChat(JSON.stringify(msgs));
+    const coldToks = await coldCtx.tokenize(coldPrompt);
+    await coldCtx.decode(coldToks, 0, 0);
+
+    const branch = Branch.create(coldCtx, 0, coldToks.length, { temperature: 0 });
+    branch.captureLogits();
+
+    coldGen2 = [];
+    for (let i = 0; i < GEN_TOKENS; i++) {
+      const { token, isStop } = branch.produce();
+      if (isStop) break;
+      branch.commit(token);
+      coldGen2.push(token);
+    }
+
+    branch.prune();
+  } finally {
+    coldCtx.dispose();
+  }
+
+  // === COMPARE ===
+  const warmStr = warmGen2.join(',');
+  const coldStr = coldGen2.join(',');
+  assert(warmStr === coldStr,
+    `Warm==Cold parity: ${warmGen2.length} tokens match`);
+
+  if (warmStr !== coldStr) {
+    // Diagnostic: show first divergence point
+    for (let i = 0; i < Math.max(warmGen2.length, coldGen2.length); i++) {
+      if (warmGen2[i] !== coldGen2[i]) {
+        console.log(`  First divergence at position ${i}: warm=${warmGen2[i]} cold=${coldGen2[i]}`);
+        break;
+      }
+    }
+  }
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// WARM CONTINUATION SEMANTIC RECALL - Proves context survives delta-only prefill
+// ═══════════════════════════════════════════════════════════════════════════
+
+async function testWarmSemanticRecall() {
+  if (!EMBED_MODEL_PATH) {
+    console.log('\n--- Warm Semantic Recall (SKIPPED - no LLAMA_EMBED_MODEL) ---');
+    return;
+  }
+
+  console.log('\n--- Warm Semantic Recall ---');
+
+  const GEN_TOKENS = 40;
+
+  // Helper: cosine similarity over a.length components (yields NaN if either vector has zero norm)
+  function cosine(a, b) {
+    let dot = 0, na = 0, nb = 0;
+    for (let i = 0; i < a.length; i++) {
+      dot += a[i] * b[i];
+      na += a[i] * a[i];
+      nb += b[i] * b[i];
+    }
+    return dot / (Math.sqrt(na) * Math.sqrt(nb));
+  }
+
+  // Phase 1: Generate multi-turn conversation via warm continuation
+  let recallText;
+  {
+    const ctx = await addon.createContext({
+      modelPath: MODEL_PATH,
+      nCtx: 2048,
+      nBatch: 512,
+      nThreads: 4
+    });
+
+    try {
+      // Helper: warm-continue one turn (prefill delta, generate); appends both turns to `messages`, uses outer `branch`
+      async function warmTurn(messages, lastText, userContent) {
+        messages.push({ role: 'user', content: userContent });
+        const { prompt: fullPrompt } = await ctx.formatChat(JSON.stringify(messages));
+        const delta = fullPrompt.slice(lastText.length);
+        const deltaToks = await ctx.tokenize(delta);
+        branch.prefill(deltaToks);
+
+        const gen = [];
+        for (let i = 0; i < GEN_TOKENS; i++) {
+          const { token, isStop } = branch.produce();
+          if (isStop) break;
+          branch.commit(token);
+          gen.push(token);
+        }
+        const assistantText = await ctx.detokenize(gen);
+        messages.push({ role: 'assistant', content: assistantText });
+        return { text: assistantText, lastText: fullPrompt + assistantText };
+      }
+
+      // Turn 1: Plant a specific, recallable fact
+      const messages = [{ role: 'user', content: 'Remember this: my dog is named Max.' }];
+      const { prompt } = await ctx.formatChat(JSON.stringify(messages));
+      const promptToks = await ctx.tokenize(prompt);
+      await ctx.decode(promptToks, 0, 0);
+
+      var branch = Branch.create(ctx, 0, promptToks.length, { temperature: 0 });  // warmTurn() above closes over this binding; it is only called after this line
+      branch.captureLogits();
+
+      // Generate turn 1 response
+      const gen = [];
+      for (let i = 0; i < GEN_TOKENS; i++) {
+        const { token, isStop } = branch.produce();
+        if (isStop) break;
+        branch.commit(token);
+        gen.push(token);
+      }
+      const assistantText = await ctx.detokenize(gen);
+      messages.push({ role: 'assistant', content: assistantText });
+      let lastText = prompt + assistantText;
+
+      // Turn 2: Distractor
+      let turn;
+      turn = await warmTurn(messages, lastText, 'What is 2 + 2?');
+      lastText = turn.lastText;
+
+      // Turn 3: Another distractor
+      turn = await warmTurn(messages, lastText, 'Name three colors.');
+      lastText = turn.lastText;
+
+      // Turn 4: Recall — only answerable from turn 1 context
+      turn = await warmTurn(messages, lastText, 'What is my dog\'s name?');
+      recallText = turn.text;
+
+      branch.prune();
+    } finally {
+      ctx.dispose();
+    }
+  }
+
+  // Phase 2: Score via embedding similarity (chat context disposed above; a separate embedding context is created)
+  {
+    const embedCtx = await addon.createContext({
+      modelPath: EMBED_MODEL_PATH,
+      nCtx: 512,
+      nBatch: 512,
+      nThreads: 4,
+      embeddings: true,
+      poolingType: 1  // MEAN
+    });
+
+    try {
+      async function embed(text) {
+        const tokens = await embedCtx.tokenize(text);
+        await embedCtx.kvCacheClear();
+        await embedCtx.encode(tokens);
+        return embedCtx.getEmbeddings(true);
+      }
+
+      console.log(`  Recall response: "${recallText.trim().slice(0, 120)}"`);
+
+      const embResponse = await embed(recallText);
+      const embCorrect = await embed('The dog is named Max.');
+      const embWrong = await embed('Red, blue, and green are three colors.');
+
+      const simCorrect = cosine(embResponse, embCorrect);
+      const simWrong = cosine(embResponse, embWrong);
+
+      assert(simCorrect > simWrong,
+        `Semantic recall: correct=${simCorrect.toFixed(3)} > wrong=${simWrong.toFixed(3)}`);
+    } finally {
+      embedCtx.dispose();
+    }
+  }
+}
+
 // ═══════════════════════════════════════════════════════════════════════════
 // BRANCH STEER TESTS - Dynamic per-sample logit manipulation
 // ═══════════════════════════════════════════════════════════════════════════
@@ -726,6 +980,62 @@ async function testDecodeAndCapture() {
 // MAIN
 // ═══════════════════════════════════════════════════════════════════════════
 
+async function testChatInOut(ctx) {
+  console.log('\n── chat_in / chat_out ──');
+
+  // formatChat with empty options object (new signature)
+  const messages = [{ role: 'user', content: 'Hello' }];
+  const result = await ctx.formatChat(JSON.stringify(messages), {});
+  assert(result.prompt.includes('Hello'), 'formatChat with options: prompt contains Hello');
+  assert(typeof result.format === 'number', 'formatChat returns format as number');
+  assert(typeof result.grammar === 'string', 'formatChat returns grammar as string');
+  assert(typeof result.grammarLazy === 'boolean', 'formatChat returns grammarLazy');
+  assert(typeof result.thinkingForcedOpen === 'boolean', 'formatChat returns thinkingForcedOpen');
+  assert(typeof result.reasoningFormat === 'number', 'formatChat returns reasoningFormat');
+  assert(Array.isArray(result.grammarTriggers), 'formatChat returns grammarTriggers array');
+  assert(Array.isArray(result.preservedTokens), 'formatChat returns preservedTokens array');
+  ok('formatChat with options returns extended result');
+
+  // Backward compat: calling with no second argument still works
+  const backCompat = await ctx.formatChat(JSON.stringify(messages));
+  assert(backCompat.prompt.includes('Hello'), 'formatChat backward compat works');
+  ok('formatChat backward compat (no second arg)');
+
+  // formatChat with tools
+  const tools = [{
+    type: 'function',
+    function: {
+      name: 'get_weather',
+      description: 'Get weather',
+      parameters: { type: 'object', properties: { location: { type: 'string' } } }
+    }
+  }];
+  const toolResult = await ctx.formatChat(JSON.stringify(messages), {
+    tools: JSON.stringify(tools),
+    toolChoice: 'auto'
+  });
+  assert(typeof toolResult.format === 'number', 'formatChat with tools returns format');
+  assert(typeof toolResult.grammar === 'string', 'formatChat with tools returns grammar');
+  ok('formatChat with tools');
+
+  // parseChatOutput
+  const parsed = ctx.parseChatOutput('Hello world', toolResult.format);
+  assert(typeof parsed.content === 'string', 'parseChatOutput returns content');
+  assert(parsed.content.includes('Hello'), 'parseChatOutput content contains Hello');
+  assert(typeof parsed.reasoningContent === 'string', 'parseChatOutput returns reasoningContent');
+  assert(Array.isArray(parsed.toolCalls), 'parseChatOutput returns toolCalls array');
+  ok('parseChatOutput basic');
+
+  // parseChatOutput with options
+  const parsedWithOpts = ctx.parseChatOutput('Some output', toolResult.format, {
+    reasoningFormat: toolResult.reasoningFormat,
+    isPartial: false,
+    thinkingForcedOpen: false
+  });
+  assert(typeof parsedWithOpts.content === 'string', 'parseChatOutput with options');
+  ok('parseChatOutput with options');
+}
+
 async function main() {
   let mainCtx = null;
 
@@ -743,11 +1053,14 @@ async function main() {
     await testKVCache(mainCtx);
     await testMetrics(mainCtx);
     await testTokenizer(mainCtx);
+    await testChatInOut(mainCtx);
 
     // Tests that create their own contexts
     await testMultiSequence();
     await testGrammar();
     await testBranchPrefill();
+    await testWarmColdParity();
+    await testWarmSemanticRecall();
     await testBranchSteer();
     await testNBatchAblation();
     await testDeterminism();
diff --git a/test/matrix.json b/test/matrix.json
index 96e7b96..a22f153 100644
--- a/test/matrix.json
+++ b/test/matrix.json
@@ -10,10 +10,10 @@
       "default": true
     },
     {
-      "name": "TinyLlama",
-      "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-      "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-      "template": "zephyr"
+      "name": "Ministral",
+      "file": "Ministral-3-3B-Instruct-2512-Q4_K_M.gguf",
+      "url": "https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf",
+      "template": "mistral"
     },
     {
       "name": "Llama-3.2",

From 7c1ba43f3564e2abd9fdcfd33ee71469c363cc29 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Wed, 11 Feb 2026 09:32:01 +1100
Subject: [PATCH 02/13] feat(chat): new chat api

---
 examples/chat/chat.mjs |  79 +++++++++++++----------------
 lib/index.d.ts         |  42 ++++++++++++++++
 liblloyal              |   2 +-
 src/SessionContext.cpp |  29 +++++++++++
 src/SessionContext.hpp |  11 +++++
 test/integration.js    | 110 +++++++++++++++++++++++++++++++----------
 6 files changed, 203 insertions(+), 70 deletions(-)

diff --git a/examples/chat/chat.mjs b/examples/chat/chat.mjs
index 3256158..8a22f8b 100644
--- a/examples/chat/chat.mjs
+++ b/examples/chat/chat.mjs
@@ -7,15 +7,15 @@
  *   node chat.mjs  # uses default model path
  *
  * This example demonstrates:
- * - Sync generator for token production (sample, check stop, convert to text)
- * - Async commit via decode() to update KV cache
- * - Clear separation: sync produce, async commit
+ * - Branch API for token generation (produce/commit two-phase)
+ * - Warm multi-turn continuation via getWarmTurnTokens() + branch.prefill()
+ * - Cold/warm routing: formatChat() on first turn, probe-based prefill on subsequent turns
  */
 
 import * as readline from "node:readline";
 import * as path from "node:path";
 import { fileURLToPath } from "node:url";
-import { createContext } from "../../lib/index.js";
+import { createContext, Branch } from "../../lib/index.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
@@ -23,19 +23,6 @@ const DEFAULT_MODEL = path.resolve(
   "../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf",
 );
 
-/**
- * Sync generator - produces tokens until stop token.
- * All operations are synchronous: sample, isStopToken, tokenToText.
- */
-function* produceTokens(ctx, params) {
-  while (true) {
-    const tokenId = ctx.sample(params);
-    if (ctx.isStopToken(tokenId)) return;
-    const text = ctx.tokenToText(tokenId);
-    yield { text, tokenId };
-  }
-}
-
 async function main() {
   const modelPath = process.argv[2] || DEFAULT_MODEL;
 
@@ -52,8 +39,8 @@ async function main() {
   console.log("Commands: /clear to reset, /quit to exit\n");
 
   const messages = [];
-  let position = 0;
-  let lastPrompt = "";
+  let branch = null;
+  const warm = ctx.getWarmTurnTokens();
 
   const rl = readline.createInterface({
     input: process.stdin,
@@ -67,16 +54,17 @@ async function main() {
 
     if (trimmed === "/quit" || trimmed === "/exit") {
       console.log("Goodbye!");
+      if (branch) branch.prune();
       ctx.dispose();
       rl.close();
       return;
     }
 
     if (trimmed === "/clear") {
+      if (branch) branch.prune();
+      branch = null;
       await ctx.kvCacheClear();
       messages.length = 0;
-      position = 0;
-      lastPrompt = "";
       console.clear();
       console.log("Conversation cleared.\n");
       askUser();
@@ -90,40 +78,43 @@ async function main() {
 
     messages.push({ role: "user", content: trimmed });
 
-    // Format with chat template
-    const { prompt: fullPrompt } = await ctx.formatChat(
-      JSON.stringify(messages),
-    );
-
-    // Prompt diffing - only tokenize new content
-    const newContent = fullPrompt.startsWith(lastPrompt)
-      ? fullPrompt.slice(lastPrompt.length)
-      : fullPrompt;
-
-    const tokens = await ctx.tokenize(newContent);
-    await ctx.decode(tokens, position);
-    position += tokens.length;
+    if (!branch) {
+      // === COLD (position === 0): full format → tokenize with BOS → decode ===
+      const { prompt } = await ctx.formatChat(JSON.stringify(messages));
+      const tokens = await ctx.tokenize(prompt);
+      await ctx.decode(tokens, 0, 0);
+      branch = Branch.create(ctx, 0, tokens.length, {
+        temperature: 0.7,
+        topK: 40,
+        topP: 0.9,
+      });
+      branch.captureLogits();
+    } else {
+      // === WARM (position > 0): probe-based prefill — no formatChat(), no BOS ===
+      const contentToks = await ctx.tokenize(trimmed, false);
+      branch.prefill([
+        ...warm.turnSeparator,
+        ...warm.userPrefix,
+        ...contentToks,
+        ...warm.userToAssistant,
+      ]);
+    }
 
-    // Generate: sync produce, async commit
+    // Generate: produce inspects, commit advances
     process.stdout.write("< ");
     let response = "";
 
-    for (const { text, tokenId } of produceTokens(ctx, {
-      temperature: 0.7,
-      topK: 40,
-      topP: 0.9,
-    })) {
+    while (true) {
+      const { token, text, isStop } = branch.produce();
+      if (isStop) break;
       process.stdout.write(text);
       response += text;
-
-      await ctx.decode([tokenId], position); // async commit to KV
-      position += 1;
+      branch.commit(token);
     }
 
     console.log("\n");
 
     messages.push({ role: "assistant", content: response.trim() });
-    lastPrompt = fullPrompt + response;
 
     askUser();
   }
diff --git a/lib/index.d.ts b/lib/index.d.ts
index a762547..7dfffdf 100644
--- a/lib/index.d.ts
+++ b/lib/index.d.ts
@@ -388,6 +388,23 @@ export interface ParseChatOutputResult {
   toolCalls: ParsedToolCall[];
 }
 
+/**
+ * Pre-tokenized wrapper tokens for warm multi-turn continuation
+ *
+ * Contains the three token sequences needed to inject a new user turn
+ * into an existing conversation without re-formatting the full history.
+ *
+ * @see {@link SessionContext.getWarmTurnTokens}
+ */
+export interface WarmTurnTokens {
+  /** Tokens that close the previous assistant turn (e.g., im_end + newline for ChatML) */
+  turnSeparator: number[];
+  /** Tokens that open a new user turn (e.g., im_start + "user" + newline for ChatML) */
+  userPrefix: number[];
+  /** Tokens that close the user turn and open assistant (e.g., im_end + newline + im_start + "assistant" + newline) */
+  userToAssistant: number[];
+}
+
 /**
  * Penalty parameters for repetition control
  */
@@ -732,6 +749,31 @@ export interface SessionContext {
    */
   getTurnSeparator(): number[];
 
+  /**
+   * Get warm turn wrapper tokens for template-aware warm continuation
+   *
+   * Returns pre-tokenized role wrappers extracted from the model's chat
+   * template. Use these to construct warm prefill tokens without
+   * re-formatting the full conversation (no BOS bug, O(1) per turn).
+   *
+   * Warm path: turnSeparator + userPrefix + tokenize(content, false) + userToAssistant
+   *
+   * @returns Cached wrapper tokens (computed once per model)
+   *
+   * @example
+   * ```typescript
+   * const warm = ctx.getWarmTurnTokens();
+   * const contentToks = await ctx.tokenize(userContent, false);
+   * branch.prefill([
+   *   ...warm.turnSeparator,     // closes previous assistant turn
+   *   ...warm.userPrefix,        // opens new user turn
+   *   ...contentToks,            // raw user content (no BOS)
+   *   ...warm.userToAssistant,   // closes user turn + opens assistant
+   * ]);
+   * ```
+   */
+  getWarmTurnTokens(): WarmTurnTokens;
+
   // ===== PROMPT PREPARATION =====
 
   /**
diff --git a/liblloyal b/liblloyal
index 158bb8e..6037b9b 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit 158bb8ed600121fe2b8f0ec1fd3646729258da0c
+Subproject commit 6037b9bc7a3ea67460a073df3373f6e121ce9d68
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index 65bb323..6cc2a5f 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -598,6 +598,7 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     InstanceMethod("isStopToken", &SessionContext::isStopToken),
     InstanceMethod("getEogToken", &SessionContext::getEogToken),
     InstanceMethod("getTurnSeparator", &SessionContext::getTurnSeparator),
+    InstanceMethod("getWarmTurnTokens", &SessionContext::getWarmTurnTokens),
 
     // ===== PROMPT PREPARATION =====
     InstanceMethod("tokenize", &SessionContext::tokenize),
@@ -1155,6 +1156,34 @@ Napi::Value SessionContext::getTurnSeparator(const Napi::CallbackInfo& info) {
   return result;
 }
 
+Napi::Value SessionContext::getWarmTurnTokens(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  ensureNotDisposed();
+
+  // Compute once, cache thereafter
+  if (!_warmTurnTokensCached) {
+    _warmTurnTokensCache = lloyal::chat_in::get_warm_turn_tokens(_model.get());
+    _warmTurnTokensCached = true;
+  }
+
+  // Return { turnSeparator: number[], userPrefix: number[], userToAssistant: number[] }
+  Napi::Object result = Napi::Object::New(env);
+
+  auto toArray = [&](const std::vector<llama_token>& tokens) {
+    Napi::Array arr = Napi::Array::New(env, tokens.size());
+    for (size_t i = 0; i < tokens.size(); i++) {
+      arr[i] = Napi::Number::New(env, static_cast<double>(tokens[i]));
+    }
+    return arr;
+  };
+
+  result.Set("turnSeparator", toArray(_warmTurnTokensCache.turn_separator));
+  result.Set("userPrefix", toArray(_warmTurnTokensCache.user_prefix));
+  result.Set("userToAssistant", toArray(_warmTurnTokensCache.user_to_assistant));
+
+  return result;
+}
+
 Napi::Value SessionContext::formatChat(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp
index b42ca1d..c844e51 100644
--- a/src/SessionContext.hpp
+++ b/src/SessionContext.hpp
@@ -4,6 +4,7 @@
 #include <lloyal/tokenizer.hpp>
 #include <lloyal/metrics.hpp>
 #include <lloyal/branch.hpp>
+#include <lloyal/chat_in.hpp>
 #include <llama/llama.h>
 #include <memory>
 #include <mutex>
@@ -136,6 +137,12 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
    */
   Napi::Value getTurnSeparator(const Napi::CallbackInfo& info);
 
+  /**
+   * Get warm turn wrapper tokens for template-aware warm continuation
+   * Returns { turnSeparator: number[], userPrefix: number[], userToAssistant: number[] }
+   */
+  Napi::Value getWarmTurnTokens(const Napi::CallbackInfo& info);
+
   /**
    * Format messages using model's chat template
    * Args: messagesJson (string), templateOverride (optional string)
@@ -413,6 +420,10 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   std::vector<llama_token> _turnSeparatorCache;
   bool _turnSeparatorCached = false;
 
+  // ===== WARM TURN TOKENS CACHE =====
+  lloyal::chat_in::WarmTurnTokens _warmTurnTokensCache;
+  bool _warmTurnTokensCached = false;
+
   // ===== DECODE MUTEX =====
   std::mutex _decodeMutex;
 
diff --git a/test/integration.js b/test/integration.js
index b02a222..62d1c19 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -345,22 +345,23 @@ async function testBranchPrefill() {
     }
     assert(gen1.length > 0, `Turn 1: generated ${gen1.length} tokens`);
 
-    let lastText = prompt + await ctx.detokenize(gen1);
-    let lastGen = gen1;  // Track last generation for multi-turn
+    // Warm turn tokens for probe-based prefill (no string diff, no BOS bug)
+    const warm = ctx.getWarmTurnTokens();
 
-    // Turn 2-3: prefill + generate
+    // Turn 2-3: prefill using warm probe + generate
     for (let t = 1; t < turns.length; t++) {
-      messages.push({ role: 'assistant', content: await ctx.detokenize(lastGen) });
-      messages.push({ role: 'user', content: turns[t] });
-
-      const { prompt: fullPrompt } = await ctx.formatChat(JSON.stringify(messages));
-      const delta = fullPrompt.slice(lastText.length);
-      const deltaToks = await ctx.tokenize(delta);
+      const contentToks = await ctx.tokenize(turns[t], false);
+      const prefillToks = [
+        ...warm.turnSeparator,
+        ...warm.userPrefix,
+        ...contentToks,
+        ...warm.userToAssistant,
+      ];
 
       const posBefore = branch.position;
-      branch.prefill(deltaToks);
-      assert(branch.position === posBefore + deltaToks.length,
-        `Turn ${t + 1}: prefill ${deltaToks.length} tokens → pos=${branch.position}`);
+      branch.prefill(prefillToks);
+      assert(branch.position === posBefore + prefillToks.length,
+        `Turn ${t + 1}: prefill ${prefillToks.length} tokens → pos=${branch.position}`);
 
       const gen = [];
       for (let i = 0; i < GEN_TOKENS; i++) {
@@ -370,9 +371,6 @@ async function testBranchPrefill() {
         gen.push(token);
       }
       assert(gen.length > 0, `Turn ${t + 1}: generated ${gen.length} tokens`);
-
-      lastText = fullPrompt + await ctx.detokenize(gen);
-      lastGen = gen;  // Update for next turn
     }
 
     branch.prune();
@@ -424,18 +422,17 @@ async function testWarmColdParity() {
     }
 
     assistantContent = await warmCtx.detokenize(gen1);
-    const lastText = prompt1 + assistantContent;
 
-    // Turn 2: prefill delta, generate
-    const msgs2 = [
-      { role: 'user', content: userMessages[0] },
-      { role: 'assistant', content: assistantContent },
-      { role: 'user', content: userMessages[1] }
+    // Turn 2: prefill using warm probe (no string diff, no BOS bug)
+    const warm = warmCtx.getWarmTurnTokens();
+    const contentToks = await warmCtx.tokenize(userMessages[1], false);
+    const prefillToks = [
+      ...warm.turnSeparator,
+      ...warm.userPrefix,
+      ...contentToks,
+      ...warm.userToAssistant,
     ];
-    const { prompt: fullPrompt2 } = await warmCtx.formatChat(JSON.stringify(msgs2));
-    const delta = fullPrompt2.slice(lastText.length);
-    const deltaToks = await warmCtx.tokenize(delta);
-    branch.prefill(deltaToks);
+    branch.prefill(prefillToks);
 
     warmGen2 = [];
     for (let i = 0; i < GEN_TOKENS; i++) {
@@ -503,6 +500,68 @@ async function testWarmColdParity() {
   }
 }
 
+// ═══════════════════════════════════════════════════════════════════════════
+// WARM TURN TOKENS PROBE - Verifies template-extracted role wrappers
+// ═══════════════════════════════════════════════════════════════════════════
+
+async function testWarmTurnTokens() {
+  console.log('\n--- Warm Turn Tokens Probe ---');
+
+  const ctx = await addon.createContext({
+    modelPath: MODEL_PATH,
+    nCtx: 512,
+    nThreads: 4
+  });
+
+  try {
+    const warm = ctx.getWarmTurnTokens();
+
+    assert(Array.isArray(warm.turnSeparator) && warm.turnSeparator.length > 0,
+      `turnSeparator: ${warm.turnSeparator.length} tokens`);
+    assert(Array.isArray(warm.userPrefix) && warm.userPrefix.length > 0,
+      `userPrefix: ${warm.userPrefix.length} tokens`);
+    assert(Array.isArray(warm.userToAssistant) && warm.userToAssistant.length > 0,
+      `userToAssistant: ${warm.userToAssistant.length} tokens`);
+
+    // All token IDs should be valid numbers
+    for (const tok of warm.turnSeparator) {
+      assert(typeof tok === 'number' && Number.isInteger(tok), `turnSeparator token ${tok} is integer`);
+    }
+    for (const tok of warm.userPrefix) {
+      assert(typeof tok === 'number' && Number.isInteger(tok), `userPrefix token ${tok} is integer`);
+    }
+    for (const tok of warm.userToAssistant) {
+      assert(typeof tok === 'number' && Number.isInteger(tok), `userToAssistant token ${tok} is integer`);
+    }
+
+    // turnSeparator should contain at least one EOG token
+    const hasEog = warm.turnSeparator.some(t => ctx.isStopToken(t));
+    assert(hasEog, 'turnSeparator contains at least one EOG token');
+
+    // userPrefix should NOT contain EOG tokens
+    const prefixHasEog = warm.userPrefix.some(t => ctx.isStopToken(t));
+    assert(!prefixHasEog, 'userPrefix contains no EOG tokens');
+
+    // Cached: second call returns same result
+    const warm2 = ctx.getWarmTurnTokens();
+    assert(
+      warm.turnSeparator.join(',') === warm2.turnSeparator.join(',') &&
+      warm.userPrefix.join(',') === warm2.userPrefix.join(',') &&
+      warm.userToAssistant.join(',') === warm2.userToAssistant.join(','),
+      'getWarmTurnTokens() is cached (idempotent)');
+
+    // Log for diagnostic visibility
+    const sepText = warm.turnSeparator.map(t => ctx.tokenToText(t)).join('');
+    const prefText = warm.userPrefix.map(t => ctx.tokenToText(t)).join('');
+    const u2aText = warm.userToAssistant.map(t => ctx.tokenToText(t)).join('');
+    console.log(`    separator: ${JSON.stringify(sepText)}`);
+    console.log(`    userPrefix: ${JSON.stringify(prefText)}`);
+    console.log(`    userToAssistant: ${JSON.stringify(u2aText)}`);
+  } finally {
+    ctx.dispose();
+  }
+}
+
 // ═══════════════════════════════════════════════════════════════════════════
 // WARM CONTINUATION SEMANTIC RECALL - Proves context survives delta-only prefill
 // ═══════════════════════════════════════════════════════════════════════════
@@ -1059,6 +1118,7 @@ async function main() {
     await testMultiSequence();
     await testGrammar();
     await testBranchPrefill();
+    await testWarmTurnTokens();
     await testWarmColdParity();
     await testWarmSemanticRecall();
     await testBranchSteer();

From 758d6c41d40d6feef5780b4b2deadabf422fb936 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Wed, 11 Feb 2026 12:35:47 +1100
Subject: [PATCH 03/13] feat(chat): new chat api - rely on llama.cpp's BOS
 stripping

---
 examples/chat/chat.mjs |  22 +++---
 lib/index.d.ts         |  42 -----------
 liblloyal              |   2 +-
 src/SessionContext.cpp |  29 --------
 src/SessionContext.hpp |  10 ---
 test/integration.js    | 165 +++++++++++++++--------------------------
 6 files changed, 72 insertions(+), 198 deletions(-)

diff --git a/examples/chat/chat.mjs b/examples/chat/chat.mjs
index 8a22f8b..da358a0 100644
--- a/examples/chat/chat.mjs
+++ b/examples/chat/chat.mjs
@@ -8,8 +8,8 @@
  *
  * This example demonstrates:
  * - Branch API for token generation (produce/commit two-phase)
- * - Warm multi-turn continuation via getWarmTurnTokens() + branch.prefill()
- * - Cold/warm routing: formatChat() on first turn, probe-based prefill on subsequent turns
+ * - Warm multi-turn continuation via string-diff formatChat() + getTurnSeparator()
+ * - Cold/warm routing: full format on first turn, string-diff on subsequent turns
  */
 
 import * as readline from "node:readline";
@@ -40,7 +40,7 @@ async function main() {
 
   const messages = [];
   let branch = null;
-  const warm = ctx.getWarmTurnTokens();
+  const sep = ctx.getTurnSeparator();
 
   const rl = readline.createInterface({
     input: process.stdin,
@@ -90,14 +90,14 @@ async function main() {
       });
       branch.captureLogits();
     } else {
-      // === WARM (position > 0): probe-based prefill — no formatChat(), no BOS ===
-      const contentToks = await ctx.tokenize(trimmed, false);
-      branch.prefill([
-        ...warm.turnSeparator,
-        ...warm.userPrefix,
-        ...contentToks,
-        ...warm.userToAssistant,
-      ]);
+      // === WARM (position > 0): string-diff for delta tokens ===
+      const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
+      const { prompt: prefix } = await ctx.formatChat(
+        JSON.stringify(messages.slice(0, -1)),
+        { addGenerationPrompt: false },
+      );
+      const delta = await ctx.tokenize(full.substring(prefix.length), false);
+      branch.prefill([...sep, ...delta]);
     }
 
     // Generate: produce inspects, commit advances
diff --git a/lib/index.d.ts b/lib/index.d.ts
index 7dfffdf..a762547 100644
--- a/lib/index.d.ts
+++ b/lib/index.d.ts
@@ -388,23 +388,6 @@ export interface ParseChatOutputResult {
   toolCalls: ParsedToolCall[];
 }
 
-/**
- * Pre-tokenized wrapper tokens for warm multi-turn continuation
- *
- * Contains the three token sequences needed to inject a new user turn
- * into an existing conversation without re-formatting the full history.
- *
- * @see {@link SessionContext.getWarmTurnTokens}
- */
-export interface WarmTurnTokens {
-  /** Tokens that close the previous assistant turn (e.g., im_end + newline for ChatML) */
-  turnSeparator: number[];
-  /** Tokens that open a new user turn (e.g., im_start + "user" + newline for ChatML) */
-  userPrefix: number[];
-  /** Tokens that close the user turn and open assistant (e.g., im_end + newline + im_start + "assistant" + newline) */
-  userToAssistant: number[];
-}
-
 /**
  * Penalty parameters for repetition control
  */
@@ -749,31 +732,6 @@ export interface SessionContext {
    */
   getTurnSeparator(): number[];
 
-  /**
-   * Get warm turn wrapper tokens for template-aware warm continuation
-   *
-   * Returns pre-tokenized role wrappers extracted from the model's chat
-   * template. Use these to construct warm prefill tokens without
-   * re-formatting the full conversation (no BOS bug, O(1) per turn).
-   *
-   * Warm path: turnSeparator + userPrefix + tokenize(content, false) + userToAssistant
-   *
-   * @returns Cached wrapper tokens (computed once per model)
-   *
-   * @example
-   * ```typescript
-   * const warm = ctx.getWarmTurnTokens();
-   * const contentToks = await ctx.tokenize(userContent, false);
-   * branch.prefill([
-   *   ...warm.turnSeparator,     // closes previous assistant turn
-   *   ...warm.userPrefix,        // opens new user turn
-   *   ...contentToks,            // raw user content (no BOS)
-   *   ...warm.userToAssistant,   // closes user turn + opens assistant
-   * ]);
-   * ```
-   */
-  getWarmTurnTokens(): WarmTurnTokens;
-
   // ===== PROMPT PREPARATION =====
 
   /**
diff --git a/liblloyal b/liblloyal
index 6037b9b..fdcce9e 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit 6037b9bc7a3ea67460a073df3373f6e121ce9d68
+Subproject commit fdcce9ef25ac5bf56ca8ffdfdc477ee2752d00e7
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index 6cc2a5f..65bb323 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -598,7 +598,6 @@ Napi::Object SessionContext::Init(Napi::Env env, Napi::Object exports) {
     InstanceMethod("isStopToken", &SessionContext::isStopToken),
     InstanceMethod("getEogToken", &SessionContext::getEogToken),
     InstanceMethod("getTurnSeparator", &SessionContext::getTurnSeparator),
-    InstanceMethod("getWarmTurnTokens", &SessionContext::getWarmTurnTokens),
 
     // ===== PROMPT PREPARATION =====
     InstanceMethod("tokenize", &SessionContext::tokenize),
@@ -1156,34 +1155,6 @@ Napi::Value SessionContext::getTurnSeparator(const Napi::CallbackInfo& info) {
   return result;
 }
 
-Napi::Value SessionContext::getWarmTurnTokens(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  ensureNotDisposed();
-
-  // Compute once, cache thereafter
-  if (!_warmTurnTokensCached) {
-    _warmTurnTokensCache = lloyal::chat_in::get_warm_turn_tokens(_model.get());
-    _warmTurnTokensCached = true;
-  }
-
-  // Return { turnSeparator: number[], userPrefix: number[], userToAssistant: number[] }
-  Napi::Object result = Napi::Object::New(env);
-
-  auto toArray = [&](const std::vector<llama_token>& tokens) {
-    Napi::Array arr = Napi::Array::New(env, tokens.size());
-    for (size_t i = 0; i < tokens.size(); i++) {
-      arr[i] = Napi::Number::New(env, static_cast<double>(tokens[i]));
-    }
-    return arr;
-  };
-
-  result.Set("turnSeparator", toArray(_warmTurnTokensCache.turn_separator));
-  result.Set("userPrefix", toArray(_warmTurnTokensCache.user_prefix));
-  result.Set("userToAssistant", toArray(_warmTurnTokensCache.user_to_assistant));
-
-  return result;
-}
-
 Napi::Value SessionContext::formatChat(const Napi::CallbackInfo& info) {
   Napi::Env env = info.Env();
   ensureNotDisposed();
diff --git a/src/SessionContext.hpp b/src/SessionContext.hpp
index c844e51..466db9f 100644
--- a/src/SessionContext.hpp
+++ b/src/SessionContext.hpp
@@ -137,12 +137,6 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
    */
   Napi::Value getTurnSeparator(const Napi::CallbackInfo& info);
 
-  /**
-   * Get warm turn wrapper tokens for template-aware warm continuation
-   * Returns { turnSeparator: number[], userPrefix: number[], userToAssistant: number[] }
-   */
-  Napi::Value getWarmTurnTokens(const Napi::CallbackInfo& info);
-
   /**
    * Format messages using model's chat template
    * Args: messagesJson (string), templateOverride (optional string)
@@ -420,10 +414,6 @@ class SessionContext : public Napi::ObjectWrap<SessionContext> {
   std::vector<llama_token> _turnSeparatorCache;
   bool _turnSeparatorCached = false;
 
-  // ===== WARM TURN TOKENS CACHE =====
-  lloyal::chat_in::WarmTurnTokens _warmTurnTokensCache;
-  bool _warmTurnTokensCached = false;
-
   // ===== DECODE MUTEX =====
   std::mutex _decodeMutex;
 
diff --git a/test/integration.js b/test/integration.js
index 62d1c19..675f00b 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -345,18 +345,23 @@ async function testBranchPrefill() {
     }
     assert(gen1.length > 0, `Turn 1: generated ${gen1.length} tokens`);
 
-    // Warm turn tokens for probe-based prefill (no string diff, no BOS bug)
-    const warm = ctx.getWarmTurnTokens();
+    // Track assistant response for string-diff warm continuation
+    const assistantText1 = await ctx.detokenize(gen1);
+    messages.push({ role: 'assistant', content: assistantText1 });
 
-    // Turn 2-3: prefill using warm probe + generate
+    // Warm continuation: string-diff formatChat() + turn separator
+    const sep = ctx.getTurnSeparator();
+
+    // Turn 2-3: prefill using string-diff warm pattern + generate
     for (let t = 1; t < turns.length; t++) {
-      const contentToks = await ctx.tokenize(turns[t], false);
-      const prefillToks = [
-        ...warm.turnSeparator,
-        ...warm.userPrefix,
-        ...contentToks,
-        ...warm.userToAssistant,
-      ];
+      messages.push({ role: 'user', content: turns[t] });
+      const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
+      const { prompt: prefix } = await ctx.formatChat(
+        JSON.stringify(messages.slice(0, -1)),
+        { addGenerationPrompt: false }
+      );
+      const delta = await ctx.tokenize(full.substring(prefix.length), false);
+      const prefillToks = [...sep, ...delta];
 
       const posBefore = branch.position;
       branch.prefill(prefillToks);
@@ -371,6 +376,10 @@ async function testBranchPrefill() {
         gen.push(token);
       }
       assert(gen.length > 0, `Turn ${t + 1}: generated ${gen.length} tokens`);
+
+      // Track assistant response
+      const assistantText = await ctx.detokenize(gen);
+      messages.push({ role: 'assistant', content: assistantText });
     }
 
     branch.prune();
@@ -423,16 +432,20 @@ async function testWarmColdParity() {
 
     assistantContent = await warmCtx.detokenize(gen1);
 
-    // Turn 2: prefill using warm probe (no string diff, no BOS bug)
-    const warm = warmCtx.getWarmTurnTokens();
-    const contentToks = await warmCtx.tokenize(userMessages[1], false);
-    const prefillToks = [
-      ...warm.turnSeparator,
-      ...warm.userPrefix,
-      ...contentToks,
-      ...warm.userToAssistant,
+    // Turn 2: string-diff warm continuation
+    const sep = warmCtx.getTurnSeparator();
+    const allMessages = [
+      { role: 'user', content: userMessages[0] },
+      { role: 'assistant', content: assistantContent },
+      { role: 'user', content: userMessages[1] }
     ];
-    branch.prefill(prefillToks);
+    const { prompt: full } = await warmCtx.formatChat(JSON.stringify(allMessages));
+    const { prompt: prefix } = await warmCtx.formatChat(
+      JSON.stringify(allMessages.slice(0, -1)),
+      { addGenerationPrompt: false }
+    );
+    const deltaToks = await warmCtx.tokenize(full.substring(prefix.length), false);
+    branch.prefill([...sep, ...deltaToks]);
 
     warmGen2 = [];
     for (let i = 0; i < GEN_TOKENS; i++) {
@@ -486,11 +499,9 @@ async function testWarmColdParity() {
   // === COMPARE ===
   const warmStr = warmGen2.join(',');
   const coldStr = coldGen2.join(',');
-  assert(warmStr === coldStr,
-    `Warm==Cold parity: ${warmGen2.length} tokens match`);
 
+  // Log divergence diagnostics BEFORE assert (assert throws on failure)
   if (warmStr !== coldStr) {
-    // Diagnostic: show first divergence point
     for (let i = 0; i < Math.max(warmGen2.length, coldGen2.length); i++) {
       if (warmGen2[i] !== coldGen2[i]) {
         console.log(`  First divergence at position ${i}: warm=${warmGen2[i]} cold=${coldGen2[i]}`);
@@ -498,68 +509,11 @@ async function testWarmColdParity() {
       }
     }
   }
-}
-
-// ═══════════════════════════════════════════════════════════════════════════
-// WARM TURN TOKENS PROBE - Verifies template-extracted role wrappers
-// ═══════════════════════════════════════════════════════════════════════════
 
-async function testWarmTurnTokens() {
-  console.log('\n--- Warm Turn Tokens Probe ---');
-
-  const ctx = await addon.createContext({
-    modelPath: MODEL_PATH,
-    nCtx: 512,
-    nThreads: 4
-  });
-
-  try {
-    const warm = ctx.getWarmTurnTokens();
-
-    assert(Array.isArray(warm.turnSeparator) && warm.turnSeparator.length > 0,
-      `turnSeparator: ${warm.turnSeparator.length} tokens`);
-    assert(Array.isArray(warm.userPrefix) && warm.userPrefix.length > 0,
-      `userPrefix: ${warm.userPrefix.length} tokens`);
-    assert(Array.isArray(warm.userToAssistant) && warm.userToAssistant.length > 0,
-      `userToAssistant: ${warm.userToAssistant.length} tokens`);
-
-    // All token IDs should be valid numbers
-    for (const tok of warm.turnSeparator) {
-      assert(typeof tok === 'number' && Number.isInteger(tok), `turnSeparator token ${tok} is integer`);
-    }
-    for (const tok of warm.userPrefix) {
-      assert(typeof tok === 'number' && Number.isInteger(tok), `userPrefix token ${tok} is integer`);
-    }
-    for (const tok of warm.userToAssistant) {
-      assert(typeof tok === 'number' && Number.isInteger(tok), `userToAssistant token ${tok} is integer`);
-    }
-
-    // turnSeparator should contain at least one EOG token
-    const hasEog = warm.turnSeparator.some(t => ctx.isStopToken(t));
-    assert(hasEog, 'turnSeparator contains at least one EOG token');
-
-    // userPrefix should NOT contain EOG tokens
-    const prefixHasEog = warm.userPrefix.some(t => ctx.isStopToken(t));
-    assert(!prefixHasEog, 'userPrefix contains no EOG tokens');
-
-    // Cached: second call returns same result
-    const warm2 = ctx.getWarmTurnTokens();
-    assert(
-      warm.turnSeparator.join(',') === warm2.turnSeparator.join(',') &&
-      warm.userPrefix.join(',') === warm2.userPrefix.join(',') &&
-      warm.userToAssistant.join(',') === warm2.userToAssistant.join(','),
-      'getWarmTurnTokens() is cached (idempotent)');
-
-    // Log for diagnostic visibility
-    const sepText = warm.turnSeparator.map(t => ctx.tokenToText(t)).join('');
-    const prefText = warm.userPrefix.map(t => ctx.tokenToText(t)).join('');
-    const u2aText = warm.userToAssistant.map(t => ctx.tokenToText(t)).join('');
-    console.log(`    separator: ${JSON.stringify(sepText)}`);
-    console.log(`    userPrefix: ${JSON.stringify(prefText)}`);
-    console.log(`    userToAssistant: ${JSON.stringify(u2aText)}`);
-  } finally {
-    ctx.dispose();
-  }
+  assert(warmStr === coldStr,
+    warmStr === coldStr
+      ? `Warm==Cold parity: ${warmGen2.length} tokens match`
+      : `Warm==Cold parity FAILED: warm=[${warmStr}] vs cold=[${coldStr}]`);
 }
 
 // ═══════════════════════════════════════════════════════════════════════════
@@ -598,13 +552,20 @@ async function testWarmSemanticRecall() {
     });
 
     try {
-      // Helper: warm-continue one turn (prefill delta, generate)
-      async function warmTurn(messages, lastText, userContent) {
+      const sep = ctx.getTurnSeparator();
+      let branch;
+      const messages = [];
+
+      // Helper: string-diff warm continuation
+      async function warmTurn(userContent) {
         messages.push({ role: 'user', content: userContent });
-        const { prompt: fullPrompt } = await ctx.formatChat(JSON.stringify(messages));
-        const delta = fullPrompt.slice(lastText.length);
-        const deltaToks = await ctx.tokenize(delta);
-        branch.prefill(deltaToks);
+        const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
+        const { prompt: prefix } = await ctx.formatChat(
+          JSON.stringify(messages.slice(0, -1)),
+          { addGenerationPrompt: false }
+        );
+        const delta = await ctx.tokenize(full.substring(prefix.length), false);
+        branch.prefill([...sep, ...delta]);
 
         const gen = [];
         for (let i = 0; i < GEN_TOKENS; i++) {
@@ -613,18 +574,18 @@ async function testWarmSemanticRecall() {
           branch.commit(token);
           gen.push(token);
         }
-        const assistantText = await ctx.detokenize(gen);
-        messages.push({ role: 'assistant', content: assistantText });
-        return { text: assistantText, lastText: fullPrompt + assistantText };
+        const text = await ctx.detokenize(gen);
+        messages.push({ role: 'assistant', content: text });
+        return text;
       }
 
       // Turn 1: Plant a specific, recallable fact
-      const messages = [{ role: 'user', content: 'Remember this: my dog is named Max.' }];
+      messages.push({ role: 'user', content: 'Remember this: my dog is named Max.' });
       const { prompt } = await ctx.formatChat(JSON.stringify(messages));
       const promptToks = await ctx.tokenize(prompt);
       await ctx.decode(promptToks, 0, 0);
 
-      var branch = Branch.create(ctx, 0, promptToks.length, { temperature: 0 });
+      branch = Branch.create(ctx, 0, promptToks.length, { temperature: 0 });
       branch.captureLogits();
 
       // Generate turn 1 response
@@ -635,22 +596,17 @@ async function testWarmSemanticRecall() {
         branch.commit(token);
         gen.push(token);
       }
-      const assistantText = await ctx.detokenize(gen);
-      messages.push({ role: 'assistant', content: assistantText });
-      let lastText = prompt + assistantText;
+      const turn1Response = await ctx.detokenize(gen);
+      messages.push({ role: 'assistant', content: turn1Response });
 
       // Turn 2: Distractor
-      let turn;
-      turn = await warmTurn(messages, lastText, 'What is 2 + 2?');
-      lastText = turn.lastText;
+      await warmTurn('What is 2 + 2?');
 
       // Turn 3: Another distractor
-      turn = await warmTurn(messages, lastText, 'Name three colors.');
-      lastText = turn.lastText;
+      await warmTurn('Name three colors.');
 
       // Turn 4: Recall — only answerable from turn 1 context
-      turn = await warmTurn(messages, lastText, 'What is my dog\'s name?');
-      recallText = turn.text;
+      recallText = await warmTurn('What is my dog\'s name?');
 
       branch.prune();
     } finally {
@@ -1118,7 +1074,6 @@ async function main() {
     await testMultiSequence();
     await testGrammar();
     await testBranchPrefill();
-    await testWarmTurnTokens();
     await testWarmColdParity();
     await testWarmSemanticRecall();
     await testBranchSteer();

From 0d4a65cdd30dd4b7d4ea0fb58154e4b39886f589 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Wed, 11 Feb 2026 23:55:55 +1100
Subject: [PATCH 04/13] feat(chat): new chat api - cross template multi-turn
 updates

---
 examples/chat/chat.mjs | 39 +++++++++++++++---------
 lib/index.d.ts         | 66 +++++++++++++++++++++++++++++++++++++++-
 liblloyal              |  2 +-
 test/integration.js    | 68 +++++++++++++++++++++---------------------
 4 files changed, 125 insertions(+), 50 deletions(-)

diff --git a/examples/chat/chat.mjs b/examples/chat/chat.mjs
index da358a0..f86b0e1 100644
--- a/examples/chat/chat.mjs
+++ b/examples/chat/chat.mjs
@@ -8,8 +8,9 @@
  *
  * This example demonstrates:
  * - Branch API for token generation (produce/commit two-phase)
- * - Warm multi-turn continuation via string-diff formatChat() + getTurnSeparator()
- * - Cold/warm routing: full format on first turn, string-diff on subsequent turns
+ * - Warm multi-turn continuation via formatChat([newMsg]) + getTurnSeparator()
+ * - Cold/warm routing: full format on first turn, format-only-new on subsequent turns
+ * - parseChatOutput() for correct reasoning_content handling on thinking models
  */
 
 import * as readline from "node:readline";
@@ -20,7 +21,7 @@ import { createContext, Branch } from "../../lib/index.js";
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DEFAULT_MODEL = path.resolve(
   __dirname,
-  "../../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf",
+  "../../models/Phi-3.5-mini-instruct-Q4_K_M.gguf",
 );
 
 async function main() {
@@ -40,6 +41,7 @@ async function main() {
 
   const messages = [];
   let branch = null;
+  let fmt = null;
   const sep = ctx.getTurnSeparator();
 
   const rl = readline.createInterface({
@@ -80,8 +82,8 @@ async function main() {
 
     if (!branch) {
       // === COLD (position === 0): full format → tokenize with BOS → decode ===
-      const { prompt } = await ctx.formatChat(JSON.stringify(messages));
-      const tokens = await ctx.tokenize(prompt);
+      fmt = await ctx.formatChat(JSON.stringify(messages));
+      const tokens = await ctx.tokenize(fmt.prompt);
       await ctx.decode(tokens, 0, 0);
       branch = Branch.create(ctx, 0, tokens.length, {
         temperature: 0.7,
@@ -90,31 +92,40 @@ async function main() {
       });
       branch.captureLogits();
     } else {
-      // === WARM (position > 0): string-diff for delta tokens ===
-      const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
-      const { prompt: prefix } = await ctx.formatChat(
-        JSON.stringify(messages.slice(0, -1)),
-        { addGenerationPrompt: false },
+      // === WARM (position > 0): format only the new message ===
+      fmt = await ctx.formatChat(
+        JSON.stringify([{ role: "system", content: "" }, { role: "user", content: trimmed }]),
       );
-      const delta = await ctx.tokenize(full.substring(prefix.length), false);
+      const delta = await ctx.tokenize(fmt.prompt, false);
       branch.prefill([...sep, ...delta]);
     }
 
     // Generate: produce inspects, commit advances
     process.stdout.write("< ");
-    let response = "";
+    let rawOutput = "";
 
     while (true) {
       const { token, text, isStop } = branch.produce();
       if (isStop) break;
       process.stdout.write(text);
-      response += text;
+      rawOutput += text;
       branch.commit(token);
     }
 
     console.log("\n");
 
-    messages.push({ role: "assistant", content: response.trim() });
+    // Parse output: separates reasoning from content for thinking models
+    const parsed = ctx.parseChatOutput(rawOutput, fmt.format, {
+      reasoningFormat: fmt.reasoningFormat,
+      thinkingForcedOpen: fmt.thinkingForcedOpen,
+      parser: fmt.parser,
+    });
+
+    const msg = { role: "assistant", content: parsed.content };
+    if (parsed.reasoningContent) {
+      msg.reasoning_content = parsed.reasoningContent;
+    }
+    messages.push(msg);
 
     askUser();
   }
diff --git a/lib/index.d.ts b/lib/index.d.ts
index a762547..75c7b1e 100644
--- a/lib/index.d.ts
+++ b/lib/index.d.ts
@@ -382,7 +382,13 @@ export interface ParsedToolCall {
 export interface ParseChatOutputResult {
   /** Main response text */
   content: string;
-  /** Extracted thinking/reasoning content (empty if none) */
+  /**
+   * Extracted thinking/reasoning content (empty string if none).
+   * For thinking models (e.g. Qwen3), this contains the text inside
+   * `<think>...</think>` blocks. Store as `reasoning_content` in your
+   * messages array so formatChat() can reconstruct the template correctly
+   * on subsequent turns.
+   */
   reasoningContent: string;
   /** Extracted tool calls (empty array if none) */
   toolCalls: ParsedToolCall[];
@@ -1422,6 +1428,64 @@ export interface SessionContext {
    *   // Handle tool calls
    * }
    * ```
+   *
+   * @example Multi-turn warm continuation with reasoning models
+   * ```typescript
+   * // parseChatOutput separates <think>...</think> blocks into reasoningContent.
+   * // This is REQUIRED for correct warm continuation on thinking models (e.g. Qwen3):
+   * // if raw output containing <think> tags is stored as content, re-formatting
+   * // the conversation produces different tokens, breaking cold/warm parity.
+   *
+   * const messages: Array<{role: string; content: string; reasoning_content?: string}> = [];
+   * const sep = ctx.getTurnSeparator();
+   * let branch: Branch | null = null;
+   * let fmt: FormattedChatResult;
+   *
+   * async function handleTurn(userContent: string) {
+   *   messages.push({ role: 'user', content: userContent });
+   *
+   *   if (!branch) {
+   *     // Cold path: format full conversation, tokenize with BOS, decode all
+   *     fmt = await ctx.formatChat(JSON.stringify(messages));
+   *     const tokens = await ctx.tokenize(fmt.prompt);
+   *     await ctx.decode(tokens, 0, 0);
+   *     branch = Branch.create(ctx, 0, tokens.length, { temperature: 0.7 });
+   *     branch.captureLogits();
+   *   } else {
+   *     // Warm path: string-diff for delta tokens
+   *     const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
+   *     const { prompt: prefix } = await ctx.formatChat(
+   *       JSON.stringify(messages.slice(0, -1)),
+   *       { addGenerationPrompt: false }
+   *     );
+   *     const delta = await ctx.tokenize(full.substring(prefix.length), false);
+   *     branch.prefill([...sep, ...delta]);
+   *   }
+   *
+   *   // Generate
+   *   let rawOutput = '';
+   *   while (true) {
+   *     const { token, text, isStop } = branch.produce();
+   *     if (isStop) break;
+   *     rawOutput += text;
+   *     branch.commit(token);
+   *   }
+   *
+   *   // Parse output: separates reasoning from content
+   *   const parsed = ctx.parseChatOutput(rawOutput, fmt.format, {
+   *     reasoningFormat: fmt.reasoningFormat,
+   *     thinkingForcedOpen: fmt.thinkingForcedOpen,
+   *     parser: fmt.parser
+   *   });
+   *
+   *   // Store parsed fields — formatChat reconstructs thinking blocks correctly
+   *   messages.push({
+   *     role: 'assistant',
+   *     content: parsed.content,
+   *     reasoning_content: parsed.reasoningContent || undefined
+   *   });
+   * }
+   * ```
    */
   parseChatOutput(
     output: string,
diff --git a/liblloyal b/liblloyal
index fdcce9e..754ee22 160000
--- a/liblloyal
+++ b/liblloyal
@@ -1 +1 @@
-Subproject commit fdcce9ef25ac5bf56ca8ffdfdc477ee2752d00e7
+Subproject commit 754ee2270004eb56b108a22af310ebbb084c96f8
diff --git a/test/integration.js b/test/integration.js
index 675f00b..6a995e8 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -345,22 +345,20 @@ async function testBranchPrefill() {
     }
     assert(gen1.length > 0, `Turn 1: generated ${gen1.length} tokens`);
 
-    // Track assistant response for string-diff warm continuation
+    // Track assistant response
     const assistantText1 = await ctx.detokenize(gen1);
     messages.push({ role: 'assistant', content: assistantText1 });
 
-    // Warm continuation: string-diff formatChat() + turn separator
+    // Warm continuation: format only new message + turn separator
     const sep = ctx.getTurnSeparator();
 
-    // Turn 2-3: prefill using string-diff warm pattern + generate
+    // Turn 2-3: prefill using format-only-new pattern + generate
     for (let t = 1; t < turns.length; t++) {
       messages.push({ role: 'user', content: turns[t] });
-      const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
-      const { prompt: prefix } = await ctx.formatChat(
-        JSON.stringify(messages.slice(0, -1)),
-        { addGenerationPrompt: false }
-      );
-      const delta = await ctx.tokenize(full.substring(prefix.length), false);
+      const { prompt } = await ctx.formatChat(JSON.stringify([
+        { role: 'user', content: turns[t] }
+      ]));
+      const delta = await ctx.tokenize(prompt, false);
       const prefillToks = [...sep, ...delta];
 
       const posBefore = branch.position;
@@ -432,19 +430,13 @@ async function testWarmColdParity() {
 
     assistantContent = await warmCtx.detokenize(gen1);
 
-    // Turn 2: string-diff warm continuation
+    // Turn 2: format-only-new warm continuation
     const sep = warmCtx.getTurnSeparator();
-    const allMessages = [
-      { role: 'user', content: userMessages[0] },
-      { role: 'assistant', content: assistantContent },
+    const { prompt: warmDelta } = await warmCtx.formatChat(JSON.stringify([
+      { role: 'system', content: '' },
       { role: 'user', content: userMessages[1] }
-    ];
-    const { prompt: full } = await warmCtx.formatChat(JSON.stringify(allMessages));
-    const { prompt: prefix } = await warmCtx.formatChat(
-      JSON.stringify(allMessages.slice(0, -1)),
-      { addGenerationPrompt: false }
-    );
-    const deltaToks = await warmCtx.tokenize(full.substring(prefix.length), false);
+    ]));
+    const deltaToks = await warmCtx.tokenize(warmDelta, false);
     branch.prefill([...sep, ...deltaToks]);
 
     warmGen2 = [];
@@ -471,16 +463,26 @@ async function testWarmColdParity() {
   let coldGen2;
 
   try {
-    const msgs = [
+    // History: all but last user message (with addGenerationPrompt=false)
+    const history = [
       { role: 'user', content: userMessages[0] },
-      { role: 'assistant', content: assistantContent },
-      { role: 'user', content: userMessages[1] }
+      { role: 'assistant', content: assistantContent }
     ];
-    const { prompt: coldPrompt } = await coldCtx.formatChat(JSON.stringify(msgs));
-    const coldToks = await coldCtx.tokenize(coldPrompt);
-    await coldCtx.decode(coldToks, 0, 0);
+    const { prompt: histPrompt } = await coldCtx.formatChat(
+      JSON.stringify(history), { addGenerationPrompt: false }
+    );
+    const histToks = await coldCtx.tokenize(histPrompt);
+    await coldCtx.decode(histToks, 0, 0);
+
+    // Delta: format-only-new (same as warm path)
+    const { prompt: coldDelta } = await coldCtx.formatChat(JSON.stringify([
+      { role: 'system', content: '' },
+      { role: 'user', content: userMessages[1] }
+    ]));
+    const deltaToks = await coldCtx.tokenize(coldDelta, false);
+    await coldCtx.decode(deltaToks, histToks.length, 0);
 
-    const branch = Branch.create(coldCtx, 0, coldToks.length, { temperature: 0 });
+    const branch = Branch.create(coldCtx, 0, histToks.length + deltaToks.length, { temperature: 0 });
     branch.captureLogits();
 
     coldGen2 = [];
@@ -556,15 +558,13 @@ async function testWarmSemanticRecall() {
       let branch;
       const messages = [];
 
-      // Helper: string-diff warm continuation
+      // Helper: format-only-new warm continuation
       async function warmTurn(userContent) {
         messages.push({ role: 'user', content: userContent });
-        const { prompt: full } = await ctx.formatChat(JSON.stringify(messages));
-        const { prompt: prefix } = await ctx.formatChat(
-          JSON.stringify(messages.slice(0, -1)),
-          { addGenerationPrompt: false }
-        );
-        const delta = await ctx.tokenize(full.substring(prefix.length), false);
+        const { prompt } = await ctx.formatChat(JSON.stringify([
+          { role: 'user', content: userContent }
+        ]));
+        const delta = await ctx.tokenize(prompt, false);
         branch.prefill([...sep, ...delta]);
 
         const gen = [];

From 4d72d1185ce72beec35786d808cb822a939aeee4 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 00:31:32 +1100
Subject: [PATCH 05/13] feat(chat): new chat api - fix tests, simplify backend
 logging

---
 src/BackendManager.hpp | 48 +++---------------------------------------
 src/SessionContext.cpp |  1 +
 test/integration.js    |  2 ++
 3 files changed, 6 insertions(+), 45 deletions(-)

diff --git a/src/BackendManager.hpp b/src/BackendManager.hpp
index e37c818..38e79ab 100644
--- a/src/BackendManager.hpp
+++ b/src/BackendManager.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
 #include <llama/llama.h>
+#include "log.h"
 #include <mutex>
-#include <iostream>
 
 namespace liblloyal_node {
 
@@ -33,50 +33,9 @@ class BackendManager {
    * Called exactly once by ensureInitialized()
    */
   BackendManager() {
-    std::cout << "[BackendManager] Initializing llama.cpp backend..." << std::endl;
-
-    // Initialize llama backend (matches Nitro's LlamaBackendManager exactly)
     llama_backend_init();
-    std::cout << "[BackendManager] llama_backend_init() called" << std::endl;
-
-    // Match Nitro: Enable logging callback with dim colors for less visual noise
-    llama_log_set([](ggml_log_level level, const char* text, void* user_data) {
-      // ANSI escape codes
-      const char* RESET = "\033[0m";
-      const char* DIM = "\033[2m";       // Dim/faint text
-      const char* RED = "\033[31m";
-      const char* YELLOW = "\033[33m";
-
-      const char* color = DIM;  // Default: dim grey for INFO/DEBUG
-      const char* level_str = "";
-
-      switch (level) {
-        case GGML_LOG_LEVEL_ERROR:
-          level_str = "ERROR";
-          color = RED;
-          break;
-        case GGML_LOG_LEVEL_WARN:
-          level_str = "WARN";
-          color = YELLOW;
-          break;
-        case GGML_LOG_LEVEL_INFO:
-          level_str = "INFO";
-          break;
-        case GGML_LOG_LEVEL_DEBUG:
-          level_str = "DEBUG";
-          break;
-        case GGML_LOG_LEVEL_NONE:
-          level_str = "NONE";
-          break;
-        case GGML_LOG_LEVEL_CONT:
-          // Continuation - just print text dimmed, no prefix
-          std::cerr << DIM << text << RESET << std::flush;
-          return;
-      }
-      std::cerr << color << "[llama.cpp " << level_str << "] " << text << RESET << std::flush;
-    }, nullptr);
-
-    std::cout << "[BackendManager] llama.cpp logging configured" << std::endl;
+    common_log_set_verbosity_thold(LOG_DEFAULT_LLAMA);
+    llama_log_set(common_log_default_callback, nullptr);
   }
 
   /**
@@ -85,7 +44,6 @@ class BackendManager {
    */
   ~BackendManager() {
     llama_backend_free();
-    std::cout << "[~BackendManager] llama_backend_free() called" << std::endl;
   }
 
   // Delete copy/move
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index 65bb323..e32ae19 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -14,6 +14,7 @@
 #include <lloyal/logits.hpp>
 #include <lloyal/metrics.hpp>
 #include <cmath>
+#include <iostream>
 
 namespace liblloyal_node {
 
diff --git a/test/integration.js b/test/integration.js
index 6a995e8..f95133c 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -356,6 +356,7 @@ async function testBranchPrefill() {
     for (let t = 1; t < turns.length; t++) {
       messages.push({ role: 'user', content: turns[t] });
       const { prompt } = await ctx.formatChat(JSON.stringify([
+        { role: 'system', content: '' },
         { role: 'user', content: turns[t] }
       ]));
       const delta = await ctx.tokenize(prompt, false);
@@ -562,6 +563,7 @@ async function testWarmSemanticRecall() {
       async function warmTurn(userContent) {
         messages.push({ role: 'user', content: userContent });
         const { prompt } = await ctx.formatChat(JSON.stringify([
+          { role: 'system', content: '' },
           { role: 'user', content: userContent }
         ]));
         const delta = await ctx.tokenize(prompt, false);

From 2db1b424a244e0372db98bc03e21e2c0a9804fb2 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 00:54:45 +1100
Subject: [PATCH 06/13] feat(chat): new chat api - replace warm/cold parity test with multi-turn recall

---
 test/integration.js | 153 +++++++++++++++-----------------------------
 1 file changed, 52 insertions(+), 101 deletions(-)

diff --git a/test/integration.js b/test/integration.js
index f95133c..f3bb8c1 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -388,135 +388,86 @@ async function testBranchPrefill() {
 }
 
 // ═══════════════════════════════════════════════════════════════════════════
-// WARM vs COLD PARITY - Semantic proof that warm continuation == cold start
+// WARM MULTI-TURN SEMANTIC RECALL - Proves context survives warm continuations
+// Mirrors liblloyal C++ test: chat_in_integration_test.cpp
 // ═══════════════════════════════════════════════════════════════════════════
 
-async function testWarmColdParity() {
-  console.log('\n--- Warm vs Cold Parity ---');
+async function testWarmMultiTurnRecall() {
+  console.log('\n--- Warm Multi-Turn Recall ---');
 
-  const GEN_TOKENS = 10;
-  const userMessages = [
-    "What is the capital of France?",
-    " Tell me more about it."
-  ];
+  const GEN_TOKENS = 60;
 
-  // === WARM PATH: decode turn 1, prefill turn 2 delta, generate ===
-  const warmCtx = await addon.createContext({
+  const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: 2048,
     nBatch: 512,
     nThreads: 4
   });
 
-  let assistantContent;
-  let warmGen2;
-
   try {
-    // Turn 1: format, decode, generate
-    const msgs1 = [{ role: 'user', content: userMessages[0] }];
-    const { prompt: prompt1 } = await warmCtx.formatChat(JSON.stringify(msgs1));
-    const toks1 = await warmCtx.tokenize(prompt1);
-    await warmCtx.decode(toks1, 0, 0);
+    const sep = ctx.getTurnSeparator();
 
-    const branch = Branch.create(warmCtx, 0, toks1.length, { temperature: 0 });
-    branch.captureLogits();
+    // Helper: warm continuation — sep + format([{system,""},{user,msg}])
+    async function warmTurn(branch, userContent) {
+      const { prompt } = await ctx.formatChat(JSON.stringify([
+        { role: 'system', content: '' },
+        { role: 'user', content: userContent }
+      ]));
+      const delta = await ctx.tokenize(prompt, false);
+      branch.prefill([...sep, ...delta]);
 
-    const gen1 = [];
-    for (let i = 0; i < GEN_TOKENS; i++) {
-      const { token, isStop } = branch.produce();
-      if (isStop) break;
-      branch.commit(token);
-      gen1.push(token);
+      const gen = [];
+      for (let i = 0; i < GEN_TOKENS; i++) {
+        const { token, isStop } = branch.produce();
+        if (isStop) break;
+        branch.commit(token);
+        gen.push(token);
+      }
+      const text = await ctx.detokenize(gen);
+      return text;
     }
 
-    assistantContent = await warmCtx.detokenize(gen1);
+    // Turn 1 (COLD): introduce name
+    const msgs1 = [{ role: 'user', content: 'Hi, my name is Lloyal' }];
+    const { prompt } = await ctx.formatChat(JSON.stringify(msgs1));
+    const promptToks = await ctx.tokenize(prompt);
+    await ctx.decode(promptToks, 0, 0);
 
-    // Turn 2: format-only-new warm continuation
-    const sep = warmCtx.getTurnSeparator();
-    const { prompt: warmDelta } = await warmCtx.formatChat(JSON.stringify([
-      { role: 'system', content: '' },
-      { role: 'user', content: userMessages[1] }
-    ]));
-    const deltaToks = await warmCtx.tokenize(warmDelta, false);
-    branch.prefill([...sep, ...deltaToks]);
+    const branch = Branch.create(ctx, 0, promptToks.length, { temperature: 0 });
+    branch.captureLogits();
 
-    warmGen2 = [];
+    const gen1 = [];
     for (let i = 0; i < GEN_TOKENS; i++) {
       const { token, isStop } = branch.produce();
       if (isStop) break;
       branch.commit(token);
-      warmGen2.push(token);
+      gen1.push(token);
     }
+    const turn1 = await ctx.detokenize(gen1);
+    console.log(`  Turn 1: "${turn1.trim().slice(0, 80)}"`);
+    assert(gen1.length > 0, `Turn 1: generated ${gen1.length} tokens`);
 
-    branch.prune();
-  } finally {
-    warmCtx.dispose();
-  }
-
-  // === COLD PATH: decode full 2-turn conversation from scratch, generate ===
-  const coldCtx = await addon.createContext({
-    modelPath: MODEL_PATH,
-    nCtx: 2048,
-    nBatch: 512,
-    nThreads: 4
-  });
-
-  let coldGen2;
+    // Turn 2 (WARM): introduce favourite food
+    const turn2 = await warmTurn(branch, 'My favourite food is pizza');
+    console.log(`  Turn 2: "${turn2.trim().slice(0, 80)}"`);
+    assert(turn2.length > 0, 'Turn 2: generated response');
 
-  try {
-    // History: all but last user message (with addGenerationPrompt=false)
-    const history = [
-      { role: 'user', content: userMessages[0] },
-      { role: 'assistant', content: assistantContent }
-    ];
-    const { prompt: histPrompt } = await coldCtx.formatChat(
-      JSON.stringify(history), { addGenerationPrompt: false }
-    );
-    const histToks = await coldCtx.tokenize(histPrompt);
-    await coldCtx.decode(histToks, 0, 0);
-
-    // Delta: format-only-new (same as warm path)
-    const { prompt: coldDelta } = await coldCtx.formatChat(JSON.stringify([
-      { role: 'system', content: '' },
-      { role: 'user', content: userMessages[1] }
-    ]));
-    const deltaToks = await coldCtx.tokenize(coldDelta, false);
-    await coldCtx.decode(deltaToks, histToks.length, 0);
-
-    const branch = Branch.create(coldCtx, 0, histToks.length + deltaToks.length, { temperature: 0 });
-    branch.captureLogits();
+    // Turn 3 (WARM): recall name
+    const turn3 = await warmTurn(branch, 'Do you remember my name?');
+    console.log(`  Turn 3 (name recall): "${turn3.trim().slice(0, 80)}"`);
+    const nameRecalled = turn3.toLowerCase().includes('lloyal');
+    assert(nameRecalled, `Name recall: ${nameRecalled ? 'found "Lloyal"' : 'MISSING "Lloyal" in: ' + turn3.trim().slice(0, 120)}`);
 
-    coldGen2 = [];
-    for (let i = 0; i < GEN_TOKENS; i++) {
-      const { token, isStop } = branch.produce();
-      if (isStop) break;
-      branch.commit(token);
-      coldGen2.push(token);
-    }
+    // Turn 4 (WARM): recall food
+    const turn4 = await warmTurn(branch, 'Do you remember my favourite food?');
+    console.log(`  Turn 4 (food recall): "${turn4.trim().slice(0, 80)}"`);
+    const foodRecalled = turn4.toLowerCase().includes('pizza');
+    assert(foodRecalled, `Food recall: ${foodRecalled ? 'found "pizza"' : 'MISSING "pizza" in: ' + turn4.trim().slice(0, 120)}`);
 
     branch.prune();
   } finally {
-    coldCtx.dispose();
-  }
-
-  // === COMPARE ===
-  const warmStr = warmGen2.join(',');
-  const coldStr = coldGen2.join(',');
-
-  // Log divergence diagnostics BEFORE assert (assert throws on failure)
-  if (warmStr !== coldStr) {
-    for (let i = 0; i < Math.max(warmGen2.length, coldGen2.length); i++) {
-      if (warmGen2[i] !== coldGen2[i]) {
-        console.log(`  First divergence at position ${i}: warm=${warmGen2[i]} cold=${coldGen2[i]}`);
-        break;
-      }
-    }
+    ctx.dispose();
   }
-
-  assert(warmStr === coldStr,
-    warmStr === coldStr
-      ? `Warm==Cold parity: ${warmGen2.length} tokens match`
-      : `Warm==Cold parity FAILED: warm=[${warmStr}] vs cold=[${coldStr}]`);
 }
 
 // ═══════════════════════════════════════════════════════════════════════════
@@ -1076,7 +1027,7 @@ async function main() {
     await testMultiSequence();
     await testGrammar();
     await testBranchPrefill();
-    await testWarmColdParity();
+    await testWarmMultiTurnRecall();
     await testWarmSemanticRecall();
     await testBranchSteer();
     await testNBatchAblation();

From 58d79c971ad40fee578b18208549d7505a5ea90a Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 01:15:18 +1100
Subject: [PATCH 07/13] feat(chat): new chat api - generate until EOG in recall test, raise CI timeouts

---
 .github/workflows/tests.yml |  6 +++---
 test/integration.js         | 36 +++++++++++++++---------------------
 2 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3df363c..9955994 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -80,14 +80,14 @@ jobs:
         run: |
           $env:PATH = "${{ github.workspace }}\build\Release;$env:PATH"
           npm run test:integration
-        timeout-minutes: 5
+        timeout-minutes: 10
         env:
           LLOYAL_LOCAL: '1'
 
       - name: Run integration tests (Unix)
         if: runner.os != 'Windows'
         run: npm run test:integration
-        timeout-minutes: 5
+        timeout-minutes: 10
         env:
           LLOYAL_LOCAL: '1'
 
@@ -181,7 +181,7 @@ jobs:
 
       - name: Run integration tests
         run: npm run test:integration
-        timeout-minutes: 5
+        timeout-minutes: 10
         env:
           LLOYAL_LOCAL: '1'
           MODEL_PATH: models/${{ matrix.model.file }}
diff --git a/test/integration.js b/test/integration.js
index f3bb8c1..a1cfca5 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -395,8 +395,6 @@ async function testBranchPrefill() {
 async function testWarmMultiTurnRecall() {
   console.log('\n--- Warm Multi-Turn Recall ---');
 
-  const GEN_TOKENS = 60;
-
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
     nCtx: 2048,
@@ -407,6 +405,18 @@ async function testWarmMultiTurnRecall() {
   try {
     const sep = ctx.getTurnSeparator();
 
+    // Helper: generate until EOG (matches C++ test pattern)
+    async function generate(branch) {
+      const gen = [];
+      for (;;) {
+        const { token, isStop } = branch.produce();
+        if (isStop) break;
+        branch.commit(token);
+        gen.push(token);
+      }
+      return ctx.detokenize(gen);
+    }
+
     // Helper: warm continuation — sep + format([{system,""},{user,msg}])
     async function warmTurn(branch, userContent) {
       const { prompt } = await ctx.formatChat(JSON.stringify([
@@ -415,16 +425,7 @@ async function testWarmMultiTurnRecall() {
       ]));
       const delta = await ctx.tokenize(prompt, false);
       branch.prefill([...sep, ...delta]);
-
-      const gen = [];
-      for (let i = 0; i < GEN_TOKENS; i++) {
-        const { token, isStop } = branch.produce();
-        if (isStop) break;
-        branch.commit(token);
-        gen.push(token);
-      }
-      const text = await ctx.detokenize(gen);
-      return text;
+      return generate(branch);
     }
 
     // Turn 1 (COLD): introduce name
@@ -436,16 +437,9 @@ async function testWarmMultiTurnRecall() {
     const branch = Branch.create(ctx, 0, promptToks.length, { temperature: 0 });
     branch.captureLogits();
 
-    const gen1 = [];
-    for (let i = 0; i < GEN_TOKENS; i++) {
-      const { token, isStop } = branch.produce();
-      if (isStop) break;
-      branch.commit(token);
-      gen1.push(token);
-    }
-    const turn1 = await ctx.detokenize(gen1);
+    const turn1 = await generate(branch);
     console.log(`  Turn 1: "${turn1.trim().slice(0, 80)}"`);
-    assert(gen1.length > 0, `Turn 1: generated ${gen1.length} tokens`);
+    assert(turn1.length > 0, 'Turn 1: generated response');
 
     // Turn 2 (WARM): introduce favourite food
     const turn2 = await warmTurn(branch, 'My favourite food is pizza');

From 1c42ec6feef655791e348b5876a36e2da6fc82a5 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 01:49:44 +1100
Subject: [PATCH 08/13] feat(ci): move model matrix to GPU

---
 .github/workflows/gpu-test.yml |   4 +-
 .github/workflows/tests.yml    | 103 +--------------------------------
 ci/run-gpu-tests.sh            |  66 ++++++++++++++++++---
 3 files changed, 64 insertions(+), 109 deletions(-)

diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
index faf56a8..2f2d0f2 100644
--- a/.github/workflows/gpu-test.yml
+++ b/.github/workflows/gpu-test.yml
@@ -129,7 +129,7 @@ jobs:
               --image="${IMAGE}" \
               --service-account="${{ secrets.GCP_SA_EMAIL }}" \
               --set-env-vars=LLOYAL_GPU=cuda,LLOYAL_NO_FALLBACK=1 \
-              --task-timeout=10m \
+              --task-timeout=20m \
               --no-gpu-zonal-redundancy
           else
             gcloud run jobs create $JOB_NAME \
@@ -137,7 +137,7 @@ jobs:
               --image="${IMAGE}" \
               --service-account="${{ secrets.GCP_SA_EMAIL }}" \
               --set-env-vars=LLOYAL_GPU=cuda,LLOYAL_NO_FALLBACK=1 \
-              --task-timeout=10m \
+              --task-timeout=20m \
               --gpu=1 \
               --gpu-type=nvidia-l4 \
               --memory=16Gi \
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9955994..a59c1cc 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -41,12 +41,6 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y build-essential cmake
 
-      - name: Install build dependencies (macOS)
-        if: runner.os == 'macOS'
-        run: |
-          # Xcode command line tools already installed on GitHub runners
-          brew install cmake ninja
-
       - name: Install build dependencies (Windows)
         if: runner.os == 'Windows'
         run: |
@@ -117,95 +111,6 @@ jobs:
             build/CMakeFiles/CMakeError.log
           retention-days: 7
 
-  # Model architecture matrix - tests all models on Metal GPU
-  setup-model-matrix:
-    name: Setup Model Matrix
-    runs-on: ubuntu-latest
-    outputs:
-      models: ${{ steps.read-matrix.outputs.models }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Read model matrix
-        id: read-matrix
-        run: |
-          # Extract model names and files from test/matrix.json
-          models=$(jq -c '[.models[] | {name: .name, file: .file}]' test/matrix.json)
-          echo "models=$models" >> $GITHUB_OUTPUT
-          echo "Model matrix: $models"
-
-  test-model-matrix:
-    name: Examples - ${{ matrix.model.name }}
-    runs-on: macos-14  # Metal GPU
-    needs: setup-model-matrix
-
-    strategy:
-      fail-fast: false
-      matrix:
-        model: ${{ fromJson(needs.setup-model-matrix.outputs.models) }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Cache all test models
-        uses: actions/cache@v4
-        with:
-          path: models/
-          key: test-models-all-v1-${{ hashFiles('test/matrix.json') }}
-
-      - name: Download all test models
-        run: bash scripts/download-test-models.sh --all
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: 24
-          cache: 'npm'
-
-      - name: Install build dependencies
-        run: brew install cmake ninja
-
-      - name: Install npm dependencies
-        run: npm install --ignore-scripts
-
-      # Build with CPU-only to avoid paravirtualized Metal GPU bugs
-      # (NeoX RoPE kernel returns zeros on Apple Paravirtual device)
-      - name: Build from submodules
-        run: npm run build
-        env:
-          LLOYAL_GPU: cpu
-
-      - name: Run integration tests
-        run: npm run test:integration
-        timeout-minutes: 10
-        env:
-          LLOYAL_LOCAL: '1'
-          MODEL_PATH: models/${{ matrix.model.file }}
-
-      # TODO: Enable example tests when self-hosted GPU runners are available
-      # GitHub's macos-14 uses paravirtualized Metal (~2.4s/token) which is too
-      # slow for example tests. Integration tests pass but examples timeout.
-      # - name: Run example tests
-      #   run: npm run test:examples
-      #   timeout-minutes: 15
-      #   env:
-      #     LLOYAL_LOCAL: '1'
-      #     MODEL_PATH: models/${{ matrix.model.file }}
-
-      - name: Display test info
-        if: always()
-        run: |
-          echo "================================"
-          echo "Model Matrix Test"
-          echo "================================"
-          echo "Model: ${{ matrix.model.name }}"
-          echo "File: ${{ matrix.model.file }}"
-          echo "Platform: Metal (macos-14)"
-
   verify-npm-package:
     name: Verify npm package contents
     runs-on: ubuntu-latest
@@ -264,7 +169,7 @@ jobs:
   test-summary:
     name: Test Summary
     runs-on: ubuntu-latest
-    needs: [test-build, test-model-matrix, verify-npm-package]
+    needs: [test-build, verify-npm-package]
     if: always()
 
     steps:
@@ -273,12 +178,10 @@ jobs:
           echo "================================"
           echo "Test Results Summary"
           echo "================================"
-          echo "✓ Source builds tested on Linux, macOS, Windows"
+          echo "✓ Source builds tested on Linux and Windows"
           echo "✓ Node.js 24 (Active LTS) compatibility verified"
-          echo "✓ Model architecture matrix tested on Metal"
           echo "✓ npm package contents verified"
           echo "✓ Integration tests passed"
-          echo "✓ Example tests passed"
+          echo "✓ Model matrix tested on GPU (gpu-test.yml)"
           echo ""
           echo "Build & Test Status: ${{ needs.test-build.result }}"
-          echo "Model Matrix Status: ${{ needs.test-model-matrix.result }}"
diff --git a/ci/run-gpu-tests.sh b/ci/run-gpu-tests.sh
index 08a2b14..7cc3209 100755
--- a/ci/run-gpu-tests.sh
+++ b/ci/run-gpu-tests.sh
@@ -36,7 +36,7 @@ fi
 
 echo ""
 echo "=== Downloading Test Models ==="
-./scripts/download-test-models.sh
+./scripts/download-test-models.sh --all
 
 echo ""
 echo "=== Verifying Backend ==="
@@ -55,16 +55,68 @@ try {
 "
 
 echo ""
-echo "=== Running Integration Tests ==="
-LLOYAL_GPU="${GPU_BACKEND}" \
-LLOYAL_NO_FALLBACK=1 \
-node test/integration.js
+echo "=== Running Model Matrix ==="
+
+# Read model list from matrix.json
+MODELS=$(jq -c '.models[]' test/matrix.json)
+
+# Per-model results tracking
+TOTAL=0
+PASS=0
+FAIL=0
+declare -a RESULTS=()
+
+# Don't exit on per-model failure — track results and report at end
+set +e
+
+while IFS= read -r model; do
+  name=$(echo "$model" | jq -r '.name')
+  file=$(echo "$model" | jq -r '.file')
+
+  echo ""
+  echo "══════════════════════════════════════"
+  echo "MODEL: $name ($file)"
+  echo "══════════════════════════════════════"
+
+  TOTAL=$((TOTAL + 1))
+
+  LLOYAL_GPU="${GPU_BACKEND}" \
+  LLOYAL_NO_FALLBACK=1 \
+  MODEL_PATH="models/$file" \
+  node test/integration.js
+
+  if [ $? -eq 0 ]; then
+    RESULTS+=("✅ $name")
+    PASS=$((PASS + 1))
+  else
+    RESULTS+=("❌ $name")
+    FAIL=$((FAIL + 1))
+  fi
+done <<< "$MODELS"
+
+set -e
 
 echo ""
-echo "=== Running Examples ==="
+echo "=== Running Examples (default model) ==="
 LLOYAL_GPU="${GPU_BACKEND}" \
 LLOYAL_NO_FALLBACK=1 \
 node test/examples.js
 
+# Final summary table
+echo ""
+echo "══════════════════════════════════════"
+echo "MODEL MATRIX RESULTS"
+echo "══════════════════════════════════════"
+for r in "${RESULTS[@]}"; do echo "  $r"; done
 echo ""
-echo "=== ✅ GPU Tests Passed ==="
+echo "Total: $PASS passed, $FAIL failed out of $TOTAL models"
+
+if [ $FAIL -eq 0 ]; then
+  echo ""
+  echo "=== ✅ GPU Tests Passed ==="
+  exit 0
+else
+  echo ""
+  echo "=== ❌ GPU Tests Failed ==="
+  exit 1
+fi

From 4c31addf9fc58383911f63030d99b935e7158ca1 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 02:02:16 +1100
Subject: [PATCH 09/13] feat(ci): run GPU tests on pull requests to main

---
 .github/workflows/gpu-test.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
index 2f2d0f2..7d749ee 100644
--- a/.github/workflows/gpu-test.yml
+++ b/.github/workflows/gpu-test.yml
@@ -1,6 +1,16 @@
 name: GPU Tests (CUDA)
 
 on:
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'liblloyal'
+      - 'llama.cpp'
+      - 'lib/**'
+      - 'src/**'
+      - 'test/**'
+      - 'ci/**'
+      - 'CMakeLists.txt'
   workflow_dispatch:
     inputs:
       skip_build:

From 9274f51487457ccdd962c2f4eb327e470d0f8cac Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 02:55:12 +1100
Subject: [PATCH 10/13] feat(ci): stream GPU job logs, run examples per model, make nCtx configurable

---
 .github/workflows/gpu-test.yml | 82 ++++++++++++++++++++++++++++------
 .github/workflows/tests.yml    |  8 +++-
 ci/run-gpu-tests.sh            | 57 ++++++++++++++++++-----
 test/integration.js            | 12 ++---
 4 files changed, 127 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
index 7d749ee..9d6f02c 100644
--- a/.github/workflows/gpu-test.yml
+++ b/.github/workflows/gpu-test.yml
@@ -159,20 +159,74 @@ jobs:
       - name: Run GPU tests
         run: |
           JOB_NAME="lloyal-gpu-test-cuda"
+          REGION="us-east4"
 
-          # Execute job
-          EXECUTION=$(gcloud run jobs execute $JOB_NAME \
-            --region=us-east4 \
-            --wait \
+          # Launch job asynchronously so we can stream logs
+          EXEC=$(gcloud run jobs execute $JOB_NAME \
+            --region=$REGION \
+            --async \
             --format='value(metadata.name)')
 
-          echo "Execution: $EXECUTION"
-
-          # Wait for logs to flush to Cloud Logging
-          sleep 5
-
-          # Get logs
-          gcloud logging read \
-            "resource.type=\"cloud_run_job\" AND resource.labels.job_name=\"$JOB_NAME\" AND resource.labels.location=\"us-east4\"" \
-            --limit=200 \
-            --format='value(textPayload)'
+          echo "Execution: $EXEC"
+          echo "Streaming logs (container startup may take ~30s)..."
+          echo ""
+
+          # Filter for this specific execution's logs
+          LOG_FILTER="resource.type=\"cloud_run_job\" AND resource.labels.job_name=\"$JOB_NAME\" AND labels.\"run.googleapis.com/execution_name\"=\"$EXEC\""
+
+          # Poll loop: stream new log lines + check for completion
+          SEEN=0
+          while true; do
+            # Check if execution has completed
+            COMPLETION=$(gcloud run jobs executions describe "$EXEC" \
+              --region="$REGION" \
+              --format='value(status.completionTime)' 2>/dev/null || true)
+
+            # Fetch all logs for this execution in chronological order
+            LOGS=$(gcloud logging read "$LOG_FILTER" \
+              --limit=10000 \
+              --order=asc \
+              --format='value(textPayload)' 2>/dev/null || true)
+
+            # Print only lines we haven't seen yet
+            if [ -n "$LOGS" ]; then
+              TOTAL=$(echo "$LOGS" | wc -l | tr -d ' ')
+              if [ "$TOTAL" -gt "$SEEN" ]; then
+                echo "$LOGS" | tail -n +$((SEEN + 1))
+                SEEN=$TOTAL
+              fi
+            fi
+
+            # If done, do one final fetch for stragglers then break
+            if [ -n "$COMPLETION" ]; then
+              sleep 5
+              LOGS=$(gcloud logging read "$LOG_FILTER" \
+                --limit=10000 \
+                --order=asc \
+                --format='value(textPayload)' 2>/dev/null || true)
+              if [ -n "$LOGS" ]; then
+                TOTAL=$(echo "$LOGS" | wc -l | tr -d ' ')
+                if [ "$TOTAL" -gt "$SEEN" ]; then
+                  echo "$LOGS" | tail -n +$((SEEN + 1))
+                fi
+              fi
+              break
+            fi
+
+            sleep 10
+          done
+
+          # Determine pass/fail from execution status
+          SUCCEEDED=$(gcloud run jobs executions describe "$EXEC" \
+            --region="$REGION" \
+            --format=json 2>/dev/null | \
+            jq -r '.status.conditions[] | select(.type == "Completed") | .status')
+
+          if [ "$SUCCEEDED" = "True" ]; then
+            echo ""
+            echo "✅ GPU Tests Passed"
+          else
+            echo ""
+            echo "❌ GPU Tests Failed"
+            exit 1
+          fi
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index a59c1cc..6861128 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, windows-latest]
+        os: [ubuntu-latest, macos-14, windows-latest]
 
     steps:
       - name: Checkout code
@@ -41,6 +41,10 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y build-essential cmake
 
+      - name: Install build dependencies (macOS)
+        if: runner.os == 'macOS'
+        run: brew install cmake
+
       - name: Install build dependencies (Windows)
         if: runner.os == 'Windows'
         run: |
@@ -178,7 +182,7 @@ jobs:
           echo "================================"
           echo "Test Results Summary"
           echo "================================"
-          echo "✓ Source builds tested on Linux and Windows"
+          echo "✓ Source builds tested on Linux, macOS, and Windows"
           echo "✓ Node.js 24 (Active LTS) compatibility verified"
           echo "✓ npm package contents verified"
           echo "✓ Integration tests passed"
diff --git a/ci/run-gpu-tests.sh b/ci/run-gpu-tests.sh
index 7cc3209..92a070c 100755
--- a/ci/run-gpu-tests.sh
+++ b/ci/run-gpu-tests.sh
@@ -54,8 +54,13 @@ try {
 }
 "
 
+# Common env for all test runs
+export LLOYAL_GPU="${GPU_BACKEND}"
+export LLOYAL_NO_FALLBACK=1
+export LLAMA_CTX_SIZE=4096
+
 echo ""
-echo "=== Running Model Matrix ==="
+echo "=== Running Model Matrix (nCtx=${LLAMA_CTX_SIZE}) ==="
 
 # Read model list from matrix.json
 MODELS=$(jq -c '.models[]' test/matrix.json)
@@ -65,6 +70,7 @@ TOTAL=0
 PASS=0
 FAIL=0
 declare -a RESULTS=()
+declare -a FAIL_DETAILS=()
 
 # Don't exit on per-model failure — track results and report at end
 set +e
@@ -79,29 +85,47 @@ while IFS= read -r model; do
   echo "══════════════════════════════════════"
 
   TOTAL=$((TOTAL + 1))
+  MODEL_LOG=$(mktemp)
+  MODEL_FAILED=false
+
+  # --- Integration tests ---
+  echo "── Integration Tests ──"
+  MODEL_PATH="models/$file" \
+  node test/integration.js 2>&1 | tee "$MODEL_LOG"
+  INT_EXIT=${PIPESTATUS[0]}
+
+  if [ $INT_EXIT -ne 0 ]; then
+    MODEL_FAILED=true
+  fi
 
-  LLOYAL_GPU="${GPU_BACKEND}" \
-  LLOYAL_NO_FALLBACK=1 \
+  # --- Example tests ---
+  echo ""
+  echo "── Example Tests ──"
   MODEL_PATH="models/$file" \
-  node test/integration.js
+  node test/examples.js 2>&1 | tee -a "$MODEL_LOG"
+  EX_EXIT=${PIPESTATUS[0]}
+
+  if [ $EX_EXIT -ne 0 ]; then
+    MODEL_FAILED=true
+  fi
 
-  if [ $? -eq 0 ]; then
+  # Per-model summary
+  if [ "$MODEL_FAILED" = false ]; then
     RESULTS+=("✅ $name")
     PASS=$((PASS + 1))
   else
     RESULTS+=("❌ $name")
     FAIL=$((FAIL + 1))
+    # Extract failure lines for the final summary
+    FAILURES=$(grep -E '\[FAIL\]|❌ FAILED|Assertion failed|Fatal error' "$MODEL_LOG" | head -10)
+    FAIL_DETAILS+=("── $name ──"$'\n'"$FAILURES")
   fi
+
+  rm -f "$MODEL_LOG"
 done <<< "$MODELS"
 
 set -e
 
-echo ""
-echo "=== Running Examples (default model) ==="
-LLOYAL_GPU="${GPU_BACKEND}" \
-LLOYAL_NO_FALLBACK=1 \
-node test/examples.js
-
 # Final summary table
 echo ""
 echo "══════════════════════════════════════"
@@ -111,6 +135,17 @@ for r in "${RESULTS[@]}"; do echo "  $r"; done
 echo ""
 echo "Total: $PASS passed, $FAIL failed out of $TOTAL models"
 
+if [ $FAIL -gt 0 ] && [ ${#FAIL_DETAILS[@]} -gt 0 ]; then
+  echo ""
+  echo "══════════════════════════════════════"
+  echo "FAILURE DETAILS"
+  echo "══════════════════════════════════════"
+  for d in "${FAIL_DETAILS[@]}"; do
+    echo "$d"
+    echo ""
+  done
+fi
+
 if [ $FAIL -eq 0 ]; then
   echo ""
   echo "=== ✅ GPU Tests Passed ==="
diff --git a/test/integration.js b/test/integration.js
index a1cfca5..382cce1 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -23,6 +23,8 @@ const EMBED_MODEL_PATH = process.env.LLAMA_EMBED_MODEL ||
     ? path.join(__dirname, '../models/nomic-embed-text-v1.5.Q4_K_M.gguf')
     : null);
 
+const CTX_SIZE = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
+
 if (!fs.existsSync(MODEL_PATH)) {
   console.error('Test model not found:', MODEL_PATH);
   process.exit(1);
@@ -314,7 +316,7 @@ async function testBranchPrefill() {
 
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
-    nCtx: 2048,
+    nCtx: CTX_SIZE,
     nBatch: 512,
     nThreads: 4
   });
@@ -397,7 +399,7 @@ async function testWarmMultiTurnRecall() {
 
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
-    nCtx: 2048,
+    nCtx: CTX_SIZE,
     nBatch: 512,
     nThreads: 4
   });
@@ -494,7 +496,7 @@ async function testWarmSemanticRecall() {
   {
     const ctx = await addon.createContext({
       modelPath: MODEL_PATH,
-      nCtx: 2048,
+      nCtx: CTX_SIZE,
       nBatch: 512,
       nThreads: 4
     });
@@ -1005,10 +1007,10 @@ async function main() {
     // Create main context for reusable tests
     mainCtx = await addon.createContext({
       modelPath: MODEL_PATH,
-      nCtx: 512,
+      nCtx: CTX_SIZE,
       nThreads: 4
     });
-    ok(`createContext() → vocabSize=${mainCtx.vocabSize}`);
+    ok(`createContext(nCtx=${CTX_SIZE}) → vocabSize=${mainCtx.vocabSize}`);
 
     // Run test suites
     await testCoreAPI(mainCtx);

From 2ee840d457e8abda91ef8e686c8b6e101d030331 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 11:14:53 +1100
Subject: [PATCH 11/13] feat(ci): fix tests, update workflow

---
 .github/actions/provision-cuda/action.yaml | 59 +++++++++++++++-------
 .github/workflows/gpu-test.yml             | 17 ++++---
 .gitignore                                 |  1 +
 examples/best-of-n/best-of-n.mjs           |  3 +-
 examples/chat/chat.mjs                     |  3 +-
 examples/entropy/entropy.mjs               |  3 +-
 examples/grammar/grammar.mjs               |  3 +-
 examples/speculative/speculative.mjs       |  3 +-
 examples/streaming/streaming-summary.mjs   | 10 ++--
 examples/streaming/streaming-tsampler.mjs  |  4 +-
 examples/streaming/streaming.mjs           |  4 +-
 src/SessionContext.cpp                     |  4 +-
 test/integration.js                        | 30 +++++++----
 13 files changed, 95 insertions(+), 49 deletions(-)

diff --git a/.github/actions/provision-cuda/action.yaml b/.github/actions/provision-cuda/action.yaml
index 22938f5..3cfec0a 100644
--- a/.github/actions/provision-cuda/action.yaml
+++ b/.github/actions/provision-cuda/action.yaml
@@ -28,9 +28,36 @@ runs:
         method: 'network'
         sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "visual_studio_integration"]'
 
-    # Linux x64: Install from NVIDIA repos
-    - name: Install CUDA (Linux x64)
-      if: runner.os == 'Linux' && inputs.arch == 'x64'
+    # Linux: Compute version info for cache key
+    - name: Compute CUDA version info
+      id: cuda-info
+      if: runner.os == 'Linux'
+      shell: bash
+      env:
+        VERSION: ${{ inputs.version }}
+      run: |
+        echo "major-minor=$(echo $VERSION | cut -d. -f1,2)" >> $GITHUB_OUTPUT
+
+    # Linux: Cache CUDA toolkit directory (~2-3 GB, saves 3-5 min install)
+    - name: Cache CUDA toolkit
+      if: runner.os == 'Linux'
+      id: cuda-cache
+      uses: actions/cache@v4
+      with:
+        path: /usr/local/cuda-${{ steps.cuda-info.outputs.major-minor }}
+        key: cuda-toolkit-${{ inputs.version }}-${{ inputs.arch }}
+
+    # Linux: Install build dependencies (always needed, fast from apt cache)
+    - name: Install build tools
+      if: runner.os == 'Linux'
+      shell: bash
+      run: |
+        sudo apt-get update -qq
+        sudo apt-get install -y -qq build-essential cmake
+
+    # Linux x64: Install CUDA toolkit (cache miss only)
+    - name: Install CUDA toolkit (Linux x64)
+      if: runner.os == 'Linux' && inputs.arch == 'x64' && steps.cuda-cache.outputs.cache-hit != 'true'
       shell: bash
       env:
         VERSION: ${{ inputs.version }}
@@ -42,18 +69,11 @@ runs:
         wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
         sudo dpkg -i cuda-keyring_1.1-1_all.deb
         sudo apt-get update -qq
-        sudo apt-get install -y -qq cuda-toolkit-${version_slug} build-essential cmake
+        sudo apt-get install -y -qq cuda-toolkit-${version_slug}
 
-        cuda_path="/usr/local/cuda-${version_major_minor}"
-        echo "CUDA_PATH=${cuda_path}" >> $GITHUB_ENV
-        echo "${cuda_path}/bin" >> $GITHUB_PATH
-        echo "LD_LIBRARY_PATH=${cuda_path}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
-
-        echo "CUDA installed at: ${cuda_path}"
-
-    # Linux ARM64: Install from NVIDIA repos
-    - name: Install CUDA (Linux ARM64)
-      if: runner.os == 'Linux' && inputs.arch == 'arm64'
+    # Linux ARM64: Install CUDA toolkit (cache miss only)
+    - name: Install CUDA toolkit (Linux ARM64)
+      if: runner.os == 'Linux' && inputs.arch == 'arm64' && steps.cuda-cache.outputs.cache-hit != 'true'
       shell: bash
       env:
         VERSION: ${{ inputs.version }}
@@ -65,14 +85,19 @@ runs:
         wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
         sudo dpkg -i cuda-keyring_1.1-1_all.deb
         sudo apt-get update -qq
-        sudo apt-get install -y -qq cuda-toolkit-${version_slug} build-essential cmake
+        sudo apt-get install -y -qq cuda-toolkit-${version_slug}
 
-        cuda_path="/usr/local/cuda-${version_major_minor}"
+    # Linux: Set CUDA environment variables (always - cached or fresh install)
+    - name: Set CUDA environment
+      if: runner.os == 'Linux'
+      shell: bash
+      run: |
+        cuda_path="/usr/local/cuda-${{ steps.cuda-info.outputs.major-minor }}"
         echo "CUDA_PATH=${cuda_path}" >> $GITHUB_ENV
         echo "${cuda_path}/bin" >> $GITHUB_PATH
         echo "LD_LIBRARY_PATH=${cuda_path}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
 
-        echo "CUDA installed at: ${cuda_path}"
+        echo "CUDA ready at: ${cuda_path}"
 
     # Set output
     - name: Set CUDA path output
diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
index 9d6f02c..2057193 100644
--- a/.github/workflows/gpu-test.yml
+++ b/.github/workflows/gpu-test.yml
@@ -48,25 +48,29 @@ jobs:
         run: node scripts/sync-llama-cpp.js --check
         shell: bash
 
-      - name: Install build tools
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential cmake
-
       # CUDA 12.2.2 required for Cloud Run L4 GPU (driver 535.x)
+      # provision-cuda also installs build-essential + cmake
       - name: Provision CUDA toolkit
         uses: ./.github/actions/provision-cuda
         with:
           version: '12.2.2'
           arch: x64
 
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: cuda-build-${{ runner.os }}
+
       - name: Install npm dependencies
-        run: npm install --ignore-scripts
+        run: npm ci --ignore-scripts
 
       - name: Build native module
         run: npm run build
         env:
           LLOYAL_GPU: cuda
+          CMAKE_C_COMPILER_LAUNCHER: ccache
+          CMAKE_CXX_COMPILER_LAUNCHER: ccache
+          CMAKE_CUDA_COMPILER_LAUNCHER: ccache
 
       - name: Create platform package
         run: node scripts/create-platform-package.js linux-x64-cuda ubuntu-22.04 x64
@@ -77,6 +81,7 @@ jobs:
           name: package-linux-x64-cuda
           path: packages/linux-x64-cuda/
           retention-days: 1
+          compression-level: 0
 
   # GPU Integration Tests via Cloud Run
   # Runs real GPU tests on NVIDIA L4
diff --git a/.gitignore b/.gitignore
index 95b20dd..1f2f1b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ models/
 # Generated documentation
 docs/api/
 docs/_internal
+
 # Vendor build artifacts (generated during npm install)
 vendor/llama.cpp/build-*/
 
diff --git a/examples/best-of-n/best-of-n.mjs b/examples/best-of-n/best-of-n.mjs
index 8fe5dc9..26daa68 100644
--- a/examples/best-of-n/best-of-n.mjs
+++ b/examples/best-of-n/best-of-n.mjs
@@ -89,9 +89,10 @@ async function main() {
 
   emit('start', { model: path.basename(modelPath), n: N, maxTokens: MAX_TOKENS, highTemp: HIGH_TEMP, lowTemp: LOW_TEMP });
 
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
   const ctx = await createContext({
     modelPath,
-    contextSize: 2048,
+    nCtx,
     nSeqMax: N + 2, // Need slots for N candidates + baseline + trunk
   });
 
diff --git a/examples/chat/chat.mjs b/examples/chat/chat.mjs
index f86b0e1..05196dc 100644
--- a/examples/chat/chat.mjs
+++ b/examples/chat/chat.mjs
@@ -30,9 +30,10 @@ async function main() {
   console.log(`Loading model: ${modelPath}`);
   console.log("This may take a moment...\n");
 
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
   const ctx = await createContext({
     modelPath,
-    contextSize: 2048,
+    nCtx,
     threads: 4,
   });
 
diff --git a/examples/entropy/entropy.mjs b/examples/entropy/entropy.mjs
index bfbce73..c9204fe 100644
--- a/examples/entropy/entropy.mjs
+++ b/examples/entropy/entropy.mjs
@@ -160,9 +160,10 @@ async function main() {
 
   emit('start', { model: path.basename(modelPath), T0, N, THETA });
 
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
   const ctx = await createContext({
     modelPath,
-    contextSize: 2048,
+    nCtx,
   });
 
   // Test 1: Factual question (expect low entropy, EDT should use low temp)
diff --git a/examples/grammar/grammar.mjs b/examples/grammar/grammar.mjs
index c32be1f..33d4fc3 100644
--- a/examples/grammar/grammar.mjs
+++ b/examples/grammar/grammar.mjs
@@ -60,9 +60,10 @@ async function main() {
 
   emit('start', { model: path.basename(modelPath) });
 
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
   const ctx = await createContext({
     modelPath,
-    contextSize: 2048,
+    nCtx,
     nSeqMax: 4,
   });
 
diff --git a/examples/speculative/speculative.mjs b/examples/speculative/speculative.mjs
index ccb44a1..f1b8261 100644
--- a/examples/speculative/speculative.mjs
+++ b/examples/speculative/speculative.mjs
@@ -92,9 +92,10 @@ async function main() {
     generationLength: GENERATION_LENGTH,
   });
 
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
   const ctx = await createContext({
     modelPath,
-    contextSize: 2048,
+    nCtx,
     nSeqMax: 4, // Enable multi-sequence for fork/verify pattern
   });
 
diff --git a/examples/streaming/streaming-summary.mjs b/examples/streaming/streaming-summary.mjs
index 066d1a8..6de9f56 100644
--- a/examples/streaming/streaming-summary.mjs
+++ b/examples/streaming/streaming-summary.mjs
@@ -193,7 +193,7 @@ function buildProgressSink(anchor, outline, allGeneratedText, summaryChain) {
 
 async function main() {
   // Constants
-  const nCtx = 2048;
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
   const TAIL_SIZE = 256;
   const MAX_SINK_RATIO = 0.4;
   const MAX_SINK_TOKENS = Math.floor(nCtx * MAX_SINK_RATIO);
@@ -218,7 +218,7 @@ async function main() {
 
   const ctx = await createContext({
     modelPath,
-    contextSize: nCtx,
+    nCtx,
   });
 
   // Summary sidecar — preload in background (overlaps with prompt decode + generation)
@@ -232,20 +232,20 @@ async function main() {
     // Sidecar mode: use slim-summarize.gguf
     const summaryModelAvailable = fs.existsSync(SUMMARY_MODEL);
     if (summaryModelAvailable) {
-      summaryCtxPromise = createContext({ modelPath: SUMMARY_MODEL, contextSize: 4096 });
+      summaryCtxPromise = createContext({ modelPath: SUMMARY_MODEL, nCtx: 4096 });
     } else {
       if (!jsonlMode) {
         console.log('Sidecar model not found - falling back to self-summary');
       }
       emit('sidecar_missing', { message: 'slim-summarize.gguf not found, using self-summary' });
       // Fall back to self mode
-      summaryCtxPromise = createContext({ modelPath, contextSize: 4096 });
+      summaryCtxPromise = createContext({ modelPath, nCtx: 4096 });
       actualSummaryFormat = 'self';
     }
   } else {
     // Self mode (default): second context from same model
     // Weights are shared via model_registry — only KV cache is duplicated
-    summaryCtxPromise = createContext({ modelPath, contextSize: 4096 });
+    summaryCtxPromise = createContext({ modelPath, nCtx: 4096 });
   }
 
   const prompt = `Write a comprehensive guide to machine learning, covering the following topics in extreme detail with examples, code snippets, and mathematical formulas:
diff --git a/examples/streaming/streaming-tsampler.mjs b/examples/streaming/streaming-tsampler.mjs
index 09c052f..4fd3207 100644
--- a/examples/streaming/streaming-tsampler.mjs
+++ b/examples/streaming/streaming-tsampler.mjs
@@ -133,7 +133,7 @@ class NgramTracker {
 
 async function main() {
   // BlinkKV parameters
-  const nCtx = 2048;
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
   const TAIL_SIZE = 256;
   const NGRAM_SIZE = 6; // Track 6-grams for sequence detection
   const BLOCK_THRESHOLD = 2; // Only block after seeing same pattern K times
@@ -146,7 +146,7 @@ async function main() {
 
   const ctx = await createContext({
     modelPath,
-    contextSize: nCtx,
+    nCtx,
   });
 
   const prompt = `Write a comprehensive guide to machine learning, covering the following topics in extreme detail with examples, code snippets, and mathematical formulas:
diff --git a/examples/streaming/streaming.mjs b/examples/streaming/streaming.mjs
index 1c0dc2a..4bc52e8 100644
--- a/examples/streaming/streaming.mjs
+++ b/examples/streaming/streaming.mjs
@@ -42,7 +42,7 @@ function emit(event, data) {
 
 async function main() {
   // BlinkKV paper parameters: 2048 context, 4 sinks, 256 tail
-  const nCtx = 2048;
+  const nCtx = parseInt(process.env.LLAMA_CTX_SIZE || '2048', 10);
   const SINK_COUNT = 4;
   const TAIL_SIZE = 256;
 
@@ -54,7 +54,7 @@ async function main() {
 
   const ctx = await createContext({
     modelPath,
-    contextSize: nCtx,
+    nCtx,
   });
 
   const prompt = `Write a comprehensive guide to machine learning, covering the following topics in extreme detail with examples, code snippets, and mathematical formulas:
diff --git a/src/SessionContext.cpp b/src/SessionContext.cpp
index e32ae19..9e9aada 100644
--- a/src/SessionContext.cpp
+++ b/src/SessionContext.cpp
@@ -2102,9 +2102,7 @@ Napi::Value CreateContext(const Napi::CallbackInfo& info) {
   std::cout << "[CreateContext] File validated: " << fsPath << " (" << fileSize << " bytes)" << std::endl;
 
   // Load model on main thread
-  // Note: With XCFramework build, this works reliably on main thread
-  // (async loading was failing with CMake build due to binary incompatibility)
-  std::cout << "[CreateContext] Loading model from XCFramework..." << std::endl;
+  std::cout << "[CreateContext] Loading model..." << std::endl;
 
   llama_model_params model_params = llama_model_default_params();
   // -1 = offload all layers to GPU (auto-detect), 0 = CPU only
diff --git a/test/integration.js b/test/integration.js
index 382cce1..4580447 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -166,7 +166,7 @@ async function testMultiSequence() {
 
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
-    nCtx: 512,
+    nCtx: CTX_SIZE,
     nThreads: 4,
     nSeqMax: 4
   });
@@ -201,7 +201,7 @@ async function testGrammar() {
 
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
-    nCtx: 512,
+    nCtx: CTX_SIZE,
     nThreads: 4
   });
 
@@ -432,13 +432,25 @@ async function testWarmMultiTurnRecall() {
 
     // Turn 1 (COLD): introduce name
     const msgs1 = [{ role: 'user', content: 'Hi, my name is Lloyal' }];
-    const { prompt } = await ctx.formatChat(JSON.stringify(msgs1));
+    const { prompt, format, reasoningFormat } = await ctx.formatChat(JSON.stringify(msgs1), {});
     const promptToks = await ctx.tokenize(prompt);
     await ctx.decode(promptToks, 0, 0);
 
     const branch = Branch.create(ctx, 0, promptToks.length, { temperature: 0 });
     branch.captureLogits();
 
+    // Helper: parse output and check both content and reasoning for a term
+    // Handles thinking models (Qwen3, DeepSeek-R1) where answer may be in <think> blocks
+    function checkRecall(rawText, term) {
+      const { content, reasoningContent } = ctx.parseChatOutput(rawText, format, {
+        reasoningFormat,
+        isPartial: false,
+        thinkingForcedOpen: false
+      });
+      const fullText = (content || '') + ' ' + (reasoningContent || '');
+      return fullText.toLowerCase().includes(term.toLowerCase());
+    }
+
     const turn1 = await generate(branch);
     console.log(`  Turn 1: "${turn1.trim().slice(0, 80)}"`);
     assert(turn1.length > 0, 'Turn 1: generated response');
@@ -451,13 +463,13 @@ async function testWarmMultiTurnRecall() {
     // Turn 3 (WARM): recall name
     const turn3 = await warmTurn(branch, 'Do you remember my name?');
     console.log(`  Turn 3 (name recall): "${turn3.trim().slice(0, 80)}"`);
-    const nameRecalled = turn3.toLowerCase().includes('lloyal');
+    const nameRecalled = checkRecall(turn3, 'lloyal');
     assert(nameRecalled, `Name recall: ${nameRecalled ? 'found "Lloyal"' : 'MISSING "Lloyal" in: ' + turn3.trim().slice(0, 120)}`);
 
     // Turn 4 (WARM): recall food
     const turn4 = await warmTurn(branch, 'Do you remember my favourite food?');
     console.log(`  Turn 4 (food recall): "${turn4.trim().slice(0, 80)}"`);
-    const foodRecalled = turn4.toLowerCase().includes('pizza');
+    const foodRecalled = checkRecall(turn4, 'pizza');
     assert(foodRecalled, `Food recall: ${foodRecalled ? 'found "pizza"' : 'MISSING "pizza" in: ' + turn4.trim().slice(0, 120)}`);
 
     branch.prune();
@@ -608,7 +620,7 @@ async function testBranchSteer() {
 
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
-    nCtx: 512,
+    nCtx: CTX_SIZE,
     nThreads: 4
   });
 
@@ -712,7 +724,7 @@ async function testNBatchAblation() {
   for (const nBatch of nBatchValues) {
     const ctx = await addon.createContext({
       modelPath: MODEL_PATH,
-      nCtx: 1024,
+      nCtx: CTX_SIZE,
       nBatch,
       nThreads: 4
     });
@@ -801,7 +813,7 @@ async function testDeterminism() {
   async function generate(prompt) {
     const ctx = await addon.createContext({
       modelPath: MODEL_PATH,
-      nCtx: 512,
+      nCtx: CTX_SIZE,
       nThreads: 4
     });
 
@@ -912,7 +924,7 @@ async function testDecodeAndCapture() {
 
   const ctx = await addon.createContext({
     modelPath: MODEL_PATH,
-    nCtx: 512,
+    nCtx: CTX_SIZE,
     nThreads: 4
   });
 

From 223020bb99c49623d64865d8f8847fabcbb1a214 Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 11:38:25 +1100
Subject: [PATCH 12/13] feat(ci): fix tests, update workflow

---
 package-lock.json | 156 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 143 insertions(+), 13 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 3a9b00f..acc8e73 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -93,43 +93,173 @@
       }
     },
     "node_modules/@lloyal-labs/lloyal.node-darwin-arm64": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-arm64/-/lloyal.node-darwin-arm64-1.1.3.tgz",
+      "integrity": "sha512-/FLmWFA9mO4YaTrOGOL4AdEeRGCON1cqJPXEoWaHM+vn32x3u8D2tMFaRbAD8hd0JdWDFajhG720b/+G0cI7hw==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-darwin-x64": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-darwin-x64/-/lloyal.node-darwin-x64-1.1.3.tgz",
+      "integrity": "sha512-r1TaiIejrZMPNXTWwUEsZLd4vvT9l95Wb18BKSMlBOombkZmeDHmg0C6MzXKRCml5obwqKhtzRSUk5okj7Y8HQ==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "darwin"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64/-/lloyal.node-linux-arm64-1.1.3.tgz",
+      "integrity": "sha512-ClaLJMEZrXFM9PgFMDViXVZyI0ekNhPtTzjCwbFmGgzeWLiTHg+r7MkgU+JGxClytIiLskxZtHtMTpys6LlzVA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64-cuda": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64-cuda/-/lloyal.node-linux-arm64-cuda-1.1.3.tgz",
+      "integrity": "sha512-6qWRPANF5qtX0tsLqvhBIT0veSz2San7b4NlCyNoEQjp5or9dp2s+54Hq+9ShXe9GO4VusaTrSbKn4Ndm+ngbA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-arm64-vulkan": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-arm64-vulkan/-/lloyal.node-linux-arm64-vulkan-1.1.3.tgz",
+      "integrity": "sha512-LKXy1iEOs3LdvSOLv20/qHOZEj1A8G4VFVcC9E+HcnPu76wNeS5PNsu3Tdg2pHUUvl1R6doGAYS6oSQvkNmyaA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64/-/lloyal.node-linux-x64-1.1.3.tgz",
+      "integrity": "sha512-8nb6Wa5gX+lEjFElzIBNy1Lh52+/u/7u90vUJeG/RUffTR3r0A9vZ+lVLgIEH2TkFi7zuqz1mM9YXyzY1plgiw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64-cuda": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64-cuda/-/lloyal.node-linux-x64-cuda-1.1.3.tgz",
+      "integrity": "sha512-y+e+usmeHVX5UVJB6HOjG4vE1xVJm6lQyMFyP16kJnYJTjlTjRzFFzipCo30lHPwozPNpWogWGzJtsMwQauOHw==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-linux-x64-vulkan": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-linux-x64-vulkan/-/lloyal.node-linux-x64-vulkan-1.1.3.tgz",
+      "integrity": "sha512-S57IeSMw/yLiC+vLiUxnc2PRWA8+Df3k9G8eVJyEIEurm88aZJ3wywhaNFm7dWLKriilLzwCEboIncxZu1F7yA==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-arm64": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-arm64/-/lloyal.node-win32-arm64-1.1.3.tgz",
+      "integrity": "sha512-faMP7p4LyTlZR1bzLkPZuagH55LrInqRS0tzTgaL80b/mF1KULUKqfO9C5wmpXW1BxMHWsv1LPrvA1VIraHvYg==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-arm64-vulkan": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-arm64-vulkan/-/lloyal.node-win32-arm64-vulkan-1.1.3.tgz",
+      "integrity": "sha512-zxFl5c6RP3Ke62wSBrYJAx7A4cMDQji8gBa8iYAXvUiXfdJQRULIN5C0NdflKCJBYgwvIPi5Mn6lUHzQNRtTZA==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64/-/lloyal.node-win32-x64-1.1.3.tgz",
+      "integrity": "sha512-sJHp8/oxwG1nwMRfAh5htSTkXfzZXHjKBljlbSQUCEkbNWhqwVp1Wz2V5C7uB6Um5NkizHTBNOT+JNa5wUNh7Q==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64-cuda": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64-cuda/-/lloyal.node-win32-x64-cuda-1.1.3.tgz",
+      "integrity": "sha512-UsUZn3fT2Hbg5atjX00DBeNQp9/V2l0XkYA4xdqjyefcZ7QyYi5ShG59wDEkW88okzShWEQ3494jYNgXrNGz+w==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/lloyal.node-win32-x64-vulkan": {
-      "optional": true
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@lloyal-labs/lloyal.node-win32-x64-vulkan/-/lloyal.node-win32-x64-vulkan-1.1.3.tgz",
+      "integrity": "sha512-uNc7gUQhZJK7EjcAsNoFQMWjsRW0iq422xF89uK5K6k8EMinnvkqkSDBX0LAZUHXB5TlP4rPy+1SZCMITMFE4g==",
+      "cpu": [
+        "x64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "win32"
+      ]
     },
     "node_modules/@lloyal-labs/tsampler": {
       "version": "0.2.0",

From 98f54d3ecc8b7b8cb0bd72e25130571879c24e8e Mon Sep 17 00:00:00 2001
From: LLoyal Research <research@lloyal.ai>
Date: Thu, 12 Feb 2026 14:57:14 +1100
Subject: [PATCH 13/13] feat(ci): fix tests, update workflow

---
 ci/run-gpu-tests.sh |  4 ++--
 lib/index.js        | 20 ++++++++++----------
 test/examples.js    |  6 +++---
 test/integration.js | 30 ++++++++++++++----------------
 test/matrix.json    | 18 +++++++++---------
 5 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/ci/run-gpu-tests.sh b/ci/run-gpu-tests.sh
index 92a070c..8a6df8f 100755
--- a/ci/run-gpu-tests.sh
+++ b/ci/run-gpu-tests.sh
@@ -90,7 +90,7 @@ while IFS= read -r model; do
 
   # --- Integration tests ---
   echo "── Integration Tests ──"
-  MODEL_PATH="models/$file" \
+  LLAMA_TEST_MODEL="models/$file" \
   node test/integration.js 2>&1 | tee "$MODEL_LOG"
   INT_EXIT=${PIPESTATUS[0]}
 
@@ -101,7 +101,7 @@ while IFS= read -r model; do
   # --- Example tests ---
   echo ""
   echo "── Example Tests ──"
-  MODEL_PATH="models/$file" \
+  LLAMA_TEST_MODEL="models/$file" \
   node test/examples.js 2>&1 | tee -a "$MODEL_LOG"
   EX_EXIT=${PIPESTATUS[0]}
 
diff --git a/lib/index.js b/lib/index.js
index 8fd81c5..d01130e 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -97,10 +97,10 @@ const tryLoadPackage = (packageName, verbose = false) => {
  * - Uses local build exclusively (`build/Release/lloyal.node`)
  * - Throws error if not found (no fallback)
  *
- * Otherwise (production):
+ * Otherwise:
  * 1. Requested GPU variant package (if `variant` param or `LLOYAL_GPU` env var specified)
- * 2. Default platform package (`@lloyal-labs/lloyal.node-{platform}-{arch}`)
- * 3. Local build as final fallback
+ * 2. Local build (`build/Release/lloyal.node`) — preferred when present; usually newer than an installed package during development
+ * 3. Default platform package (`@lloyal-labs/lloyal.node-{platform}-{arch}`)
  *
  * **Environment Variables:**
  * - `LLOYAL_LOCAL=1` — Use local build exclusively (`build/Release/lloyal.node`).
@@ -173,18 +173,18 @@ const loadBinary = (variant) => {
     console.warn(`[lloyal.node] GPU variant "${variant}" unavailable, falling back to CPU`);
   }
 
-  // 2. Try default platform package (CPU)
-  const defaultPkg = getPlatformPackageName();
-  const binary = tryLoadPackage(defaultPkg, true); // verbose=true
-  if (binary) return binary;
-
-  // 3. Try local build (development)
+  // 2. Try local build (preferred when present; usually newer than installed packages during development)
   try {
     return require('../build/Release/lloyal.node');
   } catch (e) {
-    // ignore
+    // ignore — no local build
   }
 
+  // 3. Try default platform package (CPU)
+  const defaultPkg = getPlatformPackageName();
+  const binary = tryLoadPackage(defaultPkg, true); // verbose=true
+  if (binary) return binary;
+
   throw new Error(
     `No lloyal.node binary found for ${process.platform}-${process.arch}. ` +
     `Tried: ${variant ? getPlatformPackageName(variant) + ', ' : ''}${defaultPkg}`
diff --git a/test/examples.js b/test/examples.js
index 48cb63c..aedd9c1 100644
--- a/test/examples.js
+++ b/test/examples.js
@@ -9,7 +9,7 @@
  *   node test/examples.js entropy   # Run specific example
  *
  * Environment variables:
- *   MODEL_PATH       - Path to chat/instruct model (default: SmolLM2)
+ *   LLAMA_TEST_MODEL - Path to chat/instruct model (default: SmolLM2)
  *   EMBED_MODEL_PATH - Path to embedding model (default: nomic-embed)
  */
 
@@ -18,8 +18,8 @@ const path = require('path');
 const fs = require('fs');
 
 // Model paths - use env var or default (resolve to absolute path)
-const MODEL_PATH = process.env.MODEL_PATH
-  ? path.resolve(process.env.MODEL_PATH)
+const MODEL_PATH = process.env.LLAMA_TEST_MODEL
+  ? path.resolve(process.env.LLAMA_TEST_MODEL)
   : path.join(__dirname, '../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf');
 
 // Embedding model (separate from chat model, resolve to absolute path)
diff --git a/test/integration.js b/test/integration.js
index 4580447..8ab8edd 100644
--- a/test/integration.js
+++ b/test/integration.js
@@ -6,7 +6,7 @@
  *
  * Usage:
  *   npm run test:integration
- *   MODEL_PATH=models/Llama-3.2-1B-Instruct-Q4_K_M.gguf npm run test:integration
+ *   LLAMA_TEST_MODEL=models/Llama-3.2-1B-Instruct-Q4_K_M.gguf npm run test:integration
  *
  * Optional embedding tests:
  *   LLAMA_EMBED_MODEL=models/nomic-embed-text-v1.5.Q4_K_M.gguf npm run test:integration
@@ -15,8 +15,8 @@
 const path = require('path');
 const fs = require('fs');
 
-const MODEL_PATH = process.env.MODEL_PATH
-  ? path.resolve(process.env.MODEL_PATH)
+const MODEL_PATH = process.env.LLAMA_TEST_MODEL
+  ? path.resolve(process.env.LLAMA_TEST_MODEL)
   : path.join(__dirname, '../models/SmolLM2-1.7B-Instruct-Q4_K_M.gguf');
 const EMBED_MODEL_PATH = process.env.LLAMA_EMBED_MODEL ||
   (fs.existsSync(path.join(__dirname, '../models/nomic-embed-text-v1.5.Q4_K_M.gguf'))
@@ -424,7 +424,7 @@ async function testWarmMultiTurnRecall() {
       const { prompt } = await ctx.formatChat(JSON.stringify([
         { role: 'system', content: '' },
         { role: 'user', content: userContent }
-      ]));
+      ]), {});
       const delta = await ctx.tokenize(prompt, false);
       branch.prefill([...sep, ...delta]);
       return generate(branch);
@@ -439,38 +439,36 @@ async function testWarmMultiTurnRecall() {
     const branch = Branch.create(ctx, 0, promptToks.length, { temperature: 0 });
     branch.captureLogits();
 
-    // Helper: parse output and check both content and reasoning for a term
-    // Handles thinking models (Qwen3, DeepSeek-R1) where answer may be in <think> blocks
+    // Helper: parse output and check content (not reasoning) for a term
     function checkRecall(rawText, term) {
-      const { content, reasoningContent } = ctx.parseChatOutput(rawText, format, {
+      const { content } = ctx.parseChatOutput(rawText, format, {
         reasoningFormat,
         isPartial: false,
         thinkingForcedOpen: false
       });
-      const fullText = (content || '') + ' ' + (reasoningContent || '');
-      return fullText.toLowerCase().includes(term.toLowerCase());
+      return (content || '').toLowerCase().includes(term.toLowerCase());
     }
 
     const turn1 = await generate(branch);
-    console.log(`  Turn 1: "${turn1.trim().slice(0, 80)}"`);
+    console.log(`  Turn 1: "${turn1.trim()}"`);
     assert(turn1.length > 0, 'Turn 1: generated response');
 
     // Turn 2 (WARM): introduce favourite food
     const turn2 = await warmTurn(branch, 'My favourite food is pizza');
-    console.log(`  Turn 2: "${turn2.trim().slice(0, 80)}"`);
+    console.log(`  Turn 2: "${turn2.trim()}"`);
     assert(turn2.length > 0, 'Turn 2: generated response');
 
     // Turn 3 (WARM): recall name
     const turn3 = await warmTurn(branch, 'Do you remember my name?');
-    console.log(`  Turn 3 (name recall): "${turn3.trim().slice(0, 80)}"`);
+    console.log(`  Turn 3 (name recall): "${turn3.trim()}"`);
     const nameRecalled = checkRecall(turn3, 'lloyal');
-    assert(nameRecalled, `Name recall: ${nameRecalled ? 'found "Lloyal"' : 'MISSING "Lloyal" in: ' + turn3.trim().slice(0, 120)}`);
+    assert(nameRecalled, `Name recall: ${nameRecalled ? 'found "Lloyal"' : 'MISSING "Lloyal" in: ' + turn3.trim()}`);
 
     // Turn 4 (WARM): recall food
     const turn4 = await warmTurn(branch, 'Do you remember my favourite food?');
-    console.log(`  Turn 4 (food recall): "${turn4.trim().slice(0, 80)}"`);
+    console.log(`  Turn 4 (food recall): "${turn4.trim()}"`);
     const foodRecalled = checkRecall(turn4, 'pizza');
-    assert(foodRecalled, `Food recall: ${foodRecalled ? 'found "pizza"' : 'MISSING "pizza" in: ' + turn4.trim().slice(0, 120)}`);
+    assert(foodRecalled, `Food recall: ${foodRecalled ? 'found "pizza"' : 'MISSING "pizza" in: ' + turn4.trim()}`);
 
     branch.prune();
   } finally {
@@ -594,7 +592,7 @@ async function testWarmSemanticRecall() {
         return embedCtx.getEmbeddings(true);
       }
 
-      console.log(`  Recall response: "${recallText.trim().slice(0, 120)}"`);
+      console.log(`  Recall response: "${recallText.trim()}"`);
 
       const embResponse = await embed(recallText);
       const embCorrect = await embed('The dog is named Max.');
diff --git a/test/matrix.json b/test/matrix.json
index a22f153..0a5bad5 100644
--- a/test/matrix.json
+++ b/test/matrix.json
@@ -9,13 +9,7 @@
       "template": "chatml",
       "default": true
     },
-    {
-      "name": "Ministral",
-      "file": "Ministral-3-3B-Instruct-2512-Q4_K_M.gguf",
-      "url": "https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf",
-      "template": "mistral"
-    },
-    {
+    {
       "name": "Llama-3.2",
       "file": "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
       "url": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
@@ -29,8 +23,8 @@
     },
     {
       "name": "Qwen3",
-      "file": "Qwen3-1.7B-Q4_K_M.gguf",
-      "url": "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf",
+      "file": "Qwen3-4B-Thinking-2507-Q4_K_M.gguf",
+      "url": "https://huggingface.co/unsloth/Qwen3-4B-Thinking-2507-GGUF/resolve/main/Qwen3-4B-Thinking-2507-Q4_K_M.gguf",
       "template": "chatml"
     },
     {
@@ -38,6 +32,12 @@
       "file": "gemma-3-1b-it-Q4_K_M.gguf",
       "url": "https://huggingface.co/unsloth/gemma-3-1b-it-GGUF/resolve/main/gemma-3-1b-it-Q4_K_M.gguf",
       "template": "gemma"
+    },
+    {
+      "name": "GLM-Edge",
+      "file": "ggml-model-Q4_K_M.gguf",
+      "url": "https://huggingface.co/zai-org/glm-edge-4b-chat-gguf/resolve/main/ggml-model-Q4_K_M.gguf",
+      "template": "glm-edge"
     }
   ],
   "embeddings": [