Skip to content

Commit 30ea8ed

Browse files
feat(sync): llama.cpp to b8589
thinking_forced_open (bool) replaced with generation_prompt (string) upstream in llama.cpp#20424 — carries actual prefill text instead of a narrow flag. Updated headers, stubs, and tests.
1 parent 179d4c7 commit 30ea8ed

8 files changed

Lines changed: 21 additions & 22 deletions

File tree

.llama-cpp-version

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# llama.cpp version for integration tests
2-
# Last updated: 2026-03-16
2+
# Last updated: 2026-04-01
33
# Format: git commit hash or tag
4-
b8369
4+
b8589

include/lloyal/branch.hpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,9 +107,8 @@ constexpr uint32_t INDEX_MASK = 0xFFFF; ///< Mask for slot index field
107107
* init_tenancy(), and when the last active branch is released. retainOnly()
108108
* resets it to the surviving branch's position.
109109
*
110-
* Conservative: overcounts if individual branches are pruned mid-run
111-
* (prune does NOT decrement), which is safe — it triggers soft limits
112-
* sooner rather than later.
110+
* Decremented on release: each pruned branch subtracts its unique cells
111+
* (position - fork_head). Pressure recovers as branches are freed.
113112
*/
114113
struct KvPressure {
115114
uint32_t n_ctx; ///< Total KV capacity

include/lloyal/chat_in.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ struct FormatResult {
8989
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; ///< Detected chat format
9090
std::string grammar; ///< GBNF grammar for constrained sampling
9191
bool grammar_lazy = false; ///< Whether grammar should use lazy compilation
92-
bool thinking_forced_open = false; ///< Whether thinking tag is forced open
92+
std::string generation_prompt; ///< Generation prompt prefill (e.g. "<think>")
9393
std::vector<common_grammar_trigger> grammar_triggers; ///< Triggers for lazy grammar activation
9494
std::vector<std::string> preserved_tokens; ///< Tokens to preserve during grammar constraining
9595
std::string parser; ///< PEG parser definition (for PEG formats)
@@ -206,7 +206,7 @@ inline FormatResult format(const llama_model *model, const FormatInputs& inputs)
206206
result.format = params.format;
207207
result.grammar = params.grammar;
208208
result.grammar_lazy = params.grammar_lazy;
209-
result.thinking_forced_open = params.thinking_forced_open;
209+
result.generation_prompt = params.generation_prompt;
210210
result.grammar_triggers = params.grammar_triggers;
211211
result.preserved_tokens = params.preserved_tokens;
212212
result.parser = params.parser;

include/lloyal/chat_out.hpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ struct ToolCall {
6060
*
6161
* @code{.cpp}
6262
* auto parsed = chat_out::parse(raw_output, fmt.format, fmt.reasoning_format,
63-
* false, fmt.thinking_forced_open, fmt.parser);
63+
* false, fmt.generation_prompt, fmt.parser);
6464
*
6565
* // Build assistant message with separate fields
6666
* json assistant_msg = {{"role", "assistant"}, {"content", parsed.content}};
@@ -91,7 +91,7 @@ struct ParseResult {
9191
* @param format The chat format (from chat_in::FormatResult.format)
9292
* @param reasoning_format How to handle reasoning/thinking blocks
9393
* @param is_partial True if output is incomplete (streaming)
94-
* @param thinking_forced_open Whether thinking tag was forced open
94+
* @param generation_prompt Generation prompt prefill text (e.g. "<think>")
9595
* @param parser_data Serialized PEG parser (from chat_in::FormatResult.parser).
9696
* Required for PEG format models; ignored for others.
9797
*
@@ -109,7 +109,7 @@ struct ParseResult {
109109
* auto fmt = chat_in::format(model, inputs);
110110
* // ... generate tokens ...
111111
* auto parsed = chat_out::parse(output_text, fmt.format, fmt.reasoning_format,
112-
* false, fmt.thinking_forced_open, fmt.parser);
112+
* false, fmt.generation_prompt, fmt.parser);
113113
* if (!parsed.tool_calls.empty()) {
114114
* // Handle tool calls
115115
* }
@@ -129,7 +129,7 @@ struct ParseResult {
129129
*
130130
* // Parse: separates reasoning from content
131131
* auto parsed = chat_out::parse(raw_output, fmt.format,
132-
* fmt.reasoning_format, false, fmt.thinking_forced_open, fmt.parser);
132+
* fmt.reasoning_format, false, fmt.generation_prompt, fmt.parser);
133133
*
134134
* // Store with separate fields for correct re-formatting on cold restart
135135
* json msg = {{"role", "assistant"}, {"content", parsed.content}};
@@ -144,7 +144,7 @@ inline ParseResult parse(
144144
common_chat_format format,
145145
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE,
146146
bool is_partial = false,
147-
bool thinking_forced_open = false,
147+
const std::string& generation_prompt = "",
148148
const std::string& parser_data = ""
149149
) {
150150
ParseResult result;
@@ -154,7 +154,7 @@ inline ParseResult parse(
154154
common_chat_parser_params syntax;
155155
syntax.format = format;
156156
syntax.reasoning_format = reasoning_format;
157-
syntax.thinking_forced_open = thinking_forced_open;
157+
syntax.generation_prompt = generation_prompt;
158158

159159
// Load serialized PEG parser if provided (required for PEG format models)
160160
if (!parser_data.empty()) {
@@ -222,7 +222,7 @@ inline ParseResult parse(
222222

223223
// Delegate to explicit-format overload
224224
return parse(output, params.format, COMMON_REASONING_FORMAT_NONE, is_partial,
225-
params.thinking_forced_open);
225+
params.generation_prompt);
226226

227227
} catch (const std::exception& e) {
228228
LLOYAL_LOG_DEBUG("[chat_out::parse] Auto-detect failed: %s", e.what());

tests/chat_in_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,5 +368,5 @@ TEST_CASE("ChatIn: format returns default format when no tools") {
368368
CHECK(result.preserved_tokens.empty());
369369
CHECK(result.parser.empty());
370370
CHECK(!result.grammar_lazy);
371-
CHECK(!result.thinking_forced_open);
371+
CHECK(result.generation_prompt.empty());
372372
}

tests/chat_out_test.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,13 @@ TEST_CASE("ChatOut: parse empty string") {
8181
CHECK(result.tool_calls.empty());
8282
}
8383

84-
TEST_CASE("ChatOut: parse with thinking_forced_open") {
84+
TEST_CASE("ChatOut: parse with generation_prompt") {
8585
auto result = lloyal::chat_out::parse(
8686
"still thinking",
8787
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
8888
COMMON_REASONING_FORMAT_DEEPSEEK,
89-
false, // is_partial
90-
true // thinking_forced_open
89+
false, // is_partial
90+
"<think>" // generation_prompt
9191
);
9292

9393
// Stub passthrough — just verify no crash

tests/integration/chat_out_integration_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ TEST_CASE("ChatOut Integration: parse with format from chat_in roundtrip") {
9393
fmt.format,
9494
fmt.reasoning_format,
9595
false,
96-
fmt.thinking_forced_open,
96+
fmt.generation_prompt,
9797
fmt.parser
9898
);
9999

tests/stubs/chat.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ struct common_chat_params {
111111
std::string prompt;
112112
std::string grammar;
113113
bool grammar_lazy = false;
114-
bool thinking_forced_open = false;
114+
std::string generation_prompt;
115115
std::vector<common_grammar_trigger> grammar_triggers;
116116
std::vector<std::string> preserved_tokens;
117117
std::vector<std::string> additional_stops;
@@ -123,14 +123,14 @@ struct common_chat_parser_params {
123123
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
124124
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
125125
bool reasoning_in_content = false;
126-
bool thinking_forced_open = false;
126+
std::string generation_prompt;
127127
bool parse_tool_calls = true;
128128
common_peg_arena parser = {};
129129

130130
common_chat_parser_params() = default;
131131
common_chat_parser_params(const common_chat_params& chat_params) {
132132
format = chat_params.format;
133-
thinking_forced_open = chat_params.thinking_forced_open;
133+
generation_prompt = chat_params.generation_prompt;
134134
}
135135
};
136136

0 commit comments

Comments (0)