From 81dd6e474087699f58eebea85d91b39d127ab5a5 Mon Sep 17 00:00:00 2001
From: Sergei Zharinov <zharinov@users.noreply.github.com>
Date: Wed, 7 Jan 2026 17:37:02 -0300
Subject: [PATCH] fix: Cascade overflow pre-effects to prevent bytecode
 encoding panic

---
 crates/plotnik-lib/src/bytecode/constants.rs  |  6 ++
 crates/plotnik-lib/src/bytecode/ir.rs         | 24 +++++--
 crates/plotnik-lib/src/bytecode/mod.rs        |  4 +-
 crates/plotnik-lib/src/bytecode/nav.rs        |  9 ++-
 .../plotnik-lib/src/compile/compile_tests.rs  | 65 +++++++++++++++++++
 crates/plotnik-lib/src/compile/expressions.rs | 21 +++---
 crates/plotnik-lib/src/compile/scope.rs       | 63 ++++++++++++++++++
 crates/plotnik-lib/src/engine/engine_tests.rs | 44 +++++++++++++
 ...beled_alternation_5_branches_captures.snap | 27 ++++++++
 ...beled_alternation_8_branches_captures.snap | 33 ++++++++++
 10 files changed, 273 insertions(+), 23 deletions(-)
 create mode 100644 crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_5_branches_captures.snap
 create mode 100644 crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_8_branches_captures.snap

diff --git a/crates/plotnik-lib/src/bytecode/constants.rs b/crates/plotnik-lib/src/bytecode/constants.rs
index 589347bc..3a61776e 100644
--- a/crates/plotnik-lib/src/bytecode/constants.rs
+++ b/crates/plotnik-lib/src/bytecode/constants.rs
@@ -18,3 +18,9 @@ pub const STEP_SIZE: usize = 8;
 /// effects, neg_fields, and successors combined. When an epsilon
 /// transition needs more successors, it must be split into a cascade.
 pub const MAX_MATCH_PAYLOAD_SLOTS: usize = 28;
+
+/// Maximum pre-effects per Match instruction.
+///
+/// Pre-effect count is stored in 3 bits (max 7). When exceeded,
+/// overflow effects must be emitted in leading epsilon transitions.
+pub const MAX_PRE_EFFECTS: usize = 7;
diff --git a/crates/plotnik-lib/src/bytecode/ir.rs b/crates/plotnik-lib/src/bytecode/ir.rs
index 32ac9489..e9cc34a5 100644
--- a/crates/plotnik-lib/src/bytecode/ir.rs
+++ b/crates/plotnik-lib/src/bytecode/ir.rs
@@ -526,12 +526,24 @@ impl MatchIR {
                 .unwrap_or(0);
             bytes[6..8].copy_from_slice(&next.to_le_bytes());
         } else {
-            let pre_count = self.pre_effects.len() as u16;
-            let neg_count = self.neg_fields.len() as u16;
-            let post_count = self.post_effects.len() as u16;
-            let succ_count = self.successors.len() as u16;
-            let counts =
-                (pre_count << 13) | (neg_count << 10) | (post_count << 7) | (succ_count << 1);
+            let pre_count = self.pre_effects.len();
+            let neg_count = self.neg_fields.len();
+            let post_count = self.post_effects.len();
+            let succ_count = self.successors.len();
+
+            // Validate bit-packed field limits (3 bits for counts, 6 bits for successors)
+            assert!(
+                pre_count <= 7,
+                "pre_effects overflow: {pre_count} > 7 (use emit_match_with_cascade)"
+            );
+            assert!(neg_count <= 7, "neg_fields overflow: {neg_count} > 7");
+            assert!(post_count <= 7, "post_effects overflow: {post_count} > 7");
+            assert!(succ_count <= 63, "successors overflow: {succ_count} > 63");
+
+            let counts = ((pre_count as u16) << 13)
+                | ((neg_count as u16) << 10)
+                | ((post_count as u16) << 7)
+                | ((succ_count as u16) << 1);
             bytes[6..8].copy_from_slice(&counts.to_le_bytes());
 
             let mut offset = 8;
diff --git a/crates/plotnik-lib/src/bytecode/mod.rs b/crates/plotnik-lib/src/bytecode/mod.rs
index f14e2d52..0eca50ed 100644
--- a/crates/plotnik-lib/src/bytecode/mod.rs
+++ b/crates/plotnik-lib/src/bytecode/mod.rs
@@ -16,7 +16,9 @@ mod nav;
 mod sections;
 mod type_meta;
 
-pub use constants::{MAGIC, MAX_MATCH_PAYLOAD_SLOTS, SECTION_ALIGN, STEP_SIZE, VERSION};
+pub use constants::{
+    MAGIC, MAX_MATCH_PAYLOAD_SLOTS, MAX_PRE_EFFECTS, SECTION_ALIGN, STEP_SIZE, VERSION,
+};
 
 pub use ids::{StringId, TypeId};
 
diff --git a/crates/plotnik-lib/src/bytecode/nav.rs b/crates/plotnik-lib/src/bytecode/nav.rs
index a3cce71b..8845a25b 100644
--- a/crates/plotnik-lib/src/bytecode/nav.rs
+++ b/crates/plotnik-lib/src/bytecode/nav.rs
@@ -76,15 +76,18 @@ impl Nav {
             Self::DownSkip => 7,
             Self::DownExact => 8,
             Self::Up(n) => {
-                debug_assert!((1..=63).contains(&n));
+                assert!((1..=63).contains(&n), "Up level overflow: {n} > 63");
                 0b01_000000 | n
             }
             Self::UpSkipTrivia(n) => {
-                debug_assert!((1..=63).contains(&n));
+                assert!(
+                    (1..=63).contains(&n),
+                    "UpSkipTrivia level overflow: {n} > 63"
+                );
                 0b10_000000 | n
             }
             Self::UpExact(n) => {
-                debug_assert!((1..=63).contains(&n));
+                assert!((1..=63).contains(&n), "UpExact level overflow: {n} > 63");
                 0b11_000000 | n
             }
         }
diff --git a/crates/plotnik-lib/src/compile/compile_tests.rs b/crates/plotnik-lib/src/compile/compile_tests.rs
index 6eb5e126..255127e4 100644
--- a/crates/plotnik-lib/src/compile/compile_tests.rs
+++ b/crates/plotnik-lib/src/compile/compile_tests.rs
@@ -161,3 +161,68 @@ fn compile_large_tagged_alternation() {
 
     assert!(!result.instructions.is_empty());
 }
+
+#[test]
+fn compile_unlabeled_alternation_5_branches_with_captures() {
+    // Regression test: unlabeled alternation with 5+ branches where each has
+    // a unique capture requires 8+ pre-effects (4 nulls + 4 sets per branch).
+    // This exceeds the 3-bit limit (max 7) and must cascade via epsilon chain.
+    let query = QueryBuilder::one_liner(
+        "Q = [(identifier) @a (number) @b (string) @c (binary_expression) @d (call_expression) @e]",
+    )
+    .parse()
+    .unwrap()
+    .analyze();
+
+    let mut strings = StringTableBuilder::new();
+    let result = Compiler::compile(
+        query.interner(),
+        query.type_context(),
+        &query.symbol_table,
+        &mut strings,
+        None,
+        None,
+    )
+    .unwrap();
+
+    assert!(!result.instructions.is_empty());
+
+    // Verify that effects cascade created extra epsilon instructions.
+    // With 5 branches, each branch needs 8 pre-effects (4 missing captures × 2 effects).
+    // This requires at least one cascade step per branch.
+    let epsilon_count = result
+        .instructions
+        .iter()
+        .filter(|i| matches!(i, crate::bytecode::InstructionIR::Match(m) if m.is_epsilon()))
+        .count();
+
+    // Should have more epsilon transitions than without cascade
+    // (5 branches + cascade steps for overflow effects)
+    assert!(epsilon_count >= 5, "expected cascade epsilon steps");
+}
+
+#[test]
+fn compile_unlabeled_alternation_8_branches_with_captures() {
+    // Even more extreme: 8 branches means 14 pre-effects per branch (7 nulls + 7 sets).
+    // This requires 2 cascade steps per branch.
+    let query = QueryBuilder::one_liner(
+        "Q = [(identifier) @a (number) @b (string) @c (binary_expression) @d \
+              (call_expression) @e (member_expression) @f (array) @g (object) @h]",
+    )
+    .parse()
+    .unwrap()
+    .analyze();
+
+    let mut strings = StringTableBuilder::new();
+    let result = Compiler::compile(
+        query.interner(),
+        query.type_context(),
+        &query.symbol_table,
+        &mut strings,
+        None,
+        None,
+    )
+    .unwrap();
+
+    assert!(!result.instructions.is_empty());
+}
diff --git a/crates/plotnik-lib/src/compile/expressions.rs b/crates/plotnik-lib/src/compile/expressions.rs
index a374a87a..bbb9c52b 100644
--- a/crates/plotnik-lib/src/compile/expressions.rs
+++ b/crates/plotnik-lib/src/compile/expressions.rs
@@ -40,16 +40,14 @@ impl Compiler<'_> {
 
         // If no items, just match and exit (capture effects go here)
         if items.is_empty() {
-            self.instructions.push(
+            return self.emit_match_with_cascade(
                 MatchIR::epsilon(entry, exit)
                     .nav(nav)
                     .node_type(node_type)
                     .neg_fields(neg_fields)
                     .pre_effects(capture.pre)
-                    .post_effects(capture.post)
-                    .into(),
+                    .post_effects(capture.post),
             );
-            return entry;
         }
 
         // Determine Up navigation based on trailing anchor
@@ -102,13 +100,12 @@ impl Compiler<'_> {
             .push(MatchIR::epsilon(up_label, final_exit).nav(up_nav).into());
 
         // Emit entry instruction into the node (only pre_effects here)
-        self.instructions.push(
+        self.emit_match_with_cascade(
             MatchIR::epsilon(entry, items_entry)
                 .nav(nav)
                 .node_type(node_type)
                 .neg_fields(neg_fields)
-                .pre_effects(capture.pre)
-                .into(),
+                .pre_effects(capture.pre),
         );
 
         entry
@@ -178,13 +175,12 @@ impl Compiler<'_> {
         self.emit_wildcard_nav(down_wildcard, Nav::Down, try_label);
 
         // entry: match parent node → down_wildcard (only pre_effects here)
-        self.instructions.push(
+        self.emit_match_with_cascade(
             MatchIR::epsilon(entry, down_wildcard)
                 .nav(nav)
                 .node_type(node_type)
                 .neg_fields(neg_fields)
-                .pre_effects(capture.pre)
-                .into(),
+                .pre_effects(capture.pre),
         );
 
         entry
@@ -220,13 +216,12 @@ impl Compiler<'_> {
             None => NodeTypeIR::Any, // `_` wildcard matches any node
         };
 
-        self.instructions.push(
+        self.emit_match_with_cascade(
             MatchIR::epsilon(entry, exit)
                 .nav(nav)
                 .node_type(node_type)
                 .pre_effects(capture.pre)
-                .post_effects(capture.post)
-                .into(),
+                .post_effects(capture.post),
         );
 
         entry
diff --git a/crates/plotnik-lib/src/compile/scope.rs b/crates/plotnik-lib/src/compile/scope.rs
index fbf78ddf..029195d2 100644
--- a/crates/plotnik-lib/src/compile/scope.rs
+++ b/crates/plotnik-lib/src/compile/scope.rs
@@ -420,6 +420,69 @@ impl Compiler<'_> {
             .push(MatchIR::at(label).next_many(successors).into());
     }
 
+    /// Emit a Match instruction, cascading pre-effects if they exceed the bytecode limit.
+    ///
+    /// When `pre_effects.len() > MAX_PRE_EFFECTS` (7), splits overflow into leading
+    /// epsilon transitions to avoid bytecode encoding overflow. The original label
+    /// is preserved as the entry point of the cascade.
+    ///
+    /// Returns the entry label (same as `instr.label`).
+    pub(super) fn emit_match_with_cascade(&mut self, mut instr: MatchIR) -> Label {
+        use crate::bytecode::MAX_PRE_EFFECTS;
+
+        let entry = instr.label;
+
+        if instr.pre_effects.len() <= MAX_PRE_EFFECTS {
+            self.instructions.push(instr.into());
+            return entry;
+        }
+
+        // Move all pre-effects to epsilon chain
+        let all_effects = std::mem::take(&mut instr.pre_effects);
+
+        // Create new label for the actual match instruction
+        let match_label = self.fresh_label();
+        instr.label = match_label;
+        self.instructions.push(instr.into());
+
+        // Emit cascade from entry → ... → match_label
+        self.emit_effects_chain(entry, match_label, all_effects);
+
+        entry
+    }
+
+    /// Emit a chain of epsilon transitions to execute effects in order.
+    ///
+    /// Splits effects into batches of `MAX_PRE_EFFECTS` (7), emitting epsilon
+    /// transitions: `entry → intermediate1 → ... → exit`.
+    fn emit_effects_chain(&mut self, entry: Label, exit: Label, mut effects: Vec<EffectIR>) {
+        use crate::bytecode::MAX_PRE_EFFECTS;
+
+        if effects.is_empty() {
+            // Just link entry to exit
+            self.instructions.push(MatchIR::epsilon(entry, exit).into());
+            return;
+        }
+
+        if effects.len() <= MAX_PRE_EFFECTS {
+            self.instructions
+                .push(MatchIR::epsilon(entry, exit).pre_effects(effects).into());
+            return;
+        }
+
+        // Take first batch, recurse for rest
+        let first_batch: Vec<_> = effects.drain(..MAX_PRE_EFFECTS).collect();
+        let intermediate = self.fresh_label();
+
+        self.instructions.push(
+            MatchIR::epsilon(entry, intermediate)
+                .pre_effects(first_batch)
+                .into(),
+        );
+
+        self.emit_effects_chain(intermediate, exit, effects);
+    }
+
     /// Emit a wildcard navigation step that accepts any node.
     ///
     /// Used for skip-retry logic in quantifiers: navigates to the next position
diff --git a/crates/plotnik-lib/src/engine/engine_tests.rs b/crates/plotnik-lib/src/engine/engine_tests.rs
index 22dace73..c0732b5b 100644
--- a/crates/plotnik-lib/src/engine/engine_tests.rs
+++ b/crates/plotnik-lib/src/engine/engine_tests.rs
@@ -529,3 +529,47 @@ fn wildcard_named_skips_anonymous() {
 fn wildcard_bare_matches_anonymous() {
     snap!("Q = (program (return_statement _ @x))", "return 42");
 }
+
+/// BUG: Unlabeled alternation with 5+ branches and unique captures per branch.
+///
+/// Each branch in an unlabeled alternation needs to inject null for captures
+/// it doesn't have. With 5 branches (@a, @b, @c, @d, @e), each branch needs
+/// 8 pre-effects (4 nulls + 4 sets). This exceeds the 3-bit limit (max 7)
+/// and caused bytecode encoding overflow, resulting in garbage addresses.
+///
+/// Fix: Cascade overflow pre-effects into leading epsilon transitions.
+#[test]
+fn regression_unlabeled_alternation_5_branches_captures() {
+    snap!(
+        indoc! {r#"
+            Q = (program (expression_statement [
+                (identifier) @a
+                (number) @b
+                (string) @c
+                (binary_expression) @d
+                (call_expression) @e
+            ]))
+        "#},
+        "foo"
+    );
+}
+
+/// Same bug with 8 branches - even more pre-effects requiring multiple cascade steps.
+#[test]
+fn regression_unlabeled_alternation_8_branches_captures() {
+    snap!(
+        indoc! {r#"
+            Q = (program (expression_statement [
+                (identifier) @a
+                (number) @b
+                (string) @c
+                (binary_expression) @d
+                (call_expression) @e
+                (member_expression) @f
+                (array) @g
+                (object) @h
+            ]))
+        "#},
+        "42"
+    );
+}
diff --git a/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_5_branches_captures.snap b/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_5_branches_captures.snap
new file mode 100644
index 00000000..66ba065a
--- /dev/null
+++ b/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_5_branches_captures.snap
@@ -0,0 +1,27 @@
+---
+source: crates/plotnik-lib/src/engine/engine_tests.rs
+---
+Q = (program (expression_statement [
+    (identifier) @a
+    (number) @b
+    (string) @c
+    (binary_expression) @d
+    (call_expression) @e
+]))
+---
+foo
+---
+{
+  "b": null,
+  "c": null,
+  "d": null,
+  "e": null,
+  "a": {
+    "kind": "identifier",
+    "text": "foo",
+    "span": [
+      0,
+      3
+    ]
+  }
+}
diff --git a/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_8_branches_captures.snap b/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_8_branches_captures.snap
new file mode 100644
index 00000000..d79960b3
--- /dev/null
+++ b/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_8_branches_captures.snap
@@ -0,0 +1,33 @@
+---
+source: crates/plotnik-lib/src/engine/engine_tests.rs
+---
+Q = (program (expression_statement [
+    (identifier) @a
+    (number) @b
+    (string) @c
+    (binary_expression) @d
+    (call_expression) @e
+    (member_expression) @f
+    (array) @g
+    (object) @h
+]))
+---
+42
+---
+{
+  "a": null,
+  "c": null,
+  "d": null,
+  "e": null,
+  "f": null,
+  "g": null,
+  "h": null,
+  "b": {
+    "kind": "number",
+    "text": "42",
+    "span": [
+      0,
+      2
+    ]
+  }
+}