From 81dd6e474087699f58eebea85d91b39d127ab5a5 Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Wed, 7 Jan 2026 17:37:02 -0300 Subject: [PATCH] fix: Cascade overflow pre-effects to prevent bytecode encoding panic --- crates/plotnik-lib/src/bytecode/constants.rs | 6 ++ crates/plotnik-lib/src/bytecode/ir.rs | 24 +++++-- crates/plotnik-lib/src/bytecode/mod.rs | 4 +- crates/plotnik-lib/src/bytecode/nav.rs | 9 ++- .../plotnik-lib/src/compile/compile_tests.rs | 65 +++++++++++++++++++ crates/plotnik-lib/src/compile/expressions.rs | 21 +++--- crates/plotnik-lib/src/compile/scope.rs | 63 ++++++++++++++++++ crates/plotnik-lib/src/engine/engine_tests.rs | 44 +++++++++++++ ...beled_alternation_5_branches_captures.snap | 27 ++++++++ ...beled_alternation_8_branches_captures.snap | 33 ++++++++++ 10 files changed, 273 insertions(+), 23 deletions(-) create mode 100644 crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_5_branches_captures.snap create mode 100644 crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_8_branches_captures.snap diff --git a/crates/plotnik-lib/src/bytecode/constants.rs b/crates/plotnik-lib/src/bytecode/constants.rs index 589347bc..3a61776e 100644 --- a/crates/plotnik-lib/src/bytecode/constants.rs +++ b/crates/plotnik-lib/src/bytecode/constants.rs @@ -18,3 +18,9 @@ pub const STEP_SIZE: usize = 8; /// effects, neg_fields, and successors combined. When an epsilon /// transition needs more successors, it must be split into a cascade. pub const MAX_MATCH_PAYLOAD_SLOTS: usize = 28; + +/// Maximum pre-effects per Match instruction. +/// +/// Pre-effect count is stored in 3 bits (max 7). When exceeded, +/// overflow effects must be emitted in leading epsilon transitions. +pub const MAX_PRE_EFFECTS: usize = 7; diff --git a/crates/plotnik-lib/src/bytecode/ir.rs b/crates/plotnik-lib/src/bytecode/ir.rs index 32ac9489..e9cc34a5 100644 --- a/crates/plotnik-lib/src/bytecode/ir.rs +++ b/crates/plotnik-lib/src/bytecode/ir.rs @@ -526,12 +526,24 @@ impl MatchIR { .unwrap_or(0); bytes[6..8].copy_from_slice(&next.to_le_bytes()); } else { - let pre_count = self.pre_effects.len() as u16; - let neg_count = self.neg_fields.len() as u16; - let post_count = self.post_effects.len() as u16; - let succ_count = self.successors.len() as u16; - let counts = - (pre_count << 13) | (neg_count << 10) | (post_count << 7) | (succ_count << 1); + let pre_count = self.pre_effects.len(); + let neg_count = self.neg_fields.len(); + let post_count = self.post_effects.len(); + let succ_count = self.successors.len(); + + // Validate bit-packed field limits (3 bits for counts, 6 bits for successors) + assert!( + pre_count <= 7, + "pre_effects overflow: {pre_count} > 7 (use emit_match_with_cascade)" + ); + assert!(neg_count <= 7, "neg_fields overflow: {neg_count} > 7"); + assert!(post_count <= 7, "post_effects overflow: {post_count} > 7"); + assert!(succ_count <= 63, "successors overflow: {succ_count} > 63"); + + let counts = ((pre_count as u16) << 13) + | ((neg_count as u16) << 10) + | ((post_count as u16) << 7) + | ((succ_count as u16) << 1); bytes[6..8].copy_from_slice(&counts.to_le_bytes()); let mut offset = 8; diff --git a/crates/plotnik-lib/src/bytecode/mod.rs b/crates/plotnik-lib/src/bytecode/mod.rs index f14e2d52..0eca50ed 100644 --- a/crates/plotnik-lib/src/bytecode/mod.rs +++ b/crates/plotnik-lib/src/bytecode/mod.rs @@ -16,7 +16,9 @@ mod nav; mod sections; mod type_meta; -pub use constants::{MAGIC, MAX_MATCH_PAYLOAD_SLOTS, SECTION_ALIGN, STEP_SIZE, VERSION}; +pub use constants::{ + MAGIC, MAX_MATCH_PAYLOAD_SLOTS, MAX_PRE_EFFECTS, SECTION_ALIGN, STEP_SIZE, VERSION, +}; pub use ids::{StringId, TypeId}; diff --git a/crates/plotnik-lib/src/bytecode/nav.rs b/crates/plotnik-lib/src/bytecode/nav.rs index a3cce71b..8845a25b 100644 --- a/crates/plotnik-lib/src/bytecode/nav.rs +++ b/crates/plotnik-lib/src/bytecode/nav.rs @@ -76,15 +76,18 @@ impl Nav { Self::DownSkip => 7, Self::DownExact => 8, Self::Up(n) => { - debug_assert!((1..=63).contains(&n)); + assert!((1..=63).contains(&n), "Up level overflow: {n} > 63"); 0b01_000000 | n } Self::UpSkipTrivia(n) => { - debug_assert!((1..=63).contains(&n)); + assert!( + (1..=63).contains(&n), + "UpSkipTrivia level overflow: {n} > 63" + ); 0b10_000000 | n } Self::UpExact(n) => { - debug_assert!((1..=63).contains(&n)); + assert!((1..=63).contains(&n), "UpExact level overflow: {n} > 63"); 0b11_000000 | n } } diff --git a/crates/plotnik-lib/src/compile/compile_tests.rs b/crates/plotnik-lib/src/compile/compile_tests.rs index 6eb5e126..255127e4 100644 --- a/crates/plotnik-lib/src/compile/compile_tests.rs +++ b/crates/plotnik-lib/src/compile/compile_tests.rs @@ -161,3 +161,68 @@ fn compile_large_tagged_alternation() { assert!(!result.instructions.is_empty()); } + +#[test] +fn compile_unlabeled_alternation_5_branches_with_captures() { + // Regression test: unlabeled alternation with 5+ branches where each has + // a unique capture requires 8+ pre-effects (4 nulls + 4 sets per branch). + // This exceeds the 3-bit limit (max 7) and must cascade via epsilon chain. + let query = QueryBuilder::one_liner( + "Q = [(identifier) @a (number) @b (string) @c (binary_expression) @d (call_expression) @e]", + ) + .parse() + .unwrap() + .analyze(); + + let mut strings = StringTableBuilder::new(); + let result = Compiler::compile( + query.interner(), + query.type_context(), + &query.symbol_table, + &mut strings, + None, + None, + ) + .unwrap(); + + assert!(!result.instructions.is_empty()); + + // Verify that effects cascade created extra epsilon instructions. + // With 5 branches, each branch needs 8 pre-effects (4 missing captures × 2 effects). + // This requires at least one cascade step per branch. + let epsilon_count = result + .instructions + .iter() + .filter(|i| matches!(i, crate::bytecode::InstructionIR::Match(m) if m.is_epsilon())) + .count(); + + // Should have more epsilon transitions than without cascade + // (5 branches + cascade steps for overflow effects) + assert!(epsilon_count >= 5, "expected cascade epsilon steps"); +} + +#[test] +fn compile_unlabeled_alternation_8_branches_with_captures() { + // Even more extreme: 8 branches means 14 pre-effects per branch (7 nulls + 7 sets). + // This requires 2 cascade steps per branch. + let query = QueryBuilder::one_liner( + "Q = [(identifier) @a (number) @b (string) @c (binary_expression) @d \ + (call_expression) @e (member_expression) @f (array) @g (object) @h]", + ) + .parse() + .unwrap() + .analyze(); + + let mut strings = StringTableBuilder::new(); + let result = Compiler::compile( + query.interner(), + query.type_context(), + &query.symbol_table, + &mut strings, + None, + None, + ) + .unwrap(); + + assert!(!result.instructions.is_empty()); +} diff --git a/crates/plotnik-lib/src/compile/expressions.rs b/crates/plotnik-lib/src/compile/expressions.rs index a374a87a..bbb9c52b 100644 --- a/crates/plotnik-lib/src/compile/expressions.rs +++ b/crates/plotnik-lib/src/compile/expressions.rs @@ -40,16 +40,14 @@ impl Compiler<'_> { // If no items, just match and exit (capture effects go here) if items.is_empty() { - self.instructions.push( + return self.emit_match_with_cascade( MatchIR::epsilon(entry, exit) .nav(nav) .node_type(node_type) .neg_fields(neg_fields) .pre_effects(capture.pre) - .post_effects(capture.post) - .into(), + .post_effects(capture.post), ); - return entry; } // Determine Up navigation based on trailing anchor @@ -102,13 +100,12 @@ impl Compiler<'_> { .push(MatchIR::epsilon(up_label, final_exit).nav(up_nav).into()); // Emit entry instruction into the node (only pre_effects here) - self.instructions.push( + self.emit_match_with_cascade( MatchIR::epsilon(entry, items_entry) .nav(nav) .node_type(node_type) .neg_fields(neg_fields) - .pre_effects(capture.pre) - .into(), + .pre_effects(capture.pre), ); entry @@ -178,13 +175,12 @@ impl Compiler<'_> { self.emit_wildcard_nav(down_wildcard, Nav::Down, try_label); // entry: match parent node → down_wildcard (only pre_effects here) - self.instructions.push( + self.emit_match_with_cascade( MatchIR::epsilon(entry, down_wildcard) .nav(nav) .node_type(node_type) .neg_fields(neg_fields) - .pre_effects(capture.pre) - .into(), + .pre_effects(capture.pre), ); entry @@ -220,13 +216,12 @@ impl Compiler<'_> { None => NodeTypeIR::Any, // `_` wildcard matches any node }; - self.instructions.push( + self.emit_match_with_cascade( MatchIR::epsilon(entry, exit) .nav(nav) .node_type(node_type) .pre_effects(capture.pre) - .post_effects(capture.post) - .into(), + .post_effects(capture.post), ); entry diff --git a/crates/plotnik-lib/src/compile/scope.rs b/crates/plotnik-lib/src/compile/scope.rs index fbf78ddf..029195d2 100644 --- a/crates/plotnik-lib/src/compile/scope.rs +++ b/crates/plotnik-lib/src/compile/scope.rs @@ -420,6 +420,69 @@ impl Compiler<'_> { .push(MatchIR::at(label).next_many(successors).into()); } + /// Emit a Match instruction, cascading pre-effects if they exceed the bytecode limit. + /// + /// When `pre_effects.len() > MAX_PRE_EFFECTS` (7), splits overflow into leading + /// epsilon transitions to avoid bytecode encoding overflow. The original label + /// is preserved as the entry point of the cascade. + /// + /// Returns the entry label (same as `instr.label`). + pub(super) fn emit_match_with_cascade(&mut self, mut instr: MatchIR) -> Label { + use crate::bytecode::MAX_PRE_EFFECTS; + + let entry = instr.label; + + if instr.pre_effects.len() <= MAX_PRE_EFFECTS { + self.instructions.push(instr.into()); + return entry; + } + + // Move all pre-effects to epsilon chain + let all_effects = std::mem::take(&mut instr.pre_effects); + + // Create new label for the actual match instruction + let match_label = self.fresh_label(); + instr.label = match_label; + self.instructions.push(instr.into()); + + // Emit cascade from entry → ... → match_label + self.emit_effects_chain(entry, match_label, all_effects); + + entry + } + + /// Emit a chain of epsilon transitions to execute effects in order. + /// + /// Splits effects into batches of `MAX_PRE_EFFECTS` (7), emitting epsilon + /// transitions: `entry → intermediate1 → ... → exit`. + fn emit_effects_chain(&mut self, entry: Label, exit: Label, mut effects: Vec) { + use crate::bytecode::MAX_PRE_EFFECTS; + + if effects.is_empty() { + // Just link entry to exit + self.instructions.push(MatchIR::epsilon(entry, exit).into()); + return; + } + + if effects.len() <= MAX_PRE_EFFECTS { + self.instructions + .push(MatchIR::epsilon(entry, exit).pre_effects(effects).into()); + return; + } + + // Take first batch, recurse for rest + let first_batch: Vec<_> = effects.drain(..MAX_PRE_EFFECTS).collect(); + let intermediate = self.fresh_label(); + + self.instructions.push( + MatchIR::epsilon(entry, intermediate) + .pre_effects(first_batch) + .into(), + ); + + self.emit_effects_chain(intermediate, exit, effects); + } + /// Emit a wildcard navigation step that accepts any node. /// /// Used for skip-retry logic in quantifiers: navigates to the next position diff --git a/crates/plotnik-lib/src/engine/engine_tests.rs b/crates/plotnik-lib/src/engine/engine_tests.rs index 22dace73..c0732b5b 100644 --- a/crates/plotnik-lib/src/engine/engine_tests.rs +++ b/crates/plotnik-lib/src/engine/engine_tests.rs @@ -529,3 +529,47 @@ fn wildcard_named_skips_anonymous() { fn wildcard_bare_matches_anonymous() { snap!("Q = (program (return_statement _ @x))", "return 42"); } + +/// BUG: Unlabeled alternation with 5+ branches and unique captures per branch. +/// +/// Each branch in an unlabeled alternation needs to inject null for captures +/// it doesn't have. With 5 branches (@a, @b, @c, @d, @e), each branch needs +/// 8 pre-effects (4 nulls + 4 sets). This exceeds the 3-bit limit (max 7) +/// and caused bytecode encoding overflow, resulting in garbage addresses. +/// +/// Fix: Cascade overflow pre-effects into leading epsilon transitions. +#[test] +fn regression_unlabeled_alternation_5_branches_captures() { + snap!( + indoc! {r#" + Q = (program (expression_statement [ + (identifier) @a + (number) @b + (string) @c + (binary_expression) @d + (call_expression) @e + ])) + "#}, + "foo" + ); +} + +/// Same bug with 8 branches - even more pre-effects requiring multiple cascade steps. +#[test] +fn regression_unlabeled_alternation_8_branches_captures() { + snap!( + indoc! {r#" + Q = (program (expression_statement [ + (identifier) @a + (number) @b + (string) @c + (binary_expression) @d + (call_expression) @e + (member_expression) @f + (array) @g + (object) @h + ])) + "#}, + "42" + ); +} diff --git a/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_5_branches_captures.snap b/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_5_branches_captures.snap new file mode 100644 index 00000000..66ba065a --- /dev/null +++ b/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_5_branches_captures.snap @@ -0,0 +1,27 @@ +--- +source: crates/plotnik-lib/src/engine/engine_tests.rs +--- +Q = (program (expression_statement [ + (identifier) @a + (number) @b + (string) @c + (binary_expression) @d + (call_expression) @e +])) +--- +foo +--- +{ + "b": null, + "c": null, + "d": null, + "e": null, + "a": { + "kind": "identifier", + "text": "foo", + "span": [ + 0, + 3 + ] + } +} diff --git a/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_8_branches_captures.snap b/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_8_branches_captures.snap new file mode 100644 index 00000000..d79960b3 --- /dev/null +++ b/crates/plotnik-lib/src/engine/snapshots/plotnik_lib__engine__engine_tests__regression_unlabeled_alternation_8_branches_captures.snap @@ -0,0 +1,33 @@ +--- +source: crates/plotnik-lib/src/engine/engine_tests.rs +--- +Q = (program (expression_statement [ + (identifier) @a + (number) @b + (string) @c + (binary_expression) @d + (call_expression) @e + (member_expression) @f + (array) @g + (object) @h +])) +--- +42 +--- +{ + "a": null, + "c": null, + "d": null, + "e": null, + "f": null, + "g": null, + "h": null, + "b": { + "kind": "number", + "text": "42", + "span": [ + 0, + 2 + ] + } +}