From 594546cab437598b50a9fa9ca581edf87134902e Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Wed, 14 Jan 2026 23:45:08 -0300 Subject: [PATCH] feat: Add predicate support for node text filtering --- AGENTS.md | 6 +- Cargo.lock | 2 + crates/plotnik-cli/src/commands/exec.rs | 2 +- crates/plotnik-cli/src/commands/trace.rs | 2 +- crates/plotnik-lib/Cargo.toml | 2 + crates/plotnik-lib/src/analyze/link.rs | 15 ++ crates/plotnik-lib/src/analyze/link_tests.rs | 20 +++ crates/plotnik-lib/src/analyze/mod.rs | 4 +- .../plotnik-lib/src/analyze/validation/mod.rs | 5 + .../src/analyze/validation/predicates.rs | 137 ++++++++++++++++ .../analyze/validation/predicates_tests.rs | 85 ++++++++++ crates/plotnik-lib/src/bytecode/dump.rs | 27 ++- crates/plotnik-lib/src/bytecode/header.rs | 2 +- .../plotnik-lib/src/bytecode/instructions.rs | 22 +++ .../src/bytecode/instructions_tests.rs | 6 +- crates/plotnik-lib/src/bytecode/ir.rs | 101 +++++++++++- crates/plotnik-lib/src/bytecode/ir_tests.rs | 5 +- crates/plotnik-lib/src/bytecode/mod.rs | 4 +- crates/plotnik-lib/src/bytecode/module.rs | 26 ++- crates/plotnik-lib/src/compile/expressions.rs | 86 +++++++--- crates/plotnik-lib/src/diagnostics/message.rs | 23 ++- crates/plotnik-lib/src/emit/emitter.rs | 43 ++++- crates/plotnik-lib/src/emit/error.rs | 6 + crates/plotnik-lib/src/emit/mod.rs | 4 + crates/plotnik-lib/src/emit/regex_table.rs | 155 ++++++++++++++++++ .../plotnik-lib/src/emit/regex_table_tests.rs | 76 +++++++++ crates/plotnik-lib/src/emit/string_table.rs | 5 + crates/plotnik-lib/src/engine/engine_tests.rs | 2 +- crates/plotnik-lib/src/engine/trace.rs | 41 ++++- crates/plotnik-lib/src/engine/vm.rs | 81 ++++++++- crates/plotnik-lib/src/parser/ast.rs | 147 +++++++++++++++++ crates/plotnik-lib/src/parser/cst.rs | 97 +++++++++-- crates/plotnik-lib/src/parser/cst_tests.rs | 8 +- .../src/parser/grammar/expressions.rs | 2 +- .../src/parser/grammar/structures.rs | 87 +++++++++- crates/plotnik-lib/src/parser/lexer.rs | 52 +++++- 
crates/plotnik-lib/src/parser/lexer_tests.rs | 8 +- crates/plotnik-lib/src/parser/mod.rs | 4 +- crates/plotnik-lib/src/query/stages.rs | 5 +- docs/binary-format/01-overview.md | 2 +- docs/binary-format/03-symbols.md | 22 ++- docs/lang-reference.md | 83 +++++++--- 42 files changed, 1379 insertions(+), 133 deletions(-) create mode 100644 crates/plotnik-lib/src/analyze/validation/predicates.rs create mode 100644 crates/plotnik-lib/src/analyze/validation/predicates_tests.rs create mode 100644 crates/plotnik-lib/src/emit/regex_table.rs create mode 100644 crates/plotnik-lib/src/emit/regex_table_tests.rs diff --git a/AGENTS.md b/AGENTS.md index c10c0cc1..48ba8d56 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -30,6 +30,8 @@ | `[...]` | Alternation (first match wins) | | `Name = ...` | Named definition (entrypoint) | | `(Name)` | Use named expression | +| `(node == "x")` | String predicate (== != ^= $= *=) | +| `(node =~ /x/)` | Regex predicate (=~ !~) | ## Data Model Rules @@ -99,8 +101,8 @@ Rule: anchor is as strict as its strictest operand. ; WRONG: dot capture syntax @function.name ; use @function_name -; WRONG: predicates (unsupported) -(id) @x (#eq? @x "foo") +; WRONG: tree-sitter predicate syntax +(id) @x (#eq? @x "foo") ; use (id == "foo") @x ; WRONG: boundary anchors without parent node {. (a)} ; use (parent {. 
(a)}) diff --git a/Cargo.lock b/Cargo.lock index 989cfe54..329ce936 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1794,6 +1794,8 @@ dependencies = [ "memmap2", "plotnik-core", "plotnik-langs", + "regex-automata", + "regex-syntax", "rowan", "serde", "serde_json", diff --git a/crates/plotnik-cli/src/commands/exec.rs b/crates/plotnik-cli/src/commands/exec.rs index af555177..eaf677dc 100644 --- a/crates/plotnik-cli/src/commands/exec.rs +++ b/crates/plotnik-cli/src/commands/exec.rs @@ -35,7 +35,7 @@ pub fn run(args: ExecArgs) { color: args.color, }); - let vm = VM::builder(&tree).trivia_types(trivia_types).build(); + let vm = VM::builder(&source_code, &tree).trivia_types(trivia_types).build(); let effects = match vm.execute(&module, 0, &entrypoint) { Ok(effects) => effects, Err(RuntimeError::NoMatch) => { diff --git a/crates/plotnik-cli/src/commands/trace.rs b/crates/plotnik-cli/src/commands/trace.rs index e8e9a1a2..afb642a6 100644 --- a/crates/plotnik-cli/src/commands/trace.rs +++ b/crates/plotnik-cli/src/commands/trace.rs @@ -39,7 +39,7 @@ pub fn run(args: TraceArgs) { color: args.color, }); - let vm = VM::builder(&tree) + let vm = VM::builder(&source_code, &tree) .trivia_types(trivia_types) .exec_fuel(args.fuel) .build(); diff --git a/crates/plotnik-lib/Cargo.toml b/crates/plotnik-lib/Cargo.toml index 901520f4..5731e10f 100644 --- a/crates/plotnik-lib/Cargo.toml +++ b/crates/plotnik-lib/Cargo.toml @@ -25,6 +25,8 @@ crc32fast = "1.4" memmap2 = "0.9" plotnik-core.workspace = true plotnik-langs = { workspace = true, optional = true } +regex-automata = { version = "0.4", features = ["dfa-build", "dfa-search"] } +regex-syntax = "0.8" [features] default = ["plotnik-langs"] diff --git a/crates/plotnik-lib/src/analyze/link.rs b/crates/plotnik-lib/src/analyze/link.rs index 2942cd5b..e6ac3fcb 100644 --- a/crates/plotnik-lib/src/analyze/link.rs +++ b/crates/plotnik-lib/src/analyze/link.rs @@ -188,6 +188,21 @@ impl<'a, 'q> Linker<'a, 'q> { Expr::NamedNode(node) => { let child_ctx 
= self.make_node_context(node); + // Predicates are only valid on leaf nodes (grammar check) + if let Some(pred) = node.predicate() + && let Some(ctx) = &child_ctx + && (!self.lang.valid_child_types(ctx.parent_id).is_empty() + || !self.lang.fields_for_node_type(ctx.parent_id).is_empty()) + { + self.diagnostics + .report( + self.source_id, + DiagnosticKind::PredicateOnNonLeaf, + pred.as_cst().text_range(), + ) + .emit(); + } + for child in node.children() { if let Expr::FieldExpr(f) = &child { self.validate_field_expr(f, child_ctx.as_ref(), visited); diff --git a/crates/plotnik-lib/src/analyze/link_tests.rs b/crates/plotnik-lib/src/analyze/link_tests.rs index e6c8a70b..300aa8ec 100644 --- a/crates/plotnik-lib/src/analyze/link_tests.rs +++ b/crates/plotnik-lib/src/analyze/link_tests.rs @@ -1,6 +1,26 @@ use crate::Query; use indoc::indoc; +#[test] +fn predicate_on_non_leaf() { + let input = r"Q = (function_declaration == 'foo')"; + + let res = Query::expect_invalid_linking(input); + + insta::assert_snapshot!(res, @r" + error: predicates match text content, but this node can contain children + | + 1 | Q = (function_declaration == 'foo') + | ^^^^^^^^ + "); +} + +#[test] +fn predicate_on_leaf_valid() { + let input = r#"Q = (identifier == "foo")"#; + Query::expect_valid_linking(input); +} + #[test] fn valid_query_with_field() { let input = indoc! 
{r#" diff --git a/crates/plotnik-lib/src/analyze/mod.rs b/crates/plotnik-lib/src/analyze/mod.rs index d09294e9..c1960d16 100644 --- a/crates/plotnik-lib/src/analyze/mod.rs +++ b/crates/plotnik-lib/src/analyze/mod.rs @@ -32,5 +32,7 @@ pub use link::LinkOutput; pub use recursion::validate_recursion; pub use symbol_table::{SymbolTable, UNNAMED_DEF}; pub use type_check::{TypeContext, infer_types, primary_def_name}; -pub use validation::{validate_alt_kinds, validate_anchors, validate_empty_constructs}; +pub use validation::{ + validate_alt_kinds, validate_anchors, validate_empty_constructs, validate_predicates, +}; pub use visitor::{Visitor, walk_expr}; diff --git a/crates/plotnik-lib/src/analyze/validation/mod.rs b/crates/plotnik-lib/src/analyze/validation/mod.rs index 6202db5e..929122f2 100644 --- a/crates/plotnik-lib/src/analyze/validation/mod.rs +++ b/crates/plotnik-lib/src/analyze/validation/mod.rs @@ -4,10 +4,12 @@ //! - Alternation kind consistency (alt_kinds) //! - Anchor placement rules (anchors) //! - Empty constructs (empty_constructs) +//! - Predicate regex patterns (predicates) pub mod alt_kinds; pub mod anchors; pub mod empty_constructs; +pub mod predicates; #[cfg(test)] mod alt_kinds_tests; @@ -15,7 +17,10 @@ mod alt_kinds_tests; mod anchors_tests; #[cfg(test)] mod empty_constructs_tests; +#[cfg(test)] +mod predicates_tests; pub use alt_kinds::validate_alt_kinds; pub use anchors::validate_anchors; pub use empty_constructs::validate_empty_constructs; +pub use predicates::validate_predicates; diff --git a/crates/plotnik-lib/src/analyze/validation/predicates.rs b/crates/plotnik-lib/src/analyze/validation/predicates.rs new file mode 100644 index 00000000..3d2384ad --- /dev/null +++ b/crates/plotnik-lib/src/analyze/validation/predicates.rs @@ -0,0 +1,137 @@ +//! Predicate validation. +//! +//! Validates regex patterns in predicates for unsupported features: +//! - Backreferences (`\1`) +//! - Lookahead/lookbehind (`(?=...)`, `(?!...)`, etc.) +//! 
- Named captures (`(?P<name>...)`) + +use regex_syntax::ast::{self, visit, Ast, GroupKind, Visitor as RegexVisitor}; +use rowan::TextRange; + +use crate::SourceId; +use crate::analyze::visitor::{Visitor, walk_named_node}; +use crate::diagnostics::{DiagnosticKind, Diagnostics}; +use crate::parser::{NamedNode, Root}; + +pub fn validate_predicates( + source_id: SourceId, + source: &str, + ast: &Root, + diag: &mut Diagnostics, +) { + let mut validator = PredicateValidator { + diag, + source_id, + source, + }; + validator.visit(ast); +} + +struct PredicateValidator<'q, 'd> { + diag: &'d mut Diagnostics, + source_id: SourceId, + source: &'q str, +} + +impl Visitor for PredicateValidator<'_, '_> { + fn visit_named_node(&mut self, node: &NamedNode) { + // Validate regex syntax if this is a regex predicate + if let Some(pred) = node.predicate() + && let Some(op) = pred.operator() + && op.is_regex_op() + && let Some(regex) = pred.regex() + { + self.validate_regex(regex.pattern(self.source), regex.text_range()); + } + walk_named_node(self, node); + } +} + +impl PredicateValidator<'_, '_> { + fn validate_regex(&mut self, pattern: &str, regex_range: TextRange) { + // Reject empty regex patterns + if pattern.is_empty() { + self.diag + .report(self.source_id, DiagnosticKind::EmptyRegex, regex_range) + .emit(); + return; + } + + // Parse with octal disabled so \1-\9 are backreferences, not octal + let parser_result = ast::parse::ParserBuilder::new() + .octal(false) + .build() + .parse(pattern); + + let parsed_ast = match parser_result { + Ok(ast) => ast, + Err(e) => { + let span = self.map_regex_span(e.span(), regex_range); + let report = match e.kind() { + ast::ErrorKind::UnsupportedBackreference => { + self.diag.report(self.source_id, DiagnosticKind::RegexBackreference, span) + } + ast::ErrorKind::UnsupportedLookAround => { + // Skip the opening `(` - point at `?=` / `?!` / `?<=` / `?
self + .diag + .report(self.source_id, DiagnosticKind::RegexSyntaxError, span) + .message(format!("{}", e.kind())), + }; + report.emit(); + return; + } + }; + + // Walk AST to find named captures + let detector = NamedCaptureDetector { + named_captures: Vec::new(), + }; + let detector = visit(&parsed_ast, detector).unwrap(); + + for capture_span in detector.named_captures { + let span = self.map_regex_span(&capture_span, regex_range); + self.diag + .report(self.source_id, DiagnosticKind::RegexNamedCapture, span) + .emit(); + } + } + + /// Map a span within the regex pattern to a span in the query source. + fn map_regex_span(&self, regex_span: &ast::Span, regex_range: TextRange) -> TextRange { + // regex_range includes the `/` delimiters, so content starts at +1 + let content_start = u32::from(regex_range.start()) + 1; + let start = content_start + regex_span.start.offset as u32; + let end = content_start + regex_span.end.offset as u32; + TextRange::new(start.into(), end.into()) + } +} + +struct NamedCaptureDetector { + named_captures: Vec, +} + +impl RegexVisitor for NamedCaptureDetector { + type Output = Self; + type Err = std::convert::Infallible; + + fn finish(self) -> Result { + Ok(self) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<(), Self::Err> { + if let Ast::Group(group) = ast + && let GroupKind::CaptureName { name, .. 
} = &group.kind + { + // Span for `?P` (skip opening paren, include closing `>`) + let start = ast::Position::new(group.span.start.offset + 1, group.span.start.line, group.span.start.column + 1); + let end = ast::Position::new(name.span.end.offset + 1, name.span.end.line, name.span.end.column + 1); + self.named_captures.push(ast::Span::new(start, end)); + } + Ok(()) + } +} diff --git a/crates/plotnik-lib/src/analyze/validation/predicates_tests.rs b/crates/plotnik-lib/src/analyze/validation/predicates_tests.rs new file mode 100644 index 00000000..465d0041 --- /dev/null +++ b/crates/plotnik-lib/src/analyze/validation/predicates_tests.rs @@ -0,0 +1,85 @@ +use crate::query::QueryAnalyzed; + +#[test] +fn backreference_error() { + let q = QueryAnalyzed::expect(r"Q = (identifier =~ /(.)\1/)"); + assert!(!q.is_valid()); + insta::assert_snapshot!(q.dump_diagnostics(), @r" + error: backreferences are not supported in regex + | + 1 | Q = (identifier =~ /(.)\1/) + | ^^ + "); +} + +#[test] +fn lookahead_error() { + let q = QueryAnalyzed::expect(r"Q = (identifier =~ /foo(?=bar)/)"); + assert!(!q.is_valid()); + insta::assert_snapshot!(q.dump_diagnostics(), @r" + error: lookahead/lookbehind is not supported in regex + | + 1 | Q = (identifier =~ /foo(?=bar)/) + | ^^ + "); +} + +#[test] +fn lookbehind_error() { + let q = QueryAnalyzed::expect(r"Q = (identifier =~ /(?<=foo)bar/)"); + assert!(!q.is_valid()); + insta::assert_snapshot!(q.dump_diagnostics(), @r" + error: lookahead/lookbehind is not supported in regex + | + 1 | Q = (identifier =~ /(?<=foo)bar/) + | ^^^ + "); +} + +#[test] +fn named_capture_error() { + let q = QueryAnalyzed::expect(r"Q = (identifier =~ /(?Pfoo)/)"); + assert!(!q.is_valid()); + insta::assert_snapshot!(q.dump_diagnostics(), @r" + error: named captures are not supported in regex + | + 1 | Q = (identifier =~ /(?Pfoo)/) + | ^^^^^^^^ + "); +} + +#[test] +fn syntax_error() { + let q = QueryAnalyzed::expect(r"Q = (identifier =~ /[/)"); + assert!(!q.is_valid()); + 
insta::assert_snapshot!(q.dump_diagnostics(), @r" + error: invalid regex syntax: unclosed character class + | + 1 | Q = (identifier =~ /[/) + | ^ + "); +} + +#[test] +fn empty_regex_error() { + let q = QueryAnalyzed::expect(r"Q = (identifier =~ //)"); + assert!(!q.is_valid()); + insta::assert_snapshot!(q.dump_diagnostics(), @r" + error: empty regex pattern + | + 1 | Q = (identifier =~ //) + | ^^ + "); +} + +#[test] +fn valid_regex() { + let q = QueryAnalyzed::expect(r"Q = (identifier =~ /^test_/)"); + assert!(q.is_valid()); +} + +#[test] +fn valid_string_predicate() { + let q = QueryAnalyzed::expect(r#"Q = (identifier == "foo")"#); + assert!(q.is_valid()); +} diff --git a/crates/plotnik-lib/src/bytecode/dump.rs b/crates/plotnik-lib/src/bytecode/dump.rs index df549b3b..369b608d 100644 --- a/crates/plotnik-lib/src/bytecode/dump.rs +++ b/crates/plotnik-lib/src/bytecode/dump.rs @@ -7,6 +7,8 @@ use std::fmt::Write as _; use crate::colors::Colors; +use crate::parser::PredicateOp; + use super::format::{LineBuilder, Symbol, format_effect, nav_symbol, width_for_count}; use super::ids::TypeId; use super::instructions::StepId; @@ -436,9 +438,10 @@ fn instruction_step_count(instr: &Instruction) -> u16 { let neg = m.neg_fields().count(); let post = m.post_effects().count(); let succ = m.succ_count(); - let slots = pre + neg + post + succ; + let pred = if m.has_predicate() { 2 } else { 0 }; + let slots = pre + neg + post + pred + succ; - if pre == 0 && neg == 0 && post == 0 && succ <= 1 { + if pre == 0 && neg == 0 && post == 0 && pred == 0 && succ <= 1 { 1 // Match8 } else if slots <= 4 { 2 // Match16 @@ -474,7 +477,7 @@ fn format_instruction( fn format_match( step: u16, m: &Match, - _module: &Module, + module: &Module, ctx: &DumpContext, step_width: usize, ) -> String { @@ -482,14 +485,14 @@ fn format_match( let symbol = nav_symbol(m.nav); let prefix = format!(" {:0sw$} {} ", step, symbol.format(), sw = step_width); - let content = format_match_content(m, ctx); + let content = 
format_match_content(m, module, ctx); let successors = format_match_successors(m, ctx, step_width); let base = format!("{prefix}{content}"); builder.pad_successors(base, &successors) } -fn format_match_content(m: &Match, ctx: &DumpContext) -> String { +fn format_match_content(m: &Match, module: &Module, ctx: &DumpContext) -> String { let mut parts = Vec::new(); let pre: Vec<_> = m.pre_effects().map(|e| format_effect(&e)).collect(); @@ -511,6 +514,20 @@ fn format_match_content(m: &Match, ctx: &DumpContext) -> String { if !node_part.is_empty() { parts.push(node_part); } + + // Format predicate if present + if let Some((op, is_regex, value_ref)) = m.predicate() { + let op = PredicateOp::from_byte(op); + let value = if is_regex { + let string_id = module.regexes().get_string_id(value_ref as usize); + let pattern = &ctx.all_strings[string_id.get() as usize]; + format!("/{}/", pattern) + } else { + let s = &ctx.all_strings[value_ref as usize]; + format!("{:?}", s) + }; + parts.push(format!("{} {}", op.as_str(), value)); + } } let post: Vec<_> = m.post_effects().map(|e| format_effect(&e)).collect(); diff --git a/crates/plotnik-lib/src/bytecode/header.rs b/crates/plotnik-lib/src/bytecode/header.rs index fbb1d3a8..773dc335 100644 --- a/crates/plotnik-lib/src/bytecode/header.rs +++ b/crates/plotnik-lib/src/bytecode/header.rs @@ -196,7 +196,7 @@ impl Header { let str_table_size = (self.str_table_count as u32 + 1) * 4; let regex_table = align_up(str_table + str_table_size, align); - let regex_table_size = (self.regex_table_count as u32 + 1) * 4; + let regex_table_size = (self.regex_table_count as u32 + 1) * 8; // Symbol sections let node_types = align_up(regex_table + regex_table_size, align); diff --git a/crates/plotnik-lib/src/bytecode/instructions.rs b/crates/plotnik-lib/src/bytecode/instructions.rs index 2925bb21..6545258f 100644 --- a/crates/plotnik-lib/src/bytecode/instructions.rs +++ b/crates/plotnik-lib/src/bytecode/instructions.rs @@ -301,6 +301,28 @@ impl<'a> 
Match<'a> { self.has_predicate } + /// Get predicate data if present: (op, is_regex, value_ref). + /// + /// - `op`: operator (0=Eq, 1=Ne, 2=StartsWith, 3=EndsWith, 4=Contains, 5=RegexMatch, 6=RegexNoMatch) + /// - `is_regex`: true if value_ref is a RegexTable index, false if StringTable index + /// - `value_ref`: index into the appropriate table + pub fn predicate(&self) -> Option<(u8, bool, u16)> { + if !self.has_predicate { + return None; + } + + let effects_size = + (self.pre_count as usize + self.neg_count as usize + self.post_count as usize) * 2; + let offset = 8 + effects_size; + + let op_and_flags = u16::from_le_bytes([self.bytes[offset], self.bytes[offset + 1]]); + let op = (op_and_flags & 0xFF) as u8; + let is_regex = (op_and_flags >> 8) & 0x1 != 0; + let value_ref = u16::from_le_bytes([self.bytes[offset + 2], self.bytes[offset + 3]]); + + Some((op, is_regex, value_ref)) + } + /// Byte offset where successors start in the payload. /// Accounts for predicate (4 bytes) if present. 
#[inline] diff --git a/crates/plotnik-lib/src/bytecode/instructions_tests.rs b/crates/plotnik-lib/src/bytecode/instructions_tests.rs index e780e7cc..d9c0fa4e 100644 --- a/crates/plotnik-lib/src/bytecode/instructions_tests.rs +++ b/crates/plotnik-lib/src/bytecode/instructions_tests.rs @@ -97,7 +97,7 @@ fn match_basic() { .node_type(NodeTypeIR::Named(NonZeroU16::new(42))) .node_field(NonZeroU16::new(7)) .next(Label(1)) - .resolve(&map, |_, _| None, |_| None); + .resolve(&map, |_, _| None, |_| None, |_| None); assert_eq!(bytes.len(), 8); @@ -118,7 +118,7 @@ fn match_basic() { fn match_terminal() { let map = label_map(&[(0, 1)]); - let bytes = MatchIR::terminal(Label(0)).resolve(&map, |_, _| None, |_| None); + let bytes = MatchIR::terminal(Label(0)).resolve(&map, |_, _| None, |_| None, |_| None); assert_eq!(bytes.len(), 8); @@ -144,7 +144,7 @@ fn match_extended() { super::ir::MemberRef::absolute(42), )) .next_many(vec![Label(1), Label(2)]) - .resolve(&map, |_, _| None, |_| None); + .resolve(&map, |_, _| None, |_| None, |_| None); // 1 pre + 2 neg + 2 post + 2 succ = 7 slots → Match24 (8 slots capacity) assert_eq!(bytes.len(), 24); diff --git a/crates/plotnik-lib/src/bytecode/ir.rs b/crates/plotnik-lib/src/bytecode/ir.rs index 55bce7be..035a3061 100644 --- a/crates/plotnik-lib/src/bytecode/ir.rs +++ b/crates/plotnik-lib/src/bytecode/ir.rs @@ -13,6 +13,7 @@ use super::instructions::{ }; use super::nav::Nav; use crate::analyze::type_check::TypeId; +use crate::parser::PredicateOp; /// Node type constraint for Match instructions. 
/// @@ -166,11 +167,15 @@ impl MemberRef { Self::Deferred { field_name, field_type, - } => lookup_member(field_name, field_type).unwrap_or(0), + } => lookup_member(field_name, field_type) + .expect("deferred member reference must resolve"), Self::DeferredByIndex { parent_type, relative_index, - } => get_member_base(parent_type).unwrap_or(0) + relative_index, + } => { + get_member_base(parent_type).expect("deferred member base must resolve") + + relative_index + } } } } @@ -283,6 +288,51 @@ impl EffectIR { } } +/// Predicate value: string or regex pattern. +/// +/// Both variants store StringId (index into StringTable). For regex predicates, +/// the pattern string is also compiled to a DFA during emit. +#[derive(Clone, Debug)] +pub enum PredicateValueIR { + /// String comparison value. + String(crate::bytecode::StringId), + /// Regex pattern (StringId for pattern, compiled to DFA during emit). + Regex(crate::bytecode::StringId), +} + +/// Predicate IR for node text filtering. +/// +/// Applied after node type/field matching. Compares node text against +/// a string literal or regex pattern. +#[derive(Clone, Debug)] +pub struct PredicateIR { + pub op: PredicateOp, + pub value: PredicateValueIR, +} + +impl PredicateIR { + /// Create a string predicate (==, !=, ^=, $=, *=). + pub fn string(op: PredicateOp, value: crate::bytecode::StringId) -> Self { + Self { + op, + value: PredicateValueIR::String(value), + } + } + + /// Create a regex predicate (=~, !~). + pub fn regex(op: PredicateOp, pattern_id: crate::bytecode::StringId) -> Self { + Self { + op, + value: PredicateValueIR::Regex(pattern_id), + } + } + + /// Returns the operator as a u8 for bytecode encoding. + pub fn op_byte(&self) -> u8 { + self.op.to_byte() + } +} + /// Pre-layout instruction with symbolic references. 
#[derive(Clone, Debug)] pub enum InstructionIR { @@ -326,18 +376,21 @@ impl InstructionIR { /// /// - `lookup_member`: maps (field_name Symbol, field_type TypeId) to member index /// - `get_member_base`: maps parent TypeId to member base index - pub fn resolve( + /// - `lookup_regex`: maps pattern to RegexTable index (for predicate regexes) + pub fn resolve( &self, map: &BTreeMap, lookup_member: F, get_member_base: G, + lookup_regex: R, ) -> Vec where F: Fn(plotnik_core::Symbol, TypeId) -> Option, G: Fn(TypeId) -> Option, + R: Fn(crate::bytecode::StringId) -> Option, { match self { - Self::Match(m) => m.resolve(map, lookup_member, get_member_base), + Self::Match(m) => m.resolve(map, lookup_member, get_member_base, lookup_regex), Self::Call(c) => c.resolve(map).to_vec(), Self::Return(r) => r.resolve().to_vec(), Self::Trampoline(t) => t.resolve(map).to_vec(), @@ -362,6 +415,8 @@ pub struct MatchIR { pub neg_fields: Vec, /// Effects to execute after successful match. pub post_effects: Vec, + /// Predicate for node text filtering (None = no text check). + pub predicate: Option, /// Successor labels (empty = accept, 1 = linear, 2+ = branch). pub successors: Vec