From a84dfb0d15f4593b612725a23a2eafc5f5861cd9 Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Thu, 15 Jan 2026 12:36:07 -0300 Subject: [PATCH] refactor: Epsilon elimination with semantic fingerprint verification --- .github/workflows/sync-langs.yml | 2 +- AGENTS.md | 55 +- .../src/bytecode/aligned_vec.rs | 5 +- .../src/bytecode/aligned_vec_tests.rs | 2 +- crates/plotnik-bytecode/src/bytecode/dump.rs | 5 +- .../src/bytecode/header_tests.rs | 36 +- .../plotnik-bytecode/src/bytecode/module.rs | 5 +- .../src/bytecode/module_tests.rs | 2 +- crates/plotnik-bytecode/src/lib.rs | 13 +- crates/plotnik-cli/src/commands/exec.rs | 4 +- .../src/analyze/validation/predicates.rs | 30 +- crates/plotnik-compiler/src/bytecode/ir.rs | 7 +- .../plotnik-compiler/src/compile/capture.rs | 16 +- .../src/compile/capture_tests.rs | 2 +- .../src/compile/compile_tests.rs | 118 +-- .../plotnik-compiler/src/compile/compiler.rs | 141 ++-- crates/plotnik-compiler/src/compile/dce.rs | 135 +++ .../src/compile/epsilon_elim.rs | 786 ++++++++++++++++++ .../src/compile/expressions.rs | 36 +- crates/plotnik-compiler/src/compile/mod.rs | 5 +- .../src/compile/navigation.rs | 2 +- .../src/compile/quantifier.rs | 6 +- crates/plotnik-compiler/src/compile/scope.rs | 6 +- .../plotnik-compiler/src/compile/sequences.rs | 11 +- crates/plotnik-compiler/src/compile/verify.rs | 353 ++++++++ .../src/diagnostics/message.rs | 9 +- crates/plotnik-compiler/src/emit/emitter.rs | 47 +- .../plotnik-compiler/src/emit/layout_tests.rs | 2 +- crates/plotnik-compiler/src/emit/mod.rs | 6 +- .../src/emit/regex_table_tests.rs | 32 +- ...it__emit_tests__alternations_captured.snap | 10 +- ...t_tests__alternations_captured_tagged.snap | 11 +- ...mit_tests__alternations_in_quantifier.snap | 41 +- ...mit__emit_tests__alternations_labeled.snap | 12 +- ...ts__alternations_no_internal_captures.snap | 12 +- ...it_tests__alternations_null_injection.snap | 12 +- ...ternations_tagged_in_field_constraint.snap | 16 +- ...ternations_tagged_with_definition_ref.snap | 28 +- ...t__emit_tests__alternations_unlabeled.snap | 12 +- ..._emit_tests__anchors_between_siblings.snap | 11 +- ...emit__emit_tests__anchors_first_child.snap | 19 +- ..._emit__emit_tests__anchors_last_child.snap | 15 +- ...__emit__emit_tests__anchors_no_anchor.snap | 11 +- ...t__emit_tests__anchors_with_anonymous.snap | 11 +- ...ler__emit__emit_tests__captures_basic.snap | 16 +- ...t__emit_tests__captures_deeply_nested.snap | 16 +- ...s__captures_enum_with_type_annotation.snap | 11 +- ...__emit__emit_tests__captures_multiple.snap | 4 +- ...mit__emit_tests__captures_nested_flat.snap | 14 +- ...sts__captures_optional_wrapper_struct.snap | 22 +- ...it__emit_tests__captures_struct_scope.snap | 10 +- ..._captures_struct_with_type_annotation.snap | 12 +- ...emit_tests__captures_with_type_custom.snap | 16 +- ...emit_tests__captures_with_type_string.snap | 16 +- ...__emit_tests__captures_wrapper_struct.snap | 34 +- ...tests__comprehensive_multi_definition.snap | 33 +- ...mit__emit_tests__definitions_multiple.snap | 13 +- ...mit_tests__definitions_nested_capture.snap | 43 +- ...it__emit_tests__definitions_reference.snap | 23 +- ..._emit__emit_tests__definitions_single.snap | 16 +- ..._emit__emit_tests__fields_alternation.snap | 14 +- ...er__emit__emit_tests__fields_multiple.snap | 4 +- ...ler__emit__emit_tests__fields_negated.snap | 10 +- ...iler__emit__emit_tests__fields_single.snap | 4 +- ...er__emit__emit_tests__nodes_anonymous.snap | 4 +- ...mpiler__emit__emit_tests__nodes_error.snap | 16 +- ...iler__emit__emit_tests__nodes_missing.snap | 16 +- ...mpiler__emit__emit_tests__nodes_named.snap | 16 +- ..._emit__emit_tests__nodes_wildcard_any.snap | 4 +- ...mit__emit_tests__nodes_wildcard_named.snap | 4 +- ...mit__emit_tests__optional_first_child.snap | 25 +- ...__emit_tests__optional_null_injection.snap | 20 +- ..._tests__quantifiers_first_child_array.snap | 36 +- ...mit__emit_tests__quantifiers_optional.snap | 20 +- ...tests__quantifiers_optional_nongreedy.snap | 20 +- ...r__emit__emit_tests__quantifiers_plus.snap | 22 +- ...mit_tests__quantifiers_plus_nongreedy.snap | 22 +- ..._tests__quantifiers_repeat_navigation.snap | 29 +- ...s__quantifiers_sequence_in_called_def.snap | 53 +- ...r__emit__emit_tests__quantifiers_star.snap | 25 +- ...mit_tests__quantifiers_star_nongreedy.snap | 25 +- ..._emit_tests__quantifiers_struct_array.snap | 38 +- ...r__emit__emit_tests__recursion_simple.snap | 19 +- ...sts__recursion_with_structured_result.snap | 28 +- ...er__emit__emit_tests__sequences_basic.snap | 11 +- ...__emit_tests__sequences_in_quantifier.snap | 33 +- ...r__emit__emit_tests__sequences_nested.snap | 15 +- ...__emit_tests__sequences_with_captures.snap | 4 +- crates/plotnik-compiler/src/parser/lexer.rs | 7 +- .../src/typegen/typescript/naming.rs | 5 +- crates/plotnik-lib/src/lib.rs | 4 +- crates/plotnik-vm/src/engine/verify.rs | 2 +- crates/plotnik-vm/src/engine/vm.rs | 9 +- docs/binary-format/01-overview.md | 30 +- docs/binary-format/03-symbols.md | 1 + docs/binary-format/04-types.md | 46 +- docs/binary-format/06-transitions.md | 32 +- 97 files changed, 2058 insertions(+), 1014 deletions(-) create mode 100644 crates/plotnik-compiler/src/compile/dce.rs create mode 100644 crates/plotnik-compiler/src/compile/epsilon_elim.rs create mode 100644 crates/plotnik-compiler/src/compile/verify.rs diff --git a/.github/workflows/sync-langs.yml b/.github/workflows/sync-langs.yml index 2768ac8..913cb93 100644 --- a/.github/workflows/sync-langs.yml +++ b/.github/workflows/sync-langs.yml @@ -3,7 +3,7 @@ name: Sync arborium languages on: push: branches: - - 'renovate/arborium-crates' + - "renovate/arborium-crates" permissions: contents: write diff --git a/AGENTS.md b/AGENTS.md index 48ba8d5..c61b728 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,26 +12,26 @@ ## Core Constructs -| Syntax | Meaning | -| ------------------- | ------------------------------ | -| `(node_kind)` | Named node | -| `"text"` / `'text'` | Anonymous node (literal token) | -| `(_)` | Any named node | -| `_` | Any node | -| `@name` | Capture (snake_case only) | -| `@x :: T` | Type annotation | -| `@x :: string` | Extract node text | -| `field: pattern` | Field constraint | -| `-field` | Negated field (assert absent) | -| `?` `*` `+` | Quantifiers (0-1, 0+, 1+) | -| `??` `*?` `+?` | Non-greedy variants | -| `.` | Anchor (adjacency, see below) | -| `{...}` | Sequence (siblings in order) | -| `[...]` | Alternation (first match wins) | -| `Name = ...` | Named definition (entrypoint) | -| `(Name)` | Use named expression | -| `(node == "x")` | String predicate (== != ^= $= *=) | -| `(node =~ /x/)` | Regex predicate (=~ !~) | +| Syntax | Meaning | +| ------------------- | ---------------------------------- | +| `(node_kind)` | Named node | +| `"text"` / `'text'` | Anonymous node (literal token) | +| `(_)` | Any named node | +| `_` | Any node | +| `@name` | Capture (snake_case only) | +| `@x :: T` | Type annotation | +| `@x :: string` | Extract node text | +| `field: pattern` | Field constraint | +| `-field` | Negated field (assert absent) | +| `?` `*` `+` | Quantifiers (0-1, 0+, 1+) | +| `??` `*?` `+?` | Non-greedy variants | +| `.` | Anchor (adjacency, see below) | +| `{...}` | Sequence (siblings in order) | +| `[...]` | Alternation (first match wins) | +| `Name = ...` | Named definition (entrypoint) | +| `(Name)` | Use named expression | +| `(node == "x")` | String predicate (== != ^= $= \*=) | +| `(node =~ /x/)` | Regex predicate (=~ !~) | ## Data Model Rules @@ -148,23 +148,26 @@ Tree-sitter: `((a) (b))` — Plotnik: `{(a) (b)}`. The #1 syntax error. ``` crates/ + plotnik-bytecode/ # Binary format definitions + src/ + bytecode/ # Instruction set, modules, linking + type_system/ # Shared type primitives plotnik-cli/ # CLI tool src/commands/ # Subcommands (ast, check, dump, exec, infer, trace, langs) - plotnik-core/ # Node type database (NodeTypes, StaticNodeTypes) and string interning (Interner, Symbol) - plotnik-lib/ # Plotnik as library + plotnik-compiler/ # Compilation pipeline src/ analyze/ # Semantic analysis (symbol_table, dependencies, type_check, validation) - bytecode/ # Binary format definitions compile/ # Thompson NFA construction (AST → IR) diagnostics/ # User-friendly error reporting emit/ # Bytecode emission (IR → binary) - engine/ # Runtime VM (execution, backtracking, effects) parser/ # Syntactic parsing (lexer, grammar, AST) query/ # Query facade (Query, QueryBuilder, SourceMap) - type_system/ # Shared type primitives typegen/ # Type declaration extraction (bytecode → .d.ts) + plotnik-core/ # Node type database (NodeTypes, StaticNodeTypes) and string interning (Interner, Symbol) plotnik-langs/ # Tree-sitter language bindings - plotnik-macros/ # Proc macros + plotnik-lib/ # Facade crate re-exporting bytecode, compiler, vm + plotnik-vm/ # Runtime VM + src/engine/ # Execution, backtracking, effects docs/ binary-format/ # Bytecode format specification lang-reference.md # Language specification diff --git a/crates/plotnik-bytecode/src/bytecode/aligned_vec.rs b/crates/plotnik-bytecode/src/bytecode/aligned_vec.rs index ca7aa82..c5d92fb 100644 --- a/crates/plotnik-bytecode/src/bytecode/aligned_vec.rs +++ b/crates/plotnik-bytecode/src/bytecode/aligned_vec.rs @@ -102,7 +102,10 @@ impl std::fmt::Debug for AlignedVec { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("AlignedVec") .field("len", &self.len) - .field("aligned", &(self.blocks.as_ptr() as usize).is_multiple_of(ALIGN)) + .field( + "aligned", + &(self.blocks.as_ptr() as usize).is_multiple_of(ALIGN), + ) .finish() } } diff --git a/crates/plotnik-bytecode/src/bytecode/aligned_vec_tests.rs b/crates/plotnik-bytecode/src/bytecode/aligned_vec_tests.rs index dc7650d..87b9842 100644 --- a/crates/plotnik-bytecode/src/bytecode/aligned_vec_tests.rs +++ b/crates/plotnik-bytecode/src/bytecode/aligned_vec_tests.rs @@ -1,4 +1,4 @@ -use super::aligned_vec::{AlignedVec, ALIGN}; +use super::aligned_vec::{ALIGN, AlignedVec}; fn is_aligned(ptr: *const u8) -> bool { (ptr as usize).is_multiple_of(ALIGN) diff --git a/crates/plotnik-bytecode/src/bytecode/dump.rs b/crates/plotnik-bytecode/src/bytecode/dump.rs index c14b671..ecb0111 100644 --- a/crates/plotnik-bytecode/src/bytecode/dump.rs +++ b/crates/plotnik-bytecode/src/bytecode/dump.rs @@ -5,15 +5,15 @@ use std::collections::BTreeMap; use std::fmt::Write as _; -use plotnik_core::Colors; use crate::predicate_op::PredicateOp; +use plotnik_core::Colors; use super::format::{LineBuilder, Symbol, format_effect, nav_symbol, width_for_count}; use super::ids::TypeId; use super::instructions::StepId; use super::module::{Instruction, Module}; -use super::node_type_ir::NodeTypeIR; use super::nav::Nav; +use super::node_type_ir::NodeTypeIR; use super::type_meta::{TypeData, TypeKind}; use super::{Call, Match, Return, Trampoline}; @@ -32,7 +32,6 @@ pub fn dump(module: &Module, colors: Colors) -> String { out } - /// Context for dump formatting, precomputes lookups for O(1) access. struct DumpContext { /// Maps step ID to entrypoint name for labeling. diff --git a/crates/plotnik-bytecode/src/bytecode/header_tests.rs b/crates/plotnik-bytecode/src/bytecode/header_tests.rs index fd88543..263adb2 100644 --- a/crates/plotnik-bytecode/src/bytecode/header_tests.rs +++ b/crates/plotnik-bytecode/src/bytecode/header_tests.rs @@ -49,12 +49,12 @@ fn compute_offsets_empty() { // New order: blobs first, then tables // All sections 64-byte aligned. With 0 counts, each table still has 1 sentinel entry (4 bytes) - assert_eq!(offsets.str_blob, 64); // after header - assert_eq!(offsets.regex_blob, 64); // 64 + align(0) = 64 - assert_eq!(offsets.str_table, 64); // 64 + align(0) = 64 - assert_eq!(offsets.regex_table, 128); // 64 + align(4) = 128 - assert_eq!(offsets.node_types, 192); // 128 + align(4) = 192 - assert_eq!(offsets.node_fields, 192); // 192 + align(0) = 192 + assert_eq!(offsets.str_blob, 64); // after header + assert_eq!(offsets.regex_blob, 64); // 64 + align(0) = 64 + assert_eq!(offsets.str_table, 64); // 64 + align(0) = 64 + assert_eq!(offsets.regex_table, 128); // 64 + align(4) = 128 + assert_eq!(offsets.node_types, 192); // 128 + align(4) = 192 + assert_eq!(offsets.node_fields, 192); // 192 + align(0) = 192 assert_eq!(offsets.trivia, 192); assert_eq!(offsets.type_defs, 192); assert_eq!(offsets.type_members, 192); @@ -84,16 +84,16 @@ fn compute_offsets_with_data() { let offsets = h.compute_offsets(); // New order: blobs first, then tables. All offsets 64-byte aligned. - assert_eq!(offsets.str_blob, 64); // header end - assert_eq!(offsets.regex_blob, 192); // 64 + 100 = 164 → 192 - assert_eq!(offsets.str_table, 320); // 192 + 128 = 320 (aligned) - assert_eq!(offsets.regex_table, 384); // 320 + 24 = 344 → 384 - assert_eq!(offsets.node_types, 448); // 384 + 12 = 396 → 448 - assert_eq!(offsets.node_fields, 512); // 448 + 40 = 488 → 512 - assert_eq!(offsets.trivia, 576); // 512 + 20 = 532 → 576 - assert_eq!(offsets.type_defs, 640); // 576 + 6 = 582 → 640 - assert_eq!(offsets.type_members, 704); // 640 + 32 = 672 → 704 - assert_eq!(offsets.type_names, 768); // 704 + 48 = 752 → 768 - assert_eq!(offsets.entrypoints, 832); // 768 + 16 = 784 → 832 - assert_eq!(offsets.transitions, 896); // 832 + 16 = 848 → 896 + assert_eq!(offsets.str_blob, 64); // header end + assert_eq!(offsets.regex_blob, 192); // 64 + 100 = 164 → 192 + assert_eq!(offsets.str_table, 320); // 192 + 128 = 320 (aligned) + assert_eq!(offsets.regex_table, 384); // 320 + 24 = 344 → 384 + assert_eq!(offsets.node_types, 448); // 384 + 12 = 396 → 448 + assert_eq!(offsets.node_fields, 512); // 448 + 40 = 488 → 512 + assert_eq!(offsets.trivia, 576); // 512 + 20 = 532 → 576 + assert_eq!(offsets.type_defs, 640); // 576 + 6 = 582 → 640 + assert_eq!(offsets.type_members, 704); // 640 + 32 = 672 → 704 + assert_eq!(offsets.type_names, 768); // 704 + 48 = 752 → 768 + assert_eq!(offsets.entrypoints, 832); // 768 + 16 = 784 → 832 + assert_eq!(offsets.transitions, 896); // 832 + 16 = 848 → 896 } diff --git a/crates/plotnik-bytecode/src/bytecode/module.rs b/crates/plotnik-bytecode/src/bytecode/module.rs index 554fb8d..08f4672 100644 --- a/crates/plotnik-bytecode/src/bytecode/module.rs +++ b/crates/plotnik-bytecode/src/bytecode/module.rs @@ -197,7 +197,10 @@ impl Module { } /// Load a module from owned bytes (copies into aligned storage). - #[deprecated(since = "0.1.0", note = "use `Module::from_aligned` for AlignedVec or `Module::load` for copying")] + #[deprecated( + since = "0.1.0", + note = "use `Module::from_aligned` for AlignedVec or `Module::load` for copying" + )] pub fn from_bytes(bytes: Vec) -> Result { Self::load(&bytes) } diff --git a/crates/plotnik-bytecode/src/bytecode/module_tests.rs b/crates/plotnik-bytecode/src/bytecode/module_tests.rs index 84b3100..1a4a6e3 100644 --- a/crates/plotnik-bytecode/src/bytecode/module_tests.rs +++ b/crates/plotnik-bytecode/src/bytecode/module_tests.rs @@ -1,7 +1,7 @@ //! Tests for the bytecode module. -use super::module::{ByteStorage, ModuleError}; use super::AlignedVec; +use super::module::{ByteStorage, ModuleError}; #[test] fn byte_storage_copy_from_slice() { diff --git a/crates/plotnik-bytecode/src/lib.rs b/crates/plotnik-bytecode/src/lib.rs index 116a85b..771fa2e 100644 --- a/crates/plotnik-bytecode/src/lib.rs +++ b/crates/plotnik-bytecode/src/lib.rs @@ -15,13 +15,12 @@ pub mod type_system; // Re-export commonly used items at crate root pub use bytecode::{ AlignedVec, ByteStorage, Call, EffectOp, EffectOpcode, Entrypoint, EntrypointsView, - FieldSymbol, Header, Instruction, LineBuilder, MAGIC, MAX_MATCH_PAYLOAD_SLOTS, - MAX_PRE_EFFECTS, Match, Module, ModuleError, Nav, NodeSymbol, NodeTypeIR, Opcode, RegexView, - Return, SECTION_ALIGN, STEP_SIZE, SectionOffsets, Slice, StepAddr, StepId, StringId, - StringsView, Symbol, SymbolsView, Trampoline, TriviaEntry, TriviaView, TypeData, TypeDef, - TypeId, TypeKind, TypeMember, TypeName, TypesView, VERSION, align_to_section, cols, dump, - format_effect, nav_symbol, select_match_opcode, superscript, trace, truncate_text, - width_for_count, + FieldSymbol, Header, Instruction, LineBuilder, MAGIC, MAX_MATCH_PAYLOAD_SLOTS, MAX_PRE_EFFECTS, + Match, Module, ModuleError, Nav, NodeSymbol, NodeTypeIR, Opcode, RegexView, Return, + SECTION_ALIGN, STEP_SIZE, SectionOffsets, Slice, StepAddr, StepId, StringId, StringsView, + Symbol, SymbolsView, Trampoline, TriviaEntry, TriviaView, TypeData, TypeDef, TypeId, TypeKind, + TypeMember, TypeName, TypesView, VERSION, align_to_section, cols, dump, format_effect, + nav_symbol, select_match_opcode, superscript, trace, truncate_text, width_for_count, }; pub use dfa::deserialize_dfa; pub use predicate_op::PredicateOp; diff --git a/crates/plotnik-cli/src/commands/exec.rs b/crates/plotnik-cli/src/commands/exec.rs index eaf677d..10bcfc2 100644 --- a/crates/plotnik-cli/src/commands/exec.rs +++ b/crates/plotnik-cli/src/commands/exec.rs @@ -35,7 +35,9 @@ pub fn run(args: ExecArgs) { color: args.color, }); - let vm = VM::builder(&source_code, &tree).trivia_types(trivia_types).build(); + let vm = VM::builder(&source_code, &tree) + .trivia_types(trivia_types) + .build(); let effects = match vm.execute(&module, 0, &entrypoint) { Ok(effects) => effects, Err(RuntimeError::NoMatch) => { diff --git a/crates/plotnik-compiler/src/analyze/validation/predicates.rs b/crates/plotnik-compiler/src/analyze/validation/predicates.rs index 3d2384a..232daa1 100644 --- a/crates/plotnik-compiler/src/analyze/validation/predicates.rs +++ b/crates/plotnik-compiler/src/analyze/validation/predicates.rs @@ -5,7 +5,7 @@ //! - Lookahead/lookbehind (`(?=...)`, `(?!...)`, etc.) //! - Named captures (`(?P...)`) -use regex_syntax::ast::{self, visit, Ast, GroupKind, Visitor as RegexVisitor}; +use regex_syntax::ast::{self, Ast, GroupKind, Visitor as RegexVisitor, visit}; use rowan::TextRange; use crate::SourceId; @@ -13,12 +13,7 @@ use crate::analyze::visitor::{Visitor, walk_named_node}; use crate::diagnostics::{DiagnosticKind, Diagnostics}; use crate::parser::{NamedNode, Root}; -pub fn validate_predicates( - source_id: SourceId, - source: &str, - ast: &Root, - diag: &mut Diagnostics, -) { +pub fn validate_predicates(source_id: SourceId, source: &str, ast: &Root, diag: &mut Diagnostics) { let mut validator = PredicateValidator { diag, source_id, @@ -69,13 +64,16 @@ impl PredicateValidator<'_, '_> { let span = self.map_regex_span(e.span(), regex_range); let report = match e.kind() { ast::ErrorKind::UnsupportedBackreference => { - self.diag.report(self.source_id, DiagnosticKind::RegexBackreference, span) + self.diag + .report(self.source_id, DiagnosticKind::RegexBackreference, span) } ast::ErrorKind::UnsupportedLookAround => { // Skip the opening `(` - point at `?=` / `?!` / `?<=` / `? self .diag @@ -128,8 +126,16 @@ impl RegexVisitor for NamedCaptureDetector { && let GroupKind::CaptureName { name, .. } = &group.kind { // Span for `?P` (skip opening paren, include closing `>`) - let start = ast::Position::new(group.span.start.offset + 1, group.span.start.line, group.span.start.column + 1); - let end = ast::Position::new(name.span.end.offset + 1, name.span.end.line, name.span.end.column + 1); + let start = ast::Position::new( + group.span.start.offset + 1, + group.span.start.line, + group.span.start.column + 1, + ); + let end = ast::Position::new( + name.span.end.offset + 1, + name.span.end.line, + name.span.end.column + 1, + ); self.named_captures.push(ast::Span::new(start, end)); } Ok(()) diff --git a/crates/plotnik-compiler/src/bytecode/ir.rs b/crates/plotnik-compiler/src/bytecode/ir.rs index aedae9c..8eed365 100644 --- a/crates/plotnik-compiler/src/bytecode/ir.rs +++ b/crates/plotnik-compiler/src/bytecode/ir.rs @@ -7,11 +7,11 @@ use std::collections::BTreeMap; use std::num::NonZeroU16; +use crate::analyze::type_check::TypeId; use plotnik_bytecode::{ Call, EffectOp, EffectOpcode, Nav, Opcode, PredicateOp, Return, StepAddr, StepId, Trampoline, select_match_opcode, }; -use crate::analyze::type_check::TypeId; /// Node type constraint for Match instructions. /// @@ -643,8 +643,9 @@ impl MatchIR { let value_ref = match &pred.value { PredicateValueIR::String(string_id) => string_id.get(), - PredicateValueIR::Regex(string_id) => lookup_regex(*string_id) - .expect("regex predicate must be interned"), + PredicateValueIR::Regex(string_id) => { + lookup_regex(*string_id).expect("regex predicate must be interned") + } }; bytes[offset..offset + 2].copy_from_slice(&value_ref.to_le_bytes()); offset += 2; diff --git a/crates/plotnik-compiler/src/compile/capture.rs b/crates/plotnik-compiler/src/compile/capture.rs index 8c399e1..1529619 100644 --- a/crates/plotnik-compiler/src/compile/capture.rs +++ b/crates/plotnik-compiler/src/compile/capture.rs @@ -7,8 +7,8 @@ use std::collections::HashSet; use crate::analyze::type_check::{TypeContext, TypeId, TypeShape}; use crate::bytecode::EffectIR; -use plotnik_bytecode::EffectOpcode; use crate::parser::ast::{self, Expr}; +use plotnik_bytecode::EffectOpcode; use super::Compiler; use super::navigation::{inner_creates_scope, is_star_or_plus_quantifier, is_truly_empty_scope}; @@ -143,12 +143,12 @@ impl Compiler<'_> { if !inner_creates_scope(&ei) { return false; } - let Some(info) = self.type_ctx.get_term_info(&ei) else { + let Some(info) = self.ctx.type_ctx.get_term_info(&ei) else { return false; }; info.flow .type_id() - .and_then(|id| self.type_ctx.get_type(id)) + .and_then(|id| self.ctx.type_ctx.get_type(id)) .is_some_and(|shape| matches!(shape, TypeShape::Struct(_) | TypeShape::Enum(_))) }); @@ -195,7 +195,7 @@ impl Compiler<'_> { } // Check the actual inferred type, not syntax - let Some(info) = self.type_ctx.get_term_info(&inner) else { + let Some(info) = self.ctx.type_ctx.get_term_info(&inner) else { return true; }; @@ -204,7 +204,7 @@ impl Compiler<'_> { !info .flow .type_id() - .and_then(|id| self.type_ctx.get_type(id)) + .and_then(|id| self.ctx.type_ctx.get_type(id)) .is_some_and(|shape| matches!(shape, TypeShape::Struct(_) | TypeShape::Enum(_))) } @@ -235,9 +235,9 @@ impl Compiler<'_> { /// In this case, we skip emitting Node/Text effects in captures. fn ref_returns_structured(&self, r: &ast::Ref) -> bool { r.name() - .and_then(|name| self.type_ctx.get_def_id(self.interner, name.text())) - .and_then(|def_id| self.type_ctx.get_def_type(def_id)) - .and_then(|def_type| self.type_ctx.get_type(def_type)) + .and_then(|name| self.ctx.type_ctx.get_def_id(self.ctx.interner, name.text())) + .and_then(|def_id| self.ctx.type_ctx.get_def_type(def_id)) + .and_then(|def_type| self.ctx.type_ctx.get_type(def_type)) .is_some_and(|shape| { matches!( shape, diff --git a/crates/plotnik-compiler/src/compile/capture_tests.rs b/crates/plotnik-compiler/src/compile/capture_tests.rs index b6e5051..595738d 100644 --- a/crates/plotnik-compiler/src/compile/capture_tests.rs +++ b/crates/plotnik-compiler/src/compile/capture_tests.rs @@ -1,6 +1,6 @@ use super::capture::CaptureEffects; -use plotnik_bytecode::EffectOpcode; use crate::bytecode::{EffectIR, MemberRef}; +use plotnik_bytecode::EffectOpcode; #[test] fn nest_scope_preserves_outer_and_nests_inner() { diff --git a/crates/plotnik-compiler/src/compile/compile_tests.rs b/crates/plotnik-compiler/src/compile/compile_tests.rs index 255127e..622f9b7 100644 --- a/crates/plotnik-compiler/src/compile/compile_tests.rs +++ b/crates/plotnik-compiler/src/compile/compile_tests.rs @@ -1,8 +1,23 @@ //! Integration tests for the compilation pipeline. +use std::cell::RefCell; + use super::*; -use crate::emit::StringTableBuilder; -use crate::query::QueryBuilder; +use crate::{emit::StringTableBuilder, query::QueryBuilder}; + +/// Helper to compile a query with default context. +fn compile_query(query: &crate::query::QueryAnalyzed) -> CompileResult { + let strings = RefCell::new(StringTableBuilder::new()); + let ctx = CompileCtx { + interner: query.interner(), + type_ctx: query.type_context(), + symbol_table: &query.symbol_table, + strings: &strings, + node_types: None, + node_fields: None, + }; + Compiler::compile(&ctx).unwrap() +} #[test] fn compile_simple_named_node() { @@ -11,16 +26,7 @@ fn compile_simple_named_node() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); // Should have at least one instruction assert!(!result.instructions.is_empty()); @@ -35,16 +41,7 @@ fn compile_alternation() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); assert!(!result.instructions.is_empty()); } @@ -56,16 +53,7 @@ fn compile_sequence() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); assert!(!result.instructions.is_empty()); } @@ -77,16 +65,7 @@ fn compile_quantified() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); assert!(!result.instructions.is_empty()); } @@ -98,16 +77,7 @@ fn compile_capture() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); assert!(!result.instructions.is_empty()); } @@ -119,16 +89,7 @@ fn compile_nested() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); assert!(!result.instructions.is_empty()); } @@ -148,16 +109,7 @@ fn compile_large_tagged_alternation() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); assert!(!result.instructions.is_empty()); } @@ -174,16 +126,7 @@ fn compile_unlabeled_alternation_5_branches_with_captures() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); assert!(!result.instructions.is_empty()); @@ -213,16 +156,7 @@ fn compile_unlabeled_alternation_8_branches_with_captures() { .unwrap() .analyze(); - let mut strings = StringTableBuilder::new(); - let result = Compiler::compile( - query.interner(), - query.type_context(), - &query.symbol_table, - &mut strings, - None, - None, - ) - .unwrap(); + let result = compile_query(&query); assert!(!result.instructions.is_empty()); } diff --git a/crates/plotnik-compiler/src/compile/compiler.rs b/crates/plotnik-compiler/src/compile/compiler.rs index 93a465a..6f0e95e 100644 --- a/crates/plotnik-compiler/src/compile/compiler.rs +++ b/crates/plotnik-compiler/src/compile/compiler.rs @@ -1,27 +1,40 @@ //! Core compiler state and entry points. +use std::cell::RefCell; + use indexmap::IndexMap; use plotnik_core::{Interner, NodeFieldId, NodeTypeId, Symbol}; use crate::analyze::symbol_table::SymbolTable; use crate::analyze::type_check::{DefId, TypeContext}; -use plotnik_bytecode::Nav; use crate::bytecode::{InstructionIR, Label, ReturnIR, TrampolineIR}; use crate::emit::StringTableBuilder; use crate::parser::Expr; +use plotnik_bytecode::Nav; use super::capture::CaptureEffects; +use super::dce::remove_unreachable; +use super::epsilon_elim::eliminate_epsilons; use super::error::{CompileError, CompileResult}; use super::scope::StructScope; +use super::verify::debug_verify_ir_fingerprint; + +/// Compilation context bundling all shared compilation state. +/// +/// Uses `RefCell` for `strings` to allow interior mutability while +/// sharing the context across compilation phases. +pub struct CompileCtx<'a> { + pub interner: &'a Interner, + pub type_ctx: &'a TypeContext, + pub symbol_table: &'a SymbolTable, + pub strings: &'a RefCell, + pub node_types: Option<&'a IndexMap>, + pub node_fields: Option<&'a IndexMap>, +} /// Compiler state for Thompson construction. pub struct Compiler<'a> { - pub(super) interner: &'a Interner, - pub(super) type_ctx: &'a TypeContext, - pub(crate) symbol_table: &'a SymbolTable, - pub(super) strings: &'a mut StringTableBuilder, - pub(super) node_type_ids: Option<&'a IndexMap>, - pub(super) node_field_ids: Option<&'a IndexMap>, + pub(super) ctx: &'a CompileCtx<'a>, pub(super) instructions: Vec, pub(crate) next_label_id: u32, pub(super) def_entries: IndexMap, @@ -30,111 +43,50 @@ pub struct Compiler<'a> { pub(super) scope_stack: Vec, } -/// Builder for `Compiler`. -pub struct CompilerBuilder<'a> { - interner: &'a Interner, - type_ctx: &'a TypeContext, - symbol_table: &'a SymbolTable, - strings: &'a mut StringTableBuilder, - node_type_ids: Option<&'a IndexMap>, - node_field_ids: Option<&'a IndexMap>, -} - -impl<'a> CompilerBuilder<'a> { - /// Create a new builder with required parameters. - pub fn new( - interner: &'a Interner, - type_ctx: &'a TypeContext, - symbol_table: &'a SymbolTable, - strings: &'a mut StringTableBuilder, - ) -> Self { +impl<'a> Compiler<'a> { + /// Create a new compiler with the given context. + pub fn new(ctx: &'a CompileCtx<'a>) -> Self { Self { - interner, - type_ctx, - symbol_table, - strings, - node_type_ids: None, - node_field_ids: None, - } - } - - /// Set node type and field IDs for linked compilation. - pub fn linked( - mut self, - node_type_ids: &'a IndexMap, - node_field_ids: &'a IndexMap, - ) -> Self { - self.node_type_ids = Some(node_type_ids); - self.node_field_ids = Some(node_field_ids); - self - } - - /// Build the Compiler. - pub fn build(self) -> Compiler<'a> { - Compiler { - interner: self.interner, - type_ctx: self.type_ctx, - symbol_table: self.symbol_table, - strings: self.strings, - node_type_ids: self.node_type_ids, - node_field_ids: self.node_field_ids, + ctx, instructions: Vec::new(), next_label_id: 0, def_entries: IndexMap::new(), scope_stack: Vec::new(), } } -} - -impl<'a> Compiler<'a> { - /// Create a builder for Compiler. - pub fn builder( - interner: &'a Interner, - type_ctx: &'a TypeContext, - symbol_table: &'a SymbolTable, - strings: &'a mut StringTableBuilder, - ) -> CompilerBuilder<'a> { - CompilerBuilder::new(interner, type_ctx, symbol_table, strings) - } /// Compile all definitions in the query. - pub fn compile( - interner: &'a Interner, - type_ctx: &'a TypeContext, - symbol_table: &'a SymbolTable, - strings: &'a mut StringTableBuilder, - node_type_ids: Option<&'a IndexMap>, - node_field_ids: Option<&'a IndexMap>, - ) -> Result { - let mut compiler = - if let (Some(type_ids), Some(field_ids)) = (node_type_ids, node_field_ids) { - Compiler::builder(interner, type_ctx, symbol_table, strings) - .linked(type_ids, field_ids) - .build() - } else { - Compiler::builder(interner, type_ctx, symbol_table, strings).build() - }; + pub fn compile(ctx: &'a CompileCtx<'a>) -> Result { + let mut compiler = Compiler::new(ctx); // Emit universal preamble first: Obj -> Trampoline -> EndObj -> Return // This wraps any entrypoint to create the top-level scope. let preamble_entry = compiler.emit_preamble(); // Pre-allocate entry labels for all definitions - for (def_id, _) in type_ctx.iter_def_types() { + for (def_id, _) in ctx.type_ctx.iter_def_types() { let label = compiler.fresh_label(); compiler.def_entries.insert(def_id, label); } // Compile each definition - for (def_id, _) in type_ctx.iter_def_types() { + for (def_id, _) in ctx.type_ctx.iter_def_types() { compiler.compile_def(def_id)?; } - Ok(CompileResult { + let mut result = CompileResult { instructions: compiler.instructions, def_entries: compiler.def_entries, preamble_entry, - }) + }; + + // Eliminate epsilon transitions (with semantic verification in debug builds) + eliminate_epsilons(&mut result, ctx); + + // Remove unreachable instructions (bypassed epsilons, etc.) + remove_unreachable(&mut result); + + Ok(result) } /// Emit the universal preamble: Obj -> Trampoline -> EndObj -> Return @@ -165,10 +117,10 @@ impl<'a> Compiler<'a> { /// Compile a single definition. fn compile_def(&mut self, def_id: DefId) -> Result<(), CompileError> { - let name_sym = self.type_ctx.def_name_sym(def_id); - let name = self.interner.resolve(name_sym); + let name_sym = self.ctx.type_ctx.def_name_sym(def_id); + let name = self.ctx.interner.resolve(name_sym); - let Some(body) = self.symbol_table.get(name) else { + let Some(body) = self.ctx.symbol_table.get(name) else { return Err(CompileError::DefinitionNotFound(name.to_string())); }; @@ -189,7 +141,7 @@ impl<'a> Compiler<'a> { // Definitions are compiled in normalized form: body -> Return // No Obj/EndObj wrapper - that's the caller's responsibility (call-site scoping). // We still use with_scope for member index lookup during compilation. - let body_entry = if let Some(type_id) = self.type_ctx.get_def_type(def_id) { + let body_entry = if let Some(type_id) = self.ctx.type_ctx.get_def_type(def_id) { self.with_scope(type_id, |this| { this.compile_expr_with_nav(body, return_label, body_nav) }) @@ -202,6 +154,15 @@ impl<'a> Compiler<'a> { self.emit_epsilon(entry_label, vec![body_entry]); } + // Debug-only: verify IR semantic fingerprint + debug_verify_ir_fingerprint( + &self.instructions, + entry_label, + &self.def_entries, + name, + self.ctx, + ); + Ok(()) } diff --git a/crates/plotnik-compiler/src/compile/dce.rs b/crates/plotnik-compiler/src/compile/dce.rs new file mode 100644 index 0000000..2a4307b --- /dev/null +++ b/crates/plotnik-compiler/src/compile/dce.rs @@ -0,0 +1,135 @@ +//! Dead code elimination pass. +//! +//! Removes unreachable instructions after epsilon elimination. +//! Instructions become unreachable when epsilon transitions are +//! bypassed and no other path leads to them. + +use std::collections::HashSet; + +use crate::bytecode::Label; + +use super::error::CompileResult; + +/// Remove instructions not reachable from any entry point. +/// +/// This pass runs after epsilon elimination to clean up instructions +/// that were bypassed during optimization. +pub fn remove_unreachable(result: &mut CompileResult) { + let reachable = compute_reachable(result); + result + .instructions + .retain(|instr| reachable.contains(&instr.label())); +} + +/// Compute all labels reachable from entry points via BFS. +fn compute_reachable(result: &CompileResult) -> HashSet