From 0ebf0a2246febc07d02f654502521fe9d7a909de Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Wed, 31 Dec 2025 09:27:28 -0300 Subject: [PATCH 1/3] feat: add query compilation to bytecode --- crates/plotnik-lib/src/bytecode/dump_tests.rs | 131 ++ crates/plotnik-lib/src/bytecode/mod.rs | 2 + ...ecode__dump_tests__dump_comprehensive.snap | 81 ++ .../src/query/{emit.rs => codegen.rs} | 126 +- crates/plotnik-lib/src/query/codegen_tests.rs | 420 ++++++ crates/plotnik-lib/src/query/compile.rs | 1247 +++++++++++++++++ crates/plotnik-lib/src/query/emit_tests.rs | 333 ----- crates/plotnik-lib/src/query/link.rs | 6 +- crates/plotnik-lib/src/query/mod.rs | 5 +- crates/plotnik-lib/src/query/query.rs | 36 +- crates/plotnik-lib/src/query/query_tests.rs | 46 +- 11 files changed, 2060 insertions(+), 373 deletions(-) create mode 100644 crates/plotnik-lib/src/bytecode/dump_tests.rs create mode 100644 crates/plotnik-lib/src/bytecode/snapshots/plotnik_lib__bytecode__dump_tests__dump_comprehensive.snap rename crates/plotnik-lib/src/query/{emit.rs => codegen.rs} (85%) create mode 100644 crates/plotnik-lib/src/query/codegen_tests.rs create mode 100644 crates/plotnik-lib/src/query/compile.rs delete mode 100644 crates/plotnik-lib/src/query/emit_tests.rs diff --git a/crates/plotnik-lib/src/bytecode/dump_tests.rs b/crates/plotnik-lib/src/bytecode/dump_tests.rs new file mode 100644 index 00000000..1121b82b --- /dev/null +++ b/crates/plotnik-lib/src/bytecode/dump_tests.rs @@ -0,0 +1,131 @@ +//! Tests for bytecode dump functionality. 
+ +use crate::Query; +use indoc::indoc; + +#[test] +fn dump_minimal() { + let input = "Test = (identifier) @id"; + + let res = Query::expect_valid_linked_bytecode(input); + + insta::assert_snapshot!(res, @r#" + [header] + linked = true + + [strings] + S00 "Beauty will save the world" + S01 "id" + S02 "Test" + S03 "identifier" + + [types.defs] + T00 = void + T01 = Node + T02 = str + T03 = Struct(M0, 1) ; { id } + + [types.members] + M0 = (S01, T01) ; id: Node + + [types.names] + N0 = (S02, T03) ; Test + + [entry] + Test = 01 :: T03 + + [code] + 00 𝜀 ◼ + + Test: + 01 𝜀 02 + 02 *↓ (identifier) 03 + 03 𝜀 [Node Set(M0)] ◼ + "#); +} + +#[test] +fn dump_multiple_entrypoints() { + let input = indoc! {r#" + Expression = [(identifier) @name (number) @value] + Root = (function_declaration name: (identifier) @name) + "#}; + + let res = Query::expect_valid_linked_bytecode(input); + + // Verify key sections exist + assert!(res.contains("[header]")); + assert!(res.contains("[strings]")); + assert!(res.contains("[types.defs]")); + assert!(res.contains("[types.members]")); + assert!(res.contains("[types.names]")); + assert!(res.contains("[entry]")); + assert!(res.contains("[code]")); + + // Verify both entrypoints appear + assert!(res.contains("Expression")); + assert!(res.contains("Root")); + + // Verify code section has entrypoint labels + assert!(res.contains("Expression:")); + assert!(res.contains("Root:")); +} + +#[test] +fn dump_with_field_constraints() { + let input = indoc! 
{r#" + Test = (binary_expression + left: (_) @left + right: (_) @right) + "#}; + + let res = Query::expect_valid_linked_bytecode(input); + + // Should have field references in code section + assert!(res.contains("left:")); + assert!(res.contains("right:")); +} + +#[test] +fn dump_with_quantifier() { + let input = "Test = (identifier)* @items"; + + let res = Query::expect_valid_linked_bytecode(input); + + // Should have array type + assert!(res.contains("Array") || res.contains("[]")); +} + +#[test] +fn dump_with_alternation() { + let input = "Test = [(identifier) @id (string) @str]"; + + let res = Query::expect_valid_linked_bytecode(input); + + // Should have code section with branching + assert!(res.contains("[code]")); +} + +#[test] +fn dump_comprehensive() { + // A query that exercises most features: + // - Multiple definitions (entrypoints) + // - Field constraints (node_fields) + // - Multiple node types (node_types) + // - Captures with types (type_defs, type_members) + // - Alternation (branching in code) + let input = indoc! 
{r#" + Ident = (identifier) @name :: string + Expression = [ + Literal: (number) @value + Variable: (identifier) @name + ] + Assignment = (assignment_expression + left: (identifier) @target + right: (Expression) @value) + "#}; + + let res = Query::expect_valid_linked_bytecode(input); + + insta::assert_snapshot!(res); +} diff --git a/crates/plotnik-lib/src/bytecode/mod.rs b/crates/plotnik-lib/src/bytecode/mod.rs index e288c04a..9fdf7557 100644 --- a/crates/plotnik-lib/src/bytecode/mod.rs +++ b/crates/plotnik-lib/src/bytecode/mod.rs @@ -47,6 +47,8 @@ pub use module::{ pub use dump::dump; +#[cfg(test)] +mod dump_tests; #[cfg(test)] mod instructions_tests; #[cfg(test)] diff --git a/crates/plotnik-lib/src/bytecode/snapshots/plotnik_lib__bytecode__dump_tests__dump_comprehensive.snap b/crates/plotnik-lib/src/bytecode/snapshots/plotnik_lib__bytecode__dump_tests__dump_comprehensive.snap new file mode 100644 index 00000000..69ea7eae --- /dev/null +++ b/crates/plotnik-lib/src/bytecode/snapshots/plotnik_lib__bytecode__dump_tests__dump_comprehensive.snap @@ -0,0 +1,81 @@ +--- +source: crates/plotnik-lib/src/bytecode/dump_tests.rs +expression: res +--- +[header] +linked = true + +[strings] +S00 "Beauty will save the world" +S01 "name" +S02 "value" +S03 "Literal" +S04 "Variable" +S05 "target" +S06 "Ident" +S07 "Expression" +S08 "Assignment" +S09 "identifier" +S10 "number" +S11 "assignment_expression" +S12 "left" +S13 "right" + +[types.defs] +T00 = void +T01 = Node +T02 = str +T03 = Struct(M0, 1) ; { name } +T04 = Struct(M1, 1) ; { value } +T05 = Struct(M2, 1) ; { name } +T06 = Enum(M3, 2) ; Literal | Variable +T07 = Struct(M5, 2) ; { value, target } + +[types.members] +M0 = (S01, T02) ; name: str +M1 = (S02, T01) ; value: Node +M2 = (S01, T01) ; name: Node +M3 = (S03, T04) ; Literal: T04 +M4 = (S04, T05) ; Variable: T05 +M5 = (S02, T06) ; value: Expression +M6 = (S05, T01) ; target: Node + +[types.names] +N0 = (S06, T03) ; Ident +N1 = (S07, T06) ; Expression +N2 = (S08, T07) ; 
Assignment + +[entry] +Assignment = 08 :: T07 +Expression = 05 :: T06 +Ident = 01 :: T03 + +[code] + 00 𝜀 ◼ + +Ident: + 01 𝜀 02 + 02 *↓ (identifier) 03 + 03 𝜀 [Text Set(M0)] ◼ + +Expression: + 05 𝜀 06 + 06 𝜀 23, 30 + +Assignment: + 08 𝜀 09 + 09 *↓ (assignment_expression) 10 + 10 𝜀 left: _ 11 + 11 *↓ (identifier) 12 + 12 𝜀 [Node Set(M1)] 14 + 14 𝜀 right: _ ▶(Expression) + 15 𝜀 [Node Set(M0)] 17 + 17 *↑¹ ◼ + 18 𝜀 [EndE] ◼ + 20 𝜀 [Node] 18 + 22 *↓ (number) 20 + 23 𝜀 [E(M0)] 22 + 25 𝜀 [EndE] ◼ + 27 𝜀 [Node] 25 + 29 *↓ (identifier) 27 + 30 𝜀 [E(M1)] 29 diff --git a/crates/plotnik-lib/src/query/emit.rs b/crates/plotnik-lib/src/query/codegen.rs similarity index 85% rename from crates/plotnik-lib/src/query/emit.rs rename to crates/plotnik-lib/src/query/codegen.rs index ab2f1665..2c2cbde7 100644 --- a/crates/plotnik-lib/src/query/emit.rs +++ b/crates/plotnik-lib/src/query/codegen.rs @@ -4,15 +4,20 @@ use std::collections::{HashMap, HashSet}; +use indexmap::IndexMap; use plotnik_core::{Interner, NodeFieldId, NodeTypeId, Symbol}; +use crate::bytecode::ir::Label; +use crate::bytecode::layout::CacheAligned; use crate::bytecode::{ Entrypoint, FieldSymbol, Header, NodeSymbol, QTypeId, SECTION_ALIGN, StepId, StringId, TriviaEntry, TypeDef, TypeMember, TypeMetaHeader, TypeName, }; use crate::type_system::TypeKind; +use super::compile::Compiler; use super::query::LinkedQuery; +use super::symbol_table::SymbolTable; use super::type_check::{ FieldInfo, TYPE_NODE, TYPE_STRING, TYPE_VOID, TypeContext, TypeId, TypeShape, }; @@ -20,6 +25,8 @@ use super::type_check::{ /// Error during bytecode emission. #[derive(Clone, Debug)] pub enum EmitError { + /// Query has validation errors (must be valid before emitting). + InvalidQuery, /// Too many strings (exceeds u16 max). TooManyStrings(usize), /// Too many types (exceeds u16 max). 
@@ -28,29 +35,43 @@ pub enum EmitError { TooManyTypeMembers(usize), /// Too many entrypoints (exceeds u16 max). TooManyEntrypoints(usize), + /// Too many transitions (exceeds u16 max). + TooManyTransitions(usize), /// String not found in interner. StringNotFound(Symbol), + /// Compilation error. + Compile(super::compile::CompileError), } impl std::fmt::Display for EmitError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + Self::InvalidQuery => write!(f, "query has validation errors"), Self::TooManyStrings(n) => write!(f, "too many strings: {n} (max 65534)"), Self::TooManyTypes(n) => write!(f, "too many types: {n} (max 65533)"), Self::TooManyTypeMembers(n) => write!(f, "too many type members: {n} (max 65535)"), Self::TooManyEntrypoints(n) => write!(f, "too many entrypoints: {n} (max 65535)"), + Self::TooManyTransitions(n) => write!(f, "too many transitions: {n} (max 65535)"), Self::StringNotFound(sym) => write!(f, "string not found for symbol: {sym:?}"), + Self::Compile(e) => write!(f, "compilation error: {e}"), } } } impl std::error::Error for EmitError {} +/// Easter egg string at index 0 (Dostoevsky, The Idiot). +/// StringId(0) is reserved and never referenced by instructions. +pub const EASTER_EGG: &str = "Beauty will save the world"; + /// Builds the string table, remapping query Symbols to bytecode StringIds. /// /// The bytecode format requires a subset of the query interner's strings. /// This builder collects only the strings that are actually used and assigns /// compact StringId indices. +/// +/// StringId(0) is reserved for an easter egg and is never referenced by +/// instructions. Actual strings start at index 1. #[derive(Debug)] pub struct StringTableBuilder { /// Map from query Symbol to bytecode StringId. 
@@ -63,11 +84,15 @@ pub struct StringTableBuilder { impl StringTableBuilder { pub fn new() -> Self { - Self { + let mut builder = Self { mapping: HashMap::new(), str_lookup: HashMap::new(), strings: Vec::new(), - } + }; + // Reserve index 0 for easter egg + builder.strings.push(EASTER_EGG.to_string()); + builder.str_lookup.insert(EASTER_EGG.to_string(), StringId(0)); + builder } /// Get or create a StringId for a Symbol. @@ -115,7 +140,8 @@ impl StringTableBuilder { /// Validate that the string count fits in u16. pub fn validate(&self) -> Result<(), EmitError> { - // Max count is 65534 because the table needs count+1 entries + // Max count is 65534 because the table needs count+1 entries. + // Index 0 is reserved for the easter egg, so we can have 65533 user strings. if self.strings.len() > 65534 { return Err(EmitError::TooManyStrings(self.strings.len())); } @@ -211,10 +237,10 @@ impl TypeTableBuilder { // Emit TypeDefs and TypeMembers - fill in the placeholders. for (slot_index, &type_id) in ordered_types.iter().enumerate() { - let type_kind = type_ctx + let type_shape = type_ctx .get_type(type_id) .expect("collected type must exist"); - self.emit_type_at_slot(slot_index, type_id, type_kind, type_ctx, interner, strings)?; + self.emit_type_at_slot(slot_index, type_id, type_shape, type_ctx, interner, strings)?; } // Collect TypeName entries for named definitions @@ -236,12 +262,12 @@ impl TypeTableBuilder { &mut self, slot_index: usize, _type_id: TypeId, - type_kind: &TypeShape, + type_shape: &TypeShape, type_ctx: &TypeContext, interner: &Interner, strings: &mut StringTableBuilder, ) -> Result<(), EmitError> { - match type_kind { + match type_shape { TypeShape::Void | TypeShape::Node | TypeShape::String => { // Builtins - should not reach here unreachable!("builtins should be handled separately") @@ -362,8 +388,8 @@ impl TypeTableBuilder { } // Handle Ref types by following the reference - if let Some(type_kind) = type_ctx.get_type(type_id) - && let 
TypeShape::Ref(def_id) = type_kind + if let Some(type_shape) = type_ctx.get_type(type_id) + && let TypeShape::Ref(def_id) = type_shape && let Some(def_type_id) = type_ctx.get_def_type(*def_id) { return self.resolve_type(def_type_id, type_ctx); @@ -486,12 +512,12 @@ fn collect_types_dfs( return; } - let Some(type_kind) = type_ctx.get_type(type_id) else { + let Some(type_shape) = type_ctx.get_type(type_id) else { return; }; // Resolve Ref types to their target - if let TypeShape::Ref(def_id) = type_kind { + if let TypeShape::Ref(def_id) = type_shape { if let Some(target_id) = type_ctx.get_def_type(*def_id) { collect_types_dfs(target_id, type_ctx, out, seen); } @@ -501,7 +527,7 @@ fn collect_types_dfs( seen.insert(type_id); // Collect children first (depth-first), then add self - match type_kind { + match type_shape { TypeShape::Struct(fields) => { for field_info in fields.values() { collect_types_dfs(field_info.type_id, type_ctx, out, seen); @@ -542,8 +568,12 @@ fn pad_to_section(buf: &mut Vec) { } /// Emit bytecode from type context only (no node validation). -pub fn emit(type_ctx: &TypeContext, interner: &Interner) -> Result, EmitError> { - emit_inner(type_ctx, interner, None, None) +pub fn emit( + type_ctx: &TypeContext, + interner: &Interner, + symbol_table: &SymbolTable, +) -> Result, EmitError> { + emit_inner(type_ctx, interner, symbol_table, None, None) } /// Emit bytecode from a LinkedQuery (includes node type/field validation info). 
@@ -551,6 +581,7 @@ pub fn emit_linked(query: &LinkedQuery) -> Result, EmitError> { emit_inner( query.type_context(), query.interner(), + &query.symbol_table, Some(query.node_type_ids()), Some(query.node_field_ids()), ) @@ -560,13 +591,28 @@ pub fn emit_linked(query: &LinkedQuery) -> Result, EmitError> { fn emit_inner( type_ctx: &TypeContext, interner: &Interner, - node_type_ids: Option<&HashMap>, - node_field_ids: Option<&HashMap>, + symbol_table: &SymbolTable, + node_type_ids: Option<&IndexMap>, + node_field_ids: Option<&IndexMap>, ) -> Result, EmitError> { + let is_linked = node_type_ids.is_some(); let mut strings = StringTableBuilder::new(); let mut types = TypeTableBuilder::new(); types.build(type_ctx, interner, &mut strings)?; + // Compile transitions (strings are interned here for unlinked mode) + let compile_result = Compiler::compile(interner, type_ctx, symbol_table, &mut strings, node_type_ids, node_field_ids) + .map_err(EmitError::Compile)?; + + // Layout with cache alignment + let entry_labels: Vec