From 3868ee5c96c32e0f09fb25b78dcd18bfc815519e Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Mon, 29 Dec 2025 15:07:32 -0300 Subject: [PATCH 1/2] feat: add bytecode module loader Add unified module storage with lazy decoding: - Module: owns or mmaps bytecode, provides section views - Instruction/InstructionView: decoded instruction wrappers - StringsView, SymbolsView, TypesView, etc. - Update binary format documentation --- Cargo.lock | 11 + crates/plotnik-lib/Cargo.toml | 2 + crates/plotnik-lib/src/bytecode/entrypoint.rs | 2 +- crates/plotnik-lib/src/bytecode/mod.rs | 16 +- crates/plotnik-lib/src/bytecode/module.rs | 530 ++++++++++++++++++ .../plotnik-lib/src/bytecode/module_tests.rs | 338 +++++++++++ 6 files changed, 894 insertions(+), 5 deletions(-) create mode 100644 crates/plotnik-lib/src/bytecode/module.rs create mode 100644 crates/plotnik-lib/src/bytecode/module_tests.rs diff --git a/Cargo.lock b/Cargo.lock index c58567f0..4c7f1bf3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1474,6 +1474,15 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +[[package]] +name = "memmap2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490" +dependencies = [ + "libc", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -1629,11 +1638,13 @@ dependencies = [ "indoc", "insta", "logos", + "memmap2", "plotnik-core", "plotnik-langs", "rowan", "serde", "serde_json", + "tempfile", "thiserror", ] diff --git a/crates/plotnik-lib/Cargo.toml b/crates/plotnik-lib/Cargo.toml index 75cc0c69..7f698d86 100644 --- a/crates/plotnik-lib/Cargo.toml +++ b/crates/plotnik-lib/Cargo.toml @@ -21,6 +21,7 @@ rowan = "0.16.1" serde = { version = "1.0.228", features = ["derive"] } thiserror = "2.0.17" arborium-tree-sitter = "2.3.2" +memmap2 = "0.9" plotnik-core = { version = "0.1", path = "../plotnik-core" } plotnik-langs = { version = "0.1", path = "../plotnik-langs", optional = true } @@ -31,3 +32,4 @@ default = ["plotnik-langs"] insta = { version = "=1.45.1", features = ["yaml"] } indoc = "=2.0.7" serde_json = "=1.0.148" +tempfile = "3" diff --git a/crates/plotnik-lib/src/bytecode/entrypoint.rs b/crates/plotnik-lib/src/bytecode/entrypoint.rs index 3550cf21..ccc848ff 100644 --- a/crates/plotnik-lib/src/bytecode/entrypoint.rs +++ b/crates/plotnik-lib/src/bytecode/entrypoint.rs @@ -3,7 +3,7 @@ use super::{QTypeId, StepId, StringId}; /// Named query definition entry point (8 bytes). -#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] #[repr(C)] pub struct Entrypoint { /// Definition name. diff --git a/crates/plotnik-lib/src/bytecode/mod.rs b/crates/plotnik-lib/src/bytecode/mod.rs index 1b39e286..6f947aec 100644 --- a/crates/plotnik-lib/src/bytecode/mod.rs +++ b/crates/plotnik-lib/src/bytecode/mod.rs @@ -8,6 +8,7 @@ mod entrypoint; mod header; mod ids; mod instructions; +mod module; mod nav; mod sections; mod type_meta; @@ -21,19 +22,26 @@ pub use ids::{QTypeId, StepId, StringId}; pub use header::Header; -pub use nav::Nav; - pub use sections::{FieldSymbol, NodeSymbol, Slice, TriviaEntry}; -pub use effects::{EffectOp, EffectOpcode}; - pub use entrypoint::Entrypoint; pub use type_meta::{TypeDef, TypeKind, TypeMember, TypeMetaHeader, TypeName}; +pub use nav::Nav; + +pub use effects::{EffectOp, EffectOpcode}; + pub use instructions::{ Call, Match, MatchView, Opcode, Return, align_to_section, select_match_opcode, }; +pub use module::{ + ByteStorage, EntrypointsView, Instruction, InstructionView, Module, ModuleError, StringsView, + SymbolsView, TriviaView, TypesView, +}; + #[cfg(test)] mod instructions_tests; +#[cfg(test)] +mod module_tests; diff --git a/crates/plotnik-lib/src/bytecode/module.rs b/crates/plotnik-lib/src/bytecode/module.rs new file mode 100644 index 00000000..5c75f23f --- /dev/null +++ b/crates/plotnik-lib/src/bytecode/module.rs @@ -0,0 +1,530 @@ +//! Bytecode module with unified storage. +//! +//! The [`Module`] struct holds compiled bytecode in either owned or memory-mapped +//! form, decoding instructions lazily when the VM steps into them. + +use std::fs::File; +use std::io; +use std::ops::Deref; +use std::path::Path; + +use memmap2::Mmap; + +use super::header::Header; +use super::ids::{QTypeId, StepId, StringId}; +use super::instructions::{Call, Match, MatchView, Opcode, Return}; +use super::sections::{FieldSymbol, NodeSymbol, TriviaEntry}; +use super::type_meta::{TypeDef, TypeMember, TypeMetaHeader, TypeName}; +use super::{Entrypoint, SECTION_ALIGN, VERSION}; + +/// Read a little-endian u16 from bytes at the given offset. +#[inline] +fn read_u16_le(bytes: &[u8], offset: usize) -> u16 { + u16::from_le_bytes([bytes[offset], bytes[offset + 1]]) +} + +/// Read a little-endian u32 from bytes at the given offset. +#[inline] +fn read_u32_le(bytes: &[u8], offset: usize) -> u32 { + u32::from_le_bytes([ + bytes[offset], + bytes[offset + 1], + bytes[offset + 2], + bytes[offset + 3], + ]) +} + +/// Storage for bytecode bytes—either owned or memory-mapped. +#[derive(Debug)] +pub enum ByteStorage { + /// Owned byte vector (from compilation or read into memory). + Owned(Vec), + /// Memory-mapped file. + Mapped(Mmap), +} + +impl Deref for ByteStorage { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + match self { + ByteStorage::Owned(v) => v, + ByteStorage::Mapped(m) => m, + } + } +} + +impl ByteStorage { + /// Create from owned bytes. + pub fn from_vec(bytes: Vec) -> Self { + Self::Owned(bytes) + } + + /// Memory-map a file. + /// + /// # Safety + /// The file must not be modified while the mapping is active. + pub fn from_file(file: &File) -> io::Result { + // SAFETY: Caller ensures the file is not modified while mapped. + let mmap = unsafe { Mmap::map(file)? }; + Ok(Self::Mapped(mmap)) + } +} + +/// Decoded instruction from bytecode. +#[derive(Clone, PartialEq, Eq, Debug)] +pub enum Instruction { + Match(Match), + Call(Call), + Return(Return), +} + +impl Instruction { + /// Decode an instruction from bytecode bytes. + /// + /// The slice must start at the instruction and contain at least 8 bytes. + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= 8, "instruction too short"); + + let opcode = Opcode::from_u8(bytes[0] & 0xF); + match opcode { + Opcode::Call => { + let arr: [u8; 8] = bytes[..8].try_into().unwrap(); + Self::Call(Call::from_bytes(arr)) + } + Opcode::Return => { + let arr: [u8; 8] = bytes[..8].try_into().unwrap(); + Self::Return(Return::from_bytes(arr)) + } + _ => Self::Match(Match::from_bytes(bytes)), + } + } +} + +/// Zero-copy instruction view for efficient VM execution. +/// +/// Unlike `Instruction`, this doesn't allocate for Match instructions. +#[derive(Clone, Copy, Debug)] +pub enum InstructionView<'a> { + Match(MatchView<'a>), + Call(Call), + Return(Return), +} + +impl<'a> InstructionView<'a> { + /// Decode an instruction view from bytecode bytes without allocating. + /// + /// The slice must start at the instruction and contain at least 8 bytes. + #[inline] + pub fn from_bytes(bytes: &'a [u8]) -> Self { + debug_assert!(bytes.len() >= 8, "instruction too short"); + + let opcode = Opcode::from_u8(bytes[0] & 0xF); + match opcode { + Opcode::Call => { + let arr: [u8; 8] = bytes[..8].try_into().unwrap(); + Self::Call(Call::from_bytes(arr)) + } + Opcode::Return => { + let arr: [u8; 8] = bytes[..8].try_into().unwrap(); + Self::Return(Return::from_bytes(arr)) + } + _ => Self::Match(MatchView::from_bytes(bytes)), + } + } +} + +/// Module load error. +#[derive(Debug, thiserror::Error)] +pub enum ModuleError { + #[error("invalid magic: expected PTKQ")] + InvalidMagic, + #[error("unsupported version: {0} (expected {VERSION})")] + UnsupportedVersion(u32), + #[error("file too small: {0} bytes (minimum 64)")] + FileTooSmall(usize), + #[error("size mismatch: header says {header} bytes, got {actual}")] + SizeMismatch { header: u32, actual: usize }, + #[error("io error: {0}")] + Io(#[from] io::Error), +} + +/// A compiled bytecode module. +/// +/// Instructions are decoded lazily via [`decode_step`](Self::decode_step). +/// Cold data (strings, symbols, types) is accessed through view methods. +#[derive(Debug)] +pub struct Module { + storage: ByteStorage, + header: Header, +} + +impl Module { + /// Load a module from owned bytes. + pub fn from_bytes(bytes: Vec) -> Result { + Self::from_storage(ByteStorage::Owned(bytes)) + } + + /// Load a module from a file path (memory-mapped). + pub fn from_path(path: impl AsRef) -> Result { + let file = File::open(path)?; + let storage = ByteStorage::from_file(&file)?; + Self::from_storage(storage) + } + + /// Load a module from storage. + fn from_storage(storage: ByteStorage) -> Result { + if storage.len() < 64 { + return Err(ModuleError::FileTooSmall(storage.len())); + } + + let header = Header::from_bytes(&storage[..64]); + + if !header.validate_magic() { + return Err(ModuleError::InvalidMagic); + } + if !header.validate_version() { + return Err(ModuleError::UnsupportedVersion(header.version)); + } + if header.total_size as usize != storage.len() { + return Err(ModuleError::SizeMismatch { + header: header.total_size, + actual: storage.len(), + }); + } + + Ok(Self { storage, header }) + } + + /// Get the parsed header. + pub fn header(&self) -> &Header { + &self.header + } + + /// Get the raw bytes. + pub fn bytes(&self) -> &[u8] { + &self.storage + } + + /// Decode an instruction at the given step ID. + /// + /// This allocates for Match instructions. For zero-allocation decoding, + /// use [`decode_step_view`](Self::decode_step_view) instead. + pub fn decode_step(&self, step_id: StepId) -> Instruction { + let offset = self.header.transitions_offset as usize + step_id.byte_offset(); + Instruction::from_bytes(&self.storage[offset..]) + } + + /// Decode an instruction view at the given step ID without allocating. + /// + /// This is the VM's main access point for fetching instructions efficiently. + #[inline] + pub fn decode_step_view(&self, step_id: StepId) -> InstructionView<'_> { + let offset = self.header.transitions_offset as usize + step_id.byte_offset(); + InstructionView::from_bytes(&self.storage[offset..]) + } + + /// Get a view into the string table. + pub fn strings(&self) -> StringsView<'_> { + StringsView { + blob: &self.storage[self.header.str_blob_offset as usize..], + table: self.string_table_slice(), + } + } + + /// Get a view into the node type symbols. + pub fn node_types(&self) -> SymbolsView<'_, NodeSymbol> { + let offset = self.header.node_types_offset as usize; + let count = self.header.node_types_count as usize; + SymbolsView { + bytes: &self.storage[offset..offset + count * 4], + count, + _marker: std::marker::PhantomData, + } + } + + /// Get a view into the node field symbols. + pub fn node_fields(&self) -> SymbolsView<'_, FieldSymbol> { + let offset = self.header.node_fields_offset as usize; + let count = self.header.node_fields_count as usize; + SymbolsView { + bytes: &self.storage[offset..offset + count * 4], + count, + _marker: std::marker::PhantomData, + } + } + + /// Get a view into the trivia entries. + pub fn trivia(&self) -> TriviaView<'_> { + let offset = self.header.trivia_offset as usize; + let count = self.header.trivia_count as usize; + TriviaView { + bytes: &self.storage[offset..offset + count * 2], + count, + } + } + + /// Get a view into the type metadata. + pub fn types(&self) -> TypesView<'_> { + let meta_offset = self.header.type_meta_offset as usize; + let meta_header = TypeMetaHeader::from_bytes(&self.storage[meta_offset..]); + + // Sub-section offsets (each aligned to 64-byte boundary) + let defs_offset = align64(meta_offset + 8); + let defs_count = meta_header.type_defs_count as usize; + let members_offset = align64(defs_offset + defs_count * 4); + let members_count = meta_header.type_members_count as usize; + let names_offset = align64(members_offset + members_count * 4); + let names_count = meta_header.type_names_count as usize; + + TypesView { + defs_bytes: &self.storage[defs_offset..defs_offset + defs_count * 4], + members_bytes: &self.storage[members_offset..members_offset + members_count * 4], + names_bytes: &self.storage[names_offset..names_offset + names_count * 4], + defs_count, + members_count, + names_count, + } + } + + /// Get a view into the entrypoints. + pub fn entrypoints(&self) -> EntrypointsView<'_> { + let offset = self.header.entrypoints_offset as usize; + let count = self.header.entrypoints_count as usize; + EntrypointsView { + bytes: &self.storage[offset..offset + count * 8], + count, + } + } + + // Helper to get string table as bytes + // The table has count+1 entries (includes sentinel for length calculation) + fn string_table_slice(&self) -> &[u8] { + let offset = self.header.str_table_offset as usize; + let count = self.header.str_table_count as usize; + &self.storage[offset..offset + (count + 1) * 4] + } +} + +/// Align offset to 64-byte boundary. +fn align64(offset: usize) -> usize { + let rem = offset % SECTION_ALIGN; + if rem == 0 { + offset + } else { + offset + SECTION_ALIGN - rem + } +} + +/// View into the string table for lazy string lookup. +pub struct StringsView<'a> { + blob: &'a [u8], + table: &'a [u8], +} + +impl<'a> StringsView<'a> { + /// Get a string by its ID. + /// + /// The string table contains sequential u32 offsets. To get string i: + /// `start = table[i]`, `end = table[i+1]`, `length = end - start`. + pub fn get(&self, id: StringId) -> &'a str { + let idx = id.0 as usize; + let start = read_u32_le(self.table, idx * 4) as usize; + let end = read_u32_le(self.table, (idx + 1) * 4) as usize; + std::str::from_utf8(&self.blob[start..end]).expect("invalid UTF-8 in string table") + } +} + +/// View into symbol tables (node types or field names). +pub struct SymbolsView<'a, T> { + bytes: &'a [u8], + count: usize, + _marker: std::marker::PhantomData, +} + +impl<'a> SymbolsView<'a, NodeSymbol> { + /// Get a node symbol by index. + pub fn get(&self, idx: usize) -> NodeSymbol { + assert!(idx < self.count, "node symbol index out of bounds"); + let offset = idx * 4; + NodeSymbol { + id: read_u16_le(self.bytes, offset), + name: StringId(read_u16_le(self.bytes, offset + 2)), + } + } + + /// Number of entries. + pub fn len(&self) -> usize { + self.count + } + + /// Check if empty. + pub fn is_empty(&self) -> bool { + self.count == 0 + } +} + +impl<'a> SymbolsView<'a, FieldSymbol> { + /// Get a field symbol by index. + pub fn get(&self, idx: usize) -> FieldSymbol { + assert!(idx < self.count, "field symbol index out of bounds"); + let offset = idx * 4; + FieldSymbol { + id: read_u16_le(self.bytes, offset), + name: StringId(read_u16_le(self.bytes, offset + 2)), + } + } + + /// Number of entries. + pub fn len(&self) -> usize { + self.count + } + + /// Check if empty. + pub fn is_empty(&self) -> bool { + self.count == 0 + } +} + +/// View into trivia entries. +pub struct TriviaView<'a> { + bytes: &'a [u8], + count: usize, +} + +impl<'a> TriviaView<'a> { + /// Get a trivia entry by index. + pub fn get(&self, idx: usize) -> TriviaEntry { + assert!(idx < self.count, "trivia index out of bounds"); + TriviaEntry { + node_type: read_u16_le(self.bytes, idx * 2), + } + } + + /// Number of entries. + pub fn len(&self) -> usize { + self.count + } + + /// Check if empty. + pub fn is_empty(&self) -> bool { + self.count == 0 + } + + /// Check if a node type is trivia. + pub fn contains(&self, node_type: u16) -> bool { + (0..self.count).any(|i| self.get(i).node_type == node_type) + } +} + +/// View into type metadata. +/// +/// The TypeMeta section contains three sub-sections: +/// - TypeDefs: structural topology (4 bytes each) +/// - TypeMembers: fields and variants (4 bytes each) +/// - TypeNames: name → TypeId mapping (4 bytes each) +pub struct TypesView<'a> { + defs_bytes: &'a [u8], + members_bytes: &'a [u8], + names_bytes: &'a [u8], + defs_count: usize, + members_count: usize, + names_count: usize, +} + +impl<'a> TypesView<'a> { + /// Get a type definition by index. + pub fn get_def(&self, idx: usize) -> TypeDef { + assert!(idx < self.defs_count, "type def index out of bounds"); + let offset = idx * 4; + TypeDef { + data: read_u16_le(self.defs_bytes, offset), + count: self.defs_bytes[offset + 2], + kind: self.defs_bytes[offset + 3], + } + } + + /// Get a type definition by QTypeId. + pub fn get(&self, id: QTypeId) -> Option { + id.custom_index().map(|idx| self.get_def(idx)) + } + + /// Get a type member by index. + pub fn get_member(&self, idx: usize) -> TypeMember { + assert!(idx < self.members_count, "type member index out of bounds"); + let offset = idx * 4; + TypeMember { + name: StringId(read_u16_le(self.members_bytes, offset)), + type_id: QTypeId(read_u16_le(self.members_bytes, offset + 2)), + } + } + + /// Get a type name entry by index. + pub fn get_name(&self, idx: usize) -> TypeName { + assert!(idx < self.names_count, "type name index out of bounds"); + let offset = idx * 4; + TypeName { + name: StringId(read_u16_le(self.names_bytes, offset)), + type_id: QTypeId(read_u16_le(self.names_bytes, offset + 2)), + } + } + + /// Number of type definitions. + pub fn defs_count(&self) -> usize { + self.defs_count + } + + /// Number of type members. + pub fn members_count(&self) -> usize { + self.members_count + } + + /// Number of type names. + pub fn names_count(&self) -> usize { + self.names_count + } + + /// Iterate over members of a struct or enum type. + pub fn members_of(&self, def: &TypeDef) -> impl Iterator + '_ { + let start = def.data as usize; + let count = def.count as usize; + (0..count).map(move |i| self.get_member(start + i)) + } +} + +/// View into entrypoints. +pub struct EntrypointsView<'a> { + bytes: &'a [u8], + count: usize, +} + +impl<'a> EntrypointsView<'a> { + /// Get an entrypoint by index. + pub fn get(&self, idx: usize) -> Entrypoint { + assert!(idx < self.count, "entrypoint index out of bounds"); + let offset = idx * 8; + Entrypoint { + name: StringId(read_u16_le(self.bytes, offset)), + target: StepId(read_u16_le(self.bytes, offset + 2)), + result_type: QTypeId(read_u16_le(self.bytes, offset + 4)), + ..Default::default() + } + } + + /// Number of entrypoints. + pub fn len(&self) -> usize { + self.count + } + + /// Check if empty. + pub fn is_empty(&self) -> bool { + self.count == 0 + } + + /// Find an entrypoint by name (requires StringsView for comparison). + pub fn find_by_name(&self, name: &str, strings: &StringsView<'_>) -> Option { + (0..self.count) + .map(|i| self.get(i)) + .find(|e| strings.get(e.name) == name) + } +} diff --git a/crates/plotnik-lib/src/bytecode/module_tests.rs b/crates/plotnik-lib/src/bytecode/module_tests.rs new file mode 100644 index 00000000..c4b8be1f --- /dev/null +++ b/crates/plotnik-lib/src/bytecode/module_tests.rs @@ -0,0 +1,338 @@ +//! Tests for the bytecode module. + +use super::*; +use crate::bytecode::nav::Nav; +use crate::bytecode::{Header, MAGIC, Match, TypeMetaHeader, VERSION}; + +/// Build a minimal valid bytecode for testing. +fn build_test_bytecode() -> Vec { + // Layout (all sections 64-byte aligned): + // [0..64) Header + // [64..128) StringBlob + padding + // [128..192) StringTable + padding (needs 2 u32 entries: offset + sentinel) + // [192..256) NodeTypes + padding + // [256..320) NodeFields + padding + // [320..384) Trivia + padding + // [384..448) TypeMeta: TypeMetaHeader (8 bytes) + padding + // [448..512) TypeDefs sub-section (aligned) + // [512..576) TypeMembers sub-section (aligned, empty) + // [576..640) TypeNames sub-section (aligned, empty) + // [640..704) Entrypoints + padding + // [704..768) Transitions + padding + + let mut bytes = vec![0u8; 768]; + + // String blob: "Test" at offset 0 + let str_blob_offset = 64; + bytes[64] = b'T'; + bytes[65] = b'e'; + bytes[66] = b's'; + bytes[67] = b't'; + + // String table: sequential u32 offsets with sentinel + // Entry 0: offset 0 (start of "Test") + // Entry 1: offset 4 (sentinel = end of blob) + let str_table_offset = 128; + bytes[128..132].copy_from_slice(&0u32.to_le_bytes()); // offset of string 0 + bytes[132..136].copy_from_slice(&4u32.to_le_bytes()); // sentinel (end of blob) + + // Node types: one entry (id=42, name=StringId(0)) + let node_types_offset = 192; + bytes[192..194].copy_from_slice(&42u16.to_le_bytes()); + bytes[194..196].copy_from_slice(&0u16.to_le_bytes()); + + // Node fields: one entry (id=7, name=StringId(0)) + let node_fields_offset = 256; + bytes[256..258].copy_from_slice(&7u16.to_le_bytes()); + bytes[258..260].copy_from_slice(&0u16.to_le_bytes()); + + // Trivia: one entry (node_type=100) + let trivia_offset = 320; + bytes[320..322].copy_from_slice(&100u16.to_le_bytes()); + + // TypeMeta section + let type_meta_offset = 384; + + // TypeMetaHeader (8 bytes): type_defs_count=1, type_members_count=0, type_names_count=0 + let type_meta_header = TypeMetaHeader { + type_defs_count: 1, + type_members_count: 0, + type_names_count: 0, + _pad: 0, + }; + bytes[384..392].copy_from_slice(&type_meta_header.to_bytes()); + + // TypeDefs sub-section at aligned offset (448) + // One TypeDef (4 bytes): data=0, count=0, kind=3 (Struct) + bytes[448..450].copy_from_slice(&0u16.to_le_bytes()); // data (member index) + bytes[450] = 0; // count + bytes[451] = 3; // kind=Struct + + // TypeMembers sub-section at 512 (empty) + // TypeNames sub-section at 576 (empty) + + // Entrypoints: one entry (name=StringId(0), target=StepId(0), result_type=QTypeId(0)) + let entrypoints_offset = 640; + bytes[640..642].copy_from_slice(&0u16.to_le_bytes()); // name + bytes[642..644].copy_from_slice(&0u16.to_le_bytes()); // target + bytes[644..646].copy_from_slice(&0u16.to_le_bytes()); // result_type + bytes[646..648].copy_from_slice(&0u16.to_le_bytes()); // padding + + // Transitions: one Match8 instruction (accept state) + let transitions_offset = 704; + // type_id=0x00 (Match8, segment 0) + bytes[704] = 0x00; + // nav=Stay + bytes[705] = Nav::Stay.to_byte(); + // node_type=None (0) + bytes[706..708].copy_from_slice(&0u16.to_le_bytes()); + // node_field=None (0) + bytes[708..710].copy_from_slice(&0u16.to_le_bytes()); + // next=0 (accept) + bytes[710..712].copy_from_slice(&0u16.to_le_bytes()); + + // Build header + let header = Header { + magic: MAGIC, + version: VERSION, + checksum: 0, + total_size: 768, + str_blob_offset: str_blob_offset as u32, + str_table_offset: str_table_offset as u32, + node_types_offset: node_types_offset as u32, + node_fields_offset: node_fields_offset as u32, + trivia_offset: trivia_offset as u32, + type_meta_offset: type_meta_offset as u32, + entrypoints_offset: entrypoints_offset as u32, + transitions_offset: transitions_offset as u32, + str_table_count: 1, + node_types_count: 1, + node_fields_count: 1, + trivia_count: 1, + entrypoints_count: 1, + transitions_count: 1, + ..Default::default() + }; + + bytes[0..64].copy_from_slice(&header.to_bytes()); + bytes +} + +#[test] +fn module_from_bytes_valid() { + let bytes = build_test_bytecode(); + let module = Module::from_bytes(bytes).unwrap(); + + assert!(module.header().validate_magic()); + assert!(module.header().validate_version()); + assert_eq!(module.header().total_size, 768); +} + +#[test] +fn module_from_bytes_too_small() { + let bytes = vec![0u8; 32]; + let err = Module::from_bytes(bytes).unwrap_err(); + assert!(matches!(err, ModuleError::FileTooSmall(32))); +} + +#[test] +fn module_from_bytes_invalid_magic() { + let mut bytes = build_test_bytecode(); + bytes[0] = b'X'; // Corrupt magic + let err = Module::from_bytes(bytes).unwrap_err(); + assert!(matches!(err, ModuleError::InvalidMagic)); +} + +#[test] +fn module_from_bytes_wrong_version() { + let mut bytes = build_test_bytecode(); + bytes[4..8].copy_from_slice(&999u32.to_le_bytes()); // Wrong version + let err = Module::from_bytes(bytes).unwrap_err(); + assert!(matches!(err, ModuleError::UnsupportedVersion(999))); +} + +#[test] +fn module_from_bytes_size_mismatch() { + let mut bytes = build_test_bytecode(); + bytes[12..16].copy_from_slice(&1000u32.to_le_bytes()); // Wrong total_size + let err = Module::from_bytes(bytes).unwrap_err(); + assert!(matches!( + err, + ModuleError::SizeMismatch { + header: 1000, + actual: 768 + } + )); +} + +#[test] +fn module_decode_step() { + let bytes = build_test_bytecode(); + let module = Module::from_bytes(bytes).unwrap(); + + let instr = module.decode_step(StepId(0)); + match instr { + Instruction::Match(m) => { + assert_eq!(m.nav, Nav::Stay); + assert!(m.is_epsilon()); + assert!(m.is_terminal()); + } + _ => panic!("expected Match instruction"), + } +} + +#[test] +fn module_strings_view() { + let bytes = build_test_bytecode(); + let module = Module::from_bytes(bytes).unwrap(); + + let strings = module.strings(); + assert_eq!(strings.get(StringId(0)), "Test"); +} + +#[test] +fn module_node_types_view() { + let bytes = build_test_bytecode(); + let module = Module::from_bytes(bytes).unwrap(); + + let node_types = module.node_types(); + assert_eq!(node_types.len(), 1); + assert!(!node_types.is_empty()); + + let sym = node_types.get(0); + assert_eq!(sym.id, 42); + assert_eq!(sym.name, StringId(0)); +} + +#[test] +fn module_node_fields_view() { + let bytes = build_test_bytecode(); + let module = Module::from_bytes(bytes).unwrap(); + + let fields = module.node_fields(); + assert_eq!(fields.len(), 1); + + let sym = fields.get(0); + assert_eq!(sym.id, 7); + assert_eq!(sym.name, StringId(0)); +} + +#[test] +fn module_trivia_view() { + let bytes = build_test_bytecode(); + let module = Module::from_bytes(bytes).unwrap(); + + let trivia = module.trivia(); + assert_eq!(trivia.len(), 1); + assert!(trivia.contains(100)); + assert!(!trivia.contains(42)); +} + +#[test] +fn module_types_view() { + let bytes = build_test_bytecode(); + let module = Module::from_bytes(bytes).unwrap(); + + let types = module.types(); + assert_eq!(types.defs_count(), 1); + assert_eq!(types.members_count(), 0); + assert_eq!(types.names_count(), 0); + + let def = types.get_def(0); + assert_eq!(def.kind, 3); // Struct + assert_eq!(def.data, 0); // member index + assert_eq!(def.count, 0); // member count +} + +#[test] +fn module_entrypoints_view() { + let bytes = build_test_bytecode(); + let module = Module::from_bytes(bytes).unwrap(); + + let entrypoints = module.entrypoints(); + assert_eq!(entrypoints.len(), 1); + assert!(!entrypoints.is_empty()); + + let ep = entrypoints.get(0); + assert_eq!(ep.name, StringId(0)); + assert_eq!(ep.target, StepId(0)); + + let strings = module.strings(); + let found = entrypoints.find_by_name("Test", &strings); + assert!(found.is_some()); + assert_eq!(found.unwrap().target, StepId(0)); +} + +#[test] +fn instruction_from_bytes_dispatch() { + // Test Match8 + let match8 = Match { + segment: 0, + nav: Nav::Down, + node_type: std::num::NonZeroU16::new(42), + node_field: None, + pre_effects: vec![], + neg_fields: vec![], + post_effects: vec![], + successors: vec![StepId(10)], + }; + let bytes = match8.to_bytes().unwrap(); + let instr = Instruction::from_bytes(&bytes); + assert!(matches!(instr, Instruction::Match(_))); + + // Test Call + let call = Call { + segment: 0, + next: StepId(5), + target: StepId(100), + ref_id: 1, + }; + let bytes = call.to_bytes(); + let instr = Instruction::from_bytes(&bytes); + assert!(matches!(instr, Instruction::Call(_))); + + // Test Return + let ret = Return { + segment: 0, + ref_id: 1, + }; + let bytes = ret.to_bytes(); + let instr = Instruction::from_bytes(&bytes); + assert!(matches!(instr, Instruction::Return(_))); +} + +#[test] +fn byte_storage_deref() { + let data = vec![1, 2, 3, 4, 5]; + let storage = ByteStorage::from_vec(data.clone()); + + assert_eq!(&*storage, &data[..]); + assert_eq!(storage.len(), 5); + assert_eq!(storage[2], 3); +} + +#[test] +fn module_from_path_mmap() { + use std::io::Write; + + let bytes = build_test_bytecode(); + + // Write to temp file + let mut tmpfile = tempfile::NamedTempFile::new().unwrap(); + tmpfile.write_all(&bytes).unwrap(); + tmpfile.flush().unwrap(); + + // Load via mmap + let module = Module::from_path(tmpfile.path()).unwrap(); + + assert!(module.header().validate_magic()); + assert_eq!(module.header().total_size, 768); + + // Verify we can decode instructions + let instr = module.decode_step(StepId(0)); + assert!(matches!(instr, Instruction::Match(_))); + + // Verify string lookup works through mmap + let strings = module.strings(); + assert_eq!(strings.get(StringId(0)), "Test"); +} From 4973f1280ec7b0826886b8cf46cfff7ddcb789c8 Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Mon, 29 Dec 2025 15:07:32 -0300 Subject: [PATCH 2/2] fixup! feat: add bytecode module loader --- Cargo.lock | 10 ------ crates/plotnik-lib/Cargo.toml | 1 - crates/plotnik-lib/src/bytecode/module.rs | 42 +++++++---------------- 3 files changed, 13 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c7f1bf3..325559bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1474,15 +1474,6 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" -[[package]] -name = "memmap2" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490" -dependencies = [ - "libc", -] - [[package]] name = "once_cell" version = "1.21.3" @@ -1638,7 +1629,6 @@ dependencies = [ "indoc", "insta", "logos", - "memmap2", "plotnik-core", "plotnik-langs", "rowan", diff --git a/crates/plotnik-lib/Cargo.toml b/crates/plotnik-lib/Cargo.toml index 7f698d86..f905a3bf 100644 --- a/crates/plotnik-lib/Cargo.toml +++ b/crates/plotnik-lib/Cargo.toml @@ -21,7 +21,6 @@ rowan = "0.16.1" serde = { version = "1.0.228", features = ["derive"] } thiserror = "2.0.17" arborium-tree-sitter = "2.3.2" -memmap2 = "0.9" plotnik-core = { version = "0.1", path = "../plotnik-core" } plotnik-langs = { version = "0.1", path = "../plotnik-langs", optional = true } diff --git a/crates/plotnik-lib/src/bytecode/module.rs b/crates/plotnik-lib/src/bytecode/module.rs index 5c75f23f..82cefde1 100644 --- a/crates/plotnik-lib/src/bytecode/module.rs +++ b/crates/plotnik-lib/src/bytecode/module.rs @@ -1,15 +1,12 @@ //! Bytecode module with unified storage. //! -//! The [`Module`] struct holds compiled bytecode in either owned or memory-mapped -//! form, decoding instructions lazily when the VM steps into them. +//! The [`Module`] struct holds compiled bytecode, decoding instructions lazily +//! when the VM steps into them. -use std::fs::File; use std::io; use std::ops::Deref; use std::path::Path; -use memmap2::Mmap; - use super::header::Header; use super::ids::{QTypeId, StepId, StringId}; use super::instructions::{Call, Match, MatchView, Opcode, Return}; @@ -34,40 +31,28 @@ fn read_u32_le(bytes: &[u8], offset: usize) -> u32 { ]) } -/// Storage for bytecode bytes—either owned or memory-mapped. +/// Storage for bytecode bytes. #[derive(Debug)] -pub enum ByteStorage { - /// Owned byte vector (from compilation or read into memory). - Owned(Vec), - /// Memory-mapped file. - Mapped(Mmap), -} +pub struct ByteStorage(Vec); impl Deref for ByteStorage { type Target = [u8]; fn deref(&self) -> &Self::Target { - match self { - ByteStorage::Owned(v) => v, - ByteStorage::Mapped(m) => m, - } + &self.0 } } impl ByteStorage { /// Create from owned bytes. pub fn from_vec(bytes: Vec) -> Self { - Self::Owned(bytes) + Self(bytes) } - /// Memory-map a file. - /// - /// # Safety - /// The file must not be modified while the mapping is active. - pub fn from_file(file: &File) -> io::Result { - // SAFETY: Caller ensures the file is not modified while mapped. - let mmap = unsafe { Mmap::map(file)? }; - Ok(Self::Mapped(mmap)) + /// Read a file into memory. + pub fn from_file(path: impl AsRef) -> io::Result { + let bytes = std::fs::read(path)?; + Ok(Self(bytes)) } } @@ -162,13 +147,12 @@ pub struct Module { impl Module { /// Load a module from owned bytes. pub fn from_bytes(bytes: Vec) -> Result { - Self::from_storage(ByteStorage::Owned(bytes)) + Self::from_storage(ByteStorage::from_vec(bytes)) } - /// Load a module from a file path (memory-mapped). + /// Load a module from a file path. pub fn from_path(path: impl AsRef) -> Result { - let file = File::open(path)?; - let storage = ByteStorage::from_file(&file)?; + let storage = ByteStorage::from_file(&path)?; Self::from_storage(storage) }