From dd89d2557904b879cb1266e20f6b4d4da9e62259 Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Thu, 1 Jan 2026 12:27:07 -0300 Subject: [PATCH] docs: add CLI reference and update documentation --- .../src/bytecode/emit/typescript.rs | 783 ------------------ docs/README.md | 7 +- docs/binary-format/01-overview.md | 6 +- docs/binary-format/02-strings.md | 2 +- docs/binary-format/06-transitions.md | 67 +- docs/binary-format/07-dump-format.md | 242 +++--- docs/cli.md | 317 +++++++ docs/lang-reference.md | 1 + docs/runtime-engine.md | 34 +- 9 files changed, 493 insertions(+), 966 deletions(-) delete mode 100644 crates/plotnik-lib/src/bytecode/emit/typescript.rs create mode 100644 docs/cli.md diff --git a/crates/plotnik-lib/src/bytecode/emit/typescript.rs b/crates/plotnik-lib/src/bytecode/emit/typescript.rs deleted file mode 100644 index a35630b9..00000000 --- a/crates/plotnik-lib/src/bytecode/emit/typescript.rs +++ /dev/null @@ -1,783 +0,0 @@ -//! TypeScript type emitter from bytecode Module. -//! -//! Converts compiled bytecode back to TypeScript declarations. -//! Used as a test oracle and for generating types from .ptkq files. - -use std::collections::hash_map::Entry; -use std::collections::{BTreeSet, HashMap, HashSet}; - -use plotnik_core::utils::to_pascal_case; - -use crate::bytecode::module::{Module, StringsView, TypesView}; -use crate::bytecode::type_meta::{TypeDef, TypeKind}; -use crate::bytecode::{EntrypointsView, QTypeId}; - -/// Configuration for TypeScript emission. -#[derive(Clone, Debug)] -pub struct EmitConfig { - /// Whether to export types - pub export: bool, - /// Whether to emit the Node type definition - pub emit_node_type: bool, - /// Use verbose node representation (with kind, text, etc.) - pub verbose_nodes: bool, -} - -impl Default for EmitConfig { - fn default() -> Self { - Self { - export: true, - emit_node_type: true, - verbose_nodes: false, - } - } -} - -/// TypeScript emitter from bytecode module. -pub struct TsEmitter<'a> { - types: TypesView<'a>, - strings: StringsView<'a>, - entrypoints: EntrypointsView<'a>, - config: EmitConfig, - - /// TypeId -> assigned name mapping - type_names: HashMap, - /// Names already used (for collision avoidance) - used_names: BTreeSet, - /// Track which builtin types are referenced - node_referenced: bool, - /// Track which types have been emitted - emitted: HashSet, - /// Types visited during builtin reference collection (cycle detection) - refs_visited: HashSet, - /// Output buffer - output: String, -} - -impl<'a> TsEmitter<'a> { - pub fn new(module: &'a Module, config: EmitConfig) -> Self { - Self { - types: module.types(), - strings: module.strings(), - entrypoints: module.entrypoints(), - config, - type_names: HashMap::new(), - used_names: BTreeSet::new(), - node_referenced: false, - emitted: HashSet::new(), - refs_visited: HashSet::new(), - output: String::new(), - } - } - - /// Emit TypeScript for all entrypoint types. - pub fn emit(mut self) -> String { - self.prepare_emission(); - - // Collect all entrypoints and their result types - let mut primary_names: HashMap = HashMap::new(); - let mut aliases: Vec<(String, QTypeId)> = Vec::new(); - - for i in 0..self.entrypoints.len() { - let ep = self.entrypoints.get(i); - let name = self.strings.get(ep.name).to_string(); - let type_id = ep.result_type; - - match primary_names.entry(type_id) { - Entry::Vacant(e) => { - e.insert(name); - } - Entry::Occupied(_) => { - aliases.push((name, type_id)); - } - } - } - - // Collect all reachable types starting from entrypoints - let mut to_emit = HashSet::new(); - for i in 0..self.entrypoints.len() { - let ep = self.entrypoints.get(i); - self.collect_reachable_types(ep.result_type, &mut to_emit); - } - - // Emit in topological order - for type_id in self.sort_topologically(to_emit) { - if let Some(def_name) = primary_names.get(&type_id) { - self.emit_type_definition(def_name, type_id); - } else { - self.emit_generated_or_custom(type_id); - } - } - - // Emit aliases - for (alias_name, type_id) in aliases { - if let Some(primary_name) = primary_names.get(&type_id) { - self.emit_type_alias(&alias_name, primary_name); - } - } - - self.output - } - - fn prepare_emission(&mut self) { - // Reserve entrypoint names to avoid collisions - for i in 0..self.entrypoints.len() { - let ep = self.entrypoints.get(i); - let name = self.strings.get(ep.name); - self.used_names.insert(to_pascal_case(name)); - } - - // Assign names to named types from TypeNames section - for i in 0..self.types.names_count() { - let type_name = self.types.get_name(i); - let name = self.strings.get(type_name.name); - self.type_names - .insert(type_name.type_id, to_pascal_case(name)); - } - - // Assign names to struct/enum types that need them but don't have names - self.assign_generated_names(); - - // Collect builtin references - self.collect_builtin_references(); - - // Emit Node interface if referenced - if self.config.emit_node_type && self.node_referenced { - self.emit_node_interface(); - } - } - - fn assign_generated_names(&mut self) { - // Collect naming contexts from entrypoints → fields - let mut contexts: HashMap = HashMap::new(); - - for i in 0..self.entrypoints.len() { - let ep = self.entrypoints.get(i); - let def_name = self.strings.get(ep.name); - self.collect_naming_contexts( - ep.result_type, - &NamingContext { - def_name: def_name.to_string(), - field_name: None, - }, - &mut contexts, - ); - } - - // Assign names to types that need them - for i in 0..self.types.defs_count() { - let type_id = QTypeId::from_custom_index(i); - if self.type_names.contains_key(&type_id) { - continue; - } - - let type_def = self.types.get_def(i); - if !self.needs_generated_name(&type_def) { - continue; - } - - let name = if let Some(ctx) = contexts.get(&type_id) { - self.generate_contextual_name(ctx) - } else { - self.generate_fallback_name(&type_def) - }; - self.type_names.insert(type_id, name); - } - } - - fn collect_naming_contexts( - &self, - type_id: QTypeId, - ctx: &NamingContext, - contexts: &mut HashMap, - ) { - if type_id.is_builtin() || contexts.contains_key(&type_id) { - return; - } - - let Some(type_def) = self.types.get(type_id) else { - return; - }; - - let Some(kind) = type_def.type_kind() else { - return; - }; - - match kind { - TypeKind::Struct => { - contexts.entry(type_id).or_insert_with(|| ctx.clone()); - for member in self.types.members_of(&type_def) { - let field_name = self.strings.get(member.name); - // Unwrap Optional wrappers to get the actual type - let (inner_type, _) = self.unwrap_optional(member.type_id); - let field_ctx = NamingContext { - def_name: ctx.def_name.clone(), - field_name: Some(field_name.to_string()), - }; - self.collect_naming_contexts(inner_type, &field_ctx, contexts); - } - } - TypeKind::Enum => { - contexts.entry(type_id).or_insert_with(|| ctx.clone()); - } - TypeKind::ArrayZeroOrMore | TypeKind::ArrayOneOrMore => { - let inner = QTypeId(type_def.data); - self.collect_naming_contexts(inner, ctx, contexts); - } - TypeKind::Optional => { - let inner = QTypeId(type_def.data); - self.collect_naming_contexts(inner, ctx, contexts); - } - TypeKind::Alias => { - // Aliases don't need contexts - } - } - } - - fn collect_builtin_references(&mut self) { - for i in 0..self.entrypoints.len() { - let ep = self.entrypoints.get(i); - self.collect_refs_recursive(ep.result_type); - } - } - - fn collect_refs_recursive(&mut self, type_id: QTypeId) { - if type_id == QTypeId::NODE { - self.node_referenced = true; - return; - } - if type_id == QTypeId::STRING || type_id == QTypeId::VOID { - return; - } - - // Cycle detection - if !self.refs_visited.insert(type_id) { - return; - } - - let Some(type_def) = self.types.get(type_id) else { - return; - }; - - let Some(kind) = type_def.type_kind() else { - return; - }; - - match kind { - TypeKind::Struct | TypeKind::Enum => { - let member_types: Vec<_> = self - .types - .members_of(&type_def) - .map(|m| m.type_id) - .collect(); - for ty in member_types { - self.collect_refs_recursive(ty); - } - } - TypeKind::ArrayZeroOrMore | TypeKind::ArrayOneOrMore | TypeKind::Optional => { - self.collect_refs_recursive(QTypeId(type_def.data)); - } - TypeKind::Alias => { - // Alias to Node - self.node_referenced = true; - } - } - } - - fn sort_topologically(&self, types: HashSet) -> Vec { - let mut deps: HashMap> = HashMap::new(); - let mut rdeps: HashMap> = HashMap::new(); - - for &tid in &types { - deps.entry(tid).or_default(); - rdeps.entry(tid).or_default(); - } - - // Build dependency graph - for &tid in &types { - for dep in self.get_direct_deps(tid) { - if types.contains(&dep) && dep != tid { - deps.entry(tid).or_default().insert(dep); - rdeps.entry(dep).or_default().insert(tid); - } - } - } - - // Kahn's algorithm - let mut result = Vec::with_capacity(types.len()); - let mut queue: Vec = deps - .iter() - .filter(|(_, d)| d.is_empty()) - .map(|(&tid, _)| tid) - .collect(); - - queue.sort_by_key(|tid| tid.0); - - while let Some(tid) = queue.pop() { - result.push(tid); - if let Some(dependents) = rdeps.get(&tid) { - for &dependent in dependents { - if let Some(dep_set) = deps.get_mut(&dependent) { - dep_set.remove(&tid); - if dep_set.is_empty() { - queue.push(dependent); - queue.sort_by_key(|t| t.0); - } - } - } - } - } - - result - } - - fn collect_reachable_types(&self, type_id: QTypeId, out: &mut HashSet) { - if type_id.is_builtin() || out.contains(&type_id) { - return; - } - - let Some(type_def) = self.types.get(type_id) else { - return; - }; - - let Some(kind) = type_def.type_kind() else { - return; - }; - - match kind { - TypeKind::Struct => { - out.insert(type_id); - for member in self.types.members_of(&type_def) { - self.collect_reachable_types(member.type_id, out); - } - } - TypeKind::Enum => { - out.insert(type_id); - for member in self.types.members_of(&type_def) { - // For enum variants, recurse into payload fields but don't - // add the payload struct itself - it will be inlined. - self.collect_enum_variant_refs(member.type_id, out); - } - } - TypeKind::Alias => { - out.insert(type_id); - } - TypeKind::ArrayZeroOrMore | TypeKind::ArrayOneOrMore => { - self.collect_reachable_types(QTypeId(type_def.data), out); - } - TypeKind::Optional => { - self.collect_reachable_types(QTypeId(type_def.data), out); - } - } - } - - /// Collect reachable types from enum variant payloads. - /// Recurses into struct fields but doesn't add the payload struct itself. - fn collect_enum_variant_refs(&self, type_id: QTypeId, out: &mut HashSet) { - if type_id.is_builtin() { - return; - } - - let Some(type_def) = self.types.get(type_id) else { - return; - }; - - let Some(kind) = type_def.type_kind() else { - return; - }; - - match kind { - TypeKind::Struct => { - // DON'T add the struct - it will be inlined as $data. - // But DO recurse into its fields to find named types. - for member in self.types.members_of(&type_def) { - self.collect_reachable_types(member.type_id, out); - } - } - _ => { - // For non-struct payloads (shouldn't happen normally), - // fall back to regular collection. - self.collect_reachable_types(type_id, out); - } - } - } - - fn get_direct_deps(&self, type_id: QTypeId) -> Vec { - let Some(type_def) = self.types.get(type_id) else { - return vec![]; - }; - - let Some(kind) = type_def.type_kind() else { - return vec![]; - }; - - match kind { - TypeKind::Struct | TypeKind::Enum => self - .types - .members_of(&type_def) - .flat_map(|member| self.unwrap_for_deps(member.type_id)) - .collect(), - TypeKind::ArrayZeroOrMore | TypeKind::ArrayOneOrMore => { - self.unwrap_for_deps(QTypeId(type_def.data)) - } - TypeKind::Optional => self.unwrap_for_deps(QTypeId(type_def.data)), - TypeKind::Alias => vec![], - } - } - - fn unwrap_for_deps(&self, type_id: QTypeId) -> Vec { - if type_id.is_builtin() { - return vec![]; - } - - let Some(type_def) = self.types.get(type_id) else { - return vec![]; - }; - - let Some(kind) = type_def.type_kind() else { - return vec![]; - }; - - match kind { - TypeKind::ArrayZeroOrMore | TypeKind::ArrayOneOrMore | TypeKind::Optional => { - self.unwrap_for_deps(QTypeId(type_def.data)) - } - TypeKind::Struct | TypeKind::Enum | TypeKind::Alias => vec![type_id], - } - } - - fn emit_generated_or_custom(&mut self, type_id: QTypeId) { - if self.emitted.contains(&type_id) || type_id.is_builtin() { - return; - } - - let Some(type_def) = self.types.get(type_id) else { - return; - }; - - // Check if this is an alias type (custom type annotation) - if type_def.is_alias() { - if let Some(name) = self.type_names.get(&type_id).cloned() { - self.emit_custom_type_alias(&name); - self.emitted.insert(type_id); - } - return; - } - - // Check if we have a generated name - if let Some(name) = self.type_names.get(&type_id).cloned() { - self.emit_generated_type_def(type_id, &name); - } - } - - fn emit_generated_type_def(&mut self, type_id: QTypeId, name: &str) { - self.emitted.insert(type_id); - let export = if self.config.export { "export " } else { "" }; - - let Some(type_def) = self.types.get(type_id) else { - return; - }; - - let Some(kind) = type_def.type_kind() else { - return; - }; - - match kind { - TypeKind::Struct => self.emit_interface(name, &type_def, export), - TypeKind::Enum => self.emit_tagged_union(name, &type_def, export), - _ => {} - } - } - - fn emit_type_definition(&mut self, name: &str, type_id: QTypeId) { - self.emitted.insert(type_id); - let export = if self.config.export { "export " } else { "" }; - let type_name = to_pascal_case(name); - - let Some(type_def) = self.types.get(type_id) else { - // Builtin type - emit as alias - let ts_type = self.type_to_ts(type_id); - self.output - .push_str(&format!("{}type {} = {};\n\n", export, type_name, ts_type)); - return; - }; - - let Some(kind) = type_def.type_kind() else { - return; - }; - - match kind { - TypeKind::Struct => self.emit_interface(&type_name, &type_def, export), - TypeKind::Enum => self.emit_tagged_union(&type_name, &type_def, export), - _ => { - let ts_type = self.type_to_ts(type_id); - self.output - .push_str(&format!("{}type {} = {};\n\n", export, type_name, ts_type)); - } - } - } - - fn emit_interface(&mut self, name: &str, type_def: &TypeDef, export: &str) { - self.output - .push_str(&format!("{}interface {} {{\n", export, name)); - - // Collect fields and sort by name - let mut fields: Vec<(String, QTypeId, bool)> = self - .types - .members_of(type_def) - .map(|member| { - let field_name = self.strings.get(member.name).to_string(); - let (inner_type, optional) = self.unwrap_optional(member.type_id); - (field_name, inner_type, optional) - }) - .collect(); - fields.sort_by(|a, b| a.0.cmp(&b.0)); - - for (field_name, field_type, optional) in fields { - let ts_type = self.type_to_ts(field_type); - let opt_marker = if optional { "?" } else { "" }; - self.output - .push_str(&format!(" {}{}: {};\n", field_name, opt_marker, ts_type)); - } - - self.output.push_str("}\n\n"); - } - - fn emit_tagged_union(&mut self, name: &str, type_def: &TypeDef, export: &str) { - let mut variant_types = Vec::new(); - - for member in self.types.members_of(type_def) { - let variant_name = self.strings.get(member.name); - let variant_type_name = format!("{}{}", name, to_pascal_case(variant_name)); - variant_types.push(variant_type_name.clone()); - - let data_str = self.inline_data_type(member.type_id); - self.output.push_str(&format!( - "{}interface {} {{\n $tag: \"{}\";\n $data: {};\n}}\n\n", - export, variant_type_name, variant_name, data_str - )); - } - - let union = variant_types.join(" | "); - self.output - .push_str(&format!("{}type {} = {};\n\n", export, name, union)); - } - - fn emit_custom_type_alias(&mut self, name: &str) { - let export = if self.config.export { "export " } else { "" }; - self.output - .push_str(&format!("{}type {} = Node;\n\n", export, name)); - } - - fn emit_type_alias(&mut self, alias_name: &str, target_name: &str) { - let export = if self.config.export { "export " } else { "" }; - self.output.push_str(&format!( - "{}type {} = {};\n\n", - export, alias_name, target_name - )); - } - - fn emit_node_interface(&mut self) { - let export = if self.config.export { "export " } else { "" }; - if self.config.verbose_nodes { - self.output.push_str(&format!( - "{}interface Node {{\n kind: string;\n text: string;\n startPosition: {{ row: number; column: number }};\n endPosition: {{ row: number; column: number }};\n}}\n\n", - export - )); - } else { - self.output.push_str(&format!( - "{}interface Node {{\n kind: string;\n text: string;\n}}\n\n", - export - )); - } - } - - fn type_to_ts(&self, type_id: QTypeId) -> String { - match type_id { - QTypeId::VOID => "void".to_string(), - QTypeId::NODE => "Node".to_string(), - QTypeId::STRING => "string".to_string(), - _ => self.custom_type_to_ts(type_id), - } - } - - fn custom_type_to_ts(&self, type_id: QTypeId) -> String { - let Some(type_def) = self.types.get(type_id) else { - return "unknown".to_string(); - }; - - let Some(kind) = type_def.type_kind() else { - return "unknown".to_string(); - }; - - match kind { - TypeKind::Struct | TypeKind::Enum => { - if let Some(name) = self.type_names.get(&type_id) { - name.clone() - } else { - self.inline_composite(type_id, &type_def, &kind) - } - } - TypeKind::Alias => { - if let Some(name) = self.type_names.get(&type_id) { - name.clone() - } else { - "Node".to_string() - } - } - TypeKind::ArrayZeroOrMore => { - let elem_type = self.type_to_ts(QTypeId(type_def.data)); - format!("{}[]", elem_type) - } - TypeKind::ArrayOneOrMore => { - let elem_type = self.type_to_ts(QTypeId(type_def.data)); - format!("[{}, ...{}[]]", elem_type, elem_type) - } - TypeKind::Optional => { - let inner_type = self.type_to_ts(QTypeId(type_def.data)); - format!("{} | null", inner_type) - } - } - } - - fn inline_composite(&self, _type_id: QTypeId, type_def: &TypeDef, kind: &TypeKind) -> String { - match kind { - TypeKind::Struct => self.inline_struct(type_def), - TypeKind::Enum => self.inline_enum(type_def), - _ => "unknown".to_string(), - } - } - - fn inline_struct(&self, type_def: &TypeDef) -> String { - if type_def.count == 0 { - return "{}".to_string(); - } - - let mut fields: Vec<(String, QTypeId, bool)> = self - .types - .members_of(type_def) - .map(|member| { - let field_name = self.strings.get(member.name).to_string(); - let (inner_type, optional) = self.unwrap_optional(member.type_id); - (field_name, inner_type, optional) - }) - .collect(); - fields.sort_by(|a, b| a.0.cmp(&b.0)); - - let field_strs: Vec = fields - .iter() - .map(|(name, ty, opt)| { - let ts_type = self.type_to_ts(*ty); - let opt_marker = if *opt { "?" } else { "" }; - format!("{}{}: {}", name, opt_marker, ts_type) - }) - .collect(); - - format!("{{ {} }}", field_strs.join("; ")) - } - - fn inline_enum(&self, type_def: &TypeDef) -> String { - let variant_strs: Vec = self - .types - .members_of(type_def) - .map(|member| { - let name = self.strings.get(member.name); - let data_type = self.type_to_ts(member.type_id); - format!("{{ $tag: \"{}\"; $data: {} }}", name, data_type) - }) - .collect(); - - variant_strs.join(" | ") - } - - fn inline_data_type(&self, type_id: QTypeId) -> String { - if type_id == QTypeId::VOID { - return "{}".to_string(); - } - - let Some(type_def) = self.types.get(type_id) else { - return self.type_to_ts(type_id); - }; - - let Some(kind) = type_def.type_kind() else { - return self.type_to_ts(type_id); - }; - - if kind == TypeKind::Struct { - self.inline_struct(&type_def) - } else { - self.type_to_ts(type_id) - } - } - - /// Unwrap Optional wrappers and return (inner_type, is_optional). - fn unwrap_optional(&self, type_id: QTypeId) -> (QTypeId, bool) { - if type_id.is_builtin() { - return (type_id, false); - } - let Some(type_def) = self.types.get(type_id) else { - return (type_id, false); - }; - if type_def.type_kind() != Some(TypeKind::Optional) { - return (type_id, false); - } - (QTypeId(type_def.data), true) - } - - fn needs_generated_name(&self, type_def: &TypeDef) -> bool { - matches!( - type_def.type_kind(), - Some(TypeKind::Struct) | Some(TypeKind::Enum) - ) - } - - fn generate_contextual_name(&mut self, ctx: &NamingContext) -> String { - let base = if let Some(field) = &ctx.field_name { - format!("{}{}", to_pascal_case(&ctx.def_name), to_pascal_case(field)) - } else { - to_pascal_case(&ctx.def_name) - }; - self.unique_name(&base) - } - - fn generate_fallback_name(&mut self, type_def: &TypeDef) -> String { - let base = match type_def.type_kind() { - Some(TypeKind::Struct) => "Struct", - Some(TypeKind::Enum) => "Enum", - _ => "Type", - }; - self.unique_name(base) - } - - fn unique_name(&mut self, base: &str) -> String { - let base = to_pascal_case(base); - if self.used_names.insert(base.clone()) { - return base; - } - - let mut counter = 2; - loop { - let name = format!("{}{}", base, counter); - if self.used_names.insert(name.clone()) { - return name; - } - counter += 1; - } - } -} - -#[derive(Clone, Debug)] -struct NamingContext { - def_name: String, - field_name: Option, -} - -/// Emit TypeScript from a bytecode module. -pub fn emit_typescript(module: &Module) -> String { - TsEmitter::new(module, EmitConfig::default()).emit() -} - -/// Emit TypeScript from a bytecode module with custom config. -pub fn emit_typescript_with_config(module: &Module, config: EmitConfig) -> String { - TsEmitter::new(module, config).emit() -} diff --git a/docs/README.md b/docs/README.md index e166150f..0406ee17 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,6 +6,7 @@ Plotnik is a strongly-typed pattern matching language for tree-sitter syntax tre ### Users +- [CLI Guide](cli.md) — Command-line tool usage - [Language Reference](lang-reference.md) — Complete syntax and semantics - [Type System](type-system.md) — How output types are inferred from queries @@ -21,6 +22,7 @@ Plotnik is a strongly-typed pattern matching language for tree-sitter syntax tre AGENTS.md # Project constitution (coding rules, testing, ADRs) docs/ ├── README.md # You are here +├── cli.md # CLI tool usage guide ├── lang-reference.md # Query language syntax and semantics ├── type-system.md # Type inference rules and output shapes ├── runtime-engine.md # VM state, backtracking, effects @@ -37,8 +39,9 @@ docs/ New to Plotnik: -1. `lang-reference.md` — Learn the query syntax -2. `type-system.md` — Understand output shapes +1. `cli.md` — Get started with the CLI +2. `lang-reference.md` — Learn the query syntax +3. `type-system.md` — Understand output shapes Building tooling: diff --git a/docs/binary-format/01-overview.md b/docs/binary-format/01-overview.md index 60cbcc81..8c1d1bab 100644 --- a/docs/binary-format/01-overview.md +++ b/docs/binary-format/01-overview.md @@ -90,9 +90,9 @@ struct Header { ### Flags Field -| Bit | Name | Description | -| --- | ------- | -------------------------------------------------------- | -| 0 | LINKED | If set, bytecode contains grammar NodeTypeId/NodeFieldId | +| Bit | Name | Description | +| --- | ------ | -------------------------------------------------------- | +| 0 | LINKED | If set, bytecode contains grammar NodeTypeId/NodeFieldId | **Linked vs Unlinked Bytecode**: diff --git a/docs/binary-format/02-strings.md b/docs/binary-format/02-strings.md index 598d0b3b..0cfdc695 100644 --- a/docs/binary-format/02-strings.md +++ b/docs/binary-format/02-strings.md @@ -8,7 +8,7 @@ Strings are stored in a centralized pool to eliminate redundancy and alignment p ### Reserved StringId(0) -`StringId(0)` is reserved and contains an easter egg: `"Beauty will save the world"` (Dostoevsky, *The Idiot*). +`StringId(0)` is reserved and contains an easter egg: `"Beauty will save the world"` (Dostoevsky, _The Idiot_). This reservation has a practical purpose: since Match instructions use `0` to indicate "no constraint" (wildcard), `StringId(0)` can never appear in unlinked bytecode instructions. User strings start at index 1. diff --git a/docs/binary-format/06-transitions.md b/docs/binary-format/06-transitions.md index 6b3ee87b..0f47d5e1 100644 --- a/docs/binary-format/06-transitions.md +++ b/docs/binary-format/06-transitions.md @@ -87,20 +87,20 @@ EffectOp (u16) - **Opcode**: 6 bits (0-63), currently 12 defined. - **Payload**: 10 bits (0-1023), member/variant index. -| Opcode | Name | Payload | -| :----- | :------ | :--------------------- | -| 0 | `Node` | - | -| 1 | `A` | - | -| 2 | `Push` | - | -| 3 | `EndA` | - | -| 4 | `S` | - | -| 5 | `EndS` | - | -| 6 | `Set` | Member index (0-1023) | -| 7 | `E` | Variant index (0-1023) | -| 8 | `EndE` | - | -| 9 | `Text` | - | -| 10 | `Clear` | - | -| 11 | `Null` | - | +| Opcode | Name | Payload | +| :----- | :-------- | :--------------------- | +| 0 | `Node` | - | +| 1 | `Arr` | - | +| 2 | `Push` | - | +| 3 | `EndArr` | - | +| 4 | `Obj` | - | +| 5 | `EndObj` | - | +| 6 | `Set` | Member index (0-1023) | +| 7 | `Enum` | Variant index (0-1023) | +| 8 | `EndEnum` | - | +| 9 | `Text` | - | +| 10 | `Clear` | - | +| 11 | `Null` | - | **Opcode Ranges** (future extensibility): @@ -134,9 +134,9 @@ struct Match8 { Bytes 2-5 (`node_type` and `node_field`) have different meanings based on the header's `linked` flag: -| Mode | `node_type` (bytes 2-3) | `node_field` (bytes 4-5) | -| -------- | ------------------------------- | -------------------------------- | -| Linked | `NodeTypeId` from tree-sitter | `NodeFieldId` from tree-sitter | +| Mode | `node_type` (bytes 2-3) | `node_field` (bytes 4-5) | +| -------- | -------------------------------- | --------------------------------- | +| Linked | `NodeTypeId` from tree-sitter | `NodeFieldId` from tree-sitter | | Unlinked | `StringId` pointing to type name | `StringId` pointing to field name | In **linked mode**, the runtime can directly compare against tree-sitter node types/fields. @@ -217,8 +217,10 @@ The compiler selects the smallest step size that fits the payload. If the total **Pre vs Post Effects**: -- `pre_effects`: Execute before match attempt. Used for scope openers (`S`, `A`, `E`) that must run regardless of which branch succeeds. -- `post_effects`: Execute after successful match. Used for capture/assignment ops (`Node`, `Set`, `EndS`, etc.) that depend on `matched_node`. +- `pre_effects`: Execute before match attempt (before nav, before node checks). Any effect can appear here. +- `post_effects`: Execute after successful match (after `matched_node` is set). Any effect can appear here. + +The compiler places effects based on semantic requirements: scope openers often go in pre (to run regardless of which branch succeeds), captures often go in post (to access `matched_node`). But this is a compiler decision, not a bytecode-level restriction. ### 4.3. Epsilon Transitions @@ -230,19 +232,20 @@ A Match8 or Match16–64 with `node_type: None`, `node_field: None`, and `nav: S ### 4.4. Call -Invokes another definition (recursion). Pushes return address to the call stack and jumps to target. +Invokes another definition (recursion). Executes navigation (with optional field constraint), pushes return address to the call stack, and jumps to target. ```rust #[repr(C)] struct Call { - type_id: u8, // segment(4) | 0x6 - reserved: u8, - next: u16, // Return address (StepId, current segment) - target: u16, // Callee StepId (segment from type_id) - ref_id: u16, // Must match Return.ref_id + type_id: u8, // segment(4) | 0x6 + nav: u8, // Nav + node_field: Option, // None (0) means "any" + next: u16, // Return address (StepId, current segment) + target: u16, // Callee StepId (segment from type_id) } ``` +- **Nav + Field**: Call handles navigation and field constraint. The callee's first Match checks node type. This allows `field: (Ref)` patterns to check field and type on the same node. - **Target Segment**: Defined by `type_id >> 4`. - **Return Segment**: Implicitly the current segment. @@ -254,22 +257,10 @@ Returns from a definition. Pops the return address from the call stack. #[repr(C)] struct Return { type_id: u8, // segment(4) | 0x7 - reserved: u8, - ref_id: u16, // Must match Call.ref_id - _pad: u32, + _pad: [u8; 7], } ``` -### 4.6. The `ref_id` Invariant - -The `ref_id` field enforces stack discipline between `Call` and `Return`. Each definition gets a unique `ref_id` at compile time. At runtime: - -1. `Call` pushes a frame with its `ref_id` onto the call stack. -2. `Return` verifies its `ref_id` matches the current frame's `ref_id`. -3. Mismatch indicates a malformed query or VM bug—panic in debug builds. - -This catches errors like mismatched call/return pairs or corrupted stack state during backtracking. The check is O(1). - ## 5. Execution Semantics ### 5.1. Match8 Execution diff --git a/docs/binary-format/07-dump-format.md b/docs/binary-format/07-dump-format.md index a535d1d0..50767642 100644 --- a/docs/binary-format/07-dump-format.md +++ b/docs/binary-format/07-dump-format.md @@ -3,47 +3,50 @@ ## Example Query ``` +Ident = (identifier) @name :: string Expression = [ - Ident: (identifier) @name :: string - Num: (number) @value :: string + Literal: (number) @value + Variable: (identifier) @name ] - -Statement = [ - Assign: (assignment_expression - left: (identifier) @target :: string +Assignment = (assignment_expression + left: (identifier) @target right: (Expression) @value) - Return: (return_statement (Expression)? @value) -] - -Root = (program (Statement)+ @statements) ``` ## Bytecode Dump +**Epsilon transitions** (`𝜀`) succeed unconditionally without cursor interaction. +They require all three conditions: +- `nav == Stay` (no cursor movement) +- `node_type == None` (no type constraint) +- `node_field == None` (no field constraint) + +A step with `nav == Stay` but with a type constraint (e.g., `(identifier)`) is NOT +epsilon—it matches at the current cursor position. + +**Capture effect consolidation**: Scalar capture effects (`Node`, `Text`, `Set`) are +placed directly on match instructions rather than in separate epsilon steps. Structural +effects (`Obj`, `EndObj`, `Arr`, `EndArr`, `Enum`, `EndEnum`) remain in epsilons. + ``` [header] -linked = false +linked = true [strings] S00 "Beauty will save the world" -S01 "Assign" -S02 "Expression" -S03 "Ident" -S04 "Num" -S05 "Return" -S06 "Root" -S07 "Statement" -S08 "assignment_expression" +S01 "name" +S02 "value" +S03 "Literal" +S04 "Variable" +S05 "target" +S06 "Ident" +S07 "Expression" +S08 "Assignment" S09 "identifier" -S10 "left" -S11 "name" -S12 "number" -S13 "program" -S14 "return_statement" -S15 "right" -S16 "statements" -S17 "target" -S18 "value" +S10 "number" +S11 "assignment_expression" +S12 "left" +S13 "right" [types.defs] T00 = void @@ -51,73 +54,56 @@ T01 = Node T02 = str T03 = Struct(M0, 1) ; { name } T04 = Struct(M1, 1) ; { value } -T05 = Enum(M2, 2) ; Ident | Num -T06 = Struct(M4, 2) ; { target, value } -T07 = Optional(T05) ; Expression? -T08 = Struct(M6, 1) ; { value } -T09 = Enum(M7, 2) ; Assign | Return -T10 = ArrayPlus(T09) ; Statement+ -T11 = Struct(M9, 1) ; { statements } +T05 = Struct(M2, 1) ; { name } +T06 = Enum(M3, 2) ; Literal | Variable +T07 = Struct(M5, 2) ; { value, target } [types.members] -M0 = (S11, T02) ; name: str -M1 = (S18, T02) ; value: str -M2 = (S03, T03) ; Ident => T03 -M3 = (S04, T04) ; Num => T04 -M4 = (S17, T02) ; target: str -M5 = (S18, T05) ; value: Expression -M6 = (S18, T07) ; value: Expression? -M7 = (S01, T06) ; Assign => T06 -M8 = (S05, T08) ; Return => T08 -M9 = (S16, T10) ; statements: Statement+ +M0 = (S01, T02) ; name: str +M1 = (S02, T01) ; value: Node +M2 = (S01, T01) ; name: Node +M3 = (S03, T04) ; Literal: T04 +M4 = (S04, T05) ; Variable: T05 +M5 = (S02, T06) ; value: Expression +M6 = (S05, T01) ; target: Node [types.names] -N0 = (S02, T05) ; Expression -N1 = (S06, T11) ; Root -N2 = (S07, T09) ; Statement +N0 = (S06, T03) ; Ident +N1 = (S07, T06) ; Expression +N2 = (S08, T07) ; Assignment -[entry] ; sorted lexicographically for binary search -Expression = 46 :: T05 -Root = 01 :: T11 -Statement = 14 :: T09 +[entry] +Assignment = 08 :: T07 +Expression = 05 :: T06 +Ident = 01 :: T03 [code] - 00 𝜀 ◼ - -Root: - 01 *↓ [S] (program) 03 - 03 𝜀 [A] 05 - 05 ▶ (Statement) 06 - 06 𝜀 [Push] 08 - 08 𝜀 05, 10 - 10 𝜀 [EndA Set(M9) EndS] 12 - 12 *↑¹ ◼ - -Statement: - 14 𝜀 16, 32 - 16 *↓ [E(M7) S] (assignment_expression) 18 - 18 *↓ left: (identifier) [Node Text Set(M4)] 20 - 20 * right: _ 21 - 21 ▷(Expression) 22 - 22 𝜀 [Set(M5) EndS EndE] 24 - 24 *↑² 26 - 26 (Statement) ▶ - 32 *↓ [E(M8) S] (return_statement) 34 - 34 𝜀 36, 40 - 36 *↓ 37 - 37 ▶ (Expression) 38 - 38 𝜀 [Set(M6)] 42 - 40 𝜀 [Null Set(M6)] 42 - 42 𝜀 [EndS EndE] 44 - 44 *↑¹ 26 + 00 𝜀 ◼ + +Ident: + 01 𝜀 02 + 02 (identifier) [Text Set(M0)] 04 + 04 ▶ Expression: - 46 𝜀 48, 54 - 48 *↓ [E(M2) S] (identifier) [Node Text Set(M0) EndS EndE] 50 - 50 *↑¹ 52 - 52 (Expression) ▶ - 54 *↓ [E(M3) S] (number) [Node Text Set(M1) EndS EndE] 56 - 56 *↑¹ 52 + 05 𝜀 06 + 06 𝜀 22, 28 + +Assignment: + 08 𝜀 09 + 09 (assignment_expression) 10 + 10 ↓* left: (identifier) [Node Set(M6)]12 + 12 * ▶ right: (Expression) 13 + 13 𝜀 [Set(M5)] 15 + 15 *↑¹ 16 + 16 ▶ + 17 ▶ + 18 𝜀 [EndEnum] 17 + 20 (number) [Node Set(M1)] 18 + 22 𝜀 [Enum(M3)] 20 + 24 𝜀 [EndEnum] 17 + 26 (identifier) [Node Set(M2)] 24 + 28 𝜀 [Enum(M4)] 26 ``` ## Files @@ -136,51 +122,65 @@ Future: options for verbosity levels, hiding sections, etc. ## Instruction Format +Each line follows the column layout: `