From 69d3b069919a042e466e1660696dc80c6197817b Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Fri, 19 Dec 2025 01:44:29 -0300 Subject: [PATCH 1/3] feat: Final docs --- docs/README.md | 51 ++ docs/adr/ADR-0001-query-parser.md | 21 - docs/adr/ADR-0002-diagnostics-system.md | 21 - docs/adr/ADR-0004-query-ir-binary-format.md | 192 ----- docs/adr/ADR-0005-transition-graph-format.md | 325 -------- docs/adr/ADR-0006-dynamic-query-execution.md | 286 ------- docs/adr/ADR-0007-type-metadata-format.md | 197 ----- docs/adr/ADR-0008-tree-navigation.md | 336 -------- docs/adr/ADR-0009-type-system.md | 426 ---------- docs/adr/ADR-0010-type-system-v2.md | 119 --- docs/adr/README.md | 52 -- docs/binary-format/01-overview.md | 79 ++ docs/binary-format/02-strings.md | 63 ++ docs/binary-format/03-symbols.md | 52 ++ docs/binary-format/04-types.md | 135 +++ docs/binary-format/05-entrypoints.md | 38 + docs/binary-format/06-transitions.md | 332 ++++++++ docs/lang-reference.md | 833 +++++++++++++++++++ docs/runtime-engine.md | 177 ++++ docs/type-system.md | 274 ++++++ 20 files changed, 2034 insertions(+), 1975 deletions(-) create mode 100644 docs/README.md delete mode 100644 docs/adr/ADR-0001-query-parser.md delete mode 100644 docs/adr/ADR-0002-diagnostics-system.md delete mode 100644 docs/adr/ADR-0004-query-ir-binary-format.md delete mode 100644 docs/adr/ADR-0005-transition-graph-format.md delete mode 100644 docs/adr/ADR-0006-dynamic-query-execution.md delete mode 100644 docs/adr/ADR-0007-type-metadata-format.md delete mode 100644 docs/adr/ADR-0008-tree-navigation.md delete mode 100644 docs/adr/ADR-0009-type-system.md delete mode 100644 docs/adr/ADR-0010-type-system-v2.md delete mode 100644 docs/adr/README.md create mode 100644 docs/binary-format/01-overview.md create mode 100644 docs/binary-format/02-strings.md create mode 100644 docs/binary-format/03-symbols.md create mode 100644 docs/binary-format/04-types.md create mode 100644 docs/binary-format/05-entrypoints.md create mode 
100644 docs/binary-format/06-transitions.md create mode 100644 docs/lang-reference.md create mode 100644 docs/runtime-engine.md create mode 100644 docs/type-system.md diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..b89def12 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,51 @@ +# Plotnik Documentation + +Plotnik is a strongly-typed pattern matching language for tree-sitter syntax trees. + +## Quick Links by Audience + +### Users + +- [Language Reference](lang-reference.md) — Complete syntax and semantics +- [Type System](type-system.md) — How output types are inferred from queries + +### Contributors & LLM Agents + +- [AGENTS.md](../AGENTS.md) — Project rules, coding standards, testing patterns +- [Runtime Engine](runtime-engine.md) — VM execution model +- [Binary Format](binary-format/01-overview.md) — Compiled query format + +## Document Map + +``` +AGENTS.md # Project constitution (coding rules, testing, ADRs) +docs/ +├── README.md # You are here +├── lang-reference.md # Query language syntax and semantics +├── type-system.md # Type inference rules and output shapes +├── runtime-engine.md # VM state, backtracking, effects +└── binary-format/ # Compiled bytecode specification + ├── 01-overview.md # Header, sections, alignment + ├── 02-strings.md # String pool and table + ├── 03-symbols.md # Node types, fields, trivia + ├── 04-types.md # Type metadata format + ├── 05-entrypoints.md # Public definition table + └── 06-transitions.md # VM instructions and data blocks +``` + +## Reading Order + +New to Plotnik: + +1. `lang-reference.md` — Learn the query syntax +2. `type-system.md` — Understand output shapes + +Building tooling: + +1. `binary-format/01-overview.md` → through `06-transitions.md` +2. `runtime-engine.md` + +Contributing: + +1. `AGENTS.md` — Required reading +2. 
`runtime-engine.md` and `binary-format/01-overview.md` — Architectural context diff --git a/docs/adr/ADR-0001-query-parser.md b/docs/adr/ADR-0001-query-parser.md deleted file mode 100644 index 39c52b91..00000000 --- a/docs/adr/ADR-0001-query-parser.md +++ /dev/null @@ -1,21 +0,0 @@ -# ADR-0001: Hand-written Parser with Rowan - -- **Status**: Accepted -- **Date**: 2025-12-08 (retrospective) - -## Context - -We need a resilient parser with excellent, user-friendly diagnostics and fine-grained error recovery. Parser generators like `chumsky` were considered but offer insufficient control. - -## Decision - -We implemented a hand-written recursive descent parser. - -- **Lexer**: `logos` for zero-copy tokenization. -- **CST**: `rowan` to build a lossless Concrete Syntax Tree, preserving all source text and trivia. -- **AST**: A typed AST wrapper provides a clean API for semantic analysis. - -## Consequences - -- **Positive**: Full control over error recovery, enabling high-quality diagnostics. The lossless CST is ideal for accurate error reporting and future tooling (e.g., formatters). -- **Negative**: Higher initial development effort and complexity compared to parser generators. diff --git a/docs/adr/ADR-0002-diagnostics-system.md b/docs/adr/ADR-0002-diagnostics-system.md deleted file mode 100644 index feb5b420..00000000 --- a/docs/adr/ADR-0002-diagnostics-system.md +++ /dev/null @@ -1,21 +0,0 @@ -# ADR-0002: Prioritized Diagnostics System - -- **Status**: Accepted -- **Date**: 2025-12-08 (retrospective) - -## Context - -A single syntax error can cause many cascading downstream errors, overwhelming the user. Our goal is to present only the most relevant, actionable feedback. - -## Decision - -We implemented a diagnostics system with priority-based suppression. - -- **Priority**: A central `DiagnosticKind` enum defines all possible diagnostics, ordered by priority. 
-- **Suppression**: When multiple diagnostics overlap, a filtering process suppresses lower-priority ones, effectively hiding noise and showing the likely root cause. -- **Formatting**: The `annotate-snippets` crate renders rich, user-friendly error messages with source context. - -## Consequences - -- **Positive**: Provides high-quality, actionable feedback by eliminating distracting cascading errors. The system is decoupled and independently testable. -- **Negative**: The suppression logic adds complexity and requires careful maintenance and tuning to remain effective. diff --git a/docs/adr/ADR-0004-query-ir-binary-format.md b/docs/adr/ADR-0004-query-ir-binary-format.md deleted file mode 100644 index 72f0eb3e..00000000 --- a/docs/adr/ADR-0004-query-ir-binary-format.md +++ /dev/null @@ -1,192 +0,0 @@ -# ADR-0004: Compiled Query Binary Format - -- **Status**: Accepted -- **Date**: 2024-12-12 -- **Supersedes**: Parts of ADR-0003 - -## Context - -The compiled query lives in a single contiguous allocation—cache-friendly, zero fragmentation, portable to WASM. This ADR defines the binary layout. Graph structures are in [ADR-0005](ADR-0005-transition-graph-format.md). Type metadata is in [ADR-0007](ADR-0007-type-metadata-format.md). - -## Decision - -### Container - -```rust -struct CompiledQuery { - buffer: CompiledQueryBuffer, - successors_offset: u32, - effects_offset: u32, - negated_fields_offset: u32, - string_refs_offset: u32, - string_bytes_offset: u32, - type_defs_offset: u32, - type_members_offset: u32, - entrypoints_offset: u32, - trivia_kinds_offset: u32, // 0 = no trivia kinds -} -``` - -Transitions start at buffer offset 0. The default entrypoint is **Transition 0** (the root of the graph). The `entrypoints` table provides named exports for multi-definition queries; it does not affect the default entrypoint. 
- -### CompiledQueryBuffer - -```rust -const BUFFER_ALIGN: usize = 64; // cache-line alignment for transitions - -struct CompiledQueryBuffer { - ptr: *mut u8, - len: usize, - owned: bool, // true if allocated, false if mmap'd -} -``` - -Allocated via `Layout::from_size_align(len, BUFFER_ALIGN)`. Standard `Box<[u8]>` won't work—it assumes 1-byte alignment and corrupts `dealloc`. The 64-byte alignment ensures transitions never straddle cache lines. - -**Ownership semantics**: - -| `owned` | Source | `Drop` action | -| ------- | ------------------- | ------------------------------------------------ | -| `true` | `std::alloc::alloc` | Reconstruct `Layout`, call `std::alloc::dealloc` | -| `false` | `mmap` / external | No-op (caller manages lifetime) | - -For mmap'd queries, the OS maps file pages directly into address space. The 64-byte header ensures buffer data starts aligned. `CompiledQueryBuffer` with `owned: false` provides a view without taking ownership—the backing file mapping must outlive the `CompiledQuery`. - -**Deallocation**: When `owned: true`, `Drop` must reconstruct the exact `Layout` (size + 64-byte alignment) and call `std::alloc::dealloc`. Using `Box::from_raw` or similar would assume align=1 and cause undefined behavior. 
- -### Segments - -| Segment | Type | Offset | Align | -| -------------- | ------------------- | ----------------------- | ----- | -| Transitions | `[Transition; N]` | 0 | 64 | -| Successors | `[TransitionId; M]` | `successors_offset` | 4 | -| Effects | `[EffectOp; P]` | `effects_offset` | 2 | -| Negated Fields | `[NodeFieldId; Q]` | `negated_fields_offset` | 2 | -| String Refs | `[StringRef; R]` | `string_refs_offset` | 4 | -| String Bytes | `[u8; S]` | `string_bytes_offset` | 1 | -| Type Defs | `[TypeDef; T]` | `type_defs_offset` | 4 | -| Type Members | `[TypeMember; U]` | `type_members_offset` | 2 | -| Entrypoints | `[Entrypoint; V]` | `entrypoints_offset` | 4 | -| Trivia Kinds | `[NodeTypeId; W]` | `trivia_kinds_offset` | 2 | - -Each offset is aligned: `(offset + align - 1) & !(align - 1)`. - -For `Transition`, `EffectOp` see [ADR-0005](ADR-0005-transition-graph-format.md). For `TypeDef`, `TypeMember` see [ADR-0007](ADR-0007-type-metadata-format.md). - -### Strings - -Single pool for all strings (field names, variant tags, entrypoint names, type names): - -```rust -type StringId = u16; -const STRING_NONE: StringId = 0xFFFF; // sentinel for unnamed types - -#[repr(C)] -struct StringRef { - offset: u32, // byte offset into string_bytes (NOT element index) - len: u16, - _pad: u16, -} -// 8 bytes, align 4 - -type DataFieldId = StringId; // field names in effects -type VariantTagId = StringId; // variant tags in effects - -type TypeId = u16; // see ADR-0007 for semantics -``` - -`StringId` indexes into `string_refs`. `DataFieldId` and `VariantTagId` are aliases for type safety. `TypeId` indexes into type_defs (with reserved primitives 0-2). - -Strings are interned during construction—identical strings share storage and ID. 
- -### Entrypoints - -```rust -#[repr(C)] -struct Entrypoint { - name_id: StringId, // 2 - _pad: u16, // 2 - target: TransitionId, // 4 - result_type: TypeId, // 2 - see ADR-0007 - _pad2: u16, // 2 -} -// 12 bytes, align 4 -``` - -### Serialization - -``` -Header (64 bytes): - magic: [u8; 4] b"PLNK" - version: u32 format version + ABI hash - checksum: u32 CRC32(header[12..64] || buffer_data) - buffer_len: u32 - successors_offset: u32 - effects_offset: u32 - negated_fields_offset: u32 - string_refs_offset: u32 - string_bytes_offset: u32 - type_defs_offset: u32 - type_members_offset: u32 - entrypoints_offset: u32 - trivia_kinds_offset: u32 - _pad: [u8; 12] reserved, zero-filled - -Buffer Data (buffer_len bytes) -``` - -Header is 64 bytes to ensure buffer data starts at a 64-byte aligned offset. This enables true zero-copy `mmap` usage where transitions at offset 0 within the buffer are correctly aligned. - -Little-endian always. UTF-8 strings. Version mismatch or checksum failure → recompile. - -**Checksum coverage**: The checksum covers bytes 12–63 of the header (everything after the checksum field) plus all buffer data. The magic and version are verified independently before checksum validation—a version mismatch triggers recompile without checking the checksum. - -### Construction - -Three passes: - -1. **Analysis**: Count elements, intern strings, infer types -2. **Layout**: Compute aligned offsets, allocate once -3. **Emission**: Write via `ptr::write` - -No `realloc`. - -### Example - -Query: - -``` -Func = (function_declaration name: (identifier) @name) -Expr = [ Ident: (identifier) @name Num: (number) @value ] -``` - -Buffer layout: - -``` -0x0000 Transitions [T0, T1, T2, ...] -0x0180 Successors [1, 2, 3, ...] -0x0200 Effects [StartObject, Field(0), ...] -0x0280 Negated Fields [] -0x0280 String Refs [{0,4}, {4,5}, {9,5}, ...] -0x02C0 String Bytes "namevalueIdentNumFuncExpr" -0x0300 Type Defs [Struct{...}, Enum{...}, ...] 
-0x0340 Type Members [{name,Str}, {Ident,Ty5}, ...] -0x0380 Entrypoints [{name=Func, target=Tr0, type=Ty3}, ...] -0x03A0 Trivia Kinds [comment, ...] -``` - -`"name"` stored once, used by both `@name` captures. - -## Consequences - -**Positive**: Cache-efficient, O(1) string lookup, zero-copy access, simple validation. Self-contained binaries enable query caching by input hash. - -**Negative**: Format changes require rebuild. No version migration. - -**WASM**: Explicit alignment prevents traps. `u32` offsets fit WASM32. - -## References - -- [ADR-0005: Transition Graph Format](ADR-0005-transition-graph-format.md) -- [ADR-0006: Dynamic Query Execution](ADR-0006-dynamic-query-execution.md) -- [ADR-0007: Type Metadata Format](ADR-0007-type-metadata-format.md) diff --git a/docs/adr/ADR-0005-transition-graph-format.md b/docs/adr/ADR-0005-transition-graph-format.md deleted file mode 100644 index 08abc6ff..00000000 --- a/docs/adr/ADR-0005-transition-graph-format.md +++ /dev/null @@ -1,325 +0,0 @@ -# ADR-0005: Transition Graph Format - -- **Status**: Accepted -- **Date**: 2024-12-12 -- **Supersedes**: Parts of ADR-0003 - -## Context - -Edge-centric IR: transitions carry all semantics (matching, effects, successors). States are implicit junction points. The result is a recursive transition network—NFA with call/return for definition references. - -## Decision - -### Types - -```rust -type TransitionId = u32; -type NodeTypeId = u16; // from tree-sitter, do not change -type NodeFieldId = NonZeroU16; // from tree-sitter, Option<NodeFieldId> uses 0 for None -type RefId = u16; -// StringId, DataFieldId, VariantTagId: see ADR-0004 -``` - -### Slice - -Relative range within a segment: - -```rust -#[repr(C)] -struct Slice<T> { - start_index: u32, // element index into segment array (NOT byte offset) - len: u16, // 65k elements per slice is sufficient - _pad: u16, - _phantom: PhantomData<fn() -> T>, -} -// 8 bytes, align 4 -``` - -`start_index` is an **element index**, not a byte offset. 
This naming distinguishes it from byte offsets like `StringRef.offset` and `CompiledQuery.*_offset`. The distinction matters for typed array access. - -### Transition - -```rust -/// Transitions use SSO (small-size optimization) for successors: -/// - 0-8 successors: stored inline in `successor_data` -/// - 9+ successors: `successor_data[0]` is index into successors segment -#[repr(C, align(64))] -struct Transition { - // --- 32 bytes metadata --- - matcher: Matcher, // 16 (offset 0) - ref_marker: RefTransition, // 4 (offset 16) - nav: Nav, // 2 (offset 20, see ADR-0008) - effects_len: u16, // 2 (offset 22, inlined from Slice) - successor_count: u32, // 4 (offset 24) - effects_start: u32, // 4 (offset 28, inlined from Slice) - - // --- 32 bytes control flow --- - successor_data: [u32; 8], // 32 (offset 32) -} -// 64 bytes, align 64 (cache-line aligned) -``` - -The `effects_start` and `effects_len` fields are inlined rather than using `Slice` to maintain 64-byte alignment without sacrificing inline successor slots. Accessors reconstruct a `Slice` on demand. - -Navigation is fully determined by `nav`—no runtime dispatch based on previous matcher. See [ADR-0008](ADR-0008-tree-navigation.md) for `Nav` definition and semantics. - -Single `ref_marker` slot—sequences like `Enter(A) → Enter(B)` remain as epsilon chains. - -### Inline Successors (SSO-style) - -Successors use a small-size optimization to avoid indirection for the common case: - -| `successor_count` | Layout | -| ----------------- | ----------------------------------------------------------------------------------- | -| 0–8 | `successor_data[0..count]` contains `TransitionId` values directly | -| > 8 | `successor_data[0]` is index into `successors` segment, `successor_count` is length | - -Why 8 slots: Moving `successor_count` into the metadata block frees 32 bytes for `successor_data`, giving 32 / 4 = 8 inline slots. 
- -Coverage: - -- Linear sequences: 1 successor -- Simple branches, quantifiers: 2 successors -- Most alternations: 2–8 branches - -Only massive alternations (9+ branches) spill to the external buffer. - -Cache benefits: - -- 64 bytes = L1 cache line on x86/ARM64 -- No transition straddles cache lines -- No pointer chase for 99%+ of transitions - -### Matcher - -```rust -#[repr(C, u32)] -enum Matcher { - Epsilon, - Node { - kind: NodeTypeId, // 2 - field: Option<NodeFieldId>, // 2 - negated_fields: Slice<NodeFieldId>, // 8 (align 4, starts at offset 8) - }, - Anonymous { - kind: NodeTypeId, // 2 - field: Option<NodeFieldId>, // 2 - negated_fields: Slice<NodeFieldId>, // 8 (align 4, starts at offset 8) - }, - Wildcard, -} -// 16 bytes, align 4 (discriminant 4 + payload 12, but payload naturally aligns) -``` - -`Option<NodeFieldId>` uses 0 for `None` (niche optimization). - -Navigation (descend/ascend) is handled by `Nav`, not matchers. Matchers are purely for node matching. - -### RefTransition - -```rust -#[repr(C, u8)] -enum RefTransition { - None, - Enter(RefId), // push call frame with returns - Exit(RefId), // pop frame, use stored returns -} -// 4 bytes, align 2 -``` - -Layout: 1-byte discriminant + 1-byte padding + 2-byte `RefId` payload = 4 bytes. Alignment is 2 (from `RefId: u16`). Fits comfortably in the 64-byte `Transition` struct with room to spare. - -Explicit `None` ensures stable binary layout (`Option` niche is unspecified). - -**RefId semantics**: `RefId` is a unique identifier assigned per definition reference during graph construction. It is **not** an index into the `Entrypoints` table (which is for named exports). The actual jump target comes from `successors()[0]` of the `Enter` transition. `RefId` exists solely to verify that `Exit(id)` matches the corresponding `Enter(id)` at runtime—a mismatch indicates an IR bug. - -### Enter/Exit Semantics - -**Problem**: A definition can be called from multiple sites. 
Naively, `Exit.next` would contain all possible return points from all call sites, requiring O(N) filtering at runtime to find which return is valid for the current call. - -**Solution**: Store return transitions at `Enter` time (in the call frame), retrieve at `Exit` time. O(1) exit, no filtering. - -For `Enter(ref_id)` transitions, the **logical** successor list (accessed via `TransitionView::successors()`) has special structure: - -- `successors()[0]`: definition entry point (where to jump) -- `successors()[1..]`: return transitions (stored in call frame) - -This structure applies to the view, not raw `successor_data` memory. The SSO optimization (inline vs spilled storage) is orthogonal—the view abstracts it away. An `Enter` with 8+ returns spills to the external segment like any other transition; the interpreter accesses the logical list uniformly. - -For `Exit(ref_id)` transitions, successors are **ignored**. Return transitions come from the call frame pushed at `Enter`. See [ADR-0006](ADR-0006-dynamic-query-execution.md) for execution details. - -``` -Call site: -T1: ε + Enter(Func) successors=[T10, T2, T3] - │ └─────┴─── return transitions (stored in frame) - └─────────────── definition entry - -Definition: -T10: Match(...) successors=[T11] -T11: ε + Exit(Func) successors=[] (ignored, returns from frame) -``` - -### EffectOp - -```rust -#[repr(C, u16)] -enum EffectOp { - CaptureNode, // store matched node as current value - StartArray, - PushElement, - EndArray, - StartObject, - EndObject, - SetField(DataFieldId), - PushField(DataFieldId), - StartVariant(VariantTagId), - EndVariant, - ToString, -} -// 4 bytes, align 2 -``` - -**Graph construction invariant**: `CaptureNode` may only appear in the effects list of a transition where `matcher` is `Node`, `Anonymous`, or `Wildcard`. Placing `CaptureNode` on an `Epsilon` transition is illegal—graph construction must enforce this at build time. 
- -### View Types - -```rust -struct TransitionView<'a> { - query: &'a CompiledQuery, - raw: &'a Transition, -} - -struct MatcherView<'a> { - query: &'a CompiledQuery, - raw: &'a Matcher, -} - -enum MatcherKind { Epsilon, Node, Anonymous, Wildcard } -``` - -Views resolve `Slice` to `&[T]`. `TransitionView::successors()` returns `&[TransitionId]`, hiding the inline/spilled distinction—callers see a uniform slice regardless of storage location. Engine code never touches offsets or `successor_data` directly. - -### Quantifiers - -Examples in this section show graph structure and effects. Navigation (`nav`) is omitted for brevity—see [ADR-0008](ADR-0008-tree-navigation.md) for full transition examples with navigation. - -**Greedy `*`**: - -``` - ┌─────────────────┐ - ↓ │ -Entry ─ε→ Branch ─ε→ Match ─┘ - │ - └─ε→ Exit - -Branch.next = [match, exit] -``` - -**Greedy `+`**: - -``` - ┌─────────────────┐ - ↓ │ -Entry ─→ Match ─ε→ Branch ─┘ - │ - └─ε→ Exit - -Branch.next = [match, exit] -``` - -**Non-greedy `*?`/`+?`**: Same, but `Branch.next = [exit, match]`. - -### Example: Array - -Query: `(parameters (identifier)* @params)` - -Before elimination: - -``` -T0: ε [StartArray] → [T1] -T1: ε (branch) → [T2, T4] -T2: Match(identifier) [CaptureNode] → [T3] -T3: ε [PushElement] → [T1] -T4: ε [EndArray] → [T5] -T5: ε [Field("params")] → [...] -``` - -After: - -``` -T2': Match(identifier) [StartArray, CaptureNode, PushElement] → [T2', T4'] -T4': ε [EndArray, Field("params")] → [...] -``` - -First iteration gets `StartArray` from T0's path. Loop iterations skip it. Note T4' remains epsilon—effects cannot merge into T2' without breaking semantics. - -### Example: Object - -Query: `{ (identifier) @name (number) @value } @pair` - -``` -T0: ε + StartObject → [T1] -T1: Match(identifier) → [T2] -T2: ε + SetField("name") → [T3] -T3: Match(number) → [T4] -T4: ε + SetField("value") → [T5] -T5: ε + EndObject → [T6] -T6: ε + SetField("pair") → [...] 
-``` - -### Example: Tagged Alternation - -Query: `[ A: (true) @val B: (false) @val ]` - -``` -T0: ε (branch) → [T1, T4] -T1: ε + StartVariant("A") → [T2] -T2: Match(true) → [T3] -T3: ε + SetField("val") + EndVariant → [T7] -T4: ε + StartVariant("B") → [T5] -T5: Match(false) → [T6] -T6: ε + SetField("val") + EndVariant → [T7] -``` - -### Epsilon Elimination - -Partial—full elimination impossible due to single `ref_marker` and effect ordering constraints. - -**Execution order** (all transitions, including epsilon): - -1. Execute `nav` and matcher -2. On success: emit `effects` in order - -With explicit `CaptureNode`, effect order is unambiguous. When eliminating epsilon chains, concatenate effect lists in traversal order. - -**When epsilon nodes must remain**: - -1. **Ref markers**: A transition can hold at most one `Enter`/`Exit`. Sequences like `Enter(A) → Enter(B)` need epsilon. -2. **Branch points**: An epsilon with multiple successors cannot merge into predecessors without duplicating effects. -3. **Effect ordering conflicts**: When incoming and outgoing effects cannot be safely reordered. - -Example of safe elimination: - -``` -Before: -T1: Match(A) [CaptureNode] → [T2] -T2: ε [PushElement] → [T3] -T3: Match(B) [CaptureNode, SetField("b")] → [...] - -After: -T3': Match(B) [PushElement, CaptureNode, SetField("b")] → [...] -``` - -`PushElement` consumes T1's captured value before T3 overwrites `current`. - -## Consequences - -**Positive**: No state objects. Cache-line aligned 64-byte transitions eliminate cache straddling. Inline successors remove pointer chasing for common cases. Views hide offset arithmetic and inline/spilled distinction. - -**Negative**: Single `ref_marker` leaves some epsilon chains. 33% size increase over minimal layout (acceptable for KB-scale query binaries). 
- -## References - -- [ADR-0004: Query IR Binary Format](ADR-0004-query-ir-binary-format.md) -- [ADR-0006: Dynamic Query Execution](ADR-0006-dynamic-query-execution.md) -- [ADR-0007: Type Metadata Format](ADR-0007-type-metadata-format.md) diff --git a/docs/adr/ADR-0006-dynamic-query-execution.md b/docs/adr/ADR-0006-dynamic-query-execution.md deleted file mode 100644 index a755d4a4..00000000 --- a/docs/adr/ADR-0006-dynamic-query-execution.md +++ /dev/null @@ -1,286 +0,0 @@ -# ADR-0006: Query Execution - -- **Status**: Accepted -- **Date**: 2024-12-12 -- **Supersedes**: Parts of ADR-0003 - -## Context - -Runtime execution of the transition graph ([ADR-0005](ADR-0005-transition-graph-format.md)). Proc-macro compilation is a future ADR. - -## Decision - -### Execution Order - -For each transition: - -1. Execute `nav` initial movement (e.g., goto_first_child, goto_next_sibling) -2. Search loop: try matcher, on fail apply skip policy (advance or fail) -3. On match success: store matched node, execute `effects` sequentially -4. Process `ref_marker` (see below) -5. Process successors with backtracking - -For `Up*` variants, step 2 becomes: validate exit constraint, ascend N levels (no search loop). - -**RefTransition handling** (step 4): - -- `None`: no action, proceed to step 5 -- `Enter(ref_id)`: push frame onto `FrameArena`, store `successors()[1..]` as returns, then jump to `successors()[0]` (definition entry)—step 5 is skipped -- `Exit(ref_id)`: verify `ref_id` matches current frame, pop frame, use stored returns as successors—step 5 uses these instead of the transition's own successors - -Navigation is fully determined by `nav`—no runtime dispatch based on previous matcher. See [ADR-0008](ADR-0008-tree-navigation.md) for detailed semantics. - -The matched node is stored in a temporary slot (`matched_node`) accessible to `CaptureNode` effect. Effects execute in order—`CaptureNode` reads from this slot and sets `executor.current`. 
- -**Slot invariant**: The `matched_node` slot is cleared (set to `None`) at the start of each transition execution, before `nav`. This prevents stale captures if a transition path has `Epsilon → CaptureNode` without a preceding match—such a path indicates a graph construction bug, and the clear-on-entry invariant ensures it manifests as a predictable panic rather than silently capturing a wrong node. - -### Effect Stream - -```rust -struct EffectStream<'a> { - ops: Vec, // effect log, backtrack via truncate - nodes: Vec>, // captured nodes, one per CaptureNode op -} -``` - -Effects are **recorded**, not eagerly executed. On match success, the transition's `effects` list is appended to `ops`. For each `CaptureNode`, the `matched_node` is also appended to `nodes`. - -On backtrack, both vectors truncate to their watermarks. On full match success, the executor replays `ops` sequentially, consuming from `nodes` for each `CaptureNode`. - -### Materializer - -Materializes effect stream into output value. 
- -```rust -struct Materializer<'a> { - current: Option>, - stack: Vec>, -} - -enum Value<'a> { - Node(Node<'a>), - String(String), - Array(Vec>), - Object(BTreeMap>), - Variant(VariantTagId, Box>), -} - -enum Container<'a> { - Array(Vec>), - Object(BTreeMap>), - Variant(VariantTagId), -} -``` - -| Effect | Action | -| ------------------- | ----------------------------------------- | -| `CaptureNode` | `current = Node(nodes.next())` (consumes) | -| `ClearCurrent` | `current = None` | -| `StartArray` | push `Array([])` onto stack | -| `PushElement` | move `current` into top array | -| `EndArray` | pop array into `current` | -| `StartObject` | push `Object({})` onto stack | -| `SetField(id)` | set field `id` to `current` | -| `PushField(id)` | append `current` to array at field `id` | -| `EndObject` | pop object into `current` | -| `StartVariant(tag)` | push `Variant(tag)` onto stack | -| `EndVariant` | pop, wrap `current`, set as current | -| `ToString` | replace `current` Node with text | - -`ClearCurrent` is emitted on skip paths for optional captures (`expr? @name`). When the optional is skipped, `ClearCurrent` ensures `current = None` before `SetField(id)` executes, producing the correct `None` value for the optional field. - -Invalid state = IR bug → panic. - -### QueryInterpreter - -```rust -struct QueryInterpreter<'a> { - query: &'a CompiledQuery, - checkpoints: CheckpointStack, - frames: FrameArena, - cursor: TreeCursor<'a>, // created at tree root, never reset - effects: EffectStream<'a>, -} -``` - -**Cursor constraint**: The cursor must be created once at the tree root and never call `reset()`. This preserves `descendant_index` validity for backtracking checkpoints. - -No `prev_matcher` tracking needed—each transition's `nav` encodes the exact navigation to perform. - -Two structures interact: backtracking can restore to a point inside a previously-exited call, so the frame arena must preserve frames. 
- -### Checkpoints - -```rust -struct CheckpointStack { - points: Vec<Checkpoint>, - max_frame_watermark: Option, // highest frame index referenced by any point -} - -struct Checkpoint { - cursor_checkpoint: u32, // tree-sitter descendant_index - effect_watermark: u32, - recursion_frame: Option, // saved frame index - prev_max_watermark: Option, // restore on pop for O(1) maintenance - transition_id: TransitionId, // source transition for alternatives - next_alt: u32, // index of next alternative to try -} -``` - -Alternatives are retrieved via `TransitionView::successors()[next_alt..]`. This avoids the `Slice` incompatibility with inline successors (SSO stores successors inside the `Transition` struct, not in the `Successors` segment). - -| Operation | Action | -| --------- | ------------------------------------------------------ | -| Save | `cursor_checkpoint = cursor.descendant_index()` — O(1) | -| Restore | `cursor.goto_descendant(cursor_checkpoint)` — O(depth) | - -Restore also truncates `effects` to `effect_watermark` and sets `frame_arena.current` to `recursion_frame`. - -### Recursion - -**Problem**: A definition can be called from N sites. Naively, Exit's successors contain all N return points, requiring O(N) filtering. - -**Solution**: Store returns in call frame at `Enter`, retrieve at `Exit`. O(1), no filtering. - -```rust -struct FrameArena { - frames: Vec<Frame>, // append-only, pruned by watermark - current: Option, // index into frames (the "stack pointer") -} - -struct Frame { - parent: Option, // index of caller's frame - ref_id: RefId, // verify Exit matches Enter - enter_transition: TransitionId, // to retrieve returns via successors()[1..] -} -``` - -Returns are retrieved via `TransitionView::successors()[1..]` on the `enter_transition`. Same rationale as `Checkpoint`—avoids `Slice` incompatibility with inline successors. - -**Append-only invariant**: Frames persist for backtracking correctness. On `Exit`, set `current` to parent index. 
Backtracking restores `current`; the original frame is still accessible via its index. - -**Frame pruning**: After `Exit`, frames at the arena top may be reclaimed if: - -1. Not the current frame (already exited) -2. Not referenced by any live backtrack point - -This bounds memory by `max(recursion_depth, backtrack_depth)` rather than total call count. Without pruning, `(Rule)*` over N items allocates N frames; with pruning, it remains O(1) for non-backtracking iteration. - -**O(1) watermark tracking**: Each checkpoint stores the previous `max_frame_watermark`, enabling O(1) restore on pop: - -```rust -impl CheckpointStack { - fn push(&mut self, mut point: Checkpoint) { - point.prev_max_watermark = self.max_frame_watermark; - if let Some(frame) = point.recursion_frame { - self.max_frame_watermark = Some(match self.max_frame_watermark { - Some(max) => max.max(frame), - None => frame, - }); - } - self.points.push(point); - } - - fn pop(&mut self) -> Option { - let point = self.points.pop()?; - self.max_frame_watermark = point.prev_max_watermark; - Some(point) - } -} - -fn prune_high_water_mark( - current: Option, - checkpoints: &CheckpointStack, -) -> Option { - match (current, checkpoints.max_frame_watermark) { - (None, None) => None, - (Some(c), None) => Some(c), - (None, Some(m)) => Some(m), - (Some(c), Some(m)) => Some(c.max(m)), - } -} -``` - -Frames with index > high-water mark can be truncated. - -**Why not just check the last backtrack point?** Backtrack points are _not_ chronologically ordered by frame depth. After an Enter-Exit sequence, a new backtrack point may reference a shallower frame than earlier points: - -``` -1. Enter(A) → frames=[F0], current=0 -2. Save BP1 → BP1.recursion_frame = Some(0) -3. Exit(A) → current = None -4. Save BP2 → BP2.recursion_frame = None - -# BP2 is last, but BP1 still references F0 -# Checking only last point would incorrectly allow pruning F0 -``` - -The `max_frame_watermark` tracks the true maximum across all live points. 
Both push and pop are O(1)—each checkpoint stores the previous max, so pop simply restores it without scanning. - -| Operation | Action | -| ------------------ | ------------------------------------------------------------------------------ | -| `Enter(ref_id)` | Push frame (parent = `current`), set `current = len-1`, follow `successors[0]` | -| `Exit(ref_id)` | Verify ref_id, set `current = frame.parent`, continue with `frame.returns` | -| Save checkpoint | Store `current` | -| Restore checkpoint | Set `current` to saved value | - -**Why index instead of depth?** Using logical depth breaks on Enter-Exit-Enter sequences: - -``` -Main = [(A) (B)] -A = (identifier) -B = (number) -Input: boolean - -# Broken (depth-based): -1. Save BP depth=0 -2. Enter(A) push FA, depth=1 -3. Match identifier ✗ -4. Exit(A) depth=0 -5. Restore BP depth=0 -6. Enter(B) push FB, frames=[FA,FB], depth=1 -7. frames[depth-1] = FA, not FB! ← wrong frame - -# Correct (index-based): -1. Save BP current=None -2. Enter(A) push FA{parent=None}, current=0 -3. Match identifier ✗ -4. Exit(A) current=None -5. Restore BP current=None -6. Enter(B) push FB{parent=None}, current=1 -7. frames[current] = FB ✓ -``` - -Frames form a forest of call chains. Each checkpoint references an exact frame, not a depth. - -### Atomic Groups (Future) - -Cut/commit (discard checkpoints) works correctly: unreachable frames become garbage but cause no issues. - -### Variant Serialization - -```json -{ "$tag": "A", "$data": { ... } } -``` - -`$tag`/`$data` avoid capture name collisions. - -### Fuel - -- `transition_fuel`: decremented per transition -- `recursion_fuel`: decremented per `Enter` - -Details deferred. - -## Consequences - -**Positive**: Append-only stacks make backtracking trivial. O(1) exit via stored returns. Navigation fully determined by `nav`—no state tracking between transitions. - -**Negative**: Interpretation overhead. Recursion stack memory grows monotonically (bounded by `recursion_fuel`). 
- -## References - -- [ADR-0004: Query IR Binary Format](ADR-0004-query-ir-binary-format.md) -- [ADR-0005: Transition Graph Format](ADR-0005-transition-graph-format.md) -- [ADR-0007: Type Metadata Format](ADR-0007-type-metadata-format.md) -- [ADR-0008: Tree Navigation](ADR-0008-tree-navigation.md) diff --git a/docs/adr/ADR-0007-type-metadata-format.md b/docs/adr/ADR-0007-type-metadata-format.md deleted file mode 100644 index 8d8f3dbd..00000000 --- a/docs/adr/ADR-0007-type-metadata-format.md +++ /dev/null @@ -1,197 +0,0 @@ -# ADR-0007: Type Metadata Format - -- **Status**: Accepted -- **Date**: 2025-01-13 - -## Context - -Query execution produces structured values via the effect stream ([ADR-0006](ADR-0006-dynamic-query-execution.md)). Type metadata enables: - -- **Code generation**: Emit Rust structs, TypeScript interfaces, Python dataclasses -- **Validation**: Verify effect stream output matches expected shape (debug/test builds) -- **Tooling**: IDE completions, documentation generation - -Type metadata is descriptive, not prescriptive. Transitions define execution semantics; types describe what transitions produce. - -**Cache efficiency goal**: Proc macro compilation inlines query logic as native instructions (I-cache), leaving D-cache exclusively for tree-sitter cursor traversal. Type metadata is consumed at compile time, not runtime. - -## Decision - -### TypeId - -```rust -type TypeId = u16; - -const TYPE_VOID: TypeId = 0; // definition captures nothing -const TYPE_NODE: TypeId = 1; // AST node reference (see "Node Semantics" below) -const TYPE_STR: TypeId = 2; // extracted source text (:: string) -// 3..0xFFFE: composite types (index into type_defs + 3) -const TYPE_INVALID: TypeId = 0xFFFF; // error sentinel during inference -``` - -Type alias declared in [ADR-0004](ADR-0004-query-ir-binary-format.md); constants and semantics here. - -Primitives exist only as TypeId values—no TypeDef entries. Composite types start at ID 3. 
- -### Node Semantics - -`TYPE_NODE` represents a platform-dependent handle to a tree-sitter AST node: - -| Context | Representation | -| ---------- | ---------------------------------------------------------- | -| Rust | `tree_sitter::Node<'tree>` (lifetime-bound reference) | -| TypeScript | Binding-provided object with `startPosition`, `text`, etc. | -| Text/JSON | Unique node identifier (e.g., `"node:42"` or path-based) | - -The handle provides access to node metadata (kind, span, text) without copying the source. Lifetime management is platform-specific—Rust enforces it statically, bindings may use reference counting or arena allocation. - -### TypeDef - -```rust -#[repr(C)] -struct TypeDef { - kind: TypeKind, // 1 - _pad: u8, // 1 - name: StringId, // 2 - synthetic or explicit, 0xFFFF for wrappers - members: Slice, // 8 - see interpretation below -} -// 12 bytes, align 4 -``` - -The `members` field has dual semantics based on `kind`: - -| Kind | `members.start_index` | `members.len` | -| ---------------------------------- | ----------------------- | ------------- | -| Wrappers (Optional/Array\*/Array+) | Inner `TypeId` (as u32) | 0 | -| Composites (Struct/Enum) | Index into type_members | Member count | - -This reuses `Slice` for consistency with [ADR-0005](ADR-0005-transition-graph-format.md), while keeping TypeDef compact. - -### TypeKind - -```rust -#[repr(C, u8)] -enum TypeKind { - Optional = 0, // T? — members.start = inner TypeId - ArrayStar = 1, // T* — members.start = element TypeId - ArrayPlus = 2, // T+ — members.start = element TypeId - Struct = 3, // struct — members = slice into type_members - Enum = 4, // tagged union — members = slice into type_members -} -``` - -| Kind | Query Syntax | Semantics | -| --------- | ------------------- | -------------------------------- | -| Optional | `expr?` | Nullable wrapper | -| ArrayStar | `expr*` | Zero or more elements | -| ArrayPlus | `expr+` | One or more elements (non-empty) | -| Struct | `{ ... 
} @name` | Named fields | -| Enum | `[ A: ... B: ... ]` | Tagged union (discriminated) | - -### TypeMember - -Shared structure for Struct fields and Enum variants: - -```rust -#[repr(C)] -struct TypeMember { - name: StringId, // 2 - field name or variant tag - ty: TypeId, // 2 - field type or variant payload (TYPE_VOID for unit) -} -// 4 bytes, align 2 -``` - -### Synthetic Naming - -When no explicit `:: TypeName` annotation exists, names are synthesized: - -| Context | Pattern | Example | -| -------------------- | --------------- | ---------------------------------------- | -| Definition | Definition name | `Func` | -| Captured sequence | `{Def}{Field}` | `FuncParams` for `@params` in `Func` | -| Captured alternation | `{Def}{Field}` | `FuncBody` for `@body` in `Func` | -| Variant payload | `{Parent}{Tag}` | `FuncBodyStmt` for `Stmt:` in `FuncBody` | - -Collisions resolved by numeric suffix: `FuncBody`, `FuncBody2`, etc. - -### Single-Capture Variant Flattening - -When an enum variant's branch has exactly one capture, the variant payload flattens to that capture's type directly—no wrapper struct. - -| Branch Captures | Variant Payload | -| --------------- | -------------------------- | -| 0 | Unit (Void) | -| 1 | Capture's type (flattened) | -| ≥2 | Struct with named fields | - -Rationale: The variant tag already discriminates; a single-field wrapper struct adds verbosity without information. 
- -### Example - -Query: - -``` -Func = (function_declaration - name: (identifier) @name :: string - body: [ - Stmt: (statement) @stmt - Expr: (expression) @expr - ] @body -) -``` - -Type graph: - -``` -T3: Struct "Func" → [name: Str, body: T4] -T4: Enum "FuncBody" → [Stmt: Node, Expr: Node] // flattened: 1 capture per branch - -Entrypoint: Func → result_type: T3 -``` - -Generated TypeScript: - -```typescript -interface Func { - name: string; - body: { $tag: "Stmt"; $data: Node } | { $tag: "Expr"; $data: Node }; -} -``` - -Generated Rust: - -```rust -enum FuncBody { - Stmt(Node), - Expr(Node), -} -``` - -### Validation - -Optional runtime check for debugging: - -```rust -fn validate(value: &Value, expected: TypeId, query: &CompiledQuery) -> Result<(), TypeError>; -``` - -Walk the `Value` tree, verify shape matches `TypeId`. Mismatch indicates IR construction bug—panic in debug, skip in release. - -## Consequences - -**Positive**: - -- Single IR serves interpreter, proc macro codegen, and external tooling -- Language-agnostic: same metadata generates Rust, TypeScript, Python, etc. -- Self-contained queries enable caching by input hash (`~/.cache/plotnik/`) - -**Negative**: - -- Synthetic names can be verbose for deeply nested structures -- KB-scale overhead for complex queries (acceptable) - -## References - -- [ADR-0004: Query IR Binary Format](ADR-0004-query-ir-binary-format.md) -- [ADR-0005: Transition Graph Format](ADR-0005-transition-graph-format.md) -- [ADR-0006: Dynamic Query Execution](ADR-0006-dynamic-query-execution.md) diff --git a/docs/adr/ADR-0008-tree-navigation.md b/docs/adr/ADR-0008-tree-navigation.md deleted file mode 100644 index e783ab92..00000000 --- a/docs/adr/ADR-0008-tree-navigation.md +++ /dev/null @@ -1,336 +0,0 @@ -# ADR-0008: Tree Navigation - -- **Status**: Accepted -- **Date**: 2025-01-13 - -## Context - -Plotnik's query execution engine ([ADR-0006](ADR-0006-dynamic-query-execution.md)) navigates tree-sitter syntax trees. 
This ADR covers: - -1. Which tree-sitter API to use (TreeCursor vs Node) -2. How `Nav` encodes navigation and anchor constraints -3. How transitions execute navigation deterministically - -Key insight: navigation decisions can be resolved at graph construction time, not runtime. Each transition carries its own `Nav` instruction—no need to track previous matcher state. - -## Decision - -### API Choice: TreeCursor with `descendant_index` Checkpoints - -```rust -struct InterpreterState<'tree> { - cursor: TreeCursor<'tree>, // created once at tree root, never reset -} - -struct BacktrackCheckpoint { - descendant_index: u32, // 4 bytes, O(1) save - // ... other state from ADR-0006 -} -``` - -**Critical constraint**: The cursor must be created at the tree root and never call `reset()`. The `descendant_index` is relative to the cursor's root—`reset(node)` invalidates all checkpoints. - -### Nav - -Navigation and anchor constraints unified into a single enum: - -```rust -#[repr(C)] -struct Nav { - kind: NavKind, // 1 byte - level: u8, // 1 byte - ascent level count for Up*, ignored otherwise -} -// 2 bytes total - -#[repr(u8)] -enum NavKind { - // No movement (first transition only, cursor at root) - Stay = 0, - - // Sibling traversal (horizontal) - Next = 1, // skip any nodes to find match - NextSkipTrivia = 2, // skip trivia only, fail if non-trivia skipped - NextExact = 3, // no skipping, current sibling must match - - // Enter children (descend) - Down = 4, // skip any among children - DownSkipTrivia = 5, // skip trivia only among children - DownExact = 6, // first child must match, no skip - - // Exit children (ascend) - Up = 7, // ascend `level` levels, no constraint - UpSkipTrivia = 8, // validate last non-trivia, ascend `level` levels - UpExact = 9, // validate last child, ascend `level` levels -} -``` - -For non-Up variants, `level` is ignored (conventionally 0). For Up variants, `level >= 1`. 
- -**Design note**: Multi-level `Up(n)` with n>1 is an optimization for the common case (no intermediate anchors). When anchors exist at intermediate nesting levels, decompose into separate `Up*` transitions at each level. - -### Trivia - -**Trivia** = anonymous nodes + language-specific trivia named nodes (e.g., `comment`). - -The trivia kinds list is populated from the `Lang` binding during IR construction and stored in the `trivia_kinds` segment ([ADR-0004](ADR-0004-query-ir-binary-format.md)). Zero offset means no trivia kinds. - -**Skip invariant**: A node is never skipped if its kind matches the current transition's matcher target. This ensures `(comment)` explicitly in a query still matches comment nodes, even though comments are typically ignored. - -### Execution Semantics - -Navigation and matching are intertwined in a search loop. The `Nav` determines initial movement and skip policy for the loop. - -**Stay**: No cursor movement. Used only for the first transition when cursor is already positioned at root. Then attempt match. - -**Next variants**: Move to next sibling, enter search loop: - -- `Next`: Try match; on fail, advance to next sibling and retry; exhausted → fail -- `NextSkipTrivia`: Try match; on fail, if current node is non-trivia → fail, else advance and retry -- `NextExact`: Try match; on fail → fail (no retry) - -**Down variants**: Move to first child, enter search loop: - -- `Down`: Try match; on fail, advance to next sibling and retry; exhausted → fail -- `DownSkipTrivia`: Try match; on fail, if current node is non-trivia → fail, else advance and retry -- `DownExact`: Try match; on fail → fail (no retry) - -**Up variants**: Validate exit constraint, then ascend N levels (no search loop): - -- `Up`: No constraint, ascend -- `UpSkipTrivia`: Fail if non-trivia siblings follow current position, then ascend -- `UpExact`: Fail if any siblings follow current position, then ascend - -Example: `(foo (bar))` matching `(foo (foo) (foo) (bar))`: - -1. 
`[Down]` → goto_first_child (cursor at first `foo` child) -2. Try match `bar` → fail -3. Mode is `Down` (skip any) → goto_next_sibling (cursor at second `foo`) -4. Try match `bar` → fail -5. goto_next_sibling (cursor at `bar`) -6. Try match `bar` → success, exit loop - -### Skip Mode Symmetry - -| Mode | Entry/Search (Next/Down) | Exit (Up) | -| ---------- | --------------------------------------- | -------------------------------- | -| None | skip any nodes | no constraint on siblings | -| SkipTrivia | skip trivia, fail if non-trivia skipped | must be at last non-trivia child | -| Exact | no skip, immediate position | must be at last child | - -### Anchor Lowering - -The anchor operator (`.`) in the query language compiles to `Nav` variants: - -| Query Pattern | Nav on Following Transition | -| -------------------- | ---------------------------- | -| `(foo) (bar)` | `Next` | -| `(foo) . (bar)` | `NextSkipTrivia` | -| `"x" . (bar)` | `NextExact` | -| `(parent (child))` | `Down` on child's transition | -| `(parent . (child))` | `DownSkipTrivia` | -| `(parent (child) .)` | `UpSkipTrivia` on exit | -| `(parent "x" .)` | `UpExact` on exit | - -Mode determined by what **precedes** the anchor: - -| Precedes `.` | Mode | -| -------------------------------- | ---------- | -| Named node `(foo)`, wildcard `_` | SkipTrivia | -| String literal `"foo"` | Exact | -| Start of children (prefix `.`) | SkipTrivia | - -### Multi-Level Ascent - -Closing multiple nesting levels uses `Up` with a level count. For `(a (b (c (d))))`: - -``` -T3: [Down] Node(d) → T4 -T4: [Up level=3] Epsilon → Accept -``` - -When anchors exist at intermediate levels, decompose. For `(a (b (c) .) .)`: - -``` -T2: [Down] Node(c) → T3 -T3: [UpSkipTrivia] Epsilon → T4 // c must be last non-trivia in b -T4: [UpSkipTrivia] Epsilon → Accept // b must be last non-trivia in a -``` - -Cannot combine into `UpSkipTrivia(2)` because constraints apply at each level. - -### Execution Flow - -``` -1. 
MOVE nav → initial cursor movement -2. SEARCH loop: try matcher, on fail check skip policy, advance or fail -3. EFFECTS on match success: execute effects list (including explicit CaptureNode) -``` - -For `Up*` variants, step 2 is replaced by: validate exit constraint, ascend N levels. - -No post-validation phase. Exit constraints are front-loaded into `Up*` variants. - -### Field Handling - -**Field constraints** are part of the match attempt within the search loop. A node that doesn't satisfy field constraints is treated as a match failure, triggering the skip policy: - -```rust -// Inside search loop, before structural match: -if let Some(required) = pattern.field { - if cursor.field_id() != Some(required) { - // Field mismatch = match fail, apply skip policy - continue; - } -} -// Then check node kind, negated fields, etc. -``` - -**Negated fields** are also part of match—checked after field/kind match succeeds: - -```rust -// After node kind matches: -for &fid in pattern.negated_fields { - if node.child_by_field_id(fid).is_some() { - // Negated field present = match fail, apply skip policy - continue; - } -} -// Match succeeds, exit search loop -``` - -### Examples - -**Simple**: `(function (identifier) @name)` - -``` -T0: [Stay] Node(function) → T1 -T1: [Down] Node(identifier) [CaptureNode] → T2 -T2: [Up] Epsilon [Field("name")] → Accept -``` - -**Anchored first child**: `(function . (identifier))` - -``` -T0: [Stay] Node(function) → T1 -T1: [DownSkipTrivia] Node(identifier) → T2 -T2: [Up] Epsilon → Accept -``` - -**Anchored last child**: `(function (identifier) .)` - -``` -T0: [Stay] Node(function) → T1 -T1: [Down] Node(identifier) → T2 -T2: [UpSkipTrivia] Epsilon → Accept -``` - -**Siblings**: `(block (stmt) (stmt))` - -``` -T0: [Stay] Node(block) → T1 -T1: [Down] Node(stmt) → T2 -T2: [Next] Node(stmt) → T3 -T3: [Up] Epsilon → Accept -``` - -**Adjacent siblings**: `(block (stmt) . 
(stmt))` - -``` -T0: [Stay] Node(block) → T1 -T1: [Down] Node(stmt) → T2 -T2: [NextSkipTrivia] Node(stmt) → T3 -T3: [Up] Epsilon → Accept -``` - -**Deep nesting**: `(a (b (c (d))))` - -``` -T0: [Stay] Node(a) → T1 -T1: [Down] Node(b) → T2 -T2: [Down] Node(c) → T3 -T3: [Down] Node(d) → T4 -T4: [Up level=3] Epsilon → Accept -``` - -**Mixed anchors**: `(a (b) . (c) .)` - -``` -T0: [Stay] Node(a) → T1 -T1: [Down] Node(b) → T2 -T2: [NextSkipTrivia] Node(c) → T3 // . before (c): adjacent to b -T3: [UpSkipTrivia] Epsilon → Accept // . after (c): c is last non-trivia -``` - -**Intermediate anchor**: `(foo (foo (bar) .)) (baz)` - -``` -T0: [Stay] Node(foo_outer) → T1 -T1: [Down] Node(foo_inner) → T2 -T2: [Down] Node(bar) → T3 -T3: [UpSkipTrivia] Epsilon → T4 // bar must be last non-trivia in foo_inner -T4: [Up] Epsilon → T5 // no constraint on foo_inner in foo_outer -T5: [Next] Node(baz) → Accept -``` - -## Alternatives Considered - -### Pure Node API - -Rejected: `next_sibling()` is O(siblings), no efficient backtracking. - -### Cursor Cloning - -Rejected: `TreeCursor::clone()` heap-allocates, O(depth) memory per checkpoint. - -### Runtime Navigation Dispatch - -Previous design used `(prev_matcher, curr_matcher)` pairs to determine movement at runtime. Rejected: - -- Required tracking `prev_matcher` in interpreter state and backtrack checkpoints -- Complex dispatch table -- Navigation decisions can be resolved at compile time - -### Separate Post-Anchor Validation - -Previous design had `post_anchor` field validated after match. 
Rejected: - -- Extra phase in execution loop -- Exit constraints naturally encode as `Up*` variants -- "Must be last child" is validated before ascending, not after matching - -## Complexity Analysis - -| Operation | Cursor | Node | -| ----------------------- | ------------ | ----------- | -| `goto_first_child()` | O(1) | — | -| `goto_next_sibling()` | O(1) | O(siblings) | -| `goto_parent()` | O(1) | O(1) | -| `field_id()` | O(field_map) | — | -| `child_by_field_id(id)` | — | O(children) | -| `descendant_index()` | O(1) | — | -| `goto_descendant(idx)` | O(depth) | — | - -- Checkpoint save: O(1) -- Checkpoint restore: O(depth)—cold path only - -## Consequences - -**Positive**: - -- O(1) sibling traversal -- 4-byte checkpoints -- No `prev_matcher` tracking—navigation fully determined by `Nav` -- Simpler execution loop: navigate → search → match (no post-validation) -- Anchor constraints resolved at graph construction time - -**Negative**: - -- Single cursor constraint requires careful state management -- O(depth) restore cost on backtrack -- Intermediate anchors prevent multi-level `Up(n)` optimization - -## References - -- [ADR-0005: Transition Graph Format](ADR-0005-transition-graph-format.md) -- [ADR-0006: Dynamic Query Execution](ADR-0006-dynamic-query-execution.md) -- `tree-sitter/lib/src/tree_cursor.c` diff --git a/docs/adr/ADR-0009-type-system.md b/docs/adr/ADR-0009-type-system.md deleted file mode 100644 index e6c85b61..00000000 --- a/docs/adr/ADR-0009-type-system.md +++ /dev/null @@ -1,426 +0,0 @@ -# ADR-0009: Type System - -- **Status**: Superseded by [ADR-0010](ADR-0010-type-system-v2.md) -- **Date**: 2025-01-14 - -## Context - -Type inference transforms a query into typed structures. This ADR formalizes the inference rules with a unified conceptual model. - -## Decision - -### Core Principle - -The type system reduces to two orthogonal concepts: - -1. **Scope boundaries** — where captures land -2. 
**Payload rule** — what type a scope produces - -> Captures bubble up to the nearest scope boundary; each scope's type is determined by its capture count and scope kind. - -### Type Universe - -``` -τ ::= Void -- no captures (TypeId = 0) - | Node -- AST node reference (TypeId = 1) - | String -- extracted source text (TypeId = 2) - | Optional(τ) -- zero or one - | ArrayStar(τ) -- zero or more - | ArrayPlus(τ) -- one or more - | Struct(fields) -- named fields - | Enum(variants) -- tagged union -``` - -### Captures - -A capture `@name` creates a field that bubbles up to the nearest enclosing scope. - -| Pattern | Field Type | -| --------------- | -------------------- | -| `(node) @x` | `Node` | -| `"literal" @x` | `Node` | -| `@x ::string` | `String` | -| `@x ::TypeName` | `TypeName` (nominal) | -| `{...} @x` | scope payload | -| `[...] @x` | scope payload | - -### Scope Boundaries - -**Golden rule**: `{}` and `[]` create a scope **only when captured**. - -Scopes are created by: - -1. **Definition root**: `Def = expr` — always a scope -2. **Captured sequence**: `{...} @name` — creates Struct scope -3. **Captured tagged alternation**: `[A: ... B: ...] @name` — creates Enum scope -4. **Captured untagged alternation**: `[...] @name` — creates Struct scope (merged fields) -5. **QIS** (Quantifier-Induced Scope): auto-created when quantifier has ≥2 captures -6. **Reference**: `(Def)` is opaque — blocks propagation entirely - -**Uncaptured containers are transparent**: - -- `{...}` without `@name` — captures pass through to outer scope -- `[...]` without `@name` — captures pass through (asymmetric ones become Optional) -- `[A: ... B: ...]` without `@name` — **tags ignored**, behaves like untagged - -### Payload Rule - -| Captures | Payload Type | -| -------- | ----------------------- | -| 0 | `Void` | -| 1 | unwrap OR `Struct` | -| ≥2 | `Struct { field, ... 
}` | - -**Unwrap applies to** (1 capture → capture's type directly): - -- Definition roots -- Enum variants -- QIS element types - -**Always Struct** (1 capture → `Struct { field }`): - -- Captured sequences `{...} @name` -- Captured untagged alternations `[...] @name` - -**Rationale**: Explicit `@name` on a container signals intent to preserve structure. Definition roots and enum variants unwrap because the container name (def name / variant tag) already provides context. - -### Reference Opacity - -References are opaque barriers. Calling `(Foo)` does NOT inherit `Foo`'s captures. - -```plotnik -A = (identifier) @name -B = (A) -C = (A) @node -``` - -Types: - -- `A` → `Node` (1 capture, unwrapped) -- `B` → `Void` (0 captures — A's captures don't leak) -- `C` → `Node` (1 capture of type `A`, which is `Node`) - -To access a definition's structure, capture it: `(Foo) @foo` yields a field of type `Foo`. - -### Flat Scoping Principle - -Query nesting does NOT create data nesting. Only scope boundaries matter: - -```plotnik -Query = (a (b (c) @val)) -``` - -Result: `Node` — the `(a ...)` and `(b ...)` wrappers contribute nothing. Single capture at def root unwraps. - -```plotnik -Query = (a (b (c) @x (d) @y)) -``` - -Result: `Struct { x: Node, y: Node }` — two captures form a struct. - -### Cardinality - -Cardinality describes how many values a capture produces: - -| Cardinality | Notation | Wrapper | -| ----------- | -------- | ----------- | -| Required | `1` | none | -| Optional | `?` | `Optional` | -| Star | `*` | `ArrayStar` | -| Plus | `+` | `ArrayPlus` | - -**Propagation through nesting** (outer × inner): - -``` - 1 × 1 = 1 ? × 1 = ? * × 1 = * + × 1 = + - 1 × ? = ? ? × ? = ? * × ? = * + × ? = * - 1 × * = * ? × * = * * × * = * + × * = * - 1 × + = + ? × + = * * × + = * + × + = + -``` - -**Join** (merging branches with same capture): - -``` - + - /|\ - * | - \| - ? 
- | - 1 -``` - -When join produces array (`*`/`+`) but branch has scalar (`1`/`?`), compiler inserts lifting coercion to wrap in singleton array. - -### Alternation Semantics - -**Key insight**: Tags only matter when the alternation is captured. - -#### Uncaptured Alternation - -Captures propagate to parent scope. Asymmetric captures become `Optional`. Tags are ignored. - -```plotnik -// Tagged but uncaptured — tags ignored -Foo = [ A: (a) @x B: (b) @y ] -``` - -- `@x` only in A → `Optional(Node)` -- `@y` only in B → `Optional(Node)` -- Result: `Struct { x: Optional(Node), y: Optional(Node) }` - -```plotnik -// Symmetric captures -Bar = [ (a) @v (b) @v ] -``` - -- `@v` in all branches → `Node` (not Optional) -- Result: `Node` (1 capture at def root, unwraps) - -Diagnostic: warning for inline uncaptured tagged alternation (likely forgot `@name`). - -#### Captured Untagged Alternation - -Creates Struct scope. Branches merge. No unwrapping. - -```plotnik -Foo = [ (a) @x (b) @y ] @z -``` - -- `@z` creates Struct scope -- Merge: `{ x: Optional(Node), y: Optional(Node) }` -- Result: `Struct { z: Struct { x: Optional(Node), y: Optional(Node) } }` - -```plotnik -Bar = [ (a) @v (b) @v ] @z -``` - -- `@z` creates Struct scope -- Merge: `{ v: Node }` -- Always Struct (no unwrap): `Struct { v: Node }` -- Result: `Struct { z: Struct { v: Node } }` - -#### Captured Tagged Alternation - -Creates Enum scope. Each variant is independent, follows payload rule. - -```plotnik -Result = [ - Ok: (value) @val - Err: (error) @msg ::string -] @result -``` - -- Variant `Ok`: 1 capture → `Node` (unwrap) -- Variant `Err`: 1 capture → `String` (unwrap) -- Result: `Struct { result: Enum { Ok(Node), Err(String) } }` - -#### Tagged Alternation at Definition Root - -Special case: tagged alternation directly at definition root makes the definition itself an Enum. 
- -```plotnik -Result = [ - Ok: (value) @val - Err: (error) @msg ::string -] -``` - -- Result: `Enum Result { Ok(Node), Err(String) }` - -No wrapper struct — the definition IS the enum. - -### Unification Rules (Branch Merge) - -When merging captures across untagged alternation branches: - -**1-level merge semantics**: Top-level fields merge with optionality; nested struct mismatches are errors. - -``` -// OK: top-level field merge -Branch 1: { x: Node, y: Node } -Branch 2: { x: Node, z: String } -Result: { x: Node, y: Optional(Node), z: Optional(String) } - -// OK: nested structs identical -Branch 1: { data: { a: Node }, extra: Node } -Branch 2: { data: { a: Node } } -Result: { data: { a: Node }, extra: Optional(Node) } - -// ERROR: nested structs differ -Branch 1: { data: { a: Node } } -Branch 2: { data: { b: Node } } -→ Error: field `data` has incompatible struct types - -// ERROR: primitive mismatch -Branch 1: { val: String } -Branch 2: { val: Node } -→ Error: field `val` has incompatible types -``` - -**Rationale**: Deep recursive merging produces heavily-optional types, defeating typed extraction's purpose. Use tagged alternations for precise discrimination. - -### Quantifier-Induced Scope (QIS) - -When a quantified expression has **≥2 propagating captures**, QIS auto-creates a scope to keep values paired per-iteration. - -```plotnik -// 2 captures under quantifier → QIS triggers -Functions = (function - name: (identifier) @name - body: (block) @body -)* -``` - -- QIS creates element scope with 2 captures → Struct (always, by payload rule) -- Result: `ArrayStar(FunctionsItem)` where `FunctionsItem { name: Node, body: Node }` -- Definition has 1 propagating capture (the array) → unwrap -- Final: `Functions` is `ArrayStar(FunctionsItem)` - -```plotnik -// 1 capture → no QIS, standard cardinality multiplication -Items = { (item) @item }* -``` - -- No QIS (only 1 capture) -- `@item` gets cardinality `*` -- Result: `Node` would be wrong... 
actually 1 capture at def root -- Wait, the capture is `ArrayStar(Node)`, so def root has 1 "field" -- Result: `ArrayStar(Node)` (unwrapped) - -**Naming**: - -- At definition root: `{Def}Item` -- With explicit capture `E* @name`: `{Parent}{Name}` -- Neither (not at root, no capture): Error — require explicit `@name` - -### Synthetic Naming - -Types without explicit `::Name` receive synthetic names: - -| Context | Pattern | -| -------------------- | ----------------- | -| Definition root | `{DefName}` | -| Captured sequence | `{Def}{Capture}` | -| Captured alternation | `{Def}{Capture}` | -| Enum variant payload | `{Enum}{Variant}` | -| QIS element | `{Def}Item` | - -Collision resolution: append numeric suffix (`Foo`, `Foo2`, `Foo3`). - -### Error Conditions - -| Condition | Severity | Recovery | -| --------------------------------- | -------- | -------------------------- | -| Incompatible types in alternation | Error | Use invalid type, continue | -| Nested struct mismatch | Error | Use invalid type, continue | -| Duplicate capture in same scope | Error | Keep first | -| Inline uncaptured tagged alt | Warning | Treat as untagged | -| QIS without capture (not at root) | Error | Cannot infer element type | - -Error reporting is exhaustive: all incompatibilities across all branches are reported, not just the first. 
- -## Examples - -### Single Capture at Definition Root - -```plotnik -Name = (identifier) @name -``` - -- 1 capture at def root → unwrap -- Result: `Name` is `Node` - -### Multiple Captures at Definition Root - -```plotnik -Binding = (variable_declaration - name: (identifier) @name - value: (expression) @value -) -``` - -- 2 captures → Struct -- Result: `Binding { name: Node, value: Node }` - -### Captured vs Uncaptured Sequence - -```plotnik -// Captured sequence — creates scope, always Struct -Foo = { (bar) @bar } @baz -``` - -- `@bar` stays in `@baz`'s scope -- Captured sequence: always Struct -- Result: `Struct { baz: Struct { bar: Node } }` - -```plotnik -// Uncaptured sequence — transparent, captures pass through -Foo = { (bar) @bar } -``` - -- `{...}` without `@name` is transparent -- `@bar` bubbles up to definition root -- 1 capture at def root → unwrap -- Result: `Foo` is `Node` - -### Enum at Definition Root - -```plotnik -Boolean = [ - True: "true" - False: "false" -] -``` - -- Tagged alt at root, 0 captures per variant → Void -- Result: `Enum Boolean { True, False }` - -### Mixed Variant Payloads - -```plotnik -Expr = [ - Lit: (number) @value - Bin: (binary left: (_) @left right: (_) @right) -] -``` - -- `Lit`: 1 capture → unwrap → `Node` -- `Bin`: 2 captures → Struct -- Result: `Enum Expr { Lit(Node), Bin { left: Node, right: Node } }` - -### QIS with Multiple Captures - -```plotnik -Module = (module { - (function - name: (identifier) @name - params: (parameters) @params - )* -}) -``` - -- 2 captures under `*` → QIS triggers -- Element type: `ModuleItem { name: Node, params: Node }` -- Array propagates to def root (1 capture) → unwrap -- Result: `Module` is `ArrayStar(ModuleItem)` - -## Consequences - -**Positive**: - -- Golden rule ("only captured containers create scopes") is easy to remember -- Payload rule is uniform: 0→void, 1→unwrap, 2+→struct -- Exception for captured containers (always Struct) matches user intent -- "Tags only matter when 
captured" eliminates confusion - -**Negative**: - -- Field name loss on single-capture unwrap (mitigated by `::Type` annotation) -- 1-level merge is less flexible than deep merge (intentional trade-off) - -**Alternatives Considered**: - -- Always wrap in struct (rejected: verbose types like `{ val: Node }` instead of `Node`) -- Deep recursive merge (rejected: heavily-optional types defeat typed extraction) -- Error on uncaptured tagged alternations (rejected: too restrictive) diff --git a/docs/adr/ADR-0010-type-system-v2.md b/docs/adr/ADR-0010-type-system-v2.md deleted file mode 100644 index 3990d65e..00000000 --- a/docs/adr/ADR-0010-type-system-v2.md +++ /dev/null @@ -1,119 +0,0 @@ -# ADR-0010: Type System v2 (Transparent Graph Model) - -- **Status**: Accepted -- **Date**: 2025-01-14 -- **Supersedes**: ADR-0009 - -## Context - -The previous type system (ADR-0009) relied on implicit behaviors like "Quantifier-Induced Scope" (QIS) and "Single-Capture Unwrap" to reduce verbosity. While well-intentioned, these rules created "Wrapper Hell," where extracting logic into a reusable definition inadvertently changed the output structure. - -We need a model that supports **Mixin-like composition** (logic reuse without structural nesting) while maintaining strict type safety and data integrity. - -## Decision - -We adopt the **Transparent Graph Model**. - -### 1. Universal Bubbling ("Let It Bubble") - -Captures (`@name`) always bubble up to the nearest **Explicit Scope Boundary**. - -- **Private Definitions (`Def =`) are Transparent.** They act as macros or fragments. -- **Uncaptured Containers (`{...}`, `[...]`) are Transparent.** -- **References (`(Def)`) are Transparent.** - -This enables compositional patterns where a definition contributes fields to its caller's struct. - -### 2. Explicit Scope Boundaries - -A new data structure (Scope) is created **only** by explicit intent. - -1. **Public Roots:** `pub Def = ...` (The API Contract). -2. 
**Explicit Wrappers:** - - `{...} @name` (Nested Group). - - `[...] @name` (Nested Union). - - `[ L: ... ] @name` (Tagged Union). - -**Payload Rule**: - -- **0 Captures**: `Void` (Logic-only matcher). -- **1..N Captures**: `Struct { field_1, ..., field_N }`. -- **No Implicit Unwrap**: A single capture `(node) @x` produces `{ x: Node }`. It is never unwrapped to `Node`. - - _Benefit:_ Adding a second capture is non-breaking (`res.x` remains valid). - -### 3. Parallel Arrays (Columnar Output) - -Quantifiers (`*`, `+`) do **not** create implicit "Row Structs." instead, they change the cardinality of the bubbled fields to `Array`. - -**Example**: `( (A) @a (B) @b )*` -**Output**: `{ a: Array, b: Array }` (Struct of Arrays). - -This optimizes for the common case of data extraction (where SoA is often preferred) and avoids the complexity of implicit row creation. - -### 4. Row Integrity (Safety Check) - -To prevent **Data Desynchronization** (where `a[i]` no longer corresponds to `b[i]`), the Inference Pass enforces **Row Integrity**. - -**Rule**: A quantified scope cannot mix **Synchronized** and **Desynchronized** fields. - -- **Synchronized**: Field is strictly required (`1`) in the loop body. -- **Desynchronized**: Field is optional (`?`), repeated (`*`, `+`), or in an alternation. - -| Pattern | Fields | Status | Result | -| :--------------------- | :------------- | :----------- | :-------------- | -| `(A) @a (B) @b` | `a: 1`, `b: 1` | **Aligned** | ✅ OK (Columns) | -| `[ (A) @a \| (B) @b ]` | `a: ?`, `b: ?` | **Disjoint** | ✅ OK (Buckets) | -| `(A) @a (B)? @b` | `a: 1`, `b: ?` | **Mixed** | ❌ **Error** | - -**Error Message**: _"Field `@b` is optional while `@a` is required. Parallel arrays will not align. Wrap in `{...} @row` to enforce structure."_ - -### 5. 
Definition Roles - -| Feature | `Def` (Private) | `pub Def` (Public) | -| :----------------- | :---------------------------- | :---------------------- | -| **Concept** | **Fragment / Mixin** | **API Contract / Root** | -| **Graph Behavior** | Inlined (Copy-Paste) | Entrypoint | -| **Scoping** | Transparent (Captures bubble) | **Scope Boundary** | -| **Output Type** | Merges into parent | Named Interface | - -## Mental Model Migration - -| Old Way (Opaque) | New Way (Transparent) | -| :---------------- | :------------------------------------------- | -------------------------------------------------- | -| **Extract Def** | Broken `res.x`. Must rewrite as `res.def.x`. | Safe. `res.x` remains `res.x`. | -| **List of Items** | Implicit `RowStruct`. Hard to desync. | Explicit `Array, Array`. Enforced integrity. | -| **Collision** | Silent (Data Loss). | Compiler Error ("Duplicate Capture"). | -| **Fix Collision** | Manual re-capture. | Wrap: `{ (Def) } @alias`. | - -## Edge Cases - -### Recursive Definitions - -Since private definitions inline their contents, infinite recursion is structurally impossible for inlining. - -**Solution**: - -- Recursive definitions must be `pub` (creating a stable API boundary) OR wrapped in a capture at the call site `(Recurse) @next`. -- _Note: This is a natural constraint. Recursion implies a tree structure, so the output type must naturally reflect that tree structure._ - -### Collision Handling - -`A(B) = (node (B) (B))` - -- **Issue**: `B` captures `@id`. Using it twice causes "Duplicate Capture". -- **Solution**: User must disambiguate: `(node (B) @left (B) @right)`. -- **Benefit**: The output shape `{ left: {id}, right: {id} }` matches the semantic intent. - -## Consequences - -**Positive**: - -- **Refactoring Safety**: Extracting logic into a `Def` never changes the output shape. -- **Performance**: Parallel arrays (SoA) are cache-friendly and often what is needed for analysis. 
-- **Robustness**: The Row Integrity check prevents silent data corruption. -- **Simplicity**: No magic rules (QIS, Implicit Unwrap). - -**Negative**: - -- **Verbosity**: Must explicitly wrap `{...} @row` for list-of-structs. -- **Strictness**: "Mixed" optionality in loops is now a hard error, requiring explicit handling. diff --git a/docs/adr/README.md b/docs/adr/README.md deleted file mode 100644 index f2a74138..00000000 --- a/docs/adr/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# The Plotnik ADR System - -An ADR system documents important architectural decisions, their context, and their consequences. This helps maintain architectural consistency and provides valuable context for current and future contributors. - -## 1. Location - -As hinted at in your `AGENTS.md`, the best place for these is `docs/adr/`. - -## 2. Naming Convention - -Files should be named `ADR-XXXX-short-title-in-kebab-case.md`, where `XXXX` is a sequential number (e.g., `0001`, `0002`). - -## 3. ADR Template - -Create a file named `ADR-0000-template.md` in the `docs/adr/` directory with the following content. This makes it easy for anyone to start a new record. - -```markdown -# ADR-XXXX: Title of the Decision - -- **Status**: Proposed | Accepted | Deprecated | Superseded by [ADR-YYYY](ADR-YYYY-...) -- **Date**: YYYY-MM-DD - -## Context - -Describe the issue, problem, or driving force that led to this decision. What are the constraints and requirements? What is the scope of this decision? This section should be understandable to someone without deep project knowledge. - -## Decision - -Clearly and concisely state the decision that was made. This is the "what," not the "why." - -## Consequences - -This is the most critical section. Describe the results, outcomes, and trade-offs of the decision. - -### Positive Consequences - -- What benefits does this decision provide? -- How does it align with the project's goals (e.g., resilience, user experience, performance)? 
- -### Negative Consequences - -- What are the drawbacks or costs? -- What trade-offs were made? -- What future challenges might this decision introduce? - -### Considered Alternatives - -- **Alternative 1:** A brief description of a rejected option. - - _Pros_: Why was it considered? - - _Cons_: Why was it rejected? -- **Alternative 2:** ... -``` diff --git a/docs/binary-format/01-overview.md b/docs/binary-format/01-overview.md new file mode 100644 index 00000000..c1290df2 --- /dev/null +++ b/docs/binary-format/01-overview.md @@ -0,0 +1,79 @@ +# Binary Format: Overview + +64-byte Header + 8 aligned Sections. + +## Architecture + +- **Alignment**: Sections start on 64-byte boundaries; internal structures align to natural size (2/4/8 bytes) +- **Sequential**: Fixed order for single-pass writing +- **Endianness**: Little Endian +- **Limits**: All indices u16 (max 65,535). Transitions: 512 KB max. Use `Call` to share patterns. + +### Addressing + +| Type | Description | +| ------------------- | --------------------------------- | +| `StepId` (u16) | 8-byte block index in Transitions | +| `StringId` (u16) | String Table index | +| `TypeId` (u16) | Type Definition index | +| `NodeTypeId` (u16) | Tree-sitter node type ID | +| `NodeFieldId` (u16) | Tree-sitter field ID | + +## Memory Layout + +Section offsets defined in Header for robust parsing. 
+ +| Section | Content | Record Size | +| ------------- | ------------------------ | ----------- | +| Header | Meta | 64 | +| [StringBlob] | UTF-8 | 1 | +| [StringTable] | StringId → Offset+Length | 4 | +| [NodeTypes] | NodeTypeId → StringId | 4 | +| [NodeFields] | NodeFieldId → StringId | 4 | +| [Trivia] | List of NodeTypeId | 2 | +| [TypeMeta] | Types | Var | +| [Entrypoints] | `pub` definitions | 8 | +| [Transitions] | Tree walking graph | 8 | + +[StringBlob]: 02-strings.md +[StringTable]: 02-strings.md +[NodeTypes]: 03-symbols.md +[NodeFields]: 03-symbols.md +[Trivia]: 03-symbols.md +[TypeMeta]: 04-types.md +[Entrypoints]: 05-entrypoints.md +[Transitions]: 06-transitions.md + +## Header + +First 64 bytes: magic (`PTKQ`), version (1), CRC32 checksum, section offsets. + +```rust +#[repr(C, align(64))] +struct Header { + magic: [u8; 4], // b"PTKQ" + version: u32, // 1 + checksum: u32, // CRC32 + total_size: u32, // Total file size in bytes + + // Section Offsets (Absolute byte offsets) + str_blob_offset: u32, + str_table_offset: u32, + node_types_offset: u32, + node_fields_offset: u32, + trivia_offset: u32, + type_meta_offset: u32, + entrypoints_offset: u32, + transitions_offset: u32, + + // Element Counts + str_table_count: u16, + node_types_count: u16, + node_fields_count: u16, + trivia_count: u16, + type_defs_count: u16, + type_members_count: u16, // Number of TypeMember blocks + entrypoints_count: u16, + transitions_count: u16, +} +``` diff --git a/docs/binary-format/02-strings.md b/docs/binary-format/02-strings.md new file mode 100644 index 00000000..926ac033 --- /dev/null +++ b/docs/binary-format/02-strings.md @@ -0,0 +1,63 @@ +# Binary Format: Strings + +Strings are stored in a centralized pool to eliminate redundancy and alignment padding overhead. They are referenced by `StringId` throughout the file. + +## Primitives + +**StringId (u16)**: Zero-based index into the String Table. + +- `0xFFFF` is reserved as a sentinel for "None" or "Anonymous". + +## 1. 
String Blob + +Contains the raw UTF-8 bytes for all strings concatenated together. + +- **Section Offset**: `header.str_blob_offset` +- **Content**: Raw bytes. Strings are **not** null-terminated. +- **Padding**: The section is padded to a 64-byte boundary at the end. + +## 2. String Table + +Lookup table mapping `StringId` to byte offsets within the String Blob. + +- **Section Offset**: `header.str_table_offset` +- **Record Size**: 4 bytes (`u32`). +- **Capacity**: `header.str_table_count + 1` entries. + - The table contains one extra entry at the end representing the total size of the unpadded blob. + +### Lookup Logic + +To retrieve string `i` (where `0 <= i < header.str_table_count`): + +1. Read `start = table[i]` +2. Read `end = table[i+1]` +3. Length = `end - start` +4. Data = `blob[start..end]` + +```rust +// Logical layout (not a single struct) +struct StringTable { + offsets: [u32; header.str_table_count + 1], +} +``` + +> **Limit**: Maximum `str_table_count` is 65,534 (0xFFFE). The table requires `count + 1` entries for length calculation, and the extra entry must fit in addressable space. + +### Example + +Stored strings: `"id"`, `"foo"` + +**String Blob**: + +```text +0x00: 'i', 'd', 'f', 'o', 'o' +... padding to 64 bytes ... +``` + +**String Table** (`str_table_count = 2`): + +```text +0x00: 0 (Offset of "id") +0x04: 2 (Offset of "foo") +0x08: 5 (End of blob, used to calculate length of "foo") +``` diff --git a/docs/binary-format/03-symbols.md b/docs/binary-format/03-symbols.md new file mode 100644 index 00000000..1521e258 --- /dev/null +++ b/docs/binary-format/03-symbols.md @@ -0,0 +1,52 @@ +# Binary Format: Symbols + +This section defines the symbol tables used to map external Tree-sitter IDs to internal string representations, and to define trivia kinds. + +## 1. Node Types + +A mapping from Tree-sitter's internal `u16` node type ID to a `StringId` in the query's string table. 
This allows the runtime to verify node kinds by name or display them for debugging. + +- **Section Offset**: `header.node_types_offset` +- **Record Size**: 4 bytes +- **Count**: `header.node_types_count` + +```rust +#[repr(C)] +struct NodeSymbol { + id: u16, // Tree-sitter Node Type ID + name: u16, // StringId +} +``` + +## 2. Node Fields + +A mapping from Tree-sitter's internal `u16` field ID to a `StringId`. Used for field verification during matching. + +- **Section Offset**: `header.node_fields_offset` +- **Record Size**: 4 bytes +- **Count**: `header.node_fields_count` + +```rust +#[repr(C)] +struct FieldSymbol { + id: u16, // Tree-sitter Field ID + name: u16, // StringId +} +``` + +## 3. Trivia + +A list of node type IDs that are considered "trivia" (e.g., whitespace, comments). The runtime uses this list when executing navigation commands like `NextSkipTrivia` or `DownSkipTrivia`. + +- **Section Offset**: `header.trivia_offset` +- **Record Size**: 2 bytes +- **Count**: `header.trivia_count` + +```rust +#[repr(C)] +struct TriviaEntry { + node_type: u16, // Tree-sitter Node Type ID +} +``` + +The list is not required to be sorted. Runtimes should build a lookup structure (e.g., bitset indexed by node type) on load for O(1) trivia checks. diff --git a/docs/binary-format/04-types.md b/docs/binary-format/04-types.md new file mode 100644 index 00000000..6935f3a8 --- /dev/null +++ b/docs/binary-format/04-types.md @@ -0,0 +1,135 @@ +# Binary Format: Type Metadata + +This section defines the type system metadata used for code generation and runtime validation. It allows consumers to understand the shape of the data extracted by the query. + +## 1. Primitives + +**TypeId (u16)**: Index into the Type Definition table. 
+ +- `0`: `Void` (Captures nothing) +- `1`: `Node` (AST Node reference) +- `2`: `String` (Source text) +- `3..N`: Composite types (Index = `TypeId - 3`) +- `0xFFFF`: Invalid/Sentinel + +### Node Semantics + +`TYPE_NODE` (1) represents a platform-dependent handle to a tree-sitter AST node: + +| Context | Representation | +| :--------- | :--------------------------------------------------------- | +| Rust | `tree_sitter::Node<'tree>` (lifetime-bound reference) | +| TypeScript | Binding-provided object with `startPosition`, `text`, etc. | +| JSON | Unique node identifier (e.g., `"node:42"` or path-based) | + +The handle provides access to node metadata (kind, span, text) without copying the source. Lifetime management is platform-specific—Rust enforces it statically, bindings may use reference counting or arena allocation. + +**TypeKind (u8)**: Discriminator for `TypeDef`. + +- `0`: `Optional` (Wraps another type) +- `1`: `ArrayStar` (Zero or more) +- `2`: `ArrayPlus` (One or more) +- `3`: `Struct` (Record with named fields) +- `4`: `Enum` (Discriminated union) + +## 2. Layout + +The **TypeMeta** section contains two contiguous arrays: + +1. **Definitions**: `[TypeDef; header.type_defs_count]` +2. **Members**: `[TypeMember; header.type_members_count]` + +Both `header.type_members_count` and `Slice.ptr` are `u16`, so the addressable range (0..65535) is identical—no capacity mismatch is possible by construction. + +### 2.1. TypeDef (8 bytes) + +Describes a single type. 
+ +```rust +#[repr(C)] +struct TypeDef { + members: Slice, // 4 bytes + name: u16, // StringId (0xFFFF for anonymous/wrappers) + kind: u8, // TypeKind + _pad: u8, +} + +#[repr(C)] +struct Slice { + ptr: u16, // Index or Data + len: u16, // Count +} +``` + +**Semantics of `members` field**: + +| Kind | `ptr` (u16) | `len` (u16) | Interpretation | +| :--------- | :------------ | :------------- | :------------- | +| `Optional` | `InnerTypeId` | 0 | Wrapper `T?` | +| `Array*` | `InnerTypeId` | 0 | Wrapper `T*` | +| `Array+` | `InnerTypeId` | 0 | Wrapper `T+` | +| `Struct` | `MemberIndex` | `MemberCount` | Record fields | +| `Enum` | `MemberIndex` | `VariantCount` | Union variants | + +> **Note**: The interpretation of `members.ptr` depends entirely on `kind`. For wrappers (`Optional`, `Array*`, `Array+`), `ptr` is a `TypeId`. For composites (`Struct`, `Enum`), `ptr` is an index into the TypeMember array. Parsers must dispatch on `kind` first. + +- `MemberIndex`: Index into the **TypeMember** array (relative to the start of the members region). + +### 2.2. TypeMember (4 bytes) + +Describes a field in a struct or a variant in an enum. + +```rust +#[repr(C)] +struct TypeMember { + name: u16, // StringId + ty: u16, // TypeId +} +``` + +**Storage**: +Members are tightly packed. Since `TypeDef` is 8 bytes, keeping `TypeMember` arrays aligned to 8 bytes ensures the whole section is dense. + +Example of `Struct { x: Node, y: String }`: + +1. `TypeDef`: `kind=Struct`, `members={ptr=0, len=2}` +2. `TypeMember[0]`: `name="x"`, `ty=Node` +3. `TypeMember[1]`: `name="y"`, `ty=String` + +**Padding**: Like all sections, TypeMeta is padded to a 64-byte boundary at the end. Since `TypeDef` is 8 bytes and `TypeMember` is 4 bytes, the section naturally maintains internal alignment; only end-of-section padding is needed. + +## 3. Recursive Types + +Recursive types reference themselves via TypeId. Since types are addressed by index, cycles are naturally representable. 
+ +Example query: + +```plotnik +pub List = [ + Nil: (nil) + Cons: (cons (T) @head (List) @tail) +] +``` + +Type graph: + +```text +Strings: ["List", "Nil", "Cons", "head", "tail"] + Str#0 Str#1 Str#2 Str#3 Str#4 + +TypeDefs: + T3: Enum "List" (Str#0), members={ptr=0, len=2} + +TypeMembers: + [0]: name=Str#1 ("Nil"), ty=0 (Void) // unit variant + [1]: name=Str#2 ("Cons"), ty=T4 // payload is struct + +TypeDefs (continued): + T4: Struct 0xFFFF (anonymous), members={ptr=2, len=2} + +TypeMembers (continued): + [2]: name=Str#3 ("head"), ty=1 (Node) + [3]: name=Str#4 ("tail"), ty=T3 // <-- self-reference to List +``` + +The `tail` field's type (`T3`) points back to the `List` enum. The runtime handles this via lazy evaluation or boxing, depending on the target language. diff --git a/docs/binary-format/05-entrypoints.md b/docs/binary-format/05-entrypoints.md new file mode 100644 index 00000000..23ea31dd --- /dev/null +++ b/docs/binary-format/05-entrypoints.md @@ -0,0 +1,38 @@ +# Binary Format: Entrypoints + +This section defines the named entry points for the query. Each entry point exposes a public definition that can be executed against a syntax tree. + +## Layout + +- **Section Offset**: `header.entrypoints_offset` +- **Record Size**: 8 bytes +- **Count**: `header.entrypoints_count` +- **Ordering**: Entries **must** be sorted lexicographically by the UTF-8 content of their `name` (resolved via String Table). This enables binary search at runtime. + +## Definition + +```rust +#[repr(C)] +struct Entrypoint { + name: u16, // StringId + target: u16, // StepId (into Transitions section) + result_type: u16, // TypeId + _pad: u16, // Padding to 8 bytes +} +``` + +### Fields + +- **name**: The name of the export (e.g., "Func", "Class"). `StringId`. +- **target**: The instruction pointer (`StepId`) where execution begins for this definition. This index is relative to the start of the **Transitions** section. 
+- **result_type**: The `TypeId` of the structure produced by this query definition. +- **\_pad**: Reserved for alignment. + +### Usage + +When the user runs a query with a specific entry point (e.g., `--entry Func`), the runtime: + +1. Performs a binary search over the entrypoints table, resolving `name` ID to string content for comparison. +2. Sets the initial instruction pointer (`IP`) to `target`. +3. Executes the VM. +4. Validates that the resulting value matches `result_type`. diff --git a/docs/binary-format/06-transitions.md b/docs/binary-format/06-transitions.md new file mode 100644 index 00000000..29bd976f --- /dev/null +++ b/docs/binary-format/06-transitions.md @@ -0,0 +1,332 @@ +# Binary Format: Transitions + +This section contains the Virtual Machine (VM) instructions and associated data blocks. It is a heap of 8-byte aligned blocks addressed by `StepId`. See [runtime-engine.md](../runtime-engine.md) for execution semantics. + +## 1. Addressing + +**StepId (u16)**: Zero-based index into this section. + +- Byte offset = `header.transitions_offset + (index * 8)`. +- Limit: 65,536 blocks (512 KB section size). + +## 2. Block Types + +The first byte of every block encodes both type and terminal status: + +```text +type_id (u8) +┌──────────┬───────────────┐ +│ term (1) │ type (7) │ +└──────────┴───────────────┘ +``` + +- **Bit 7**: Terminal flag (`type_id & 0x80`). If set, this is an accept state—match complete. +- **Bits 0-6**: Block type (`type_id & 0x7F`). + +| Code | Name | Category | +| :--- | :------------- | :---------- | +| 0x00 | `Match` | Instruction | +| 0x01 | `MatchExt` | Instruction | +| 0x02 | `Call` | Instruction | +| 0x03 | `Return` | Instruction | +| 0x10 | `MatchPayload` | Data | + +Terminal variants: `0x80` (Match), `0x81` (MatchExt). `Call`, `Return`, and `MatchPayload` are never terminal. + +## 3. Primitives + +### 3.1. Nav (u8) + +Bit-packed navigation command. 
+ +| Bits 7-6 | Mode | Bits 5-0 Payload | +| :------- | :----------- | :--------------------- | +| `00` | Standard | Enum (see below) | +| `01` | Up | Level count `n` (1-63) | +| `10` | UpSkipTrivia | Level count `n` (1-63) | +| `11` | UpExact | Level count `n` (1-63) | + +**Standard Modes**: + +- `0`: `Stay` (Entry only) +- `1`: `Next` (Sibling, skip any) +- `2`: `NextSkip` (Sibling, skip trivia) +- `3`: `NextExact` (Sibling, exact) +- `4`: `Down` (Child, skip any) +- `5`: `DownSkip` (Child, skip trivia) +- `6`: `DownExact` (Child, exact) + +### 3.2. EffectOp (u16) + +Side-effect operation code packed into 16 bits. + +```text +EffectOp (u16) +┌──────────────┬─────────────────────┐ +│ opcode (6b) │ payload (10b) │ +└──────────────┴─────────────────────┘ +``` + +- **Opcode**: 6 bits (0-63), currently 13 defined +- **Payload**: 10 bits (0-1023), member/variant index + +| Opcode | Name | Payload (10b) | +| :----- | :------------- | :--------------------- | +| 0 | `CaptureNode` | - | +| 1 | `StartArray` | - | +| 2 | `PushElement` | - | +| 3 | `EndArray` | - | +| 4 | `StartObject` | - | +| 5 | `EndObject` | - | +| 6 | `SetField` | Member index (0-1023) | +| 7 | `PushField` | Member index (0-1023) | +| 8 | `StartVariant` | Variant index (0-1023) | +| 9 | `EndVariant` | - | +| 10 | `ToString` | - | +| 11 | `ClearCurrent` | - | +| 12 | `PushNull` | - | + +Member/variant indices are resolved via `type_members[struct_or_enum.members.ptr + index]`. + +## 4. Instructions + +All instructions are exactly 8 bytes. + +**Note**: In tree-sitter, `0` is never a valid `NodeTypeId` or `NodeFieldId`. We use `Option<NonZeroU16>` to represent these values, where `None` (stored as `0`) indicates no check (wildcard). + +**Epsilon Transitions**: A `MatchExt` with `node_type: None`, `node_field: None`, and `nav: Stay` is an **epsilon transition**—it succeeds unconditionally without cursor interaction.
This is critical for: + +- **Branching at EOF**: `(A)?` must succeed when no node exists to match +- **Trailing navigation**: Many queries end with epsilon + `Up(n)` to restore cursor position after matching descendants + +Epsilon transitions bypass the normal "check node exists → check type → check field" logic entirely. They execute effects and select successors without touching the cursor. + +### 4.1. Match + +Optimized fast-path transition. + +```rust +#[repr(C)] +struct Match { + type_id: u8, // 0x00 or 0x80 (terminal) + nav: u8, // Nav + node_type: Option<NonZeroU16>, // None means "any" + node_field: Option<NonZeroU16>, // None means "any" + next: u16, // Next StepId (ignored if terminal) +} +``` + +When `type_id & 0x80` is set, the match succeeds and accepts—`next` is ignored. + +### 4.2. MatchExt + +Extended transition pointing to a payload block. + +```rust +#[repr(C)] +struct MatchExt { + type_id: u8, // 0x01 or 0x81 (terminal) + nav: u8, // Nav + node_type: Option<NonZeroU16>, // None means "any" + node_field: Option<NonZeroU16>, // None means "any" + payload: u16, // StepId to MatchPayload +} +``` + +### 4.3. Call + +Invokes another definition (recursion). Pushes `next` to the call stack and jumps to `target`. + +```rust +#[repr(C)] +struct Call { + type_id: u8, // 0x02 + reserved: u8, + next: u16, // Return address (StepId) + target: u16, // Callee StepId + ref_id: u16, // Must match Return.ref_id +} +``` + +### 4.4. Return + +Returns from a definition. Pops the return address from the call stack. + +```rust +#[repr(C)] +struct Return { + type_id: u8, // 0x03 + reserved: u8, + ref_id: u16, // Must match Call.ref_id (invariant check) + _pad: u32, +} +``` + +### 4.5. The `ref_id` Invariant + +The `ref_id` field enforces stack discipline between `Call` and `Return` instructions. Each definition gets a unique `ref_id` at compile time. At runtime: + +1. `Call` pushes a frame with its `ref_id` onto the call stack. +2. `Return` verifies its `ref_id` matches the current frame's `ref_id`. +3.
Mismatch indicates a malformed query or VM bug—panic in debug builds. + +This catches errors like mismatched call/return pairs or corrupted stack state during backtracking. The check is O(1) and provides strong guarantees about control flow integrity. + +## 5. Data Blocks + +Variable-length blocks. The total size must be padded to a multiple of 8 bytes. + +> **Note**: These blocks are included in the Transitions segment to allow co-location with related instructions (e.g., placing `MatchPayload` immediately after `MatchExt`) to optimize for CPU cache locality. + +### 5.1. MatchPayload + +Contains extended logic for `MatchExt`. + +```rust +#[repr(C)] +struct MatchPayloadHeader { + type_id: u8, // 0x10 + reserved: u8, + pre_count: u8, // Count of Pre-Effects + neg_count: u8, // Count of Negated Fields + post_count: u8, // Count of Post-Effects + succ_count: u8, // Count of Successors + _pad: u16, +} +``` + +**Body Layout** (contiguous, u16 aligned): + +1. `pre_effects`: `[EffectOp; pre_count]` +2. `post_effects`: `[EffectOp; post_count]` +3. `negated_fields`: `[u16; neg_count]` +4. `successors`: `[u16; succ_count]` (StepIds) + +**Continuation Logic**: + +| `succ_count` | Behavior | Use case | +| :----------- | :---------------------------- | :------------------------- | +| 0 | Check terminal bit | Accept or invalid | +| 1 | `ip = successors[0]` | Linear continuation | +| 2+ | Branch via `successors[0..n]` | Alternation (backtracking) | + +When `succ_count == 0`, the owning `MatchExt` must have the terminal bit set (`type_id == 0x81`). This executes effects and accepts. A non-terminal `MatchExt` with `succ_count == 0` is invalid (no continuation path). + +**Contrast with `Match`**: The simpler `Match` block has inline `next` and uses the terminal bit directly. `MatchExt` uses `succ_count` for branching, with `succ_count == 0` + terminal bit for accept states that need effects. + +## 6. 
Quantifier Compilation + +Quantifiers compile to branching patterns in the transition graph. + +**Note on "Branch" blocks**: The diagrams below use "Branch" as a logical construct. In the actual bytecode, a Branch is implemented as a `MatchExt` with: + +- `node_type: None` (no type check) +- `nav: Stay` (no cursor movement) +- `succ_count >= 2` (multiple successors) + +This combination creates an **epsilon transition**—a decision point that doesn't consume input, only selects which path to follow. + +### Greedy `*` (Zero or More) + +``` + ┌─────────────────┐ + ↓ │ +Entry ─ε→ Branch ─ε→ Match ─┘ + │ + └─ε→ Exit + +Branch.successors = [match, exit] // try match first +``` + +### Greedy `+` (One or More) + +``` + ┌─────────────────┐ + ↓ │ +Entry ─→ Match ─ε→ Branch ─┘ + │ + └─ε→ Exit + +Branch.successors = [match, exit] +``` + +### Non-Greedy `*?` / `+?` + +Same structure as greedy, but successor order is reversed: + +``` +Branch.successors = [exit, match] // try exit first +``` + +### Greedy `?` (Optional) + +``` +Entry ─ε→ Branch ─ε→ Match ─ε→ Exit + │ + └─ε→ [PushNull] ─ε→ Exit + +Branch.successors = [match, skip] // try match first +``` + +The `PushNull` effect on the skip path is required for **Row Integrity** (see [type-system.md](../type-system.md#4-row-integrity)). When `?` captures a synchronized field, the skip branch must emit a null placeholder to keep parallel arrays aligned. + +## 7. Alternation Compilation + +Untagged alternations `[ A B ]` compile to branching with **symmetric effect injection** for row integrity. 
+ +### Row Integrity in Alternations + +When a capture appears in some branches but not others, the compiler injects `PushNull` into branches missing that capture: + +``` +Query: [ (A) @x (B) ] + +Branch 1 (A): [CaptureNode, PushField(x)] → Exit +Branch 2 (B): [PushNull, PushField(x)] → Exit + ↑ injected +``` + +In columnar context `([ (A) @x (B) ])*`: + +- Iteration 1 matches A: `x` array gets the node +- Iteration 2 matches B: `x` array gets null placeholder +- Result: `x` array length equals iteration count + +### Multiple Captures + +Each missing capture gets its own `PushNull`: + +``` +Query: [ + { (A) @x (B) @y } + { (C) @x } + (D) +] + +Branch 1: [CaptureNode, PushField(x), CaptureNode, PushField(y)] +Branch 2: [CaptureNode, PushField(x), PushNull, PushField(y)] +Branch 3: [PushNull, PushField(x), PushNull, PushField(y)] +``` + +This ensures all synchronized fields maintain identical array lengths across iterations. + +### Non-Greedy `??` + +Same structure as `?`, but successor order is reversed: + +``` +Branch.successors = [skip, match] // try skip first +``` + +### Example: Array Capture + +Query: `(parameters (identifier)* @params)` + +Compiled graph (after epsilon elimination): + +``` +T0: MatchExt(identifier) [StartArray, CaptureNode, PushElement] → [T0, T1] +T1: Match [EndArray, SetField("params")] → next +``` + +The first iteration gets `StartArray` from the entry path. Loop iterations execute only `CaptureNode, PushElement`. On exit, `EndArray` finalizes the array. diff --git a/docs/lang-reference.md b/docs/lang-reference.md new file mode 100644 index 00000000..200870ba --- /dev/null +++ b/docs/lang-reference.md @@ -0,0 +1,833 @@ +# Plotnik Query Language Reference + +Plotnik is a pattern-matching language for tree-sitter syntax trees. It extends [tree-sitter's query syntax](https://tree-sitter.github.io/tree-sitter/using-parsers/queries/1-syntax.html) with named expressions, recursion, and static type inference. 
+ +Predicates (`#eq?`, `#match?`) and directives (`#set!`) are intentionally unsupported—filtering logic belongs in your host language. + +--- + +## Execution Model + +NFA-based cursor walk with backtracking. + +### Key Properties + +- **Root-anchored**: Matches the entire tree structure (like `^...$` in regex) +- **Backtracking**: Failed branches restore state and try alternatives +- **Ordered choice**: `[A B C]` tries branches left-to-right; first match wins + +### Trivia Handling + +Comments and "extra" nodes (per tree-sitter grammar) are automatically skipped unless explicitly matched. + +```plotnik/docs/lang-reference.md#L24-24 +(function_declaration (identifier) @name (block) @body) +``` + +Matches even with comments between children: + +```plotnik/docs/lang-reference.md#L28-31 +function foo /* comment */() { + /* body */ +} +``` + +The `.` anchor enforces strict adjacency: + +```plotnik/docs/lang-reference.md#L35-35 +(array . (identifier) @first) ; must be immediately after bracket +``` + +### Partial Matching + +Node patterns are open—unmentioned children are ignored: + +```plotnik/docs/lang-reference.md#L46-46 +(binary_expression left: (identifier) @left) +``` + +Matches any `binary_expression` with an `identifier` in `left`, regardless of `operator`, `right`, etc. + +Sequences `{...}` advance through siblings in order, skipping non-matching nodes. + +### Field Constraints + +`field: pattern` requires the child to have that field AND match the pattern: + +```plotnik/docs/lang-reference.md#L58-61 +(binary_expression + left: (identifier) @x + right: (number) @y +) +``` + +Fields participate in sequential matching—they're not independent lookups. 
+ +--- + +## File Structure + +A `.ptk` file contains definitions: + +```plotnik/docs/lang-reference.md#L78-82 +; Internal (mixin/fragment) +Expr = [(identifier) (number) (string)] + +; Public entrypoint +pub Stmt = (statement) @stmt +``` + +### Visibility + +| Syntax | Role | In Binary | +| --------------- | ----------------- | --------- | +| `Def = ...` | Internal mixin | No | +| `pub Def = ...` | Public entrypoint | Yes | + +Internal definitions exist only to support `pub` definitions. + +### Script vs Module Mode + +**Script** (`-q` flag): Anonymous expressions allowed, auto-wrapped in language root. + +```sh +plotnik exec -q '(identifier) @id' -s app.js +``` + +**Module** (`.ptk` files): Only named definitions allowed. + +```plotnik/docs/lang-reference.md#L106-110 +; ERROR in .ptk file +(identifier) @id + +; OK +pub Query = (identifier) @id +``` + +--- + +## Workspace + +A directory of `.ptk` files loaded as a single compilation unit. + +### Properties + +- **Flat namespace**: `Foo` in `a.ptk` visible in `b.ptk` without imports +- **Global uniqueness**: Duplicate names are errors +- **Non-recursive**: Subdirectories are separate workspaces +- **Dead code elimination**: Unreachable internals stripped + +### Language Inference + +Inferred from directory name (`queries.ts/` → TypeScript, `java-checks/` → Java). Override with `-l/--lang`. 
+ +### Execution + +- Single `pub`: Default entrypoint +- Multiple `pub`: Use `--entry <Name>` +- No `pub`: Compilation error + +### Example + +`helpers.ptk`: + +```plotnik/docs/lang-reference.md#L147-153 +Ident = (identifier) + +DeepSearch = [ + (Ident) @target + (_ (DeepSearch)*) +] +``` + +`main.ptk`: + +```plotnik/docs/lang-reference.md#L157-158 +pub AllIdentifiers = (program (DeepSearch)*) +``` + +--- + +## Naming Conventions + +| Kind | Case | Examples | +| -------------------------- | ------------ | ------------------------------------ | +| Definitions, labels, types | `PascalCase` | `Expr`, `Statement`, `BinaryOp` | +| Node kinds | `snake_case` | `function_declaration`, `identifier` | +| Captures, fields | `snake_case` | `@name`, `@func_body` | + +Tree-sitter allows `@function.name`; Plotnik requires `@function_name` because captures map to struct fields. + +--- + +## Data Model + +Plotnik infers output types from your query. The key rule may surprise you: + +### Flat by Default + +Query nesting does NOT create output nesting. All captures become fields in a single flat record. + +``` +(function_declaration + name: (identifier) @name + body: (block + (return_statement (expression) @retval))) +``` + +Output type: + +```typescript +{ name: Node, retval: Node } // flat, not nested +``` + +The pattern is 4 levels deep, but the output is flat. This is intentional: you're usually extracting specific pieces from an AST, not reconstructing its shape. + +### The Node Type + +Default capture type—a reference to a tree-sitter node: + +```plotnik/docs/lang-reference.md#L205-210 +interface Node { + kind: string; // e.g.
"identifier" + text: string; // source text + start: Position; // { row, column } + end: Position; +} +``` + +### Cardinality: Quantifiers → Arrays + +Quantifiers on the captured pattern determine whether a field is singular, optional, or an array: + +| Pattern | Output Type | Meaning | +| --------- | ---------------- | ------------ | +| `(x) @a` | `a: T` | exactly one | +| `(x)? @a` | `a?: T` | zero or one | +| `(x)* @a` | `a: T[]` | zero or more | +| `(x)+ @a` | `a: [T, ...T[]]` | one or more | + +### Creating Nested Structure + +Capture a sequence `{...}` or alternation `[...]` to create a new scope. Braces alone don't introduce nesting: + +``` +{ + (function_declaration + name: (identifier) @name + body: (_) @body + ) @node +} @func +``` + +Output type: + +```typescript +{ func: { node: Node, name: Node, body: Node } } +``` + +The `@func` capture on the group creates a nested scope. All captures inside (`@node`, `@name`, `@body`) become fields of that nested object. + +### Type Annotations + +`::` after a capture controls the output type: + +| Annotation | Effect | +| -------------- | ----------------------------- | +| `@x` | Inferred (usually `Node`) | +| `@x :: string` | Extract `node.text` as string | +| `@x :: T` | Name the type `T` in codegen | + +Only `:: string` changes data; other `:: T` affect only generated type names. + +Example: + +``` +{ + (function_declaration + name: (identifier) @name :: string + body: (_) @body + ) @node +} @func :: FunctionDeclaration +``` + +Output type: + +```typescript +interface FunctionDeclaration { + node: Node; + name: string; // :: string converted this + body: Node; +} + +{ + func: FunctionDeclaration; +} +``` + +### Summary + +| Pattern | Output | +| ----------------------- | ------------------------- | +| `@name` | Field in current scope | +| `(x)? @a` | Optional field | +| `(x)* @a` | Array field | +| `{...} @x` / `[...] 
@x` | Nested object (new scope) | +| `@x :: string` | String value | +| `@x :: T` | Custom type name | + +--- + +## Nodes + +### Named Nodes + +Match named nodes (non-terminals and named terminals) by type: + +``` +(function_declaration) +(binary_expression (identifier) (number)) +``` + +Children can be partial—this matches any `binary_expression` with at least one `string_literal` child: + +``` +(binary_expression (string_literal)) +``` + +With captures: + +``` +(binary_expression + (identifier) @left + (number) @right) +``` + +Output type: + +```typescript +{ left: Node, right: Node } +``` + +### Anonymous Nodes + +Match literal tokens (operators, keywords, punctuation) with double or single quotes: + +``` +(binary_expression operator: "!=") +(return_statement "return") +``` + +Single quotes are equivalent to double quotes, useful when the query itself is wrapped in double quotes (e.g., in tool calls or JSON): + +``` +(return_statement 'return') +``` + +Anonymous nodes can be captured directly: + +``` +(binary_expression "+" @op) +"return" @keyword +``` + +Output type: + +```typescript +{ + op: Node; +} +{ + keyword: Node; +} +``` + +### Wildcards + +| Syntax | Matches | +| ------ | ----------------------------- | +| `(_)` | Any named node | +| `_` | Any node (named or anonymous) | + +```plotnik/docs/lang-reference.md#L370-371 +(call_expression function: (_) @fn) +(pair key: _ @key value: _ @value) +``` + +### Special Nodes + +- `(ERROR)` — matches parser error nodes +- `(MISSING)` — matches nodes inserted by error recovery +- `(MISSING identifier)` — matches a specific missing node type +- `(MISSING ";")` — matches a missing anonymous node + +``` +(ERROR) @syntax_error +(MISSING ";") @missing_semicolon +``` + +Output type: + +```typescript +{ + syntax_error: Node; +} +{ + missing_semicolon: Node; +} +``` + +### Supertypes + +Query abstract node types directly, or narrow with `/`: + +```plotnik/docs/lang-reference.md#L406-409 +(expression) @expr 
+(expression/binary_expression) @binary +(expression/"()") @empty_parens +``` + +--- + +## Fields + +Constrain children to named fields. A field value must be a node pattern, an alternation, or a quantifier applied to one of these. Groups `{...}` are not allowed as direct field values. + +``` +(assignment_expression + left: (identifier) @target + right: (call_expression) @value) +``` + +Output type: + +```typescript +{ target: Node, value: Node } +``` + +With type annotations: + +``` +(assignment_expression + left: (identifier) @target :: string + right: (call_expression) @value) +``` + +Output type: + +```typescript +{ target: string, value: Node } +``` + +### Negated Fields + +Assert a field is absent with `!`: + +``` +(function_declaration + name: (identifier) @name + !type_parameters) +``` + +Negated fields don't affect the output type—they're purely structural constraints: + +```typescript +{ + name: Node; +} +``` + +--- + +## Quantifiers + +- `?` — zero or one (optional) +- `*` — zero or more +- `+` — one or more (non-empty) + +``` +(function_declaration (decorator)? @decorator) +(function_declaration (decorator)* @decorators) +(function_declaration (decorator)+ @decorators) +``` + +Output types: + +```typescript +{ decorator?: Node } +{ decorators: Node[] } +{ decorators: [Node, ...Node[]] } +``` + +The `+` quantifier always produces non-empty arrays—no opt-out. + +Plotnik also supports non-greedy variants: `*?`, `+?`, `??` + +--- + +## Sequences + +Match sibling patterns in order with braces. + +> **⚠️ Syntax Difference from Tree-sitter** +> +> Tree-sitter: `((a) (b))` — parentheses for sequences +> Plotnik: `{(a) (b)}` — braces for sequences +> +> This avoids ambiguity: `(foo)` is always a node, `{...}` is always a sequence. +> Using tree-sitter's `((a) (b))` syntax in Plotnik is a parse error. + +Plotnik uses `{...}` to visually distinguish grouping from node patterns, and adds scope creation when captured (`{...} @name`). 
+ +``` +{ + (comment) + (function_declaration) +} +``` + +Quantifiers apply to sequences: + +``` +{ + (number) + {"," (number)}* +} +``` + +### Sequences with Captures + +Capture elements inside a sequence: + +``` +{ + (decorator)* @decorators + (function_declaration) @fn +} +``` + +Output type: + +```typescript +{ decorators: Node[], fn: Node } +``` + +Capture the entire sequence with a type name: + +``` +{ + (comment)+ + (function_declaration) @fn +}+ @sections :: Section +``` + +Output type: + +```typescript +interface Section { + fn: Node; +} + +{ sections: [Section, ...Section[]] } +``` + +--- + +## Alternations + +Match alternatives with `[...]`: + +- **Untagged**: Fields merge across branches +- **Tagged** (with labels): Discriminated union + +```plotnik/docs/lang-reference.md#L570-573 +[ + (identifier) + (string_literal) +] @value +``` + +### Merge Style (Unlabeled) + +Captures merge: present in all branches → required; some branches → optional. Same-name captures must have compatible types. + +Branches must be type-compatible. Bare nodes are auto-promoted to single-field structs when mixed with structured branches. + +``` +(statement + [ + (assignment_expression left: (identifier) @left) + (call_expression function: (identifier) @func) + ]) +``` + +Output type: + +```typescript +{ left?: Node, func?: Node } // each appears in one branch only +``` + +When the same capture appears in all branches: + +``` +[ + (identifier) @name + (string) @name +] +``` + +Output type: + +```typescript +{ + name: Node; +} // required: present in all branches, same type +``` + +Mixed presence: + +``` +[ + (binary_expression + left: (_) @x + right: (_) @y) + (identifier) @x +] +``` + +The second branch `(identifier) @x` is auto-promoted to a structure `{ x: Node }`, making it compatible with the first branch. 
+ +Output type: + +```typescript +{ x: Node, y?: Node } // x in all branches (required), y in one (optional) +``` + +Type mismatch is an error: + +``` +[(identifier) @x :: string (number) @x :: number] // ERROR: @x has different types +``` + +With a capture on the alternation itself, the type is non-optional since exactly one branch must match: + +``` +[ + (identifier) + (number) +] @value +``` + +Output type: + +```typescript +{ + value: Node; +} +``` + +### Tagged Style (Labeled) + +Labels create a discriminated union (`$tag` + `$data`): + +```plotnik/docs/lang-reference.md#L657-660 +[ + Assign: (assignment_expression left: (identifier) @left) + Call: (call_expression function: (identifier) @func) +] @stmt :: Stmt +``` + +```plotnik/docs/lang-reference.md#L664-667 +type Stmt = + | { $tag: "Assign"; $data: { left: Node } } + | { $tag: "Call"; $data: { func: Node } }; +``` + +### Alternations with Type Annotations + +When a merge alternation produces a structure (branches have internal captures), the capture on the alternation must have an explicit type annotation for codegen: + +``` +(call_expression + function: [ + (identifier) @fn + (member_expression property: (property_identifier) @method) + ] @target :: Target) +``` + +Output type: + +```typescript +interface Target { + fn?: Node; + method?: Node; +} + +{ + target: Target; +} +``` + +--- + +## Anchors + +The anchor `.` constrains sibling positions. Anchors don't affect types—they're structural constraints. + +First child: + +``` +(array . (identifier) @first) +``` + +Last child: + +``` +(block (_) @last .) +``` + +Immediate adjacency: + +``` +(dotted_name (identifier) @a . (identifier) @b) +``` + +Without the anchor, `@a` and `@b` would match non-adjacent pairs too. + +Output type for all examples: + +```typescript +{ first: Node } +{ last: Node } +{ a: Node, b: Node } +``` + +Anchors ignore anonymous nodes. 
+ +--- + +## Named Expressions + +Define reusable patterns: + +```plotnik/docs/lang-reference.md#L744-748 +BinaryOp = + (binary_expression + left: (_) @left + operator: _ @op + right: (_) @right) +``` + +Use as node types: + +```plotnik/docs/lang-reference.md#L752-752 +(return_statement (BinaryOp) @expr) +``` + +**Encapsulation**: `(Name)` matches but extracts nothing. You must capture (`(Name) @x`) to access fields. This separates structural reuse from data extraction. + +Named expressions define both pattern and type: + +```plotnik/docs/lang-reference.md#L764-764 +Expr = [(BinaryOp) (UnaryOp) (identifier) (number)] +``` + +--- + +## Recursion + +Named expressions can self-reference: + +```plotnik/docs/lang-reference.md#L794-798 +NestedCall = + (call_expression + function: [(identifier) @name (NestedCall) @inner] + arguments: (arguments)) +``` + +Matches `a()`, `a()()`, `a()()()`, etc. → `{ name?: Node, inner?: NestedCall }` + +Tagged recursive example: + +```plotnik/docs/lang-reference.md#L810-815 +MemberChain = [ + Base: (identifier) @name + Access: (member_expression + object: (MemberChain) @object + property: (property_identifier) @property) +] +``` + +--- + +## Full Example + +``` +Statement = [ + Assign: (assignment_expression + left: (identifier) @target :: string + right: (Expression) @value) + Call: (call_expression + function: (identifier) @func :: string + arguments: (arguments (Expression)* @args)) + Return: (return_statement + (Expression)? 
@value) +] + +Expression = [ + Ident: (identifier) @name :: string + Num: (number) @value :: string + Str: (string) @value :: string +] + +(program (Statement)+ @statements) +``` + +Output types: + +```typescript +type Statement = + | { $tag: "Assign"; $data: { target: string; value: Expression } } + | { $tag: "Call"; $data: { func: string; args: Expression[] } } + | { $tag: "Return"; $data: { value?: Expression } }; + +type Expression = + | { $tag: "Ident"; $data: { name: string } } + | { $tag: "Num"; $data: { value: string } } + | { $tag: "Str"; $data: { value: string } }; + +type Root = { + statements: [Statement, ...Statement[]]; +}; +``` + +--- + +## Quick Reference + +| Feature | Tree-sitter | Plotnik | +| -------------------- | ---------------- | ------------------------- | +| Capture | `@name` | `@name` (snake_case only) | +| Type annotation | | `@x :: T` | +| Text extraction | | `@x :: string` | +| Named node | `(type)` | `(type)` | +| Anonymous node | `"text"` | `"text"` | +| Any node | `_` | `_` | +| Any named node | `(_)` | `(_)` | +| Field constraint | `field: pattern` | `field: pattern` | +| Negated field | `!field` | `!field` | +| Quantifiers | `?` `*` `+` | `?` `*` `+` | +| Non-greedy | | `??` `*?` `+?` | +| Sequence | `((a) (b))` | `{(a) (b)}` | +| Alternation | `[a b]` | `[a b]` | +| Tagged alternation | | `[A: (a) B: (b)]` | +| Anchor | `.` | `.` | +| Named expression | | `Name = pattern` | +| Public entrypoint | | `pub Name = pattern` | +| Use named expression | | `(Name)` | + +--- + +## Diagnostics + +Priority-based suppression: when diagnostics overlap, lower-priority ones are hidden. You see the root cause, not cascading symptoms. diff --git a/docs/runtime-engine.md b/docs/runtime-engine.md new file mode 100644 index 00000000..405f2295 --- /dev/null +++ b/docs/runtime-engine.md @@ -0,0 +1,177 @@ +# Runtime Engine + +Executes compiled query graphs against Tree-sitter syntax trees. 
See [06-transitions.md](binary-format/06-transitions.md) for block types. + +## VM State + +```rust +struct VM<'a> { + cursor: TreeCursor<'a>, // Never reset—preserves descendant_index for O(1) backtrack + ip: StepId, // Current block index + frames: Vec<Frame>, // Call stack + effects: EffectStream<'a>, // Side-effect log + matched_node: Option<Node<'a>>, // Current match slot +} + +struct Frame { + ref_id: u16, // For Return verification + return_addr: u16, // Where to jump on Return +} +``` + +## Execution Cycle + +Fetch block at `ip` → dispatch by `type_id` → execute → update `ip`. + +### Match — Fast Path + +1. Execute `nav` → check `node_type` → check `node_field` +2. Fail → backtrack +3. Success: if terminal (`type_id & 0x80`) → accept; else `ip = next` + +### MatchExt — Extended Path + +1. Fetch `MatchPayload`, execute `pre_effects`, clear `matched_node` +2. Execute `nav`, check `node_type`/`node_field` (see Epsilon Transitions below) +3. Success: `matched_node = cursor.node()`, verify negated fields absent +4. Execute `post_effects` +5. Continuation: + - Terminal (`type_id & 0x80`) → accept (requires `succ_count == 0`) + - `succ_count == 1` → `ip = successors[0]` + - `succ_count >= 2` → branch via `successors` (backtracking) + +### Epsilon Transitions + +A `MatchExt` with `node_type: None` and `nav: Stay` is an **epsilon transition**—it succeeds unconditionally without cursor interaction. This enables pure control-flow decisions (branching for quantifiers) even when the cursor is exhausted (EOF). + +Common patterns: + +- **Quantifier branches**: `(A)?` uses epsilon to decide match-or-skip +- **Trailing cleanup**: Many queries end with epsilon + `Up(n)` to restore cursor position after matching, regardless of tree depth + +### Call (0x02) + +Push `{ ref_id, return_addr: next }` → `ip = target` + +### Return (0x03) + +Pop frame → verify `ref_id` match (panic on mismatch) → `ip = return_addr` + +## Navigation + +`Nav` byte encodes cursor movement, resolved at compile time. 
+ +| Mode | Behavior | +| ------------------- | --------------------------------- | +| Stay | No movement | +| Next/Down | Skip any nodes until match | +| NextSkip/DownSkip | Skip trivia only | +| NextExact/DownExact | Immediate match required | +| Up(n) | Ascend n levels | +| UpSkipTrivia(n) | Ascend n, must be last non-trivia | +| UpExact(n) | Ascend n, must be last child | + +### Search Loop + +1. Move cursor → try match +2. On fail: Exact → fail; Skip → fail if non-trivia, else retry; Any → retry +3. On exhaustion: fail + +Example: `(foo (bar))` vs `(foo (foo) (foo) (bar))` with `Down` mode skips two `foo` children to find `bar`. With `DownExact`, first mismatch fails immediately. + +## Recursion + +### Cactus Stack + +Backtracking needs to restore frames destroyed by failed branches. Solution: arena + parent pointer. + +```rust +struct FrameArena { + frames: Vec, // Append-only + current: Option, // "Stack pointer" +} +struct Frame { + ref_id: u16, + return_addr: u16, + parent: Option, // Caller's frame index +} +``` + +"Pop" just moves `current`—frames remain for checkpoint restoration. + +### Pruning + +Problem: `(A)+` accumulates frames forever. Solution: high-water mark pruning after `Return`: + +``` +high_water = max(current_frame_idx, max_checkpoint_watermark) +arena.truncate(high_water + 1) +``` + +Bounds arena to O(max_checkpoint_depth + current_call_depth). + +### Call/Return + +Each call site stores its return address in the pushed frame. The `ref_id` check catches stack corruption (malformed IR or VM bug). + +## Backtracking + +```rust +struct Checkpoint { + descendant_index: u32, // Cursor position + effect_watermark: usize, // Effect stream length + frame_index: Option, // Frame arena state + ip: StepId, // Resume point +} +``` + +### Process + +1. **Save**: Push checkpoint, track `max_frame_watermark` for pruning +2. **Restore**: `goto_descendant()`, truncate effects, set `frames.current` +3. 
**Resume**: `ip = checkpoint.ip` + +### Branching (`succ_count > 1`) + +Save checkpoint for `successors[1..]` → try `successors[0]` → on fail, restore and try next. + +## Effects + +Operations logged instead of inline output. Backtracking: `truncate(watermark)`. + +```rust +struct EffectStream<'a> { + ops: Vec, + nodes: Vec>, +} +``` + +| Effect | Action | +| ------------------- | ---------------------------------- | +| CaptureNode | Push `matched_node` | +| Start/EndObject | Object boundaries | +| SetField(id) | Assign to field | +| PushField(id) | Append to array field (columnar) | +| Start/EndArray | Array boundaries | +| PushElement | Append to array | +| Start/EndVariant(t) | Tagged union boundaries | +| ToString | Node → source text | +| ClearCurrent | Reset current value | +| PushNull | Null placeholder (`?` in columnar) | + +### Materialization + +Materializer replays effects to build output. Stream is purely structural; nominal types come from `Entrypoint.result_type`. + +## Fuel Limits + +| Limit | Default | Purpose | +| -------------- | --------- | ----------------- | +| Exec fuel | 1,000,000 | Total transitions | +| Recursion fuel | 1,024 | Call depth | + +Exhaustion returns `RuntimeError`, not panic. + +## Trivia Handling + +Per-language trivia list used for `*Skip` navigation. A node is never skipped if it matches the current target—`(comment)` still matches comments. diff --git a/docs/type-system.md b/docs/type-system.md new file mode 100644 index 00000000..93f83622 --- /dev/null +++ b/docs/type-system.md @@ -0,0 +1,274 @@ +# Plotnik Type System + +Plotnik infers static types from query structure. This governs how captures materialize into output (JSON, structs, etc.). 
+ +## Mental Model + +| Operation | Nested (tree-sitter) | Transparent (Plotnik) | +| ------------------ | -------------------- | --------------------- | +| Extract definition | `res.def.x` | `res.x` (unchanged) | +| List of items | Implicit row struct | Explicit `{...} @row` | +| Capture collision | Silent data loss | Compiler error | +| Fix collision | Manual re-capture | Wrap: `(Def) @alias` | + +## 1. Transparent Graph Model + +### Universal Bubbling + +Scopes are transparent by default. Captures bubble up through definitions and containers until hitting an explicit scope boundary. + +This enables reusable fragments ("mixins") that contribute fields to parent output without creating nesting. + +- **Private Definitions (`Def = ...`)**: Transparent (macro-like) +- **Uncaptured Containers (`{...}`, `[...]`)**: Transparent +- **References (`(Def)`)**: Transparent + +### Explicit Scope Boundaries + +New data structures are created only when explicitly requested: + +1. **Public Roots**: `pub Def = ...` +2. **Captured Groups**: `{...} @name` → Struct +3. **Captured Alternations**: `[...] @name` → Union +4. **Tagged Alternations**: `[ L: ... ] @name` → Tagged Union + +## 2. Data Shapes + +### Structs + +Created by `{ ... } @name`: + +| Captures | Result | +| -------- | ---------------------------------- | +| 0 | `Void` | +| 1+ | `Struct { field_1, ..., field_N }` | + +**No Implicit Unwrap**: `(node) @x` produces `{ x: Node }`, never bare `Node`. Adding fields later is non-breaking. + +### Unions + +Created by `[ ... 
]`: + +- **Tagged**: `[ L1: (A) @a L2: (B) @b ]` → `{ "$tag": "L1", "$data": { a: Node } }` +- **Untagged**: `[ (A) @a (B) @b ]` → `{ a?: Node, b?: Node }` (merged) + +### Enum Variants + +| Captures | Payload | +| -------- | --------- | +| 0 | Unit/Void | +| 1+ | Struct | + +```plotnik/docs/type-system.md#L58-61 +Result = [ + Ok: (value) @val + Err: (error (code) @code (message) @msg) +] +``` + +Single-capture variants stay wrapped (`result.$data.val`), making field additions non-breaking. + +## 3. Parallel Arrays (Columnar Output) + +Quantifiers (`*`, `+`) produce arrays per-field, not lists of objects: + +```plotnik/docs/type-system.md#L75-75 +{ (Key) @k (Value) @v }* +``` + +Output: `{ "k": ["key1", "key2"], "v": ["val1", "val2"] }` + +This Struct-of-Arrays layout is efficient for analysis and avoids implicit row creation. + +For List-of-Objects, wrap explicitly: + +```plotnik/docs/type-system.md#L84-84 +( { (Key) @k (Value) @v } @entry )* +``` + +Output: `{ "entry": [{ "k": "key1", "v": "val1" }, ...] }` + +## 4. Row Integrity + +Parallel arrays require `a[i]` to correspond to `b[i]`. The compiler enforces this: + +**Rule**: Quantified scopes cannot mix synchronized and desynchronized fields. + +| Type | Cardinality | Behavior | +| -------------- | ----------- | ---------------------------------------------------- | +| Synchronized | `1` or `?` | One value per iteration (`?` emits null when absent) | +| Desynchronized | `*` or `+` | Variable values per iteration | + +`?` is synchronized because it emits null placeholders—like nullable columns in Arrow/Parquet. + +### Nested Quantifiers + +Cardinality multiplies through nesting: + +| Outer | Inner | Result | +| ----- | ----- | ------ | +| `1` | `*` | `*` | +| `*` | `1` | `*` | +| `*` | `*` | `*` | +| `+` | `+` | `+` | +| `?` | `+` | `*` | + +Example: + +```plotnik/docs/type-system.md#L123-123 +{ (A)* @a (B) @b }* // ERROR: @a is *, @b is 1 +{ (A)? 
@a (B) @b }* // OK: both synchronized +``` + +Fixes: + +```plotnik/docs/type-system.md#L128-129 +{ (A)* @a (B)* @b }* // Both columnar +{ { (A)* @a (B) @b } @row }* // Wrap for rows +``` + +## 5. Type Unification in Alternations + +Shallow unification across untagged branches: + +| Scenario | Result | +| --------------------------- | ------------- | +| Same capture, all branches | Required | +| Same capture, some branches | Optional | +| Type mismatch | Compile error | + +```plotnik/docs/type-system.md#L140-160 +[ + (A) @x + (B) @x +] // x: Node (required) + +[ + (_ (A) @x (B) @y) + (_ (A) @x) +] // x: Node, y?: Node + +[ + (A) @x ::string + (B) @x +] // ERROR: String vs Node +``` + +### Array Captures in Alternations + +When a quantified capture appears in some branches but not others, the result is `Array | null`: + +```plotnik/docs/type-system.md#L166-170 +[ + (A)+ @x + (B) +] // x: Node[] | null +``` + +The missing branch emits `PushNull`, not an empty array. This distinction matters for columnar output—`null` indicates "branch didn't match" vs `[]` meaning "matched zero times." + +For type conflicts, use tagged alternations: + +```plotnik/docs/type-system.md#L157-160 +[ + Str: (A) @x ::string + Node: (B) @x +] @result +``` + +### Unification Rules + +1. Primitives: exact match required +2. Arrays: element types unify; looser cardinality wins (`+` ∪ `*` → `*`) +3. Structs: identical field sets, recursively compatible +4. 
Enums: identical variant sets + +### 1-Level Merge Only + +Top-level fields merge with optionality; nested mismatches are errors: + +```/dev/null/merge.txt#L1-8 +// OK: top-level merge +{ x: Node, y: Node } ∪ { x: Node, z: String } → { x: Node, y?: Node, z?: String } + +// OK: identical nested +{ data: { a: Node } } ∪ { data: { a: Node }, extra: Node } → { data: { a: Node }, extra?: Node } + +// ERROR: nested differ +{ data: { a: Node } } ∪ { data: { b: Node } } → incompatible struct types +``` + +Deep merging produces heavily-optional types that defeat typed extraction's purpose. + +## 6. Recursion + +Self-referential types via: + +1. **TypeId indirection**: Types reference by ID, enabling cycles +2. **Escape analysis**: Every cycle needs a non-recursive exit +3. **Guarded recursion**: Every cycle must consume input (descend) +4. **Automatic detection**: Compiler generates Call/Return instead of inlining + +### Example + +```plotnik/docs/type-system.md#L213-219 +pub Expr = [ + Lit: (number) @value ::string + Binary: (binary_expression + left: (Expr) @left + right: (Expr) @right + ) +] +``` + +### Requirements + +```plotnik/docs/type-system.md#L226-232 +Loop = (Loop) // ERROR: no escape path +Expr = [ Lit: (n) @n Rec: (Expr) @e ] // OK: Lit escapes + +A = (B) +B = (A) // ERROR: no input consumed + +A = (foo (B)) +B = (bar (A)) // OK: descends each step +``` + +### Scope Boundaries + +Recursive private definitions get automatic type boundaries (behave like `pub` for typing): + +```plotnik/docs/type-system.md#L240-241 +NestedCall = (call_expression + function: [(identifier) @name (NestedCall) @inner]) +``` + +### Recursive Deep Search + +Combines recursion with bubbling for flat output: + +```plotnik/docs/type-system.md#L249-253 +DeepSearch = [ + (identifier) @target + (_ (DeepSearch)*) +] +pub AllIdentifiers = (program (DeepSearch)*) +``` + +Output: `{ target: Node[] }` — flat array regardless of tree depth. + +## 7. 
Definition Roles + +| Feature | `Def` (Private) | `pub Def` (Public) | +| -------- | ------------------ | ------------------ | +| Concept | Fragment/Mixin | API Contract | +| Behavior | Inlined | Entrypoint | +| Scoping | Transparent | Boundary | +| Output | Merges into parent | Named Interface | + +## 8. Type Metadata + +For codegen, types are named: + +- **Explicit**: `@name :: TypeName` +- **Synthetic**: `{DefName}{FieldName}` (e.g., `FuncParams`), with numeric suffix on collision From 944e7affbb044cc92933405ae6f56c64fc8c64ce Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Fri, 19 Dec 2025 08:30:22 -0300 Subject: [PATCH 2/3] Update --- AGENTS.md | 13 +- README.md | 2 +- crates/plotnik-cli/Cargo.toml | 2 +- crates/plotnik-cli/docs/REFERENCE.md | 1 - crates/plotnik-cli/docs/lang-reference.md | 1 + crates/plotnik-cli/src/commands/docs.rs | 2 +- crates/plotnik-langs/src/lib.rs | 23 + docs/REFERENCE.md | 831 ---------------------- docs/adr/README.md | 52 ++ docs/binary-format/06-transitions.md | 2 +- docs/lang-reference.md | 4 +- docs/type-system.md | 90 ++- 12 files changed, 174 insertions(+), 849 deletions(-) delete mode 120000 crates/plotnik-cli/docs/REFERENCE.md create mode 120000 crates/plotnik-cli/docs/lang-reference.md delete mode 100644 docs/REFERENCE.md create mode 100644 docs/adr/README.md diff --git a/AGENTS.md b/AGENTS.md index 5005e79b..8f76bc22 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -14,16 +14,7 @@ - **Location**: `docs/adr/` - **Naming**: `ADR-XXXX-short-title-in-kebab-case.md` (`XXXX` is a sequential number). 
- **Index**: - - [ADR-0001: Query Parser](docs/adr/ADR-0001-query-parser.md) - - [ADR-0002: Diagnostics System](docs/adr/ADR-0002-diagnostics-system.md) - - ADR-0003: Query Intermediate Representation (superseded by ADR-0004, ADR-0005, ADR-0006, available via git history) - - [ADR-0004: Query IR Binary Format](docs/adr/ADR-0004-query-ir-binary-format.md) - - [ADR-0005: Transition Graph Format](docs/adr/ADR-0005-transition-graph-format.md) - - [ADR-0006: Dynamic Query Execution](docs/adr/ADR-0006-dynamic-query-execution.md) - - [ADR-0007: Type Metadata Format](docs/adr/ADR-0007-type-metadata-format.md) - - [ADR-0008: Tree Navigation](docs/adr/ADR-0008-tree-navigation.md) - - [ADR-0009: Type System](docs/adr/ADR-0009-type-system.md) - - [ADR-0010: Type System v2](docs/adr/ADR-0010-type-system-v2.md) + - _(no ADRs yet)_ - **Template**: ```markdown @@ -169,7 +160,7 @@ crates/ plotnik-macros/ # Proc macros of the project docs/ adr/ # Architecture Decision Records (ADRs) - REFERENCE.md # Language specification + lang-reference.md # Language specification ``` # CLI diff --git a/README.md b/README.md index d0a23109..04fafada 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ for (const stmt of result.statements) { } ``` -For the detailed specification, see the [Language Reference](docs/REFERENCE.md). +For the detailed specification, see the [Language Reference](docs/lang-reference.md). 
## Supported Languages diff --git a/crates/plotnik-cli/Cargo.toml b/crates/plotnik-cli/Cargo.toml index c80184d8..451e4cff 100644 --- a/crates/plotnik-cli/Cargo.toml +++ b/crates/plotnik-cli/Cargo.toml @@ -9,7 +9,7 @@ documentation = "https://docs.rs/plotnik-cli" keywords = ["tree-sitter", "query", "ast", "parser", "cli"] categories = ["command-line-utilities", "development-tools"] readme = "../../README.md" -include = ["src/**/*", "Cargo.toml", "docs/REFERENCE.md"] +include = ["src/**/*", "Cargo.toml", "docs/lang-reference.md"] [[bin]] name = "plotnik" diff --git a/crates/plotnik-cli/docs/REFERENCE.md b/crates/plotnik-cli/docs/REFERENCE.md deleted file mode 120000 index c40a9113..00000000 --- a/crates/plotnik-cli/docs/REFERENCE.md +++ /dev/null @@ -1 +0,0 @@ -../../../docs/REFERENCE.md \ No newline at end of file diff --git a/crates/plotnik-cli/docs/lang-reference.md b/crates/plotnik-cli/docs/lang-reference.md new file mode 120000 index 00000000..c5f50e4f --- /dev/null +++ b/crates/plotnik-cli/docs/lang-reference.md @@ -0,0 +1 @@ +../../../docs/lang-reference.md \ No newline at end of file diff --git a/crates/plotnik-cli/src/commands/docs.rs b/crates/plotnik-cli/src/commands/docs.rs index d9d3e70f..a0c5de6e 100644 --- a/crates/plotnik-cli/src/commands/docs.rs +++ b/crates/plotnik-cli/src/commands/docs.rs @@ -8,7 +8,7 @@ pub fn run(topic: Option<&str>) { println!("Usage: plotnik docs "); } Some("reference") => { - println!("{}", include_str!("../../docs/REFERENCE.md")); + println!("{}", include_str!("../../docs/lang-reference.md")); } Some("examples") => { println!("(examples not yet written)"); diff --git a/crates/plotnik-langs/src/lib.rs b/crates/plotnik-langs/src/lib.rs index 581ae00d..94ca4fe4 100644 --- a/crates/plotnik-langs/src/lib.rs +++ b/crates/plotnik-langs/src/lib.rs @@ -362,4 +362,27 @@ mod tests { assert!(lang.resolve_field("name").is_some()); assert!(lang.resolve_field("fake_field").is_none()); } + + /// Verifies that languages with "end" keyword 
assign it a non-zero ID. + /// This proves that ID 0 ("end" sentinel) is internal to tree-sitter + /// and never exposed via the Cursor API for actual syntax nodes. + #[test] + #[cfg(all(feature = "ruby", feature = "lua"))] + fn end_keyword_has_nonzero_id() { + // Ruby has "end" keyword for blocks, methods, classes, etc. + let ruby = ruby(); + let ruby_end = ruby.resolve_anonymous_node("end"); + assert!(ruby_end.is_some(), "Ruby should have 'end' keyword"); + assert_ne!(ruby_end, Some(0), "Ruby 'end' keyword must not be ID 0"); + + // Lua has "end" keyword for blocks, functions, etc. + let lua = lua(); + let lua_end = lua.resolve_anonymous_node("end"); + assert!(lua_end.is_some(), "Lua should have 'end' keyword"); + assert_ne!(lua_end, Some(0), "Lua 'end' keyword must not be ID 0"); + + // Both languages still have internal "end" sentinel at ID 0 + assert_eq!(ruby.node_type_name(0), Some("end")); + assert_eq!(lua.node_type_name(0), Some("end")); + } } diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md deleted file mode 100644 index 6834c98d..00000000 --- a/docs/REFERENCE.md +++ /dev/null @@ -1,831 +0,0 @@ -# Plotnik Query Language Reference - -Plotnik QL is a pattern-matching language for tree-sitter syntax trees. It extends [tree-sitter's query language](https://tree-sitter.github.io/tree-sitter/using-parsers/queries/1-syntax.html) with named expressions, recursion, and type inference. - -> Predicates (`#eq?`, `#match?`, etc.) and directives (`#set!`, etc.) from tree-sitter QL are intentionally not supported. Plotnik focuses on structural pattern matching; filtering logic belongs in the host language. - ---- - -## Execution Model - -Plotnik uses an NFA-based recursive cursor walk with backtracking. Understanding this model helps predict matching behavior. 
- -### Key Properties - -- **Root-anchored:** Matching starts at the root of the target tree and must match the entire structure (like `^...$` in regex, not a substring search) -- **Backtracking:** When a branch fails, the engine backtracks and tries alternatives -- **Ordered choice:** In alternations `[A B C]`, branches are tried in order; first match wins - -### Trivia Handling - -Comments and other "extra" nodes (as defined by the tree-sitter grammar) are automatically skipped when walking siblings, unless explicitly matched in the pattern. - -``` -(function_declaration (identifier) @name (block) @body) -``` - -This matches even if comments appear between children: - -```javascript -function foo /* comment */() { - /* body */ -} -``` - -The `.` anchor enforces strict adjacency (no trivia between): - -``` -(array . (identifier) @first) ; first must be immediately after opening bracket -``` - -### Partial Matching (Open World) - -Node patterns `(type ...)` are partial—unmentioned children are ignored: - -``` -(binary_expression left: (identifier) @left) -``` - -Matches any `binary_expression` with an `identifier` in its `left` field, regardless of other children (`operator`, `right`, etc.). - -Sequences `{...}` advance the cursor through siblings in order, skipping non-matching nodes between elements. - -### Field Constraints - -Field constraints (`field: pattern`) add a field requirement to positional matching. The child must match both the pattern AND have the specified field: - -``` -(binary_expression - left: (identifier) @x - right: (number) @y -) -``` - -This matches a `binary_expression` where: - -- The first matched child has field `left` and is an `identifier` -- The second matched child has field `right` and is a `number` - -Field constraints participate in sequential matching just like regular children—they are not independent lookups. - ---- - -## File Structure - -A Plotnik file contains one or more definitions. 
All definitions must be named (`Name = expr`) except optionally the last one, which becomes the entry point: - -``` -; named definitions (required for all but last) -Expr = [(identifier) (number) (string)] -Stmt = (statement) - -; unnamed entry point (only allowed as last definition) -(assignment_expression right: (Expr) @value) -``` - -An unnamed definition that is not the last in the file produces an error. The error message includes the entire unnamed definition to help identify and fix it. - ---- - -## Naming Conventions - -- Capitalized names (`Expr`, `Statement`, `BinaryOp`) are user-defined: named expressions, alternation labels, type annotations -- Lowercase names (`function_declaration`, `identifier`, `binary_expression`) are language-defined: node types from tree-sitter grammars -- Capture names must be snake_case (e.g., `@name`, `@func_body`) - -This distinction is enforced by the parser. - -> **Difference from tree-sitter:** Tree-sitter allows arbitrary capture names including dots (e.g., `@function.name`). Plotnik restricts captures to snake*case identifiers (`[a-z]a-z0-9*]\*`) because they map directly to struct fields in generated code (Rust, TypeScript, Python). Use underscores instead: `@function_name`. - ---- - -## Data Model - -Plotnik infers structured output types from your query. Understanding this section is essential—the rules are simple but may surprise users expecting nested output to mirror nested patterns. - -### Core Concept: Flat by Default - -Query nesting does NOT create output nesting. All captures within a query become fields in a single flat record, regardless of how deeply nested the pattern is. - -``` -(function_declaration - name: (identifier) @name - body: (block - (return_statement (expression) @retval))) -``` - -Output type: - -```typescript -{ name: Node, retval: Node } // flat, not nested -``` - -The pattern is 4 levels deep, but the output is flat. 
This is intentional: you're usually extracting specific pieces from an AST, not reconstructing its shape. - -### The Node Type - -Every capture produces a `Node` by default—a reference to a tree-sitter node: - -```typescript -interface Node { - kind: string; // node type, e.g. "identifier" - text: string; // source text - start: Position; // { row, column } - end: Position; -} -``` - -### Cardinality: Quantifiers → Arrays - -Quantifiers on the captured pattern determine whether a field is singular, optional, or an array: - -| Pattern | Output Type | Meaning | -| --------- | ---------------- | ------------ | -| `(x) @a` | `a: T` | exactly one | -| `(x)? @a` | `a?: T` | zero or one | -| `(x)* @a` | `a: T[]` | zero or more | -| `(x)+ @a` | `a: [T, ...T[]]` | one or more | - -### Creating Nested Structure - -To create nested structure, place a capture on a sequence `{...}` or alternation `[...]`. It's the capture on the grouping construct that creates a new scope—the braces alone don't introduce nesting: - -``` -{ - (function_declaration - name: (identifier) @name - body: (_) @body - ) @node -} @func -``` - -Output type: - -```typescript -{ func: { node: Node, name: Node, body: Node } } -``` - -The `@func` capture on the group creates a nested scope. All captures inside (`@node`, `@name`, `@body`) become fields of that nested object. - -### Type Annotations - -The `::` syntax after a capture names the output type for codegen: - -``` -@x :: MyType // name this capture's type "MyType" -@x :: string // special: extract node.text as a string -``` - -| Annotation | Effect | -| -------------- | ------------------------------------------- | -| `@x` | inferred type (usually `Node`) | -| `@x :: string` | converts to `string` (extracts `node.text`) | -| `@x :: T` | names the type `T` in generated code | - -Only `:: string` changes the actual data. Other `:: T` annotations only affect generated type/interface names. 
- -Example with type annotation on a group: - -``` -{ - (function_declaration - name: (identifier) @name :: string - body: (_) @body - ) @node -} @func :: FunctionDeclaration -``` - -Output type: - -```typescript -interface FunctionDeclaration { - node: Node; - name: string; // :: string converted this - body: Node; -} - -{ - func: FunctionDeclaration; -} -``` - -### Summary - -| What you write | What you get | -| ------------------------- | -------------------------------------- | -| `@name` anywhere in query | field `name` in current scope | -| `(pattern)? @x` | optional field | -| `(pattern)* @x` | array field | -| `{...} @x` or `[...] @x` | nested object (new scope for captures) | -| `@x :: string` | string value instead of Node | -| `@x :: TypeName` | custom type name in codegen | - ---- - -## Nodes - -### Named Nodes - -Match named nodes (non-terminals and named terminals) by type: - -``` -(function_declaration) -(binary_expression (identifier) (number)) -``` - -Children can be partial—this matches any `binary_expression` with at least one `string_literal` child: - -``` -(binary_expression (string_literal)) -``` - -With captures: - -``` -(binary_expression - (identifier) @left - (number) @right) -``` - -Output type: - -```typescript -{ left: Node, right: Node } -``` - -### Anonymous Nodes - -Match literal tokens (operators, keywords, punctuation) with double or single quotes: - -``` -(binary_expression operator: "!=") -(return_statement "return") -``` - -Single quotes are equivalent to double quotes, useful when the query itself is wrapped in double quotes (e.g., in tool calls or JSON): - -``` -(return_statement 'return') -``` - -Anonymous nodes can be captured directly: - -``` -(binary_expression "+" @op) -"return" @keyword -``` - -Output type: - -```typescript -{ - op: Node; -} -{ - keyword: Node; -} -``` - -### Wildcards - -- `(_)` — matches any named node -- `_` — matches any node (named or anonymous) - -``` -(call_expression function: (_) @fn) -(pair 
key: _ @key value: _ @value) -``` - -Output type: - -```typescript -{ fn: Node } -{ key: Node, value: Node } -``` - -### Special Nodes - -- `(ERROR)` — matches parser error nodes -- `(MISSING)` — matches nodes inserted by error recovery -- `(MISSING identifier)` — matches a specific missing node type -- `(MISSING ";")` — matches a missing anonymous node - -``` -(ERROR) @syntax_error -(MISSING ";") @missing_semicolon -``` - -Output type: - -```typescript -{ - syntax_error: Node; -} -{ - missing_semicolon: Node; -} -``` - -### Supertypes - -Some grammars define supertypes (abstract node types). Query them directly: - -``` -(expression) @expr -``` - -Query a specific subtype within a supertype context: - -``` -(expression/binary_expression) @binary -(expression/"()") @empty_parens -``` - -Output type: - -```typescript -{ - binary: Node; -} -{ - empty_parens: Node; -} -``` - ---- - -## Fields - -Constrain children to named fields. A field value must be a node pattern, an alternation, or a quantifier applied to one of these. Groups `{...}` are not allowed as direct field values. - -``` -(assignment_expression - left: (identifier) @target - right: (call_expression) @value) -``` - -Output type: - -```typescript -{ target: Node, value: Node } -``` - -With type annotations: - -``` -(assignment_expression - left: (identifier) @target :: string - right: (call_expression) @value) -``` - -Output type: - -```typescript -{ target: string, value: Node } -``` - -### Negated Fields - -Assert a field is absent with `!`: - -``` -(function_declaration - name: (identifier) @name - !type_parameters) -``` - -Negated fields don't affect the output type—they're purely structural constraints: - -```typescript -{ - name: Node; -} -``` - ---- - -## Quantifiers - -- `?` — zero or one (optional) -- `*` — zero or more -- `+` — one or more (non-empty) - -``` -(function_declaration (decorator)? 
@decorator) -(function_declaration (decorator)* @decorators) -(function_declaration (decorator)+ @decorators) -``` - -Output types: - -```typescript -{ decorator?: Node } -{ decorators: Node[] } -{ decorators: [Node, ...Node[]] } -``` - -The `+` quantifier always produces non-empty arrays—no opt-out. - -Plotnik also supports non-greedy variants: `*?`, `+?`, `??` - ---- - -## Sequences - -Match sibling patterns in order with braces. Tree-sitter uses `((a) (b))` for the same purpose. Plotnik uses `{...}` to visually distinguish grouping from node patterns, and adds scope creation when captured (`{...} @name`). - -``` -{ - (comment) - (function_declaration) -} -``` - -Quantifiers apply to sequences: - -``` -{ - (number) - {"," (number)}* -} -``` - -### Sequences with Captures - -Capture elements inside a sequence: - -``` -{ - (decorator)* @decorators - (function_declaration) @fn -} -``` - -Output type: - -```typescript -{ decorators: Node[], fn: Node } -``` - -Capture the entire sequence with a type name: - -``` -{ - (comment)+ - (function_declaration) @fn -}+ @sections :: Section -``` - -Output type: - -```typescript -interface Section { - fn: Node; -} - -{ sections: [Section, ...Section[]] } -``` - ---- - -## Alternations - -Match one of several alternatives with `[...]`: - -- **Untagged** (no labels): Simpler output, fields merge. Use when you only need the captured data. -- **Tagged** (with labels): Precise discriminated union. Use when you need to know which branch matched. - -``` -[ - (identifier) - (string_literal) -] @value -``` - -### Merge Style (Unlabeled) - -Without labels, captures from all branches merge. If a capture appears in all branches, it's required; otherwise optional. Captures with the same name must have the same type across all branches where they appear. - -All branches must be type-compatible: either all branches produce bare nodes (no internal captures), or all branches produce structures (have internal captures). 
When branches mix nodes and structures, bare node captures are auto-promoted to single-field structures. When merging structures, the captured alternation requires an explicit type annotation (`@x :: TypeName`) for codegen. - -``` -(statement - [ - (assignment_expression left: (identifier) @left) - (call_expression function: (identifier) @func) - ]) -``` - -Output type: - -```typescript -{ left?: Node, func?: Node } // each appears in one branch only -``` - -When the same capture appears in all branches: - -``` -[ - (identifier) @name - (string) @name -] -``` - -Output type: - -```typescript -{ - name: Node; -} // required: present in all branches, same type -``` - -Mixed presence: - -``` -[ - (binary_expression - left: (_) @x - right: (_) @y) - (identifier) @x -] -``` - -The second branch `(identifier) @x` is auto-promoted to a structure `{ x: Node }`, making it compatible with the first branch. - -Output type: - -```typescript -{ x: Node, y?: Node } // x in all branches (required), y in one (optional) -``` - -Type mismatch is an error: - -``` -[(identifier) @x :: string (number) @x :: number] // ERROR: @x has different types -``` - -With a capture on the alternation itself, the type is non-optional since exactly one branch must match: - -``` -[ - (identifier) - (number) -] @value -``` - -Output type: - -```typescript -{ - value: Node; -} -``` - -### Tagged Style (Labeled) - -Labels create a discriminated union: - -``` -[ - Assign: (assignment_expression left: (identifier) @left) - Call: (call_expression function: (identifier) @func) -] @stmt :: Stmt -``` - -Output type (discriminant is always `$tag`, payload in `$data`): - -```typescript -type Stmt = - | { $tag: "Assign"; $data: { left: Node } } - | { $tag: "Call"; $data: { func: Node } }; -``` - -In Rust, tagged alternations become enums: - -```rust -enum Stmt { - Assign { left: Node }, - Call { func: Node }, -} -``` - -### Alternations with Type Annotations - -When a merge alternation produces a structure 
(branches have internal captures), the capture on the alternation must have an explicit type annotation for codegen: - -``` -(call_expression - function: [ - (identifier) @fn - (member_expression property: (property_identifier) @method) - ] @target :: Target) -``` - -Output type: - -```typescript -interface Target { - fn?: Node; - method?: Node; -} - -{ - target: Target; -} -``` - ---- - -## Anchors - -The anchor `.` constrains sibling positions. Anchors don't affect types—they're structural constraints. - -First child: - -``` -(array . (identifier) @first) -``` - -Last child: - -``` -(block (_) @last .) -``` - -Immediate adjacency: - -``` -(dotted_name (identifier) @a . (identifier) @b) -``` - -Without the anchor, `@a` and `@b` would match non-adjacent pairs too. - -Output type for all examples: - -```typescript -{ first: Node } -{ last: Node } -{ a: Node, b: Node } -``` - -Anchors ignore anonymous nodes. - ---- - -## Named Expressions - -Define reusable patterns with `Name = pattern`: - -``` -BinaryOp = - (binary_expression - left: (_) @left - operator: _ @op - right: (_) @right) -``` - -Use named expressions as node types: - -``` -(return_statement (BinaryOp) @expr) -``` - -Output type: - -```typescript -{ - expr: BinaryOp; -} // BinaryOp = { left: Node, op: Node, right: Node } -``` - -> **Important: Encapsulation.** Named expressions encapsulate their captures. Using `(Name)` without a capture matches the pattern but extracts no data. You must capture the reference (`(Name) @x`) to access the named expression's fields via `x`. This is intentional—named expressions provide structural reuse (pattern abstraction), while captures provide data extraction (explicit addressing). - -Named expressions define both a pattern and a type. 
The type is inferred from captures within: - -``` -Expr = [(BinaryOp) (UnaryOp) (identifier) (number)] -``` - -When used: - -``` -(assignment_expression right: (Expr) @value) -``` - -Output type: - -```typescript -{ - value: Expr; -} // union of BinaryOp, UnaryOp, or Node -``` - ---- - -## Recursion - -Named expressions can reference themselves: - -``` -NestedCall = - (call_expression - function: [(identifier) @name (NestedCall) @inner] - arguments: (arguments)) -``` - -This matches `a()`, `a()()`, `a()()()`, etc. - -Output type: - -```typescript -type NestedCall = { - name?: Node; - inner?: NestedCall; -}; -``` - -Another example—matching arbitrarily nested member chains: - -``` -MemberChain = [ - Base: (identifier) @name - Access: (member_expression - object: (MemberChain) @object - property: (property_identifier) @property) -] -``` - -Output type: - -```typescript -type MemberChain = - | { $tag: "Base"; $data: { name: Node } } - | { $tag: "Access"; $data: { object: MemberChain; property: Node } }; -``` - ---- - -## Full Example - -``` -Statement = [ - Assign: (assignment_expression - left: (identifier) @target :: string - right: (Expression) @value) - Call: (call_expression - function: (identifier) @func :: string - arguments: (arguments (Expression)* @args)) - Return: (return_statement - (Expression)? 
@value) -] - -Expression = [ - Ident: (identifier) @name :: string - Num: (number) @value :: string - Str: (string) @value :: string -] - -(program (Statement)+ @statements) -``` - -Output types: - -```typescript -type Statement = - | { $tag: "Assign"; $data: { target: string; value: Expression } } - | { $tag: "Call"; $data: { func: string; args: Expression[] } } - | { $tag: "Return"; $data: { value?: Expression } }; - -type Expression = - | { $tag: "Ident"; $data: { name: string } } - | { $tag: "Num"; $data: { value: string } } - | { $tag: "Str"; $data: { value: string } }; - -type Root = { - statements: [Statement, ...Statement[]]; -}; -``` - ---- - -## Quick Reference - -| Feature | Tree-sitter | Plotnik | -| -------------------- | ---------------- | ------------------------- | -| Capture | `@name` | `@name` (snake_case only) | -| Type annotation | | `@x :: T` | -| Text extraction | | `@x :: string` | -| Named node | `(type)` | `(type)` | -| Anonymous node | `"text"` | `"text"` | -| Any node | `_` | `_` | -| Any named node | `(_)` | `(_)` | -| Field constraint | `field: pattern` | `field: pattern` | -| Negated field | `!field` | `!field` | -| Quantifiers | `?` `*` `+` | `?` `*` `+` | -| Non-greedy | | `??` `*?` `+?` | -| Sequence | `((a) (b))` | `{(a) (b)}` | -| Alternation | `[a b]` | `[a b]` | -| Tagged alternation | | `[A: (a) B: (b)]` | -| Anchor | `.` | `.` | -| Named expression | | `Name = pattern` | -| Use named expression | | `(Name)` | diff --git a/docs/adr/README.md b/docs/adr/README.md new file mode 100644 index 00000000..f2a74138 --- /dev/null +++ b/docs/adr/README.md @@ -0,0 +1,52 @@ +# The Plotnik ADR System + +An ADR system documents important architectural decisions, their context, and their consequences. This helps maintain architectural consistency and provides valuable context for current and future contributors. + +## 1. Location + +ADRs live in `docs/adr/`, the location that `AGENTS.md` points to. + +## 2. 
Naming Convention + +Files should be named `ADR-XXXX-short-title-in-kebab-case.md`, where `XXXX` is a sequential number (e.g., `0001`, `0002`). + +## 3. ADR Template + +Create a file named `ADR-0000-template.md` in the `docs/adr/` directory with the following content. This makes it easy for anyone to start a new record. + +```markdown +# ADR-XXXX: Title of the Decision + +- **Status**: Proposed | Accepted | Deprecated | Superseded by [ADR-YYYY](ADR-YYYY-...) +- **Date**: YYYY-MM-DD + +## Context + +Describe the issue, problem, or driving force that led to this decision. What are the constraints and requirements? What is the scope of this decision? This section should be understandable to someone without deep project knowledge. + +## Decision + +Clearly and concisely state the decision that was made. This is the "what," not the "why." + +## Consequences + +This is the most critical section. Describe the results, outcomes, and trade-offs of the decision. + +### Positive Consequences + +- What benefits does this decision provide? +- How does it align with the project's goals (e.g., resilience, user experience, performance)? + +### Negative Consequences + +- What are the drawbacks or costs? +- What trade-offs were made? +- What future challenges might this decision introduce? + +### Considered Alternatives + +- **Alternative 1:** A brief description of a rejected option. + - _Pros_: Why was it considered? + - _Cons_: Why was it rejected? +- **Alternative 2:** ... +``` diff --git a/docs/binary-format/06-transitions.md b/docs/binary-format/06-transitions.md index 29bd976f..bd49518f 100644 --- a/docs/binary-format/06-transitions.md +++ b/docs/binary-format/06-transitions.md @@ -92,7 +92,7 @@ Member/variant indices are resolved via `type_members[struct_or_enum.members.sta All instructions are exactly 8 bytes. -**Note**: In tree-sitter, `0` is never a valid `NodeTypeId` or `NodeFieldId`. 
We use `Option` to represent these values, where `None` (stored as `0`) indicates no check (wildcard). +**Note**: In tree-sitter, `NodeTypeId` 0 is reserved for an internal "end" sentinel and is never exposed via the Cursor API. Languages with an actual `end` keyword (Ruby, Lua, etc.) assign it a different non-zero ID. Similarly, `NodeFieldId` 0 is never valid. We use `Option` to represent these values, where `None` (stored as `0`) indicates no check (wildcard). **Epsilon Transitions**: A `MatchExt` with `node_type: None`, `node_field: None`, and `nav: Stay` is an **epsilon transition**—it succeeds unconditionally without cursor interaction. This is critical for: diff --git a/docs/lang-reference.md b/docs/lang-reference.md index 200870ba..e201a752 100644 --- a/docs/lang-reference.md +++ b/docs/lang-reference.md @@ -162,12 +162,14 @@ Tree-sitter allows `@function.name`; Plotnik requires `@function_name` because c ## Data Model -Plotnik infers output types from your query. The key rule may surprise you: +Plotnik infers output types from your query. The key rule may surprise you—but it's intentional for schema stability. ### Flat by Default Query nesting does NOT create output nesting. All captures become fields in a single flat record. +**Why?** Adding a new `@capture` to an existing query shouldn't break downstream code using other captures. Flat output makes capture additions non-breaking. See [Type System](type-system.md#design-philosophy) for the full rationale. + ``` (function_declaration name: (identifier) @name diff --git a/docs/type-system.md b/docs/type-system.md index 93f83622..2ee4ec21 100644 --- a/docs/type-system.md +++ b/docs/type-system.md @@ -2,6 +2,63 @@ Plotnik infers static types from query structure. This governs how captures materialize into output (JSON, structs, etc.). +## Design Philosophy + +Plotnik prioritizes **schema evolution** and **refactoring safety** over local intuition. + +Two principles guide the type system: + +1. 
**Additive captures are non-breaking**: Adding a new `@capture` to an existing query should not invalidate downstream code that uses other captures. + +2. **Extract-refactor equivalence**: Moving a pattern fragment into a named definition should not change the output shape. + +These constraints produce designs that may initially surprise users (parallel arrays instead of row objects, transparent scoping instead of nesting), but enable queries to evolve without breaking consumers. + +### Why Parallel Arrays + +Traditional row-oriented output breaks when queries evolve: + +``` +// v1: Extract names +(identifier)* @names +→ { names: Node[] } + +// v2: Also extract types (row-oriented would require restructuring) +{ (identifier) @name (type) @type }* @items +→ { items: [{ name, type }, ...] } // BREAKING: names[] is gone +``` + +Plotnik's columnar approach: + +``` +// v1 +(identifier)* @names +→ { names: Node[] } + +// v2: Add types alongside +{ (identifier) @names (type) @types }* +→ { names: Node[], types: Node[] } // NON-BREAKING: names[] unchanged +``` + +Existing code using `result.names[i]` continues to work. + +### Why Transparent Scoping + +Extracting a pattern into a definition shouldn't change output: + +``` +// Inline +(function name: (identifier) @name) +→ { name: Node } + +// Extracted +Func = (function name: (identifier) @name) +(Func) +→ { name: Node } // Same shape—@name bubbles through +``` + +If definitions created implicit boundaries, extraction would wrap output in a new struct, breaking downstream types. + ## Mental Model | Operation | Nested (tree-sitter) | Transparent (Plotnik) | @@ -78,7 +135,7 @@ Quantifiers (`*`, `+`) produce arrays per-field, not lists of objects: Output: `{ "k": ["key1", "key2"], "v": ["val1", "val2"] }` -This Struct-of-Arrays layout is efficient for analysis and avoids implicit row creation. 
+This Struct-of-Arrays layout enables non-breaking schema evolution: adding `@newfield` to an existing loop doesn't restructure existing fields. It also avoids implicit row creation and is efficient for columnar analysis. For List-of-Objects, wrap explicitly: @@ -127,6 +184,22 @@ Fixes: { { (A)* @a (B) @b } @row }* // Wrap for rows ``` +### Multiple Desynchronized Fields + +When multiple `*`/`+` fields coexist, each produces an independent array with no alignment guarantee: + +``` +{ (A)* @a (B)* @b }* +``` + +If iteration 1 yields `a: [1,2,3], b: [x]` and iteration 2 yields `a: [4], b: [y,z]`, the result is: + +``` +{ a: [1,2,3,4], b: [x,y,z] } // lengths differ, no row correspondence +``` + +This is valid columnar concatenation—arrays are independent streams. If you need per-iteration grouping, wrap with `{...} @row`. + ## 5. Type Unification in Alternations Shallow unification across untagged branches: @@ -167,6 +240,21 @@ When a quantified capture appears in some branches but not others, the result is The missing branch emits `PushNull`, not an empty array. This distinction matters for columnar output—`null` indicates "branch didn't match" vs `[]` meaning "matched zero times." +Note the `*` vs `+` difference: + +``` +[ (A)+ @x (B) ] // x: Node[] | null — null means B branch +[ (A)* @x (B) ] // x: Node[] | null — null means B branch, [] means A matched zero times +``` + +In the `*` case, `null` and `[]` are semantically distinct. 
Check explicitly: + +```typescript +if (result.x !== null) { + // A branch matched (possibly zero times if x.length === 0) +} +``` + For type conflicts, use tagged alternations: ```plotnik/docs/type-system.md#L157-160 From 5129b1a7034aaf23c4f94d3c46119927d036524a Mon Sep 17 00:00:00 2001 From: Sergei Zharinov Date: Fri, 19 Dec 2025 08:50:13 -0300 Subject: [PATCH 3/3] Remove docs CLI command --- crates/plotnik-cli/Cargo.toml | 2 +- crates/plotnik-cli/src/cli.rs | 6 ------ crates/plotnik-cli/src/commands/docs.rs | 22 ---------------------- crates/plotnik-cli/src/commands/mod.rs | 1 - crates/plotnik-cli/src/main.rs | 3 --- 5 files changed, 1 insertion(+), 33 deletions(-) delete mode 100644 crates/plotnik-cli/src/commands/docs.rs diff --git a/crates/plotnik-cli/Cargo.toml b/crates/plotnik-cli/Cargo.toml index 451e4cff..76c84e99 100644 --- a/crates/plotnik-cli/Cargo.toml +++ b/crates/plotnik-cli/Cargo.toml @@ -9,7 +9,7 @@ documentation = "https://docs.rs/plotnik-cli" keywords = ["tree-sitter", "query", "ast", "parser", "cli"] categories = ["command-line-utilities", "development-tools"] readme = "../../README.md" -include = ["src/**/*", "Cargo.toml", "docs/lang-reference.md"] +include = ["src/**/*", "Cargo.toml"] [[bin]] name = "plotnik" diff --git a/crates/plotnik-cli/src/cli.rs b/crates/plotnik-cli/src/cli.rs index 9ed47989..771b0ca2 100644 --- a/crates/plotnik-cli/src/cli.rs +++ b/crates/plotnik-cli/src/cli.rs @@ -52,12 +52,6 @@ pub enum Command { output: OutputArgs, }, - /// Print documentation - Docs { - /// Topic to display (e.g., "reference", "examples") - topic: Option, - }, - /// List supported languages Langs, diff --git a/crates/plotnik-cli/src/commands/docs.rs b/crates/plotnik-cli/src/commands/docs.rs deleted file mode 100644 index a0c5de6e..00000000 --- a/crates/plotnik-cli/src/commands/docs.rs +++ /dev/null @@ -1,22 +0,0 @@ -pub fn run(topic: Option<&str>) { - match topic { - None => { - println!("Available topics:"); - println!(" reference - Query language 
reference"); - println!(" examples - Example queries"); - println!(); - println!("Usage: plotnik docs "); - } - Some("reference") => { - println!("{}", include_str!("../../docs/lang-reference.md")); - } - Some("examples") => { - println!("(examples not yet written)"); - } - Some(other) => { - eprintln!("Unknown help topic: {}", other); - eprintln!("Run 'plotnik docs' to see available topics"); - std::process::exit(1); - } - } -} diff --git a/crates/plotnik-cli/src/commands/mod.rs b/crates/plotnik-cli/src/commands/mod.rs index f6f0efd5..7118f82a 100644 --- a/crates/plotnik-cli/src/commands/mod.rs +++ b/crates/plotnik-cli/src/commands/mod.rs @@ -1,5 +1,4 @@ pub mod debug; -pub mod docs; pub mod exec; pub mod langs; pub mod types; diff --git a/crates/plotnik-cli/src/main.rs b/crates/plotnik-cli/src/main.rs index 63cbedff..a0184782 100644 --- a/crates/plotnik-cli/src/main.rs +++ b/crates/plotnik-cli/src/main.rs @@ -33,9 +33,6 @@ fn main() { color: output.color.should_colorize(), }); } - Command::Docs { topic } => { - commands::docs::run(topic.as_deref()); - } Command::Langs => { commands::langs::run(); }