diff --git a/crates/gnomon-parser/src/lexer.rs b/crates/gnomon-parser/src/lexer.rs index 54143ca..1f79c6f 100644 --- a/crates/gnomon-parser/src/lexer.rs +++ b/crates/gnomon-parser/src/lexer.rs @@ -1,12 +1,25 @@ +use std::ops::Range; + use logos::Logos; use crate::syntax_kind::SyntaxKind; /// A single token produced by the lexer. +/// +/// Tokens store a byte range into the original source string rather than +/// owning a copy of the text. This avoids one heap allocation per token. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Token { pub kind: SyntaxKind, - pub text: String, + pub span: Range<usize>, +} + +impl Token { + /// Get the token text by slicing the original source string. + #[inline] + pub fn text<'a>(&self, source: &'a str) -> &'a str { + &source[self.span.clone()] + } } /// Internal logos token enum. Maps 1:1 to the token variants of `SyntaxKind` @@ -201,17 +214,19 @@ impl LogosToken { /// Tokenize the input string into a sequence of tokens. /// Unrecognized bytes produce `ERROR` tokens. +/// +/// Each token stores a byte range referencing the original `input` string. 
pub fn lex(input: &str) -> Vec<Token> { let mut tokens = Vec::new(); let mut lexer = LogosToken::lexer(input); while let Some(result) = lexer.next() { - let text = lexer.slice().to_string(); + let span = lexer.span(); let kind = match result { Ok(tok) => tok.to_syntax_kind(), Err(()) => SyntaxKind::ERROR, }; - tokens.push(Token { kind, text }); + tokens.push(Token { kind, span }); } tokens @@ -223,15 +238,10 @@ mod tests { fn kinds(input: &str) -> Vec<(SyntaxKind, &str)> { let tokens = lex(input); - // Re-lex from input to get &str slices for comparison - let mut result = Vec::new(); - let mut pos = 0; - for tok in &tokens { - let end = pos + tok.text.len(); - result.push((tok.kind, &input[pos..end])); - pos = end; - } - result + tokens + .iter() + .map(|tok| (tok.kind, tok.text(input))) + .collect() } // ── Ambiguity resolution ───────────────────────────────────── diff --git a/crates/gnomon-parser/src/lib.rs b/crates/gnomon-parser/src/lib.rs index a50c432..2ac6432 100644 --- a/crates/gnomon-parser/src/lib.rs +++ b/crates/gnomon-parser/src/lib.rs @@ -59,7 +59,7 @@ impl Parse { pub fn parse(source: &str) -> Parse { let preprocessed = preprocess::preprocess(source); let tokens = lexer::lex(&preprocessed); - let parser = parser::Parser::new(tokens); + let parser = parser::Parser::new(tokens, preprocessed); let (green_node, errors) = parser.parse(); Parse { green_node, errors } } diff --git a/crates/gnomon-parser/src/parser.rs b/crates/gnomon-parser/src/parser.rs index 92f59c7..a26e246 100644 --- a/crates/gnomon-parser/src/parser.rs +++ b/crates/gnomon-parser/src/parser.rs @@ -16,21 +16,21 @@ pub struct ParseError { pub struct Parser { tokens: Vec<Token>, + /// The original source string; tokens reference it via byte ranges. + source: String, pos: usize, builder: GreenNodeBuilder<'static>, errors: Vec<ParseError>, - /// Cumulative byte offset up to (but not including) `tokens[pos]`. 
- offset: usize, } impl Parser { - pub fn new(tokens: Vec<Token>) -> Self { + pub fn new(tokens: Vec<Token>, source: String) -> Self { Self { tokens, + source, pos: 0, builder: GreenNodeBuilder::new(), errors: Vec::new(), - offset: 0, } } @@ -80,7 +80,7 @@ impl Parser { continue; } if remaining == 0 { - return &self.tokens[pos].text; + return self.tokens[pos].text(&self.source); } remaining -= 1; pos += 1; @@ -108,8 +108,7 @@ impl Parser { fn skip_trivia(&mut self) { while self.pos < self.tokens.len() && self.tokens[self.pos].kind.is_trivia() { let tok = &self.tokens[self.pos]; - self.builder.token(tok.kind.into(), &tok.text); - self.offset += tok.text.len(); + self.builder.token(tok.kind.into(), tok.text(&self.source)); self.pos += 1; } } @@ -119,8 +118,7 @@ impl Parser { self.skip_trivia(); if self.pos < self.tokens.len() { let tok = &self.tokens[self.pos]; - self.builder.token(tok.kind.into(), &tok.text); - self.offset += tok.text.len(); + self.builder.token(tok.kind.into(), tok.text(&self.source)); self.pos += 1; } } @@ -132,8 +130,7 @@ impl Parser { self.skip_trivia(); if self.pos < self.tokens.len() { let tok = &self.tokens[self.pos]; - self.builder.token(kind.into(), &tok.text); - self.offset += tok.text.len(); + self.builder.token(kind.into(), tok.text(&self.source)); self.pos += 1; } } @@ -175,16 +172,14 @@ impl Parser { fn current_range(&self) -> std::ops::Range<usize> { // Skip trivia to find the actual next token range let mut pos = self.pos; - let mut off = self.offset; while pos < self.tokens.len() && self.tokens[pos].kind.is_trivia() { - off += self.tokens[pos].text.len(); pos += 1; } if pos < self.tokens.len() { - let len = self.tokens[pos].text.len(); - off..off + len + self.tokens[pos].span.clone() } else { - off..off + let end = self.source.len(); + end..end } }