From 773b97eae9669e57e93ab615ca700b5ca0f5a1c6 Mon Sep 17 00:00:00 2001
From: John Lapeyre <1969884+jlapeyre@users.noreply.github.com>
Date: Thu, 27 Nov 2025 19:39:56 -0500
Subject: [PATCH] Add comments on parser structs

---
 crates/oq3_lexer/src/lib.rs        |  2 ++
 crates/oq3_parser/src/input.rs     |  5 +++++
 crates/oq3_parser/src/lexed_str.rs | 12 ++++++++++++
 crates/oq3_parser/src/shortcuts.rs |  2 ++
 crates/oq3_syntax/src/parsing.rs   |  4 ++++
 5 files changed, 25 insertions(+)

diff --git a/crates/oq3_lexer/src/lib.rs b/crates/oq3_lexer/src/lib.rs
index eda07a9..c895d3e 100644
--- a/crates/oq3_lexer/src/lib.rs
+++ b/crates/oq3_lexer/src/lib.rs
@@ -38,6 +38,8 @@ use unicode_properties::UnicodeEmoji;
 #[derive(Debug)]
 pub struct Token {
     pub kind: TokenKind,
+
+    /// The length in bytes of text associated with `kind`.
     pub len: u32,
 }
 
diff --git a/crates/oq3_parser/src/input.rs b/crates/oq3_parser/src/input.rs
index 6a1338d..d6664ad 100644
--- a/crates/oq3_parser/src/input.rs
+++ b/crates/oq3_parser/src/input.rs
@@ -9,6 +9,8 @@ use crate::SyntaxKind;
 type bits = u64;
 
 // FIXME GJL `LexerToken` does not appear anywhere in the r-a project.
+// `LexerToken` seems to refer to output of `oq3_lexer::tokenize` and `LexedStr`
+// (also present in r-a). These *do* preserve whitespace and comments.
 /// Input for the parser -- a sequence of tokens.
 ///
 /// As of now, parser doesn't have access to the *text* of the tokens, and makes
@@ -18,7 +20,10 @@ type bits = u64;
 /// Struct of arrays internally, but this shouldn't really matter.
 #[derive(Default)]
 pub struct Input {
+    /// SyntaxKind has u16 variants
     kind: Vec<SyntaxKind>,
+
+    /// Account for whitespace/comments dropped on construction
     joint: Vec<bits>,
     contextual_kind: Vec<SyntaxKind>,
 }
diff --git a/crates/oq3_parser/src/lexed_str.rs b/crates/oq3_parser/src/lexed_str.rs
index 7b467ce..fc125c4 100644
--- a/crates/oq3_parser/src/lexed_str.rs
+++ b/crates/oq3_parser/src/lexed_str.rs
@@ -20,14 +20,26 @@ use crate::{
 };
 
 pub struct LexedStr<'a> {
+    /// The input source text
     text: &'a str,
+
+    /// Stores translation of stream of `Token`s
     kind: Vec<SyntaxKind>,
+
+    /// Byte offset for start of each text span tagged in `kind`.
+    /// `start.len() == kind.len()`.
     start: Vec<u32>,
+
+    /// `Token` flagged as invalid produce a `LexError` as well as a `SyntaxKind`.
     error: Vec<LexError>,
 }
 
+// TODO: Might be good to replace `msg` with a small `enum`.
 struct LexError {
+    /// One of a small set of error messages.
     msg: String,
+
+    /// Index into `LexedStr.kind`
     token: u32,
 }
 
diff --git a/crates/oq3_parser/src/shortcuts.rs b/crates/oq3_parser/src/shortcuts.rs
index 668416b..c6490a3 100644
--- a/crates/oq3_parser/src/shortcuts.rs
+++ b/crates/oq3_parser/src/shortcuts.rs
@@ -28,12 +28,14 @@ pub enum StrStep<'a> {
 }
 
 impl LexedStr<'_> {
+    // `was_joint` is used to fix index into text when omitting whitespace/comments
     pub fn to_input(&self) -> crate::Input {
         let mut res = crate::Input::default();
         let mut was_joint = false;
         for i in 0..self.len() {
             let kind = self.kind(i);
             if kind.is_trivia() {
+                // whitespace or comment
                 was_joint = false
             } else {
                 if kind == SyntaxKind::IDENT {
diff --git a/crates/oq3_syntax/src/parsing.rs b/crates/oq3_syntax/src/parsing.rs
index 1b9f72d..3fd4942 100644
--- a/crates/oq3_syntax/src/parsing.rs
+++ b/crates/oq3_syntax/src/parsing.rs
@@ -20,6 +20,10 @@ pub fn parse_text(openqasm_code_text: &str) -> (GreenNode, Vec<SyntaxError>) {
 /// Lex `openqasm_code_text`. If there are no lexing errors, parse the result
 /// returning the AST as `Option<SyntaxNode>`, as well as errors.
 /// If lexing errors do occur, do no parsing, but rather, return the lexing errors.
+///
+/// `LexedStr::new` calls `oq3_parser::tokenize(..)` to produce a stream of `Token`s.
+/// `LexedStr::new` translates this stream into `Vec<SyntaxKind>` plus offset and error information.
+/// Data from previous step is converted to `Input`.
 pub fn parse_text_check_lex(openqasm_code_text: &str) -> (Option<SyntaxNode>, Vec<SyntaxError>) {
     let lexed = oq3_parser::LexedStr::new(openqasm_code_text);
     if !lexed.errors_is_empty() {