diff --git a/Cargo.lock b/Cargo.lock
index 2576022..b1ee25a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -805,6 +805,15 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "lexer"
+version = "0.1.0"
+dependencies = [
+ "cosmoflow 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.172"
diff --git a/cookbook/lexer/Cargo.toml b/cookbook/lexer/Cargo.toml
new file mode 100644
index 0000000..0a9a973
--- /dev/null
+++ b/cookbook/lexer/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "lexer"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+cosmoflow = "0.5.1"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
diff --git a/cookbook/lexer/README.md b/cookbook/lexer/README.md
new file mode 100644
index 0000000..7969c71
--- /dev/null
+++ b/cookbook/lexer/README.md
@@ -0,0 +1,524 @@
+# CosmoFlow Lexer with Nested Flows (DFA-based)
+
+A lexer built with the CosmoFlow workflow engine, showing how to map Deterministic Finite Automaton (DFA) concepts onto CosmoFlow's nested flow functionality. The implementation demonstrates advanced workflow composition, using sub-flows for maximum modularity and extensibility.
+
+## Design Philosophy
+
+Traditional lexers use DFAs for lexical analysis:
+- Each state represents a specific stage in the parsing process
+- State transitions are driven by input characters
+- Final states produce tokens
+
+Our design maps this concept onto CosmoFlow using **nested flows**:
+- **DFA States** → **CosmoFlow Sub-flows**
+- **State Transitions** → **CosmoFlow Actions and Routes**
+- **Input Processing** → **Lexical Context in Shared Store**
+
+## Architecture Design
+
+### Three-Layer Hierarchical DFA Architecture
+
+The lexer implements a **three-layer hierarchical DFA architecture** using CosmoFlow's nested flows, where each layer represents a different level of abstraction:
+
+#### Layer 1: Main Dispatcher DFA
+- **Purpose**: High-level token type detection and routing
+- **Responsibility**: Analyze the current character and route to the appropriate token-specific sub-flow
+- **Implementation**: A single dispatcher node with character-based routing logic
+
+#### Layer 2: Token-Type Sub-flows
+- **Purpose**: Token-specific processing logic
+- **Responsibility**: Handle the complete lifecycle of each token type
+- **Implementation**: A dedicated sub-flow per token type (identifier, integer, string, etc.)
+- **Features**: Some flows include multi-stage processing (e.g., identifier → classification)
+
+#### Layer 3: Character Collection Sub-flows
+- **Purpose**: Fine-grained character collection and validation
+- **Responsibility**: Implement the actual DFA logic for character processing
+- **Implementation**: Nested sub-flows within token collectors
+- **Advantage**: Maximum modularity; even character collection logic is a composable flow
+
+### Core Components
+
+#### 1. Token Types (TokenType)
+```rust
+pub enum TokenType {
+    Identifier, // Identifiers (variable names, function names, etc.)
+    Integer,    // Integer literals
+    String,     // String literals (enclosed in quotes)
+    Keyword,    // Keywords (if, else, while, etc.)
+    Operator,   // Operators (+, -, *, /, =, ==, etc.)
+    Delimiter,  // Delimiters: ( ) { } [ ] ; , etc.
+    Whitespace, // Whitespace (spaces, tabs, newlines)
+    Comment,    // Comments
+    Unknown,    // Unknown/invalid token
+    EndOfInput, // End of input
+}
+```
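+
+Each recognized lexeme is wrapped in a `Token` that records its type, text, and source position; this mirrors the definition in `src/main.rs` shown later in this diff:
+
+```rust
+pub struct Token {
+    pub token_type: TokenType,
+    pub lexeme: String,
+    pub line: u32,
+    pub column: u32,
+}
+```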
+
+#### 2. Lexical Analysis Context (LexerContext)
+State information stored in CosmoFlow's shared store:
+```rust
+pub struct LexerContext {
+    pub input: String,      // Input string
+    pub position: usize,    // Current position
+    pub line: u32,          // Current line number
+    pub column: u32,        // Current column number
+    pub tokens: Vec<Token>, // Generated token list
+}
+```
+
+### Three-Layer Nested Flow Architecture
+
+The lexer uses a **hierarchical dispatcher-based architecture** with **three levels of nested sub-flows**:
+
+#### 1. Main Flow Structure (Layer 1)
+```
+Dispatcher → Token Sub-flow → Return to Dispatcher → Repeat → End
+```
+
+#### 2. Token Sub-flow Structure (Layer 2)
+```
+Collector → [Optional Processing] → Complete
+    ↓
+Character Collection Sub-flow (Layer 3)
+```
+
+#### 3. Character Collection Sub-flow Structure (Layer 3)
+```
+Start → Character Processing Loop → Complete
+```
+
+#### Example: Identifier Processing Flow
+```
+Layer 1: DISPATCHER ──(identifier_flow)──> IDENTIFIER_SUB_FLOW
+                                                  |
+Layer 2:             COLLECTOR ──> CLASSIFIER ──> COMPLETE
+                         |
+Layer 3:       CHAR_COLLECTION_SUB_FLOW
+                         |
+           (alphanumeric/underscore loop)
+```
+
+#### 4. Dispatcher Node (DispatcherNode)
+- **Function**: Analyzes the current character and determines which token-specific sub-flow to invoke
+- **Routing Logic**:
+  - Whitespace characters → `whitespace_flow`
+  - Letters/underscore → `identifier_flow`
+  - Digits → `integer_flow`
+  - Quotes → `string_flow`
+  - Operators → `operator_flow`
+  - Delimiters → `delimiter_flow`
+  - `//` → `comment_flow`
+  - Others → `unknown_flow`
+  - End of input → `end_of_input`
+
+#### 5. Token-Specific Sub-flows (Layer 2)
+Each token type is handled by a dedicated sub-flow that coordinates character collection and processing:
+
+- **`whitespace_flow`**: Direct character collection (simple case)
+- **`identifier_flow`**: Character collection → classification (keyword/identifier)
+- **`integer_flow`**: Character collection → token creation
+- **`string_flow`**: Character collection with escape sequence handling
+- **`operator_flow`**: Character collection with multi-character operator detection
+- **`delimiter_flow`**: Single character collection
+- **`comment_flow`**: Character collection until newline
+- **`unknown_flow`**: Single unknown character handling
+
+#### 6. Character Collection Sub-flows (Layer 3)
+Each collector node uses its own dedicated character collection sub-flow:
+
+```rust
+// Example: IdentifierCollectorNode contains
+struct IdentifierCollectorNode {
+    char_collector_flow: Flow<MemoryStorage>, // Layer 3 sub-flow
+}
+
+// Character collection sub-flow structure:
+FlowBuilder::new()
+    .start_node("collect_all")
+    .node("collect_all", IdentifierCharCollectorNode) // DFA logic
+    .terminal_route("collect_all", "complete")
+    .build()
+```
+
+#### 7. Sub-flow Internal Structure (Layer 2 Example)
+Each token sub-flow is a complete Flow with its own nodes and routes:
+
+```rust
+// Example: identifier_flow (Layer 2)
+FlowBuilder::new()
+    .start_node("collect")
+    .node("collect", IdentifierCollectorNode::new()) // Contains Layer 3 sub-flow
+    .node("classify", IdentifierClassifierNode)      // Classification logic
+    .route("collect", "classify", "classify")
+    .terminal_route("classify", "complete")
+    .build()
+```
+
+### Workflow Routing Design
+
+```
+Dispatcher → Sub-flow → Return → Dispatcher → ... → End
+```
+
+#### Main Flow Routes:
+```rust
+// From dispatcher to sub-flows
+.route("dispatch", "whitespace_flow", "whitespace_flow")
+.route("dispatch", "identifier_flow", "identifier_flow")
+.route("dispatch", "integer_flow", "integer_flow")
+// ... other routes
+
+// From sub-flows back to dispatcher
+.route("whitespace_flow", "complete", "return_to_dispatcher")
+.route("identifier_flow", "complete", "return_to_dispatcher")
+// ... other returns
+
+// Continuation loop
+.route("return_to_dispatcher", "dispatch", "dispatch")
+
+// Terminal route
+.terminal_route("end_of_input", "complete")
+```
+
+## Usage Example
+
+```rust
+use lexer::CosmoFlowLexer;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut lexer = CosmoFlowLexer::new();
+
+    let input = r#"
+        fn main() {
+            let x = 42;
+            if x > 0 {
+                println!("Hello, world!");
+            }
+        }
+    "#;
+
+    let tokens = lexer.tokenize(input)?;
+
+    for (i, token) in tokens.iter().enumerate() {
+        println!("{:3}: {:?}", i, token);
+    }
+
+    Ok(())
+}
+```
+
+## Sample Output
+
+```
+  0: Token { token_type: Whitespace, lexeme: "\n        ", line: 2, column: 9 }
+  1: Token { token_type: Keyword, lexeme: "fn", line: 2, column: 11 }
+  2: Token { token_type: Whitespace, lexeme: " ", line: 2, column: 12 }
+  3: Token { token_type: Identifier, lexeme: "main", line: 2, column: 16 }
+  4: Token { token_type: Delimiter, lexeme: "(", line: 2, column: 17 }
+  5: Token { token_type: Delimiter, lexeme: ")", line: 2, column: 18 }
+  ...
+```
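+
+To sanity-check the pipeline end to end, a small unit test can assert on the token stream. A minimal sketch (a hypothetical test, not part of this cookbook's code) that could sit at the bottom of `src/main.rs`:
+
+```rust
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn classifies_keywords_and_integers() {
+        let mut lexer = CosmoFlowLexer::new();
+        let tokens = lexer.tokenize("let x = 42;").expect("tokenize failed");
+
+        // "let" is in the keyword table, "42" is an integer literal,
+        // and EndOfInputNode always appends a final EndOfInput token.
+        assert_eq!(tokens[0].token_type, TokenType::Keyword);
+        assert!(tokens
+            .iter()
+            .any(|t| t.token_type == TokenType::Integer && t.lexeme == "42"));
+        assert_eq!(tokens.last().unwrap().token_type, TokenType::EndOfInput);
+    }
+}
+```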
+
+## Complete DFA Architecture Diagram
+
+Our lexer implements a **three-layer hierarchical DFA architecture** using CosmoFlow's nested flows:
+
+### Layer 1: Main Dispatcher DFA
+
+```
+                [START]
+                   |
+                   v
+             [DISPATCHER]
+                   |
+  ┌────────────────┼────────────────┐
+  |                |                |
+  v                v                v
+[whitespace] → [identifier] → [integer] → ... (other token flows)
+  |                |                |
+  v                v                v
+     [return_to_dispatcher] ◄───────┘
+                   |
+                   v
+             [DISPATCHER] ──(loop back until input is exhausted)
+                   |
+                   v
+           [end_of_input] ──────────────> [END]
+```
+
+### Layer 2: Token-Type Sub-flows
+
+#### Identifier Flow DFA
+```
+[START] → [IdentifierCollector] → [IdentifierClassifier] → [COMPLETE]
+                  |                          |
+                  v                          v
+  (uses char collector sub-flow)    (keyword/identifier)
+```
+
+#### Integer Flow DFA
+```
+[START] → [IntegerCollector] → [COMPLETE]
+                  |
+                  v
+  (uses char collector sub-flow)
+```
+
+#### String Flow DFA
+```
+[START] → [StringCollector] → [COMPLETE]
+                  |
+                  v
+  (uses char collector sub-flow)
+```
+
+#### Other Flows (Operator, Delimiter, Comment)
+```
+[START] → [Collector] → [COMPLETE]
+               |
+               v
+  (uses char collector sub-flow)
+```
+
+### Layer 3: Character Collection Sub-flows
+
+#### Identifier Character Collection DFA
+```
+        [START]
+           |
+           v
+     [collect_all]
+           |
+   ┌───────┼──────────────────┐
+   | (alphanumeric or '_')    |
+   v                          |
+ [add char & advance] ────────┘
+   |
+   v (not alphanumeric/underscore)
+ [COMPLETE]
+```
+
+#### Integer Character Collection DFA
+```
+        [START]
+           |
+           v
+     [collect_all]
+           |
+   ┌───────┼──────────────────┐
+   |   (is digit)             |
+   v                          |
+ [add char & advance] ────────┘
+   |
+   v (not digit)
+ [COMPLETE]
+```
+
+#### String Character Collection DFA
+```
+        [START]
+           |
+           v
+     [collect_all]
+           |
+           v
+ [consume opening quote]
+           |
+   ┌───────┼──────────────────┐
+   | (not closing quote)      |
+   v                          |
+ [handle escape & ────────────┘
+  add char & advance]
+   |
+   v (closing quote found)
+ [COMPLETE]
+```
+
+#### Operator Character Collection DFA
+```
+        [START]
+           |
+           v
+     [collect_all]
+           |
+           v
+ [consume first char]
+           |
+           v
+ [check for two-char operators]
+      |             |
+ (==, !=, etc.)     | (single char)
+      |             |
+      v             v
+ [consume second]  [keep first only]
+      |             |
+      └──────┬──────┘
+             |
+             v
+        [COMPLETE]
+```
+
+#### Comment Character Collection DFA
+```
+        [START]
+           |
+           v
+     [collect_all]
+           |
+   ┌───────┼──────────────────┐
+   |  (not newline)           |
+   v                          |
+ [add char & advance] ────────┘
+   |
+   v (newline found)
+ [COMPLETE]
+```
+
+#### Delimiter Character Collection DFA
+```
+        [START]
+           |
+           v
+     [collect_all]
+           |
+           v
+ [consume single char]
+           |
+           v
+      [COMPLETE]
+```
+
+### Complete Flow Integration
+
+```
+Main Flow:
+  DISPATCHER ──(route)──> Token Sub-flow ──(complete)──> RETURN_TO_DISPATCHER
+      ^                                                         |
+      |                                                         |
+      └─────────────────────(dispatch)─────────────────────────┘
+
+Token Sub-flow:
+  COLLECTOR ──(uses)──> Character Collection Sub-flow ──(complete)──> [Next Node or COMPLETE]
+
+Character Collection Sub-flow:
+  Specific character collection logic based on token type
+```
+
+### Dispatcher Routing Logic
+
+```text
+match current_char {
+    ' ' | '\t' | '\n' | '\r'          → whitespace_flow
+    'a'..='z' | 'A'..='Z' | '_'       → identifier_flow
+    '0'..='9'                         → integer_flow
+    '"'                               → string_flow
+    '/' if next_char == '/'           → comment_flow
+    '+' | '-' | '*' | '/' | '=' | ... → operator_flow
+    '(' | ')' | '{' | '}' | ...       → delimiter_flow
+    _                                 → unknown_flow
+    EOF                               → end_of_input
+}
+```
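+
+In the implementation, this decision table is an ordinary Rust `match` in `DispatcherNode::post` (see `src/nodes.rs`); the end-of-input case is checked first via `is_at_end()`, and each arm returns the route name as an `Action`:
+
+```rust
+match lexer_ctx.current_char() {
+    Some(' ') | Some('\t') | Some('\n') | Some('\r') => {
+        Ok(Action::simple("whitespace_flow"))
+    }
+    Some('a'..='z') | Some('A'..='Z') | Some('_') => Ok(Action::simple("identifier_flow")),
+    Some('0'..='9') => Ok(Action::simple("integer_flow")),
+    Some('"') => Ok(Action::simple("string_flow")),
+    Some('/') if lexer_ctx.peek_char() == Some('/') => Ok(Action::simple("comment_flow")),
+    Some('+') | Some('-') | Some('*') | Some('/') | Some('=') | Some('<') | Some('>')
+    | Some('!') => Ok(Action::simple("operator_flow")),
+    Some('(') | Some(')') | Some('{') | Some('}') | Some('[') | Some(']') | Some(';')
+    | Some(',') | Some('.') => Ok(Action::simple("delimiter_flow")),
+    _ => Ok(Action::simple("unknown_flow")),
+}
+```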
+
+## Technical Advantages
+
+### 1. Three-Layer Hierarchical Design with Nested Flows
+- **Layer 1**: High-level token type routing and flow orchestration
+- **Layer 2**: Token-specific processing logic with dedicated sub-flows
+- **Layer 3**: Fine-grained character collection DFAs as composable sub-flows
+- **Benefits**: Maximum modularity; each layer has clear responsibilities
+- Easy to extend with new token types at any layer
+- Clear separation of concerns across all three layers
+- State transition logic is explicit and maintainable at each level
+
+### 2. Type Safety
+- Leverages Rust's type system to ensure correctness of state transitions
+- CosmoFlow's shared store ensures data consistency
+- Compile-time guarantees for workflow structure
+
+### 3. Extensibility
+- Easy to add new token types and corresponding sub-flows
+- Support for complex state transition logic
+- Conditional routing can express more sophisticated syntax rules
+- Sub-flows can be independently tested and developed
+
+### 4. Observability
+- CosmoFlow provides execution path tracking
+- Easy to debug and analyze the lexical analysis process
+- Can record processing time and results for each state
+- Clear flow visualization capabilities
+
+### 5. Error Handling
+- Built-in retry mechanisms
+- Detailed error information with context
+- Graceful error recovery
+- Isolated error handling within sub-flows
+
+### 6. Performance Benefits
+- Efficient nested flow execution
+- Minimal overhead from CosmoFlow's runtime
+- Clear execution path optimization opportunities
+- Memory-efficient token processing
+
+## Advanced Features
+
+### Three-Layer Sub-flow Composition
+This implementation demonstrates CosmoFlow's most advanced sub-flow composition capabilities:
+
+#### Hierarchical Workflows (3 Levels)
+- **Level 1**: Main flow orchestrates token-type sub-flows
+- **Level 2**: Token sub-flows orchestrate character collection sub-flows
+- **Level 3**: Character collection sub-flows implement pure DFA logic
+- **Result**: True hierarchical state machine composition
+
+#### Reusable Components at Every Layer
+- **Layer 1**: Dispatcher pattern reusable for other parsing tasks
+- **Layer 2**: Token sub-flows reusable across different language lexers
+- **Layer 3**: Character collection sub-flows reusable for any string processing
+- **Cross-Layer**: Any sub-flow can be extracted and reused independently
+
+#### Independent Testing and Development
+- **Each Layer**: Can be tested in complete isolation
+- **Character Collection**: Pure DFA logic easily unit tested
+- **Token Processing**: Integration tested with mock character collectors
+- **Main Flow**: End-to-end tested with all layers integrated
+- **Incremental Development**: Add new token types without touching existing layers
+
+### Advanced DFA State Machine Mapping
+The three-layer nested flow architecture provides a direct mapping from traditional DFA concepts:
+
+#### Traditional DFA → Three-Layer CosmoFlow Mapping
+- **DFA States** → **Three types of sub-flows** (dispatcher, token, character)
+- **State Transitions** → **Inter-layer and intra-layer routes**
+- **Acceptance States** → **Terminal routes at appropriate layers**
+- **Error States** → **Error handling sub-flows at each layer**
+- **Nested DFAs** → **True nested sub-flows** (DFA within DFA within DFA)
+
+#### Complex State Machine Benefits
+- **Composability**: DFA logic can be composed at multiple levels
+- **Extensibility**: New states can be added at the appropriate layer (see the sketch below)
+- **Maintainability**: State logic is isolated within appropriate abstractions
+- **Reusability**: DFA components can be reused across different contexts
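+
+For example, adding a hypothetical `Float` token type would touch one layer at a time. A minimal sketch, assuming the same `Flow<MemoryStorage>` signature used in `src/flows.rs`; the names `Float`, `FloatCollectorNode`, and `create_float_flow` are illustrative and not part of this example:
+
+```rust
+// 1. main.rs: add a `Float` variant to TokenType.
+// 2. nodes.rs: add a FloatCollectorNode that collects digits plus one '.',
+//    following the same Node<S> pattern as IntegerCollectorNode.
+// 3. flows.rs: wrap it in a Layer 2 sub-flow.
+pub fn create_float_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", FloatCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
+// 4. main.rs: register the sub-flow in the main flow.
+//    .node("float_flow", create_float_flow())
+//    .route("dispatch", "float_flow", "float_flow")
+//    .route("float_flow", "complete", "return_to_dispatcher")
+// 5. nodes.rs: route to it from DispatcherNode when a digit run contains '.'.
+```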
+
+## Running the Program
+
+```bash
+cd cookbook/lexer
+cargo run
+```
+
+## Possible Extensions
+
+1. **Syntax analysis**: Extend into a full parser using the same DFA → CosmoFlow mapping
+2. **Multi-language support**: Support multiple programming languages by configuring different nodes and routes
+3. **Incremental parsing**: Use CosmoFlow's state management to implement incremental lexical analysis
+4. **Parallel processing**: Use CosmoFlow's async features to run lexical analysis in parallel
+5. **IDE integration**: Integrate into editors to provide real-time syntax highlighting and error detection
+
+## Conclusion
+
+This example shows how a classic compiler-construction concept (the DFA) can be combined with a modern workflow engine (CosmoFlow) to build a modular, extensible, type-safe lexer. The same design pattern applies to other scenarios that need state machines, such as protocol parsing and game AI state management.
diff --git a/cookbook/lexer/dfa_diagram.md b/cookbook/lexer/dfa_diagram.md
new file mode 100644
index 0000000..c328f94
--- /dev/null
+++ b/cookbook/lexer/dfa_diagram.md
@@ -0,0 +1,172 @@
+# Lexer DFA State Transition Diagram (Nested Flow Architecture)
+
+## Overall Nested Flow Architecture
+
+```
+                    ┌─────────────────┐
+                    │   MAIN FLOW     │
+                    │    (LEXER)      │
+                    └─────────┬───────┘
+                              │
+                              ▼
+                    ┌─────────────────┐
+                    │   DISPATCHER    │◄─────────────┐
+                    │      NODE       │              │
+                    └─────────┬───────┘              │
+                              │                      │
+            ┌─────────────────┼─────────────────┐    │
+            │                 │                 │    │
+            ▼                 ▼                 ▼    │
+  ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
+  │   WHITESPACE    │ │   IDENTIFIER    │ │    INTEGER      │
+  │    SUB-FLOW     │ │    SUB-FLOW     │ │    SUB-FLOW     │
+  └─────────┬───────┘ └─────────┬───────┘ └─────────┬───────┘
+            │                   │                   │
+            └───────────────────┼───────────────────┘
+                                │
+            ┌───────────────────┼───────────────────┐
+            │                   │                   │
+            ▼                   ▼                   ▼
+  ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
+  │     STRING      │ │    OPERATOR     │ │   DELIMITER     │
+  │    SUB-FLOW     │ │    SUB-FLOW     │ │    SUB-FLOW     │
+  └─────────┬───────┘ └─────────┬───────┘ └─────────┬───────┘
+            │                   │                   │
+            └───────────────────┼───────────────────┘
+                                │
+            ┌───────────────────┼───────────────────┐
+            │                   │                   │
+            ▼                   ▼                   ▼
+  ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
+  │    COMMENT      │ │    UNKNOWN      │ │  END_OF_INPUT   │
+  │    SUB-FLOW     │ │    SUB-FLOW     │ │      NODE       │
+  └─────────┬───────┘ └─────────┬───────┘ └─────────┬───────┘
+            │                   │                   │
+            └─────────┬─────────┘                   │
+                      ▼                             ▼
+            ┌─────────────────┐           ┌─────────────────┐
+            │   RETURN_TO_    │           │    TERMINAL     │
+            │   DISPATCHER    │           │   (COMPLETE)    │
+            └─────────┬───────┘           └─────────────────┘
+                      │
+                      └──────────(back to DISPATCHER)
+```
+
+## Dispatcher Logic
+
+The dispatcher analyzes the current character and routes to the appropriate sub-flow:
+
+```
+Current Character Analysis:
+├── Whitespace (' ', '\t', '\n', '\r') → whitespace_flow
+├── Letter or '_' → identifier_flow
+├── Digit → integer_flow
+├── Quote ('"') → string_flow
+├── '/' (check next char for '//' comment) → comment_flow or operator_flow
+├── Other operators (+, -, *, =, <, >, !) → operator_flow
+├── Delimiters ((, ), {, }, [, ], ;, ,, .) → delimiter_flow
+├── End of input → end_of_input
+└── Others → unknown_flow
+```
+
+## Detailed Sub-flow DFAs
+
+### 1. WHITESPACE Sub-flow DFA
+```
+START ──[whitespace]──► COLLECT ──[whitespace]──► COLLECT
+                           │
+                           └──[other]──► COMPLETE
+```
+
+### 2. IDENTIFIER Sub-flow DFA
+```
+START ──[a-zA-Z_]──► COLLECT ──[a-zA-Z0-9_]──► COLLECT
+                        │
+                        └──[other]──► CLASSIFY ──► COMPLETE
+                                         │
+                                         ├── keyword → Keyword Token
+                                         └── other   → Identifier Token
+```
+
+### 3. INTEGER Sub-flow DFA
+```
+START ──[0-9]──► COLLECT ──[0-9]──► COLLECT
+                    │
+                    └──[other]──► COMPLETE
+```
+
+### 4. STRING Sub-flow DFA
+```
+START ──["]──► COLLECT ──[char≠"]──► COLLECT ──["]──► COMPLETE
+                  │                     │
+                  │                     └──[\]──► ESCAPE ──[any]──┐
+                  │                                               │
+                  └───────────────────────────────────────────────┘
+```
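+
+The ESCAPE state in this diagram is implemented with a simple flag in `StringCollectorNode` (`src/nodes.rs`):
+
+```rust
+let mut escaped = false;
+while let Some(ch) = lexer_ctx.current_char() {
+    lexeme.push(ch);
+    lexer_ctx.advance();
+
+    if escaped {
+        escaped = false;     // the escaped character is consumed verbatim
+    } else if ch == '\\' {
+        escaped = true;      // enter the ESCAPE state
+    } else if ch == '"' {
+        break;               // closing quote ends the string
+    }
+}
+```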
+
+### 5. OPERATOR Sub-flow DFA
+```
+START ──[op_char]──► CHECK_SECOND ──[valid_second_char]──► COMPLETE (two-char op)
+                        │
+                        │   Examples: == != <= >= ++ -- && ||
+                        │
+                        └──[other]──► COMPLETE (single-char op)
+```
+
+### 6. DELIMITER Sub-flow DFA
+```
+START ──[delimiter]──► COLLECT ──► COMPLETE
+```
+
+### 7. COMMENT Sub-flow DFA
+```
+START ──[/]──► FIRST_SLASH ──[/]──► COLLECT_LINE ──[char≠\n]──► COLLECT_LINE
+                  │                      │
+                  │                      └──[\n or EOF]──► COMPLETE
+                  └──[other]──► ERROR (invalid comment start)
+```
+
+### 8. UNKNOWN Sub-flow DFA
+```
+START ──[any]──► COLLECT ──► COMPLETE
+```
+
+## Main Flow Execution Cycle
+
+```
+DISPATCHER → SUB-FLOW → RETURN_TO_DISPATCHER → DISPATCHER → ...
+     │
+     └──[end_of_input]──► END_OF_INPUT ──► COMPLETE
+```
+
+## Nested Flow Benefits
+
+### Modularity
+- Each token type is handled by an independent sub-flow
+- Sub-flows can be developed, tested, and maintained separately
+- Easy to add new token types without modifying existing flows
+
+### Composability
+- Sub-flows can be reused in different contexts
+- The main flow orchestrates sub-flows without knowing their internal structure
+- Clear separation between dispatcher logic and token processing
+
+### Extensibility
+- New token types can be added by creating new sub-flows
+- Existing sub-flows can be enhanced without affecting others
+- Complex token processing can be implemented with multi-stage sub-flows
+
+### Maintainability
+- Clear flow structure and execution paths
+- Isolated concerns and responsibilities
+- Easy debugging and testing at the sub-flow level
+
+## Sub-flow Return Handling
+
+Each sub-flow completes and returns control to the dispatcher, which continues processing the next character until EOF is reached.
+
+```
+Sub-flow → "complete" → ReturnToDispatcherNode → "dispatch" → DispatcherNode
+```
+
+This creates a natural loop that processes the entire input character by character, with each token type handled by its specialized sub-flow.
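+
+The node that closes this loop does nothing but emit the `dispatch` action; from the `Node<S>` impl for `ReturnToDispatcherNode` in `src/nodes.rs`:
+
+```rust
+fn post(
+    &mut self,
+    _store: &mut S,
+    _prep_result: (),
+    _exec_result: (),
+    _context: &ExecutionContext,
+) -> Result<Action, Self::Error> {
+    // Unconditionally hand control back to the dispatcher.
+    Ok(Action::simple("dispatch"))
+}
+```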
diff --git a/cookbook/lexer/src/flows.rs b/cookbook/lexer/src/flows.rs
new file mode 100644
index 0000000..c1d8572
--- /dev/null
+++ b/cookbook/lexer/src/flows.rs
@@ -0,0 +1,78 @@
+use cosmoflow::prelude::*;
+
+use crate::nodes::{
+    CommentCollectorNode, DelimiterCollectorNode, IdentifierCollectorNode, IntegerCollectorNode,
+    OperatorCollectorNode, StringCollectorNode, UnknownCollectorNode, WhitespaceCollectorNode,
+};
+
+/// Creates a sub-flow for handling whitespace tokens
+pub fn create_whitespace_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", WhitespaceCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
+
+/// Creates a sub-flow for handling identifier tokens
+pub fn create_identifier_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", IdentifierCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
+
+/// Creates a sub-flow for handling integer tokens
+pub fn create_integer_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", IntegerCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
+
+/// Creates a sub-flow for handling string tokens
+pub fn create_string_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", StringCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
+
+/// Creates a sub-flow for handling operator tokens
+pub fn create_operator_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", OperatorCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
+
+/// Creates a sub-flow for handling delimiter tokens
+pub fn create_delimiter_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", DelimiterCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
+
+/// Creates a sub-flow for handling comment tokens
+pub fn create_comment_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", CommentCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
+
+/// Creates a sub-flow for handling unknown tokens
+pub fn create_unknown_flow() -> Flow<MemoryStorage> {
+    FlowBuilder::new()
+        .start_node("collect")
+        .node("collect", UnknownCollectorNode)
+        .terminal_route("collect", "complete")
+        .build()
+}
diff --git a/cookbook/lexer/src/main.rs b/cookbook/lexer/src/main.rs
new file mode 100644
index 0000000..7684594
--- /dev/null
+++ b/cookbook/lexer/src/main.rs
@@ -0,0 +1,222 @@
+mod flows;
+mod nodes;
+
+use cosmoflow::prelude::*;
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    flows::{
+        create_comment_flow, create_delimiter_flow, create_identifier_flow, create_integer_flow,
+        create_operator_flow, create_string_flow, create_unknown_flow, create_whitespace_flow,
+    },
+    nodes::{DispatcherNode, EndOfInputNode, ReturnToDispatcherNode},
+};
+
+/// Token types that our lexer can recognize
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum TokenType {
+    /// Identifiers (variable names, function names, etc.)
+    Identifier,
+    /// Integer literals
+    Integer,
+    /// String literals (enclosed in quotes)
+    String,
+    /// Keywords (if, else, while, etc.)
+    Keyword,
+    /// Operators (+, -, *, /, =, ==, etc.)
+    Operator,
+    /// Delimiters: ( ) { } [ ] ; , etc.
+    Delimiter,
+    /// Whitespace (spaces, tabs, newlines)
+    Whitespace,
+    /// Comments
+    Comment,
+    /// Unknown/invalid token
+    Unknown,
+    /// End of input
+    EndOfInput,
+}
+
+/// A token produced by the lexer
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Token {
+    pub token_type: TokenType,
+    pub lexeme: String,
+    pub line: u32,
+    pub column: u32,
+}
+
+/// Lexer context containing the input and current position
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LexerContext {
+    pub input: String,
+    pub position: usize,
+    pub line: u32,
+    pub column: u32,
+    pub tokens: Vec<Token>,
+}
+
+impl LexerContext {
+    pub fn new(input: String) -> Self {
+        Self {
+            input,
+            position: 0,
+            line: 1,
+            column: 1,
+            tokens: Vec::new(),
+        }
+    }
+
+    pub fn current_char(&self) -> Option<char> {
+        self.input.chars().nth(self.position)
+    }
+
+    pub fn peek_char(&self) -> Option<char> {
+        self.input.chars().nth(self.position + 1)
+    }
+
+    /// Consumes one character, keeping line/column bookkeeping in sync.
+    pub fn advance(&mut self) -> Option<char> {
+        if let Some(ch) = self.current_char() {
+            self.position += 1;
+            if ch == '\n' {
+                self.line += 1;
+                self.column = 1;
+            } else {
+                self.column += 1;
+            }
+            Some(ch)
+        } else {
+            None
+        }
+    }
+
+    /// Records a token at the current position (i.e., just past the lexeme).
+    pub fn add_token(&mut self, token_type: TokenType, lexeme: String) {
+        self.tokens.push(Token {
+            token_type,
+            lexeme,
+            line: self.line,
+            column: self.column,
+        });
+    }
+
+    pub fn is_at_end(&self) -> bool {
+        // `position` is a character index, so check for a remaining character
+        // instead of comparing against `input.len()`, which counts bytes and
+        // would be wrong for multi-byte UTF-8 input.
+        self.current_char().is_none()
+    }
+}
+
+/// CosmoFlow-based lexer that implements a DFA-based tokenizer
+pub struct CosmoFlowLexer {
+    flow: Flow<MemoryStorage>,
+}
+
+impl Default for CosmoFlowLexer {
+    fn default() -> Self {
+        // Create sub-flows for each token type
+        let whitespace_flow = create_whitespace_flow();
+        let identifier_flow = create_identifier_flow();
+        let integer_flow = create_integer_flow();
+        let string_flow = create_string_flow();
+        let operator_flow = create_operator_flow();
+        let delimiter_flow = create_delimiter_flow();
+        let comment_flow = create_comment_flow();
+        let unknown_flow = create_unknown_flow();
+
+        // Build the main flow with dispatcher and sub-flows
+        let flow = FlowBuilder::new()
+            .start_node("dispatch")
+            .node("dispatch", DispatcherNode)
+            .node("end_of_input", EndOfInputNode)
+            .node("return_to_dispatcher", ReturnToDispatcherNode)
+            // Add sub-flows as nodes
+            .node("whitespace_flow", whitespace_flow)
+            .node("identifier_flow", identifier_flow)
+            .node("integer_flow", integer_flow)
+            .node("string_flow", string_flow)
+            .node("operator_flow", operator_flow)
+            .node("delimiter_flow", delimiter_flow)
+            .node("comment_flow", comment_flow)
+            .node("unknown_flow", unknown_flow)
+            // Routes from dispatcher to sub-flows
+            .route("dispatch", "whitespace_flow", "whitespace_flow")
+            .route("dispatch", "identifier_flow", "identifier_flow")
+            .route("dispatch", "integer_flow", "integer_flow")
+            .route("dispatch", "string_flow", "string_flow")
+            .route("dispatch", "operator_flow", "operator_flow")
+            .route("dispatch", "delimiter_flow", "delimiter_flow")
+            .route("dispatch", "comment_flow", "comment_flow")
+            .route("dispatch", "unknown_flow", "unknown_flow")
+            .route("dispatch", "end_of_input", "end_of_input")
+            // Routes from sub-flows back to dispatcher
+            .route("whitespace_flow", "complete", "return_to_dispatcher")
+            .route("identifier_flow", "complete", "return_to_dispatcher")
+            .route("integer_flow", "complete", "return_to_dispatcher")
+            .route("string_flow", "complete", "return_to_dispatcher")
+            .route("operator_flow", "complete", "return_to_dispatcher")
+            .route("delimiter_flow", "complete", "return_to_dispatcher")
+            .route("comment_flow", "complete", "return_to_dispatcher")
+            .route("unknown_flow", "complete", "return_to_dispatcher")
+            // Route from return node back to dispatcher
+            .route("return_to_dispatcher", "dispatch", "dispatch")
+            // Terminal route for end of input
+            .terminal_route("end_of_input", "complete")
+            .build();
+
+        Self { flow }
+    }
+}
+
+impl CosmoFlowLexer {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn tokenize(&mut self, input: &str) -> Result<Vec<Token>, Box<dyn std::error::Error>> {
+        let mut store = MemoryStorage::new();
+        let lexer_ctx = LexerContext::new(input.to_string());
+        store.set("lexer_context".to_string(), &lexer_ctx)?;
+
+        let _result = self.flow.execute(&mut store)?;
+
+        let final_ctx: Option<LexerContext> = store.get("lexer_context")?;
+        let final_ctx = final_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        Ok(final_ctx.tokens)
+    }
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let mut lexer = CosmoFlowLexer::new();
+
+    // Test the lexer with a simple program
+    let input = r#"
+        fn main() {
+            let x = 42;
+            if x > 0 {
+                println!("Hello, world!");
+            }
+        }
+    "#;
+
+    println!("Tokenizing input:");
+    println!("{input}");
+    println!("\nTokens:");
+
+    let tokens = lexer.tokenize(input)?;
+    for (i, token) in tokens.iter().enumerate() {
+        println!("{i:3}: {token:?}");
+    }
+
+    // Test with a more complex example
+    let complex_input = "x = y + 42 * (z - 1) // comment";
+    println!("\n\nTokenizing complex input:");
+    println!("{complex_input}");
+    println!("\nTokens:");
+
+    let mut lexer2 = CosmoFlowLexer::new();
+    let tokens2 = lexer2.tokenize(complex_input)?;
+    for (i, token) in tokens2.iter().enumerate() {
+        println!("{i:3}: {token:?}");
+    }
+
+    Ok(())
+}
diff --git a/cookbook/lexer/src/nodes.rs b/cookbook/lexer/src/nodes.rs
new file mode 100644
index 0000000..174dc57
--- /dev/null
+++ b/cookbook/lexer/src/nodes.rs
@@ -0,0 +1,470 @@
+use crate::{LexerContext, TokenType};
+use cosmoflow::prelude::*;
+
+/// Dispatcher node - determines which sub-flow to route to
+pub struct DispatcherNode;
+
+impl<S: SharedStore> Node<S> for DispatcherNode {
+    type PrepResult = ();
+    type ExecResult = String;
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(
+        &mut self,
+        _prep_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<String, Self::Error> {
+        Ok("analyzed".to_string())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: String,
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        if lexer_ctx.is_at_end() {
+            return Ok(Action::simple("end_of_input"));
+        }
+
+        match lexer_ctx.current_char() {
+            Some(' ') | Some('\t') | Some('\n') | Some('\r') => {
+                Ok(Action::simple("whitespace_flow"))
+            }
+            Some('a'..='z') | Some('A'..='Z') | Some('_') => Ok(Action::simple("identifier_flow")),
+            Some('0'..='9') => Ok(Action::simple("integer_flow")),
+            Some('"') => Ok(Action::simple("string_flow")),
+            Some('/') if lexer_ctx.peek_char() == Some('/') => Ok(Action::simple("comment_flow")),
+            Some('+') | Some('-') | Some('*') | Some('/') | Some('=') | Some('<') | Some('>')
+            | Some('!') => Ok(Action::simple("operator_flow")),
+            Some('(') | Some(')') | Some('{') | Some('}') | Some('[') | Some(']') | Some(';')
+            | Some(',') | Some('.') => Ok(Action::simple("delimiter_flow")),
+            _ => Ok(Action::simple("unknown_flow")),
+        }
+    }
+}
+
+/// End of input node
+pub struct EndOfInputNode;
+
+impl<S: SharedStore> Node<S> for EndOfInputNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        lexer_ctx.add_token(TokenType::EndOfInput, String::new());
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}
+
+/// Node that returns control to the dispatcher after processing a token
+pub struct ReturnToDispatcherNode;
+
+impl<S: SharedStore> Node<S> for ReturnToDispatcherNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        _store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        Ok(Action::simple("dispatch"))
+    }
+}
+
+/// Optimized whitespace collector node
+pub struct WhitespaceCollectorNode;
+
+impl<S: SharedStore> Node<S> for WhitespaceCollectorNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        // Consume a maximal run of whitespace characters.
+        let mut lexeme = String::new();
+        while let Some(ch) = lexer_ctx.current_char() {
+            if ch.is_whitespace() {
+                lexeme.push(ch);
+                lexer_ctx.advance();
+            } else {
+                break;
+            }
+        }
+
+        lexer_ctx.add_token(TokenType::Whitespace, lexeme);
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}
+
+/// Optimized identifier collector node with keyword classification
+pub struct IdentifierCollectorNode;
+
+impl<S: SharedStore> Node<S> for IdentifierCollectorNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        let mut lexeme = String::new();
+        while let Some(ch) = lexer_ctx.current_char() {
+            if ch.is_alphanumeric() || ch == '_' {
+                lexeme.push(ch);
+                lexer_ctx.advance();
+            } else {
+                break;
+            }
+        }
+
+        // Check if it's a keyword
+        let token_type = match lexeme.as_str() {
+            "if" | "else" | "while" | "for" | "fn" | "let" | "const" | "var" | "return"
+            | "true" | "false" => TokenType::Keyword,
+            _ => TokenType::Identifier,
+        };
+
+        lexer_ctx.add_token(token_type, lexeme);
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}
+
+/// Optimized integer collector node
+pub struct IntegerCollectorNode;
+
+impl<S: SharedStore> Node<S> for IntegerCollectorNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        // Consume a maximal run of ASCII digits.
+        let mut lexeme = String::new();
+        while let Some(ch) = lexer_ctx.current_char() {
+            if ch.is_ascii_digit() {
+                lexeme.push(ch);
+                lexer_ctx.advance();
+            } else {
+                break;
+            }
+        }
+
+        lexer_ctx.add_token(TokenType::Integer, lexeme);
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}
+
+/// Optimized string collector node
+pub struct StringCollectorNode;
+
+impl<S: SharedStore> Node<S> for StringCollectorNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        let mut lexeme = String::new();
+
+        // Consume opening quote
+        if let Some('"') = lexer_ctx.current_char() {
+            lexeme.push(lexer_ctx.advance().unwrap());
+        }
+
+        // Collect until the (unescaped) closing quote.
+        let mut escaped = false;
+        while let Some(ch) = lexer_ctx.current_char() {
+            lexeme.push(ch);
+            lexer_ctx.advance();
+
+            if escaped {
+                escaped = false;
+            } else if ch == '\\' {
+                escaped = true;
+            } else if ch == '"' {
+                break;
+            }
+        }
+
+        lexer_ctx.add_token(TokenType::String, lexeme);
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}
+
+/// Optimized operator collector node
+pub struct OperatorCollectorNode;
+
+impl<S: SharedStore> Node<S> for OperatorCollectorNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        let mut lexeme = String::new();
+        if let Some(ch) = lexer_ctx.current_char() {
+            lexeme.push(ch);
+            lexer_ctx.advance();
+
+            // Handle two-character operators
+            if let Some(next_ch) = lexer_ctx.current_char() {
+                let two_char = format!("{ch}{next_ch}");
+                match two_char.as_str() {
+                    "==" | "!=" | "<=" | ">=" | "++" | "--" | "&&" | "||" => {
+                        lexeme.push(next_ch);
+                        lexer_ctx.advance();
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        lexer_ctx.add_token(TokenType::Operator, lexeme);
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}
+
+/// Optimized delimiter collector node
+pub struct DelimiterCollectorNode;
+
+impl<S: SharedStore> Node<S> for DelimiterCollectorNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        // Delimiters are always a single character.
+        let mut lexeme = String::new();
+        if let Some(ch) = lexer_ctx.advance() {
+            lexeme.push(ch);
+        }
+
+        lexer_ctx.add_token(TokenType::Delimiter, lexeme);
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}
+
+/// Optimized comment collector node
+pub struct CommentCollectorNode;
+
+impl<S: SharedStore> Node<S> for CommentCollectorNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        // The dispatcher only routes here on "//", so collect to end of line.
+        let mut lexeme = String::new();
+        while let Some(ch) = lexer_ctx.current_char() {
+            if ch == '\n' {
+                break;
+            }
+            lexeme.push(ch);
+            lexer_ctx.advance();
+        }
+
+        lexer_ctx.add_token(TokenType::Comment, lexeme);
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}
+
+/// Optimized unknown token collector node
+pub struct UnknownCollectorNode;
+
+impl<S: SharedStore> Node<S> for UnknownCollectorNode {
+    type PrepResult = ();
+    type ExecResult = ();
+    type Error = NodeError;
+
+    fn prep(&mut self, _store: &S, _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn exec(&mut self, _prep_result: (), _context: &ExecutionContext) -> Result<(), Self::Error> {
+        Ok(())
+    }
+
+    fn post(
+        &mut self,
+        store: &mut S,
+        _prep_result: (),
+        _exec_result: (),
+        _context: &ExecutionContext,
+    ) -> Result<Action, Self::Error> {
+        let lexer_ctx: Option<LexerContext> = store.get("lexer_context").unwrap_or_default();
+        let mut lexer_ctx = lexer_ctx.unwrap_or_else(|| LexerContext::new(String::new()));
+
+        // Consume exactly one unrecognized character as an Unknown token.
+        if let Some(ch) = lexer_ctx.advance() {
+            lexer_ctx.add_token(TokenType::Unknown, ch.to_string());
+        }
+
+        store.set("lexer_context".to_string(), &lexer_ctx).unwrap();
+
+        Ok(Action::simple("complete"))
+    }
+}