From 7acf9737542b4fad9ec7b64f81f1a3966b499601 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 27 Feb 2026 17:07:32 +0000 Subject: [PATCH] fix(parser): handle heredoc pipe ordering and edge cases - Lexer saves rest-of-line after heredoc delimiter and re-injects it via a VecDeque buffer, so `cat < delimiter `EOF2` - Multiple heredocs on one line work naturally via repeated rest-of-line save/reinject (e.g., `while cat < { /// Current position in the input position: Position, chars: std::iter::Peekable>, - /// Rest-of-line text captured during heredoc parsing. - /// In `cat < file`, this holds ` > file`. - pub heredoc_rest_of_line: String, + /// Buffer for re-injected characters (e.g., rest-of-line after heredoc delimiter). + /// Consumed before `chars`. + reinject_buf: VecDeque, } impl<'a> Lexer<'a> { @@ -31,7 +33,7 @@ impl<'a> Lexer<'a> { input, position: Position::new(), chars: input.chars().peekable(), - heredoc_rest_of_line: String::new(), + reinject_buf: VecDeque::new(), } } @@ -47,11 +49,19 @@ impl<'a> Lexer<'a> { } fn peek_char(&mut self) -> Option { - self.chars.peek().copied() + if let Some(&ch) = self.reinject_buf.front() { + Some(ch) + } else { + self.chars.peek().copied() + } } fn advance(&mut self) -> Option { - let ch = self.chars.next(); + let ch = if !self.reinject_buf.is_empty() { + self.reinject_buf.pop_front() + } else { + self.chars.next() + }; if let Some(c) = ch { self.position.advance(c); } @@ -708,10 +718,67 @@ impl<'a> Lexer<'a> { self.advance(); } + // If next char is another quote or word char, concatenate (e.g., 'EOF'"2" -> EOF2). + // Any quoting makes the whole token literal. + self.read_continuation_into(&mut content); + // Single-quoted strings are literal - no variable expansion Some(Token::LiteralWord(content)) } + /// After a closing quote, read any adjacent quoted or unquoted word chars + /// into `content`. Handles concatenation like `'foo'"bar"baz` -> `foobarbaz`. 
+ fn read_continuation_into(&mut self, content: &mut String) { + loop { + match self.peek_char() { + Some('\'') => { + self.advance(); // opening ' + while let Some(ch) = self.peek_char() { + if ch == '\'' { + self.advance(); // closing ' + break; + } + content.push(ch); + self.advance(); + } + } + Some('"') => { + self.advance(); // opening " + while let Some(ch) = self.peek_char() { + if ch == '"' { + self.advance(); // closing " + break; + } + if ch == '\\' { + self.advance(); + if let Some(next) = self.peek_char() { + match next { + '"' | '\\' | '$' | '`' => { + content.push(next); + self.advance(); + } + _ => { + content.push('\\'); + content.push(next); + self.advance(); + } + } + continue; + } + } + content.push(ch); + self.advance(); + } + } + Some(ch) if self.is_word_char(ch) => { + content.push(ch); + self.advance(); + } + _ => break, + } + } + } + /// Read ANSI-C quoted content ($'...'). /// Opening $' already consumed. Returns the resolved string. fn read_dollar_single_quoted_content(&mut self) -> String { @@ -1212,16 +1279,34 @@ impl<'a> Lexer<'a> { let mut content = String::new(); let mut current_line = String::new(); - // Collect the rest of the command line after the heredoc delimiter. - // In bash, `cat < file` means `> file` is still part of - // the command and should be parsed for redirections. - self.heredoc_rest_of_line.clear(); + // Save rest of current line (after the delimiter token on the command line). + // For `cat < Lexer<'a> { } } + // Re-inject saved rest-of-line so subsequent tokens (pipes, commands, etc.) + // are visible to the parser. Add a newline so the tokenizer sees the line break. 
+ if !rest_of_line.is_empty() { + for ch in rest_of_line.chars() { + self.reinject_buf.push_back(ch); + } + self.reinject_buf.push_back('\n'); + } + content } } @@ -1370,7 +1464,6 @@ mod tests { let mut lexer = Lexer::new("\nhello\nworld\nEOF"); let content = lexer.read_heredoc("EOF"); assert_eq!(content, "hello\nworld\n"); - assert_eq!(lexer.heredoc_rest_of_line.trim(), ""); } #[test] @@ -1378,7 +1471,6 @@ mod tests { let mut lexer = Lexer::new("\ntest\nEOF"); let content = lexer.read_heredoc("EOF"); assert_eq!(content, "test\n"); - assert_eq!(lexer.heredoc_rest_of_line.trim(), ""); } #[test] @@ -1394,18 +1486,23 @@ mod tests { // Now read heredoc content let content = lexer.read_heredoc("EOF"); assert_eq!(content, "hello\nworld\n"); - assert_eq!(lexer.heredoc_rest_of_line.trim(), ""); } #[test] fn test_read_heredoc_with_redirect() { + // Rest-of-line (> file.txt) is re-injected into the lexer buffer let mut lexer = Lexer::new("cat <<EOF > file.txt\nhello\nEOF"); assert_eq!(lexer.next_token(), Some(Token::Word("cat".to_string()))); assert_eq!(lexer.next_token(), Some(Token::HereDoc)); assert_eq!(lexer.next_token(), Some(Token::Word("EOF".to_string()))); let content = lexer.read_heredoc("EOF"); assert_eq!(content, "hello\n"); - assert_eq!(lexer.heredoc_rest_of_line.trim(), "> file.txt"); + // The redirect tokens are now available from the lexer + assert_eq!(lexer.next_token(), Some(Token::RedirectOut)); + assert_eq!( + lexer.next_token(), + Some(Token::Word("file.txt".to_string())) + ); } #[test] diff --git a/crates/bashkit/src/parser/mod.rs b/crates/bashkit/src/parser/mod.rs index ad87adfa..c7673b6f 100644 --- a/crates/bashkit/src/parser/mod.rs +++ b/crates/bashkit/src/parser/mod.rs @@ -466,7 +466,9 @@ impl<'a> Parser<'a> { kind, target, }); - break; // heredoc consumes rest of input line + // Rest-of-line tokens re-injected by lexer; break so callers + // can see pipes/semicolons. 
+ break; } _ => break, } @@ -1769,10 +1771,9 @@ impl<'a> Parser<'a> { // Don't advance - let read_heredoc consume directly from lexer position // Read the here document content (reads until delimiter line). - // Also captures rest-of-line text (e.g. `> file` in - // `cat <<EOF > file`) into lexer.heredoc_rest_of_line. + // Rest-of-line tokens (redirects, pipes, etc.) are re-injected + // into the lexer buffer by read_heredoc. let content = self.lexer.read_heredoc(&delimiter); - let rest_of_line = std::mem::take(&mut self.lexer.heredoc_rest_of_line); // Strip leading tabs for <<- let content = if strip_tabs { @@ -1810,56 +1811,49 @@ impl<'a> Parser<'a> { target, }); - // Parse rest-of-line for additional redirects - // (e.g. `> file` in `cat <<EOF > file`). - // We parse tokens directly instead of using parse_simple_command - // because that method returns None for redirect-only input - // (no command word), dropping the redirects we need. - if !rest_of_line.trim().is_empty() { - let mut sub = Parser::new(&rest_of_line); - loop { - match &sub.current_token { - Some(tokens::Token::RedirectOut) => { - sub.advance(); - if let Ok(target) = sub.expect_word() { - redirects.push(Redirect { - fd: None, - kind: RedirectKind::Output, - target, - }); - } + // Advance the parser token so it picks up re-injected + // rest-of-line tokens (|, ;, &&, > file) from the lexer. + self.advance(); + + // If re-injected tokens include redirects (e.g. `> file` + // in `cat <<EOF > file`), consume them here before breaking + // out to pipeline/list parsers. 
+ while let Some(tok) = &self.current_token { + match tok { + tokens::Token::RedirectOut => { + self.advance(); + if let Ok(target) = self.expect_word() { + redirects.push(Redirect { + fd: None, + kind: RedirectKind::Output, + target, + }); } - Some(tokens::Token::RedirectAppend) => { - sub.advance(); - if let Ok(target) = sub.expect_word() { - redirects.push(Redirect { - fd: None, - kind: RedirectKind::Append, - target, - }); - } + } + tokens::Token::RedirectAppend => { + self.advance(); + if let Ok(target) = self.expect_word() { + redirects.push(Redirect { + fd: None, + kind: RedirectKind::Append, + target, + }); } - Some(tokens::Token::RedirectFd(fd)) => { - let fd = *fd; - sub.advance(); - if let Ok(target) = sub.expect_word() { - redirects.push(Redirect { - fd: Some(fd), - kind: RedirectKind::Output, - target, - }); - } + } + tokens::Token::RedirectFd(fd) => { + let fd = *fd; + self.advance(); + if let Ok(target) = self.expect_word() { + redirects.push(Redirect { + fd: Some(fd), + kind: RedirectKind::Output, + target, + }); } - _ => break, } + _ => break, } } - - // Now advance past the heredoc body - self.advance(); - - // Heredoc body consumed subsequent lines from input. - // Stop parsing this command - next tokens belong to new commands. break; } Some(tokens::Token::ProcessSubIn) | Some(tokens::Token::ProcessSubOut) => { @@ -2788,4 +2782,35 @@ mod tests { assert!(matches!(&script.commands[0], Command::List(_))); } + + #[test] + fn test_heredoc_pipe() { + let parser = Parser::new("cat <