Skip to content

Commit d8782b3

Browse files
chaliy and claude authored
fix(parser): handle heredoc pipe ordering and edge cases (#379)
## Summary

- Fix heredoc pipe ordering: `cat <<EOF | sort` now correctly pipes heredoc content to the next command in the pipeline
- Fix pipe continuation: `cat <<EOF |\nsort` works via rest-of-line re-injection
- Handle partial quote delimiters: `<<'EOF'"2"` correctly combines to delimiter `EOF2`, with the quoting preventing expansion
- Fix multiple heredocs on one line: `while cat <<E1 && cat <<E2; do ... done` parses and executes correctly
- Fix heredoc followed by a multiline double-quoted string: `cat <<EOF; echo "two\nthree"` handles quoted strings spanning lines
- Capture while/until condition stdout and stderr so heredocs in loop conditions produce visible output

All 6 previously-skipped heredoc-edge spec tests are now enabled and passing.

## Approach

**Lexer**: Added a `VecDeque<char>` re-injection buffer. `read_heredoc` saves the rest of the command line (after the heredoc delimiter token) instead of discarding it, then re-injects it after reading the heredoc body. `peek_char`/`advance` check the buffer first.

**Lexer**: `read_continuation_into` concatenates adjacent quoted/unquoted segments after a single-quoted string, enabling partial-quote heredoc delimiters.

**Interpreter**: `execute_while` and `execute_until` now capture and emit condition command stdout/stderr.

## Test plan

- [x] All 6 heredoc-edge skipped tests enabled and passing
- [x] 2 new parser unit tests (heredoc pipe, multiple heredocs)
- [x] Full spec test suite passes (1305 tests, 1192 passed, 0 failed, 113 skipped)
- [x] bash_comparison_tests pass (identical output to real bash)
- [x] All 1017 lib unit tests pass
- [x] `cargo fmt --check` clean
- [x] No new clippy warnings (pre-existing `resolve_redirect_url` dead_code only)

Closes #359

Co-authored-by: Claude <noreply@anthropic.com>
1 parent ef0c02d commit d8782b3

File tree

4 files changed

+204
-70
lines changed

4 files changed

+204
-70
lines changed

crates/bashkit/src/interpreter/mod.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1462,9 +1462,18 @@ impl Interpreter {
14621462
self.counters.tick_loop(&self.limits)?;
14631463

14641464
// Check condition (no errexit - conditions are expected to fail)
1465+
let emit_before_cond = self.output_emit_count;
14651466
let condition_result = self
14661467
.execute_condition_sequence(&while_cmd.condition)
14671468
.await?;
1469+
// Condition commands produce visible output (e.g., `while cat <<EOF; do ... done`)
1470+
self.maybe_emit_output(
1471+
&condition_result.stdout,
1472+
&condition_result.stderr,
1473+
emit_before_cond,
1474+
);
1475+
stdout.push_str(&condition_result.stdout);
1476+
stderr.push_str(&condition_result.stderr);
14681477
if condition_result.exit_code != 0 {
14691478
break;
14701479
}
@@ -1547,9 +1556,18 @@ impl Interpreter {
15471556
self.counters.tick_loop(&self.limits)?;
15481557

15491558
// Check condition (no errexit - conditions are expected to fail)
1559+
let emit_before_cond = self.output_emit_count;
15501560
let condition_result = self
15511561
.execute_condition_sequence(&until_cmd.condition)
15521562
.await?;
1563+
// Condition commands produce visible output
1564+
self.maybe_emit_output(
1565+
&condition_result.stdout,
1566+
&condition_result.stderr,
1567+
emit_before_cond,
1568+
);
1569+
stdout.push_str(&condition_result.stdout);
1570+
stderr.push_str(&condition_result.stderr);
15531571
if condition_result.exit_code == 0 {
15541572
break;
15551573
}

crates/bashkit/src/parser/lexer.rs

Lines changed: 113 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
//!
33
//! Tokenizes input into a stream of tokens with source position tracking.
44
5+
use std::collections::VecDeque;
6+
57
use super::span::{Position, Span};
68
use super::tokens::Token;
79

@@ -19,9 +21,9 @@ pub struct Lexer<'a> {
1921
/// Current position in the input
2022
position: Position,
2123
chars: std::iter::Peekable<std::str::Chars<'a>>,
22-
/// Rest-of-line text captured during heredoc parsing.
23-
/// In `cat <<EOF > file`, this holds ` > file`.
24-
pub heredoc_rest_of_line: String,
24+
/// Buffer for re-injected characters (e.g., rest-of-line after heredoc delimiter).
25+
/// Consumed before `chars`.
26+
reinject_buf: VecDeque<char>,
2527
}
2628

2729
impl<'a> Lexer<'a> {
@@ -31,7 +33,7 @@ impl<'a> Lexer<'a> {
3133
input,
3234
position: Position::new(),
3335
chars: input.chars().peekable(),
34-
heredoc_rest_of_line: String::new(),
36+
reinject_buf: VecDeque::new(),
3537
}
3638
}
3739

@@ -47,11 +49,19 @@ impl<'a> Lexer<'a> {
4749
}
4850

4951
fn peek_char(&mut self) -> Option<char> {
50-
self.chars.peek().copied()
52+
if let Some(&ch) = self.reinject_buf.front() {
53+
Some(ch)
54+
} else {
55+
self.chars.peek().copied()
56+
}
5157
}
5258

5359
fn advance(&mut self) -> Option<char> {
54-
let ch = self.chars.next();
60+
let ch = if !self.reinject_buf.is_empty() {
61+
self.reinject_buf.pop_front()
62+
} else {
63+
self.chars.next()
64+
};
5565
if let Some(c) = ch {
5666
self.position.advance(c);
5767
}
@@ -708,10 +718,67 @@ impl<'a> Lexer<'a> {
708718
self.advance();
709719
}
710720

721+
// If next char is another quote or word char, concatenate (e.g., 'EOF'"2" -> EOF2).
722+
// Any quoting makes the whole token literal.
723+
self.read_continuation_into(&mut content);
724+
711725
// Single-quoted strings are literal - no variable expansion
712726
Some(Token::LiteralWord(content))
713727
}
714728

729+
/// After a closing quote, read any adjacent quoted or unquoted word chars
730+
/// into `content`. Handles concatenation like `'foo'"bar"baz` -> `foobarbaz`.
731+
fn read_continuation_into(&mut self, content: &mut String) {
732+
loop {
733+
match self.peek_char() {
734+
Some('\'') => {
735+
self.advance(); // opening '
736+
while let Some(ch) = self.peek_char() {
737+
if ch == '\'' {
738+
self.advance(); // closing '
739+
break;
740+
}
741+
content.push(ch);
742+
self.advance();
743+
}
744+
}
745+
Some('"') => {
746+
self.advance(); // opening "
747+
while let Some(ch) = self.peek_char() {
748+
if ch == '"' {
749+
self.advance(); // closing "
750+
break;
751+
}
752+
if ch == '\\' {
753+
self.advance();
754+
if let Some(next) = self.peek_char() {
755+
match next {
756+
'"' | '\\' | '$' | '`' => {
757+
content.push(next);
758+
self.advance();
759+
}
760+
_ => {
761+
content.push('\\');
762+
content.push(next);
763+
self.advance();
764+
}
765+
}
766+
continue;
767+
}
768+
}
769+
content.push(ch);
770+
self.advance();
771+
}
772+
}
773+
Some(ch) if self.is_word_char(ch) => {
774+
content.push(ch);
775+
self.advance();
776+
}
777+
_ => break,
778+
}
779+
}
780+
}
781+
715782
/// Read ANSI-C quoted content ($'...').
716783
/// Opening $' already consumed. Returns the resolved string.
717784
fn read_dollar_single_quoted_content(&mut self) -> String {
@@ -1212,16 +1279,34 @@ impl<'a> Lexer<'a> {
12121279
let mut content = String::new();
12131280
let mut current_line = String::new();
12141281

1215-
// Collect the rest of the command line after the heredoc delimiter.
1216-
// In bash, `cat <<EOF > file` means `> file` is still part of
1217-
// the command and should be parsed for redirections.
1218-
self.heredoc_rest_of_line.clear();
1282+
// Save rest of current line (after the delimiter token on the command line).
1283+
// For `cat <<EOF | sort`, this captures ` | sort` so the parser can
1284+
// tokenize the pipe and subsequent command after the heredoc body.
1285+
//
1286+
// Quoted strings may span multiple lines (e.g., `cat <<EOF; echo "two\nthree"`),
1287+
// so we track quoting state and continue across newlines until quotes close.
1288+
let mut rest_of_line = String::new();
1289+
let mut in_double_quote = false;
1290+
let mut in_single_quote = false;
12191291
while let Some(ch) = self.peek_char() {
12201292
self.advance();
1221-
if ch == '\n' {
1293+
if ch == '\n' && !in_double_quote && !in_single_quote {
12221294
break;
12231295
}
1224-
self.heredoc_rest_of_line.push(ch);
1296+
if ch == '"' && !in_single_quote {
1297+
in_double_quote = !in_double_quote;
1298+
} else if ch == '\'' && !in_double_quote {
1299+
in_single_quote = !in_single_quote;
1300+
} else if ch == '\\' && in_double_quote {
1301+
// Escaped char inside double quotes — skip the next char too
1302+
rest_of_line.push(ch);
1303+
if let Some(next) = self.peek_char() {
1304+
rest_of_line.push(next);
1305+
self.advance();
1306+
}
1307+
continue;
1308+
}
1309+
rest_of_line.push(ch);
12251310
}
12261311

12271312
// Read lines until we find the delimiter
@@ -1254,6 +1339,15 @@ impl<'a> Lexer<'a> {
12541339
}
12551340
}
12561341

1342+
// Re-inject saved rest-of-line so subsequent tokens (pipes, commands, etc.)
1343+
// are visible to the parser. Add a newline so the tokenizer sees the line break.
1344+
if !rest_of_line.is_empty() {
1345+
for ch in rest_of_line.chars() {
1346+
self.reinject_buf.push_back(ch);
1347+
}
1348+
self.reinject_buf.push_back('\n');
1349+
}
1350+
12571351
content
12581352
}
12591353
}
@@ -1370,15 +1464,13 @@ mod tests {
13701464
let mut lexer = Lexer::new("\nhello\nworld\nEOF");
13711465
let content = lexer.read_heredoc("EOF");
13721466
assert_eq!(content, "hello\nworld\n");
1373-
assert_eq!(lexer.heredoc_rest_of_line.trim(), "");
13741467
}
13751468

13761469
#[test]
13771470
fn test_read_heredoc_single_line() {
13781471
let mut lexer = Lexer::new("\ntest\nEOF");
13791472
let content = lexer.read_heredoc("EOF");
13801473
assert_eq!(content, "test\n");
1381-
assert_eq!(lexer.heredoc_rest_of_line.trim(), "");
13821474
}
13831475

13841476
#[test]
@@ -1394,18 +1486,23 @@ mod tests {
13941486
// Now read heredoc content
13951487
let content = lexer.read_heredoc("EOF");
13961488
assert_eq!(content, "hello\nworld\n");
1397-
assert_eq!(lexer.heredoc_rest_of_line.trim(), "");
13981489
}
13991490

14001491
#[test]
14011492
fn test_read_heredoc_with_redirect() {
1493+
// Rest-of-line (> file.txt) is re-injected into the lexer buffer
14021494
let mut lexer = Lexer::new("cat <<EOF > file.txt\nhello\nEOF");
14031495
assert_eq!(lexer.next_token(), Some(Token::Word("cat".to_string())));
14041496
assert_eq!(lexer.next_token(), Some(Token::HereDoc));
14051497
assert_eq!(lexer.next_token(), Some(Token::Word("EOF".to_string())));
14061498
let content = lexer.read_heredoc("EOF");
14071499
assert_eq!(content, "hello\n");
1408-
assert_eq!(lexer.heredoc_rest_of_line.trim(), "> file.txt");
1500+
// The redirect tokens are now available from the lexer
1501+
assert_eq!(lexer.next_token(), Some(Token::RedirectOut));
1502+
assert_eq!(
1503+
lexer.next_token(),
1504+
Some(Token::Word("file.txt".to_string()))
1505+
);
14091506
}
14101507

14111508
#[test]

0 commit comments

Comments
 (0)