Skip to content

Commit d50c0cf

Browse files
committed
fix(parser): handle heredoc pipe ordering and edge cases
- Lexer saves rest-of-line after heredoc delimiter and re-injects it via a VecDeque buffer, so `cat <<EOF | sort` correctly pipes heredoc content
- Pipe continuation (`cat <<EOF |\nsort`) also works via the same mechanism
- Quote-aware rest-of-line scanning handles `cat <<EOF; echo "two\nthree"` where double-quoted strings span physical lines
- read_continuation_into concatenates adjacent quoted/unquoted segments after single-quoted strings, enabling partial-quote delimiters like `<<'EOF'"2"` -> delimiter `EOF2`
- Multiple heredocs on one line work naturally via repeated rest-of-line save/reinject (e.g., `while cat <<E1 && cat <<E2; do ... done`)
- While/until condition stdout is now captured and emitted, fixing `while cat <<EOF; do ... done` producing no condition output

Enables all 6 previously-skipped heredoc-edge spec tests.

Closes #359
1 parent 6994214 commit d50c0cf

File tree

4 files changed

+162
-13
lines changed

4 files changed

+162
-13
lines changed

crates/bashkit/src/interpreter/mod.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,9 +1420,18 @@ impl Interpreter {
14201420
self.counters.tick_loop(&self.limits)?;
14211421

14221422
// Check condition (no errexit - conditions are expected to fail)
1423+
let emit_before_cond = self.output_emit_count;
14231424
let condition_result = self
14241425
.execute_condition_sequence(&while_cmd.condition)
14251426
.await?;
1427+
// Condition commands produce visible output (e.g., `while cat <<EOF; do ... done`)
1428+
self.maybe_emit_output(
1429+
&condition_result.stdout,
1430+
&condition_result.stderr,
1431+
emit_before_cond,
1432+
);
1433+
stdout.push_str(&condition_result.stdout);
1434+
stderr.push_str(&condition_result.stderr);
14261435
if condition_result.exit_code != 0 {
14271436
break;
14281437
}
@@ -1505,9 +1514,18 @@ impl Interpreter {
15051514
self.counters.tick_loop(&self.limits)?;
15061515

15071516
// Check condition (no errexit - conditions are expected to fail)
1517+
let emit_before_cond = self.output_emit_count;
15081518
let condition_result = self
15091519
.execute_condition_sequence(&until_cmd.condition)
15101520
.await?;
1521+
// Condition commands produce visible output
1522+
self.maybe_emit_output(
1523+
&condition_result.stdout,
1524+
&condition_result.stderr,
1525+
emit_before_cond,
1526+
);
1527+
stdout.push_str(&condition_result.stdout);
1528+
stderr.push_str(&condition_result.stderr);
15111529
if condition_result.exit_code == 0 {
15121530
break;
15131531
}

crates/bashkit/src/parser/lexer.rs

Lines changed: 106 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
//!
33
//! Tokenizes input into a stream of tokens with source position tracking.
44
5+
use std::collections::VecDeque;
6+
57
use super::span::{Position, Span};
68
use super::tokens::Token;
79

@@ -19,6 +21,9 @@ pub struct Lexer<'a> {
1921
/// Current position in the input
2022
position: Position,
2123
chars: std::iter::Peekable<std::str::Chars<'a>>,
24+
/// Buffer for re-injected characters (e.g., rest-of-line after heredoc delimiter).
25+
/// Consumed before `chars`.
26+
reinject_buf: VecDeque<char>,
2227
}
2328

2429
impl<'a> Lexer<'a> {
@@ -28,6 +33,7 @@ impl<'a> Lexer<'a> {
2833
input,
2934
position: Position::new(),
3035
chars: input.chars().peekable(),
36+
reinject_buf: VecDeque::new(),
3137
}
3238
}
3339

@@ -43,11 +49,19 @@ impl<'a> Lexer<'a> {
4349
}
4450

4551
fn peek_char(&mut self) -> Option<char> {
46-
self.chars.peek().copied()
52+
if let Some(&ch) = self.reinject_buf.front() {
53+
Some(ch)
54+
} else {
55+
self.chars.peek().copied()
56+
}
4757
}
4858

4959
fn advance(&mut self) -> Option<char> {
50-
let ch = self.chars.next();
60+
let ch = if !self.reinject_buf.is_empty() {
61+
self.reinject_buf.pop_front()
62+
} else {
63+
self.chars.next()
64+
};
5165
if let Some(c) = ch {
5266
self.position.advance(c);
5367
}
@@ -704,10 +718,67 @@ impl<'a> Lexer<'a> {
704718
self.advance();
705719
}
706720

721+
// If next char is another quote or word char, concatenate (e.g., 'EOF'"2" -> EOF2).
722+
// Any quoting makes the whole token literal.
723+
self.read_continuation_into(&mut content);
724+
707725
// Single-quoted strings are literal - no variable expansion
708726
Some(Token::LiteralWord(content))
709727
}
710728

729+
/// After a closing quote, read any adjacent quoted or unquoted word chars
730+
/// into `content`. Handles concatenation like `'foo'"bar"baz` -> `foobarbaz`.
731+
fn read_continuation_into(&mut self, content: &mut String) {
732+
loop {
733+
match self.peek_char() {
734+
Some('\'') => {
735+
self.advance(); // opening '
736+
while let Some(ch) = self.peek_char() {
737+
if ch == '\'' {
738+
self.advance(); // closing '
739+
break;
740+
}
741+
content.push(ch);
742+
self.advance();
743+
}
744+
}
745+
Some('"') => {
746+
self.advance(); // opening "
747+
while let Some(ch) = self.peek_char() {
748+
if ch == '"' {
749+
self.advance(); // closing "
750+
break;
751+
}
752+
if ch == '\\' {
753+
self.advance();
754+
if let Some(next) = self.peek_char() {
755+
match next {
756+
'"' | '\\' | '$' | '`' => {
757+
content.push(next);
758+
self.advance();
759+
}
760+
_ => {
761+
content.push('\\');
762+
content.push(next);
763+
self.advance();
764+
}
765+
}
766+
continue;
767+
}
768+
}
769+
content.push(ch);
770+
self.advance();
771+
}
772+
}
773+
Some(ch) if self.is_word_char(ch) => {
774+
content.push(ch);
775+
self.advance();
776+
}
777+
_ => break,
778+
}
779+
}
780+
}
781+
711782
/// Read ANSI-C quoted content ($'...').
712783
/// Opening $' already consumed. Returns the resolved string.
713784
fn read_dollar_single_quoted_content(&mut self) -> String {
@@ -1208,12 +1279,34 @@ impl<'a> Lexer<'a> {
12081279
let mut content = String::new();
12091280
let mut current_line = String::new();
12101281

1211-
// Skip to end of current line first (after the delimiter on command line)
1282+
// Save rest of current line (after the delimiter token on the command line).
1283+
// For `cat <<EOF | sort`, this captures ` | sort` so the parser can
1284+
// tokenize the pipe and subsequent command after the heredoc body.
1285+
//
1286+
// Quoted strings may span multiple lines (e.g., `cat <<EOF; echo "two\nthree"`),
1287+
// so we track quoting state and continue across newlines until quotes close.
1288+
let mut rest_of_line = String::new();
1289+
let mut in_double_quote = false;
1290+
let mut in_single_quote = false;
12121291
while let Some(ch) = self.peek_char() {
12131292
self.advance();
1214-
if ch == '\n' {
1293+
if ch == '\n' && !in_double_quote && !in_single_quote {
12151294
break;
12161295
}
1296+
if ch == '"' && !in_single_quote {
1297+
in_double_quote = !in_double_quote;
1298+
} else if ch == '\'' && !in_double_quote {
1299+
in_single_quote = !in_single_quote;
1300+
} else if ch == '\\' && in_double_quote {
1301+
// Escaped char inside double quotes — skip the next char too
1302+
rest_of_line.push(ch);
1303+
if let Some(next) = self.peek_char() {
1304+
rest_of_line.push(next);
1305+
self.advance();
1306+
}
1307+
continue;
1308+
}
1309+
rest_of_line.push(ch);
12171310
}
12181311

12191312
// Read lines until we find the delimiter
@@ -1246,6 +1339,15 @@ impl<'a> Lexer<'a> {
12461339
}
12471340
}
12481341

1342+
// Re-inject saved rest-of-line so subsequent tokens (pipes, commands, etc.)
1343+
// are visible to the parser. Add a newline so the tokenizer sees the line break.
1344+
if !rest_of_line.is_empty() {
1345+
for ch in rest_of_line.chars() {
1346+
self.reinject_buf.push_back(ch);
1347+
}
1348+
self.reinject_buf.push_back('\n');
1349+
}
1350+
12491351
content
12501352
}
12511353
}

crates/bashkit/src/parser/mod.rs

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,9 @@ impl<'a> Parser<'a> {
466466
kind,
467467
target,
468468
});
469-
break; // heredoc consumes rest of input line
469+
// Rest-of-line tokens re-injected by lexer; break so callers
470+
// can see pipes/semicolons.
471+
break;
470472
}
471473
_ => break,
472474
}
@@ -1810,8 +1812,10 @@ impl<'a> Parser<'a> {
18101812
target,
18111813
});
18121814

1813-
// Heredoc body consumed subsequent lines from input.
1814-
// Stop parsing this command - next tokens belong to new commands.
1815+
// Heredoc body consumed subsequent lines. Break out of
1816+
// parse_simple_command so pipeline/list parsers see any
1817+
// rest-of-line tokens (|, ;, &&) that were re-injected
1818+
// by the lexer.
18151819
break;
18161820
}
18171821
Some(tokens::Token::ProcessSubIn) | Some(tokens::Token::ProcessSubOut) => {
@@ -2740,4 +2744,35 @@ mod tests {
27402744

27412745
assert!(matches!(&script.commands[0], Command::List(_)));
27422746
}
2747+
2748+
#[test]
2749+
fn test_heredoc_pipe() {
2750+
let parser = Parser::new("cat <<EOF | sort\nc\na\nb\nEOF\n");
2751+
let script = parser.parse().unwrap();
2752+
assert!(
2753+
matches!(&script.commands[0], Command::Pipeline(_)),
2754+
"heredoc with pipe should parse as Pipeline"
2755+
);
2756+
}
2757+
2758+
#[test]
2759+
fn test_heredoc_multiple_on_line() {
2760+
let input = "while cat <<E1 && cat <<E2; do cat <<E3; break; done\n1\nE1\n2\nE2\n3\nE3\n";
2761+
let parser = Parser::new(input);
2762+
let script = parser.parse().unwrap();
2763+
assert_eq!(script.commands.len(), 1);
2764+
if let Command::Compound(comp, _) = &script.commands[0] {
2765+
if let CompoundCommand::While(w) = comp {
2766+
assert!(
2767+
!w.condition.is_empty(),
2768+
"while condition should be non-empty"
2769+
);
2770+
assert!(!w.body.is_empty(), "while body should be non-empty");
2771+
} else {
2772+
panic!("expected While compound command");
2773+
}
2774+
} else {
2775+
panic!("expected Compound command");
2776+
}
2777+
}
27432778
}

crates/bashkit/tests/spec_cases/bash/heredoc-edge.test.sh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ $var $(echo nope) $((1+2))
2828

2929
### heredoc_partial_quote_delimiter
3030
# Partial quote in delimiter still prevents expansion
31-
### skip: TODO partial quoting in heredoc delimiter not implemented
3231
cat <<'EOF'"2"
3332
one
3433
two
@@ -40,7 +39,6 @@ two
4039
4140
### heredoc_pipe_first_line
4241
# Here doc with pipe on first line
43-
### skip: TODO heredoc piped to sort - pipe ordering issue
4442
cat <<EOF | sort
4543
c
4644
a
@@ -54,7 +52,6 @@ c
5452

5553
### heredoc_pipe_last_line
5654
# Here doc with pipe continued on last line
57-
### skip: TODO heredoc pipe continuation - pipe ordering issue
5855
cat <<EOF |
5956
c
6057
a
@@ -94,7 +91,6 @@ X 3
9491

9592
### heredoc_in_while_condition
9693
# Here doc in while condition and body
97-
### skip: TODO multiple heredocs in while condition not parsed
9894
while cat <<E1 && cat <<E2; do cat <<E3; break; done
9995
1
10096
E1
@@ -110,7 +106,6 @@ E3
110106

111107
### heredoc_multiline_condition
112108
# Here doc in while condition on multiple lines
113-
### skip: TODO multiple heredocs in while condition not parsed
114109
while cat <<E1 && cat <<E2
115110
1
116111
E1
@@ -130,7 +125,6 @@ done
130125

131126
### heredoc_with_multiline_dquote
132127
# Here doc with multiline double quoted string
133-
### skip: TODO heredoc followed by multiline dquote on same line not parsed correctly
134128
cat <<EOF; echo "two
135129
three"
136130
one

0 commit comments

Comments
 (0)