22//!
33//! Tokenizes input into a stream of tokens with source position tracking.
44
5+ use std:: collections:: VecDeque ;
6+
57use super :: span:: { Position , Span } ;
68use super :: tokens:: Token ;
79
@@ -19,9 +21,9 @@ pub struct Lexer<'a> {
1921 /// Current position in the input
2022 position : Position ,
2123 chars : std:: iter:: Peekable < std:: str:: Chars < ' a > > ,
22- /// Rest- of-line text captured during heredoc parsing .
23- /// In `cat <<EOF > file`, this holds ` > file `.
24- pub heredoc_rest_of_line : String ,
24+ /// Buffer for re-injected characters (e.g., rest- of-line after heredoc delimiter) .
25+ /// Consumed before `chars `.
26+ reinject_buf : VecDeque < char > ,
2527}
2628
2729impl < ' a > Lexer < ' a > {
@@ -31,7 +33,7 @@ impl<'a> Lexer<'a> {
3133 input,
3234 position : Position :: new ( ) ,
3335 chars : input. chars ( ) . peekable ( ) ,
34- heredoc_rest_of_line : String :: new ( ) ,
36+ reinject_buf : VecDeque :: new ( ) ,
3537 }
3638 }
3739
@@ -47,11 +49,19 @@ impl<'a> Lexer<'a> {
4749 }
4850
4951 fn peek_char ( & mut self ) -> Option < char > {
50- self . chars . peek ( ) . copied ( )
52+ if let Some ( & ch) = self . reinject_buf . front ( ) {
53+ Some ( ch)
54+ } else {
55+ self . chars . peek ( ) . copied ( )
56+ }
5157 }
5258
5359 fn advance ( & mut self ) -> Option < char > {
54- let ch = self . chars . next ( ) ;
60+ let ch = if !self . reinject_buf . is_empty ( ) {
61+ self . reinject_buf . pop_front ( )
62+ } else {
63+ self . chars . next ( )
64+ } ;
5565 if let Some ( c) = ch {
5666 self . position . advance ( c) ;
5767 }
@@ -708,10 +718,67 @@ impl<'a> Lexer<'a> {
708718 self . advance ( ) ;
709719 }
710720
721+ // If next char is another quote or word char, concatenate (e.g., 'EOF'"2" -> EOF2).
722+ // Any quoting makes the whole token literal.
723+ self . read_continuation_into ( & mut content) ;
724+
711725 // Single-quoted strings are literal - no variable expansion
712726 Some ( Token :: LiteralWord ( content) )
713727 }
714728
729+ /// After a closing quote, read any adjacent quoted or unquoted word chars
730+ /// into `content`. Handles concatenation like `'foo'"bar"baz` -> `foobarbaz`.
731+ fn read_continuation_into ( & mut self , content : & mut String ) {
732+ loop {
733+ match self . peek_char ( ) {
734+ Some ( '\'' ) => {
735+ self . advance ( ) ; // opening '
736+ while let Some ( ch) = self . peek_char ( ) {
737+ if ch == '\'' {
738+ self . advance ( ) ; // closing '
739+ break ;
740+ }
741+ content. push ( ch) ;
742+ self . advance ( ) ;
743+ }
744+ }
745+ Some ( '"' ) => {
746+ self . advance ( ) ; // opening "
747+ while let Some ( ch) = self . peek_char ( ) {
748+ if ch == '"' {
749+ self . advance ( ) ; // closing "
750+ break ;
751+ }
752+ if ch == '\\' {
753+ self . advance ( ) ;
754+ if let Some ( next) = self . peek_char ( ) {
755+ match next {
756+ '"' | '\\' | '$' | '`' => {
757+ content. push ( next) ;
758+ self . advance ( ) ;
759+ }
760+ _ => {
761+ content. push ( '\\' ) ;
762+ content. push ( next) ;
763+ self . advance ( ) ;
764+ }
765+ }
766+ continue ;
767+ }
768+ }
769+ content. push ( ch) ;
770+ self . advance ( ) ;
771+ }
772+ }
773+ Some ( ch) if self . is_word_char ( ch) => {
774+ content. push ( ch) ;
775+ self . advance ( ) ;
776+ }
777+ _ => break ,
778+ }
779+ }
780+ }
781+
715782 /// Read ANSI-C quoted content ($'...').
716783 /// Opening $' already consumed. Returns the resolved string.
717784 fn read_dollar_single_quoted_content ( & mut self ) -> String {
@@ -1212,16 +1279,34 @@ impl<'a> Lexer<'a> {
12121279 let mut content = String :: new ( ) ;
12131280 let mut current_line = String :: new ( ) ;
12141281
1215- // Collect the rest of the command line after the heredoc delimiter.
1216- // In bash, `cat <<EOF > file` means `> file` is still part of
1217- // the command and should be parsed for redirections.
1218- self . heredoc_rest_of_line . clear ( ) ;
1282+ // Save rest of current line (after the delimiter token on the command line).
1283+ // For `cat <<EOF | sort`, this captures ` | sort` so the parser can
1284+ // tokenize the pipe and subsequent command after the heredoc body.
1285+ //
1286+ // Quoted strings may span multiple lines (e.g., `cat <<EOF; echo "two\nthree"`),
1287+ // so we track quoting state and continue across newlines until quotes close.
1288+ let mut rest_of_line = String :: new ( ) ;
1289+ let mut in_double_quote = false ;
1290+ let mut in_single_quote = false ;
12191291 while let Some ( ch) = self . peek_char ( ) {
12201292 self . advance ( ) ;
1221- if ch == '\n' {
1293+ if ch == '\n' && !in_double_quote && !in_single_quote {
12221294 break ;
12231295 }
1224- self . heredoc_rest_of_line . push ( ch) ;
1296+ if ch == '"' && !in_single_quote {
1297+ in_double_quote = !in_double_quote;
1298+ } else if ch == '\'' && !in_double_quote {
1299+ in_single_quote = !in_single_quote;
1300+ } else if ch == '\\' && in_double_quote {
1301+ // Escaped char inside double quotes — skip the next char too
1302+ rest_of_line. push ( ch) ;
1303+ if let Some ( next) = self . peek_char ( ) {
1304+ rest_of_line. push ( next) ;
1305+ self . advance ( ) ;
1306+ }
1307+ continue ;
1308+ }
1309+ rest_of_line. push ( ch) ;
12251310 }
12261311
12271312 // Read lines until we find the delimiter
@@ -1254,6 +1339,15 @@ impl<'a> Lexer<'a> {
12541339 }
12551340 }
12561341
1342+ // Re-inject saved rest-of-line so subsequent tokens (pipes, commands, etc.)
1343+ // are visible to the parser. Add a newline so the tokenizer sees the line break.
1344+ if !rest_of_line. is_empty ( ) {
1345+ for ch in rest_of_line. chars ( ) {
1346+ self . reinject_buf . push_back ( ch) ;
1347+ }
1348+ self . reinject_buf . push_back ( '\n' ) ;
1349+ }
1350+
12571351 content
12581352 }
12591353}
@@ -1370,15 +1464,13 @@ mod tests {
13701464 let mut lexer = Lexer :: new ( "\n hello\n world\n EOF" ) ;
13711465 let content = lexer. read_heredoc ( "EOF" ) ;
13721466 assert_eq ! ( content, "hello\n world\n " ) ;
1373- assert_eq ! ( lexer. heredoc_rest_of_line. trim( ) , "" ) ;
13741467 }
13751468
/// A heredoc with a single body line returns that line as its content.
#[test]
fn test_read_heredoc_single_line() {
    let body = Lexer::new("\n test\n EOF").read_heredoc("EOF");
    assert_eq!(body, "test\n ");
}
13831475
13841476 #[ test]
@@ -1394,18 +1486,23 @@ mod tests {
13941486 // Now read heredoc content
13951487 let content = lexer. read_heredoc ( "EOF" ) ;
13961488 assert_eq ! ( content, "hello\n world\n " ) ;
1397- assert_eq ! ( lexer. heredoc_rest_of_line. trim( ) , "" ) ;
13981489 }
13991490
/// Text following the heredoc delimiter on the command line (`> file.txt`)
/// is fed back into the lexer after the body is read, so it shows up as
/// ordinary tokens.
#[test]
fn test_read_heredoc_with_redirect() {
    let mut lexer = Lexer::new("cat <<EOF > file.txt\n hello\n EOF");

    // Command line up to and including the heredoc delimiter.
    let head = vec![
        Token::Word("cat".to_string()),
        Token::HereDoc,
        Token::Word("EOF".to_string()),
    ];
    for expected in head {
        assert_eq!(lexer.next_token(), Some(expected));
    }

    // Heredoc body.
    assert_eq!(lexer.read_heredoc("EOF"), "hello\n ");

    // The redirect that followed the delimiter is now available from the lexer.
    let tail = vec![Token::RedirectOut, Token::Word("file.txt".to_string())];
    for expected in tail {
        assert_eq!(lexer.next_token(), Some(expected));
    }
}
14101507
14111508 #[ test]
0 commit comments