Skip to content

Commit ef0c02d

Browse files
chaliy and claude authored
fix(interpreter): count unicode chars in ${#x} and add printf \u/\U escapes (#378)
## Summary

- Fix `${#x}` to count Unicode characters instead of bytes: `x=café; echo ${#x}` now correctly returns `4` instead of `5`
- Add `\uHHHH` (4-digit) and `\UHHHHHHHH` (8-digit) Unicode escape handling to the printf builtin, both in format strings and `%b` argument expansion
- Remove 3 `### skip:` markers from `unicode.test.sh` and replace with `### bash_diff:` markers (system bash behavior differs based on locale)

## Test plan

- [x] `cargo fmt --check` clean
- [x] `cargo clippy --all-targets --all-features -- -D warnings` clean
- [x] All 1015 unit tests pass
- [x] Bash spec tests: 1184 passed, 0 failed (100% pass rate)
- [x] Bash comparison tests: 1081/1081 match real bash (100%)
- [x] New printf unicode unit tests: `test_unicode_escape_u`, `test_unicode_escape_big_u`, `test_unicode_escape_ascii`, `test_unicode_escape_in_expand`

Closes #362

Co-authored-by: Claude <noreply@anthropic.com>
1 parent d3d162a commit ef0c02d

File tree

4 files changed

+87
-5
lines changed

4 files changed

+87
-5
lines changed

crates/bashkit/src/builtins/printf.rs

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,18 @@ fn format_string(format: &str, args: &[String], arg_index: &mut usize) -> String
232232
output.push(val as char);
233233
}
234234
}
235+
'u' => {
236+
// \uHHHH - 4-digit unicode escape
237+
if let Some(c) = parse_unicode_escape(&mut chars, 4) {
238+
output.push(c);
239+
}
240+
}
241+
'U' => {
242+
// \UHHHHHHHH - 8-digit unicode escape
243+
if let Some(c) = parse_unicode_escape(&mut chars, 8) {
244+
output.push(c);
245+
}
246+
}
235247
_ => {
236248
output.push('\\');
237249
output.push(next);
@@ -451,6 +463,18 @@ fn expand_escapes(s: &str) -> String {
451463
output.push(val as char);
452464
}
453465
}
466+
'u' => {
467+
// \uHHHH - 4-digit unicode escape
468+
if let Some(c) = parse_unicode_escape(&mut chars, 4) {
469+
output.push(c);
470+
}
471+
}
472+
'U' => {
473+
// \UHHHHHHHH - 8-digit unicode escape
474+
if let Some(c) = parse_unicode_escape(&mut chars, 8) {
475+
output.push(c);
476+
}
477+
}
454478
_ => {
455479
output.push('\\');
456480
output.push(next);
@@ -467,6 +491,30 @@ fn expand_escapes(s: &str) -> String {
467491
output
468492
}
469493

494+
/// Parse the hex digits of a unicode escape sequence (`\uHHHH` or `\UHHHHHHHH`)
/// from a char iterator positioned just after the `u` / `U`.
///
/// `max_digits` is the maximum number of hex digits to consume: 4 for `\u`
/// and 8 for `\U`. Fewer digits are accepted; parsing stops at the first
/// character that is not an ASCII hex digit, which is left in the iterator.
///
/// Returns `None` when no hex digit is present, or when the parsed value is
/// not a valid Unicode scalar value (a surrogate, a value above U+10FFFF, or
/// one that does not fit in a `u32`). Digits that were read stay consumed
/// even when `None` is returned.
fn parse_unicode_escape(
    chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
    max_digits: usize,
) -> Option<char> {
    // `None` here records an overflow past `u32`; we still keep consuming
    // digits so the iterator ends up in the same place regardless.
    let mut code_point: Option<u32> = Some(0);
    let mut digits_read = 0usize;

    for _ in 0..max_digits {
        // `to_digit(16)` accepts exactly the ASCII hex digits 0-9, a-f, A-F.
        match chars.peek().and_then(|c| c.to_digit(16)) {
            Some(digit) => {
                chars.next();
                digits_read += 1;
                code_point =
                    code_point.and_then(|acc| acc.checked_mul(16)?.checked_add(digit));
            }
            None => break,
        }
    }

    if digits_read == 0 {
        return None;
    }
    code_point.and_then(char::from_u32)
}
517+
470518
#[cfg(test)]
471519
mod tests {
472520
use super::*;
@@ -533,4 +581,38 @@ mod tests {
533581
let mut idx = 0;
534582
assert_eq!(format_string("%04x", &args, &mut idx), "00ff");
535583
}
584+
585+
#[test]
586+
fn test_unicode_escape_u() {
587+
// \u03bc -> μ (Greek small letter mu)
588+
let args = vec![];
589+
let mut idx = 0;
590+
assert_eq!(format_string("\\u03bc", &args, &mut idx), "\u{03bc}");
591+
}
592+
593+
#[test]
594+
fn test_unicode_escape_big_u() {
595+
// \U000003bc -> μ
596+
let args = vec![];
597+
let mut idx = 0;
598+
assert_eq!(format_string("\\U000003bc", &args, &mut idx), "\u{03bc}");
599+
}
600+
601+
#[test]
602+
fn test_unicode_escape_ascii() {
603+
// \u0041 -> A
604+
let args = vec![];
605+
let mut idx = 0;
606+
assert_eq!(
607+
format_string("\\u0041\\u0042\\u0043", &args, &mut idx),
608+
"ABC"
609+
);
610+
}
611+
612+
#[test]
613+
fn test_unicode_escape_in_expand() {
614+
// %b format also handles \u escapes
615+
assert_eq!(expand_escapes("\\u03bc"), "\u{03bc}");
616+
assert_eq!(expand_escapes("\\U000003bc"), "\u{03bc}");
617+
}
536618
}

crates/bashkit/src/interpreter/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5041,7 +5041,7 @@ impl Interpreter {
50415041
} else {
50425042
self.expand_variable(name)
50435043
};
5044-
result.push_str(&value.len().to_string());
5044+
result.push_str(&value.chars().count().to_string());
50455045
}
50465046
WordPart::ParameterExpansion {
50475047
name,

crates/bashkit/tests/spec_cases/bash/unicode.test.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@ echo $'\U000003bc'
4141

4242
### unicode_printf_u
4343
# printf \u escape
44-
### skip: TODO printf \u unicode escape not implemented
44+
### bash_diff: system bash printf \u requires UTF-8 locale
4545
printf '\u03bc\n'
4646
### expect
4747
μ
4848
### end
4949

5050
### unicode_printf_U
5151
# printf \U escape
52-
### skip: TODO printf \U unicode escape not implemented
52+
### bash_diff: system bash printf \U requires UTF-8 locale
5353
printf '\U000003bc\n'
5454
### expect
5555
μ
@@ -65,7 +65,7 @@ café
6565

6666
### unicode_string_length
6767
# String length of unicode string
68-
### skip: TODO ${#x} counts bytes instead of characters for unicode
68+
### bash_diff: system bash ${#x} counts bytes in POSIX locale, chars in UTF-8
6969
x=café
7070
echo ${#x}
7171
### expect

crates/bashkit/tests/spec_tests.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
//! - `### skip: reason` - Skip test entirely (not run in any test)
99
//! - `### bash_diff: reason` - Known difference from real bash (runs in spec tests, excluded from comparison)
1010
//!
11-
//! ## Skipped Tests (18 total)
11+
//! ## Skipped Tests (15 total)
1212
//!
1313
//! Actual `### skip:` markers across spec test files:
1414
//!

0 commit comments

Comments
 (0)