Skip to content

Commit eb874b7

Browse files
authored
feat(iconv): support //translit transliteration mode (#1136)
## Summary

- Parses `//translit` suffix on target encoding (e.g. `ascii//translit`)
- Non-ASCII characters are mapped to closest ASCII equivalents before encoding
- Comprehensive character mapping table covering Latin-1, Latin Extended-A, and common diacritics
- Unknown non-ASCII characters map to `?` (matching GNU iconv behavior)

## Test plan

- [x] `test_translit_to_ascii` — café → cafe
- [x] `test_translit_multiple_diacritics` — naïve → naive
- [x] `test_translit_german` — Héllo Wörld → Hello World
- [x] All 14 iconv unit tests pass (11 existing + 3 new)

Closes #1117
1 parent 9fddfdc commit eb874b7

File tree

1 file changed

+119
-13
lines changed

1 file changed

+119
-13
lines changed

crates/bashkit/src/builtins/iconv.rs

Lines changed: 119 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@ use crate::interpreter::ExecResult;
1717
/// iconv builtin - character encoding conversion.
1818
pub struct Iconv;
1919

20+
/// Parse an encoding spec like "ascii//translit" into (encoding, translit).
21+
fn parse_encoding_spec(spec: &str) -> (Option<&'static str>, bool) {
22+
let (name, translit) = if let Some(pos) = spec.find("//") {
23+
let suffix = &spec[pos + 2..];
24+
(&spec[..pos], suffix.eq_ignore_ascii_case("translit"))
25+
} else {
26+
(spec, false)
27+
};
28+
(normalize_encoding(name), translit)
29+
}
30+
2031
/// Normalize encoding name to canonical form.
2132
fn normalize_encoding(name: &str) -> Option<&'static str> {
2233
match name.to_ascii_lowercase().replace('-', "").as_str() {
@@ -29,6 +40,54 @@ fn normalize_encoding(name: &str) -> Option<&'static str> {
2940
}
3041
}
3142

43+
/// Transliterate non-ASCII characters to their closest ASCII equivalents.
///
/// ASCII characters are copied through unchanged. Accented Latin letters from
/// the Latin-1 Supplement and Latin Extended-A blocks are replaced by their
/// base letter, in both lower and upper case. Ligatures and letters with no
/// single-letter equivalent expand to two characters (`ß` → `ss`, `æ` → `ae`,
/// `œ` → `oe`, `þ` → `th`, and their upper-case forms). Any other non-ASCII
/// character becomes `?`.
///
/// The returned string is always pure ASCII.
fn transliterate_to_ascii(input: &str) -> String {
    // Every non-ASCII char occupies at least two UTF-8 bytes and expands to at
    // most two ASCII bytes, so the output never outgrows the input.
    let mut result = String::with_capacity(input.len());
    for ch in input.chars() {
        if ch.is_ascii() {
            result.push(ch);
            continue;
        }
        result.push_str(match ch {
            // Lower case, single-letter replacements.
            'á' | 'à' | 'â' | 'ã' | 'ä' | 'å' | 'ā' | 'ă' | 'ą' => "a",
            'ç' | 'ć' | 'ĉ' | 'ċ' | 'č' => "c",
            'ď' | 'đ' | 'ð' => "d",
            'é' | 'è' | 'ê' | 'ë' | 'ē' | 'ĕ' | 'ė' | 'ę' | 'ě' => "e",
            'ĝ' | 'ğ' | 'ġ' | 'ģ' => "g",
            'ĥ' | 'ħ' => "h",
            'í' | 'ì' | 'î' | 'ï' | 'ĩ' | 'ī' | 'ĭ' | 'į' | 'ı' => "i",
            'ĵ' => "j",
            'ķ' => "k",
            'ĺ' | 'ļ' | 'ľ' | 'ł' => "l",
            'ñ' | 'ń' | 'ņ' | 'ň' => "n",
            'ó' | 'ò' | 'ô' | 'õ' | 'ö' | 'ø' | 'ō' | 'ŏ' | 'ő' => "o",
            'ŕ' | 'ŗ' | 'ř' => "r",
            'ś' | 'ŝ' | 'ş' | 'š' => "s",
            'ţ' | 'ť' | 'ŧ' => "t",
            'ú' | 'ù' | 'û' | 'ü' | 'ũ' | 'ū' | 'ŭ' | 'ů' | 'ű' | 'ų' => "u",
            'ŵ' => "w",
            'ý' | 'ÿ' | 'ŷ' => "y",
            'ź' | 'ż' | 'ž' => "z",
            // Lower case, two-letter expansions.
            'ß' => "ss",
            'æ' => "ae",
            'œ' => "oe",
            'þ' => "th",
            // Upper case, single-letter replacements (mirrors the table above).
            'Á' | 'À' | 'Â' | 'Ã' | 'Ä' | 'Å' | 'Ā' | 'Ă' | 'Ą' => "A",
            'Ç' | 'Ć' | 'Ĉ' | 'Ċ' | 'Č' => "C",
            'Ď' | 'Đ' | 'Ð' => "D",
            'É' | 'È' | 'Ê' | 'Ë' | 'Ē' | 'Ĕ' | 'Ė' | 'Ę' | 'Ě' => "E",
            'Ĝ' | 'Ğ' | 'Ġ' | 'Ģ' => "G",
            'Ĥ' | 'Ħ' => "H",
            'Í' | 'Ì' | 'Î' | 'Ï' | 'Ĩ' | 'Ī' | 'Ĭ' | 'Į' | 'İ' => "I",
            'Ĵ' => "J",
            'Ķ' => "K",
            'Ĺ' | 'Ļ' | 'Ľ' | 'Ł' => "L",
            'Ñ' | 'Ń' | 'Ņ' | 'Ň' => "N",
            'Ó' | 'Ò' | 'Ô' | 'Õ' | 'Ö' | 'Ø' | 'Ō' | 'Ŏ' | 'Ő' => "O",
            'Ŕ' | 'Ŗ' | 'Ř' => "R",
            'Ś' | 'Ŝ' | 'Ş' | 'Š' => "S",
            'Ţ' | 'Ť' | 'Ŧ' => "T",
            'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ũ' | 'Ū' | 'Ŭ' | 'Ů' | 'Ű' | 'Ų' => "U",
            'Ŵ' => "W",
            'Ý' | 'Ÿ' | 'Ŷ' => "Y",
            'Ź' | 'Ż' | 'Ž' => "Z",
            // Upper case, two-letter expansions.
            'Æ' => "AE",
            'Œ' => "OE",
            'Þ' => "TH",
            // No known ASCII approximation.
            _ => "?",
        });
    }
    result
}
90+
3291
const SUPPORTED_ENCODINGS: &[&str] = &[
3392
"ASCII",
3493
"ISO-8859-1",
@@ -39,22 +98,33 @@ const SUPPORTED_ENCODINGS: &[&str] = &[
3998
];
4099

41100
/// Encode bytes from UTF-8 string into target encoding.
42-
fn encode_to(input: &str, encoding: &str) -> std::result::Result<Vec<u8>, String> {
101+
/// When `transliterate` is true, non-ASCII characters are replaced with
102+
/// ASCII equivalents before encoding (supports `//translit` mode).
103+
fn encode_to(
104+
input: &str,
105+
encoding: &str,
106+
transliterate: bool,
107+
) -> std::result::Result<Vec<u8>, String> {
108+
let text = if transliterate {
109+
transliterate_to_ascii(input)
110+
} else {
111+
input.to_string()
112+
};
43113
match encoding {
44-
"utf-8" => Ok(input.as_bytes().to_vec()),
114+
"utf-8" => Ok(text.as_bytes().to_vec()),
45115
"ascii" => {
46-
for (i, b) in input.bytes().enumerate() {
116+
for (i, b) in text.bytes().enumerate() {
47117
if b > 127 {
48118
return Err(format!(
49119
"iconv: cannot convert character at byte {i} to ASCII\n"
50120
));
51121
}
52122
}
53-
Ok(input.as_bytes().to_vec())
123+
Ok(text.as_bytes().to_vec())
54124
}
55125
"latin1" => {
56-
let mut out = Vec::with_capacity(input.len());
57-
for ch in input.chars() {
126+
let mut out = Vec::with_capacity(text.len());
127+
for ch in text.chars() {
58128
let cp = ch as u32;
59129
if cp > 255 {
60130
return Err(format!("iconv: cannot convert U+{cp:04X} to LATIN1\n"));
@@ -67,7 +137,7 @@ fn encode_to(input: &str, encoding: &str) -> std::result::Result<Vec<u8>, String
67137
let mut out = Vec::new();
68138
// BOM little-endian
69139
out.extend_from_slice(&[0xFF, 0xFE]);
70-
for ch in input.chars() {
140+
for ch in text.chars() {
71141
let mut buf = [0u16; 2];
72142
let encoded = ch.encode_utf16(&mut buf);
73143
for u in encoded {
@@ -78,7 +148,7 @@ fn encode_to(input: &str, encoding: &str) -> std::result::Result<Vec<u8>, String
78148
}
79149
"utf-16be" => {
80150
let mut out = Vec::new();
81-
for ch in input.chars() {
151+
for ch in text.chars() {
82152
let mut buf = [0u16; 2];
83153
let encoded = ch.encode_utf16(&mut buf);
84154
for u in encoded {
@@ -227,10 +297,10 @@ impl Builtin for Iconv {
227297
}
228298
};
229299

230-
let to = match &to_enc {
231-
Some(t) => match normalize_encoding(t) {
232-
Some(e) => e,
233-
None => {
300+
let (to, to_translit) = match &to_enc {
301+
Some(t) => match parse_encoding_spec(t) {
302+
(Some(e), translit) => (e, translit),
303+
(None, _) => {
234304
return Ok(ExecResult::err(
235305
format!("iconv: unsupported encoding '{}'\n", t),
236306
1,
@@ -268,7 +338,7 @@ impl Builtin for Iconv {
268338
};
269339

270340
// Encode from UTF-8 string to target encoding
271-
let output_bytes = match encode_to(&text, to) {
341+
let output_bytes = match encode_to(&text, to, to_translit) {
272342
Ok(b) => b,
273343
Err(e) => return Ok(ExecResult::err(e, 1)),
274344
};
@@ -415,4 +485,40 @@ mod tests {
415485
assert_eq!(r.exit_code, 0);
416486
assert_eq!(r.stdout, "hello world\n");
417487
}
488+
489+
#[tokio::test]
490+
async fn test_translit_to_ascii() {
491+
let r = run(
492+
&["-f", "UTF-8", "-t", "ascii//translit"],
493+
Some("caf\u{00e9}"),
494+
None,
495+
)
496+
.await;
497+
assert_eq!(r.exit_code, 0);
498+
assert_eq!(r.stdout, "cafe");
499+
}
500+
501+
#[tokio::test]
502+
async fn test_translit_multiple_diacritics() {
503+
let r = run(
504+
&["-f", "UTF-8", "-t", "ASCII//TRANSLIT"],
505+
Some("na\u{00ef}ve"),
506+
None,
507+
)
508+
.await;
509+
assert_eq!(r.exit_code, 0);
510+
assert_eq!(r.stdout, "naive");
511+
}
512+
513+
#[tokio::test]
514+
async fn test_translit_german() {
515+
let r = run(
516+
&["-f", "utf-8", "-t", "ascii//translit"],
517+
Some("H\u{00e9}llo W\u{00f6}rld"),
518+
None,
519+
)
520+
.await;
521+
assert_eq!(r.exit_code, 0);
522+
assert_eq!(r.stdout, "Hello World");
523+
}
418524
}

0 commit comments

Comments
 (0)