@@ -17,6 +17,17 @@ use crate::interpreter::ExecResult;
/// The `iconv` builtin: converts text between character encodings.
///
/// This is a unit struct and carries no state of its own.
pub struct Iconv;
1919
20+ /// Parse an encoding spec like "ascii//translit" into (encoding, translit).
21+ fn parse_encoding_spec ( spec : & str ) -> ( Option < & ' static str > , bool ) {
22+ let ( name, translit) = if let Some ( pos) = spec. find ( "//" ) {
23+ let suffix = & spec[ pos + 2 ..] ;
24+ ( & spec[ ..pos] , suffix. eq_ignore_ascii_case ( "translit" ) )
25+ } else {
26+ ( spec, false )
27+ } ;
28+ ( normalize_encoding ( name) , translit)
29+ }
30+
2031/// Normalize encoding name to canonical form.
2132fn normalize_encoding ( name : & str ) -> Option < & ' static str > {
2233 match name. to_ascii_lowercase ( ) . replace ( '-' , "" ) . as_str ( ) {
@@ -29,6 +40,54 @@ fn normalize_encoding(name: &str) -> Option<&'static str> {
2940 }
3041}
3142
/// Transliterate non-ASCII characters to their closest ASCII equivalents.
///
/// ASCII characters are copied through unchanged. Every other character is
/// replaced by the string returned from `ascii_replacement`, which may be
/// more than one letter (for example `ß` becomes `ss`). Characters without a
/// known equivalent become `?`.
///
/// The returned string contains only ASCII.
fn transliterate_to_ascii(input: &str) -> String {
    // Every replacement is at most two bytes and every non-ASCII char occupies
    // at least two bytes of `input`, so `input.len()` is always enough room.
    let mut result = String::with_capacity(input.len());
    for ch in input.chars() {
        if ch.is_ascii() {
            result.push(ch);
        } else {
            result.push_str(ascii_replacement(ch));
        }
    }
    result
}

/// Return the ASCII replacement for a single non-ASCII character, or `"?"`
/// when no mapping is known.
fn ascii_replacement(ch: char) -> &'static str {
    match ch {
        // Lowercase letters with diacritics.
        'á' | 'à' | 'â' | 'ã' | 'ä' | 'å' | 'ā' | 'ă' | 'ą' => "a",
        'é' | 'è' | 'ê' | 'ë' | 'ē' | 'ĕ' | 'ė' | 'ę' | 'ě' => "e",
        'í' | 'ì' | 'î' | 'ï' | 'ĩ' | 'ī' | 'ĭ' | 'į' | 'ı' => "i",
        'ó' | 'ò' | 'ô' | 'õ' | 'ö' | 'ø' | 'ō' | 'ŏ' | 'ő' => "o",
        'ú' | 'ù' | 'û' | 'ü' | 'ũ' | 'ū' | 'ŭ' | 'ů' | 'ű' | 'ų' => "u",
        'ý' | 'ÿ' | 'ŷ' => "y",
        'ñ' | 'ń' | 'ņ' | 'ň' => "n",
        'ç' | 'ć' | 'č' | 'ĉ' | 'ċ' => "c",
        'ð' | 'đ' | 'ď' => "d",
        'ł' => "l",
        'ğ' => "g",
        'ş' | 'š' | 'ś' | 'ŝ' => "s",
        'ž' | 'ź' | 'ż' => "z",
        'ř' => "r",
        'ť' | 'ţ' => "t",
        // Letters whose conventional ASCII spelling is two letters.
        'ß' => "ss",
        'æ' => "ae",
        'œ' => "oe",
        'þ' => "th",
        // Uppercase letters with diacritics.
        'Á' | 'À' | 'Â' | 'Ã' | 'Ä' | 'Å' | 'Ā' | 'Ă' | 'Ą' => "A",
        'É' | 'È' | 'Ê' | 'Ë' | 'Ē' | 'Ĕ' | 'Ė' | 'Ę' | 'Ě' => "E",
        'Í' | 'Ì' | 'Î' | 'Ï' | 'Ĩ' | 'Ī' | 'Ĭ' | 'Į' | 'İ' => "I",
        'Ó' | 'Ò' | 'Ô' | 'Õ' | 'Ö' | 'Ø' | 'Ō' | 'Ŏ' | 'Ő' => "O",
        'Ú' | 'Ù' | 'Û' | 'Ü' | 'Ũ' | 'Ū' | 'Ŭ' | 'Ů' | 'Ű' | 'Ų' => "U",
        'Ý' | 'Ÿ' | 'Ŷ' => "Y",
        'Ñ' | 'Ń' | 'Ņ' | 'Ň' => "N",
        'Ç' | 'Ć' | 'Č' | 'Ĉ' | 'Ċ' => "C",
        'Ð' | 'Đ' | 'Ď' => "D",
        'Ł' => "L",
        'Ğ' => "G",
        'Ş' | 'Š' | 'Ś' | 'Ŝ' => "S",
        'Ž' | 'Ź' | 'Ż' => "Z",
        'Ř' => "R",
        'Ť' | 'Ţ' => "T",
        'Æ' => "AE",
        'Œ' => "OE",
        'Þ' => "TH",
        // No known ASCII equivalent.
        _ => "?",
    }
}
90+
3291const SUPPORTED_ENCODINGS : & [ & str ] = & [
3392 "ASCII" ,
3493 "ISO-8859-1" ,
@@ -39,22 +98,33 @@ const SUPPORTED_ENCODINGS: &[&str] = &[
3998] ;
4099
41100/// Encode bytes from UTF-8 string into target encoding.
42- fn encode_to ( input : & str , encoding : & str ) -> std:: result:: Result < Vec < u8 > , String > {
101+ /// When `transliterate` is true, non-ASCII characters are replaced with
102+ /// ASCII equivalents before encoding (supports `//translit` mode).
103+ fn encode_to (
104+ input : & str ,
105+ encoding : & str ,
106+ transliterate : bool ,
107+ ) -> std:: result:: Result < Vec < u8 > , String > {
108+ let text = if transliterate {
109+ transliterate_to_ascii ( input)
110+ } else {
111+ input. to_string ( )
112+ } ;
43113 match encoding {
44- "utf-8" => Ok ( input . as_bytes ( ) . to_vec ( ) ) ,
114+ "utf-8" => Ok ( text . as_bytes ( ) . to_vec ( ) ) ,
45115 "ascii" => {
46- for ( i, b) in input . bytes ( ) . enumerate ( ) {
116+ for ( i, b) in text . bytes ( ) . enumerate ( ) {
47117 if b > 127 {
48118 return Err ( format ! (
49119 "iconv: cannot convert character at byte {i} to ASCII\n "
50120 ) ) ;
51121 }
52122 }
53- Ok ( input . as_bytes ( ) . to_vec ( ) )
123+ Ok ( text . as_bytes ( ) . to_vec ( ) )
54124 }
55125 "latin1" => {
56- let mut out = Vec :: with_capacity ( input . len ( ) ) ;
57- for ch in input . chars ( ) {
126+ let mut out = Vec :: with_capacity ( text . len ( ) ) ;
127+ for ch in text . chars ( ) {
58128 let cp = ch as u32 ;
59129 if cp > 255 {
60130 return Err ( format ! ( "iconv: cannot convert U+{cp:04X} to LATIN1\n " ) ) ;
@@ -67,7 +137,7 @@ fn encode_to(input: &str, encoding: &str) -> std::result::Result<Vec<u8>, String
67137 let mut out = Vec :: new ( ) ;
68138 // BOM little-endian
69139 out. extend_from_slice ( & [ 0xFF , 0xFE ] ) ;
70- for ch in input . chars ( ) {
140+ for ch in text . chars ( ) {
71141 let mut buf = [ 0u16 ; 2 ] ;
72142 let encoded = ch. encode_utf16 ( & mut buf) ;
73143 for u in encoded {
@@ -78,7 +148,7 @@ fn encode_to(input: &str, encoding: &str) -> std::result::Result<Vec<u8>, String
78148 }
79149 "utf-16be" => {
80150 let mut out = Vec :: new ( ) ;
81- for ch in input . chars ( ) {
151+ for ch in text . chars ( ) {
82152 let mut buf = [ 0u16 ; 2 ] ;
83153 let encoded = ch. encode_utf16 ( & mut buf) ;
84154 for u in encoded {
@@ -227,10 +297,10 @@ impl Builtin for Iconv {
227297 }
228298 } ;
229299
230- let to = match & to_enc {
231- Some ( t) => match normalize_encoding ( t) {
232- Some ( e) => e ,
233- None => {
300+ let ( to , to_translit ) = match & to_enc {
301+ Some ( t) => match parse_encoding_spec ( t) {
302+ ( Some ( e) , translit ) => ( e , translit ) ,
303+ ( None , _ ) => {
234304 return Ok ( ExecResult :: err (
235305 format ! ( "iconv: unsupported encoding '{}'\n " , t) ,
236306 1 ,
@@ -268,7 +338,7 @@ impl Builtin for Iconv {
268338 } ;
269339
270340 // Encode from UTF-8 string to target encoding
271- let output_bytes = match encode_to ( & text, to) {
341+ let output_bytes = match encode_to ( & text, to, to_translit ) {
272342 Ok ( b) => b,
273343 Err ( e) => return Ok ( ExecResult :: err ( e, 1 ) ) ,
274344 } ;
@@ -415,4 +485,40 @@ mod tests {
415485 assert_eq ! ( r. exit_code, 0 ) ;
416486 assert_eq ! ( r. stdout, "hello world\n " ) ;
417487 }
488+
489+ #[ tokio:: test]
490+ async fn test_translit_to_ascii ( ) {
491+ let r = run (
492+ & [ "-f" , "UTF-8" , "-t" , "ascii//translit" ] ,
493+ Some ( "caf\u{00e9} " ) ,
494+ None ,
495+ )
496+ . await ;
497+ assert_eq ! ( r. exit_code, 0 ) ;
498+ assert_eq ! ( r. stdout, "cafe" ) ;
499+ }
500+
501+ #[ tokio:: test]
502+ async fn test_translit_multiple_diacritics ( ) {
503+ let r = run (
504+ & [ "-f" , "UTF-8" , "-t" , "ASCII//TRANSLIT" ] ,
505+ Some ( "na\u{00ef} ve" ) ,
506+ None ,
507+ )
508+ . await ;
509+ assert_eq ! ( r. exit_code, 0 ) ;
510+ assert_eq ! ( r. stdout, "naive" ) ;
511+ }
512+
513+ #[ tokio:: test]
514+ async fn test_translit_german ( ) {
515+ let r = run (
516+ & [ "-f" , "utf-8" , "-t" , "ascii//translit" ] ,
517+ Some ( "H\u{00e9} llo W\u{00f6} rld" ) ,
518+ None ,
519+ )
520+ . await ;
521+ assert_eq ! ( r. exit_code, 0 ) ;
522+ assert_eq ! ( r. stdout, "Hello World" ) ;
523+ }
418524}
0 commit comments