diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 33ec4eb..f11e809 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -33,13 +33,16 @@ jobs: uses: actions-rs/cargo@v1 with: command: test + args: --all - name: Check formatting uses: actions-rs/cargo@v1 with: command: fmt - args: -- --check + args: --all --check - name: Catch common mistakes uses: actions-rs/cargo@v1 with: command: clippy - args: --all-targets -- -D warnings + args: --all-targets --workspace -- -D warnings + - name: Regenerate Unicode tables + run: mv src/tables.rs tables.rs.bak && cd tables && cargo run && cd .. && diff src/tables.rs tables.rs.bak diff --git a/Cargo.toml b/Cargo.toml index a9fa2c3..adfea40 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,3 +9,6 @@ repository = "https://github.com/withoutboats/heck" keywords = ["string", "case", "camel", "snake", "unicode"] categories = ["no-std"] include = ["src/**/*", "LICENSE-*", "README.md", "CHANGELOG.md"] + +[workspace] +members = ["tables"] diff --git a/README.md b/README.md index 9b9b4e8..b4659c9 100644 --- a/README.md +++ b/README.md @@ -8,24 +8,42 @@ consistent, and reasonably well performing. ## Definition of a word boundary -Word boundaries are defined by non-alphanumeric characters, as well as -within those words in this manner: - -1. If an uppercase character is followed by lowercase letters, a word -boundary is considered to be just prior to that uppercase character. -2. If multiple uppercase characters are consecutive, they are considered to -be within a single word, except that the last will be part of the next word -if it is followed by lowercase characters (see rule 1). - -That is, "HelloWorld" is segmented `Hello|World` whereas "XMLHttpRequest" is -segmented `XML|Http|Request`. - -Characters not within words (such as spaces, punctuations, and underscores) -are not included in the output string except as they are a part of the case -being converted to. 
Multiple adjacent word boundaries (such as a series of -underscores) are folded into one. ("hello__world" in snake case is therefore -"hello_world", not the exact same string). Leading or trailing word boundary -indicators are dropped, except insofar as CamelCase capitalizes the first word. +The definition of a word boundary is based on the +[identifier word boundary](https://www.unicode.org/reports/tr55/#Identifier-Chunks) +in Unicode Technical Standard 55. The rules are as follows: + +- The set of characters that can be in a word is + [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`][1], + plus U+05F3, U+05F4, and U+0F0B. This notably includes + alphabetic and numeric characters, accents and other combining marks, + emoji, a few mathematical symbols, a few non-word-separating punctuation marks, + unassigned characters, and private-use characters. + +- Characters that cannot be in a word separate words. + For example, `foo_bar` is segmented `foo`|`bar` + because words cannot contain `_`. + These characters will be excluded from the output string. + +- Words cannot be empty. For example, `_foo__bar_` is segmented `foo`|`bar`, + and in snake_case becomes `foo_bar`. + +- There is a word boundary between a lowercase (or non-Greek titlecase) + and an uppercase (or titlecase) letter. For example, `fooBar` is segmented + `foo`|`Bar` because `oB` is a lowercase letter followed by an uppercase letter. + +- An uppercase letter followed by a lowercase letter + has a word boundary before it. For example, `XMLHttpRequest` is segmented + `XML`|`Http`|`Request`; the `Ht` in `HttpRequest` is an uppercase letter + followed by a lowercase letter, so there is a word boundary before it. + + - There is always a word boundary before a non-Greek titlecase letter + (U+01C5 'Dž', U+01C8 'Lj', U+01CB 'Nj', or U+01F2 'Dz'). 
+ + - For the purpose of the preceding three rules, a letter followed + by some number of nonspacing marks (like accents or other diacritics) + is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`. + +[1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BCn%7D%5Cp%7BCo%7D%5Cp%7BAlphabetic%7D%5Cp%7BN%7D-%5B%5Cp%7BP%7D-%5Cp%7BPo%7D%5D%5D&abb=on&g=&i= ## Cases contained in this library: diff --git a/src/lib.rs b/src/lib.rs index ab8a015..9ba5a9c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,25 +6,42 @@ //! //! ## Definition of a word boundary //! -//! Word boundaries are defined by non-alphanumeric characters, as well as -//! within those words in this manner: +//! The definition of a word boundary is based on the +//! [identifier word boundary](https://www.unicode.org/reports/tr55/#Identifier-Chunks) +//! in Unicode Technical Standard 55. The rules are as follows: //! -//! 1. If an uppercase character is followed by lowercase letters, a word -//! boundary is considered to be just prior to that uppercase character. -//! 2. If multiple uppercase characters are consecutive, they are considered to -//! be within a single word, except that the last will be part of the next word -//! if it is followed by lowercase characters (see rule 1). +//! - The set of characters that can be in a word is +//! [`[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`][1], +//! plus U+05F3, U+05F4, and U+0F0B. This notably includes +//! alphabetic and numeric characters, accents and other combining marks, +//! emoji, a few mathematical symbols, a few non-word-separating punctuation marks, +//! unassigned characters, and private-use characters. //! -//! That is, "HelloWorld" is segmented `Hello|World` whereas "XMLHttpRequest" is -//! segmented `XML|Http|Request`. +//! - Characters that cannot be in a word separate words. +//! 
For example, `foo_bar` is segmented `foo`|`bar` +//! because words cannot contain `_`. +//! These characters will be excluded from the output string. //! -//! Characters not within words (such as spaces, punctuations, and underscores) -//! are not included in the output string except as they are a part of the case -//! being converted to. Multiple adjacent word boundaries (such as a series of -//! underscores) are folded into one. ("hello__world" in snake case is therefore -//! "hello_world", not the exact same string). Leading or trailing word boundary -//! indicators are dropped, except insofar as CamelCase capitalizes the first -//! word. +//! - Words cannot be empty. For example, `_foo__bar_` is segmented `foo`|`bar`, +//! and in snake_case becomes `foo_bar`. +//! +//! - There is a word boundary between a lowercase (or non-Greek titlecase) +//! and an uppercase (or titlecase) letter. For example, `fooBar` is segmented +//! `foo`|`Bar` because `oB` is a lowercase letter followed by an uppercase letter. +//! +//! - An uppercase letter followed by a lowercase letter +//! has a word boundary before it. For example, `XMLHttpRequest` is segmented +//! `XML`|`Http`|`Request`; the `Ht` in `HttpRequest` is an uppercase letter +//! followed by a lowercase letter, so there is a word boundary before it. +//! +//! - There is always a word boundary before a non-Greek titlecase letter +//! (U+01C5 'Dž', U+01C8 'Lj', U+01CB 'Nj', or U+01F2 'Dz'). +//! +//! - For the purpose of the preceding three rules, a letter followed +//! by some number of nonspacing marks (like accents or other diacritics) +//! is treated as if it was the letter alone. For example, `áB` is segmented `á`|`B`. +//! +//! [1]: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cp%7BID_Continue%7D%5Cp%7BID_Compat_Math_Continue%7D%5Cp%7BCn%7D%5Cp%7BCo%7D%5Cp%7BAlphabetic%7D%5Cp%7BN%7D-%5B%5Cp%7BP%7D-%5Cp%7BPo%7D%5D%5D&abb=on&g=&i= //! //! ### Cases contained in this library: //! 
@@ -47,6 +64,7 @@ mod lower_camel; mod shouty_kebab; mod shouty_snake; mod snake; +mod tables; mod title; mod train; mod upper_camel; @@ -58,6 +76,7 @@ pub use shouty_snake::{ AsShoutySnakeCase, AsShoutySnakeCase as AsShoutySnekCase, ToShoutySnakeCase, ToShoutySnekCase, }; pub use snake::{AsSnakeCase, AsSnakeCase as AsSnekCase, ToSnakeCase, ToSnekCase}; +pub use tables::UNICODE_VERSION; pub use title::{AsTitleCase, ToTitleCase}; pub use train::{AsTrainCase, ToTrainCase}; pub use upper_camel::{ @@ -66,6 +85,8 @@ pub use upper_camel::{ use core::fmt; +use tables::{is_non_greek_titlecase, CasedLetterKind}; + fn transform( s: &str, mut with_word: F, @@ -76,82 +97,100 @@ where F: FnMut(&str, &mut fmt::Formatter) -> fmt::Result, G: FnMut(&mut fmt::Formatter) -> fmt::Result, { - /// Tracks the current 'mode' of the transformation algorithm as it scans - /// the input string. - /// - /// The mode is a tri-state which tracks the case of the last cased - /// character of the current word. If there is no cased character - /// (either lowercase or uppercase) since the previous word boundary, - /// than the mode is `Boundary`. If the last cased character is lowercase, - /// then the mode is `Lowercase`. Othertherwise, the mode is - /// `Uppercase`. - #[derive(Clone, Copy, PartialEq)] - enum WordMode { - /// There have been no lowercase or uppercase characters in the current - /// word. - Boundary, - /// The previous cased character in the current word is lowercase. - Lowercase, - /// The previous cased character in the current word is uppercase. - Uppercase, - } - let mut first_word = true; - for word in s.split(|c: char| !c.is_alphanumeric()) { - let mut char_indices = word.char_indices().peekable(); - let mut init = 0; - let mut mode = WordMode::Boundary; - - while let Some((i, c)) = char_indices.next() { - if let Some(&(next_i, next)) = char_indices.peek() { - // The mode including the current character, assuming the - // current character does not result in a word boundary. 
- let next_mode = if c.is_lowercase() { - WordMode::Lowercase - } else if c.is_uppercase() { - WordMode::Uppercase - } else { - mode - }; - - // Word boundary after if current is not uppercase and next - // is uppercase - if next_mode == WordMode::Lowercase && next.is_uppercase() { - if !first_word { - boundary(f)?; + for word in s.split(|c: char| !tables::allowed_in_word(c)) { + let mut start_of_word_idx = 0; + // Whether the previous character seen, ignoring nonspacing marks, + // was lowercase or non-Greek titlecase. + // Used for determining CamelBoundaries. + let mut prev_was_lowercase_or_non_greek_titlecase = false; + // If the previous character seen, ignoring nonspacing marks, + // was uppercase or titlecase, then this stores that character's index. + // Otherwise, it stores `None`. + // Used for determining HATBoundaries. + let mut index_of_preceding_uppercase_or_titlecase_letter: Option = None; + + for (i, c) in word.char_indices() { + match tables::letter_casing(c) { + None => { + // Nonspacing marks are ignored for the purpose of determining boundaries. 
+ if !tables::is_nonspacing_mark(c) { + prev_was_lowercase_or_non_greek_titlecase = false; + index_of_preceding_uppercase_or_titlecase_letter = None; + } + } + Some(CasedLetterKind::Lowercase) => { + prev_was_lowercase_or_non_greek_titlecase = true; + // There is a HATBoundary before an uppercase or titlecase letter followed by a lowercase letter + if let Some(preceding_idx) = index_of_preceding_uppercase_or_titlecase_letter { + index_of_preceding_uppercase_or_titlecase_letter = None; + if preceding_idx != start_of_word_idx { + if !first_word { + boundary(f)?; + } else { + first_word = false; + } + with_word(&word[start_of_word_idx..preceding_idx], f)?; + start_of_word_idx = preceding_idx; + } } - with_word(&word[init..next_i], f)?; - first_word = false; - init = next_i; - mode = WordMode::Boundary; - - // Otherwise if current and previous are uppercase and next - // is lowercase, word boundary before - } else if mode == WordMode::Uppercase && c.is_uppercase() && next.is_lowercase() { - if !first_word { - boundary(f)?; + } + Some(CasedLetterKind::Uppercase) => { + index_of_preceding_uppercase_or_titlecase_letter = Some(i); + // There is a CamelBoundary before an uppercase letter + // that is preceded by a lowercase or non-Greek titlecase letter + if prev_was_lowercase_or_non_greek_titlecase { + prev_was_lowercase_or_non_greek_titlecase = false; + if !first_word { + boundary(f)?; + } else { + first_word = false; + } + with_word(&word[start_of_word_idx..i], f)?; + start_of_word_idx = i; + } + } + Some(CasedLetterKind::Titlecase) => { + index_of_preceding_uppercase_or_titlecase_letter = Some(i); + // There is always a HATBoundary before a non-Greek titlecase letter + if is_non_greek_titlecase(c) { + prev_was_lowercase_or_non_greek_titlecase = true; + if i != start_of_word_idx { + if !first_word { + boundary(f)?; + } else { + first_word = false; + } + with_word(&word[start_of_word_idx..i], f)?; + start_of_word_idx = i; + } } else { - first_word = false; + // There is a 
CamelBoundary before a titlecase letter + // that is preceded by a lowercase or non-Greek titlecase letter + if prev_was_lowercase_or_non_greek_titlecase { + prev_was_lowercase_or_non_greek_titlecase = false; + if !first_word { + boundary(f)?; + } else { + first_word = false; + } + with_word(&word[start_of_word_idx..i], f)?; + start_of_word_idx = i; + } } - with_word(&word[init..i], f)?; - init = i; - mode = WordMode::Boundary; - - // Otherwise no word boundary, just update the mode - } else { - mode = next_mode; } + } + } + + if start_of_word_idx != word.len() { + // Collect trailing characters as a word + if !first_word { + boundary(f)?; } else { - // Collect trailing characters as a word - if !first_word { - boundary(f)?; - } else { - first_word = false; - } - with_word(&word[init..], f)?; - break; + first_word = false; } + with_word(&word[start_of_word_idx..], f)?; } } @@ -179,13 +218,25 @@ fn uppercase(s: &str, f: &mut fmt::Formatter) -> fmt::Result { Ok(()) } -fn capitalize(s: &str, f: &mut fmt::Formatter) -> fmt::Result { - let mut char_indices = s.char_indices(); - if let Some((_, c)) = char_indices.next() { - write!(f, "{}", c.to_uppercase())?; - if let Some((i, _)) = char_indices.next() { - lowercase(&s[i..], f)?; +fn titlecase(s: &str, f: &mut fmt::Formatter) -> fmt::Result { + // Find the first cased character + if let Some(titlecase_idx) = + s.find(|c| tables::letter_casing(c).is_some() || c.is_lowercase() || c.is_uppercase()) + { + // Everything before the first cased character is passed through unchanged. 
+ f.write_str(&s[..titlecase_idx])?; + + let rem = &s[titlecase_idx..]; + let mut char_indices = rem.char_indices(); + if let Some((_, c)) = char_indices.next() { + write!(f, "{}", tables::to_titlecase(c))?; + if let Some((i, _)) = char_indices.next() { + lowercase(&rem[i..], f)?; + } } + } else { + // If there are no cased characters, pass through the string unchanged + write!(f, "{}", s)?; } Ok(()) diff --git a/src/lower_camel.rs b/src/lower_camel.rs index a31fc33..50d6dac 100644 --- a/src/lower_camel.rs +++ b/src/lower_camel.rs @@ -5,7 +5,7 @@ use alloc::{ string::{String, ToString}, }; -use crate::{capitalize, lowercase, transform}; +use crate::{lowercase, titlecase, transform}; /// This trait defines a lower camel case conversion. /// @@ -53,7 +53,7 @@ impl> fmt::Display for AsLowerCamelCase { first = false; lowercase(s, f) } else { - capitalize(s, f) + titlecase(s, f) } }, |_| Ok(()), diff --git a/src/snake.rs b/src/snake.rs index c3c8576..d7a2c6c 100644 --- a/src/snake.rs +++ b/src/snake.rs @@ -87,14 +87,14 @@ mod tests { t!(test12: "99BOTTLES" => "99bottles"); t!(test13: "FieldNamE11" => "field_nam_e11"); t!(test14: "abc123def456" => "abc123def456"); - t!(test16: "abc123DEF456" => "abc123_def456"); + t!(test16: "abc123DEF456" => "abc123def456"); t!(test17: "abc123Def456" => "abc123_def456"); - t!(test18: "abc123DEf456" => "abc123_d_ef456"); + t!(test18: "abc123DEf456" => "abc123d_ef456"); t!(test19: "ABC123def456" => "abc123def456"); t!(test20: "ABC123DEF456" => "abc123def456"); t!(test21: "ABC123Def456" => "abc123_def456"); t!(test22: "ABC123DEf456" => "abc123d_ef456"); - t!(test23: "ABC123dEEf456FOO" => "abc123d_e_ef456_foo"); + t!(test23: "ABC123dEEf456FOO" => "abc123d_e_ef456foo"); t!(test24: "abcDEF" => "abc_def"); t!(test25: "ABcDE" => "a_bc_de"); } diff --git a/src/tables.rs b/src/tables.rs new file mode 100644 index 0000000..2b21d03 --- /dev/null +++ b/src/tables.rs @@ -0,0 +1,1009 @@ +//! Automatically generated by `tables`. +//! 
Do not edit manually. + +/// The version of Unicode that the data included with this crate is based on. +/// +/// This crate also relies on Unicode data provided by the standard library; +/// that data is versioned according to [`char::UNICODE_VERSION`]. +pub const UNICODE_VERSION: (u8, u8, u8) = (15, 1, 0); + +/// Whether this character can be part of a word. +pub fn allowed_in_word(c: char) -> bool { + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << 6); + let cp: u32 = c.into(); + let top_bits = cp >> 6; + if top_bits < 0x7F0 { + let leaf_idx: u8 = ALLOWED_IN_WORD_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = ALLOWED_IN_WORD_LEAVES[usize::from(leaf_idx)]; + (leaf >> (cp & BOTTOM_BITS_MASK)) & 1 == 1 + } else { + !matches!(cp, 0x0E0001 | 0x0E0020..=0x0E007F) + } +} + +static ALLOWED_IN_WORD_ROOT: [u8; 2032] = [ + 0x01, 0x02, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x06, 0x07, 0x08, + 0x00, 0x00, 0x09, 0x00, 0x00, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00, 0x0F, 0x10, 0x00, 0x00, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x00, 0x16, 0x00, 0x17, 0x00, 0x08, 0x00, 0x18, 0x00, 0x19, 0x00, 0x1A, + 0x00, 0x1B, 0x1C, 0x00, 0x00, 0x1D, 0x00, 0x1E, 0x1F, 0x20, 0x00, 0x00, 0x21, 0x00, 0x22, 0x23, + 0x00, 0x24, 0x25, 0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x27, 0x28, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2A, 0x2B, 0x2C, 0x2D, 0x00, 0x00, 0x2E, + 0x2F, 0x00, 0x00, 0x00, 0x00, 0x30, 0x00, 0x31, 0x25, 0x00, 0x32, 0x00, 0x00, 0x33, 0x00, 0x34, + 0x35, 0x36, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0x39, + 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x43, 0x44, 0x45, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x46, 0x47, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x48, 0x49, 0x41, + 0x00, 0x00, 0x00, 0x4A, 0x00, 0x19, 0x00, 0x00, 0x4B, 0x4C, 0x4D, 0x4E, 0x41, 0x41, 0x41, 
0x4F, + 0x50, 0x00, 0x51, 0x00, 0x00, 0x00, 0x52, 0x53, 0x54, 0x55, 0x56, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x57, 0x58, 0x00, 0x00, 0x00, 0x00, 0x59, 0x5A, 0x00, 0x5B, 0x5C, 0x00, 0x5D, 0x00, + 0x5E, 0x5F, 0x00, 0x60, 0x61, 0x62, 0x00, 0x63, 0x00, 0x64, 0x00, 0x65, 0x00, 0x66, 0x00, 0x67, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x68, 0x00, 0x69, 0x6A, + 0x00, 0x00, 0x00, 0x00, 0x36, 0x6B, 0x00, 0x6C, 0x6D, 0x6E, 0x00, 0x1F, 0x6F, 0x70, 0x00, 0x71, + 0x00, 0x00, 0x00, 0x00, 0x72, 0x73, 0x74, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x76, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x78, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x7B, 0x73, 0x00, 0x7C, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x7E, 0x7F, 0x00, + 0x00, 0x80, 0x35, 0x81, 0x00, 0x82, 0x00, 0x83, 0x84, 0x00, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x87, 0x00, 0x88, 0x89, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x15, 0x1F, 0x8B, 0x8C, 0x00, 0x8D, 0x00, 0x00, 0x00, + 0x00, 0x8E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x90, 0x00, 0x91, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x93, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x95, 0x96, 0x07, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x15, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x57, 0x41, 0x99, + 0x41, 0x41, 0x41, 0x46, 0x9A, 0x9B, 0x9C, 0x9D, 0x41, 0x9E, 0x00, 0x00, 0x41, 0x9F, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xA0, 0xA1, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xA3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0xA4, 0x00, 0xA5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, + 0xA6, 0x41, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0x00, 0x00, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0xAF, 0x41, 0xB0, 0x41, 0xB1, + 0xB2, 0xB3, 0xB4, 0x00, 0x41, 0x41, 0x41, 0x41, 0x41, 0xB5, 0xB6, 0xB7, 0x41, 0x41, 0xB8, 0x44, +]; + +static ALLOWED_IN_WORD_LEAVES: [u64; 185] = [ + 0xFFFFFFFFFFFFFFFF, + 0x03FF000000000000, + 0x07FFFFFE07FFFFFE, + 0x76AC040000000000, + 0xFF7FFFFFFF7FFFFF, + 0x0000501F0003FFC3, + 0xBFDFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFCF, + 0xFFBFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFCFB, + 0xFFFFFFFF03FFFFFF, + 0xBFFFFFFFFFFF19FF, + 0xFFFFFFFFFFFFFFB6, + 0xFFFFFFFF07FF0000, + 0xFFFFC3FFFFFFFFFF, + 0x9FFFFDFF9FEFFFFF, + 0xFFFFFFFFFFFF4000, + 0x3C3FFFFFFFFFFFFF, + 0x8000FFFFFFFFFFFF, + 0xFFFFFFFFBFFFFFFF, + 0xFFFFFFFFFFFCFEFF, + 0xFFFFFFFBFFFFFFFF, + 0xFFFEFFCFFFFFFFFF, + 0xD3F3FFFFFFFFFFFF, + 0xFFFCFFFFFFFFFFFF, + 0xFFFEFFFFFFFFFFFF, + 0xF807FFFFFFFFFFFF, + 0x7F7FFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFEF, + 
0xFDFFFFFFFFFF7FFF, + 0xFFEFFFFFFFFFFFFF, + 0x7FFFFFFFFFFFFFFF, + 0xFFFFFFFFF3FF7FFF, + 0xC2AFFFFF03000801, + 0x3FFFFFFFFFFFFFDF, + 0xFFFFFFFFF8002040, + 0xFFFFFFFFFFFF03FF, + 0xFFFFFFFF3FFFFFFF, + 0xF7FFFFFFFFFFFFFF, + 0xFFFFFE00FFFFFFFF, + 0xFFFFFFFFFC00FFFF, + 0xFFFFFFFFFFFFFFFE, + 0xFFFF9FFFFFFFFFFF, + 0xFFFFFFFFE7FFFFFE, + 0xFFFFC7FFFFFFFFFF, + 0xFF9FFFFFFFFFFFFF, + 0xFFFFFFFFF08FFFFF, + 0xFFFFFFFFFFFFB800, + 0xFFFFFFFFFFFFFFCE, + 0x000000003FFFFFFF, + 0xBFFFC080FFFFFFFF, + 0x800FF80003FFFFFF, + 0x0FFFFFFFFFFFFFFF, + 0x07FFFFFFFFFFFFFF, + 0x3FFFFFFFFFFFFFFF, + 0xFFFFFFFFFFF7FF00, + 0x5FFFFFFFFFFFFFFF, + 0x9FFF1FFF1FFF1FFC, + 0x0000000000003000, + 0x9FFF002000000000, + 0x00000000FFFF9FFF, + 0xFFFFFFE21FFFFFFE, + 0xF3FFFD503F2FFC84, + 0xFFFFFFFFFFFF43E0, + 0x000000000000F3FF, + 0x0000000000000000, + 0x0000000040000084, + 0xFFFFFF8000000000, + 0xFFFFFFFFFFFFF800, + 0xFFC000000FFFFFFF, + 0xFFC0000000000000, + 0x00000000000FFFFF, + 0x0030000000000000, + 0x0000000000400000, + 0x21FFF81FFFFFFFFF, + 0x0000800000000000, + 0xFFFFFFFFC0000000, + 0x0000000004000000, + 0xFFF0000000000000, + 0x0000FFFFFFC00000, + 0x1F3EFFFE000000E0, + 0xFFFFFFFEFFFFFFFF, + 0xFFFFFFFF003CFFFF, + 0xFFFF7FF000000000, + 0x000003FF80000000, + 0x00000000FFFEFF00, + 0xFFFE0000000003FF, + 0x000000000000FFFF, + 0x3FFFFFFFFFFFFF80, + 0xFFFFFFFFFFFF1FFF, + 0xBFF0FFFFFFFFFFFF, + 0xFF03FFFFFFFFFFFF, + 0xFFFFFFFCFF800000, + 0xFFFFFFFFFFFFF9FF, + 0xFC3FF0FFFFFFFFFF, + 0xFF0FFFFFFFFFFFFF, + 0xE8FFFFFFFFFF3FFF, + 0xFFFF3FFFFFFFFFFF, + 0xFFFFFFFF7FFFFFFF, + 0xFFFFFFFF3FFFC001, + 0xFC7FFFFF0FFFFFFF, + 0xFFFCFFFF3FFFFFFF, + 0xFFFFF3FFF7FFFFFF, + 0xFFFFF7FFFFFFFFFF, + 0xFFFFFDFFFFFFFFFF, + 0x0003FFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFF8, + 0xFFFFFFFFFFFF0000, + 0x0FFFFFFFFFFF7FFF, + 0x0000FFFFFC00FFFF, + 0xFFFFF08000080000, + 0x07FFFFFE03FF0001, + 0xFFFFFFE007FFFFFE, + 0xC1FF8080FFFFFFFF, + 0x007FFFFFFFFFFFF8, + 0x01FFFFFFFFFFFFFF, + 0xFFFFFFFEE0008C00, + 0xE00000000000FFFF, + 0xFFFFFFFFFFFEFFFF, + 0xFFFF7FFFFFFFFFFF, 
/// Checks that the set of characters allowed in a word is closed under case
/// conversion: for every character accepted by `allowed_in_word`, every
/// character of its uppercase and of its lowercase mapping must be accepted
/// as well.
#[cfg(test)]
#[test]
fn test_allowed_in_words_casing_closure() {
    ('\0'..=char::MAX)
        .filter(|&ch| allowed_in_word(ch))
        .for_each(|ch| {
            assert!(ch.to_uppercase().all(allowed_in_word));
            assert!(ch.to_lowercase().all(allowed_in_word));
        });
}

/// The kind of case a cased letter has.
///
/// The explicit discriminants are the two-bit codes that `letter_casing`
/// decodes from its lookup table; the remaining code, `0`, is mapped to
/// `None` there ("not a cased letter") and therefore has no variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum CasedLetterKind {
    Lowercase = 1,
    Uppercase = 2,
    Titlecase = 3,
}
+pub fn letter_casing(c: char) -> Option { + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << 6); + let cp: u32 = c.into(); + let top_bits = cp >> 6; + if top_bits < 0x7A6 { + let leaf_idx: u8 = LETTER_CASING_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = LETTER_CASING_LEAVES[usize::from(leaf_idx)]; + match (leaf >> ((cp & BOTTOM_BITS_MASK) * 2)) & 3 { + 0 => None, + 1 => Some(CasedLetterKind::Lowercase), + 2 => Some(CasedLetterKind::Uppercase), + 3 => Some(CasedLetterKind::Titlecase), + _ => unreachable!(), + } + } else { + None + } +} + +/// Whether the character is a non-Greek titlecase letter. +pub fn is_non_greek_titlecase(c: char) -> bool { + matches!(c, '\u{01C5}' | '\u{01C8}' | '\u{01CB}' | '\u{01F2}') +} + +static LETTER_CASING_ROOT: [u8; 1958] = [ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x00, 0x00, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x15, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x17, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x18, 0x00, 0x19, 0x1A, 0x1B, 0x00, 0x1C, 0x1C, 0x1D, 0x1C, 0x1E, 0x1F, 0x20, 0x21, + 0x00, 0x00, 0x00, 0x00, 0x22, 0x23, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0E, 0x25, 0x1C, 0x26, 0x27, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x29, 0x00, 0x2A, 0x2B, 0x2C, 0x2D, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2E, 0x2F, 0x30, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x32, 0x33, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x34, 0x35, 0x36, 0x37, 0x00, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x3A, 0x3B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x15, 0x3C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4E, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x4F, 0x50, +]; + +static LETTER_CASING_LEAVES: [u128; 81] = [ + 0x00000000000000000000000000000000, + 0x0015555555555554002AAAAAAAAAAAA8, + 0x00000400000000000000000000000000, + 0x55551555555555556AAA2AAAAAAAAAAA, + 0x99996666666666666666666666666666, + 0x599A6666666666666666666666659999, + 0x561699A9A659A6669A56A69AA5A9A669, + 0x6666A679666666666599999999E79E00, + 0x69A55566666666666666666666666666, + 0x55555555555555555555555566666A99, + 0x00000000555555555555545555555555, + 0x85406066000000000000000000000000, + 0x5555555555AAAA8AAAAAAAA9A22A2000, + 0xA969865566666666666656A595555555, + 0x55555555AAAAAAAAAAAAAAAAAAAAAAAA, + 0x66666666666666665555555555555555, + 0x66666666666666666666666666600006, + 0x6666666666666666666666665999999A, + 0xAAAAAAA8666666666666666666666666, + 0x555555555555555500002AAAAAAAAAAA, + 0x00000000000000000000000000015555, + 0xAAAAAAAAAAAAAAAA0000000000000000, + 0x54155555555555555555555508008AAA, + 0x05550AAAAAAAAAAAAAAAAAAAAAAAAAAA, + 0xA82AAAAAAAAAAAAAAAAAAAAA00015555, + 0x00000000005555555555555555555555, + 0x55545555554000000000000000000000, + 0x00000000000000000015555555555555, + 0x66666666666666666666666666666666, + 0x66666666666666666555566666666666, + 
0xAAAA5555AAAA55550AAA0555AAAA5555, + 0x05555555AAAA5555888855550AAA0555, + 0x13AA5155FFFF5555FFFF5555FFFF5555, + 0x03AA515002AA555500AA505503AA5150, + 0xA50401AA4AA222000AA8086A5A908020, + 0x00000000000000000000000010055800, + 0x00000000000000000000000000000180, + 0xA0555966A99996A65555555555555555, + 0x00000060198001666666666666666666, + 0x00000000040045555555555555555555, + 0x00000000066666666666666666666666, + 0x00000000000000000066666666666666, + 0x66666665666666600000000000000000, + 0x69995554666666666666666666666666, + 0x666666AA6AA666666666656619806666, + 0x0010180000000000000664460019AA66, + 0x55555555000000000000000000000000, + 0x55555555000155550015555555555555, + 0x55555555555555555555555555555555, + 0x00000000000000000000554000001555, + 0x002AAAAAAAAAAAA80000000000000000, + 0x00000000000000000015555555555554, + 0x555555555555AAAAAAAAAAAAAAAAAAAA, + 0x00000000000000000000000055555555, + 0xAAAAAAAA000000000000000000000000, + 0x0055555555555555555500AAAAAAAAAA, + 0xAA2AAAAA000000000000000000000000, + 0x014555455555554555554A2AAA2AAAAA, + 0x0000002AAAAAAAAAAAAAAAAAAAAAAAAA, + 0x00000015555555555555555555555555, + 0x00000000000000005555555555555555, + 0x5555555555555555AAAAAAAAAAAAAAAA, + 0xAAAAAA5555555555555AAAAAAAAAAAAA, + 0xAAAAAAAAAAAA5555555551555AAAAAAA, + 0x54455AAAA2A82820A25555555555555A, + 0x55555555555AAAAAAAAAAAAA55555455, + 0x2A8A55555555555552AAA2AAA82A8A55, + 0xAAAAAAAAAA55555555555552AAA022AA, + 0x555AAAAAAAAAAAAA5555555555555AAA, + 0x555555555AAAAAAAAAAAAA5555555555, + 0xAA5555555555555AAAAAAAAAAAAA5555, + 0xAAAAAAAA5555555555555AAAAAAAAAAA, + 0xAAAAAAAAAAAA055555555555555AAAAA, + 0x552AAAAAAAAAAAA55515555555555552, + 0x555552AAAAAAAAAAAA55515555555555, + 0x555555552AAAAAAAAAAAA55515555555, + 0x555555555552AAAAAAAAAAAA55515555, + 0x00000000000000000000000000655515, + 0x00000000001554001555555555455555, + 0x555555555555555AAAAAAAAAAAAAAAAA, + 0x00000000000000000000000000000055, +]; + +/// Whether this character is a nonspacing or enclosing 
mark. +pub fn is_nonspacing_mark(c: char) -> bool { + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << 7); + let cp: u32 = c.into(); + let top_bits = cp >> 7; + if top_bits < 0x3D3 { + let leaf_idx: u8 = NONSPACING_MARKS_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = NONSPACING_MARKS_LEAVES[usize::from(leaf_idx)]; + (leaf >> (cp & BOTTOM_BITS_MASK)) & 1 == 1 + } else { + matches!(cp, 0x0E0100..=0x0E01EF) + } +} + +static NONSPACING_MARKS_ROOT: [u8; 979] = [ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x02, 0x00, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x00, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B, 0x1C, + 0x1D, 0x1E, 0x1F, 0x00, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x00, 0x26, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x27, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x28, 0x29, 0x2A, 0x00, 0x00, 0x00, 0x00, + 0x2B, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2D, 0x2E, 0x00, 0x00, + 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x35, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x38, 0x00, 0x39, 0x3A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x3B, 0x3C, 0x00, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x3E, 0x3F, 0x40, + 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x00, 0x48, 0x49, 0x00, 0x4A, 0x4B, 0x4C, 0x4D, 0x00, + 0x4E, 0x00, 0x4F, 0x50, 0x51, 0x52, 0x00, 0x00, 0x53, 0x54, 0x55, 0x56, 0x00, 0x57, 0x58, 0x00, + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x5A, 0x5B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5C, 0x5D, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5F, 0x00, + 0x00, 0x00, 0x60, 0x61, 0x62, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x63, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x65, 0x66, 0x5B, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x69, 0x6A, +]; + +static NONSPACING_MARKS_LEAVES: [u128; 107] = [ + 0x00000000000000000000000000000000, + 0x0000FFFFFFFFFFFFFFFFFFFFFFFFFFFF, + 0x000000000000000000000000000003F8, + 0x00000000000000B6BFFFFFFFFFFE0000, + 0x00010000FFFFF8000000000007FF0000, + 0x00003D9F9FC000000000000000000000, + 0x00000000000007FFFFFF000000020000, + 0x200FF800000000000001FFC000000000, + 0x000000000E00000000003EEFFBC00000, + 0xFFFFFFFBFFFFFC0000000000FF000000, + 0x0000000C00FE21FE1400000000000007, + 0x4000000C0000201E1000000000000002, + 0x00230000000239861000000000000006, + 0xFC00000C000021BE1000000000000006, + 0x0000000C0060201E9000000000000002, + 0x00000000000020010000000000000004, + 0x0000000C00603DC1D000000000000011, + 0x0000000C000030409000000000000002, + 0x0000000C0000201E1800000000000003, + 0x00000000005C04000000000000000002, + 0x0000000000007F8007F2000000000000, + 0x0000000000007F001FF2000000000000, + 0x7FFE00000000000002A0000003000000, + 0x00000000000000401FFFFFFFFEFFE0DF, + 0x001E0001C300000066FDE00000000000, + 0x00000000000000000000000020002064, + 0x00000000E00000000000000000000000, + 0x000C0000000C0000000C0000001C0000, + 0x00000000200FFE403FB0000000000000, + 0x0000000000000000000000000000B800, + 0x00000000000000000000020000000060, + 0x00000000000000000E04018700000000, + 0x9FF81FE57F4000000000000009800000, + 0x0000000000007FFFFFFF000000000000, + 0x000FF8000000000417D000000000000F, + 0x0003A3400000000000003B3C00000003, + 0x000000000000000000CFF00000000000, + 0x031021FDFFF700000000000000000000, + 
0xFFFFFFFFFFFFFFFF0000000000000000, + 0x0001FFFFFFFF00000000000000000000, + 0x00038000000000000000000000000000, + 0x80000000000000000000000000000000, + 0xFFFFFFFF000000000000000000000000, + 0x000000000000000000003C0000000000, + 0x00000000000000000000000006000000, + 0x3FF78000000000000000000000000000, + 0x000300000000000000000000C0000000, + 0x00000000000000000000106000000844, + 0x8003FFFF000000300000000000000000, + 0x000000000003FF8000003FC000000000, + 0x000000200000000033C8000000000007, + 0x100000000000100800667E0000000000, + 0x0040300000000002C19D000000000000, + 0x00002120000000000000000000000000, + 0x00000000000000000000000040000000, + 0x00000000000000000000FFFF0000FFFF, + 0x20000000000000000000000000000000, + 0x00000001000000000000000000000000, + 0x07C00000000000000000000000000000, + 0x0000000000000000870000000000F06E, + 0x00000060000000000000000000000000, + 0x0000000000000000000000F000000000, + 0xE0000000000000000000180000000000, + 0x000000000001FFC00000000000000000, + 0x0000000000000000000000000000003C, + 0x801900000000007FFF00000000000002, + 0x00000000000000040678000000000003, + 0x0008000000000000001FEF8000000007, + 0x0000000000009E007FC0000000000003, + 0x000000000000000240D3800000000000, + 0x000007F8800000000000000000000000, + 0x001F1FC0000000011800000000000003, + 0x000000004000005CFF00000000000000, + 0x000000000000000D85F8000000000000, + 0x0000000030000001B03C000000000000, + 0x0000000000000001A7F8000000000000, + 0x000000000000000000BF280000000000, + 0x000000000000000000000FBCE0000000, + 0x000000000000000006FF800000000000, + 0x00000000000000085800000000000000, + 0x000000010CF000000000000000000000, + 0x000000000E7E008079F80000000007FE, + 0x000000000000000000000000037FFC00, + 0x0000000000000000BF7F000000000000, + 0x0000000000000000006DFCFFFFFC0000, + 0x00000000000000BFB47E000000000000, + 0x00000000000000000000000000A30000, + 0x00180000000000000000000000000000, + 0x000000000000000507C0000000000003, + 0x00000000003FFF810000000000000000, + 
0x001F0000000000000000000000000000, + 0x0000000000000000007F000000000000, + 0x00000000000080000000000000000000, + 0x00000010000000000000000000078000, + 0x00000000000000000000000060000000, + 0x000000000000007FFFFF3FFFFFFFFFFF, + 0xF8000380000000000000000000000000, + 0x000000000000000000003C0000000FE7, + 0x000000000000001C0000000000000000, + 0x00201FFFFFFFFFFFF87FFFFFFFFFFFFF, + 0x00000000000000000000FFFEF8000010, + 0x0000000000000000000007DBF9FFFF7F, + 0x00000000000000000000000000008000, + 0x0000F000000000000000400000000000, + 0x0000F000000000000000000000000000, + 0x00000000007F00000000000000000000, + 0x00000000000007F00000000000000000, +]; + +use core::{ + fmt::{self, Write}, + iter, +}; + +#[derive(Clone, Debug)] +pub enum ToTitlecase { + Zero, + One(char), + Two(char, char), + Three(char, char, char), +} + +impl Iterator for ToTitlecase { + type Item = char; + + fn next(&mut self) -> Option { + match *self { + Self::Zero => None, + Self::One(c) => { + *self = Self::Zero; + Some(c) + } + Self::Two(b, c) => { + *self = Self::One(c); + Some(b) + } + Self::Three(a, b, c) => { + *self = Self::Two(b, c); + Some(a) + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let size = match self { + Self::Zero => 0, + Self::One(_) => 1, + Self::Two(..) => 2, + Self::Three(..) => 3, + }; + (size, Some(size)) + } +} + +impl iter::ExactSizeIterator for ToTitlecase {} + +impl iter::FusedIterator for ToTitlecase {} + +impl fmt::Display for ToTitlecase { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for c in self.clone() { + f.write_char(c)?; + } + Ok(()) + } +} + +/// Returns an iterator that yields the titlecase mapping of this `char` as one or more `char`s. +pub fn to_titlecase(c: char) -> ToTitlecase { + // ASCII fast path + if c.is_ascii() { + ToTitlecase::One(c.to_ascii_uppercase()) + } else if let Ok(idx) = TITLECASE_MAPPINGS.binary_search_by_key(&c, |&(c2, _)| c2) { + match TITLECASE_MAPPINGS[idx].1 { + [None, ..] 
=> ToTitlecase::Zero, + [Some(a), None, ..] => ToTitlecase::One(a), + [Some(a), Some(b), None] => ToTitlecase::Two(a, b), + [Some(a), Some(b), Some(c)] => ToTitlecase::Three(a, b, c), + } + } else { + let mut uppercase = c.to_uppercase(); + match uppercase.size_hint().0 { + 0 => ToTitlecase::Zero, + 1 => ToTitlecase::One(uppercase.next().unwrap()), + 2 => ToTitlecase::Two(uppercase.next().unwrap(), uppercase.next().unwrap()), + 3 => ToTitlecase::Three( + uppercase.next().unwrap(), + uppercase.next().unwrap(), + uppercase.next().unwrap(), + ), + _ => unreachable!(), + } + } +} + +/// Sorted list of characters and their titlecase mappings. +/// Only characters whose titlecase differs from uppercase are included. +static TITLECASE_MAPPINGS: [(char, [Option; 3]); 135] = [ + ('ß', [Some('S'), Some('s'), None]), + ('DŽ', [Some('Dž'), None, None]), + ('Dž', [Some('Dž'), None, None]), + ('dž', [Some('Dž'), None, None]), + ('LJ', [Some('Lj'), None, None]), + ('Lj', [Some('Lj'), None, None]), + ('lj', [Some('Lj'), None, None]), + ('NJ', [Some('Nj'), None, None]), + ('Nj', [Some('Nj'), None, None]), + ('nj', [Some('Nj'), None, None]), + ('DZ', [Some('Dz'), None, None]), + ('Dz', [Some('Dz'), None, None]), + ('dz', [Some('Dz'), None, None]), + ('և', [Some('Ե'), Some('ւ'), None]), + ('ა', [Some('ა'), None, None]), + ('ბ', [Some('ბ'), None, None]), + ('გ', [Some('გ'), None, None]), + ('დ', [Some('დ'), None, None]), + ('ე', [Some('ე'), None, None]), + ('ვ', [Some('ვ'), None, None]), + ('ზ', [Some('ზ'), None, None]), + ('თ', [Some('თ'), None, None]), + ('ი', [Some('ი'), None, None]), + ('კ', [Some('კ'), None, None]), + ('ლ', [Some('ლ'), None, None]), + ('მ', [Some('მ'), None, None]), + ('ნ', [Some('ნ'), None, None]), + ('ო', [Some('ო'), None, None]), + ('პ', [Some('პ'), None, None]), + ('ჟ', [Some('ჟ'), None, None]), + ('რ', [Some('რ'), None, None]), + ('ს', [Some('ს'), None, None]), + ('ტ', [Some('ტ'), None, None]), + ('უ', [Some('უ'), None, None]), + ('ფ', [Some('ფ'), None, 
None]), + ('ქ', [Some('ქ'), None, None]), + ('ღ', [Some('ღ'), None, None]), + ('ყ', [Some('ყ'), None, None]), + ('შ', [Some('შ'), None, None]), + ('ჩ', [Some('ჩ'), None, None]), + ('ც', [Some('ც'), None, None]), + ('ძ', [Some('ძ'), None, None]), + ('წ', [Some('წ'), None, None]), + ('ჭ', [Some('ჭ'), None, None]), + ('ხ', [Some('ხ'), None, None]), + ('ჯ', [Some('ჯ'), None, None]), + ('ჰ', [Some('ჰ'), None, None]), + ('ჱ', [Some('ჱ'), None, None]), + ('ჲ', [Some('ჲ'), None, None]), + ('ჳ', [Some('ჳ'), None, None]), + ('ჴ', [Some('ჴ'), None, None]), + ('ჵ', [Some('ჵ'), None, None]), + ('ჶ', [Some('ჶ'), None, None]), + ('ჷ', [Some('ჷ'), None, None]), + ('ჸ', [Some('ჸ'), None, None]), + ('ჹ', [Some('ჹ'), None, None]), + ('ჺ', [Some('ჺ'), None, None]), + ('ჽ', [Some('ჽ'), None, None]), + ('ჾ', [Some('ჾ'), None, None]), + ('ჿ', [Some('ჿ'), None, None]), + ('ᾀ', [Some('ᾈ'), None, None]), + ('ᾁ', [Some('ᾉ'), None, None]), + ('ᾂ', [Some('ᾊ'), None, None]), + ('ᾃ', [Some('ᾋ'), None, None]), + ('ᾄ', [Some('ᾌ'), None, None]), + ('ᾅ', [Some('ᾍ'), None, None]), + ('ᾆ', [Some('ᾎ'), None, None]), + ('ᾇ', [Some('ᾏ'), None, None]), + ('ᾈ', [Some('ᾈ'), None, None]), + ('ᾉ', [Some('ᾉ'), None, None]), + ('ᾊ', [Some('ᾊ'), None, None]), + ('ᾋ', [Some('ᾋ'), None, None]), + ('ᾌ', [Some('ᾌ'), None, None]), + ('ᾍ', [Some('ᾍ'), None, None]), + ('ᾎ', [Some('ᾎ'), None, None]), + ('ᾏ', [Some('ᾏ'), None, None]), + ('ᾐ', [Some('ᾘ'), None, None]), + ('ᾑ', [Some('ᾙ'), None, None]), + ('ᾒ', [Some('ᾚ'), None, None]), + ('ᾓ', [Some('ᾛ'), None, None]), + ('ᾔ', [Some('ᾜ'), None, None]), + ('ᾕ', [Some('ᾝ'), None, None]), + ('ᾖ', [Some('ᾞ'), None, None]), + ('ᾗ', [Some('ᾟ'), None, None]), + ('ᾘ', [Some('ᾘ'), None, None]), + ('ᾙ', [Some('ᾙ'), None, None]), + ('ᾚ', [Some('ᾚ'), None, None]), + ('ᾛ', [Some('ᾛ'), None, None]), + ('ᾜ', [Some('ᾜ'), None, None]), + ('ᾝ', [Some('ᾝ'), None, None]), + ('ᾞ', [Some('ᾞ'), None, None]), + ('ᾟ', [Some('ᾟ'), None, None]), + ('ᾠ', [Some('ᾨ'), None, None]), + ('ᾡ', [Some('ᾩ'), 
None, None]), + ('ᾢ', [Some('ᾪ'), None, None]), + ('ᾣ', [Some('ᾫ'), None, None]), + ('ᾤ', [Some('ᾬ'), None, None]), + ('ᾥ', [Some('ᾭ'), None, None]), + ('ᾦ', [Some('ᾮ'), None, None]), + ('ᾧ', [Some('ᾯ'), None, None]), + ('ᾨ', [Some('ᾨ'), None, None]), + ('ᾩ', [Some('ᾩ'), None, None]), + ('ᾪ', [Some('ᾪ'), None, None]), + ('ᾫ', [Some('ᾫ'), None, None]), + ('ᾬ', [Some('ᾬ'), None, None]), + ('ᾭ', [Some('ᾭ'), None, None]), + ('ᾮ', [Some('ᾮ'), None, None]), + ('ᾯ', [Some('ᾯ'), None, None]), + ('ᾲ', [Some('Ὰ'), Some('ͅ'), None]), + ('ᾳ', [Some('ᾼ'), None, None]), + ('ᾴ', [Some('Ά'), Some('ͅ'), None]), + ('ᾷ', [Some('Α'), Some('͂'), Some('ͅ')]), + ('ᾼ', [Some('ᾼ'), None, None]), + ('ῂ', [Some('Ὴ'), Some('ͅ'), None]), + ('ῃ', [Some('ῌ'), None, None]), + ('ῄ', [Some('Ή'), Some('ͅ'), None]), + ('ῇ', [Some('Η'), Some('͂'), Some('ͅ')]), + ('ῌ', [Some('ῌ'), None, None]), + ('ῲ', [Some('Ὼ'), Some('ͅ'), None]), + ('ῳ', [Some('ῼ'), None, None]), + ('ῴ', [Some('Ώ'), Some('ͅ'), None]), + ('ῷ', [Some('Ω'), Some('͂'), Some('ͅ')]), + ('ῼ', [Some('ῼ'), None, None]), + ('ff', [Some('F'), Some('f'), None]), + ('fi', [Some('F'), Some('i'), None]), + ('fl', [Some('F'), Some('l'), None]), + ('ffi', [Some('F'), Some('f'), Some('i')]), + ('ffl', [Some('F'), Some('f'), Some('l')]), + ('ſt', [Some('S'), Some('t'), None]), + ('st', [Some('S'), Some('t'), None]), + ('ﬓ', [Some('Մ'), Some('ն'), None]), + ('ﬔ', [Some('Մ'), Some('ե'), None]), + ('ﬕ', [Some('Մ'), Some('ի'), None]), + ('ﬖ', [Some('Վ'), Some('ն'), None]), + ('ﬗ', [Some('Մ'), Some('խ'), None]), +]; diff --git a/src/title.rs b/src/title.rs index 2453430..cc22971 100644 --- a/src/title.rs +++ b/src/title.rs @@ -5,7 +5,7 @@ use alloc::{ string::{String, ToString}, }; -use crate::{capitalize, transform}; +use crate::{titlecase, transform}; /// This trait defines a title case conversion. 
/// @@ -45,7 +45,7 @@ pub struct AsTitleCase>(pub T); impl> fmt::Display for AsTitleCase { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - transform(self.0.as_ref(), capitalize, |f| write!(f, " "), f) + transform(self.0.as_ref(), titlecase, |f| write!(f, " "), f) } } diff --git a/src/train.rs b/src/train.rs index 4fcc195..cc2f873 100644 --- a/src/train.rs +++ b/src/train.rs @@ -2,7 +2,7 @@ use core::fmt; use alloc::{borrow::ToOwned, string::ToString}; -use crate::{capitalize, transform}; +use crate::{titlecase, transform}; /// This trait defines a train case conversion. /// @@ -42,7 +42,7 @@ pub struct AsTrainCase>(pub T); impl> fmt::Display for AsTrainCase { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - transform(self.0.as_ref(), capitalize, |f| write!(f, "-"), f) + transform(self.0.as_ref(), titlecase, |f| write!(f, "-"), f) } } @@ -67,21 +67,50 @@ mod tests { t!(test6: "SHOUTY_SNAKE_CASE" => "Shouty-Snake-Case"); t!(test7: "snake_case" => "Snake-Case"); t!(test8: "this-contains_ ALLKinds OfWord_Boundaries" => "This-Contains-All-Kinds-Of-Word-Boundaries"); - #[cfg(feature = "unicode")] t!(test9: "XΣXΣ baffle" => "Xσxς-Baffle"); t!(test10: "XMLHttpRequest" => "Xml-Http-Request"); t!(test11: "FIELD_NAME11" => "Field-Name11"); - t!(test12: "99BOTTLES" => "99bottles"); + t!(test12: "99BOTTLES" => "99Bottles"); t!(test13: "FieldNamE11" => "Field-Nam-E11"); t!(test14: "abc123def456" => "Abc123def456"); - t!(test16: "abc123DEF456" => "Abc123-Def456"); + t!(test16: "abc123DEF456" => "Abc123def456"); t!(test17: "abc123Def456" => "Abc123-Def456"); - t!(test18: "abc123DEf456" => "Abc123-D-Ef456"); + t!(test18: "abc123DEf456" => "Abc123d-Ef456"); t!(test19: "ABC123def456" => "Abc123def456"); t!(test20: "ABC123DEF456" => "Abc123def456"); t!(test21: "ABC123Def456" => "Abc123-Def456"); t!(test22: "ABC123DEf456" => "Abc123d-Ef456"); - t!(test23: "ABC123dEEf456FOO" => "Abc123d-E-Ef456-Foo"); + t!(test23: "ABC123dEEf456FOO" => "Abc123d-E-Ef456foo"); t!(test24: 
"abcDEF" => "Abc-Def"); t!(test25: "ABcDE" => "A-Bc-De"); + t!(test26: "DŽO" => "Džo"); + t!(test27: "džO" => "Dž-O"); + t!(test28: "džo" => "Džo"); + t!(test29: "∇𝐀" => "∇𝐀"); + t!(test30: "∇𝔞" => "∇𝔞"); + t!(test31: "𝔞" => "𝔞"); + t!(test32: "🐈‍⬛🐈" => "\u{200d}"); + t!(test33: "🐈‍⬛🐈a" => "\u{200d}-A"); + t!(test34: "A🐈‍⬛🐈a" => "A-\u{200D}-A"); + t!(test35: "☕" => ""); + t!(test36: "a*️⃣b" => "A-\u{fe0f}-B"); + t!(test37: "a*b" => "A-B"); + t!(test38: "\u{0301}a" => "\u{0301}A"); + t!(test39: "a\u{0301}B" => "A\u{0301}-B"); + t!(test40: "fflololo" => "Fflololo"); + + t!(uts55_test1: "TypeII" => "Type-Ii"); + t!(uts55_test2: "OCaml" => "O-Caml"); + t!(uts55_test3: "HTTPЗапрос" => "Http-Запрос"); + t!(uts55_test4: "UAX9ClauseHL4" => "Uax9-Clause-Hl4"); + t!(uts55_test5: "LOUD_SNAKE" => "Loud-Snake"); + + t!(uts55_test6: "Fancy_Snake" => "Fancy-Snake"); + t!(uts55_test7: "snake-kebab" => "Snake-Kebab"); + t!(uts55_test8: "Paral·lel" => "Paral·lel"); + t!(uts55_test9: "microB" => "Micro-B"); + t!(uts55_test10: "microᖯ" => "Microᖯ"); + t!(uts55_test11: "HTTPसर्वर" => "Httpसर्वर"); + t!(uts55_test12: "dromedaryCamel" => "Dromedary-Camel"); + t!(uts55_test13: "snakeELEPHANTSnake" => "Snake-Elephant-Snake"); } diff --git a/src/upper_camel.rs b/src/upper_camel.rs index c6f29df..f8980f9 100644 --- a/src/upper_camel.rs +++ b/src/upper_camel.rs @@ -5,7 +5,7 @@ use alloc::{ string::{String, ToString}, }; -use crate::{capitalize, transform}; +use crate::{titlecase, transform}; /// This trait defines an upper camel case conversion. 
/// @@ -58,7 +58,7 @@ pub struct AsUpperCamelCase>(pub T); impl> fmt::Display for AsUpperCamelCase { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - transform(self.0.as_ref(), capitalize, |_| Ok(()), f) + transform(self.0.as_ref(), titlecase, |_| Ok(()), f) } } diff --git a/tables/Cargo.toml b/tables/Cargo.toml new file mode 100644 index 0000000..8897a01 --- /dev/null +++ b/tables/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "tables" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bitvec = "1.0.1" +regex = "1.10.3" +reqwest = { version = "0.11", features = ["blocking"] } +rustc-hash = "1.1.0" diff --git a/tables/src/allowed_in_word.rs b/tables/src/allowed_in_word.rs new file mode 100644 index 0000000..e89c420 --- /dev/null +++ b/tables/src/allowed_in_word.rs @@ -0,0 +1,221 @@ +//! Construct a lookup table to find whether a particular character is allowed in words. + +use std::{ + any::type_name, + collections::hash_map, + error::Error, + io::{self, Read}, + mem::size_of, +}; + +use bitvec::prelude::*; +use rustc_hash::FxHashMap; + +use crate::unicode_data::{set_by_general_category, set_by_prop, CodepointBitArr, DataFiles}; + +/// Change this to u128 for wider leaves +type LeafElement = u64; + +const ENTRIES_PER_LEAF: usize = LeafElement::BITS as usize; + +/// `true` for all punctuation other than `Other_Punctuation` +/// (`[\p{Punctuation}-\p{Other_Punctuation}]`) +fn punctuation_non_other(data: &DataFiles) -> CodepointBitArr { + let mut arr = BitArray::ZERO; + set_by_general_category(&mut arr, data, "Pc|Pd|Ps|Pe|Pi|Pf", true); + arr +} + +/// `true` for all unassigned and private use characters +fn unassigned_private_use(data: &DataFiles) -> CodepointBitArr { + let mut arr = BitArray::ZERO; + set_by_general_category(&mut arr, data, "[A-Za-z]+", true); + set_by_general_category(&mut arr, data, "Cn|Co", false); + !arr +} + +/// `true` for all 
codepoints that can be part of a word: +/// `[\p{ID_Continue}\p{ID_Compat_Math_Continue}\p{Cn}\p{Co}\p{Alphabetic}\p{N}-[\p{P}-\p{Po}]]`, +/// plus the extra characters listed below. +pub fn allowed_in_word(data: &DataFiles) -> CodepointBitArr { + let mut word_component = unassigned_private_use(data); + + set_by_prop( + &mut word_component, + &data.derived_core_properties, + "ID_Continue|Alphabetic", + true, + ); + + set_by_general_category(&mut word_component, data, "Nd|Nl|No", true); + + set_by_prop( + &mut word_component, + &data.prop_list, + "ID_Compat_Math_Continue", + true, + ); + + // Choose from characters in https://www.unicode.org/reports/tr31/#Specific_Character_Adjustments + // that are not Punctuation other than Other_Punctuation + // (U+00B7 is already in ID_Continue). + for cp in [ + 0x05F3, // HEBREW PUNCTUATION GERESH https://en.wikipedia.org/wiki/Geresh + 0x05F4, // HEBREW PUNCTUATION GERSHAYIM https://en.wikipedia.org/wiki/Gershayim + 0x0F0B, // TIBETAN MARK INTERSYLLABIC TSHEG https://w3c.github.io/tlreq/#language_overview + ] { + word_component.set(cp, true); + } + + word_component &= !punctuation_non_other(data); + + word_component +} + +fn build_tree(allowed_in_word: &BitSlice) -> (Vec, Vec) { + let mut chunk_to_leaf_idx_map: FxHashMap = FxHashMap::from_iter([(!0, 0)]); + let mut root = Vec::with_capacity(allowed_in_word.len().div_ceil(ENTRIES_PER_LEAF)); + let mut leaves = vec![!0]; + let chunks_iter = allowed_in_word.chunks_exact(ENTRIES_PER_LEAF); + assert!(chunks_iter.remainder().is_empty()); + let mut chunks_iter = chunks_iter.map(|l| { + LeafElement::from_le_bytes( + l.bytes() + .collect::, _>>() + .unwrap() + .try_into() + .unwrap(), + ) + }); + for chunk in &mut chunks_iter { + match chunk_to_leaf_idx_map.entry(chunk) { + hash_map::Entry::Occupied(o) => { + root.push(*o.get()); + } + hash_map::Entry::Vacant(v) => { + let new_index = u8::try_from(leaves.len()).expect("too many leaves"); + v.insert(new_index); + root.push(new_index); + 
leaves.push(chunk); + } + } + } + (root, leaves) +} + +fn list_of_ranges(cps: impl Iterator) -> Vec<(u32, u32)> { + let mut vec = Vec::new(); + for cp in cps { + if let Some((_, prev)) = vec.last_mut() { + if *prev + 1 == cp { + *prev = cp; + continue; + } + } + vec.push((cp, cp)) + } + vec +} + +pub fn write_table( + out: &mut impl io::Write, + allowed_in_word: &CodepointBitArr, +) -> Result<(), Box> { + let bits_to_shift = ENTRIES_PER_LEAF.ilog2(); + + let first_cp_not_in_tree = + (allowed_in_word[..0x40000].last_zero().unwrap() + 1).next_multiple_of(ENTRIES_PER_LEAF); + + let first_cp_not_in_tree_shifted = first_cp_not_in_tree >> bits_to_shift; + + writeln!( + out, + "/// Whether this character can be part of a word. +pub fn allowed_in_word(c: char) -> bool {{ + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << {bits_to_shift}); + let cp: u32 = c.into(); + let top_bits = cp >> {bits_to_shift}; + if top_bits < 0x{first_cp_not_in_tree_shifted:X} {{ + let leaf_idx: u8 = ALLOWED_IN_WORD_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = ALLOWED_IN_WORD_LEAVES[usize::from(leaf_idx)]; + (leaf >> (cp & BOTTOM_BITS_MASK)) & 1 == 1 + }} else {{" + )?; + + let mut late_zeros = list_of_ranges( + allowed_in_word[first_cp_not_in_tree..] 
+ .iter_zeros() + .map(|n| u32::try_from(n + first_cp_not_in_tree).unwrap()), + ) + .into_iter(); + if let Some(first_late_zero) = late_zeros.next() { + write!(out, " !matches!(cp, 0x{:06X}", first_late_zero.0)?; + if first_late_zero.0 != first_late_zero.1 { + write!(out, "..=0x{:06X}", first_late_zero.1)?; + } + for late_zero in late_zeros { + write!(out, " | 0x{:06X}", late_zero.0)?; + if late_zero.0 != late_zero.1 { + write!(out, "..=0x{:06X}", late_zero.1)?; + } + } + writeln!(out, ")")?; + } else { + writeln!(out, "true")?; + } + + writeln!( + out, + " }} +}}", + )?; + + let (root, leaves) = build_tree(&allowed_in_word[..first_cp_not_in_tree]); + eprintln!( + "allowed_in_words: {} bytes of static data", + root.len() + leaves.len() * size_of::() + ); + + write!( + out, + "\nstatic ALLOWED_IN_WORD_ROOT: [u8; {}] = [", + root.len() + )?; + + for line in root.chunks(16) { + write!(out, "\n ")?; + for byte in line { + write!(out, " 0x{byte:02X},")?; + } + } + + writeln!( + out, + "\n]; + +static ALLOWED_IN_WORD_LEAVES: [{}; {}] = [", + type_name::(), + leaves.len() + )?; + + for leaf in leaves { + writeln!(out, " 0x{leaf:016X},")?; + } + writeln!( + out, + "]; + +#[cfg(test)] +#[test] +fn test_allowed_in_words_casing_closure() {{ + for c in '\\0'..=char::MAX {{ + if allowed_in_word(c) {{ + assert!(c.to_uppercase().all(allowed_in_word)); + assert!(c.to_lowercase().all(allowed_in_word)); + }} + }} +}}" + )?; + + Ok(()) +} diff --git a/tables/src/letter_casing.rs b/tables/src/letter_casing.rs new file mode 100644 index 0000000..13a3409 --- /dev/null +++ b/tables/src/letter_casing.rs @@ -0,0 +1,199 @@ +//! Construct a lookup table for the casing status of a letter +//! (lowercase, uppercase, or titlecase). +//! +//! This table only concerns itself with letters: +//! for obtaining the case of characters which are not letters, +//! use the functions from `core`. 
+ +use std::{any::type_name, collections::hash_map, error::Error, io, mem::size_of}; + +use bitvec::prelude::*; +use rustc_hash::FxHashMap; + +use crate::unicode_data::{set_by_general_category, set_by_prop, CodepointBitArr, DataFiles}; + +/// Change this to u64 for smaller leaves +type LeafElement = u128; + +const ENTRIES_PER_LEAF: usize = LeafElement::BITS as usize / 2; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum CasedLetterKind { + Lowercase = 1, + Uppercase = 2, + Titlecase = 3, +} + +pub fn letter_casing( + data: &DataFiles, + word_components: &BitSlice, +) -> (Vec>, Vec) { + let mut lowercase = CodepointBitArr::ZERO; + set_by_general_category(&mut lowercase, data, "Ll", true); + lowercase &= word_components; + let mut uppercase = CodepointBitArr::ZERO; + set_by_general_category(&mut uppercase, data, "Lu", true); + uppercase &= word_components; + let mut titlecase = CodepointBitArr::ZERO; + set_by_general_category(&mut titlecase, data, "Lt", true); + titlecase &= word_components; + + let last = [ + lowercase.last_one(), + uppercase.last_one(), + titlecase.last_one(), + ] + .into_iter() + .max() + .flatten() + .unwrap(); + + let mut casing_vec = vec![None; last + 1]; + for cp in lowercase.iter_ones() { + casing_vec[cp] = Some(CasedLetterKind::Lowercase); + } + for cp in uppercase.iter_ones() { + casing_vec[cp] = Some(CasedLetterKind::Uppercase); + } + for cp in titlecase.iter_ones() { + casing_vec[cp] = Some(CasedLetterKind::Titlecase); + } + + set_by_prop(&mut titlecase, &data.scripts, "Greek", false); + + ( + casing_vec, + titlecase + .iter_ones() + .map(|cp| u32::try_from(cp).unwrap()) + .collect(), + ) +} + +fn build_casing_tree(casings_list: &[Option]) -> (Vec, Vec) { + let mut chunk_to_leaf_idx_map: FxHashMap = FxHashMap::from_iter([(!0, 0)]); + let mut root = Vec::with_capacity(casings_list.len().div_ceil(ENTRIES_PER_LEAF)); + let mut leaves = Vec::new(); + let chunks_iter = casings_list.chunks_exact(ENTRIES_PER_LEAF); + 
assert!(chunks_iter.remainder().is_empty()); + let mut chunks_iter = chunks_iter.map(|c| { + let mut chunk_uint: LeafElement = 0; + for (index, elem) in c.iter().copied().enumerate() { + let bits = elem.map_or(0, |k| k as u8); + chunk_uint |= LeafElement::from(bits) << (index * 2); + } + chunk_uint + }); + for chunk in &mut chunks_iter { + match chunk_to_leaf_idx_map.entry(chunk) { + hash_map::Entry::Occupied(o) => { + root.push(*o.get()); + } + hash_map::Entry::Vacant(v) => { + let new_index = u8::try_from(leaves.len()).expect("too many leaves"); + v.insert(new_index); + root.push(new_index); + leaves.push(chunk); + } + } + } + (root, leaves) +} + +pub fn write_table( + out: &mut impl io::Write, + data: &DataFiles, + allowed_in_word: &CodepointBitArr, +) -> Result<(), Box> { + let (mut casing_vec, non_greek) = letter_casing(data, allowed_in_word); + + let bits_to_shift = ENTRIES_PER_LEAF.ilog2(); + + let first_cp_not_in_tree = casing_vec.len().next_multiple_of(ENTRIES_PER_LEAF); + for _ in casing_vec.len()..first_cp_not_in_tree { + casing_vec.push(None); + } + + let first_cp_not_in_tree_shifted = first_cp_not_in_tree >> bits_to_shift; + + let mut non_greek = non_greek.into_iter(); + + write!( + out, + " +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum CasedLetterKind {{ + Lowercase = 1, + Uppercase = 2, + Titlecase = 3, +}} + +/// The case of this letter, or `None` if it is not a cased letter. 
+pub fn letter_casing(c: char) -> Option {{ + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << {bits_to_shift}); + let cp: u32 = c.into(); + let top_bits = cp >> {bits_to_shift}; + if top_bits < 0x{first_cp_not_in_tree_shifted:X} {{ + let leaf_idx: u8 = LETTER_CASING_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = LETTER_CASING_LEAVES[usize::from(leaf_idx)]; + match (leaf >> ((cp & BOTTOM_BITS_MASK) * 2)) & 3 {{ + 0 => None, + 1 => Some(CasedLetterKind::Lowercase), + 2 => Some(CasedLetterKind::Uppercase), + 3 => Some(CasedLetterKind::Titlecase), + _ => unreachable!(), + }} + }} else {{ + None + }} +}} + +/// Whether the character is a non-Greek titlecase letter. +pub fn is_non_greek_titlecase(c: char) -> bool {{ + matches!(c, '\\u{{{:04X}}}'", + non_greek.next().unwrap() + )?; + + for cp in non_greek { + write!(out, " | '\\u{{{cp:04X}}}'")?; + } + + writeln!( + out, + ") +}} +" + )?; + + let (root, leaves) = build_casing_tree(&casing_vec); + eprintln!( + "letter_casing: {} bytes of static data", + root.len() + leaves.len() * size_of::() + ); + + write!(out, "static LETTER_CASING_ROOT: [u8; {}] = [", root.len())?; + + for line in root.chunks(16) { + write!(out, "\n ")?; + for byte in line { + write!(out, " 0x{byte:02X},")?; + } + } + + writeln!( + out, + "\n]; + +static LETTER_CASING_LEAVES: [{}; {}] = [", + type_name::(), + leaves.len() + )?; + + for leaf in leaves { + writeln!(out, " 0x{leaf:032X},")?; + } + writeln!(out, "];")?; + Ok(()) +} diff --git a/tables/src/main.rs b/tables/src/main.rs new file mode 100644 index 0000000..4dd6fea --- /dev/null +++ b/tables/src/main.rs @@ -0,0 +1,51 @@ +use std::{ + fs::OpenOptions, + io::{BufWriter, Write}, +}; + +mod allowed_in_word; +mod letter_casing; +mod nonspacing_marks; +mod titlecase; +mod unicode_data; + +use unicode_data::data_files; + +/// Update this on new Unicode releases +const UNICODE_VERSION: (u8, u8, u8) = (15, 1, 0); + +fn main() -> Result<(), Box> { + let data = data_files()?; + + let mut out = 
BufWriter::new( + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open("../src/tables.rs")?, + ); + + writeln!( + &mut out, + "//! Automatically generated by `tables`. +//! Do not edit manually. + +/// The version of Unicode that the data included with this crate is based on. +/// +/// This crate also relies on Unicode data provided by the standard library; +/// that data is versioned according to [`char::UNICODE_VERSION`]. +pub const UNICODE_VERSION: (u8, u8, u8) = {UNICODE_VERSION:?}; +" + )?; + + let allowed_in_word = allowed_in_word::allowed_in_word(&data); + allowed_in_word::write_table(&mut out, &allowed_in_word)?; + + letter_casing::write_table(&mut out, &data, &allowed_in_word)?; + + nonspacing_marks::write_table(&mut out, &data)?; + + titlecase::write_table(&mut out, &data)?; + + Ok(()) +} diff --git a/tables/src/nonspacing_marks.rs b/tables/src/nonspacing_marks.rs new file mode 100644 index 0000000..5f97bb2 --- /dev/null +++ b/tables/src/nonspacing_marks.rs @@ -0,0 +1,172 @@ +//! Construct a lookup table to find whether a particular character is a nonspacing mark +//! (general category `Nonspacing_Mark` or `Enclosing_Mark`). +//! These characters are ignored when determining word boundaries. 
+ +use std::{ + any::type_name, + collections::hash_map, + error::Error, + io::{self, Read}, + mem::size_of, +}; + +use bitvec::prelude::*; +use rustc_hash::FxHashMap; + +use crate::unicode_data::{set_by_general_category, CodepointBitArr, DataFiles}; + +type LeafElement = u128; +const ENTRIES_PER_LEAF: usize = LeafElement::BITS as usize; + +fn nonspacing_marks(data: &DataFiles) -> CodepointBitArr { + let mut arr = BitArray::ZERO; + set_by_general_category(&mut arr, data, "Mn|Me", true); + + arr +} + +fn build_tree(nonspacing_marks: &BitSlice) -> (Vec, Vec) { + let mut chunk_to_leaf_idx_map: FxHashMap = FxHashMap::from_iter([(0, 0)]); + let mut root = Vec::with_capacity(nonspacing_marks.len().div_ceil(ENTRIES_PER_LEAF)); + let mut leaves = vec![0]; + let chunks_iter = nonspacing_marks.chunks_exact(ENTRIES_PER_LEAF); + assert!(chunks_iter.remainder().is_empty()); + let mut chunks_iter = chunks_iter.map(|l| { + LeafElement::from_le_bytes( + l.bytes() + .collect::, _>>() + .unwrap() + .try_into() + .unwrap(), + ) + }); + for chunk in &mut chunks_iter { + match chunk_to_leaf_idx_map.entry(chunk) { + hash_map::Entry::Occupied(o) => { + root.push(*o.get()); + } + hash_map::Entry::Vacant(v) => { + let new_index = u8::try_from(leaves.len()).expect("too many leaves"); + v.insert(new_index); + root.push(new_index); + leaves.push(chunk); + } + } + } + (root, leaves) +} + +fn list_of_ranges(nonspacing_marks: &BitSlice, add: usize) -> Vec<(u32, u32)> { + let mut vec = Vec::new(); + for i in nonspacing_marks.iter_ones() { + let cp = u32::try_from(i + add).unwrap(); + if let Some((_, prev)) = vec.last_mut() { + if *prev + 1 == cp { + *prev = cp; + continue; + } + } + vec.push((cp, cp)) + } + vec +} + +pub fn write_table(out: &mut impl io::Write, data: &DataFiles) -> Result<(), Box> { + let marks = nonspacing_marks(data); + + let bits_to_shift = ENTRIES_PER_LEAF.ilog2(); + + let first_cp_not_in_tree = + (marks[..0x40000].last_one().unwrap() + 1).next_multiple_of(ENTRIES_PER_LEAF); 
+ + /*for i in 3..10 { + let entries_per_leaf: usize = 1 << i; + let bytes_per_leaf = entries_per_leaf / 8; + let first_cp_not_in_tree = + (marks[..0x40000].last_one().unwrap() + 1).next_multiple_of(entries_per_leaf); + let leaves = marks[..first_cp_not_in_tree] + .chunks_exact(entries_per_leaf) + .collect::>(); + dbg!(( + bytes_per_leaf, + leaves.len(), + leaves.len() * bytes_per_leaf + first_cp_not_in_tree / entries_per_leaf + )); + }*/ + + let first_cp_not_in_tree_shifted = first_cp_not_in_tree >> bits_to_shift; + + writeln!( + out, + " +/// Whether this character is a nonspacing or enclosing mark. +pub fn is_nonspacing_mark(c: char) -> bool {{ + const BOTTOM_BITS_MASK: u32 = !((!0_u32) << {bits_to_shift}); + let cp: u32 = c.into(); + let top_bits = cp >> {bits_to_shift}; + if top_bits < 0x{first_cp_not_in_tree_shifted:X} {{ + let leaf_idx: u8 = NONSPACING_MARKS_ROOT[usize::try_from(top_bits).unwrap()]; + let leaf = NONSPACING_MARKS_LEAVES[usize::from(leaf_idx)]; + (leaf >> (cp & BOTTOM_BITS_MASK)) & 1 == 1 + }} else {{" + )?; + + let mut late_marks = + list_of_ranges(&marks[first_cp_not_in_tree..], first_cp_not_in_tree).into_iter(); + + if let Some(first_late_mark) = late_marks.next() { + write!( + out, + " matches!(cp, 0x{:06X}..=0x{:06X}", + first_late_mark.0, first_late_mark.1 + )?; + for late_mark in late_marks { + write!(out, " | 0x{:06X}..=0x{:06X}", late_mark.0, late_mark.1)?; + } + writeln!(out, ")")?; + } else { + writeln!(out, "false")?; + } + + writeln!( + out, + " }} +}} +", + )?; + + let (root, leaves) = build_tree(&marks[..first_cp_not_in_tree]); + eprintln!( + "nonspacing_marks: {} bytes of static data", + root.len() + leaves.len() * size_of::() + ); + + write!( + out, + "static NONSPACING_MARKS_ROOT: [u8; {}] = [", + root.len() + )?; + + for line in root.chunks(16) { + write!(out, "\n ")?; + for byte in line { + write!(out, " 0x{byte:02X},")?; + } + } + + writeln!( + out, + "\n]; + +static NONSPACING_MARKS_LEAVES: [{}; {}] = [", + type_name::(), 
+ leaves.len() + )?; + + for leaf in leaves { + writeln!(out, " 0x{leaf:032X},")?; + } + writeln!(out, "];")?; + + Ok(()) +} diff --git a/tables/src/titlecase.rs b/tables/src/titlecase.rs new file mode 100644 index 0000000..2e7bbcd --- /dev/null +++ b/tables/src/titlecase.rs @@ -0,0 +1,189 @@ +//! Construct table for titlecase character mappings. +//! Only characters whose titlecase differs from their uppercase are included. + +use std::{error::Error, io, mem::size_of}; + +use regex::Regex; +use rustc_hash::FxHashMap; + +use crate::unicode_data::DataFiles; + +fn titlecases(data: &DataFiles) -> Vec<(char, Vec)> { + let mut map = FxHashMap::default(); + + // Single character mappings + let regex = Regex::new( + r"^([0-9A-F]+);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);(?:.*?);([0-9A-F]*);(?:.*?);([0-9A-F]+)", + ).unwrap(); + for line in data.unicode_data.lines() { + if let Some(captures) = regex.captures(line) { + if let Some(titlecase) = captures.get(3) { + // Only include if different from uppercase + if titlecase.as_str() != &captures[2] { + let cp = + char::from_u32(u32::from_str_radix(&captures[1], 16).unwrap()).unwrap(); + let titlecase_cp = + char::from_u32(u32::from_str_radix(titlecase.as_str(), 16).unwrap()) + .unwrap(); + assert!(!map.contains_key(&cp)); + map.insert(cp, vec![titlecase_cp]); + } + } + } + } + + // Multi character mappings + let regex = + Regex::new(r"^([0-9A-F]+);(?:[0-9A-F ]*);([0-9A-F ]*);([0-9A-F ]*);[^0-9A-Fa-f_]*#") + .unwrap(); + for line in data.special_casing.lines() { + if let Some(captures) = regex.captures(line) { + let titlecase_mapping = captures[2].trim(); + let uppercase_mapping = captures[3].trim(); + if titlecase_mapping != uppercase_mapping { + let cp = char::from_u32(u32::from_str_radix(&captures[1], 16).unwrap()).unwrap(); + assert!(!map.contains_key(&cp)); + map.insert( + cp, + titlecase_mapping + .split_whitespace() + .map(|s| char::from_u32(u32::from_str_radix(s, 
16).unwrap()).unwrap()) + .collect(), + ); + } + } + } + + let mut vec: Vec<(char, Vec)> = map.into_iter().collect(); + vec.sort_unstable_by_key(|(c, _)| *c); + vec +} + +pub fn write_table(out: &mut impl io::Write, data: &DataFiles) -> Result<(), Box> { + let titlecase_mappings = titlecases(data); + let max_expansion = titlecase_mappings.iter().map(|t| t.1.len()).max().unwrap(); + + eprintln!( + "titlecase: {} bytes of static data", + (max_expansion + 1) * size_of::() * titlecase_mappings.len() + ); + + writeln!( + out, + " +use core::{{ + fmt::{{self, Write}}, + iter, +}}; + +#[derive(Clone, Debug)] +pub enum ToTitlecase {{ + Zero, + One(char), + Two(char, char), + Three(char, char, char), +}} + +impl Iterator for ToTitlecase {{ + type Item = char; + + fn next(&mut self) -> Option {{ + match *self {{ + Self::Zero => None, + Self::One(c) => {{ + *self = Self::Zero; + Some(c) + }} + Self::Two(b, c) => {{ + *self = Self::One(c); + Some(b) + }} + Self::Three(a, b, c) => {{ + *self = Self::Two(b, c); + Some(a) + }} + }} + }} + + fn size_hint(&self) -> (usize, Option) {{ + let size = match self {{ + Self::Zero => 0, + Self::One(_) => 1, + Self::Two(..) => 2, + Self::Three(..) => 3, + }}; + (size, Some(size)) + }} +}} + +impl iter::ExactSizeIterator for ToTitlecase {{}} + +impl iter::FusedIterator for ToTitlecase {{}} + +impl fmt::Display for ToTitlecase {{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {{ + for c in self.clone() {{ + f.write_char(c)?; + }} + Ok(()) + }} +}} + +/// Returns an iterator that yields the titlecase mapping of this `char` as one or more `char`s. +pub fn to_titlecase(c: char) -> ToTitlecase {{ + // ASCII fast path + if c.is_ascii() {{ + ToTitlecase::One(c.to_ascii_uppercase()) + }} else if let Ok(idx) = TITLECASE_MAPPINGS.binary_search_by_key(&c, |&(c2, _)| c2) {{ + match TITLECASE_MAPPINGS[idx].1 {{ + [None, ..] => ToTitlecase::Zero, + [Some(a), None, ..] 
=> ToTitlecase::One(a), + [Some(a), Some(b), None] => ToTitlecase::Two(a, b), + [Some(a), Some(b), Some(c)] => ToTitlecase::Three(a, b, c), + }} + }} else {{ + let mut uppercase = c.to_uppercase(); + match uppercase.size_hint().0 {{ + 0 => ToTitlecase::Zero, + 1 => ToTitlecase::One(uppercase.next().unwrap()), + 2 => ToTitlecase::Two(uppercase.next().unwrap(), uppercase.next().unwrap()), + 3 => ToTitlecase::Three( + uppercase.next().unwrap(), + uppercase.next().unwrap(), + uppercase.next().unwrap(), + ), + _ => unreachable!(), + }} + }} +}} + +/// Sorted list of characters and their titlecase mappings. +/// Only characters whose titlecase differs from uppercase are included. +static TITLECASE_MAPPINGS: [(char, [Option; {max_expansion}]); {}] = [", + titlecase_mappings.len() + )?; + for (c, mapping) in titlecase_mappings { + write!(out, " ('{c}', [")?; + + let mut mapping = mapping.into_iter(); + + if let Some(fc) = mapping.next() { + write!(out, "Some('{fc}')")?; + } else { + write!(out, "None")?; + } + + for _ in 1..max_expansion { + if let Some(c) = mapping.next() { + write!(out, ", Some('{c}')")?; + } else { + write!(out, ", None")?; + } + } + + writeln!(out, "]),")?; + } + writeln!(out, "];")?; + + Ok(()) +} diff --git a/tables/src/unicode_data.rs b/tables/src/unicode_data.rs new file mode 100644 index 0000000..b4f9cfb --- /dev/null +++ b/tables/src/unicode_data.rs @@ -0,0 +1,95 @@ +use bitvec::BitArr; +use regex::Regex; + +use crate::UNICODE_VERSION; + +pub type CodepointBitArr = BitArr!(for 0x110000); + +/// Download the specified Unicode data file from the Unicode website, +/// using the version specified in [`UNICODE_VERSION`]. +fn fetch_unicode_file(file: &str) -> Result> { + Ok(reqwest::blocking::get(format!( + "https://www.unicode.org/Public/{}.{}.{}/ucd/{file}", + UNICODE_VERSION.0, UNICODE_VERSION.1, UNICODE_VERSION.2 + ))? + .error_for_status()? + .text()?) 
+} + +#[derive(Debug)] +pub struct DataFiles { + pub unicode_data: String, + pub derived_core_properties: String, + pub prop_list: String, + pub scripts: String, + pub special_casing: String, +} + +/// Retrieve all the data files we need. +pub fn data_files() -> Result> { + Ok(DataFiles { + unicode_data: fetch_unicode_file("UnicodeData.txt")?, + derived_core_properties: fetch_unicode_file("DerivedCoreProperties.txt")?, + prop_list: fetch_unicode_file("PropList.txt")?, + scripts: fetch_unicode_file("Scripts.txt")?, + special_casing: fetch_unicode_file("SpecialCasing.txt")?, + }) +} + +/// - `arr`: bit array (1 bit per unicode code point) +/// - `data_file`: Unicode data file to look for properties in +/// - `props`: regex matching one or more Unicode properties +/// - `set_to`: what we should set the bits corresponding to matching code points to +pub fn set_by_prop(arr: &mut CodepointBitArr, data_file: &str, props: &str, set_to: bool) { + let regex_string = format!(r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(?:{props})"); + let regex = Regex::new(&regex_string).unwrap(); + for line in data_file.lines() { + if let Some(captures) = regex.captures(line) { + let start = usize::from_str_radix(&captures[1], 16).unwrap(); + let codepoint_range = start + ..=captures + .get(2) + .map_or(start, |m| usize::from_str_radix(m.as_str(), 16).unwrap()); + for cp in codepoint_range { + arr.set(cp, set_to); + } + } + } +} + +/// - `arr`: bit array (1 bit per unicode code point) +/// - `categories`: regex matching one or more Unicode character categories +/// - `set_to`: what we should set the bits corresponding to matching code points to +pub fn set_by_general_category( + arr: &mut CodepointBitArr, + data: &DataFiles, + categories: &str, + set_to: bool, +) { + let regex_string = format!(r"^([0-9A-F]+);(.*?);({categories});"); + let regex = Regex::new(&regex_string).unwrap(); + let mut range_start: Option<(usize, String, String)> = None; + for line in data.unicode_data.lines() { + if let 
Some(captures) = regex.captures(line) { + let cp = usize::from_str_radix(&captures[1], 16).unwrap(); + + if let Some((range_start_cp, prefix, category)) = range_start { + assert_eq!(captures[2].strip_suffix(", Last>"), Some(prefix).as_deref()); + assert_eq!(category, &captures[3]); + range_start = None; + for cp_within_range in range_start_cp..=cp { + arr.set(cp_within_range, set_to); + } + } else if let Some(prefix) = captures[2].strip_suffix(", First>") { + assert!(range_start.is_none()); + range_start = Some((cp, prefix.to_owned(), captures[3].to_owned())); + } else { + assert!(range_start.is_none()); + arr.set(cp, set_to); + } + } else { + assert!(range_start.is_none()); + } + } + assert!(range_start.is_none()); +}