diff --git a/.github/workflows/ci_js.yaml b/.github/workflows/ci_js.yaml index 1ea0aa9..ece70a3 100644 --- a/.github/workflows/ci_js.yaml +++ b/.github/workflows/ci_js.yaml @@ -32,7 +32,9 @@ jobs: node-version: '24' - name: Install wasm-bindgen-cli - run: cargo install wasm-bindgen-cli --version 0.2.114 + uses: taiki-e/install-action@v2 + with: + tool: wasm-bindgen-cli@0.2.114 - name: Install Node.js dependencies run: npm ci diff --git a/README.md b/README.md index a140566..b6bd96d 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![CI: Rust](https://github.com/aynumosir/ainu-utils/actions/workflows/ci_rust.yaml/badge.svg)](https://github.com/aynumosir/ainu-utils/actions/workflows/ci_rust.yaml) [![codecov](https://codecov.io/gh/aynumosir/ainu-utils/graph/badge.svg?token=aQHfYRVtsd)](https://codecov.io/gh/aynumosir/ainu-utils) -A collection of utility for with the Ainu language +A toolkit for Ainu language processing, available in Rust, JavaScript, and Python. ## Releases @@ -19,7 +19,7 @@ ainu-utils is distributed as a Rust crate, but you can also use its binding for `ainu-utils` provides several features for working with the Ainu language: -### `tokenize` +### Tokenization Tokenizes Ainu text into morphemes. @@ -41,70 +41,71 @@ tokenize("irankarapte. e=iwanke ya?", { keepWhitespace: false }); // => ["irankarapte", ".", "e=", "iwanke", "ya?"] ``` -### `to_kana` +### Syllabication -Converts Ainu text written in Latin script to Kana. +Parses Ainu text into syllables. **Python:** ```py -from ainu_utils import to_kana +from ainu_utils import syllabicate -to_kana("irankarapte. e=iwanke ya?") -# => "イランカラㇷ゚テ。 エイワンケ ヤ?" +syllabicate("irankarapte. e=iwanke ya?") +# => ["i", "ran", "ka", "rap", "te", ".", " ", "e", "=", "i", "wan", "ke", " ", "ya", "?"] ``` **JS:** ```js -import { toKana } from "ainu-utils"; +import { syllabicate } from "ainu-utils"; -toKana("irankarapte. e=iwanke ya?"); -// => "イランカラㇷ゚テ。 エイワンケ ヤ?" +syllabicate("irankarapte. 
e=iwanke ya?") // => ["i", "ran", "ka", "rap", "te", ".", " ", "e", "=", "i", "wan", "ke", " ", "ya", "?"] ``` -### `number_to_words` -Converts integers between 1 and 100 to Ainu words. +### Transliteration + +Converts Ainu text written in Latin script to Kana. **Python:** ```py -from ainu_utils import number_to_words +from ainu_utils import transliterate_to_kana -number_to_words(91) -# => "sine ikasma wan easiknehotne" +transliterate_to_kana("irankarapte. e=iwanke ya?") +# => "イランカラㇷ゚テ。 エイワンケ ヤ?" ``` **JS:** ```js -import { numberToWords } from "ainu-utils"; +import { transliterateToKana } from "ainu-utils"; -numberToWords(91); -// => "sine ikasma wan easiknehotne" +transliterateToKana("irankarapte. e=iwanke ya?"); +// => "イランカラㇷ゚テ。 エイワンケ ヤ?" ``` -### `syllabicate` +### Conversion from a number to words -Parses Ainu text into syllables. +Converts integers between 1 and 100 to Ainu words. **Python:** ```py -from ainu_utils import syllabicate +from ainu_utils import number_to_words -syllabicate("irankarapte. e=iwanke ya?") -# => ["i", "ran", "ka", "rap", "te", ".", " ", "e", "=", "i", "wan", "ke", " ", "ya", "?"] +number_to_words(91) +# => "sine ikasma wan easiknehotne" ``` **JS:** ```js -import { syllabicate } from "ainu-utils"; +import { numberToWords } from "ainu-utils"; -syllabicate("irankarapte. e=iwanke ya?") -// => ["i", "ran", "ka", "rap", "te", ".", " ", "e", "=", "i", "wan", "ke", " ", "ya", "?"] +numberToWords(91); +// => "sine ikasma wan easiknehotne" ``` ## License diff --git a/ainu-utils-js/Cargo.toml b/ainu-utils-js/Cargo.toml index f38e3f0..d7ff8ac 100644 --- a/ainu-utils-js/Cargo.toml +++ b/ainu-utils-js/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "ainu-utils-js" -version = "0.5.1" +version = "0.5.3" edition = "2024" -description = "A collection of utilities for the Ainu language" +description = "A toolkit for Ainu language processing, available in Rust, JavaScript, and Python." 
license = "MIT" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/ainu-utils-js/src/lib.rs b/ainu-utils-js/src/lib.rs index 072719c..18cdae5 100644 --- a/ainu-utils-js/src/lib.rs +++ b/ainu-utils-js/src/lib.rs @@ -11,17 +11,17 @@ pub fn tokenize(text: &str, options: JsValue) -> Vec { tokenizer::tokenize(text, keep_whitespace) } -#[wasm_bindgen(js_name = toKana)] -pub fn to_kana(text: &str) -> String { - kana::to_kana(text) +#[wasm_bindgen] +pub fn syllabicate(text: &str) -> Vec { + syllables::parse(text) +} + +#[wasm_bindgen(js_name = transliterateToKana)] +pub fn transliterate_to_kana(text: &str) -> String { + kana::transliterate_to_kana(text) } #[wasm_bindgen(js_name = numberToWords)] pub fn number_to_words(input: i32) -> String { numbers::parse(input).unwrap().to_string() } - -#[wasm_bindgen] -pub fn syllabicate(text: &str) -> Vec { - syllables::parse(text) -} diff --git a/ainu-utils-js/tests/index.spec.js b/ainu-utils-js/tests/index.spec.js index d20b09c..5b934ff 100644 --- a/ainu-utils-js/tests/index.spec.js +++ b/ainu-utils-js/tests/index.spec.js @@ -1,7 +1,13 @@ import { test, expect } from "vitest"; -import { tokenize } from "../dist/index.js"; +import { tokenize, transliterateToKana } from "../dist/index.js"; test("tokenize", () => { const tokens = tokenize("irankarapte. e=iwanke ya?", { keepWhitespace: false }); expect(tokens).toEqual(["irankarapte", ".", "e=", "iwanke", "ya", "?"]); }); + + +test("transliterateToKana", () => { + const tokens = transliterateToKana("irankarapte. 
e=iwanke ya?"); + expect(tokens).toBe("イランカラㇷ゚テ。 エイワンケ ヤ?"); +}); diff --git a/ainu-utils-python/Cargo.toml b/ainu-utils-python/Cargo.toml index 921f7ee..e16101a 100644 --- a/ainu-utils-python/Cargo.toml +++ b/ainu-utils-python/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "ainu-utils-python" -version = "0.5.1" +version = "0.5.3" edition = "2021" -description = "A collection of utilities for the Ainu language" +description = "A toolkit for Ainu language processing, available in Rust, JavaScript, and Python." license = "MIT" [lib] diff --git a/ainu-utils-python/pyproject.toml b/ainu-utils-python/pyproject.toml index 036c2ba..e3b59f3 100644 --- a/ainu-utils-python/pyproject.toml +++ b/ainu-utils-python/pyproject.toml @@ -7,9 +7,9 @@ features = ["pyo3/extension-module"] [project] name = "ainu-utils" -description = "A collection of utility for with the Ainu language" +description = "A toolkit for Ainu language processing" requires-python = ">=3.8" -version = "0.5.1" +version = "0.5.3" license = "MIT" [project.optional-dependencies] diff --git a/ainu-utils-python/src/lib.rs b/ainu-utils-python/src/lib.rs index b3c2d15..ffb6010 100644 --- a/ainu-utils-python/src/lib.rs +++ b/ainu-utils-python/src/lib.rs @@ -9,8 +9,8 @@ fn tokenize(text: &str, keep_whitespace: bool) -> Vec { } #[pyfunction] -fn to_kana(text: &str) -> String { - ainu_utils_rust::kana::to_kana(text) +fn transliterate_to_kana(text: &str) -> String { + ainu_utils_rust::kana::transliterate_to_kana(text) } #[pyfunction] @@ -26,7 +26,7 @@ fn syllabicate(text: &str) -> Vec { #[pymodule] fn ainu_utils(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(tokenize, m)?)?; - m.add_function(wrap_pyfunction!(to_kana, m)?)?; + m.add_function(wrap_pyfunction!(transliterate_to_kana, m)?)?; m.add_function(wrap_pyfunction!(number_to_words, m)?)?; m.add_function(wrap_pyfunction!(syllabicate, m)?)?; m.add("test_number", 123)?; diff --git a/ainu-utils-python/tests/test_to_kana.py 
b/ainu-utils-python/tests/test_to_kana.py deleted file mode 100644 index db53ddd..0000000 --- a/ainu-utils-python/tests/test_to_kana.py +++ /dev/null @@ -1,6 +0,0 @@ -import ainu_utils - - -def test_to_kana(): - result = ainu_utils.to_kana("irankarapte. e=iwanke ya?") - assert result == "イランカラㇷ゚テ。 エイワンケ ヤ?" diff --git a/ainu-utils-python/tests/test_transliterate_to_kana.py b/ainu-utils-python/tests/test_transliterate_to_kana.py new file mode 100644 index 0000000..1082dd2 --- /dev/null +++ b/ainu-utils-python/tests/test_transliterate_to_kana.py @@ -0,0 +1,6 @@ +import ainu_utils + + +def test_transliterate_to_kana(): + result = ainu_utils.transliterate_to_kana("irankarapte. e=iwanke ya?") + assert result == "イランカラㇷ゚テ。 エイワンケ ヤ?" diff --git a/ainu-utils/Cargo.toml b/ainu-utils/Cargo.toml index efaa86c..6e80aec 100644 --- a/ainu-utils/Cargo.toml +++ b/ainu-utils/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "ainu-utils" -version = "0.5.1" +version = "0.5.3" edition = "2024" -description = "A collection of utilities for the Ainu language" +description = "A toolkit for Ainu language processing, available in Rust, JavaScript, and Python." 
license = "MIT" [dependencies] diff --git a/ainu-utils/src/kana/kana.rs b/ainu-utils/src/kana/kana.rs index 9bb25ce..6a4e011 100644 --- a/ainu-utils/src/kana/kana.rs +++ b/ainu-utils/src/kana/kana.rs @@ -1,5 +1,6 @@ -use crate::normalizer::normalize; use crate::phoneme::Phoneme; +use unicode_normalization::UnicodeNormalization; +use unicode_normalization::char::is_combining_mark; use super::kana_linking::link; use super::kana_map_c::map_c; @@ -7,7 +8,7 @@ use super::kana_map_cv::map_cv; use super::kana_map_punc::map_punc; use super::kana_map_v::map_v; -pub fn to_kana(input: &str) -> String { +pub fn transliterate_to_kana(input: &str) -> String { let mut input: String = input.to_string(); input = link(&input); @@ -15,7 +16,7 @@ pub fn to_kana(input: &str) -> String { let mut output = String::new(); for word in words { - let kana = word_to_kana(word); + let kana = transliterate_word_to_kana(word); if kana.chars().any(|c| c.is_ascii_alphabetic()) { output += word; @@ -31,10 +32,15 @@ pub fn to_kana(input: &str) -> String { output } -fn word_to_kana(input: &str) -> String { +fn strip_accents(input: &str) -> String { + input.nfkd().filter(|c| !is_combining_mark(*c)).collect() +} + +fn transliterate_word_to_kana(input: &str) -> String { let mut input: String = input.to_string(); - input = normalize(&input); + input = input.to_lowercase(); + input = strip_accents(&input); input = map_punc(&input); let chars: Vec = input.chars().collect(); diff --git a/ainu-utils/src/kana/kana_test.rs b/ainu-utils/src/kana/kana_test.rs index f21bac0..e47bc33 100644 --- a/ainu-utils/src/kana/kana_test.rs +++ b/ainu-utils/src/kana/kana_test.rs @@ -1,7 +1,7 @@ -use super::kana::to_kana; +use super::kana::transliterate_to_kana; #[test] -fn test_to_kana() { +fn test_transliterate_to_kana() { let sentence = [ "tanepo Asirpa tak nispa ne kusu, a=kiyannere na.", "Asirpa ekimne patek ki wa, menoko monrayke eaykap.", @@ -13,7 +13,7 @@ fn test_to_kana() { .join(" "); assert_eq!( - to_kana(&sentence), 
+ transliterate_to_kana(&sentence), [ "タネポ アシㇼパ タㇰ ニㇱパ ネ クス、 アキヤンネレ ナ。", "アシㇼパ エキㇺネ パテㇰ キ ワ、 メノコ モンライケ エアイカㇷ゚。", @@ -27,163 +27,172 @@ fn test_to_kana() { #[test] fn test_dropping_h() { - assert_eq!(to_kana("_hine"), "イネ") + assert_eq!(transliterate_to_kana("_hine"), "イネ") } #[test] fn test_dropping_y() { - assert_eq!(to_kana("_ya?"), "ア?") + assert_eq!(transliterate_to_kana("_ya?"), "ア?") } #[test] fn test_linking_h() { - assert_eq!(to_kana("hawean __hi"), "ハウェアニ") + assert_eq!(transliterate_to_kana("hawean __hi"), "ハウェアニ") } #[test] fn test_linking_y() { - assert_eq!(to_kana("nankor __ya?"), "ナンコラ?") + assert_eq!(transliterate_to_kana("nankor __ya?"), "ナンコラ?") } #[test] fn test_linking_a() { - assert_eq!(to_kana("cis _a cis _a"), "チサ チサ") + assert_eq!(transliterate_to_kana("cis _a cis _a"), "チサ チサ") } #[test] fn test_linking_i() { - assert_eq!(to_kana("oar _isam"), "オアリサㇺ") + assert_eq!(transliterate_to_kana("oar _isam"), "オアリサㇺ") } #[test] fn test_linking_u() { - assert_eq!(to_kana("or _un"), "オルン") + assert_eq!(transliterate_to_kana("or _un"), "オルン") } #[test] fn test_linking_e() { - assert_eq!(to_kana("mat _etun"), "マテトゥン") + assert_eq!(transliterate_to_kana("mat _etun"), "マテトゥン") } #[test] fn test_linking_o() { - assert_eq!(to_kana("pet _or _un"), "ペトルン") + assert_eq!(transliterate_to_kana("pet _or _un"), "ペトルン") } #[test] fn test_linking_and_dropping_a() { - assert_eq!(to_kana("yaypuri ekira __ani"), "ヤイプリ エキラニ") + assert_eq!( + transliterate_to_kana("yaypuri ekira __ani"), + "ヤイプリ エキラニ" + ) } #[test] fn test_linking_and_dropping_i() { - assert_eq!(to_kana("puni __i"), "プニ") + assert_eq!(transliterate_to_kana("puni __i"), "プニ") } #[test] fn test_linking_and_dropping_u() { - assert_eq!(to_kana("a=kotanu __un"), "アコタヌン") + assert_eq!(transliterate_to_kana("a=kotanu __un"), "アコタヌン") } #[test] fn test_linking_and_dropping_e() { - assert_eq!(to_kana("i=samake __en anu"), "イサマケン アヌ") + assert_eq!( + transliterate_to_kana("i=samake __en anu"), + "イサマケン アヌ" + ) } 
#[test] fn test_linking_and_dropping_o() { // 実例なし。 - assert_eq!(to_kana("sapporo __or"), "サッポロㇿ") + assert_eq!(transliterate_to_kana("sapporo __or"), "サッポロㇿ") } #[test] fn test_linking_r_n() { - assert_eq!(to_kana("a=kor_ nispa"), "アコン ニㇱパ") + assert_eq!(transliterate_to_kana("a=kor_ nispa"), "アコン ニㇱパ") } #[test] fn test_linking_r_r() { - assert_eq!(to_kana("kor_ rusuy"), "コン ルスイ") + assert_eq!(transliterate_to_kana("kor_ rusuy"), "コン ルスイ") } #[test] fn test_linking_r_t() { - assert_eq!(to_kana("or_ ta"), "オッ タ") + assert_eq!(transliterate_to_kana("or_ ta"), "オッ タ") } #[test] fn test_linking_r_c() { - assert_eq!(to_kana("yar_ cise"), "ヤッ チセ") + assert_eq!(transliterate_to_kana("yar_ cise"), "ヤッ チセ") } #[test] fn test_linking_n_s() { - assert_eq!(to_kana("pon_ su"), "ポイ ス") + assert_eq!(transliterate_to_kana("pon_ su"), "ポイ ス") } #[test] fn test_linking_n_y() { - assert_eq!(to_kana("pon_ yam"), "ポイ ヤㇺ") + assert_eq!(transliterate_to_kana("pon_ yam"), "ポイ ヤㇺ") } #[test] fn test_linking_n_w() { - assert_eq!(to_kana("san _wa"), "サン マ") + assert_eq!(transliterate_to_kana("san _wa"), "サン マ") } #[test] fn test_linking_m_w() { - assert_eq!(to_kana("isam _wa"), "イサン マ") + assert_eq!(transliterate_to_kana("isam _wa"), "イサン マ") } #[test] fn test_linking_p_w() { - assert_eq!(to_kana("sap _wa"), "サッ パ") + assert_eq!(transliterate_to_kana("sap _wa"), "サッ パ") } #[test] fn test_special_mp() { - assert_eq!(to_kana("tampaku"), "タンパク") + assert_eq!(transliterate_to_kana("tampaku"), "タンパク") } #[test] fn test_special_mm() { - assert_eq!(to_kana("umma"), "ウンマ") + assert_eq!(transliterate_to_kana("umma"), "ウンマ") } #[test] fn test_symbols() { - assert_eq!(to_kana("“pirka” sekor a=ye"), "「ピㇼカ」 セコㇿ アイェ") + assert_eq!( + transliterate_to_kana("“pirka” sekor a=ye"), + "「ピㇼカ」 セコㇿ アイェ" + ) } #[test] fn test_k_prefix() { assert_eq!( - to_kana("irankarapte. kani anak IMO k=e easkay kur ku=ne."), + transliterate_to_kana("irankarapte. 
kani anak IMO k=e easkay kur ku=ne."), "イランカラㇷ゚テ。 カニ アナㇰ イモ ケ エアㇱカイ クㇽ クネ。" ) } #[test] fn test_diacritics() { - assert_eq!(to_kana("kamúy"), "カムイ") + assert_eq!(transliterate_to_kana("kamúy"), "カムイ") } #[test] fn test_yy_and_ww() { - assert_eq!(to_kana("kamuyyukar"), "カムイユカㇻ"); - assert_eq!(to_kana("eawwo"), "エアウウォ"); + assert_eq!(transliterate_to_kana("kamuyyukar"), "カムイユカㇻ"); + assert_eq!(transliterate_to_kana("eawwo"), "エアウウォ"); } #[test] fn test_glottal_stop() { - assert_eq!(to_kana("hioy'oy"), "ヒオイオイ"); + assert_eq!(transliterate_to_kana("hioy'oy"), "ヒオイオイ"); } #[test] fn test_rollback() { assert_eq!( - to_kana("Copyright Mojang AB. iteki eymek yan!"), + transliterate_to_kana("Copyright Mojang AB. iteki eymek yan!"), "Copyright Mojang AB. イテキ エイメㇰ ヤン!" ) } diff --git a/ainu-utils/src/lib.rs b/ainu-utils/src/lib.rs index a514455..48183e6 100644 --- a/ainu-utils/src/lib.rs +++ b/ainu-utils/src/lib.rs @@ -1,5 +1,4 @@ pub mod kana; -pub mod normalizer; pub mod numbers; pub mod phoneme; pub mod syllables; diff --git a/ainu-utils/src/normalizer/mod.rs b/ainu-utils/src/normalizer/mod.rs deleted file mode 100644 index bfacb57..0000000 --- a/ainu-utils/src/normalizer/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -mod normalizer; - -pub use self::normalizer::*; diff --git a/ainu-utils/src/normalizer/normalizer.rs b/ainu-utils/src/normalizer/normalizer.rs deleted file mode 100644 index 6e22c07..0000000 --- a/ainu-utils/src/normalizer/normalizer.rs +++ /dev/null @@ -1,15 +0,0 @@ -use unicode_normalization::char::is_combining_mark; -use unicode_normalization::UnicodeNormalization; - -pub fn strip_accents(input: &str) -> String { - input.nfkd().filter(|c| !is_combining_mark(*c)).collect() -} - -pub fn normalize(input: &str) -> String { - let mut result: String; - - result = input.to_lowercase(); - result = strip_accents(result.as_str()).to_string(); - - result -}