From 21b7b30adbf453c6f4ddbe1b5eb9eb17cc7e0caf Mon Sep 17 00:00:00 2001 From: Ryo Igarashi Date: Sun, 8 Mar 2026 22:02:50 +0900 Subject: [PATCH] feat: Add ignore pattern to transliterate --- README.md | 13 +- ainu-utils-js/src/lib.rs | 56 +++- ainu-utils-js/tests/index.spec.js | 38 ++- ainu-utils-python/ainu_utils.pyi | 9 +- ainu-utils-python/src/lib.rs | 44 ++- .../tests/test_transliterate_to_kana.py | 12 + ainu-utils/examples/kana.rs | 4 +- ainu-utils/examples/syllables.rs | 4 +- ainu-utils/examples/tokenize.rs | 4 +- ainu-utils/src/kana/kana.rs | 70 ++++- ainu-utils/src/kana/kana_test.rs | 269 +++++++++--------- 11 files changed, 354 insertions(+), 169 deletions(-) diff --git a/README.md b/README.md index b6bd96d..34a7ecb 100644 --- a/README.md +++ b/README.md @@ -71,9 +71,13 @@ Converts Ainu text written in Latin script to Kana. **Python:** ```py -from ainu_utils import transliterate_to_kana +from ainu_utils import transliterate_to_kana, Whitespace -transliterate_to_kana("irankarapte. e=iwanke ya?") +transliterate_to_kana( + "irankarapte. e=iwanke ya?", + whitespace=Whitespace.Fullwidth, + ignore_pattern=None, +) # => "イランカラㇷ゚テ。 エイワンケ ヤ?" ``` @@ -82,7 +86,10 @@ transliterate_to_kana("irankarapte. e=iwanke ya?") ```js import { transliterateToKana } from "ainu-utils"; -transliterateToKana("irankarapte. e=iwanke ya?"); +transliterateToKana("irankarapte. e=iwanke ya?", { + whitespace: "fullwidth", + ignorePattern: null, +}); // => "イランカラㇷ゚テ。 エイワンケ ヤ?" 
``` diff --git a/ainu-utils-js/src/lib.rs b/ainu-utils-js/src/lib.rs index bff1c7b..b48c44e 100644 --- a/ainu-utils-js/src/lib.rs +++ b/ainu-utils-js/src/lib.rs @@ -1,5 +1,6 @@ use ainu_utils::{ - kana, numbers, syllables, + kana::{self, IgnorePattern, IgnorePatternError, TransliterateToKanaOptions, Whitespace}, + numbers, syllables, tokens::{self, TokenizeOptions}, }; use serde::Deserialize; @@ -35,9 +36,58 @@ pub fn syllabicate(text: &str) -> Vec { syllables::syllabicate(text) } +#[derive(Deserialize, Default)] +#[serde(rename_all = "camelCase")] +pub struct TransliterateToKanaOptionsJs { + whitespace: Option, + ignore_pattern: Option, +} + +impl TryFrom for TransliterateToKanaOptions { + type Error = JsError; + + fn try_from(value: TransliterateToKanaOptionsJs) -> Result { + let defaults = TransliterateToKanaOptions::default(); + + let whitespace = match value.whitespace { + Some(whitespace) => match whitespace.as_str() { + "fullwidth" => Ok(Some(Whitespace::Fullwidth)), + "halfwidth" => Ok(Some(Whitespace::Halfwidth)), + _ => Err(JsError::new(&format!("Invalid whitespace: {}", whitespace))), + }, + _ => Ok(None), + }?; + + let ignore_pattern = value + .ignore_pattern + .map(|ignore_pattern| IgnorePattern::new(&ignore_pattern)) + .transpose() + .map_err(|err| match err { + IgnorePatternError::InvalidPattern => JsError::new("Invalid pattern provided"), + })?; + + let value = Self { + whitespace: whitespace.unwrap_or(defaults.whitespace), + ignore_pattern, + }; + + Ok(value) + } +} + #[wasm_bindgen(js_name = transliterateToKana)] -pub fn transliterate_to_kana(text: &str) -> String { - kana::transliterate_to_kana(text) +pub fn transliterate_to_kana(text: &str, options: Option) -> Result { + let transliterate_to_kana_options = options + .map(serde_wasm_bindgen::from_value::) + .transpose()? + .map(TryInto::::try_into) + .transpose()? 
+ .unwrap_or_default(); + + Ok(kana::transliterate_to_kana( + text, + &transliterate_to_kana_options, + )) } #[wasm_bindgen(js_name = numberToWords)] diff --git a/ainu-utils-js/tests/index.spec.js b/ainu-utils-js/tests/index.spec.js index 8d65ad0..6dc5c3a 100644 --- a/ainu-utils-js/tests/index.spec.js +++ b/ainu-utils-js/tests/index.spec.js @@ -13,7 +13,39 @@ describe("tokenize", () => { }); }); -test("transliterateToKana", () => { - const tokens = transliterateToKana("irankarapte. e=iwanke ya?"); - expect(tokens).toBe("イランカラㇷ゚テ。 エイワンケ ヤ?"); +describe("transliterateToKana", () => { + test("defaults", () => { + const tokens = transliterateToKana("irankarapte. e=iwanke ya?"); + expect(tokens).toBe("イランカラㇷ゚テ。 エイワンケ ヤ?"); + }); + + test("ignore pattern", () => { + const tokens = transliterateToKana("JOHN ku=ne.", { + ignorePattern: "^[A-Z]+$" + }); + expect(tokens).toBe("JOHN クネ。"); + }); + + test("throws for an invalid ignore pattern", () => { + expect(() => { + transliterateToKana("JOHN ku=ne.", { + ignorePattern: "[", + }) + }).toThrowError("Invalid pattern provided"); + }); + + test("whitespace", () => { + const tokens = transliterateToKana("onne paskur ine?", { + whitespace: "halfwidth", + }); + expect(tokens).toBe("オンネ パㇱクㇽ イネ?"); + }); + + test("throws for an invalid whitespace", () => { + expect(() => { + transliterateToKana("irankarapte", { + whitespace: "xxx", + }) + }).toThrowError("Invalid whitespace: xxx"); + }); }); diff --git a/ainu-utils-python/ainu_utils.pyi b/ainu-utils-python/ainu_utils.pyi index ce965a7..7c326e2 100644 --- a/ainu-utils-python/ainu_utils.pyi +++ b/ainu-utils-python/ainu_utils.pyi @@ -1,5 +1,10 @@ # https://www.maturin.rs/project_layout#adding-python-type-information -def tokenize(text: str, *, keep_whitespace: bool = False) -> list[str]: ... -def to_kana(text: str) -> str: ... +def tokenize(text: str, *, keep_whitespace: bool | None = None) -> list[str]: ... 
+def to_kana( + text: str, + *, + whitespace: Whitespace | None = None, + ignore_pattern: str | None = None, +) -> str: ... def number_to_words(number: int) -> str: ... def syllabicate(text: str) -> list[str]: ... diff --git a/ainu-utils-python/src/lib.rs b/ainu-utils-python/src/lib.rs index e2487a2..e892486 100644 --- a/ainu-utils-python/src/lib.rs +++ b/ainu-utils-python/src/lib.rs @@ -1,7 +1,10 @@ extern crate ainu_utils as ainu_utils_rust; -use ainu_utils_rust::tokens::TokenizeOptions; -use pyo3::prelude::*; +use ainu_utils_rust::{ + kana::{IgnorePattern, IgnorePatternError, TransliterateToKanaOptions}, + tokens::TokenizeOptions, +}; +use pyo3::{exceptions::PyValueError, prelude::*}; #[pyfunction] #[pyo3(signature = (text, *, keep_whitespace = None))] @@ -13,9 +16,41 @@ fn tokenize(text: &str, keep_whitespace: Option) -> Vec { ainu_utils_rust::tokens::tokenize(text, &tokenize_options) } +#[pyclass(eq, from_py_object)] +#[derive(PartialEq, Clone)] +pub enum Whitespace { + Fullwidth, + Halfwidth, +} + #[pyfunction] -fn transliterate_to_kana(text: &str) -> String { - ainu_utils_rust::kana::transliterate_to_kana(text) +#[pyo3(signature = (text, *, whitespace = None, ignore_pattern = None))] +fn transliterate_to_kana( + text: &str, + whitespace: Option, + ignore_pattern: Option<&str>, +) -> Result { + let defaults = TransliterateToKanaOptions::default(); + + let whitespace = match whitespace { + Some(Whitespace::Fullwidth) => ainu_utils_rust::kana::Whitespace::Fullwidth, + Some(Whitespace::Halfwidth) => ainu_utils_rust::kana::Whitespace::Halfwidth, + None => defaults.whitespace, + }; + + let ignore_pattern = ignore_pattern + .map(|p| IgnorePattern::new(p)) + .transpose() + .map_err(|e| match e { + IgnorePatternError::InvalidPattern => PyValueError::new_err("Invalid pattern provided"), + })?; + + let options = TransliterateToKanaOptions { + ignore_pattern, + whitespace, + }; + + Ok(ainu_utils_rust::kana::transliterate_to_kana(text, &options)) } #[pyfunction] @@ -34,6 
+69,7 @@ fn ainu_utils(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(transliterate_to_kana, m)?)?; m.add_function(wrap_pyfunction!(number_to_words, m)?)?; m.add_function(wrap_pyfunction!(syllabicate, m)?)?; + m.add_class::()?; m.add("test_number", 123)?; Ok(()) } diff --git a/ainu-utils-python/tests/test_transliterate_to_kana.py b/ainu-utils-python/tests/test_transliterate_to_kana.py index 1082dd2..78b51e4 100644 --- a/ainu-utils-python/tests/test_transliterate_to_kana.py +++ b/ainu-utils-python/tests/test_transliterate_to_kana.py @@ -4,3 +4,15 @@ def test_transliterate_to_kana(): result = ainu_utils.transliterate_to_kana("irankarapte. e=iwanke ya?") assert result == "イランカラㇷ゚テ。 エイワンケ ヤ?" + + +def test_transliterate_to_kana_halfwidth(): + result = ainu_utils.transliterate_to_kana( + "irankarapte. e=iwanke ya?", whitespace=ainu_utils.Whitespace.Halfwidth + ) + assert result == "イランカラㇷ゚テ。 エイワンケ ヤ?" + + +def test_transliterate_to_kana_ignore(): + result = ainu_utils.transliterate_to_kana("JOHN ku=ne.", ignore_pattern="^[A-Z]+$") + assert result == "JOHN クネ。" diff --git a/ainu-utils/examples/kana.rs b/ainu-utils/examples/kana.rs index 8284a5d..22d5e0b 100644 --- a/ainu-utils/examples/kana.rs +++ b/ainu-utils/examples/kana.rs @@ -1,11 +1,11 @@ -use ainu_utils::kana::to_kana; +use ainu_utils::kana::transliterate_to_kana; use std::env; fn main() { let args: Vec = env::args().collect(); let text = &args[1]; - let kana = to_kana(text); + let kana = transliterate_to_kana(text, &Default::default()); println!("{}", kana); } diff --git a/ainu-utils/examples/syllables.rs b/ainu-utils/examples/syllables.rs index 2dc6737..5ede920 100644 --- a/ainu-utils/examples/syllables.rs +++ b/ainu-utils/examples/syllables.rs @@ -1,11 +1,11 @@ -use ainu_utils::syllables::parse; +use ainu_utils::syllables::syllabicate; use std::env; fn main() { let args: Vec = env::args().collect(); let text = &args[1]; - let syllables = parse(text); + let syllables = 
syllabicate(text); println!("{:?}", syllables); } diff --git a/ainu-utils/examples/tokenize.rs b/ainu-utils/examples/tokenize.rs index b5911e4..d22b995 100644 --- a/ainu-utils/examples/tokenize.rs +++ b/ainu-utils/examples/tokenize.rs @@ -1,11 +1,11 @@ -use ainu_utils::tokenizer::tokenize; +use ainu_utils::tokens::tokenize; use std::env; fn main() { let args: Vec = env::args().collect(); let text = &args[1]; - let tokens = tokenize(text, false); + let tokens = tokenize(text, &Default::default()); println!("{:?}", tokens); } diff --git a/ainu-utils/src/kana/kana.rs b/ainu-utils/src/kana/kana.rs index 6a4e011..9154ca7 100644 --- a/ainu-utils/src/kana/kana.rs +++ b/ainu-utils/src/kana/kana.rs @@ -1,4 +1,5 @@ use crate::phoneme::Phoneme; +use regex::Regex; use unicode_normalization::UnicodeNormalization; use unicode_normalization::char::is_combining_mark; @@ -8,26 +9,71 @@ use super::kana_map_cv::map_cv; use super::kana_map_punc::map_punc; use super::kana_map_v::map_v; -pub fn transliterate_to_kana(input: &str) -> String { +pub enum Whitespace { + Fullwidth, + Halfwidth, +} + +impl ToString for Whitespace { + fn to_string(&self) -> String { + match self { + Self::Fullwidth => " ".to_string(), + Self::Halfwidth => " ".to_string(), + } + } +} + +impl Default for Whitespace { + fn default() -> Self { + Self::Fullwidth + } +} + +#[derive(Debug)] +pub enum IgnorePatternError { + InvalidPattern, +} + +pub struct IgnorePattern(Regex); + +impl IgnorePattern { + pub fn new(value: &str) -> Result { + let regex = Regex::new(value).map_err(|_| IgnorePatternError::InvalidPattern)?; + Ok(IgnorePattern(regex)) + } +} + +#[derive(Default)] +pub struct TransliterateToKanaOptions { + pub whitespace: Whitespace, + pub ignore_pattern: Option, +} + +#[derive(Debug)] +pub enum TransliterateToKanaError { + InvalidIgnore, +} + +pub fn transliterate_to_kana(input: &str, options: &TransliterateToKanaOptions) -> String { let mut input: String = input.to_string(); input = link(&input); - let 
words: Vec<&str> = input.split(' ').collect(); - let mut output = String::new(); + let words_latn: Vec<&str> = input.split(' ').collect(); + let mut words_kana: Vec = vec![]; - for word in words { - let kana = transliterate_word_to_kana(word); - - if kana.chars().any(|c| c.is_ascii_alphabetic()) { - output += word; - } else { - output += &kana; + for word_latn in words_latn { + if let Some(ignore) = &options.ignore_pattern + && ignore.0.is_match(word_latn) + { + words_kana.push(word_latn.to_string()); + continue; } - output += " "; + let word_kana = transliterate_word_to_kana(word_latn); + words_kana.push(word_kana); } - output = output.trim_end().to_string(); + let output = words_kana.join(&options.whitespace.to_string()).to_string(); output } diff --git a/ainu-utils/src/kana/kana_test.rs b/ainu-utils/src/kana/kana_test.rs index e47bc33..0de4aab 100644 --- a/ainu-utils/src/kana/kana_test.rs +++ b/ainu-utils/src/kana/kana_test.rs @@ -1,4 +1,4 @@ -use super::kana::transliterate_to_kana; +use super::kana::{IgnorePattern, TransliterateToKanaOptions, Whitespace, transliterate_to_kana}; #[test] fn test_transliterate_to_kana() { @@ -13,7 +13,7 @@ fn test_transliterate_to_kana() { .join(" "); assert_eq!( - transliterate_to_kana(&sentence), + transliterate_to_kana(&sentence, &Default::default()), [ "タネポ アシㇼパ タㇰ ニㇱパ ネ クス、 アキヤンネレ ナ。", "アシㇼパ エキㇺネ パテㇰ キ ワ、 メノコ モンライケ エアイカㇷ゚。", @@ -26,173 +26,170 @@ fn test_transliterate_to_kana() { } #[test] -fn test_dropping_h() { - assert_eq!(transliterate_to_kana("_hine"), "イネ") -} - -#[test] -fn test_dropping_y() { - assert_eq!(transliterate_to_kana("_ya?"), "ア?") -} - -#[test] -fn test_linking_h() { - assert_eq!(transliterate_to_kana("hawean __hi"), "ハウェアニ") -} - -#[test] -fn test_linking_y() { - assert_eq!(transliterate_to_kana("nankor __ya?"), "ナンコラ?") -} - -#[test] -fn test_linking_a() { - assert_eq!(transliterate_to_kana("cis _a cis _a"), "チサ チサ") -} - -#[test] -fn test_linking_i() { - assert_eq!(transliterate_to_kana("oar _isam"), 
"オアリサㇺ") -} - -#[test] -fn test_linking_u() { - assert_eq!(transliterate_to_kana("or _un"), "オルン") -} - -#[test] -fn test_linking_e() { - assert_eq!(transliterate_to_kana("mat _etun"), "マテトゥン") -} - -#[test] -fn test_linking_o() { - assert_eq!(transliterate_to_kana("pet _or _un"), "ペトルン") -} - -#[test] -fn test_linking_and_dropping_a() { +fn test_linking() { + assert_eq!(transliterate_to_kana("_hine", &Default::default()), "イネ"); + assert_eq!(transliterate_to_kana("_ya?", &Default::default()), "ア?"); + assert_eq!( + transliterate_to_kana("hawean __hi", &Default::default()), + "ハウェアニ" + ); + assert_eq!( + transliterate_to_kana("nankor __ya?", &Default::default()), + "ナンコラ?" + ); + assert_eq!( + transliterate_to_kana("cis _a cis _a", &Default::default()), + "チサ チサ" + ); + assert_eq!( + transliterate_to_kana("oar _isam", &Default::default()), + "オアリサㇺ" + ); + assert_eq!( + transliterate_to_kana("or _un", &Default::default()), + "オルン" + ); + assert_eq!( + transliterate_to_kana("mat _etun", &Default::default()), + "マテトゥン" + ); assert_eq!( - transliterate_to_kana("yaypuri ekira __ani"), + transliterate_to_kana("pet _or _un", &Default::default()), + "ペトルン" + ); + assert_eq!( + transliterate_to_kana("yaypuri ekira __ani", &Default::default()), "ヤイプリ エキラニ" - ) -} - -#[test] -fn test_linking_and_dropping_i() { - assert_eq!(transliterate_to_kana("puni __i"), "プニ") -} - -#[test] -fn test_linking_and_dropping_u() { - assert_eq!(transliterate_to_kana("a=kotanu __un"), "アコタヌン") -} - -#[test] -fn test_linking_and_dropping_e() { + ); + assert_eq!( + transliterate_to_kana("puni __i", &Default::default()), + "プニ" + ); assert_eq!( - transliterate_to_kana("i=samake __en anu"), + transliterate_to_kana("a=kotanu __un", &Default::default()), + "アコタヌン" + ); + assert_eq!( + transliterate_to_kana("i=samake __en anu", &Default::default()), "イサマケン アヌ" - ) -} - -#[test] -fn test_linking_and_dropping_o() { + ); // 実例なし。 - assert_eq!(transliterate_to_kana("sapporo __or"), "サッポロㇿ") -} - -#[test] 
-fn test_linking_r_n() { - assert_eq!(transliterate_to_kana("a=kor_ nispa"), "アコン ニㇱパ") -} - -#[test] -fn test_linking_r_r() { - assert_eq!(transliterate_to_kana("kor_ rusuy"), "コン ルスイ") -} - -#[test] -fn test_linking_r_t() { - assert_eq!(transliterate_to_kana("or_ ta"), "オッ タ") -} - -#[test] -fn test_linking_r_c() { - assert_eq!(transliterate_to_kana("yar_ cise"), "ヤッ チセ") -} - -#[test] -fn test_linking_n_s() { - assert_eq!(transliterate_to_kana("pon_ su"), "ポイ ス") -} - -#[test] -fn test_linking_n_y() { - assert_eq!(transliterate_to_kana("pon_ yam"), "ポイ ヤㇺ") -} - -#[test] -fn test_linking_n_w() { - assert_eq!(transliterate_to_kana("san _wa"), "サン マ") -} - -#[test] -fn test_linking_m_w() { - assert_eq!(transliterate_to_kana("isam _wa"), "イサン マ") -} - -#[test] -fn test_linking_p_w() { - assert_eq!(transliterate_to_kana("sap _wa"), "サッ パ") + assert_eq!( + transliterate_to_kana("sapporo __or", &Default::default()), + "サッポロㇿ" + ); + assert_eq!( + transliterate_to_kana("a=kor_ nispa", &Default::default()), + "アコン ニㇱパ" + ); + assert_eq!( + transliterate_to_kana("kor_ rusuy", &Default::default()), + "コン ルスイ" + ); + assert_eq!( + transliterate_to_kana("or_ ta", &Default::default()), + "オッ タ" + ); + assert_eq!( + transliterate_to_kana("yar_ cise", &Default::default()), + "ヤッ チセ" + ); + assert_eq!( + transliterate_to_kana("pon_ su", &Default::default()), + "ポイ ス" + ); + assert_eq!( + transliterate_to_kana("pon_ yam", &Default::default()), + "ポイ ヤㇺ" + ); + assert_eq!( + transliterate_to_kana("san _wa", &Default::default()), + "サン マ" + ); + assert_eq!( + transliterate_to_kana("isam _wa", &Default::default()), + "イサン マ" + ); + assert_eq!( + transliterate_to_kana("sap _wa", &Default::default()), + "サッ パ" + ); } #[test] -fn test_special_mp() { - assert_eq!(transliterate_to_kana("tampaku"), "タンパク") +fn test_special_consonant_clusters() { + assert_eq!( + transliterate_to_kana("tampaku", &Default::default()), + "タンパク" + ); + assert_eq!(transliterate_to_kana("umma", 
&Default::default()), "ウンマ"); + assert_eq!( + transliterate_to_kana("kamuyyukar", &Default::default()), + "カムイユカㇻ" + ); + assert_eq!( + transliterate_to_kana("eawwo", &Default::default()), + "エアウウォ" + ); } #[test] -fn test_special_mm() { - assert_eq!(transliterate_to_kana("umma"), "ウンマ") +fn test_symbols() { + assert_eq!( + transliterate_to_kana("“pirka” sekor a=ye", &Default::default()), + "「ピㇼカ」 セコㇿ アイェ" + ) } #[test] -fn test_symbols() { +fn test_sakhalin_ainu() { assert_eq!( - transliterate_to_kana("“pirka” sekor a=ye"), - "「ピㇼカ」 セコㇿ アイェ" + transliterate_to_kana("ah ih uh eh oh", &Default::default()), + "アㇵ イㇶ ウㇷ エㇸ オㇹ" ) } #[test] fn test_k_prefix() { assert_eq!( - transliterate_to_kana("irankarapte. kani anak IMO k=e easkay kur ku=ne."), - "イランカラㇷ゚テ。 カニ アナㇰ イモ ケ エアㇱカイ クㇽ クネ。" + transliterate_to_kana("ku=ne ruwe ne", &Default::default()), + "クネ ルウェ ネ" ) } #[test] fn test_diacritics() { - assert_eq!(transliterate_to_kana("kamúy"), "カムイ") + assert_eq!( + transliterate_to_kana("kamúy", &Default::default()), + "カムイ" + ); + assert_eq!( + transliterate_to_kana("hioy'oy", &Default::default()), + "ヒオイオイ" + ); } #[test] -fn test_yy_and_ww() { - assert_eq!(transliterate_to_kana("kamuyyukar"), "カムイユカㇻ"); - assert_eq!(transliterate_to_kana("eawwo"), "エアウウォ"); +fn test_halfwidth() { + let options = TransliterateToKanaOptions { + whitespace: Whitespace::Halfwidth, + ..Default::default() + }; + assert_eq!( + transliterate_to_kana("ku=iki kusu ne na", &options), + "クイキ クス ネ ナ" + ) } #[test] -fn test_glottal_stop() { - assert_eq!(transliterate_to_kana("hioy'oy"), "ヒオイオイ"); -} +fn test_ignore_pattern() { + let options = TransliterateToKanaOptions { + ignore_pattern: Some(IgnorePattern::new("^[A-Z]+$").unwrap()), + ..Default::default() + }; -#[test] -fn test_rollback() { assert_eq!( - transliterate_to_kana("Copyright Mojang AB. iteki eymek yan!"), - "Copyright Mojang AB. イテキ エイメㇰ ヤン!" + transliterate_to_kana("JOHN ku=ne wa.", &options), + "JOHN クネ ワ。" ) }