diff --git a/.editorconfig b/.editorconfig
index b01b0fa..1cf8dfb 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -3,8 +3,7 @@
 # Top-most EditorConfig file
 root = true
 
-# Python files
-[*.py]
+[*.{rs,py}]
 indent_style = space
 indent_size = 4
 end_of_line = lf
@@ -12,7 +11,7 @@ charset = utf-8
 trim_trailing_whitespace = true
 insert_final_newline = true
 
-[*.{rs,js}]
+[*.js]
 indent_style = space
 indent_size = 2
 end_of_line = lf
diff --git a/ainu-utils-js/Cargo.toml b/ainu-utils-js/Cargo.toml
index d7ff8ac..4aaf8d0 100644
--- a/ainu-utils-js/Cargo.toml
+++ b/ainu-utils-js/Cargo.toml
@@ -13,3 +13,5 @@ crate-type = ["cdylib", "rlib"]
 ainu-utils = { path = "../ainu-utils" }
 wasm-bindgen = "0.2.114"
 js-sys = "0.3"
+serde = { version = "1.0.228", features = ["derive"] }
+serde-wasm-bindgen = "0.6.5"
diff --git a/ainu-utils-js/src/lib.rs b/ainu-utils-js/src/lib.rs
index 18cdae5..bff1c7b 100644
--- a/ainu-utils-js/src/lib.rs
+++ b/ainu-utils-js/src/lib.rs
@@ -1,19 +1,38 @@
-use ainu_utils::{kana, numbers, syllables, tokenizer};
-use js_sys::Reflect;
+use ainu_utils::{
+    kana, numbers, syllables,
+    tokens::{self, TokenizeOptions},
+};
+use serde::Deserialize;
 use wasm_bindgen::prelude::*;
 
+#[derive(Deserialize, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct TokenizeOptionsJs {
+    keep_whitespace: Option<bool>,
+}
+
+impl From<TokenizeOptionsJs> for TokenizeOptions {
+    fn from(value: TokenizeOptionsJs) -> Self {
+        let defaults = TokenizeOptions::default();
+        Self {
+            keep_whitespace: value.keep_whitespace.unwrap_or(defaults.keep_whitespace),
+        }
+    }
+}
+
 #[wasm_bindgen]
-pub fn tokenize(text: &str, options: JsValue) -> Vec<String> {
-    let keep_whitespace = Reflect::get(&options, &JsValue::from_str("keepWhitespace"))
-        .ok()
-        .and_then(|v| v.as_bool())
-        .unwrap_or(false);
-    tokenizer::tokenize(text, keep_whitespace)
+pub fn tokenize(text: &str, options: Option<JsValue>) -> Result<Vec<String>, JsError> {
+    let tokenize_options: TokenizeOptions = options
+        .map(serde_wasm_bindgen::from_value::<TokenizeOptionsJs>)
+        .transpose()?
+        .map(Into::into)
+        .unwrap_or_default();
+    Ok(tokens::tokenize(text, &tokenize_options))
 }
 
 #[wasm_bindgen]
 pub fn syllabicate(text: &str) -> Vec<String> {
-    syllables::parse(text)
+    syllables::syllabicate(text)
 }
 
 #[wasm_bindgen(js_name = transliterateToKana)]
diff --git a/ainu-utils-js/tests/index.spec.js b/ainu-utils-js/tests/index.spec.js
index 5b934ff..8d65ad0 100644
--- a/ainu-utils-js/tests/index.spec.js
+++ b/ainu-utils-js/tests/index.spec.js
@@ -1,11 +1,17 @@
-import { test, expect } from "vitest";
+import { test, expect, describe } from "vitest";
 import { tokenize, transliterateToKana } from "../dist/index.js";
 
-test("tokenize", () => {
-  const tokens = tokenize("irankarapte. e=iwanke ya?", { keepWhitespace: false });
-  expect(tokens).toEqual(["irankarapte", ".", "e=", "iwanke", "ya", "?"]);
-});
+describe("tokenize", () => {
+  test("defaults", () => {
+    const tokens = tokenize("irankarapte. e=iwanke ya?");
+    expect(tokens).toEqual(["irankarapte", ".", "e=", "iwanke", "ya", "?"]);
+  });
+  test("keep whitespace", () => {
+    const tokens = tokenize("irankarapte. e=iwanke ya?", { keepWhitespace: true });
+    expect(tokens).toEqual(["irankarapte", ".", " ", "e=", "iwanke", " ", "ya", "?"]);
+  });
+});
 
 test("transliterateToKana", () => {
   const tokens = transliterateToKana("irankarapte. e=iwanke ya?");
diff --git a/ainu-utils-python/src/lib.rs b/ainu-utils-python/src/lib.rs
index ffb6010..e2487a2 100644
--- a/ainu-utils-python/src/lib.rs
+++ b/ainu-utils-python/src/lib.rs
@@ -1,11 +1,16 @@
 extern crate ainu_utils as ainu_utils_rust;
 
+use ainu_utils_rust::tokens::TokenizeOptions;
 use pyo3::prelude::*;
 
 #[pyfunction]
-#[pyo3(signature = (text, *, keep_whitespace = false))]
-fn tokenize(text: &str, keep_whitespace: bool) -> Vec<String> {
-    ainu_utils_rust::tokenizer::tokenize(text, keep_whitespace)
+#[pyo3(signature = (text, *, keep_whitespace = None))]
+fn tokenize(text: &str, keep_whitespace: Option<bool>) -> Vec<String> {
+    let tokenize_options_default = TokenizeOptions::default();
+    let tokenize_options = TokenizeOptions {
+        keep_whitespace: keep_whitespace.unwrap_or(tokenize_options_default.keep_whitespace),
+    };
+    ainu_utils_rust::tokens::tokenize(text, &tokenize_options)
 }
 
 #[pyfunction]
@@ -20,7 +25,7 @@ fn number_to_words(input: i32) -> String {
 
 #[pyfunction]
 fn syllabicate(text: &str) -> Vec<String> {
-    ainu_utils_rust::syllables::parse(text)
+    ainu_utils_rust::syllables::syllabicate(text)
 }
 
 #[pymodule]
diff --git a/ainu-utils-python/tests/test_tokenize.py b/ainu-utils-python/tests/test_tokenize.py
index c5e4828..e94e64e 100644
--- a/ainu-utils-python/tests/test_tokenize.py
+++ b/ainu-utils-python/tests/test_tokenize.py
@@ -2,7 +2,7 @@
 
 
 def test_tokenize():
-    result = ainu_utils.tokenize("irankarapte. e=iwanke ya?", keep_whitespace=False)
+    result = ainu_utils.tokenize("irankarapte. e=iwanke ya?")
     assert result == ["irankarapte", ".", "e=", "iwanke", "ya", "?"]
 
 
diff --git a/ainu-utils/src/lib.rs b/ainu-utils/src/lib.rs
index 48183e6..792b77f 100644
--- a/ainu-utils/src/lib.rs
+++ b/ainu-utils/src/lib.rs
@@ -2,4 +2,4 @@ pub mod kana;
 pub mod numbers;
 pub mod phoneme;
 pub mod syllables;
-pub mod tokenizer;
+pub mod tokens;
diff --git a/ainu-utils/src/syllables/syllables.rs b/ainu-utils/src/syllables/syllables.rs
index 15db401..318a2c2 100644
--- a/ainu-utils/src/syllables/syllables.rs
+++ b/ainu-utils/src/syllables/syllables.rs
@@ -1,6 +1,6 @@
 use crate::phoneme::Phoneme;
 
-pub fn parse(input: &str) -> Vec<String> {
+pub fn syllabicate(input: &str) -> Vec<String> {
     let chars: Vec<char> = input.chars().collect();
     let mut syllables = vec![];
 
diff --git a/ainu-utils/src/syllables/syllables_test.rs b/ainu-utils/src/syllables/syllables_test.rs
index 02bb6a5..8a6f078 100644
--- a/ainu-utils/src/syllables/syllables_test.rs
+++ b/ainu-utils/src/syllables/syllables_test.rs
@@ -1,22 +1,23 @@
-use super::syllables::parse;
+use super::syllables::syllabicate;
 
 #[test]
-fn it_parses() {
-    assert_eq!(parse("pirka"), ["pir", "ka"]);
-    assert_eq!(parse("cikappo"), ["ci", "kap", "po"]);
-    assert_eq!(parse("aep"), ["a", "ep"]);
+fn it_syllabicates() {
+    assert_eq!(syllabicate("pirka"), ["pir", "ka"]);
+    assert_eq!(syllabicate("cikappo"), ["ci", "kap", "po"]);
+    assert_eq!(syllabicate("aep"), ["a", "ep"]);
     assert_eq!(
-        parse("eyaykosiramsuypa"),
+        syllabicate("eyaykosiramsuypa"),
         ["e", "yay", "ko", "si", "ram", "suy", "pa"]
     );
     assert_eq!(
-        parse("eci=koyayrayke p ne na!"),
-        ["e", "ci", "=", "ko", "yay", "ray", "ke", " ", "p", " ", "ne", " ", "na", "!"]
+        syllabicate("eci=koyayrayke p ne na!"),
+        [
+            "e", "ci", "=", "ko", "yay", "ray", "ke", " ", "p", " ", "ne", " ", "na", "!"
+        ]
     )
 }
 
-
 #[test]
 fn it_handles_accent_symbols_as_well() {
-    assert_eq!(parse("káni"), ["ká", "ni"])
-}
\ No newline at end of file
+    assert_eq!(syllabicate("káni"), ["ká", "ni"])
+}
diff --git a/ainu-utils/src/tokenizer/mod.rs b/ainu-utils/src/tokenizer/mod.rs
deleted file mode 100644
index 25a3310..0000000
--- a/ainu-utils/src/tokenizer/mod.rs
+++ /dev/null
@@ -1,10 +0,0 @@
-mod tokenizer;
-mod unfix;
-
-pub use self::tokenizer::*;
-
-#[cfg(test)]
-mod tokenizer_test;
-
-#[cfg(test)]
-mod unfix_test;
diff --git a/ainu-utils/src/tokens/mod.rs b/ainu-utils/src/tokens/mod.rs
new file mode 100644
index 0000000..b99e174
--- /dev/null
+++ b/ainu-utils/src/tokens/mod.rs
@@ -0,0 +1,10 @@
+mod tokenize;
+mod unfix;
+
+pub use self::tokenize::*;
+
+#[cfg(test)]
+mod tokenize_test;
+
+#[cfg(test)]
+mod unfix_test;
diff --git a/ainu-utils/src/tokenizer/tokenizer.rs b/ainu-utils/src/tokens/tokenize.rs
similarity index 75%
rename from ainu-utils/src/tokenizer/tokenizer.rs
rename to ainu-utils/src/tokens/tokenize.rs
index 6fb29a2..f294108 100644
--- a/ainu-utils/src/tokenizer/tokenizer.rs
+++ b/ainu-utils/src/tokens/tokenize.rs
@@ -1,6 +1,11 @@
-use crate::tokenizer::unfix::unfix;
+use super::unfix::unfix;
 
-pub fn tokenize(text: &str, keep_whitespace: bool) -> Vec<String> {
+#[derive(Default)]
+pub struct TokenizeOptions {
+    pub keep_whitespace: bool,
+}
+
+pub fn tokenize(text: &str, options: &TokenizeOptions) -> Vec<String> {
     let mut words = Vec::new();
     let mut word = String::new();
 
@@ -21,7 +26,7 @@ pub fn tokenize(text: &str, keep_whitespace: bool) -> Vec<String> {
             words.push(c.to_string());
         }
 
-        if c.is_whitespace() && keep_whitespace {
+        if c.is_whitespace() && options.keep_whitespace {
             words.push(c.to_string());
         }
     }
diff --git a/ainu-utils/src/tokenizer/tokenizer_test.rs b/ainu-utils/src/tokens/tokenize_test.rs
similarity index 78%
rename from ainu-utils/src/tokenizer/tokenizer_test.rs
rename to ainu-utils/src/tokens/tokenize_test.rs
index 08e737e..ead2022 100644
--- a/ainu-utils/src/tokenizer/tokenizer_test.rs
+++ b/ainu-utils/src/tokens/tokenize_test.rs
@@ -1,9 +1,11 @@
-use super::tokenizer::tokenize;
+use crate::tokens::TokenizeOptions;
+
+use super::tokenize::tokenize;
 
 #[test]
 fn test_tokenize() {
     let text = "irankarapte! eyami yak a=ye aeywankep ku=kar wa k=an.";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
 
     assert_eq!(
         tokens,
@@ -28,7 +30,7 @@
 #[test]
 fn test_tokenize_suffix() {
     let text = "soyenpa=an wa sinot=an ro!";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
 
     assert_eq!(
         tokens,
@@ -39,7 +41,7 @@
 #[test]
 fn test_sentence_does_not_end_with_period() {
     let text = "a=nukar hike i=yaykohaytare i=yaypokaste wa iki pe";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
 
     assert_eq!(
         tokens,
@@ -61,7 +63,7 @@
 #[test]
 fn test_sentence_ending_with_a_fixed_word() {
     let text = "neno a=ye itak pirka a=ye itak i=koynu wa ... i=konu wa i=kore";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
 
     assert_eq!(
         tokens,
@@ -75,7 +77,7 @@
 #[test]
 fn test_parse_numbers() {
     let text = "1000 yen ku=kor";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
 
     assert_eq!(tokens, vec!["1000", "yen", "ku=", "kor"]);
 }
@@ -83,14 +85,14 @@
 #[test]
 fn test_handles_hyphen_within_word() {
     let text = "cep-koyki wa e";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
     assert_eq!(tokens, vec!["cep-koyki", "wa", "e"]);
 }
 
 #[test]
 fn test_handles_double_prefixes() {
     let text = "niwen seta ne kusu a=e=kupa na.";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
     assert_eq!(
         tokens,
         vec!["niwen", "seta", "ne", "kusu", "a=", "e=", "kupa", "na", "."]
@@ -100,11 +102,11 @@
 #[test]
 fn test_handles_glottal_stop() {
     let text = "ku=kor irwak'utari";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
     assert_eq!(tokens, vec!["ku=", "kor", "irwak'utari"]);
 
     let text = "'ku=kor rusuy!' sekor hawean";
-    let tokens = tokenize(text, false);
+    let tokens = tokenize(text, &Default::default());
     assert_eq!(
         tokens,
         vec!["'", "ku=", "kor", "rusuy", "!", "'", "sekor", "hawean"]
@@ -114,7 +116,10 @@
 #[test]
 fn test_keep_whitespace() {
     let text = "irankarapte. tanto sirpirka ne.";
-    let tokens = tokenize(text, true);
+    let options = TokenizeOptions {
+        keep_whitespace: true,
+    };
+    let tokens = tokenize(text, &options);
     assert_eq!(
         tokens,
         vec![
diff --git a/ainu-utils/src/tokenizer/unfix.rs b/ainu-utils/src/tokens/unfix.rs
similarity index 100%
rename from ainu-utils/src/tokenizer/unfix.rs
rename to ainu-utils/src/tokens/unfix.rs
diff --git a/ainu-utils/src/tokenizer/unfix_test.rs b/ainu-utils/src/tokens/unfix_test.rs
similarity index 100%
rename from ainu-utils/src/tokenizer/unfix_test.rs
rename to ainu-utils/src/tokens/unfix_test.rs
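
For reference, a minimal sketch of calling the reworked Rust API, assuming the `tokens` module layout introduced in this change; the inputs and expected outputs mirror the tests above rather than anything new.

// Sketch only: `keep_whitespace` is the sole option today, so
// `TokenizeOptions::default()` reproduces the old `tokenize(text, false)`.
use ainu_utils::syllables::syllabicate;
use ainu_utils::tokens::{tokenize, TokenizeOptions};

fn main() {
    // Defaults drop whitespace and split affixes such as e= and ku=.
    let tokens = tokenize("irankarapte. e=iwanke ya?", &TokenizeOptions::default());
    assert_eq!(tokens, ["irankarapte", ".", "e=", "iwanke", "ya", "?"]);

    // Opt in to whitespace tokens through the options struct.
    let options = TokenizeOptions {
        keep_whitespace: true,
    };
    let tokens = tokenize("irankarapte. e=iwanke ya?", &options);
    assert_eq!(
        tokens,
        ["irankarapte", ".", " ", "e=", "iwanke", " ", "ya", "?"]
    );

    // syllables::parse is renamed to syllables::syllabicate.
    assert_eq!(syllabicate("pirka"), ["pir", "ka"]);
}

The plain-struct options type keeps the JS and Python bindings thin: each front end only has to map its native optional argument onto `TokenizeOptions` and forward a reference.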