5 changes: 2 additions & 3 deletions .editorconfig
@@ -3,16 +3,15 @@
# Top-most EditorConfig file
root = true

# Python files
[*.py]
[*.{rs,py}]
indent_style = space
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.{rs,js}]
[*.js]
indent_style = space
indent_size = 2
end_of_line = lf
2 changes: 2 additions & 0 deletions ainu-utils-js/Cargo.toml
@@ -13,3 +13,5 @@ crate-type = ["cdylib", "rlib"]
ainu-utils = { path = "../ainu-utils" }
wasm-bindgen = "0.2.114"
js-sys = "0.3"
serde = { version = "1.0.228", features = ["derive"] }
serde-wasm-bindgen = "0.6.5"
37 changes: 28 additions & 9 deletions ainu-utils-js/src/lib.rs
@@ -1,19 +1,38 @@
use ainu_utils::{kana, numbers, syllables, tokenizer};
use js_sys::Reflect;
use ainu_utils::{
    kana, numbers, syllables,
    tokens::{self, TokenizeOptions},
};
use serde::Deserialize;
use wasm_bindgen::prelude::*;

#[derive(Deserialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct TokenizeOptionsJs {
    keep_whitespace: Option<bool>,
}

impl From<TokenizeOptionsJs> for TokenizeOptions {
    fn from(value: TokenizeOptionsJs) -> Self {
        let defaults = TokenizeOptions::default();
        Self {
            keep_whitespace: value.keep_whitespace.unwrap_or(defaults.keep_whitespace),
        }
    }
}

#[wasm_bindgen]
pub fn tokenize(text: &str, options: JsValue) -> Vec<String> {
    let keep_whitespace = Reflect::get(&options, &JsValue::from_str("keepWhitespace"))
        .ok()
        .and_then(|v| v.as_bool())
        .unwrap_or(false);
    tokenizer::tokenize(text, keep_whitespace)
pub fn tokenize(text: &str, options: Option<JsValue>) -> Result<Vec<String>, JsError> {
    let tokenize_options: TokenizeOptions = options
        .map(serde_wasm_bindgen::from_value::<TokenizeOptionsJs>)
        .transpose()?
        .map(Into::into)
        .unwrap_or_default();
    Ok(tokens::tokenize(text, &tokenize_options))
}

#[wasm_bindgen]
pub fn syllabicate(text: &str) -> Vec<String> {
    syllables::parse(text)
    syllables::syllabicate(text)
}

#[wasm_bindgen(js_name = transliterateToKana)]
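The binding now deserializes an optional plain-JS object via serde-wasm-bindgen instead of probing fields with js_sys::Reflect, so a malformed options value surfaces as a JsError rather than being silently ignored. A minimal sketch of the default-merging behaviour, assuming it sits inside this module where the private field is visible (the asserted value just reflects the derived Default):

// Hypothetical in-module check: an absent field falls back to
// TokenizeOptions::default() through the From impl above.
let partial = TokenizeOptionsJs { keep_whitespace: None };
let options: TokenizeOptions = partial.into();
assert_eq!(options.keep_whitespace, false); // derived Default is false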
16 changes: 11 additions & 5 deletions ainu-utils-js/tests/index.spec.js
@@ -1,11 +1,17 @@
import { test, expect } from "vitest";
import { test, expect, describe } from "vitest";
import { tokenize, transliterateToKana } from "../dist/index.js";

test("tokenize", () => {
const tokens = tokenize("irankarapte. e=iwanke ya?", { keepWhitespace: false });
expect(tokens).toEqual(["irankarapte", ".", "e=", "iwanke", "ya", "?"]);
});
describe("tokenize", () => {
test("defaults", () => {
const tokens = tokenize("irankarapte. e=iwanke ya?");
expect(tokens).toEqual(["irankarapte", ".", "e=", "iwanke", "ya", "?"]);
});

test("keep whitespace", () => {
const tokens = tokenize("irankarapte. e=iwanke ya?", { keepWhitespace: true });
expect(tokens).toEqual(["irankarapte", ".", " ", "e=", "iwanke", " ", "ya", "?"]);
});
});

test("transliterateToKana", () => {
const tokens = transliterateToKana("irankarapte. e=iwanke ya?");
13 changes: 9 additions & 4 deletions ainu-utils-python/src/lib.rs
@@ -1,11 +1,16 @@
extern crate ainu_utils as ainu_utils_rust;

use ainu_utils_rust::tokens::TokenizeOptions;
use pyo3::prelude::*;

#[pyfunction]
#[pyo3(signature = (text, *, keep_whitespace = false))]
fn tokenize(text: &str, keep_whitespace: bool) -> Vec<String> {
    ainu_utils_rust::tokenizer::tokenize(text, keep_whitespace)
#[pyo3(signature = (text, *, keep_whitespace = None))]
fn tokenize(text: &str, keep_whitespace: Option<bool>) -> Vec<String> {
    let tokenize_options_default = TokenizeOptions::default();
    let tokenize_options = TokenizeOptions {
        keep_whitespace: keep_whitespace.unwrap_or(tokenize_options_default.keep_whitespace),
    };
    ainu_utils_rust::tokens::tokenize(text, &tokenize_options)
}

#[pyfunction]
@@ -20,7 +25,7 @@ fn number_to_words(input: i32) -> String {

#[pyfunction]
fn syllabicate(text: &str) -> Vec<String> {
    ainu_utils_rust::syllables::parse(text)
    ainu_utils_rust::syllables::syllabicate(text)
}

#[pymodule]
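Both the wasm and PyO3 bindings now funnel into the same core call. For reference, the equivalent direct Rust usage is sketched below; the expected tokens mirror the keepWhitespace test above:

use ainu_utils::tokens::{tokenize, TokenizeOptions};

let options = TokenizeOptions { keep_whitespace: true };
let tokens = tokenize("irankarapte. e=iwanke ya?", &options);
// Whitespace survives as its own tokens:
// ["irankarapte", ".", " ", "e=", "iwanke", " ", "ya", "?"]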
2 changes: 1 addition & 1 deletion ainu-utils-python/tests/test_tokenize.py
@@ -2,7 +2,7 @@


def test_tokenize():
    result = ainu_utils.tokenize("irankarapte. e=iwanke ya?", keep_whitespace=False)
    result = ainu_utils.tokenize("irankarapte. e=iwanke ya?")
    assert result == ["irankarapte", ".", "e=", "iwanke", "ya", "?"]


2 changes: 1 addition & 1 deletion ainu-utils/src/lib.rs
@@ -2,4 +2,4 @@ pub mod kana;
pub mod numbers;
pub mod phoneme;
pub mod syllables;
pub mod tokenizer;
pub mod tokens;
2 changes: 1 addition & 1 deletion ainu-utils/src/syllables/syllables.rs
@@ -1,6 +1,6 @@
use crate::phoneme::Phoneme;

pub fn parse(input: &str) -> Vec<String> {
pub fn syllabicate(input: &str) -> Vec<String> {
    let chars: Vec<char> = input.chars().collect();

    let mut syllables = vec![];
23 changes: 12 additions & 11 deletions ainu-utils/src/syllables/syllables_test.rs
@@ -1,22 +1,23 @@
use super::syllables::parse;
use super::syllables::syllabicate;

#[test]
fn it_parses() {
    assert_eq!(parse("pirka"), ["pir", "ka"]);
    assert_eq!(parse("cikappo"), ["ci", "kap", "po"]);
    assert_eq!(parse("aep"), ["a", "ep"]);
fn it_syllabicates() {
    assert_eq!(syllabicate("pirka"), ["pir", "ka"]);
    assert_eq!(syllabicate("cikappo"), ["ci", "kap", "po"]);
    assert_eq!(syllabicate("aep"), ["a", "ep"]);
    assert_eq!(
        parse("eyaykosiramsuypa"),
        syllabicate("eyaykosiramsuypa"),
        ["e", "yay", "ko", "si", "ram", "suy", "pa"]
    );
    assert_eq!(
        parse("eci=koyayrayke p ne na!"),
        ["e", "ci", "=", "ko", "yay", "ray", "ke", " ", "p", " ", "ne", " ", "na", "!"]
        syllabicate("eci=koyayrayke p ne na!"),
        [
            "e", "ci", "=", "ko", "yay", "ray", "ke", " ", "p", " ", "ne", " ", "na", "!"
        ]
    )
}


#[test]
fn it_handles_accent_symbols_as_well() {
    assert_eq!(parse("káni"), ["ká", "ni"])
}
    assert_eq!(syllabicate("káni"), ["ká", "ni"])
}
10 changes: 0 additions & 10 deletions ainu-utils/src/tokenizer/mod.rs

This file was deleted.

10 changes: 10 additions & 0 deletions ainu-utils/src/tokens/mod.rs
@@ -0,0 +1,10 @@
mod tokenize;
mod unfix;

pub use self::tokenize::*;

#[cfg(test)]
mod tokenize_test;

#[cfg(test)]
mod unfix_test;
ainu-utils/src/tokens/tokenize.rs
@@ -1,6 +1,11 @@
use crate::tokenizer::unfix::unfix;
use super::unfix::unfix;

pub fn tokenize(text: &str, keep_whitespace: bool) -> Vec<String> {
#[derive(Default)]
pub struct TokenizeOptions {
    pub keep_whitespace: bool,
}

pub fn tokenize(text: &str, options: &TokenizeOptions) -> Vec<String> {
    let mut words = Vec::new();
    let mut word = String::new();

@@ -21,7 +26,7 @@ pub fn tokenize(text: &str, keep_whitespace: bool) -> Vec<String> {
            words.push(c.to_string());
        }

        if c.is_whitespace() && keep_whitespace {
        if c.is_whitespace() && options.keep_whitespace {
            words.push(c.to_string());
        }
    }
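Because TokenizeOptions derives Default, callers that want stock behaviour can pass &Default::default(), as the updated tests below do. A small sketch of the two calling styles; the struct update syntax is illustrative future-proofing, not required while keep_whitespace is the only field:

// Stock behaviour, matching the old tokenize(text, false):
let tokens = tokenize(text, &TokenizeOptions::default());

// Opt in to one flag; ..Default::default() keeps any future fields at defaults.
let options = TokenizeOptions { keep_whitespace: true, ..Default::default() };
let tokens_ws = tokenize(text, &options);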
ainu-utils/src/tokens/tokenize_test.rs
@@ -1,9 +1,11 @@
use super::tokenizer::tokenize;
use crate::tokens::TokenizeOptions;

use super::tokenize::tokenize;

#[test]
fn test_tokenize() {
    let text = "irankarapte! eyami yak a=ye aeywankep ku=kar wa k=an.";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());

    assert_eq!(
        tokens,
@@ -28,7 +30,7 @@ fn test_tokenize() {
#[test]
fn test_tokenize_suffix() {
    let text = "soyenpa=an wa sinot=an ro!";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());

    assert_eq!(
        tokens,
@@ -39,7 +41,7 @@ fn test_tokenize_suffix() {
#[test]
fn test_sentence_does_not_end_with_period() {
    let text = "a=nukar hike i=yaykohaytare i=yaypokaste wa iki pe";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());

    assert_eq!(
        tokens,
@@ -61,7 +63,7 @@ fn test_sentence_does_not_end_with_period() {
#[test]
fn test_sentence_ending_with_a_fixed_word() {
    let text = "neno a=ye itak pirka a=ye itak i=koynu wa ... i=konu wa i=kore";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());

    assert_eq!(
        tokens,
@@ -75,22 +77,22 @@ fn test_sentence_ending_with_a_fixed_word() {
#[test]
fn test_parse_numbers() {
    let text = "1000 yen ku=kor";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());

    assert_eq!(tokens, vec!["1000", "yen", "ku=", "kor"]);
}

#[test]
fn test_handles_hyphen_within_word() {
    let text = "cep-koyki wa e";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());
    assert_eq!(tokens, vec!["cep-koyki", "wa", "e"]);
}

#[test]
fn test_handles_double_prefixes() {
    let text = "niwen seta ne kusu a=e=kupa na.";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());
    assert_eq!(
        tokens,
        vec!["niwen", "seta", "ne", "kusu", "a=", "e=", "kupa", "na", "."]
@@ -100,11 +102,11 @@ fn test_handles_double_prefixes() {
#[test]
fn test_handles_glottal_stop() {
    let text = "ku=kor irwak'utari";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());
    assert_eq!(tokens, vec!["ku=", "kor", "irwak'utari"]);

    let text = "'ku=kor rusuy!' sekor hawean";
    let tokens = tokenize(text, false);
    let tokens = tokenize(text, &Default::default());
    assert_eq!(
        tokens,
        vec!["'", "ku=", "kor", "rusuy", "!", "'", "sekor", "hawean"]
@@ -114,7 +116,10 @@ fn test_handles_glottal_stop() {
#[test]
fn test_keep_whitespace() {
    let text = "irankarapte. tanto sirpirka ne.";
    let tokens = tokenize(text, true);
    let options = TokenizeOptions {
        keep_whitespace: true,
    };
    let tokens = tokenize(text, &options);
    assert_eq!(
        tokens,
        vec![
File renamed without changes.