Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,13 @@ Converts Ainu text written in Latin script to Kana.
**Python:**

```py
from ainu_utils import transliterate_to_kana
from ainu_utils import transliterate_to_kana, Whitespace

transliterate_to_kana("irankarapte. e=iwanke ya?")
transliterate_to_kana(
"irankarapte. e=iwanke ya?",
whitespace=Whitespace.Fullwidth,
ignore_pattern=None,
)
# => "イランカラㇷ゚テ。 エイワンケ ヤ?"
```

Expand All @@ -82,7 +86,10 @@ transliterate_to_kana("irankarapte. e=iwanke ya?")
```js
import { transliterateToKana } from "ainu-utils";

transliterateToKana("irankarapte. e=iwanke ya?");
transliterateToKana("irankarapte. e=iwanke ya?", {
whitespace: "fullwidth",
ignorePattern: null,
});
// => "イランカラㇷ゚テ。 エイワンケ ヤ?"
```

Expand Down
56 changes: 53 additions & 3 deletions ainu-utils-js/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use ainu_utils::{
kana, numbers, syllables,
kana::{self, IgnorePattern, IgnorePatternError, TransliterateToKanaOptions, Whitespace},
numbers, syllables,
tokens::{self, TokenizeOptions},
};
use serde::Deserialize;
Expand Down Expand Up @@ -35,9 +36,58 @@ pub fn syllabicate(text: &str) -> Vec<String> {
syllables::syllabicate(text)
}

/// Options object accepted by the JavaScript `transliterateToKana` binding.
///
/// Deserialized from a plain JS value. Because of `rename_all = "camelCase"`,
/// the JS-side keys are `whitespace` and `ignorePattern`. Both are optional.
#[derive(Deserialize, Default)]
#[serde(rename_all = "camelCase")]
pub struct TransliterateToKanaOptionsJs {
/// Separator name: `"fullwidth"` or `"halfwidth"`. Any other string is
/// rejected when this struct is converted into `TransliterateToKanaOptions`.
whitespace: Option<String>,
/// Pattern source passed to `IgnorePattern::new` during conversion.
ignore_pattern: Option<String>,
}

impl TryFrom<TransliterateToKanaOptionsJs> for TransliterateToKanaOptions {
type Error = JsError;

fn try_from(value: TransliterateToKanaOptionsJs) -> Result<Self, Self::Error> {
let defaults = TransliterateToKanaOptions::default();

let whitespace = match value.whitespace {
Some(whitespace) => match whitespace.as_str() {
"fullwidth" => Ok(Some(Whitespace::Fullwidth)),
"halfwidth" => Ok(Some(Whitespace::Halfwidth)),
_ => Err(JsError::new(&format!("Invalid whitespace: {}", whitespace))),
},
_ => Ok(None),
}?;

let ignore_pattern = value
.ignore_pattern
.map(|ignore_pattern| IgnorePattern::new(&ignore_pattern))
.transpose()
.map_err(|err| match err {
IgnorePatternError::InvalidPattern => JsError::new("Invalid pattern provided"),
})?;

let value = Self {
whitespace: whitespace.unwrap_or(defaults.whitespace),
ignore_pattern,
};

Ok(value)
}
}

#[wasm_bindgen(js_name = transliterateToKana)]
pub fn transliterate_to_kana(text: &str) -> String {
kana::transliterate_to_kana(text)
pub fn transliterate_to_kana(text: &str, options: Option<JsValue>) -> Result<String, JsError> {
let transliterate_to_kana_options = options
.map(serde_wasm_bindgen::from_value::<TransliterateToKanaOptionsJs>)
.transpose()?
.map(TryInto::<TransliterateToKanaOptions>::try_into)
.transpose()?
.unwrap_or_default();

Ok(kana::transliterate_to_kana(
text,
&transliterate_to_kana_options,
))
}

#[wasm_bindgen(js_name = numberToWords)]
Expand Down
38 changes: 35 additions & 3 deletions ainu-utils-js/tests/index.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,39 @@ describe("tokenize", () => {
});
});

test("transliterateToKana", () => {
const tokens = transliterateToKana("irankarapte. e=iwanke ya?");
expect(tokens).toBe("イランカラㇷ゚テ。 エイワンケ ヤ?");
// Test suite for the `transliterateToKana` binding: the call without options,
// the `ignorePattern` and `whitespace` options, and the errors thrown when
// either option carries an invalid value.
describe("transliterateToKana", () => {
// No options object is passed at all.
test("defaults", () => {
const tokens = transliterateToKana("irankarapte. e=iwanke ya?");
expect(tokens).toBe("イランカラㇷ゚テ。 エイワンケ ヤ?");
});

// A word matching the pattern ("JOHN") is expected back unchanged.
test("ignore pattern", () => {
const tokens = transliterateToKana("JOHN ku=ne.", {
ignorePattern: "^[A-Z]+$"
});
expect(tokens).toBe("JOHN クネ。");
});

// "[" is an unterminated character class, so pattern compilation must fail.
test("throws for an invalid ignore pattern", () => {
expect(() => {
transliterateToKana("JOHN ku=ne.", {
ignorePattern: "[",
})
}).toThrowError("Invalid pattern provided");
});

// Explicitly selects the halfwidth word separator.
test("whitespace", () => {
const tokens = transliterateToKana("onne paskur ine?", {
whitespace: "halfwidth",
});
expect(tokens).toBe("オンネ パㇱクㇽ イネ?");
});

// Only "fullwidth" and "halfwidth" are accepted; the offending value is
// echoed in the error message.
test("throws for an invalid whitespace", () => {
expect(() => {
transliterateToKana("irankarapte", {
whitespace: "xxx",
})
}).toThrowError("Invalid whitespace: xxx");
});
});
9 changes: 7 additions & 2 deletions ainu-utils-python/ainu_utils.pyi
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# https://www.maturin.rs/project_layout#adding-python-type-information
def tokenize(text: str, *, keep_whitespace: bool = False) -> list[str]: ...
def to_kana(text: str) -> str: ...
def tokenize(text: str, *, keep_whitespace: bool | None = None) -> list[str]: ...
def to_kana(
text: str,
*,
whitespace: Whitespace | None = None,
ignore_pattern: str | None = None,
) -> str: ...
def number_to_words(number: int) -> str: ...
def syllabicate(text: str) -> list[str]: ...
44 changes: 40 additions & 4 deletions ainu-utils-python/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
extern crate ainu_utils as ainu_utils_rust;

use ainu_utils_rust::tokens::TokenizeOptions;
use pyo3::prelude::*;
use ainu_utils_rust::{
kana::{IgnorePattern, IgnorePatternError, TransliterateToKanaOptions},
tokens::TokenizeOptions,
};
use pyo3::{exceptions::PyValueError, prelude::*};

#[pyfunction]
#[pyo3(signature = (text, *, keep_whitespace = None))]
Expand All @@ -13,9 +16,41 @@ fn tokenize(text: &str, keep_whitespace: Option<bool>) -> Vec<String> {
ainu_utils_rust::tokens::tokenize(text, &tokenize_options)
}

/// Python-visible selector for the word separator used by
/// `transliterate_to_kana`.
///
/// Each variant is mapped onto the variant of the same name in
/// `ainu_utils_rust::kana::Whitespace` before the core function is called.
#[pyclass(eq, from_py_object)]
#[derive(PartialEq, Clone)]
pub enum Whitespace {
/// Maps to `ainu_utils_rust::kana::Whitespace::Fullwidth`.
Fullwidth,
/// Maps to `ainu_utils_rust::kana::Whitespace::Halfwidth`.
Halfwidth,
}

#[pyfunction]
fn transliterate_to_kana(text: &str) -> String {
ainu_utils_rust::kana::transliterate_to_kana(text)
/// Transliterates Latin-script Ainu text to kana.
///
/// * `whitespace` - word separator to use; when omitted, the core library's
///   default is used.
/// * `ignore_pattern` - optional pattern compiled with `IgnorePattern::new`.
///
/// # Errors
///
/// Raises `ValueError` when `ignore_pattern` cannot be compiled.
#[pyo3(signature = (text, *, whitespace = None, ignore_pattern = None))]
fn transliterate_to_kana(
    text: &str,
    whitespace: Option<Whitespace>,
    ignore_pattern: Option<&str>,
) -> Result<String, PyErr> {
    let defaults = TransliterateToKanaOptions::default();

    // Translate the Python-facing enum into the core enum, falling back to
    // the core default when the caller did not choose one.
    let whitespace = match whitespace {
        Some(Whitespace::Fullwidth) => ainu_utils_rust::kana::Whitespace::Fullwidth,
        Some(Whitespace::Halfwidth) => ainu_utils_rust::kana::Whitespace::Halfwidth,
        None => defaults.whitespace,
    };

    // Same wording as the JavaScript binding's error for an invalid pattern.
    let ignore_pattern = ignore_pattern
        .map(IgnorePattern::new)
        .transpose()
        .map_err(|e| match e {
            IgnorePatternError::InvalidPattern => PyValueError::new_err("Invalid pattern provided"),
        })?;

    let options = TransliterateToKanaOptions {
        ignore_pattern,
        whitespace,
    };

    Ok(ainu_utils_rust::kana::transliterate_to_kana(text, &options))
}

#[pyfunction]
Expand All @@ -34,6 +69,7 @@ fn ainu_utils(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(transliterate_to_kana, m)?)?;
m.add_function(wrap_pyfunction!(number_to_words, m)?)?;
m.add_function(wrap_pyfunction!(syllabicate, m)?)?;
m.add_class::<Whitespace>()?;
m.add("test_number", 123)?;
Ok(())
}
12 changes: 12 additions & 0 deletions ainu-utils-python/tests/test_transliterate_to_kana.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,15 @@
def test_transliterate_to_kana():
result = ainu_utils.transliterate_to_kana("irankarapte. e=iwanke ya?")
assert result == "イランカラㇷ゚テ。 エイワンケ ヤ?"


def test_transliterate_to_kana_halfwidth():
    """Selecting ``Whitespace.Halfwidth`` yields the expected kana string."""
    separator = ainu_utils.Whitespace.Halfwidth
    kana = ainu_utils.transliterate_to_kana(
        "irankarapte. e=iwanke ya?",
        whitespace=separator,
    )
    assert kana == "イランカラㇷ゚テ。 エイワンケ ヤ?"


def test_transliterate_to_kana_ignore():
    """A word matching ``ignore_pattern`` is returned as-is; the rest is converted."""
    kana = ainu_utils.transliterate_to_kana(
        "JOHN ku=ne.",
        ignore_pattern="^[A-Z]+$",
    )
    assert kana == "JOHN クネ。"
4 changes: 2 additions & 2 deletions ainu-utils/examples/kana.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use ainu_utils::kana::to_kana;
use ainu_utils::kana::transliterate_to_kana;
use std::env;

fn main() {
let args: Vec<String> = env::args().collect();
let text = &args[1];

let kana = to_kana(text);
let kana = transliterate_to_kana(text, &Default::default());

println!("{}", kana);
}
4 changes: 2 additions & 2 deletions ainu-utils/examples/syllables.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use ainu_utils::syllables::parse;
use ainu_utils::syllables::syllabicate;
use std::env;

fn main() {
let args: Vec<String> = env::args().collect();
let text = &args[1];

let syllables = parse(text);
let syllables = syllabicate(text);

println!("{:?}", syllables);
}
4 changes: 2 additions & 2 deletions ainu-utils/examples/tokenize.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use ainu_utils::tokenizer::tokenize;
use ainu_utils::tokens::tokenize;
use std::env;

fn main() {
let args: Vec<String> = env::args().collect();
let text = &args[1];

let tokens = tokenize(text, false);
let tokens = tokenize(text, &Default::default());

println!("{:?}", tokens);
}
70 changes: 58 additions & 12 deletions ainu-utils/src/kana/kana.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::phoneme::Phoneme;
use regex::Regex;
use unicode_normalization::UnicodeNormalization;
use unicode_normalization::char::is_combining_mark;

Expand All @@ -8,26 +9,71 @@ use super::kana_map_cv::map_cv;
use super::kana_map_punc::map_punc;
use super::kana_map_v::map_v;

pub fn transliterate_to_kana(input: &str) -> String {
/// Separator placed between words in transliterated output.
///
/// The default is [`Whitespace::Fullwidth`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Whitespace {
    /// U+3000 IDEOGRAPHIC SPACE.
    #[default]
    Fullwidth,
    /// U+0020 SPACE.
    Halfwidth,
}

impl Whitespace {
    /// Returns the separator as a static string slice, without allocating.
    pub const fn as_str(self) -> &'static str {
        match self {
            // Written as an escape so the two separators cannot be confused
            // in editors or diffs that render them identically.
            Self::Fullwidth => "\u{3000}",
            Self::Halfwidth => " ",
        }
    }
}

// Implementing `Display` (rather than `ToString` directly) keeps
// `.to_string()` working through the standard blanket impl and also makes the
// type usable with `format!`/`write!`.
impl std::fmt::Display for Whitespace {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

/// Error returned when an ignore pattern cannot be constructed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IgnorePatternError {
    /// The supplied pattern source failed to compile.
    InvalidPattern,
}

impl std::fmt::Display for IgnorePatternError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidPattern => f.write_str("invalid ignore pattern"),
        }
    }
}

// Lets callers box the error or propagate it with `?` into
// `Box<dyn std::error::Error>`.
impl std::error::Error for IgnorePatternError {}

/// A compiled pattern; words it matches are copied to the output without
/// being transliterated.
pub struct IgnorePattern(Regex);

impl IgnorePattern {
    /// Compiles `value` into an ignore pattern.
    ///
    /// # Errors
    ///
    /// Returns [`IgnorePatternError::InvalidPattern`] when `value` is not
    /// accepted by `Regex::new`; the underlying regex error is discarded.
    pub fn new(value: &str) -> Result<Self, IgnorePatternError> {
        match Regex::new(value) {
            Ok(compiled) => Ok(Self(compiled)),
            Err(_) => Err(IgnorePatternError::InvalidPattern),
        }
    }
}

/// Options controlling `transliterate_to_kana`.
///
/// `Default` yields the default `Whitespace` and no ignore pattern.
#[derive(Default)]
pub struct TransliterateToKanaOptions {
/// Separator used when joining the converted words.
pub whitespace: Whitespace,
/// When set, words matching this pattern are emitted unchanged.
pub ignore_pattern: Option<IgnorePattern>,
}

/// Error type associated with kana transliteration.
///
/// NOTE(review): nothing visible here constructs or returns this type;
/// `transliterate_to_kana` returns a plain `String`. Confirm whether it is
/// still needed.
#[derive(Debug)]
pub enum TransliterateToKanaError {
/// Presumably signals an unusable ignore pattern (TODO confirm intent).
InvalidIgnore,
}

pub fn transliterate_to_kana(input: &str, options: &TransliterateToKanaOptions) -> String {
let mut input: String = input.to_string();
input = link(&input);

let words: Vec<&str> = input.split(' ').collect();
let mut output = String::new();
let words_latn: Vec<&str> = input.split(' ').collect();
let mut words_kana: Vec<String> = vec![];

for word in words {
let kana = transliterate_word_to_kana(word);

if kana.chars().any(|c| c.is_ascii_alphabetic()) {
output += word;
} else {
output += &kana;
for word_latn in words_latn {
if let Some(ignore) = &options.ignore_pattern
&& ignore.0.is_match(word_latn)
{
words_kana.push(word_latn.to_string());
continue;
}

output += " ";
let word_kana = transliterate_word_to_kana(word_latn);
words_kana.push(word_kana);
}

output = output.trim_end().to_string();
let output = words_kana.join(&options.whitespace.to_string()).to_string();

output
}
Expand Down
Loading
Loading