From 953e5a07ec416fe33dbf9364995a651237c0ce6a Mon Sep 17 00:00:00 2001
From: NaokiKimura
Date: Sun, 13 Jul 2025 07:50:23 +0900
Subject: [PATCH] Add Japanese tokenizer (fugashi) and minimal unit test

---
 llmlingua/tokenizer_jp.py  | 117 +++++++++++++++++++++++++++
 setup.py                   |   3 +
 tests/test_tokenizer_jp.py | 160 +++++++++++++++++++++++++++++++++++++
 3 files changed, 280 insertions(+)
 create mode 100644 llmlingua/tokenizer_jp.py
 create mode 100644 tests/test_tokenizer_jp.py

diff --git a/llmlingua/tokenizer_jp.py b/llmlingua/tokenizer_jp.py
new file mode 100644
index 0000000..c856afe
--- /dev/null
+++ b/llmlingua/tokenizer_jp.py
@@ -0,0 +1,117 @@
+"""
+Japanese tokenizer for LLMLingua using fugashi + unidic-lite.
+Provides tokenization that preserves sentence structure and punctuation.
+"""
+
+import re
+from typing import List, Tuple
+
+try:
+    import fugashi
+    import unidic_lite  # noqa: F401 - required by fugashi
+except ImportError:
+    raise ImportError(
+        "Japanese tokenization requires fugashi and unidic-lite. "
+        "Install with: pip install fugashi unidic-lite"
+    )
+
+
+class JapaneseTokenizer:
+    """Japanese text tokenizer using fugashi with unidic-lite dictionary."""
+
+    def __init__(self):
+        """Initialize the Japanese tokenizer."""
+        self.tagger = fugashi.Tagger()
+
+    def tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize Japanese text into words.
+
+        Args:
+            text: Input Japanese text
+
+        Returns:
+            List of tokenized words
+        """
+        if not text or not text.strip():
+            return []
+
+        # Parse with fugashi
+        words = self.tagger(text)
+
+        # Extract surface forms and filter empty tokens
+        tokens = [word.surface for word in words if word.surface.strip()]
+
+        return tokens
+
+    def tokenize_with_pos(self, text: str) -> List[Tuple[str, str]]:
+        """
+        Tokenize Japanese text with part-of-speech information.
+
+        Args:
+            text: Input Japanese text
+
+        Returns:
+            List of (token, pos) tuples
+        """
+        if not text or not text.strip():
+            return []
+
+        words = self.tagger(text)
+        tokens_with_pos = [
+            (word.surface, word.pos) for word in words if word.surface.strip()
+        ]
+
+        return tokens_with_pos
+
+
+def tokenize_jp(text: str, preserve_punctuation: bool = True) -> str:
+    """
+    Tokenize Japanese text and return space-separated string.
+
+    Args:
+        text: Input Japanese text
+        preserve_punctuation: Whether to preserve punctuation marks
+
+    Returns:
+        Space-separated tokenized text
+    """
+    tokenizer = JapaneseTokenizer()
+    tokens = tokenizer.tokenize(text)
+
+    if not preserve_punctuation:
+        # Remove punctuation tokens
+        tokens = [token for token in tokens if not re.match(r"^[^\w\s]+$", token)]
+
+    return " ".join(tokens)
+
+
+def is_japanese_text(text: str, threshold: float = 0.3) -> bool:
+    """
+    Detect if text contains Japanese characters.
+
+    Args:
+        text: Input text to check
+        threshold: Minimum ratio of Japanese characters to consider as Japanese
+
+    Returns:
+        True if text is likely Japanese
+    """
+    if not text:
+        return False
+
+    # Japanese character ranges
+    hiragana = "\u3040-\u309f"
+    katakana = "\u30a0-\u30ff"
+    kanji = "\u4e00-\u9faf"
+    jp_chars = f"[{hiragana}{katakana}{kanji}]"
+
+    # Count Japanese characters
+    jp_char_count = len(re.findall(jp_chars, text))
+    total_chars = len(text.strip())
+
+    if total_chars == 0:
+        return False
+
+    ratio = jp_char_count / total_chars
+    return ratio >= threshold
diff --git a/setup.py b/setup.py
index ae0f4fb..4f3bee2 100644
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,8 @@
     "tiktoken",
     "nltk",
     "numpy",
+    "fugashi>=1.2.0",
+    "unidic-lite>=1.0.8",
 ]
 QUANLITY_REQUIRES = [
     "black==21.4b0",
@@ -62,6 +64,7 @@
     extras_require={
         "dev": DEV_REQUIRES,
         "quality": QUANLITY_REQUIRES,
+        "ja": ["fugashi>=1.2.0", "unidic-lite>=1.0.8"],
     },
     install_requires=INSTALL_REQUIRES,
     include_package_data=True,
diff --git a/tests/test_tokenizer_jp.py b/tests/test_tokenizer_jp.py
new file mode 100644
index 0000000..aa17199
--- /dev/null
+++ b/tests/test_tokenizer_jp.py
@@ -0,0 +1,160 @@
+"""
+Tests for Japanese tokenizer functionality.
+"""
+
+import pytest
+from llmlingua.tokenizer_jp import JapaneseTokenizer, tokenize_jp, is_japanese_text
+
+
+class TestJapaneseTokenizer:
+    """Test cases for JapaneseTokenizer class."""
+
+    def test_tokenizer_initialization(self):
+        """Test tokenizer can be initialized."""
+        tokenizer = JapaneseTokenizer()
+        assert tokenizer is not None
+        assert hasattr(tokenizer, "tagger")
+
+    def test_tokenize_basic_japanese(self):
+        """Test basic Japanese tokenization."""
+        tokenizer = JapaneseTokenizer()
+        text = "私は学生です。"
+        tokens = tokenizer.tokenize(text)
+
+        assert isinstance(tokens, list)
+        assert len(tokens) > 0
+        assert all(isinstance(token, str) for token in tokens)
+
+    def test_tokenize_with_pos(self):
+        """Test tokenization with part-of-speech information."""
+        tokenizer = JapaneseTokenizer()
+        text = "美しい花が咲いています。"
+        tokens_with_pos = tokenizer.tokenize_with_pos(text)
+
+        assert isinstance(tokens_with_pos, list)
+        assert len(tokens_with_pos) > 0
+        assert all(
+            isinstance(item, tuple) and len(item) == 2 for item in tokens_with_pos
+        )
+
+    def test_empty_text(self):
+        """Test handling of empty text."""
+        tokenizer = JapaneseTokenizer()
+
+        assert tokenizer.tokenize("") == []
+        assert tokenizer.tokenize(" ") == []
+        assert tokenizer.tokenize_with_pos("") == []
+        assert tokenizer.tokenize_with_pos(" ") == []
+
+
+class TestTokenizeJp:
+    """Test cases for tokenize_jp function."""
+
+    def test_basic_tokenization(self):
+        """Test basic tokenize_jp functionality."""
+        text = "今日は良い天気ですね。"
+        result = tokenize_jp(text)
+
+        assert isinstance(result, str)
+        assert len(result) > 0
+        assert " " in result  # Should be space-separated
+
+    def test_preserve_punctuation(self):
+        """Test punctuation preservation."""
+        text = "こんにちは!元気ですか?"
+
+        # With punctuation
+        result_with = tokenize_jp(text, preserve_punctuation=True)
+        assert "!" in result_with or "?" in result_with
+
+        # Without punctuation
+        result_without = tokenize_jp(text, preserve_punctuation=False)
+        assert "!" not in result_without and "?" not in result_without
+
+    def test_mixed_text(self):
+        """Test mixed Japanese and English text."""
+        text = "Hello 世界!This is a test."
+        result = tokenize_jp(text)
+
+        assert isinstance(result, str)
+        assert len(result) > 0
+
+    def test_empty_input(self):
+        """Test empty input handling."""
+        assert tokenize_jp("") == ""
+        assert tokenize_jp(" ") == ""
+
+
+class TestIsJapaneseText:
+    """Test cases for is_japanese_text function."""
+
+    def test_pure_japanese(self):
+        """Test pure Japanese text detection."""
+        text = "日本語のテキストです。"
+        assert is_japanese_text(text) is True
+
+    def test_mixed_text(self):
+        """Test mixed text detection."""
+        text = "Hello 世界!This is a test."
+        # This text has 2 Japanese chars out of 24 total = 0.083 ratio
+        # With default threshold 0.3, this should be False
+        assert is_japanese_text(text) is False
+
+    def test_english_only(self):
+        """Test English-only text."""
+        text = "This is English text only."
+        assert is_japanese_text(text) is False
+
+    def test_empty_text(self):
+        """Test empty text handling."""
+        assert is_japanese_text("") is False
+        assert is_japanese_text(" ") is False
+
+    def test_custom_threshold(self):
+        """Test custom threshold setting."""
+        text = "Hello 世界"  # 2 Japanese chars, 8 total chars = 0.25 ratio
+
+        # Default threshold (0.3) should return False
+        assert is_japanese_text(text) is False
+
+        # Lower threshold should return True
+        assert is_japanese_text(text, threshold=0.2) is True
+
+    def test_hiragana_katakana_kanji(self):
+        """Test different Japanese character types."""
+        hiragana = "あいうえお"
+        katakana = "アイウエオ"
+        kanji = "漢字"
+
+        assert is_japanese_text(hiragana) is True
+        assert is_japanese_text(katakana) is True
+        assert is_japanese_text(kanji) is True
+
+
+@pytest.mark.integration
+class TestTokenizerIntegration:
+    """Integration tests for tokenizer."""
+
+    def test_long_text(self):
+        """Test tokenization of longer text."""
+        text = """
+        自然言語処理（しぜんげんごしょり、英語: natural language processing、略称: NLP）は、
+        人間が日常的に使っている自然言語をコンピュータに処理させる一連の技術であり、
+        人工知能と言語学の一分野である。
+        """
+
+        result = tokenize_jp(text)
+        assert isinstance(result, str)
+        assert len(result) > 0
+
+        # Should preserve sentence structure
+        tokens = result.split()
+        assert len(tokens) > 10  # Should have multiple tokens
+
+    def test_special_characters(self):
+        """Test handling of special characters."""
+        text = "「引用」や(括弧)など、様々な記号を含むテキスト。"
+        result = tokenize_jp(text)
+
+        assert isinstance(result, str)
+        assert len(result) > 0
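
A minimal usage sketch of the new module, for reviewers only (not part of the patch). It assumes the patch has been applied and that fugashi and unidic-lite are installed; the segmentation shown in the comment is approximate.

    from llmlingua.tokenizer_jp import JapaneseTokenizer, tokenize_jp, is_japanese_text

    text = "私は学生です。"

    # Only pre-tokenize when the input is predominantly Japanese.
    if is_japanese_text(text):
        # Space-separated surface forms, roughly "私 は 学生 です 。"
        print(tokenize_jp(text))

        # Surface/POS pairs from the underlying fugashi tagger.
        tagger = JapaneseTokenizer()
        for surface, pos in tagger.tokenize_with_pos(text):
            print(surface, pos)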