llmlingua/tokenizer_jp.py (117 additions, 0 deletions)
@@ -0,0 +1,117 @@
"""
Japanese tokenizer for LLMLingua using fugashi + unidic-lite.
Provides tokenization that preserves sentence structure and punctuation.
"""

import re
from typing import List, Tuple

try:
import fugashi
import unidic_lite # noqa: F401 - required by fugashi
except ImportError:
raise ImportError(
"Japanese tokenization requires fugashi and unidic-lite. "
"Install with: pip install fugashi unidic-lite"
)


class JapaneseTokenizer:
"""Japanese text tokenizer using fugashi with unidic-lite dictionary."""

def __init__(self):
"""Initialize the Japanese tokenizer."""
self.tagger = fugashi.Tagger()

def tokenize(self, text: str) -> List[str]:
"""
Tokenize Japanese text into words.

Args:
text: Input Japanese text

Returns:
List of tokenized words
"""
if not text or not text.strip():
return []

# Parse with fugashi
words = self.tagger(text)

# Extract surface forms and filter empty tokens
tokens = [word.surface for word in words if word.surface.strip()]

return tokens

def tokenize_with_pos(self, text: str) -> List[Tuple[str, str]]:
"""
Tokenize Japanese text with part-of-speech information.

Args:
text: Input Japanese text

Returns:
List of (token, pos) tuples
"""
if not text or not text.strip():
return []

words = self.tagger(text)
tokens_with_pos = [
(word.surface, word.pos) for word in words if word.surface.strip()
]

return tokens_with_pos


def tokenize_jp(text: str, preserve_punctuation: bool = True) -> str:
"""
Tokenize Japanese text and return space-separated string.

Args:
text: Input Japanese text
preserve_punctuation: Whether to preserve punctuation marks

Returns:
Space-separated tokenized text
"""
tokenizer = JapaneseTokenizer()
tokens = tokenizer.tokenize(text)

if not preserve_punctuation:
# Remove punctuation tokens
tokens = [token for token in tokens if not re.match(r"^[^\w\s]+$", token)]

return " ".join(tokens)


def is_japanese_text(text: str, threshold: float = 0.3) -> bool:
"""
Detect if text contains Japanese characters.

Args:
text: Input text to check
threshold: Minimum ratio of Japanese characters to consider as Japanese

Returns:
True if text is likely Japanese
"""
if not text:
return False

# Japanese character ranges
hiragana = "\u3040-\u309f"
katakana = "\u30a0-\u30ff"
kanji = "\u4e00-\u9faf"
jp_chars = f"[{hiragana}{katakana}{kanji}]"

# Count Japanese characters
jp_char_count = len(re.findall(jp_chars, text))
total_chars = len(text.strip())

if total_chars == 0:
return False

ratio = jp_char_count / total_chars
return ratio >= threshold
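
Taken together, the module exposes one class and two helpers. For reviewers who want to try it locally, here is a minimal usage sketch (assuming the fugashi and unidic-lite wheels are installed; the tokenized output in the comments is illustrative, since exact boundaries depend on the dictionary version):

# Minimal usage sketch for the new module; output shown in comments is illustrative.
from llmlingua.tokenizer_jp import JapaneseTokenizer, tokenize_jp, is_japanese_text

# All eight characters fall in the hiragana/katakana ranges, so the ratio is 1.0 (>= 0.3).
print(is_japanese_text("これはテストです"))  # True

# Space-separated tokenization; boundaries come from unidic-lite, e.g. "私 は 学生 です 。"
print(tokenize_jp("私は学生です。"))

# preserve_punctuation=False drops tokens made entirely of non-word characters, e.g. 。
print(tokenize_jp("私は学生です。", preserve_punctuation=False))

# Part-of-speech per token via fugashi's `pos` attribute.
tokenizer = JapaneseTokenizer()
for surface, pos in tokenizer.tokenize_with_pos("花が咲く"):
    print(surface, pos)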
setup.py (3 additions, 0 deletions)
@@ -29,6 +29,8 @@
"tiktoken",
"nltk",
"numpy",
"fugashi>=1.2.0",
"unidic-lite>=1.0.8",
]
QUANLITY_REQUIRES = [
"black==21.4b0",
@@ -62,6 +64,7 @@
    extras_require={
        "dev": DEV_REQUIRES,
        "quality": QUANLITY_REQUIRES,
        "ja": ["fugashi>=1.2.0", "unidic-lite>=1.0.8"],
    },
    install_requires=INSTALL_REQUIRES,
    include_package_data=True,
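
With this packaging change the Japanese dependencies install with the base package, and pip install llmlingua[ja] selects the same pins explicitly. A quick post-install sanity check, sketched here as a hypothetical snippet rather than anything in the PR:

# Hypothetical sanity check that the new pins resolved correctly.
import fugashi

tagger = fugashi.Tagger()  # unidic-lite ships its dictionary inside the wheel, so no download step
print([word.surface for word in tagger("形態素解析")])  # e.g. ['形態素', '解析']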
tests/test_tokenizer_jp.py (160 additions, 0 deletions)
@@ -0,0 +1,160 @@
"""
Tests for Japanese tokenizer functionality.
"""

import pytest
from llmlingua.tokenizer_jp import JapaneseTokenizer, tokenize_jp, is_japanese_text


class TestJapaneseTokenizer:
"""Test cases for JapaneseTokenizer class."""

def test_tokenizer_initialization(self):
"""Test tokenizer can be initialized."""
tokenizer = JapaneseTokenizer()
assert tokenizer is not None
assert hasattr(tokenizer, "tagger")

def test_tokenize_basic_japanese(self):
"""Test basic Japanese tokenization."""
tokenizer = JapaneseTokenizer()
text = "私は学生です。"
tokens = tokenizer.tokenize(text)

assert isinstance(tokens, list)
assert len(tokens) > 0
assert all(isinstance(token, str) for token in tokens)

def test_tokenize_with_pos(self):
"""Test tokenization with part-of-speech information."""
tokenizer = JapaneseTokenizer()
text = "美しい花が咲いています。"
tokens_with_pos = tokenizer.tokenize_with_pos(text)

assert isinstance(tokens_with_pos, list)
assert len(tokens_with_pos) > 0
assert all(
isinstance(item, tuple) and len(item) == 2 for item in tokens_with_pos
)

def test_empty_text(self):
"""Test handling of empty text."""
tokenizer = JapaneseTokenizer()

assert tokenizer.tokenize("") == []
assert tokenizer.tokenize(" ") == []
assert tokenizer.tokenize_with_pos("") == []
assert tokenizer.tokenize_with_pos(" ") == []


class TestTokenizeJp:
"""Test cases for tokenize_jp function."""

def test_basic_tokenization(self):
"""Test basic tokenize_jp functionality."""
text = "今日は良い天気ですね。"
result = tokenize_jp(text)

assert isinstance(result, str)
assert len(result) > 0
assert " " in result # Should be space-separated

    def test_preserve_punctuation(self):
        """Test punctuation preservation."""
        text = "こんにちは!元気ですか?"

        # With punctuation (the text uses full-width !and ?, so check those)
        result_with = tokenize_jp(text, preserve_punctuation=True)
        assert "!" in result_with or "?" in result_with

        # Without punctuation
        result_without = tokenize_jp(text, preserve_punctuation=False)
        assert "!" not in result_without and "?" not in result_without

def test_mixed_text(self):
"""Test mixed Japanese and English text."""
text = "Hello 世界!This is a test."
result = tokenize_jp(text)

assert isinstance(result, str)
assert len(result) > 0

def test_empty_input(self):
"""Test empty input handling."""
assert tokenize_jp("") == ""
assert tokenize_jp(" ") == ""


class TestIsJapaneseText:
"""Test cases for is_japanese_text function."""

def test_pure_japanese(self):
"""Test pure Japanese text detection."""
text = "日本語のテキストです。"
assert is_japanese_text(text) is True

def test_mixed_text(self):
"""Test mixed text detection."""
text = "Hello 世界!This is a test."
# This text has 2 Japanese chars out of 24 total = 0.083 ratio
# With default threshold 0.3, this should be False
assert is_japanese_text(text) is False

def test_english_only(self):
"""Test English-only text."""
text = "This is English text only."
assert is_japanese_text(text) is False

def test_empty_text(self):
"""Test empty text handling."""
assert is_japanese_text("") is False
assert is_japanese_text(" ") is False

def test_custom_threshold(self):
"""Test custom threshold setting."""
text = "Hello 世界" # 2 Japanese chars, 8 total chars = 0.25 ratio

# Default threshold (0.3) should return False
assert is_japanese_text(text) is False

# Lower threshold should return True
assert is_japanese_text(text, threshold=0.2) is True

def test_hiragana_katakana_kanji(self):
"""Test different Japanese character types."""
hiragana = "あいうえお"
katakana = "アイウエオ"
kanji = "漢字"

assert is_japanese_text(hiragana) is True
assert is_japanese_text(katakana) is True
assert is_japanese_text(kanji) is True


@pytest.mark.integration
class TestTokenizerIntegration:
"""Integration tests for tokenizer."""

def test_long_text(self):
"""Test tokenization of longer text."""
text = """
自然言語処理(しぜんげんごしょり、英語: natural language processing、略称: NLP)は、
人間が日常的に使っている自然言語をコンピュータに処理させる一連の技術であり、
人工知能と言語学の一分野である。
"""

result = tokenize_jp(text)
assert isinstance(result, str)
assert len(result) > 0

# Should preserve sentence structure
tokens = result.split()
assert len(tokens) > 10 # Should have multiple tokens

def test_special_characters(self):
"""Test handling of special characters."""
text = "「引用」や(括弧)など、様々な記号を含むテキスト。"
result = tokenize_jp(text)

assert isinstance(result, str)
assert len(result) > 0
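
One behavioral note: because llmlingua/tokenizer_jp.py raises ImportError at import time when fugashi is absent, this test module errors during pytest collection rather than skipping in environments without the ja dependencies. A guard along these lines (a sketch using pytest's standard importorskip, not part of the PR) would skip cleanly instead; the integration mark would likewise need registering in pytest configuration to avoid an unknown-mark warning.

# Sketch: skip this whole test module when the Japanese deps are missing,
# instead of erroring during pytest collection.
import pytest

pytest.importorskip("fugashi", reason="requires the `ja` extra")

from llmlingua.tokenizer_jp import JapaneseTokenizer, tokenize_jp, is_japanese_text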