diff --git a/src/lib.rs b/src/lib.rs
index 77ffbd1..25d35f5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -181,6 +181,49 @@ mod tests {
         assert_contains_doc(&hits, DOC_EN);
     }
 
+    #[test]
+    fn cyrillic_term_matches_inside_phrase() {
+        let mut index = InMemoryIndex::default();
+        let doc_id = "doc-ru";
+        index.add_doc(INDEX, doc_id, "привет мир", true);
+        // Query the first word of the two-word Cyrillic text in Exact mode.
+        let hits = index.search_with_mode(INDEX, "привет", SearchMode::Exact);
+        assert_contains_doc(&hits, doc_id);
+    }
+
+    #[test]
+    fn cyrillic_single_word_matches_exact() {
+        let mut index = InMemoryIndex::default();
+        let doc_id = "doc-cyr-single";
+        index.add_doc(INDEX, doc_id, "привет", true);
+        // The query string is identical to the whole indexed text.
+        let hits = index.search_with_mode(INDEX, "привет", SearchMode::Exact);
+        assert_contains_doc(&hits, doc_id);
+    }
+
+    #[test]
+    fn cyrillic_term_matches_with_punctuation() {
+        let mut index = InMemoryIndex::default();
+        let doc_id = "doc-ru-punct";
+        index.add_doc(INDEX, doc_id, "привет, привет", true);
+        // The indexed text repeats the word with a comma directly after its first occurrence.
+        let hits = index.search_with_mode(INDEX, "привет", SearchMode::Exact);
+        assert_contains_doc(&hits, doc_id);
+    }
+
+    #[test]
+    fn latin_and_cyrillic_boundary_is_searchable() {
+        let mut index = InMemoryIndex::default();
+        let doc_id = "doc-latin-cyr";
+        index.add_doc(INDEX, doc_id, "helloпривет", true);
+        // The Latin and Cyrillic words are adjacent with no separator; search the Cyrillic part.
+        let cyr_hits = index.search_with_mode(INDEX, "привет", SearchMode::Exact);
+        assert_contains_doc(&cyr_hits, doc_id);
+        // The Latin part of the same unseparated text is searched on its own as well.
+        let latin_hits = index.search_with_mode(INDEX, "hello", SearchMode::Exact);
+        assert_contains_doc(&latin_hits, doc_id);
+    }
+
     #[test]
     fn fuzzy_search_allows_alphanumeric_terms() {
         let mut index = InMemoryIndex::default();
diff --git a/src/tokenizer/normalize.rs b/src/tokenizer/normalize.rs
index b0b73d9..0213474 100644
--- a/src/tokenizer/normalize.rs
+++ b/src/tokenizer/normalize.rs
@@ -206,6 +206,39 @@ impl DefaultTextNormalizer {
         })
     }
 
+    fn normalize_unicode_split( // walks `raw` and handles each alphanumeric/combining-mark run
+        raw: &str, // text that is scanned; runs are sliced out of it by byte range
+        base_start: usize, // added to a run's byte offset in `raw` before calling normalize_span
+        script: SegmentScript, // forwarded unchanged to normalize_span for every run
+        out: &mut Vec, // NOTE(review): element type is missing here, possibly lost in extraction - confirm
+        seen: &mut HashSet<(String, usize, usize)>, // handed to push_token together with `out`
+    ) {
+        let chars: Vec<(usize, char)> = raw.char_indices().collect();
+        let len = chars.len();
+        let mut i = 0;
+        // `i` is a position in `chars` (a character index), not a byte offset into `raw`.
+        while i < len {
+            while i < len && !(chars[i].1.is_alphanumeric() || is_combining_mark(chars[i].1)) {
+                i += 1;
+            }
+            if i >= len {
+                break; // nothing but skipped (non-run) characters remained
+            }
+            // A run starts here: consume consecutive alphanumeric or combining-mark characters.
+            let start_char = i;
+            while i < len && (chars[i].1.is_alphanumeric() || is_combining_mark(chars[i].1)) {
+                i += 1;
+            }
+            // Map character positions back to byte offsets; a run reaching the end stops at raw.len().
+            let start = chars[start_char].0;
+            let end = if i < len { chars[i].0 } else { raw.len() };
+            // Only runs for which normalize_span returns Some are passed on to push_token.
+            if let Some(token) = Self::normalize_span(&raw[start..end], base_start + start, script) {
+                Self::push_token(out, seen, token);
+            }
+        }
+    }
+
     fn normalize_text(raw: &str, base_start: usize) -> (String, Vec) {
         let mut normalized = String::new();
         let mut mapping = Vec::new();
@@ -252,6 +285,11 @@ impl TextNormalizer for DefaultTextNormalizer {
             return;
         }
 
+        if matches!(script, SegmentScript::Other) { // Other-script segments take the run-splitting path
+            Self::normalize_unicode_split(raw, base_start, script, out, seen);
+            return; // the statements after this block are not reached for Other segments
+        }
+
         if !raw.chars().any(|c| c.is_alphanumeric()) {
             return;
         }