Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,49 @@ mod tests {
assert_contains_doc(&hits, DOC_EN);
}

/// An exact-mode search for one Cyrillic word must hit a document whose text
/// holds that word followed by a second, space-separated word.
#[test]
fn cyrillic_term_matches_inside_phrase() {
    const DOC_RU: &str = "doc-ru";

    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_RU, "привет мир", true);

    let found = idx.search_with_mode(INDEX, "привет", SearchMode::Exact);
    assert_contains_doc(&found, DOC_RU);
}

/// A document consisting of a single Cyrillic word must be returned when the
/// same word is searched in exact mode.
#[test]
fn cyrillic_single_word_matches_exact() {
    const DOC_SINGLE: &str = "doc-cyr-single";

    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_SINGLE, "привет", true);

    let found = idx.search_with_mode(INDEX, "привет", SearchMode::Exact);
    assert_contains_doc(&found, DOC_SINGLE);
}

/// A Cyrillic word that is directly followed by a comma in the indexed text
/// must still be found by an exact-mode search for the bare word.
#[test]
fn cyrillic_term_matches_with_punctuation() {
    const DOC_PUNCT: &str = "doc-ru-punct";

    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_PUNCT, "привет, привет", true);

    let found = idx.search_with_mode(INDEX, "привет", SearchMode::Exact);
    assert_contains_doc(&found, DOC_PUNCT);
}

/// When Latin and Cyrillic letters are glued together without a separator,
/// each half must be findable on its own through an exact-mode search.
#[test]
fn latin_and_cyrillic_boundary_is_searchable() {
    const DOC_MIXED: &str = "doc-latin-cyr";

    let mut idx = InMemoryIndex::default();
    idx.add_doc(INDEX, DOC_MIXED, "helloпривет", true);

    // Cyrillic half first, then the Latin half — same order as the queries
    // were originally issued.
    let cyrillic_found = idx.search_with_mode(INDEX, "привет", SearchMode::Exact);
    assert_contains_doc(&cyrillic_found, DOC_MIXED);

    let latin_found = idx.search_with_mode(INDEX, "hello", SearchMode::Exact);
    assert_contains_doc(&latin_found, DOC_MIXED);
}

#[test]
fn fuzzy_search_allows_alphanumeric_terms() {
let mut index = InMemoryIndex::default();
Expand Down
38 changes: 38 additions & 0 deletions src/tokenizer/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,39 @@ impl DefaultTextNormalizer {
})
}

/// Splits `raw` into maximal runs of word characters and emits one token per run.
///
/// A character belongs to a run when it is alphanumeric or when
/// `is_combining_mark` accepts it; every other character is treated as a
/// separator and dropped. Each run is passed to `Self::normalize_span` along
/// with its byte offset (`base_start` plus the run's byte offset inside `raw`)
/// and `script`. Runs for which that call yields `None` are skipped; the rest
/// are forwarded to `Self::push_token` together with `out` and `seen`.
fn normalize_unicode_split(
    raw: &str,
    base_start: usize,
    script: SegmentScript,
    out: &mut Vec<TokenWithScript>,
    seen: &mut HashSet<(String, usize, usize)>,
) {
    let is_word_char = |c: char| c.is_alphanumeric() || is_combining_mark(c);

    // Byte offset in `raw` from which the next run is searched.
    let mut cursor = 0;

    while let Some(skipped) = raw[cursor..].find(is_word_char) {
        let start = cursor + skipped;

        // The run ends at the first separator after `start`, or at the end of
        // the input when no separator follows.
        let end = raw[start..]
            .find(|c: char| !is_word_char(c))
            .map_or(raw.len(), |run_len| start + run_len);

        if let Some(token) = Self::normalize_span(&raw[start..end], base_start + start, script) {
            Self::push_token(out, seen, token);
        }

        cursor = end;
    }
}

fn normalize_text(raw: &str, base_start: usize) -> (String, Vec<usize>) {
let mut normalized = String::new();
let mut mapping = Vec::new();
Expand Down Expand Up @@ -252,6 +285,11 @@ impl TextNormalizer for DefaultTextNormalizer {
return;
}

if matches!(script, SegmentScript::Other) {
Self::normalize_unicode_split(raw, base_start, script, out, seen);
return;
}

if !raw.chars().any(|c| c.is_alphanumeric()) {
return;
}
Expand Down
Loading