From d68a2384d072387b09985f21d2fdbf747b5aeb39 Mon Sep 17 00:00:00 2001 From: Elie Gambache Date: Tue, 20 Jan 2026 18:46:36 +0200 Subject: [PATCH 1/6] fix: add hallucination blacklist to filter incorrect dictionary expansions Add a blacklist mechanism to filter out LLM-generated dictionary mappings that incorrectly link unrelated Hebrew words. The blacklist is loaded from a TSV resource file for easy modification. This is a temporary workaround until the lexical dictionary itself can be corrected. --- .../search/LuceneSearchEngine.kt | 58 +++++++++++++++++++ .../resources/hallucination_blacklist.tsv | 29 ++++++++++ 2 files changed, 87 insertions(+) create mode 100644 search/src/jvmMain/resources/hallucination_blacklist.tsv diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt index 74b268f..447de30 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt @@ -45,6 +45,58 @@ class LuceneSearchEngine( // Constants for snippet source building (must match indexer) private const val SNIPPET_NEIGHBOR_WINDOW = 4 private const val SNIPPET_MIN_LENGTH = 280 + + /** + * Blacklist of hallucinated dictionary mappings. + * Loaded from resources/hallucination_blacklist.tsv + * Key: normalized token, Value: set of incorrect base forms to reject + * + * TODO: This is a temporary workaround. The proper fix is to correct the + * hallucinated mappings directly in the lexical dictionary (lexical.db) + * so this blacklist becomes unnecessary. 
+ */ + private val HALLUCINATION_BLACKLIST: Map> by lazy { + loadHallucinationBlacklist() + } + + private fun loadHallucinationBlacklist(): Map> { + val result = mutableMapOf>() + try { + val inputStream = LuceneSearchEngine::class.java.getResourceAsStream("/hallucination_blacklist.tsv") + if (inputStream == null) { + logger.w { "hallucination_blacklist.tsv not found in resources" } + return emptyMap() + } + inputStream.bufferedReader().useLines { lines -> + lines.forEach { line -> + val trimmed = line.trim() + if (trimmed.isEmpty() || trimmed.startsWith("#")) return@forEach + val parts = trimmed.split("\t") + if (parts.size >= 2) { + val token = HebrewTextUtils.normalizeHebrew(parts[0]) + val base = HebrewTextUtils.normalizeHebrew(parts[1]) + result.getOrPut(token) { mutableSetOf() }.add(base) + } + } + } + logger.d { "Loaded ${result.size} hallucination blacklist entries" } + } catch (e: Exception) { + logger.e(e) { "Failed to load hallucination blacklist" } + } + return result + } + + /** + * Check if an expansion should be rejected based on the hallucination blacklist. 
+ */ + private fun isHallucinatedExpansion(token: String, expansion: MagicDictionaryIndex.Expansion): Boolean { + val normalizedToken = HebrewTextUtils.normalizeHebrew(token) + val blacklistedBases = HALLUCINATION_BLACKLIST[normalizedToken] ?: return false + return expansion.base.any { base -> + val normalizedBase = HebrewTextUtils.normalizeHebrew(base) + blacklistedBases.contains(normalizedBase) + } + } } // Open Lucene directory lazily to avoid any I/O at app startup @@ -241,10 +293,16 @@ class LuceneSearchEngine( logger.d { "[DEBUG] Analyzed tokens: $analyzedStd" } // Get all possible expansions for each token (a token can belong to multiple bases) + // Filter out hallucinated expansions using the blacklist val tokenExpansionsRaw: Map> = analyzedStd.associateWith { token -> // Get best expansion (prefers matching base, then largest) val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList() + // Check if this expansion is a known hallucination + if (isHallucinatedExpansion(token, expansion)) { + logger.d { "[DEBUG] Token '$token' -> BLOCKED hallucinated expansion: base=${expansion.base}" } + return@associateWith emptyList() + } listOf(expansion) } tokenExpansionsRaw.forEach { (token, exps) -> diff --git a/search/src/jvmMain/resources/hallucination_blacklist.tsv b/search/src/jvmMain/resources/hallucination_blacklist.tsv new file mode 100644 index 0000000..dd146a1 --- /dev/null +++ b/search/src/jvmMain/resources/hallucination_blacklist.tsv @@ -0,0 +1,29 @@ +# Hallucination Blacklist +# Format: tokenbase +# Lines starting with # are comments +# These are LLM-generated mappings that are incorrect and should be ignored +לחתוכ כנ +לחתוך כנ +לבקש רצה +מתפלל בעה +לשבת נתנ +לעבוד עשה +להעמיד קומ +כנ כנ +בנ בנ +לשימ שמ +לשים שמ +לדונ יד +לדון יד +אמ אמ +להרגיע יושב +לחייכ חוה +לחייך חוה +שנ שנ +עצ עצ +דנ דנ +להיכנס עלה +לשקול לקח +לבנות בנ +לשחרר שרה +להתיר שרה From 2901cd9a98568e329421dce1ad87c44a8407a886 Mon Sep 17 00:00:00 2001 From: Elie 
Gambache Date: Tue, 20 Jan 2026 18:49:57 +0200 Subject: [PATCH 2/6] fix: filter hallucinations only for highlighting, not search Keep full dictionary expansions for search to maintain good recall, but filter out hallucinated mappings only when building highlight terms to avoid highlighting unrelated words. --- .../search/LuceneSearchEngine.kt | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt index 447de30..d36ee8a 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt @@ -293,28 +293,33 @@ class LuceneSearchEngine( logger.d { "[DEBUG] Analyzed tokens: $analyzedStd" } // Get all possible expansions for each token (a token can belong to multiple bases) - // Filter out hallucinated expansions using the blacklist - val tokenExpansionsRaw: Map> = + // These expansions are used for SEARCH - we keep all of them for better recall + val tokenExpansions: Map> = analyzedStd.associateWith { token -> // Get best expansion (prefers matching base, then largest) val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList() - // Check if this expansion is a known hallucination - if (isHallucinatedExpansion(token, expansion)) { - logger.d { "[DEBUG] Token '$token' -> BLOCKED hallucinated expansion: base=${expansion.base}" } - return@associateWith emptyList() - } listOf(expansion) } - tokenExpansionsRaw.forEach { (token, exps) -> + tokenExpansions.forEach { (token, exps) -> exps.forEach { exp -> logger.d { "[DEBUG] Token '$token' -> expansion: surface=${exp.surface.take(10)}..., variants=${exp.variants.take(10)}..., base=${exp.base}" } } } - val tokenExpansions: Map> = tokenExpansionsRaw + // 
For HIGHLIGHTING, filter out hallucinated expansions to avoid highlighting unrelated words + val tokenExpansionsForHighlight: Map> = + tokenExpansions.mapValues { (token, exps) -> + exps.filter { exp -> + val isHallucination = isHallucinatedExpansion(token, exp) + if (isHallucination) { + logger.d { "[DEBUG] Token '$token' -> BLOCKED for highlight (hallucination): base=${exp.base}" } + } + !isHallucination + } + } - val allExpansions = tokenExpansions.values.flatten() - val expandedTerms = allExpansions.flatMap { it.surface + it.variants + it.base }.distinct() + val allExpansionsForHighlight = tokenExpansionsForHighlight.values.flatten() + val expandedTerms = allExpansionsForHighlight.flatMap { it.surface + it.variants + it.base }.distinct() // Add 4-gram terms used in the query (matches text_ng4 clauses) so highlighting can // reflect matches that were found via the n-gram branch. val ngramTerms = buildNgramTerms(analyzedStd, gram = 4) From abae21fca2375f1a2b053f0bdaca77d3b6211acc Mon Sep 17 00:00:00 2001 From: Elie Gambache Date: Tue, 20 Jan 2026 18:53:26 +0200 Subject: [PATCH 3/6] fix: exclude 2-letter terms from dictionary expansion highlighting 2-letter words should only be highlighted if they were explicitly written in the query, not when they come from dictionary expansion. 
--- .../seforimlibrary/search/LuceneSearchEngine.kt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt index d36ee8a..e0b2070 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt @@ -319,7 +319,12 @@ class LuceneSearchEngine( } val allExpansionsForHighlight = tokenExpansionsForHighlight.values.flatten() - val expandedTerms = allExpansionsForHighlight.flatMap { it.surface + it.variants + it.base }.distinct() + // Filter out 2-letter terms from dictionary expansions for highlighting + // (2-letter words should only be highlighted if explicitly in the query) + val expandedTerms = allExpansionsForHighlight + .flatMap { it.surface + it.variants + it.base } + .filter { it.length > 2 } + .distinct() // Add 4-gram terms used in the query (matches text_ng4 clauses) so highlighting can // reflect matches that were found via the n-gram branch. val ngramTerms = buildNgramTerms(analyzedStd, gram = 4) From a2907f4b5e0e23539b444e9bf4a60eedd4c2806b Mon Sep 17 00:00:00 2001 From: Elie Gambache Date: Tue, 20 Jan 2026 18:57:49 +0200 Subject: [PATCH 4/6] fix: improve snippet positioning to show best match cluster Instead of centering the snippet around the first occurrence of any anchor term, find the position where the most query terms cluster together. This ensures the snippet shows the most relevant part of the text when multiple query terms appear scattered throughout. 
--- .../search/LuceneSearchEngine.kt | 62 +++++++++++++++++-- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt index e0b2070..6677b0a 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt @@ -749,6 +749,60 @@ class LuceneSearchEngine( .sortedByDescending { it.length } } + /** + * Find the best position to center the snippet around. + * Instead of just finding the first occurrence of the first anchor term, + * this finds where the most anchor terms cluster together. + * Returns (position, length) of the best anchor. + */ + private fun findBestAnchorPosition(plainSearch: String, anchorTerms: List, windowSize: Int): Pair { + if (anchorTerms.isEmpty()) return Pair(0, 0) + + // Find all occurrences of all anchor terms + data class TermOccurrence(val term: String, val position: Int) + val occurrences = mutableListOf() + + for (term in anchorTerms) { + if (term.isEmpty()) continue + var from = 0 + while (from <= plainSearch.length - term.length) { + val idx = plainSearch.indexOf(term, startIndex = from) + if (idx == -1) break + occurrences.add(TermOccurrence(term, idx)) + from = idx + 1 + } + } + + if (occurrences.isEmpty()) return Pair(0, anchorTerms.firstOrNull()?.length ?: 0) + + // Score each occurrence by how many unique terms appear nearby + var bestPosition = occurrences.first().position + var bestLength = occurrences.first().term.length + var bestScore = 0 + + for (occ in occurrences) { + val windowStart = occ.position - windowSize + val windowEnd = occ.position + occ.term.length + windowSize + + // Count unique terms within this window + val termsInWindow = occurrences + .filter { it.position >= windowStart && 
it.position <= windowEnd } + .map { it.term } + .toSet() + + // Score: number of unique terms + bonus for longer anchor term + val score = termsInWindow.size * 100 + occ.term.length + + if (score > bestScore) { + bestScore = score + bestPosition = occ.position + bestLength = occ.term.length + } + } + + return Pair(bestPosition, bestLength) + } + private fun buildSnippetInternal(raw: String, anchorTerms: List, highlightTerms: List, context: Int = 220): String { if (raw.isEmpty()) return "" val (plain, mapToOrig) = HebrewTextUtils.stripDiacriticsWithMap(raw) @@ -756,12 +810,8 @@ class LuceneSearchEngine( val effContext = if (hasDiacritics) maxOf(context, 360) else context val plainSearch = HebrewTextUtils.replaceFinalsWithBase(plain) - val plainIdx = anchorTerms.asSequence().mapNotNull { t -> - val i = plainSearch.indexOf(t) - if (i >= 0) i else null - }.firstOrNull() ?: 0 - - val plainLen = anchorTerms.firstOrNull()?.length ?: 0 + // Find the best position: where most anchor terms cluster together + val (plainIdx, plainLen) = findBestAnchorPosition(plainSearch, anchorTerms, effContext) val plainStart = (plainIdx - effContext).coerceAtLeast(0) val plainEnd = (plainIdx + plainLen + effContext).coerceAtMost(plain.length) val origStart = HebrewTextUtils.mapToOrigIndex(mapToOrig, plainStart) From 2149be98c7f04709af321721aa1b65bb04181d10 Mon Sep 17 00:00:00 2001 From: Elie Gambache Date: Tue, 20 Jan 2026 18:59:26 +0200 Subject: [PATCH 5/6] perf: optimize snippet positioning algorithm - Inline findBestAnchorPosition into buildSnippetInternal - Limit to 5 occurrences per term to bound search space - Early exit when all terms found clustered together - Use simpler data structures (Pair instead of data class) --- .../search/LuceneSearchEngine.kt | 103 ++++++++---------- 1 file changed, 47 insertions(+), 56 deletions(-) diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt 
b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt index 6677b0a..b01dc63 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt @@ -749,60 +749,6 @@ class LuceneSearchEngine( .sortedByDescending { it.length } } - /** - * Find the best position to center the snippet around. - * Instead of just finding the first occurrence of the first anchor term, - * this finds where the most anchor terms cluster together. - * Returns (position, length) of the best anchor. - */ - private fun findBestAnchorPosition(plainSearch: String, anchorTerms: List, windowSize: Int): Pair { - if (anchorTerms.isEmpty()) return Pair(0, 0) - - // Find all occurrences of all anchor terms - data class TermOccurrence(val term: String, val position: Int) - val occurrences = mutableListOf() - - for (term in anchorTerms) { - if (term.isEmpty()) continue - var from = 0 - while (from <= plainSearch.length - term.length) { - val idx = plainSearch.indexOf(term, startIndex = from) - if (idx == -1) break - occurrences.add(TermOccurrence(term, idx)) - from = idx + 1 - } - } - - if (occurrences.isEmpty()) return Pair(0, anchorTerms.firstOrNull()?.length ?: 0) - - // Score each occurrence by how many unique terms appear nearby - var bestPosition = occurrences.first().position - var bestLength = occurrences.first().term.length - var bestScore = 0 - - for (occ in occurrences) { - val windowStart = occ.position - windowSize - val windowEnd = occ.position + occ.term.length + windowSize - - // Count unique terms within this window - val termsInWindow = occurrences - .filter { it.position >= windowStart && it.position <= windowEnd } - .map { it.term } - .toSet() - - // Score: number of unique terms + bonus for longer anchor term - val score = termsInWindow.size * 100 + occ.term.length - - if (score > bestScore) { - bestScore = 
score - bestPosition = occ.position - bestLength = occ.term.length - } - } - - return Pair(bestPosition, bestLength) - } - private fun buildSnippetInternal(raw: String, anchorTerms: List, highlightTerms: List, context: Int = 220): String { if (raw.isEmpty()) return "" val (plain, mapToOrig) = HebrewTextUtils.stripDiacriticsWithMap(raw) @@ -810,8 +756,53 @@ class LuceneSearchEngine( val effContext = if (hasDiacritics) maxOf(context, 360) else context val plainSearch = HebrewTextUtils.replaceFinalsWithBase(plain) - // Find the best position: where most anchor terms cluster together - val (plainIdx, plainLen) = findBestAnchorPosition(plainSearch, anchorTerms, effContext) + // Find best anchor position: where most terms cluster together + // Optimized: limit occurrences per term, early exit on perfect score + var plainIdx = 0 + var plainLen = anchorTerms.firstOrNull()?.length ?: 0 + + if (anchorTerms.isNotEmpty()) { + val maxOccPerTerm = 5 // Limit occurrences per term for perf + val positions = mutableListOf>() // (position, term) + + for (term in anchorTerms) { + if (term.isEmpty()) continue + var from = 0 + var count = 0 + while (from <= plainSearch.length - term.length && count < maxOccPerTerm) { + val idx = plainSearch.indexOf(term, startIndex = from) + if (idx == -1) break + positions.add(idx to term) + from = idx + 1 + count++ + } + } + + if (positions.isNotEmpty()) { + val maxPossibleScore = anchorTerms.size + var bestScore = 0 + + for ((pos, term) in positions) { + // Count unique terms in window around this position + val windowStart = pos - effContext + val windowEnd = pos + term.length + effContext + var uniqueTerms = 0 + val seen = mutableSetOf() + for ((p, t) in positions) { + if (p in windowStart..windowEnd && seen.add(t)) uniqueTerms++ + } + val score = uniqueTerms * 100 + term.length + + if (score > bestScore) { + bestScore = score + plainIdx = pos + plainLen = term.length + // Early exit if we found all terms clustered + if (uniqueTerms >= 
maxPossibleScore) break + } + } + } + } val plainStart = (plainIdx - effContext).coerceAtLeast(0) val plainEnd = (plainIdx + plainLen + effContext).coerceAtMost(plain.length) val origStart = HebrewTextUtils.mapToOrigIndex(mapToOrig, plainStart) From c4091f1abfb3c0603ba139c09772e6591a65c50c Mon Sep 17 00:00:00 2001 From: Elie Gambache Date: Tue, 20 Jan 2026 19:27:50 +0200 Subject: [PATCH 6/6] feat: add buildHighlightTerms for intelligent find-in-page Add public method to build highlight terms with dictionary expansion, filtered for hallucinations and 2-letter terms. This enables find-in-page to highlight the same words as global search. --- .../search/LuceneSearchEngine.kt | 42 +++++++++++++++++++ .../seforimlibrary/search/SearchEngine.kt | 13 ++++++ 2 files changed, 55 insertions(+) diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt index b01dc63..a1e5b8e 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt @@ -195,6 +195,48 @@ class LuceneSearchEngine( return buildSnippetInternal(rawClean, anchorTerms, highlightTerms) } + override fun buildHighlightTerms(query: String): List { + val norm = HebrewTextUtils.normalizeHebrew(query) + if (norm.isBlank()) return emptyList() + + val analyzedRaw = analyzeToTerms(stdAnalyzer, norm) ?: emptyList() + val hasHashem = query.contains("ה׳") || query.contains("ה'") + + // Filter single letters and stop words (same logic as buildSearchContext) + val analyzedStd = analyzedRaw.filter { token -> + if (token == "ה" && hasHashem) return@filter true + if (token.any { it.isDigit() }) return@filter true + token.length >= 2 && token !in setOf( + "א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "כ", "ל", "מ", + "נ", "ס", "ע", "פ", "צ", "ק", 
"ר", "ש", "ת", + ) + } + + // Get dictionary expansions + val tokenExpansions: Map> = + analyzedStd.associateWith { token -> + val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList() + listOf(expansion) + } + + // Filter hallucinations for highlighting + val tokenExpansionsForHighlight = tokenExpansions.mapValues { (token, exps) -> + exps.filter { exp -> !isHallucinatedExpansion(token, exp) } + } + + // Build expanded terms (filter 2-letter from expansions only) + val allExpansionsForHighlight = tokenExpansionsForHighlight.values.flatten() + val expandedTerms = allExpansionsForHighlight + .flatMap { it.surface + it.variants + it.base } + .filter { it.length > 2 } + .distinct() + + val ngramTerms = buildNgramTerms(analyzedStd, gram = 4) + val hashemTerms = if (hasHashem) loadHashemHighlightTerms() else emptyList() + + return filterTermsForHighlight(analyzedStd + expandedTerms + ngramTerms + hashemTerms) + } + override fun close() { // Directory is closed automatically when readers are closed } diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt index 6a4b4ed..2b7b247 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt @@ -82,4 +82,17 @@ interface SearchEngine : Closeable { * @return HTML string with `` tags around matches, possibly with `...` for truncation */ fun buildSnippet(rawText: String, query: String, near: Int): String + + /** + * Builds a list of terms to highlight for a given query, using dictionary expansion. + * + * This is useful for intelligent find-in-page that matches the same words + * as the global search (including synonyms and morphological variants). 
+     * The terms are filtered to exclude hallucinated mappings and short words
+     * that only came from dictionary expansion.
+     *
+     * @param query The search query in Hebrew
+     * @return List of normalized terms to highlight (includes original tokens + expansions)
+     */
+    fun buildHighlightTerms(query: String): List<String>
 }