Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,58 @@ class LuceneSearchEngine(
// Constants for snippet source building (must match indexer)
private const val SNIPPET_NEIGHBOR_WINDOW = 4
private const val SNIPPET_MIN_LENGTH = 280

/**
* Blacklist of hallucinated dictionary mappings.
* Loaded from resources/hallucination_blacklist.tsv
* Key: normalized token, Value: set of incorrect base forms to reject
*
* TODO: This is a temporary workaround. The proper fix is to correct the
* hallucinated mappings directly in the lexical dictionary (lexical.db)
* so this blacklist becomes unnecessary.
*/
private val HALLUCINATION_BLACKLIST: Map<String, Set<String>> by lazy {
loadHallucinationBlacklist()
}

private fun loadHallucinationBlacklist(): Map<String, Set<String>> {
val result = mutableMapOf<String, MutableSet<String>>()
try {
val inputStream = LuceneSearchEngine::class.java.getResourceAsStream("/hallucination_blacklist.tsv")
if (inputStream == null) {
logger.w { "hallucination_blacklist.tsv not found in resources" }
return emptyMap()
}
inputStream.bufferedReader().useLines { lines ->
lines.forEach { line ->
val trimmed = line.trim()
if (trimmed.isEmpty() || trimmed.startsWith("#")) return@forEach
val parts = trimmed.split("\t")
if (parts.size >= 2) {
val token = HebrewTextUtils.normalizeHebrew(parts[0])
val base = HebrewTextUtils.normalizeHebrew(parts[1])
result.getOrPut(token) { mutableSetOf() }.add(base)
}
}
}
logger.d { "Loaded ${result.size} hallucination blacklist entries" }
} catch (e: Exception) {
logger.e(e) { "Failed to load hallucination blacklist" }
}
return result
}

/**
* Check if an expansion should be rejected based on the hallucination blacklist.
*/
private fun isHallucinatedExpansion(token: String, expansion: MagicDictionaryIndex.Expansion): Boolean {
val normalizedToken = HebrewTextUtils.normalizeHebrew(token)
val blacklistedBases = HALLUCINATION_BLACKLIST[normalizedToken] ?: return false
return expansion.base.any { base ->
val normalizedBase = HebrewTextUtils.normalizeHebrew(base)
blacklistedBases.contains(normalizedBase)
}
}
}

// Open Lucene directory lazily to avoid any I/O at app startup
Expand Down Expand Up @@ -143,6 +195,48 @@ class LuceneSearchEngine(
return buildSnippetInternal(rawClean, anchorTerms, highlightTerms)
}

override fun buildHighlightTerms(query: String): List<String> {
val norm = HebrewTextUtils.normalizeHebrew(query)
if (norm.isBlank()) return emptyList()

val analyzedRaw = analyzeToTerms(stdAnalyzer, norm) ?: emptyList()
val hasHashem = query.contains("ה׳") || query.contains("ה'")

// Filter single letters and stop words (same logic as buildSearchContext)
val analyzedStd = analyzedRaw.filter { token ->
if (token == "ה" && hasHashem) return@filter true
if (token.any { it.isDigit() }) return@filter true
token.length >= 2 && token !in setOf(
"א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "כ", "ל", "מ",
"נ", "ס", "ע", "פ", "צ", "ק", "ר", "ש", "ת",
)
}

// Get dictionary expansions
val tokenExpansions: Map<String, List<MagicDictionaryIndex.Expansion>> =
analyzedStd.associateWith { token ->
val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList()
listOf(expansion)
}

// Filter hallucinations for highlighting
val tokenExpansionsForHighlight = tokenExpansions.mapValues { (token, exps) ->
exps.filter { exp -> !isHallucinatedExpansion(token, exp) }
}

// Build expanded terms (filter 2-letter from expansions only)
val allExpansionsForHighlight = tokenExpansionsForHighlight.values.flatten()
val expandedTerms = allExpansionsForHighlight
.flatMap { it.surface + it.variants + it.base }
.filter { it.length > 2 }
.distinct()

val ngramTerms = buildNgramTerms(analyzedStd, gram = 4)
val hashemTerms = if (hasHashem) loadHashemHighlightTerms() else emptyList()

return filterTermsForHighlight(analyzedStd + expandedTerms + ngramTerms + hashemTerms)
}

override fun close() {
// Directory is closed automatically when readers are closed
}
Expand Down Expand Up @@ -241,22 +335,38 @@ class LuceneSearchEngine(
logger.d { "[DEBUG] Analyzed tokens: $analyzedStd" }

// Get all possible expansions for each token (a token can belong to multiple bases)
val tokenExpansionsRaw: Map<String, List<MagicDictionaryIndex.Expansion>> =
// These expansions are used for SEARCH - we keep all of them for better recall
val tokenExpansions: Map<String, List<MagicDictionaryIndex.Expansion>> =
analyzedStd.associateWith { token ->
// Get best expansion (prefers matching base, then largest)
val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList()
listOf(expansion)
}
tokenExpansionsRaw.forEach { (token, exps) ->
tokenExpansions.forEach { (token, exps) ->
exps.forEach { exp ->
logger.d { "[DEBUG] Token '$token' -> expansion: surface=${exp.surface.take(10)}..., variants=${exp.variants.take(10)}..., base=${exp.base}" }
}
}

val tokenExpansions: Map<String, List<MagicDictionaryIndex.Expansion>> = tokenExpansionsRaw
// For HIGHLIGHTING, filter out hallucinated expansions to avoid highlighting unrelated words
val tokenExpansionsForHighlight: Map<String, List<MagicDictionaryIndex.Expansion>> =
tokenExpansions.mapValues { (token, exps) ->
exps.filter { exp ->
val isHallucination = isHallucinatedExpansion(token, exp)
if (isHallucination) {
logger.d { "[DEBUG] Token '$token' -> BLOCKED for highlight (hallucination): base=${exp.base}" }
}
!isHallucination
}
}

val allExpansions = tokenExpansions.values.flatten()
val expandedTerms = allExpansions.flatMap { it.surface + it.variants + it.base }.distinct()
val allExpansionsForHighlight = tokenExpansionsForHighlight.values.flatten()
// Filter out 2-letter terms from dictionary expansions for highlighting
// (2-letter words should only be highlighted if explicitly in the query)
val expandedTerms = allExpansionsForHighlight
.flatMap { it.surface + it.variants + it.base }
.filter { it.length > 2 }
.distinct()
// Add 4-gram terms used in the query (matches text_ng4 clauses) so highlighting can
// reflect matches that were found via the n-gram branch.
val ngramTerms = buildNgramTerms(analyzedStd, gram = 4)
Expand Down Expand Up @@ -688,12 +798,53 @@ class LuceneSearchEngine(
val effContext = if (hasDiacritics) maxOf(context, 360) else context
val plainSearch = HebrewTextUtils.replaceFinalsWithBase(plain)

val plainIdx = anchorTerms.asSequence().mapNotNull { t ->
val i = plainSearch.indexOf(t)
if (i >= 0) i else null
}.firstOrNull() ?: 0
// Find best anchor position: where most terms cluster together
// Optimized: limit occurrences per term, early exit on perfect score
var plainIdx = 0
var plainLen = anchorTerms.firstOrNull()?.length ?: 0

if (anchorTerms.isNotEmpty()) {
val maxOccPerTerm = 5 // Limit occurrences per term for perf
val positions = mutableListOf<Pair<Int, String>>() // (position, term)

for (term in anchorTerms) {
if (term.isEmpty()) continue
var from = 0
var count = 0
while (from <= plainSearch.length - term.length && count < maxOccPerTerm) {
val idx = plainSearch.indexOf(term, startIndex = from)
if (idx == -1) break
positions.add(idx to term)
from = idx + 1
count++
}
}

val plainLen = anchorTerms.firstOrNull()?.length ?: 0
if (positions.isNotEmpty()) {
val maxPossibleScore = anchorTerms.size
var bestScore = 0

for ((pos, term) in positions) {
// Count unique terms in window around this position
val windowStart = pos - effContext
val windowEnd = pos + term.length + effContext
var uniqueTerms = 0
val seen = mutableSetOf<String>()
for ((p, t) in positions) {
if (p in windowStart..windowEnd && seen.add(t)) uniqueTerms++
}
val score = uniqueTerms * 100 + term.length

if (score > bestScore) {
bestScore = score
plainIdx = pos
plainLen = term.length
// Early exit if we found all terms clustered
if (uniqueTerms >= maxPossibleScore) break
}
}
}
}
val plainStart = (plainIdx - effContext).coerceAtLeast(0)
val plainEnd = (plainIdx + plainLen + effContext).coerceAtMost(plain.length)
val origStart = HebrewTextUtils.mapToOrigIndex(mapToOrig, plainStart)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,17 @@ interface SearchEngine : Closeable {
* @return HTML string with `<b>` tags around matches, possibly with `...` for truncation
*/
fun buildSnippet(rawText: String, query: String, near: Int): String

/**
* Builds a list of terms to highlight for a given query, using dictionary expansion.
*
* This is useful for intelligent find-in-page that matches the same words
* as the global search (including synonyms and morphological variants).
* The terms are filtered to exclude hallucinated mappings and short words
* that only came from dictionary expansion.
*
* @param query The search query in Hebrew
* @return List of normalized terms to highlight (includes original tokens + expansions)
*/
fun buildHighlightTerms(query: String): List<String>
}
29 changes: 29 additions & 0 deletions search/src/jvmMain/resources/hallucination_blacklist.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Hallucination Blacklist
# Format: token<TAB>base
# Lines starting with # are comments
# These are LLM-generated mappings that are incorrect and should be ignored
לחתוכ כנ
לחתוך כנ
לבקש רצה
מתפלל בעה
לשבת נתנ
לעבוד עשה
להעמיד קומ
כנ כנ
בנ בנ
לשימ שמ
לשים שמ
לדונ יד
לדון יד
אמ אמ
להרגיע יושב
לחייכ חוה
לחייך חוה
שנ שנ
עצ עצ
דנ דנ
להיכנס עלה
לשקול לקח
לבנות בנ
לשחרר שרה
להתיר שרה
Loading