# CI workflow: runs the Gradle `allTests` task on every pull request that
# targets master or main, using JetBrains Runtime (JBR) 25 as the JDK, and
# always uploads the HTML test reports as a build artifact.
name: CI - Tests

on:
  pull_request:
    branches:
      - master
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout source
        uses: actions/checkout@v4

      - name: Install JBR 25
        run: |
          set -euo pipefail
          # -f: fail on HTTP errors instead of saving an error page as the archive.
          curl -fsSL -o jbr.tar.gz "https://cache-redirector.jetbrains.com/intellij-jbr/jbrsdk-25.0.1-linux-x64-b268.52.tar.gz"
          mkdir -p "$RUNNER_TEMP/jbr"
          tar -xzf jbr.tar.gz -C "$RUNNER_TEMP/jbr"
          # The parentheses are required: "-o" binds looser than the implicit "-a",
          # so without them "-type d" would only guard the first -name test.
          # "-print -quit" stops at the first match (replaces "| head -n 1").
          JBR_DIR=$(find "$RUNNER_TEMP/jbr" -mindepth 1 -maxdepth 1 -type d \( -name "jbr*" -o -name "jbrsdk*" \) -print -quit)
          if [ -z "$JBR_DIR" ]; then
            echo "::error::No JBR directory found under $RUNNER_TEMP/jbr" >&2
            exit 1
          fi
          echo "JAVA_HOME=$JBR_DIR" >> "$GITHUB_ENV"
          echo "$JBR_DIR/bin" >> "$GITHUB_PATH"

      # gradle/gradle-build-action is superseded by gradle/actions/setup-gradle.
      - name: Setup Gradle
        uses: gradle/actions/setup-gradle@v4

      - name: Grant execute permission for gradlew
        run: chmod +x gradlew

      - name: Run all tests
        run: ./gradlew allTests --no-daemon

      # Runs even when the test step failed so the reports can be inspected.
      - name: Upload test reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-reports
          path: '**/build/reports/tests/'
          retention-days: 7
/**
 * Utility functions for Hebrew text processing.
 * Includes normalization, diacritic removal, and final letter handling.
 */
object HebrewTextUtils {

    private const val MAQAF = '\u05BE'
    private const val GERESH = '\u05F3'
    private const val GERSHAYIM = '\u05F4'

    /** Compiled once; `normalizeHebrew` is called for every query. */
    private val WHITESPACE_RUN = Regex("\\s+")

    /**
     * Map of Hebrew final letters (sofit) to their base forms.
     */
    val SOFIT_MAP = mapOf(
        'ך' to 'כ', // U+05DA -> U+05DB
        'ם' to 'מ', // U+05DD -> U+05DE
        'ן' to 'נ', // U+05DF -> U+05E0
        'ף' to 'פ', // U+05E3 -> U+05E4
        'ץ' to 'צ'  // U+05E5 -> U+05E6
    )

    /**
     * Returns the base form of a final letter, or [ch] itself for any other character.
     * Written as a `when` (same pairs as [SOFIT_MAP]) so per-character use does not box.
     */
    private fun baseForm(ch: Char): Char = when (ch) {
        '\u05DA' -> '\u05DB' // ך -> כ
        '\u05DD' -> '\u05DE' // ם -> מ
        '\u05DF' -> '\u05E0' // ן -> נ
        '\u05E3' -> '\u05E4' // ף -> פ
        '\u05E5' -> '\u05E6' // ץ -> צ
        else -> ch
    }

    /**
     * Normalizes Hebrew text by:
     * - Removing teamim (cantillation marks) U+0591–U+05AF
     * - Removing nikud (vowel points) U+05B0–U+05BD plus U+05C1, U+05C2, U+05C7
     *   (exactly the characters accepted by [isNikudOrTeamim], so U+05BA is included)
     * - Replacing maqaf U+05BE with a space
     * - Removing geresh U+05F3 and gershayim U+05F4
     * - Normalizing final letters to base forms
     * - Collapsing whitespace runs to a single space and trimming
     *
     * @param input The input string to normalize
     * @return The normalized string; empty when [input] is blank
     */
    fun normalizeHebrew(input: String): String {
        if (input.isBlank()) return ""
        // Every rule is per-character, so one pass replaces the former chain of replace() calls.
        val cleaned = buildString(input.length) {
            for (ch in input) {
                when {
                    isNikudOrTeamim(ch) -> Unit
                    ch == GERESH || ch == GERSHAYIM -> Unit
                    ch == MAQAF -> append(' ')
                    else -> append(baseForm(ch))
                }
            }
        }
        return WHITESPACE_RUN.replace(cleaned, " ").trim()
    }

    /**
     * Replaces Hebrew final letters (sofit) with their base forms.
     * The result always has the same length as [text].
     *
     * @param text The input text
     * @return Text with final letters replaced
     */
    fun replaceFinalsWithBase(text: String): String =
        buildString(text.length) {
            for (ch in text) append(baseForm(ch))
        }

    /**
     * Checks if a character is a Hebrew diacritic (nikud or teamim).
     *
     * @param c The character to check
     * @return true if the character is a diacritic
     */
    fun isNikudOrTeamim(c: Char): Boolean {
        val code = c.code
        return (code in 0x0591..0x05AF) || // teamim
            (code in 0x05B0..0x05BD) ||    // nikud + meteg
            (c == '\u05C1') || (c == '\u05C2') || (c == '\u05C7')
    }

    /**
     * Strips Hebrew diacritics (nikud and teamim) from text and returns
     * both the plain text and an index map from plain indices to original indices.
     *
     * @param src The source string
     * @return Pair of (plain text, index map); `map[i]` is the index in [src] of plain character `i`
     */
    fun stripDiacriticsWithMap(src: String): Pair<String, IntArray> {
        val out = StringBuilder(src.length)
        // Primitive buffer sized for the worst case (nothing stripped), trimmed at the end.
        val indices = IntArray(src.length)
        var kept = 0
        for (i in src.indices) {
            val ch = src[i]
            if (!isNikudOrTeamim(ch)) {
                out.append(ch)
                indices[kept++] = i
            }
        }
        return out.toString() to indices.copyOf(kept)
    }

    /**
     * Strips Hebrew diacritics from text without preserving index mapping.
     *
     * @param text The input text
     * @return Text without diacritics
     */
    fun stripDiacritics(text: String): String {
        if (text.isEmpty()) return text
        return buildString(text.length) {
            for (ch in text) {
                if (!isNikudOrTeamim(ch)) append(ch)
            }
        }
    }

    /**
     * Maps a plain text index back to the original text index.
     * Out-of-range indices are clamped to the first/last mapped position; with an
     * empty map the index is returned unchanged.
     *
     * @param mapToOrig The index map from stripDiacriticsWithMap
     * @param plainIndex The index in the plain text
     * @return The corresponding index in the original text
     */
    fun mapToOrigIndex(mapToOrig: IntArray, plainIndex: Int): Int {
        if (mapToOrig.isEmpty()) return plainIndex
        val idx = plainIndex.coerceIn(0, mapToOrig.size - 1)
        return mapToOrig[idx]
    }
}
+ * Supports Hebrew text with diacritics handling, dictionary expansion, and fuzzy matching. + */ +class LuceneSearchEngine( + private val indexDir: Path, + private val snippetProvider: SnippetProvider? = null, + private val analyzer: Analyzer = StandardAnalyzer(), + private val dictionaryPath: Path? = null +) : SearchEngine { + + companion object { + private val logger = Logger.withTag("LuceneSearchEngine") + // Hard cap on how many synonym/expansion terms we allow per token + private const val MAX_SYNONYM_TERMS_PER_TOKEN: Int = 32 + // Global cap for boost queries built from dictionary expansions + private const val MAX_SYNONYM_BOOST_TERMS: Int = 256 + // Constants for snippet source building (must match indexer) + private const val SNIPPET_NEIGHBOR_WINDOW = 4 + private const val SNIPPET_MIN_LENGTH = 280 + } + + // Open Lucene directory lazily to avoid any I/O at app startup + private val dir by lazy { FSDirectory.open(indexDir) } + + private val stdAnalyzer: Analyzer by lazy { analyzer } + private val magicDict: MagicDictionaryIndex? by lazy { + val candidates = listOfNotNull( + dictionaryPath, + System.getProperty("magicDict")?.let { Path.of(it) }, + System.getenv("SEFORIM_MAGIC_DICT")?.let { Path.of(it) }, + indexDir.resolveSibling("lexical.db"), + indexDir.resolveSibling("seforim.db").resolveSibling("lexical.db"), + Path.of("SeforimLibrary/SeforimMagicIndexer/magicindexer/build/db/lexical.db") + ).distinct() + val firstExisting = MagicDictionaryIndex.findValidDictionary(candidates) + if (firstExisting == null) { + logger.d { + "[MagicDictionary] Missing lexical.db; search will run without dictionary expansions. " + + "Provide -DmagicDict=/path/lexical.db or SEFORIM_MAGIC_DICT. 
/**
 * Looks up books by title prefix.
 *
 * The query is normalized with [HebrewTextUtils.normalizeHebrew] and split into
 * tokens; a document matches when its `type` is `book_title` and, for every token,
 * its `title` field contains a term starting with that token.
 *
 * @param query Raw user input
 * @param limit Maximum number of book ids to return; a non-positive value yields an empty list
 * @return Distinct `book_id` values of the matching documents, in the order Lucene returned them
 */
override fun searchBooksByTitlePrefix(query: String, limit: Int): List<Long> {
    // IndexSearcher.search rejects a non-positive hit count, so answer directly.
    if (limit <= 0) return emptyList()

    val normalized = HebrewTextUtils.normalizeHebrew(query)
    if (normalized.isBlank()) return emptyList()
    // normalizeHebrew has already collapsed whitespace runs to single spaces.
    val tokens = normalized.split(' ').map { it.trim() }.filter { it.isNotEmpty() }
    if (tokens.isEmpty()) return emptyList()

    return withSearcher { searcher ->
        val builder = BooleanQuery.Builder()
        // Restrict to book_title docs
        builder.add(TermQuery(Term("type", "book_title")), BooleanClause.Occur.FILTER)
        tokens.forEach { token ->
            // prefix on analyzed 'title'
            builder.add(PrefixQuery(Term("title", token)), BooleanClause.Occur.MUST)
        }
        val top = searcher.search(builder.build(), limit)
        val stored: StoredFields = searcher.storedFields()
        // LinkedHashSet: de-duplicates while keeping the ranking order.
        val ids = LinkedHashSet<Long>()
        for (sd in top.scoreDocs) {
            val id = stored.document(sd.doc).getField("book_id")?.numericValue()?.toLong()
            if (id != null) ids.add(id)
        }
        // At most `limit` docs were fetched, so no further truncation is needed.
        ids.toList()
    }
}
{ + if (finished) return null + val top = searcher.searchAfter(after, query, limit) + if (totalHitsValue == null) totalHitsValue = top.totalHits?.value + if (top.scoreDocs.isEmpty()) { + finished = true + return null + } + val stored = searcher.storedFields() + val hits = mapScoreDocs(stored, top.scoreDocs.toList(), anchorTerms, highlightTerms) + after = top.scoreDocs.last() + val isLast = top.scoreDocs.size < limit + if (isLast) finished = true + return SearchPage( + hits = hits, + totalHits = totalHitsValue ?: hits.size.toLong(), + isLastPage = isLast + ) + } + + override fun close() { + reader.close() + } + } + + // --- Additional public search methods --- + + fun searchAllText(rawQuery: String, near: Int = 5, limit: Int, offset: Int = 0): List = + doSearch(rawQuery, near, limit, offset, bookFilter = null, categoryFilter = null) + + fun searchInBook(rawQuery: String, near: Int, bookId: Long, limit: Int, offset: Int = 0): List = + doSearch(rawQuery, near, limit, offset, bookFilter = bookId, categoryFilter = null) + + fun searchInCategory(rawQuery: String, near: Int, categoryId: Long, limit: Int, offset: Int = 0): List = + doSearch(rawQuery, near, limit, offset, bookFilter = null, categoryFilter = categoryId) + + fun searchInBooks(rawQuery: String, near: Int, bookIds: Collection, limit: Int, offset: Int = 0): List = + doSearchInBooks(rawQuery, near, limit, offset, bookIds) + + // --- Private implementation --- + + private data class SearchContext( + val query: Query, + val anchorTerms: List, + val highlightTerms: List + ) + + private fun buildSearchContext( + rawQuery: String, + near: Int, + bookFilter: Long?, + categoryFilter: Long?, + bookIds: Collection?, + lineIds: Collection? + ): SearchContext? 
{ + val norm = HebrewTextUtils.normalizeHebrew(rawQuery) + if (norm.isBlank()) return null + + val analyzedRaw = analyzeToTerms(stdAnalyzer, norm) ?: emptyList() + + // Check if the original query contained ה׳ (Hashem) before normalization + val hasHashem = rawQuery.contains("ה׳") || rawQuery.contains("ה'") + + // Filter out single Hebrew letters and stop words BEFORE dictionary expansion + // BUT preserve "ה" if the original query had "ה׳" (Hashem) + val analyzedStd = analyzedRaw.filter { token -> + // Special case: if query has ה׳, keep "ה" token + if (token == "ה" && hasHashem) return@filter true + // Preserve numeric tokens (e.g., "6") so they can expand via MagicDictionary + if (token.any { it.isDigit() }) return@filter true + + token.length >= 2 && token !in setOf( + "א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "כ", "ל", "מ", + "נ", "ס", "ע", "פ", "צ", "ק", "ר", "ש", "ת", + ) + } + + logger.d { "[DEBUG] Original query had Hashem (ה׳): $hasHashem" } + logger.d { "[DEBUG] Analyzed tokens: $analyzedStd" } + + // Get all possible expansions for each token (a token can belong to multiple bases) + val tokenExpansionsRaw: Map> = + analyzedStd.associateWith { token -> + // Get best expansion (prefers matching base, then largest) + val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList() + listOf(expansion) + } + tokenExpansionsRaw.forEach { (token, exps) -> + exps.forEach { exp -> + logger.d { "[DEBUG] Token '$token' -> expansion: surface=${exp.surface.take(10)}..., variants=${exp.variants.take(10)}..., base=${exp.base}" } + } + } + + val tokenExpansions: Map> = tokenExpansionsRaw + + val allExpansions = tokenExpansions.values.flatten() + val expandedTerms = allExpansions.flatMap { it.surface + it.variants + it.base }.distinct() + // Add 4-gram terms used in the query (matches text_ng4 clauses) so highlighting can + // reflect matches that were found via the n-gram branch. 
+ val ngramTerms = buildNgramTerms(analyzedStd, gram = 4) + // For highlighting/snippets, use the actual query tokens plus the concrete + // terms that the search query uses (expansions + n-grams), and if the query + // mentions Hashem explicitly, also include dictionary-based variants of the + // divine name from the lexical DB + val hashemTerms = if (hasHashem) loadHashemHighlightTerms() else emptyList() + val highlightTerms = filterTermsForHighlight(analyzedStd + expandedTerms + ngramTerms + hashemTerms) + val anchorTerms = buildAnchorTerms(norm, highlightTerms) + + val rankedQuery = buildExpandedQuery(norm, near, analyzedStd, tokenExpansions) + val mustAllTokensQuery: Query? = buildPresenceFilterForTokens(analyzedStd, near, tokenExpansions) + val phraseQuery: Query? = buildSynonymPhraseQuery(analyzedStd, tokenExpansions, near) + + val builder = BooleanQuery.Builder() + builder.add(TermQuery(Term("type", "line")), BooleanClause.Occur.FILTER) + if (bookFilter != null) builder.add(IntPoint.newExactQuery("book_id", bookFilter.toInt()), BooleanClause.Occur.FILTER) + if (categoryFilter != null) builder.add(IntPoint.newExactQuery("category_id", categoryFilter.toInt()), BooleanClause.Occur.FILTER) + val bookIdsArray = bookIds?.map { it.toInt() }?.toIntArray() + if (bookIdsArray != null && bookIdsArray.isNotEmpty()) { + builder.add(IntPoint.newSetQuery("book_id", *bookIdsArray), BooleanClause.Occur.FILTER) + } + val lineIdsArray = lineIds?.map { it.toInt() }?.toIntArray() + if (lineIdsArray != null && lineIdsArray.isNotEmpty()) { + builder.add(IntPoint.newSetQuery("line_id", *lineIdsArray), BooleanClause.Occur.FILTER) + } + if (mustAllTokensQuery != null) { + builder.add(mustAllTokensQuery, BooleanClause.Occur.FILTER) + logger.d { "[DEBUG] Added mustAllTokensQuery as FILTER" } + } + val analyzedCount = analyzedStd.size + if (phraseQuery != null && analyzedCount >= 2) { + val occur = if (near == 0) BooleanClause.Occur.MUST else BooleanClause.Occur.SHOULD + 
builder.add(phraseQuery, occur) + logger.d { "[DEBUG] Added phraseQuery with occur=$occur, near=$near" } + } + builder.add(rankedQuery, BooleanClause.Occur.SHOULD) + logger.d { "[DEBUG] Added rankedQuery as SHOULD" } + + val finalQuery = builder.build() + logger.d { "[DEBUG] Final query: $finalQuery" } + + return SearchContext( + query = finalQuery, + anchorTerms = anchorTerms, + highlightTerms = highlightTerms + ) + } + + private fun mapScoreDocs( + stored: StoredFields, + scoreDocs: List, + anchorTerms: List, + highlightTerms: List + ): List { + if (scoreDocs.isEmpty()) return emptyList() + + // First pass: extract metadata from index + data class DocMeta( + val sd: ScoreDoc, + val bookId: Long, + val bookTitle: String, + val lineId: Long, + val lineIndex: Int, + val isBaseBook: Boolean, + val orderIndex: Int, + val indexedRaw: String // from text_raw field, may be empty if not stored + ) + + val docMetas = scoreDocs.map { sd -> + val doc = stored.document(sd.doc) + DocMeta( + sd = sd, + bookId = doc.getField("book_id").numericValue().toLong(), + bookTitle = doc.getField("book_title").stringValue() ?: "", + lineId = doc.getField("line_id").numericValue().toLong(), + lineIndex = doc.getField("line_index").numericValue().toInt(), + isBaseBook = doc.getField("is_base_book")?.numericValue()?.toInt() == 1, + orderIndex = doc.getField("order_index")?.numericValue()?.toInt() ?: 999, + indexedRaw = doc.getField("text_raw")?.stringValue() ?: "" + ) + } + + // Get snippet sources: from provider if available, otherwise from index + val snippetSources: Map = if (snippetProvider != null) { + val lineInfos = docMetas.map { LineSnippetInfo(it.lineId, it.bookId, it.lineIndex) } + snippetProvider.getSnippetSources(lineInfos) + } else { + // Fallback to indexed text_raw + docMetas.associate { it.lineId to it.indexedRaw } + } + + val hits = docMetas.map { meta -> + val raw = snippetSources[meta.lineId] ?: meta.indexedRaw + val baseScore = meta.sd.score + + // Calculate boost: lower 
/**
 * Runs [text] through [analyzer] (field name `text`) and collects the non-blank terms.
 *
 * The token stream is closed via `use` even when analysis fails: an unclosed stream
 * is leaked and leaves the analyzer's reused stream in a state that makes the next
 * `tokenStream()` call fail.
 *
 * @param analyzer Analyzer used to tokenize
 * @param text Text to analyze
 * @return The emitted terms in order, or `null` if analysis threw
 */
private fun analyzeToTerms(analyzer: Analyzer, text: String): List<String>? = try {
    analyzer.tokenStream("text", text).use { ts ->
        val termAtt = ts.addAttribute(CharTermAttribute::class.java)
        val out = mutableListOf<String>()
        ts.reset()
        while (ts.incrementToken()) {
            val t = termAtt.toString()
            if (t.isNotBlank()) out += t
        }
        ts.end()
        out
    }
} catch (e: Exception) {
    // Best-effort: callers treat null as "no terms"; leave a trace instead of failing silently.
    logger.d { "[DEBUG] analyzeToTerms failed: ${e.message}" }
    null
}
{ + if (expansions.isEmpty()) return null + val surfaceTerms = LinkedHashSet() + val variantTerms = LinkedHashSet() + val baseTerms = LinkedHashSet() + for (exp in expansions) { + surfaceTerms.addAll(exp.surface) + variantTerms.addAll(exp.variants) + baseTerms.addAll(exp.base) + } + + val limitedSurfaces = surfaceTerms.take(MAX_SYNONYM_BOOST_TERMS) + val limitedVariants = variantTerms.take(MAX_SYNONYM_BOOST_TERMS) + val limitedBases = baseTerms.take(MAX_SYNONYM_BOOST_TERMS) + if (surfaceTerms.size > limitedSurfaces.size || + variantTerms.size > limitedVariants.size || + baseTerms.size > limitedBases.size + ) { + logger.d { + "[DEBUG] Capped magic boost terms: " + + "surface=${surfaceTerms.size}->${limitedSurfaces.size}, " + + "variants=${variantTerms.size}->${limitedVariants.size}, " + + "base=${baseTerms.size}->${limitedBases.size}" + } + } + + val b = BooleanQuery.Builder() + for (s in limitedSurfaces) { + b.add(BoostQuery(TermQuery(Term("text", s)), 2.0f), BooleanClause.Occur.SHOULD) + } + for (v in limitedVariants) { + b.add(BoostQuery(TermQuery(Term("text", v)), 1.5f), BooleanClause.Occur.SHOULD) + } + for (ba in limitedBases) { + b.add(BoostQuery(TermQuery(Term("text", ba)), 1.0f), BooleanClause.Occur.SHOULD) + } + return b.build() + } + + private fun buildSynonymBoostQuery(expansions: List): Query? 
{ + if (expansions.isEmpty()) return null + val surfaceTerms = LinkedHashSet() + val variantTerms = LinkedHashSet() + val baseTerms = LinkedHashSet() + for (exp in expansions) { + surfaceTerms.addAll(exp.surface) + variantTerms.addAll(exp.variants) + baseTerms.addAll(exp.base) + } + + val limitedSurfaces = surfaceTerms.take(MAX_SYNONYM_BOOST_TERMS) + val limitedVariants = variantTerms.take(MAX_SYNONYM_BOOST_TERMS) + val limitedBases = baseTerms.take(MAX_SYNONYM_BOOST_TERMS) + if (surfaceTerms.size > limitedSurfaces.size || + variantTerms.size > limitedVariants.size || + baseTerms.size > limitedBases.size + ) { + logger.d { + "[DEBUG] Capped synonym boost terms: " + + "surface=${surfaceTerms.size}->${limitedSurfaces.size}, " + + "variants=${variantTerms.size}->${limitedVariants.size}, " + + "base=${baseTerms.size}->${limitedBases.size}" + } + } + + val b = BooleanQuery.Builder() + for (s in limitedSurfaces) { + b.add(TermQuery(Term("text", s)), BooleanClause.Occur.SHOULD) + } + for (v in limitedVariants) { + b.add(TermQuery(Term("text", v)), BooleanClause.Occur.SHOULD) + } + for (ba in limitedBases) { + b.add(TermQuery(Term("text", ba)), BooleanClause.Occur.SHOULD) + } + return b.build() + } + + private fun buildSynonymPhrases( + tokens: List, + expansionsByToken: Map> + ): List> { + if (tokens.isEmpty()) return emptyList() + val termExpansions = buildTermAlternativesForTokens(tokens, expansionsByToken) + logger.d { "[DEBUG] buildSynonymPhrases - termExpansions sizes: ${termExpansions.map { it.size }}" } + fun buildMultiPhrase(slop: Int): Query { + val builder = org.apache.lucene.search.MultiPhraseQuery.Builder() + builder.setSlop(slop) + var pos = 0 + for (alts in termExpansions) { + builder.add(alts.map { Term("text", it) }.toTypedArray(), pos) + pos++ + } + return builder.build() + } + return listOf( + buildMultiPhrase(0) to 50.0f, + buildMultiPhrase(3) to 20.0f, + buildMultiPhrase(8) to 5.0f + ) + } + + private fun buildSynonymPhraseQuery( + tokens: List, + 
expansionsByToken: Map>, + near: Int + ): Query? { + if (tokens.isEmpty()) return null + val termExpansions = buildTermAlternativesForTokens(tokens, expansionsByToken) + val builder = org.apache.lucene.search.MultiPhraseQuery.Builder() + builder.setSlop(near) + var position = 0 + for (alts in termExpansions) { + builder.add(alts.map { Term("text", it) }.toTypedArray(), position) + position++ + } + return builder.build() + } + + private fun buildNgram4Query(norm: String): Query? { + val tokens = norm.split("\\s+".toRegex()).map { it.trim() }.filter { it.length >= 4 } + if (tokens.isEmpty()) return null + val grams = mutableListOf() + for (t in tokens) { + val L = t.length + var i = 0 + while (i + 4 <= L) { + grams += t.substring(i, i + 4) + i += 1 + } + } + val uniq = grams.distinct() + if (uniq.isEmpty()) return null + val b = BooleanQuery.Builder() + for (g in uniq) { + b.add(TermQuery(Term("text_ng4", g)), BooleanClause.Occur.MUST) + } + return b.build() + } + + private fun buildExpandedQuery( + norm: String, + near: Int, + tokens: List, + expansionsByToken: Map> + ): Query { + val base = buildHebrewStdQuery(norm, near) + val allExpansions = expansionsByToken.values.flatten() + val synonymPhrases = buildSynonymPhrases(tokens, expansionsByToken) + val ngram = buildNgram4Query(norm) + val fuzzy = buildFuzzyQuery(norm, near) + val builder = BooleanQuery.Builder() + builder.add(base, BooleanClause.Occur.SHOULD) + for ((query, boost) in synonymPhrases) { + builder.add(BoostQuery(query, boost), BooleanClause.Occur.SHOULD) + } + if (ngram != null) builder.add(ngram, BooleanClause.Occur.SHOULD) + if (fuzzy != null) builder.add(fuzzy, BooleanClause.Occur.SHOULD) + val magic = buildMagicBoostQuery(allExpansions) + if (magic != null) builder.add(magic, BooleanClause.Occur.SHOULD) + val synonymBoost = buildSynonymBoostQuery(allExpansions) + if (synonymBoost != null) builder.add(synonymBoost, BooleanClause.Occur.SHOULD) + return builder.build() + } + + private fun 
/**
 * Builds a highlighted snippet of [raw] around the first anchor term that occurs in it.
 *
 * Matching is done on a "plain" copy of the text (diacritics stripped, final letters
 * replaced by their base forms); positions are mapped back so the returned snippet is
 * cut from the original text. Whole-word occurrences of the highlight terms are wrapped
 * by [insertBoldTags], and "..." is added on each side where text was cut off.
 *
 * @param raw Source text of the line
 * @param anchorTerms Terms tried in order to position the window; if none occurs the window starts at 0
 * @param highlightTerms Terms to highlight (a trailing `$` is also tried without it)
 * @param context Number of plain characters kept on each side of the anchor; at least 360
 *   is used when [raw] contained diacritics
 * @return The snippet, or an empty string when [raw] is empty
 */
private fun buildSnippetInternal(
    raw: String,
    anchorTerms: List<String>,
    highlightTerms: List<String>,
    context: Int = 220
): String {
    if (raw.isEmpty()) return ""
    val (plain, mapToOrig) = HebrewTextUtils.stripDiacriticsWithMap(raw)
    val hasDiacritics = plain.length != raw.length
    val effContext = if (hasDiacritics) maxOf(context, 360) else context
    val plainSearch = HebrewTextUtils.replaceFinalsWithBase(plain)

    // Keep the term that actually matched so the window length is derived from it
    // (previously the length of the first anchor term was used regardless).
    val anchor = anchorTerms.asSequence()
        .map { term -> term to plainSearch.indexOf(term) }
        .firstOrNull { (_, pos) -> pos >= 0 }
    val plainIdx = anchor?.second ?: 0
    val plainLen = anchor?.first?.length ?: 0

    val plainStart = (plainIdx - effContext).coerceAtLeast(0)
    val plainEnd = (plainIdx + plainLen + effContext).coerceAtMost(plain.length)
    val origStart = HebrewTextUtils.mapToOrigIndex(mapToOrig, plainStart)
    // plainEnd is exclusive. When the window reaches the end of the plain text, take the
    // rest of raw; mapping it through the clamping mapToOrigIndex used to drop the last
    // character and add a spurious "..." suffix.
    val origEnd = if (plainEnd >= plain.length) raw.length else mapToOrig[plainEnd]

    val base = raw.substring(origStart, origEnd)
    // replaceFinalsWithBase is per-character, so slicing plainSearch equals re-normalizing the slice.
    val basePlainLower = plainSearch.substring(plainStart, plainEnd).lowercase()
    // Plain index (relative to the window) -> index inside base.
    val baseMap = IntArray(plainEnd - plainStart) { mapToOrig[plainStart + it] - origStart }

    val pool = (highlightTerms + highlightTerms.map { it.trimEnd('$') })
        .distinct()
        .filter { it.isNotBlank() }
    val intervals = mutableListOf<IntRange>()

    // Positions outside the text count as boundaries, as does any non letter/digit.
    fun isWordBoundary(text: String, index: Int): Boolean {
        if (index < 0 || index >= text.length) return true
        val ch = text[index]
        return ch.isWhitespace() || !ch.isLetterOrDigit()
    }

    for (term in pool) {
        val t = term.lowercase()
        if (t.isEmpty()) continue
        var from = 0
        while (from <= basePlainLower.length - t.length) {
            val idx = basePlainLower.indexOf(t, startIndex = from)
            if (idx == -1) break

            val isWholeWord = isWordBoundary(basePlainLower, idx - 1) &&
                isWordBoundary(basePlainLower, idx + t.length)
            if (isWholeWord) {
                val startOrig = HebrewTextUtils.mapToOrigIndex(baseMap, idx)
                // Exclusive end = position of the next plain character, so diacritics
                // attached to the last matched letter stay inside the highlight.
                val endPlain = idx + t.length
                val endOrig = if (endPlain < baseMap.size) baseMap[endPlain] else base.length
                if (startOrig in 0 until endOrig && endOrig <= base.length) {
                    intervals += (startOrig until endOrig)
                }
            }
            from = idx + 1
        }
    }

    val merged = mergeIntervals(intervals.sortedBy { it.first })
    val highlighted = insertBoldTags(base, merged)
    val prefix = if (origStart > 0) "..." else ""
    val suffix = if (origEnd < raw.length) "..." else ""
    return prefix + highlighted + suffix
}
val expansions = expansionsByToken[token] ?: emptyList() + buildLimitedTermsForToken(token, expansions) + } + } + + private fun loadHashemHighlightTerms(): List { + val dict = magicDict ?: return emptyList() + val raw = dict.loadHashemSurfaces() + if (raw.isEmpty()) return emptyList() + + val terms = linkedSetOf() + raw.forEach { value -> + val trimmed = value.trim() + if (trimmed.isEmpty()) return@forEach + terms += trimmed + val stripped = HebrewTextUtils.stripDiacritics(trimmed).trim() + if (stripped.isNotEmpty()) terms += stripped + val normalized = HebrewTextUtils.normalizeHebrew(trimmed).trim() + if (normalized.isNotEmpty()) terms += normalized + } + + val out = terms.toList() + logger.d { "[DEBUG] Hashem highlight terms from lexical DB: ${out.take(20)}..." } + return out + } +} diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/MagicDictionaryIndex.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/MagicDictionaryIndex.kt new file mode 100644 index 0000000..c5ebfb5 --- /dev/null +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/MagicDictionaryIndex.kt @@ -0,0 +1,324 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import co.touchlab.kermit.Logger +import java.nio.file.Files +import java.nio.file.Path +import java.sql.Connection +import java.sql.DriverManager +import java.sql.PreparedStatement + +/** + * Streaming dictionary index backed by SQLite (tables: surface, variant, base). + * Previously we loaded the entire dictionary into memory; now we stream lookups + * on-demand with a small LRU cache while preserving the exact expansion shape + * used by the search ranking logic. 
+ */ +class MagicDictionaryIndex private constructor( + private val norm: (String) -> String, + private val dbFile: Path +) { + data class Expansion( + val surface: List, + val variants: List, + val base: List + ) + + private val url = "jdbc:sqlite:${dbFile.toAbsolutePath()}" + + /** + * Prepared statement per thread to avoid re-opening connections on every token. + */ + private val stmtProvider: ThreadLocal = ThreadLocal.withInitial { + val conn = DriverManager.getConnection(url).apply { + autoCommit = false + // Enforce read-only queries without altering connection flags post-open + createStatement().use { stmt -> stmt.execute("PRAGMA query_only=ON") } + } + LookupContext( + conn = conn, + stmt = conn.prepareStatement(LOOKUP_SQL) + ) + } + + /** + * Cache expansions per normalized token to avoid repeated DB hits. + */ + private val tokenCache = object : LinkedHashMap>(TOKEN_CACHE_SIZE, 0.75f, true) { + override fun removeEldestEntry(eldest: MutableMap.MutableEntry>?): Boolean = + size > TOKEN_CACHE_SIZE + } + + /** + * Cache fully-normalized expansions per base id so repeated hits to the same base + * avoid re-normalizing rows. + */ + private val baseCache = object : LinkedHashMap(BASE_CACHE_SIZE, 0.75f, true) { + override fun removeEldestEntry(eldest: MutableMap.MutableEntry?): Boolean = + size > BASE_CACHE_SIZE + } + + fun expansionsFor(tokens: List): List = + tokens.flatMap { expansionsForToken(it) }.distinct() + + fun expansionFor(token: String): Expansion? 
{ + val expansions = expansionsForToken(token) + if (expansions.isEmpty()) return null + + val normalized = norm(token) + // Strategy: prefer the expansion whose base matches the token + val matchingBase = expansions.firstOrNull { exp -> + exp.base.any { it == normalized } + } + if (matchingBase != null) return matchingBase + + // Otherwise, prefer the largest expansion (more terms = more complete paradigm) + return expansions.maxByOrNull { it.surface.size } + } + + private fun expansionsForToken(token: String): List { + val normalized = norm(token) + if (normalized.isEmpty()) return emptyList() + + synchronized(tokenCache) { + tokenCache[normalized]?.let { return it } + } + + // Try raw, normalized, and final-form variants to match DB values. + val candidates = buildLookupCandidates(token, normalized) + val mergedByBase = LinkedHashMap() + + for (candidate in candidates) { + val fetched = fetchExpansions(candidate, normalized) + for ((baseId, exp) in fetched) { + val existing = mergedByBase[baseId] + if (existing == null) { + mergedByBase[baseId] = exp + } else { + val surfaces = (existing.surface + exp.surface).distinct() + val variants = (existing.variants + exp.variants).distinct() + val base = (existing.base + exp.base).distinct() + mergedByBase[baseId] = Expansion(surfaces, variants, base) + } + } + } + + val expansions = mergedByBase.values.toList() + synchronized(tokenCache) { + tokenCache[normalized] = expansions + } + return expansions + } + + /** + * Fetch expansions for a token. Returns a list of (baseId, Expansion) pairs so callers can merge by base id. 
+ */ + private fun fetchExpansions(rawToken: String, normalizedToken: String): List> { + val expansions = mutableListOf>() + val ctx = stmtProvider.get() + + runCatching { + synchronized(ctx) { + repeat(3) { idx -> ctx.stmt.setString(idx + 1, rawToken) } + val rs = ctx.stmt.executeQuery() + val accum = mutableMapOf() + while (rs.next()) { + val baseId = rs.getLong("base_id") + val bucket = accum.getOrPut(baseId) { + BaseBucket( + baseRaw = rs.getString("base") ?: "", + surfaces = linkedSetOf(), + variants = linkedSetOf() + ) + } + rs.getString("surface")?.let { bucket.surfaces += it } + rs.getString("variant")?.let { bucket.variants += it } + } + + for ((baseId, bucket) in accum) { + val cached = synchronized(baseCache) { baseCache[baseId] } + if (cached != null) { + expansions += baseId to cached + continue + } + + val surfaceN = bucket.surfaces.mapNotNull { v -> norm(v).takeIf { it.isNotEmpty() } } + val variantsN = bucket.variants.mapNotNull { v -> norm(v).takeIf { it.isNotEmpty() } } + val baseN = norm(bucket.baseRaw).takeIf { it.isNotEmpty() } + ?: surfaceN.firstOrNull() + ?: normalizedToken + + val baseTerms = listOfNotNull(baseN.takeIf { it.isNotEmpty() }) + val allTerms = (surfaceN + variantsN + baseTerms).distinct() + if (allTerms.isEmpty()) continue + + val exp = Expansion( + surface = allTerms, + variants = emptyList(), + base = baseTerms + ) + + synchronized(baseCache) { + baseCache[baseId] = exp + } + expansions += baseId to exp + } + } + }.onFailure { + logger.d { "[MagicDictionary] Failed to fetch expansions for '$rawToken' : ${it.message}" } + } + + return expansions + } + + companion object { + private val logger = Logger.withTag("MagicDictionary") + private const val TOKEN_CACHE_SIZE = 1024 + private const val BASE_CACHE_SIZE = 512 + + /** + * Load from SQLite DB (expected tables: surface(id, value, base_id), variant(id, value), base(id, value), surface_variant(surface_id, variant_id)). + * Uses streaming lookup to avoid holding the entire dictionary in memory. 
+ */ + fun load(norm: (String) -> String, candidate: Path?): MagicDictionaryIndex? { + val file = candidate?.takeIf { Files.isRegularFile(it) && hasRequiredTables(it) } ?: run { + if (candidate != null) { + logger.d { "[MagicDictionary] Ignoring candidate $candidate because required tables are missing" } + } + return null + } + return runCatching { + // Validate DB is reachable + DriverManager.getConnection("jdbc:sqlite:${file.toAbsolutePath()}").use { conn -> + conn.createStatement().use { stmt -> + stmt.execute("SELECT 1") + } + } + logger.d { "[MagicDictionary] Streaming lexical db from $file (lazy on-demand)" } + MagicDictionaryIndex(norm, file) + }.onFailure { + logger.d { "[MagicDictionary] Failed to load from $file : ${it.message}" } + }.getOrNull() + } + + /** + * Find the first candidate path that exists and contains the required tables. + */ + fun findValidDictionary(candidates: List): Path? { + for (candidate in candidates) { + if (!Files.isRegularFile(candidate)) continue + if (hasRequiredTables(candidate)) { + logger.d { "[MagicDictionary] Using validated lexical db at $candidate" } + return candidate + } else { + logger.d { + "[MagicDictionary] Candidate $candidate is present but missing required tables; skipping" + } + } + } + return null + } + + private fun hasRequiredTables(file: Path): Boolean = runCatching { + DriverManager.getConnection("jdbc:sqlite:${file.toAbsolutePath()}").use { conn -> + val sql = """ + SELECT name FROM sqlite_master + WHERE type = 'table' AND name IN ('surface', 'variant', 'base', 'surface_variant') + """.trimIndent() + conn.createStatement().use { stmt -> + val rs = stmt.executeQuery(sql) + val names = mutableSetOf() + while (rs.next()) names += rs.getString("name") ?: "" + names.containsAll(listOf("surface", "variant", "base", "surface_variant")) + } + } + }.getOrElse { false } + + private const val LOOKUP_SQL = """ + WITH matches AS ( + SELECT s.base_id AS base_id FROM surface s WHERE s.value = ? 
+ UNION + SELECT b.id FROM base b WHERE b.value = ? + UNION + SELECT s.base_id FROM variant v + JOIN surface_variant sv ON sv.variant_id = v.id + JOIN surface s ON sv.surface_id = s.id + WHERE v.value = ? + ) + SELECT b.id as base_id, + b.value as base, + s.value as surface, + v.value as variant + FROM base b + JOIN matches m ON m.base_id = b.id + LEFT JOIN surface s ON s.base_id = b.id + LEFT JOIN surface_variant sv ON sv.surface_id = s.id + LEFT JOIN variant v ON sv.variant_id = v.id + """ + } + + private data class LookupContext( + val conn: Connection, + val stmt: PreparedStatement + ) + + private data class BaseBucket( + val baseRaw: String, + val surfaces: MutableSet, + val variants: MutableSet + ) + + private fun buildLookupCandidates(rawToken: String, normalized: String): List { + val finalsMap = mapOf( + 'כ' to 'ך', + 'מ' to 'ם', + 'נ' to 'ן', + 'פ' to 'ף', + 'צ' to 'ץ' + ) + + fun applyFinalForm(t: String): String { + if (t.isEmpty()) return t + val last = t.last() + val final = finalsMap[last] ?: last + return if (final == last) t else t.dropLast(1) + final + } + + return listOf( + rawToken, + normalized, + applyFinalForm(rawToken), + applyFinalForm(normalized) + ).filter { it.isNotBlank() }.distinct() + } + + /** + * Load all surface forms whose base lemma is the Tetragrammaton (יהוה), directly from the underlying SQLite DB. + * This is used for snippet highlighting of Hashem names, independent of token-level expansions. 
+ */ + fun loadHashemSurfaces(): List { + val terms = linkedSetOf() + runCatching { + DriverManager.getConnection(url).use { conn -> + val sql = """ + SELECT s.value AS surface + FROM surface s + JOIN base b ON s.base_id = b.id + WHERE b.value = 'יהוה' + """.trimIndent() + conn.createStatement().use { stmt -> + val rs = stmt.executeQuery(sql) + while (rs.next()) { + val v = rs.getString("surface") ?: continue + val trimmed = v.trim() + if (trimmed.isNotEmpty()) { + terms += trimmed + } + } + } + } + }.onFailure { + logger.d { "[MagicDictionary] Failed to load Hashem surfaces: ${it.message}" } + } + return terms.toList() + } +} diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt new file mode 100644 index 0000000..6a4b4ed --- /dev/null +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt @@ -0,0 +1,85 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import java.io.Closeable + +/** + * Main interface for full-text search operations on Hebrew religious texts. + * + * This interface provides session-based search with pagination support, + * book title suggestions, and snippet generation with term highlighting. + * + * ## Usage Example + * ```kotlin + * val engine: SearchEngine = LuceneSearchEngine(indexPath) + * + * // Open a search session + * val session = engine.openSession("בראשית", near = 5) + * session?.use { + * while (true) { + * val page = it.nextPage(20) ?: break + * page.hits.forEach { hit -> + * println("${hit.bookTitle}: ${hit.snippet}") + * } + * if (page.isLastPage) break + * } + * } + * ``` + * + * ## Thread Safety + * Implementations should be thread-safe for concurrent search operations. 
+ * + * @see SearchSession for paginated result access + * @see LineHit for individual search result structure + */ +interface SearchEngine : Closeable { + + /** + * Opens a search session for the given query with optional filters. + * + * The query is normalized internally (nikud/teamim removed, final letters converted). + * Returns null if the query is empty, blank, or contains only stop words. + * + * @param query The search query in Hebrew (may contain nikud/teamim) + * @param near Proximity slop for phrase matching. Use 0 for exact phrase, + * higher values allow more words between terms (default: 5) + * @param bookFilter Optional single book ID to restrict results + * @param categoryFilter Optional category ID to restrict results + * @param bookIds Optional collection of book IDs to restrict results (OR logic) + * @param lineIds Optional collection of line IDs to restrict results (OR logic) + * @return A [SearchSession] for paginated access to results, or null if query is invalid + */ + fun openSession( + query: String, + near: Int = 5, + bookFilter: Long? = null, + categoryFilter: Long? = null, + bookIds: Collection? = null, + lineIds: Collection? = null + ): SearchSession? + + /** + * Searches for books whose titles match the given prefix. + * + * Useful for autocomplete/typeahead functionality in search UI. + * The query is normalized before matching. + * + * @param query The prefix to search for (e.g., "בראש" matches "בראשית רבה") + * @param limit Maximum number of book IDs to return (default: 20) + * @return List of matching book IDs, ordered by relevance + */ + fun searchBooksByTitlePrefix(query: String, limit: Int = 20): List + + /** + * Builds an HTML snippet with highlighted search terms from raw text. + * + * The snippet extracts a context window around the first match and wraps + * matching terms in `<b>` tags for highlighting. Handles Hebrew text with + * nikud/teamim correctly by matching on normalized forms. 
+ * + * @param rawText The raw text content (may contain HTML, will be sanitized) + * @param query The search query for term highlighting + * @param near Proximity value affecting context window size + * @return HTML string with `<b>` tags around matches, possibly with `...` for truncation + */ + fun buildSnippet(rawText: String, query: String, near: Int): String +} diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchSession.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchSession.kt new file mode 100644 index 0000000..8984aaa --- /dev/null +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchSession.kt @@ -0,0 +1,74 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import java.io.Closeable + +/** + * A stateful search session providing paginated access to search results. + * + * Sessions maintain internal cursor state for efficient pagination using + * Lucene's searchAfter mechanism. Must be closed when no longer needed + * to release underlying index reader resources. + * + * ## Usage + * ```kotlin + * engine.openSession("שלום")?.use { session -> + * var page = session.nextPage(20) + * while (page != null) { + * processResults(page.hits) + * page = if (page.isLastPage) null else session.nextPage(20) + * } + * } + * ``` + * + * ## Thread Safety + * Sessions are NOT thread-safe. Use one session per thread or synchronize access. + * + * @see SearchEngine.openSession to create a session + * @see SearchPage for the structure of returned pages + */ +interface SearchSession : Closeable { + /** + * Retrieves the next page of search results. + * + * Each call advances the internal cursor. Results are ordered by relevance score. + * Returns null when all results have been exhausted. + * + * @param limit Maximum number of results to return in this page + * @return [SearchPage] containing hits and metadata, or null if no more results + */ + fun nextPage(limit: Int): SearchPage? 
+} + +/** + * A page of search results with metadata. + * + * @property hits List of matching lines for this page + * @property totalHits Total number of matches across all pages (approximate for large result sets) + * @property isLastPage True if this is the final page of results + */ +data class SearchPage( + val hits: List, + val totalHits: Long, + val isLastPage: Boolean +) + +/** + * A single search result representing a matched line in a book. + * + * @property bookId Unique identifier of the book containing this line + * @property bookTitle Display title of the book + * @property lineId Unique identifier of the matched line + * @property lineIndex Zero-based index of the line within the book + * @property snippet HTML snippet with highlighted matching terms (contains `<b>` tags) + * @property score Relevance score (higher = more relevant). Includes boosts for base books. + * @property rawText Original unprocessed text content of the line + */ +data class LineHit( + val bookId: Long, + val bookTitle: String, + val lineId: Long, + val lineIndex: Int, + val snippet: String, + val score: Float, + val rawText: String +) diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SnippetProvider.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SnippetProvider.kt new file mode 100644 index 0000000..c30615c --- /dev/null +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SnippetProvider.kt @@ -0,0 +1,54 @@ +package io.github.kdroidfilter.seforimlibrary.search + +/** + * Metadata about a line needed to fetch its snippet source text. 
+ * + * @property lineId Unique identifier of the line + * @property bookId Book containing this line (used for context fetching) + * @property lineIndex Zero-based position of the line in the book + */ +data class LineSnippetInfo( + val lineId: Long, + val bookId: Long, + val lineIndex: Int +) + +/** + * Provider interface for fetching snippet source text from a data store. + * + * The search engine uses this to retrieve the full text content needed for + * snippet generation. Implementations typically fetch from a database and + * may include neighboring lines for better context. + * + * ## Implementation Notes + * - Should return HTML-cleaned text (no raw HTML tags) + * - May include neighboring lines for context (typically 4 lines before/after) + * - Should handle missing lines gracefully (omit from result map) + * - Should be efficient for batch lookups (single DB query for all lines) + * + * ## Example Implementation + * ```kotlin + * class RepositorySnippetProvider( + * private val repository: SeforimRepository + * ) : SnippetProvider { + * override fun getSnippetSources(lines: List): Map { + * return repository.getSnippetSourcesForLines( + * lines.map { it.lineId }, + * neighborWindow = 4, + * minLength = 280 + * ) + * } + * } + * ``` + * + * @see SearchEngine for how this provider is used during search + */ +fun interface SnippetProvider { + /** + * Fetches snippet source text for multiple lines in a single batch. + * + * @param lines List of line metadata for which to fetch text + * @return Map of lineId to its source text. Missing lines should be omitted. 
+ */ + fun getSnippetSources(lines: List): Map +} diff --git a/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewTextUtilsTest.kt b/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewTextUtilsTest.kt new file mode 100644 index 0000000..b612e7b --- /dev/null +++ b/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewTextUtilsTest.kt @@ -0,0 +1,270 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertTrue +import kotlin.test.assertFalse + +class HebrewTextUtilsTest { + + // --- normalizeHebrew tests --- + + @Test + fun `normalizeHebrew returns empty string for blank input`() { + assertEquals("", HebrewTextUtils.normalizeHebrew("")) + assertEquals("", HebrewTextUtils.normalizeHebrew(" ")) + assertEquals("", HebrewTextUtils.normalizeHebrew("\t\n")) + } + + @Test + fun `normalizeHebrew removes nikud vowel points`() { + // בְּרֵאשִׁית -> בראשית + val withNikud = "בְּרֵאשִׁית" + val expected = "בראשית" + assertEquals(expected, HebrewTextUtils.normalizeHebrew(withNikud)) + } + + @Test + fun `normalizeHebrew removes teamim cantillation marks`() { + // Text with teamim (U+0591-U+05AF range) + val withTeamim = "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים" + val result = HebrewTextUtils.normalizeHebrew(withTeamim) + // Should not contain any teamim + assertFalse(result.any { it.code in 0x0591..0x05AF }) + } + + @Test + fun `normalizeHebrew replaces maqaf with space`() { + // מַה־טֹּבוּ -> מה טבו + val withMaqaf = "מה־טבו" + val result = HebrewTextUtils.normalizeHebrew(withMaqaf) + assertTrue(result.contains(' ')) + assertFalse(result.contains('\u05BE')) + } + + @Test + fun `normalizeHebrew removes gershayim and geresh`() { + val withGershayim = "רש\"י" + val withGeresh = "ר'" + // Gershayim U+05F4 and geresh U+05F3 + val text = "רש\u05F4י ר\u05F3" + val result = HebrewTextUtils.normalizeHebrew(text) + 
assertFalse(result.contains('\u05F4')) + assertFalse(result.contains('\u05F3')) + } + + @Test + fun `normalizeHebrew converts final letters to base forms`() { + // ך -> כ, ם -> מ, ן -> נ, ף -> פ, ץ -> צ + val withFinals = "מלך שלום אמן סוף ארץ" + val result = HebrewTextUtils.normalizeHebrew(withFinals) + assertEquals("מלכ שלומ אמנ סופ ארצ", result) + } + + @Test + fun `normalizeHebrew collapses multiple spaces`() { + val withSpaces = "שלום עולם" + val result = HebrewTextUtils.normalizeHebrew(withSpaces) + assertEquals("שלומ עולמ", result) + } + + @Test + fun `normalizeHebrew trims whitespace`() { + val withWhitespace = " שלום " + val result = HebrewTextUtils.normalizeHebrew(withWhitespace) + assertEquals("שלומ", result) + } + + @Test + fun `normalizeHebrew handles mixed Hebrew and ASCII`() { + val mixed = "Hello שלום World" + val result = HebrewTextUtils.normalizeHebrew(mixed) + assertEquals("Hello שלומ World", result) + } + + // --- replaceFinalsWithBase tests --- + + @Test + fun `replaceFinalsWithBase converts all final letters`() { + assertEquals("כ", HebrewTextUtils.replaceFinalsWithBase("ך")) + assertEquals("מ", HebrewTextUtils.replaceFinalsWithBase("ם")) + assertEquals("נ", HebrewTextUtils.replaceFinalsWithBase("ן")) + assertEquals("פ", HebrewTextUtils.replaceFinalsWithBase("ף")) + assertEquals("צ", HebrewTextUtils.replaceFinalsWithBase("ץ")) + } + + @Test + fun `replaceFinalsWithBase preserves non-final letters`() { + val text = "אבגדהוזחטיכלמנסעפצקרשת" + val result = HebrewTextUtils.replaceFinalsWithBase(text) + assertEquals(text, result) + } + + @Test + fun `replaceFinalsWithBase handles word with final letter at end`() { + assertEquals("מלכ", HebrewTextUtils.replaceFinalsWithBase("מלך")) + assertEquals("שלומ", HebrewTextUtils.replaceFinalsWithBase("שלום")) + } + + // --- isNikudOrTeamim tests --- + + @Test + fun `isNikudOrTeamim returns true for nikud characters`() { + // Shva, Patach, Kamatz, etc. 
+ val nikudChars = listOf('\u05B0', '\u05B1', '\u05B2', '\u05B3', '\u05B4', + '\u05B5', '\u05B6', '\u05B7', '\u05B8', '\u05B9', + '\u05BB', '\u05BC', '\u05BD') + nikudChars.forEach { char -> + assertTrue(HebrewTextUtils.isNikudOrTeamim(char), "Expected $char to be nikud") + } + } + + @Test + fun `isNikudOrTeamim returns true for teamim characters`() { + // Some teamim characters + val teamimChars = listOf('\u0591', '\u0592', '\u0593', '\u05A0', '\u05AF') + teamimChars.forEach { char -> + assertTrue(HebrewTextUtils.isNikudOrTeamim(char), "Expected $char to be teamim") + } + } + + @Test + fun `isNikudOrTeamim returns false for regular Hebrew letters`() { + val hebrewLetters = "אבגדהוזחטיכלמנסעפצקרשת" + hebrewLetters.forEach { char -> + assertFalse(HebrewTextUtils.isNikudOrTeamim(char), "Expected $char to NOT be nikud/teamim") + } + } + + @Test + fun `isNikudOrTeamim returns false for ASCII characters`() { + ('a'..'z').forEach { char -> + assertFalse(HebrewTextUtils.isNikudOrTeamim(char)) + } + ('0'..'9').forEach { char -> + assertFalse(HebrewTextUtils.isNikudOrTeamim(char)) + } + } + + // --- stripDiacriticsWithMap tests --- + + @Test + fun `stripDiacriticsWithMap removes all diacritics`() { + val withDiacritics = "בְּרֵאשִׁית" + val (plain, _) = HebrewTextUtils.stripDiacriticsWithMap(withDiacritics) + assertEquals("בראשית", plain) + } + + @Test + fun `stripDiacriticsWithMap returns correct index mapping`() { + // Simple case: "אָב" -> "אב" with mapping [0, 2] + val input = "א\u05B8ב" // א with kamatz, then ב + val (plain, map) = HebrewTextUtils.stripDiacriticsWithMap(input) + assertEquals("אב", plain) + assertEquals(2, map.size) + assertEquals(0, map[0]) // 'א' at position 0 in original + assertEquals(2, map[1]) // 'ב' at position 2 in original (after kamatz) + } + + @Test + fun `stripDiacriticsWithMap handles text without diacritics`() { + val plain = "שלום" + val (result, map) = HebrewTextUtils.stripDiacriticsWithMap(plain) + assertEquals(plain, result) + 
assertEquals(4, map.size) + // Each character maps to itself + for (i in map.indices) { + assertEquals(i, map[i]) + } + } + + @Test + fun `stripDiacriticsWithMap handles empty string`() { + val (plain, map) = HebrewTextUtils.stripDiacriticsWithMap("") + assertEquals("", plain) + assertEquals(0, map.size) + } + + // --- stripDiacritics tests --- + + @Test + fun `stripDiacritics removes diacritics without mapping`() { + val withDiacritics = "בְּרֵאשִׁית" + val result = HebrewTextUtils.stripDiacritics(withDiacritics) + assertEquals("בראשית", result) + } + + @Test + fun `stripDiacritics preserves text without diacritics`() { + val plain = "שלום עולם" + assertEquals(plain, HebrewTextUtils.stripDiacritics(plain)) + } + + @Test + fun `stripDiacritics handles empty string`() { + assertEquals("", HebrewTextUtils.stripDiacritics("")) + } + + // --- mapToOrigIndex tests --- + + @Test + fun `mapToOrigIndex returns correct index for valid input`() { + val map = intArrayOf(0, 2, 4, 6) + assertEquals(0, HebrewTextUtils.mapToOrigIndex(map, 0)) + assertEquals(2, HebrewTextUtils.mapToOrigIndex(map, 1)) + assertEquals(4, HebrewTextUtils.mapToOrigIndex(map, 2)) + assertEquals(6, HebrewTextUtils.mapToOrigIndex(map, 3)) + } + + @Test + fun `mapToOrigIndex clamps out of bounds index`() { + val map = intArrayOf(0, 2, 4) + // Index too high should return last valid + assertEquals(4, HebrewTextUtils.mapToOrigIndex(map, 10)) + // Negative index should return first + assertEquals(0, HebrewTextUtils.mapToOrigIndex(map, -1)) + } + + @Test + fun `mapToOrigIndex returns plainIndex for empty map`() { + val emptyMap = intArrayOf() + assertEquals(5, HebrewTextUtils.mapToOrigIndex(emptyMap, 5)) + } + + // --- SOFIT_MAP tests --- + + @Test + fun `SOFIT_MAP contains all five final letter mappings`() { + assertEquals(5, HebrewTextUtils.SOFIT_MAP.size) + assertEquals('כ', HebrewTextUtils.SOFIT_MAP['ך']) + assertEquals('מ', HebrewTextUtils.SOFIT_MAP['ם']) + assertEquals('נ', 
HebrewTextUtils.SOFIT_MAP['ן']) + assertEquals('פ', HebrewTextUtils.SOFIT_MAP['ף']) + assertEquals('צ', HebrewTextUtils.SOFIT_MAP['ץ']) + } + + // --- Edge cases and regression tests --- + + @Test + fun `normalizeHebrew handles real Torah verse`() { + // Genesis 1:1 with full nikud and teamim + val verse = "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃" + val normalized = HebrewTextUtils.normalizeHebrew(verse) + // Should be plain Hebrew without any diacritics + assertFalse(normalized.any { HebrewTextUtils.isNikudOrTeamim(it) }) + // Should contain the main words + assertTrue(normalized.contains("בראשית")) + assertTrue(normalized.contains("ברא")) + assertTrue(normalized.contains("אלהימ")) + } + + @Test + fun `normalizeHebrew handles Divine Name variants`() { + // Various representations of the Divine Name + val names = listOf("יהוה", "יְהוָה", "ה׳", "ה'") + names.forEach { name -> + val result = HebrewTextUtils.normalizeHebrew(name) + assertTrue(result.isNotEmpty(), "Normalized name should not be empty for: $name") + } + } +} diff --git a/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngineTest.kt b/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngineTest.kt new file mode 100644 index 0000000..324173b --- /dev/null +++ b/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngineTest.kt @@ -0,0 +1,625 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNotNull +import kotlin.test.assertNull +import kotlin.test.assertTrue +import kotlin.test.assertFalse +import org.apache.lucene.analysis.standard.StandardAnalyzer +import org.apache.lucene.document.Document +import org.apache.lucene.document.Field +import org.apache.lucene.document.IntPoint +import org.apache.lucene.document.NumericDocValuesField +import org.apache.lucene.document.StoredField +import 
org.apache.lucene.document.StringField +import org.apache.lucene.document.TextField +import org.apache.lucene.index.IndexWriter +import org.apache.lucene.index.IndexWriterConfig +import org.apache.lucene.store.ByteBuffersDirectory +import org.apache.lucene.store.FSDirectory +import java.nio.file.Files +import java.nio.file.Path + +class LuceneSearchEngineTest { + + // --- buildSnippet tests (no index required) --- + + @Test + fun `buildSnippet returns clean text for empty query`() { + val tempDir = createTempIndexDir() + try { + createMinimalIndex(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val rawText = "

שלום עולם

" + val result = engine.buildSnippet(rawText, "", 5) + + // Should return clean text without HTML + assertFalse(result.contains("

")) + assertTrue(result.contains("שלום עולם")) + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `buildSnippet highlights matching Hebrew text`() { + val tempDir = createTempIndexDir() + try { + createMinimalIndex(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val rawText = "בראשית ברא אלהים את השמים ואת הארץ" + val result = engine.buildSnippet(rawText, "בראשית", 5) + + // Should contain bold tags around the match + assertTrue(result.contains("") && result.contains("")) + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `buildSnippet handles text with nikud`() { + val tempDir = createTempIndexDir() + try { + createMinimalIndex(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val rawText = "בְּרֵאשִׁית בָּרָא אֱלֹהִים" + val result = engine.buildSnippet(rawText, "בראשית", 5) + + // Should still find and highlight despite nikud + assertTrue(result.contains("") || result.contains("בְּרֵאשִׁית")) + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `buildSnippet truncates long text with ellipsis`() { + val tempDir = createTempIndexDir() + try { + createMinimalIndex(tempDir) + val engine = LuceneSearchEngine(tempDir) + + // Create a very long text + val longText = "הקדמה ארוכה מאוד " + "מילה ".repeat(200) + " סיום הטקסט" + val result = engine.buildSnippet(longText, "הקדמה", 5) + + // Should contain ellipsis if truncated + // The snippet extracts context around the match, may have ... 
at end + assertTrue(result.length < longText.length || result.contains("...")) + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + // --- openSession tests --- + + @Test + fun `openSession returns null for blank query`() { + val tempDir = createTempIndexDir() + try { + createMinimalIndex(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val session = engine.openSession("", 5) + assertNull(session) + + val sessionBlank = engine.openSession(" ", 5) + assertNull(sessionBlank) + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `openSession returns session for valid query`() { + val tempDir = createTempIndexDir() + try { + createIndexWithContent(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val session = engine.openSession("שלום", 5) + assertNotNull(session) + session.close() + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `session nextPage returns results`() { + val tempDir = createTempIndexDir() + try { + createIndexWithContent(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val session = engine.openSession("שלום", 5) + assertNotNull(session) + + val page = session.nextPage(10) + assertNotNull(page) + assertTrue(page.hits.isNotEmpty()) + assertTrue(page.totalHits > 0) + + session.close() + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `session pagination works correctly`() { + val tempDir = createTempIndexDir() + try { + // Create index with multiple documents + createIndexWithMultipleDocuments(tempDir, count = 25) + val engine = LuceneSearchEngine(tempDir) + + val session = engine.openSession("טקסט", 5) + assertNotNull(session) + + // First page + val page1 = session.nextPage(10) + assertNotNull(page1) + assertEquals(10, page1.hits.size) + assertFalse(page1.isLastPage) + + // Second page + val page2 = session.nextPage(10) + assertNotNull(page2) + assertEquals(10, page2.hits.size) + assertFalse(page2.isLastPage) + + // 
Third page (last, should have 5 items) + val page3 = session.nextPage(10) + assertNotNull(page3) + assertEquals(5, page3.hits.size) + assertTrue(page3.isLastPage) + + // No more pages + val page4 = session.nextPage(10) + assertNull(page4) + + session.close() + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `session can be closed multiple times safely`() { + val tempDir = createTempIndexDir() + try { + createIndexWithContent(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val session = engine.openSession("שלום", 5) + assertNotNull(session) + + // Close multiple times should not throw + session.close() + // Second close - should be safe + try { + session.close() + } catch (e: Exception) { + // Some implementations may throw on double close, that's acceptable + } + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + // --- searchBooksByTitlePrefix tests --- + + @Test + fun `searchBooksByTitlePrefix returns empty for blank query`() { + val tempDir = createTempIndexDir() + try { + createIndexWithBookTitles(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val result = engine.searchBooksByTitlePrefix("") + assertTrue(result.isEmpty()) + + val resultBlank = engine.searchBooksByTitlePrefix(" ") + assertTrue(resultBlank.isEmpty()) + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `searchBooksByTitlePrefix finds matching books`() { + val tempDir = createTempIndexDir() + try { + createIndexWithBookTitles(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val result = engine.searchBooksByTitlePrefix("בראשית") + assertTrue(result.isNotEmpty()) + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `searchBooksByTitlePrefix respects limit`() { + val tempDir = createTempIndexDir() + try { + createIndexWithMultipleBookTitles(tempDir, count = 10) + val engine = LuceneSearchEngine(tempDir) + + val result = engine.searchBooksByTitlePrefix("ספר", limit = 
5) + assertTrue(result.size <= 5) + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + // --- Filter tests --- + + @Test + fun `openSession with book filter returns only matching book`() { + val tempDir = createTempIndexDir() + try { + createIndexWithMultipleBooks(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val session = engine.openSession("טקסט", 5, bookFilter = 1L) + assertNotNull(session) + + val page = session.nextPage(100) + assertNotNull(page) + + // All results should be from book 1 + assertTrue(page.hits.all { it.bookId == 1L }) + + session.close() + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `openSession with bookIds filter returns only matching books`() { + val tempDir = createTempIndexDir() + try { + createIndexWithMultipleBooks(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val session = engine.openSession("טקסט", 5, bookIds = listOf(1L, 2L)) + assertNotNull(session) + + val page = session.nextPage(100) + assertNotNull(page) + + // All results should be from books 1 or 2 + assertTrue(page.hits.all { it.bookId in listOf(1L, 2L) }) + + session.close() + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + // --- LineHit data tests --- + + @Test + fun `LineHit contains all required fields`() { + val tempDir = createTempIndexDir() + try { + createIndexWithContent(tempDir) + val engine = LuceneSearchEngine(tempDir) + + val session = engine.openSession("שלום", 5) + assertNotNull(session) + + val page = session.nextPage(10) + assertNotNull(page) + assertTrue(page.hits.isNotEmpty()) + + val hit = page.hits.first() + assertTrue(hit.bookId > 0) + assertTrue(hit.bookTitle.isNotEmpty()) + assertTrue(hit.lineId > 0) + assertTrue(hit.lineIndex >= 0) + assertTrue(hit.snippet.isNotEmpty()) + assertTrue(hit.score > 0) + assertTrue(hit.rawText.isNotEmpty()) + + session.close() + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + // --- Edge cases --- + + @Test + 
fun `handles special Hebrew characters in query`() { + val tempDir = createTempIndexDir() + try { + createIndexWithContent(tempDir) + val engine = LuceneSearchEngine(tempDir) + + // Query with gershayim + val session1 = engine.openSession("רש\"י", 5) + // Should not throw, may return null or empty results + session1?.close() + + // Query with maqaf + val session2 = engine.openSession("מה-טבו", 5) + session2?.close() + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + @Test + fun `handles query with Hashem representation`() { + val tempDir = createTempIndexDir() + try { + createIndexWithHashemContent(tempDir) + val engine = LuceneSearchEngine(tempDir) + + // Query with ה׳ + val session = engine.openSession("ה׳", 5) + // Should handle without error + session?.close() + + engine.close() + } finally { + deleteDirectory(tempDir) + } + } + + // --- Helper methods --- + + private fun createTempIndexDir(): Path { + return Files.createTempDirectory("lucene_test_index") + } + + private fun deleteDirectory(path: Path) { + if (Files.exists(path)) { + Files.walk(path) + .sorted(Comparator.reverseOrder()) + .forEach { Files.deleteIfExists(it) } + } + } + + private fun createMinimalIndex(indexDir: Path) { + FSDirectory.open(indexDir).use { dir -> + val config = IndexWriterConfig(StandardAnalyzer()) + IndexWriter(dir, config).use { writer -> + // Create at least one document so the index is valid + val doc = Document().apply { + add(StringField("type", "line", Field.Store.YES)) + add(StoredField("book_id", 1)) + add(IntPoint("book_id", 1)) + add(StoredField("book_title", "Test Book")) + add(StoredField("line_id", 1)) + add(IntPoint("line_id", 1)) + add(StoredField("line_index", 0)) + add(TextField("text", "שלום עולם", Field.Store.NO)) + add(StoredField("text_raw", "שלום עולם")) + add(StoredField("is_base_book", 0)) + add(StoredField("order_index", 1)) + } + writer.addDocument(doc) + } + } + } + + private fun createIndexWithContent(indexDir: Path) { + 
FSDirectory.open(indexDir).use { dir -> + val config = IndexWriterConfig(StandardAnalyzer()) + IndexWriter(dir, config).use { writer -> + val texts = listOf( + "שלום עולם, ברוכים הבאים", + "בראשית ברא אלהים את השמים ואת הארץ", + "שלום וברכה לכל הקוראים" + ) + + texts.forEachIndexed { idx, text -> + val doc = Document().apply { + add(StringField("type", "line", Field.Store.YES)) + add(StoredField("book_id", 1)) + add(IntPoint("book_id", 1)) + add(StoredField("book_title", "ספר בדיקה")) + add(StoredField("line_id", idx + 1L)) + add(IntPoint("line_id", idx + 1)) + add(StoredField("line_index", idx)) + add(TextField("text", HebrewTextUtils.normalizeHebrew(text), Field.Store.NO)) + add(StoredField("text_raw", text)) + add(StoredField("is_base_book", 1)) + add(StoredField("order_index", 1)) + } + writer.addDocument(doc) + } + } + } + } + + private fun createIndexWithMultipleDocuments(indexDir: Path, count: Int) { + FSDirectory.open(indexDir).use { dir -> + val config = IndexWriterConfig(StandardAnalyzer()) + IndexWriter(dir, config).use { writer -> + repeat(count) { idx -> + val text = "טקסט מספר $idx עם תוכן לבדיקה" + val doc = Document().apply { + add(StringField("type", "line", Field.Store.YES)) + add(StoredField("book_id", 1)) + add(IntPoint("book_id", 1)) + add(StoredField("book_title", "ספר בדיקה")) + add(StoredField("line_id", idx + 1L)) + add(IntPoint("line_id", idx + 1)) + add(StoredField("line_index", idx)) + add(TextField("text", HebrewTextUtils.normalizeHebrew(text), Field.Store.NO)) + add(StoredField("text_raw", text)) + add(StoredField("is_base_book", 1)) + add(StoredField("order_index", 1)) + } + writer.addDocument(doc) + } + } + } + } + + private fun createIndexWithBookTitles(indexDir: Path) { + FSDirectory.open(indexDir).use { dir -> + val config = IndexWriterConfig(StandardAnalyzer()) + IndexWriter(dir, config).use { writer -> + val titles = listOf( + "בראשית רבה", + "שמות רבה", + "ויקרא רבה" + ) + + titles.forEachIndexed { idx, title -> + val doc = 
Document().apply { + add(StringField("type", "book_title", Field.Store.YES)) + add(StoredField("book_id", idx + 1L)) + add(IntPoint("book_id", idx + 1)) + add(TextField("title", HebrewTextUtils.normalizeHebrew(title), Field.Store.YES)) + } + writer.addDocument(doc) + } + + // Also add some line documents + createLineDocuments(writer) + } + } + } + + private fun createIndexWithMultipleBookTitles(indexDir: Path, count: Int) { + FSDirectory.open(indexDir).use { dir -> + val config = IndexWriterConfig(StandardAnalyzer()) + IndexWriter(dir, config).use { writer -> + repeat(count) { idx -> + val title = "ספר מספר ${idx + 1}" + val doc = Document().apply { + add(StringField("type", "book_title", Field.Store.YES)) + add(StoredField("book_id", idx + 1L)) + add(IntPoint("book_id", idx + 1)) + add(TextField("title", HebrewTextUtils.normalizeHebrew(title), Field.Store.YES)) + } + writer.addDocument(doc) + } + } + } + } + + private fun createIndexWithMultipleBooks(indexDir: Path) { + FSDirectory.open(indexDir).use { dir -> + val config = IndexWriterConfig(StandardAnalyzer()) + IndexWriter(dir, config).use { writer -> + val booksData = listOf( + 1L to "ספר ראשון", + 2L to "ספר שני", + 3L to "ספר שלישי" + ) + + var lineId = 1L + booksData.forEach { (bookId, bookTitle) -> + repeat(5) { idx -> + val text = "טקסט בספר $bookId שורה $idx" + val doc = Document().apply { + add(StringField("type", "line", Field.Store.YES)) + add(StoredField("book_id", bookId)) + add(IntPoint("book_id", bookId.toInt())) + add(StoredField("book_title", bookTitle)) + add(StoredField("line_id", lineId)) + add(IntPoint("line_id", lineId.toInt())) + add(StoredField("line_index", idx)) + add(TextField("text", HebrewTextUtils.normalizeHebrew(text), Field.Store.NO)) + add(StoredField("text_raw", text)) + add(StoredField("is_base_book", 1)) + add(StoredField("order_index", bookId.toInt())) + } + writer.addDocument(doc) + lineId++ + } + } + } + } + } + + private fun createIndexWithHashemContent(indexDir: Path) { + 
FSDirectory.open(indexDir).use { dir -> + val config = IndexWriterConfig(StandardAnalyzer()) + IndexWriter(dir, config).use { writer -> + val texts = listOf( + "ברוך ה׳ לעולם", + "יהוה אלהי ישראל", + "ה' מלך ה' מלך" + ) + + texts.forEachIndexed { idx, text -> + val doc = Document().apply { + add(StringField("type", "line", Field.Store.YES)) + add(StoredField("book_id", 1)) + add(IntPoint("book_id", 1)) + add(StoredField("book_title", "ספר תפילות")) + add(StoredField("line_id", idx + 1L)) + add(IntPoint("line_id", idx + 1)) + add(StoredField("line_index", idx)) + add(TextField("text", HebrewTextUtils.normalizeHebrew(text), Field.Store.NO)) + add(StoredField("text_raw", text)) + add(StoredField("is_base_book", 1)) + add(StoredField("order_index", 1)) + } + writer.addDocument(doc) + } + } + } + } + + private fun createLineDocuments(writer: IndexWriter) { + val text = "טקסט לבדיקה עם תוכן" + val doc = Document().apply { + add(StringField("type", "line", Field.Store.YES)) + add(StoredField("book_id", 1)) + add(IntPoint("book_id", 1)) + add(StoredField("book_title", "ספר בדיקה")) + add(StoredField("line_id", 1L)) + add(IntPoint("line_id", 1)) + add(StoredField("line_index", 0)) + add(TextField("text", HebrewTextUtils.normalizeHebrew(text), Field.Store.NO)) + add(StoredField("text_raw", text)) + add(StoredField("is_base_book", 1)) + add(StoredField("order_index", 1)) + } + writer.addDocument(doc) + } +} diff --git a/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/MagicDictionaryIndexTest.kt b/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/MagicDictionaryIndexTest.kt new file mode 100644 index 0000000..973707a --- /dev/null +++ b/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/MagicDictionaryIndexTest.kt @@ -0,0 +1,425 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNotNull +import kotlin.test.assertNull 
+import kotlin.test.assertTrue +import java.nio.file.Files +import java.nio.file.Path +import java.sql.DriverManager + +class MagicDictionaryIndexTest { + + private val simpleNorm: (String) -> String = { it.trim().lowercase() } + private val hebrewNorm: (String) -> String = { HebrewTextUtils.normalizeHebrew(it) } + + // --- load() tests --- + + @Test + fun `load returns null for null path`() { + val result = MagicDictionaryIndex.load(simpleNorm, null) + assertNull(result) + } + + @Test + fun `load returns null for non-existent file`() { + val nonExistentPath = Path.of("/non/existent/path/lexical.db") + val result = MagicDictionaryIndex.load(simpleNorm, nonExistentPath) + assertNull(result) + } + + @Test + fun `load returns null for file without required tables`() { + // Create a temporary SQLite file without the required tables + val tempFile = Files.createTempFile("invalid_lexical", ".db") + try { + DriverManager.getConnection("jdbc:sqlite:${tempFile.toAbsolutePath()}").use { conn -> + conn.createStatement().use { stmt -> + stmt.execute("CREATE TABLE other_table (id INTEGER PRIMARY KEY)") + } + } + + val result = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNull(result) + } finally { + Files.deleteIfExists(tempFile) + } + } + + @Test + fun `load returns index for valid database`() { + val tempFile = createValidTestDatabase() + try { + val result = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(result) + } finally { + Files.deleteIfExists(tempFile) + } + } + + // --- findValidDictionary() tests --- + + @Test + fun `findValidDictionary returns null for empty list`() { + val result = MagicDictionaryIndex.findValidDictionary(emptyList()) + assertNull(result) + } + + @Test + fun `findValidDictionary returns null when no valid candidate`() { + val candidates = listOf( + Path.of("/non/existent/path1.db"), + Path.of("/non/existent/path2.db") + ) + val result = MagicDictionaryIndex.findValidDictionary(candidates) + assertNull(result) + } + + 
@Test + fun `findValidDictionary returns first valid candidate`() { + val tempFile1 = createValidTestDatabase() + val tempFile2 = createValidTestDatabase() + try { + val candidates = listOf(tempFile1, tempFile2) + val result = MagicDictionaryIndex.findValidDictionary(candidates) + assertEquals(tempFile1, result) + } finally { + Files.deleteIfExists(tempFile1) + Files.deleteIfExists(tempFile2) + } + } + + @Test + fun `findValidDictionary skips invalid candidates`() { + val invalidFile = Files.createTempFile("invalid", ".db") + val validFile = createValidTestDatabase() + try { + // Create invalid database without required tables + DriverManager.getConnection("jdbc:sqlite:${invalidFile.toAbsolutePath()}").use { conn -> + conn.createStatement().use { stmt -> + stmt.execute("CREATE TABLE other_table (id INTEGER PRIMARY KEY)") + } + } + + val candidates = listOf(invalidFile, validFile) + val result = MagicDictionaryIndex.findValidDictionary(candidates) + assertEquals(validFile, result) + } finally { + Files.deleteIfExists(invalidFile) + Files.deleteIfExists(validFile) + } + } + + // --- expansionFor() tests --- + + @Test + fun `expansionFor returns null for empty token`() { + val tempFile = createValidTestDatabase() + try { + val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + val result = index.expansionFor("") + assertNull(result) + + val resultBlank = index.expansionFor(" ") + assertNull(resultBlank) + } finally { + Files.deleteIfExists(tempFile) + } + } + + @Test + fun `expansionFor returns null for unknown token`() { + val tempFile = createValidTestDatabase() + try { + val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + val result = index.expansionFor("unknownword12345") + assertNull(result) + } finally { + Files.deleteIfExists(tempFile) + } + } + + @Test + fun `expansionFor returns expansion for known token`() { + val tempFile = createTestDatabaseWithData() + try { + val index = 
MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + val result = index.expansionFor("word1") + assertNotNull(result) + assertTrue(result.surface.isNotEmpty()) + } finally { + Files.deleteIfExists(tempFile) + } + } + + // --- expansionsFor() tests --- + + @Test + fun `expansionsFor returns empty list for empty token list`() { + val tempFile = createValidTestDatabase() + try { + val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + val result = index.expansionsFor(emptyList()) + assertTrue(result.isEmpty()) + } finally { + Files.deleteIfExists(tempFile) + } + } + + @Test + fun `expansionsFor returns empty list for unknown tokens`() { + val tempFile = createValidTestDatabase() + try { + val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + val result = index.expansionsFor(listOf("unknown1", "unknown2")) + assertTrue(result.isEmpty()) + } finally { + Files.deleteIfExists(tempFile) + } + } + + @Test + fun `expansionsFor returns distinct expansions`() { + val tempFile = createTestDatabaseWithData() + try { + val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + // Query the same token twice - should return distinct results + val result = index.expansionsFor(listOf("word1", "word1")) + // Distinct should deduplicate + assertTrue(result.size <= 1 || result.distinct().size == result.size) + } finally { + Files.deleteIfExists(tempFile) + } + } + + @Test + fun `expansionsFor handles multiple tokens`() { + val tempFile = createTestDatabaseWithData() + try { + val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + val result = index.expansionsFor(listOf("word1", "word2")) + assertNotNull(result) + } finally { + Files.deleteIfExists(tempFile) + } + } + + // --- loadHashemSurfaces() tests --- + + @Test + fun `loadHashemSurfaces returns empty list when no Hashem entries`() { + val tempFile = createValidTestDatabase() + try { + 
val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + val result = index.loadHashemSurfaces() + assertTrue(result.isEmpty()) + } finally { + Files.deleteIfExists(tempFile) + } + } + + @Test + fun `loadHashemSurfaces returns surfaces for יהוה base`() { + val tempFile = createTestDatabaseWithHashemData() + try { + val index = MagicDictionaryIndex.load(hebrewNorm, tempFile) + assertNotNull(index) + + val result = index.loadHashemSurfaces() + assertTrue(result.isNotEmpty()) + assertTrue(result.contains("יהוה")) + } finally { + Files.deleteIfExists(tempFile) + } + } + + // --- Hebrew normalization integration tests --- + + @Test + fun `expansionFor works with Hebrew normalization`() { + val tempFile = createTestDatabaseWithHebrewData() + try { + val index = MagicDictionaryIndex.load(hebrewNorm, tempFile) + assertNotNull(index) + + // Search for a word with sofit letter + val result = index.expansionFor("מלך") + assertNotNull(result) + } finally { + Files.deleteIfExists(tempFile) + } + } + + @Test + fun `expansion prefers base match over largest expansion`() { + val tempFile = createTestDatabaseWithMultipleExpansions() + try { + val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + // When token matches a base, prefer that expansion + val result = index.expansionFor("root") + assertNotNull(result) + assertTrue(result.base.contains("root")) + } finally { + Files.deleteIfExists(tempFile) + } + } + + // --- Caching tests --- + + @Test + fun `repeated lookups use cache`() { + val tempFile = createTestDatabaseWithData() + try { + val index = MagicDictionaryIndex.load(simpleNorm, tempFile) + assertNotNull(index) + + // First lookup + val result1 = index.expansionFor("word1") + // Second lookup (should use cache) + val result2 = index.expansionFor("word1") + + assertEquals(result1, result2) + } finally { + Files.deleteIfExists(tempFile) + } + } + + // --- Helper methods --- + + private fun 
createValidTestDatabase(): Path { + val tempFile = Files.createTempFile("valid_lexical", ".db") + DriverManager.getConnection("jdbc:sqlite:${tempFile.toAbsolutePath()}").use { conn -> + conn.createStatement().use { stmt -> + stmt.execute(""" + CREATE TABLE base ( + id INTEGER PRIMARY KEY, + value TEXT NOT NULL + ) + """.trimIndent()) + stmt.execute(""" + CREATE TABLE surface ( + id INTEGER PRIMARY KEY, + value TEXT NOT NULL, + base_id INTEGER NOT NULL, + FOREIGN KEY (base_id) REFERENCES base(id) + ) + """.trimIndent()) + stmt.execute(""" + CREATE TABLE variant ( + id INTEGER PRIMARY KEY, + value TEXT NOT NULL + ) + """.trimIndent()) + stmt.execute(""" + CREATE TABLE surface_variant ( + surface_id INTEGER NOT NULL, + variant_id INTEGER NOT NULL, + PRIMARY KEY (surface_id, variant_id), + FOREIGN KEY (surface_id) REFERENCES surface(id), + FOREIGN KEY (variant_id) REFERENCES variant(id) + ) + """.trimIndent()) + } + } + return tempFile + } + + private fun createTestDatabaseWithData(): Path { + val tempFile = createValidTestDatabase() + DriverManager.getConnection("jdbc:sqlite:${tempFile.toAbsolutePath()}").use { conn -> + conn.createStatement().use { stmt -> + // Insert base + stmt.execute("INSERT INTO base (id, value) VALUES (1, 'root1')") + stmt.execute("INSERT INTO base (id, value) VALUES (2, 'root2')") + + // Insert surfaces + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (1, 'word1', 1)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (2, 'word1a', 1)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (3, 'word2', 2)") + + // Insert variants + stmt.execute("INSERT INTO variant (id, value) VALUES (1, 'var1')") + stmt.execute("INSERT INTO variant (id, value) VALUES (2, 'var2')") + + // Link surfaces to variants + stmt.execute("INSERT INTO surface_variant (surface_id, variant_id) VALUES (1, 1)") + stmt.execute("INSERT INTO surface_variant (surface_id, variant_id) VALUES (1, 2)") + } + } + return tempFile + } + + private 
fun createTestDatabaseWithHashemData(): Path { + val tempFile = createValidTestDatabase() + DriverManager.getConnection("jdbc:sqlite:${tempFile.toAbsolutePath()}").use { conn -> + conn.createStatement().use { stmt -> + // Insert base with יהוה + stmt.execute("INSERT INTO base (id, value) VALUES (1, 'יהוה')") + + // Insert surfaces for Hashem + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (1, 'יהוה', 1)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (2, 'ה׳', 1)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (3, 'השם', 1)") + } + } + return tempFile + } + + private fun createTestDatabaseWithHebrewData(): Path { + val tempFile = createValidTestDatabase() + DriverManager.getConnection("jdbc:sqlite:${tempFile.toAbsolutePath()}").use { conn -> + conn.createStatement().use { stmt -> + // Insert Hebrew base + stmt.execute("INSERT INTO base (id, value) VALUES (1, 'מלכ')") + + // Insert surfaces with final letter forms + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (1, 'מלך', 1)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (2, 'מלכים', 1)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (3, 'מלכות', 1)") + } + } + return tempFile + } + + private fun createTestDatabaseWithMultipleExpansions(): Path { + val tempFile = createValidTestDatabase() + DriverManager.getConnection("jdbc:sqlite:${tempFile.toAbsolutePath()}").use { conn -> + conn.createStatement().use { stmt -> + // Base that matches search token + stmt.execute("INSERT INTO base (id, value) VALUES (1, 'root')") + // Another base with more surfaces + stmt.execute("INSERT INTO base (id, value) VALUES (2, 'otherroot')") + + // Surfaces for first base (small) + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (1, 'root', 1)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (2, 'rooted', 1)") + + // Surfaces for second base (larger) + stmt.execute("INSERT INTO surface (id, value, 
base_id) VALUES (3, 'root', 2)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (4, 'roots', 2)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (5, 'rooting', 2)") + stmt.execute("INSERT INTO surface (id, value, base_id) VALUES (6, 'rootless', 2)") + } + } + return tempFile + } +} diff --git a/settings.gradle.kts b/settings.gradle.kts index 7fae642..18cec49 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -30,6 +30,7 @@ dependencyResolutionManagement { } include(":core") include(":dao") +include(":search") include(":catalog") include(":searchindex") include(":packaging")