Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,14 @@ class SeforimRepository(databasePath: String, private val driver: SqlDriver) {
database.categoryClosureQueriesQueries.selectDescendants(ancestorId).executeAsList()
}

/**
 * Looks up every ancestor category ID for [categoryId] (the category itself is
 * included) via the category_closure table. Search-index builders use this to
 * pre-compute ancestor lists for filtering.
 */
suspend fun getAncestorCategoryIds(categoryId: Long): List<Long> =
    withContext(Dispatchers.IO) {
        database.categoryClosureQueriesQueries
            .selectAncestors(categoryId)
            .executeAsList()
    }

/**
* Finds categories whose title matches the LIKE pattern. Use %term% for contains.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ interface TextIndexWriter : AutoCloseable {
* @param bookId The book id
* @param bookTitle The book title (for display)
* @param categoryId The category id of the book
* @param ancestorCategoryIds List of ancestor category IDs (including categoryId itself) for filtering
* @param lineId The line id
* @param lineIndex The 0-based line index within the book
* @param normalizedText Normalized text to index in the primary field (typically StandardAnalyzer)
Expand All @@ -24,6 +25,7 @@ interface TextIndexWriter : AutoCloseable {
bookId: Long,
bookTitle: String,
categoryId: Long,
ancestorCategoryIds: List<Long> = emptyList(),
lineId: Long,
lineIndex: Int,
normalizedText: String,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import kotlinx.coroutines.runBlocking
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.Dispatchers
import java.util.concurrent.ConcurrentHashMap
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.TokenStream
Expand Down Expand Up @@ -123,6 +124,9 @@ fun main() = runBlocking {
logger.i { "Indexing $totalBooks books into $indexDir using StandardAnalyzer + 4-gram field" }
val progress = java.util.concurrent.atomic.AtomicInteger(0)

// Pre-compute ancestor category IDs per category (cached across books)
val ancestorsByCategory = ConcurrentHashMap<Long, List<Long>>()

books.map { book ->
async(workerDispatcher) {
val current = progress.incrementAndGet()
Expand Down Expand Up @@ -191,6 +195,10 @@ fun main() = runBlocking {
val allLines = runCatching { localRepo.getLines(book.id, 0, total - 1) }.getOrDefault(emptyList())
// Note: rawPlainText is no longer stored in the index.
// Snippet source is fetched from DB at query time by RepositorySnippetSourceProvider.
// Pre-compute ancestor category IDs for this book's category (cached)
val ancestors = ancestorsByCategory.getOrPut(book.categoryId) {
runBlocking { localRepo.getAncestorCategoryIds(book.categoryId) }
}
var processed = 0
var nextLogPct = 10
for (ln in allLines) {
Expand All @@ -199,6 +207,7 @@ fun main() = runBlocking {
bookId = book.id,
bookTitle = book.title,
categoryId = book.categoryId,
ancestorCategoryIds = ancestors,
lineId = ln.id,
lineIndex = ln.lineIndex,
normalizedText = normalized,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class LuceneTextIndexWriter(

const val FIELD_BOOK_ID = "book_id"
const val FIELD_CATEGORY_ID = "category_id"
const val FIELD_ANCESTOR_CATEGORY_IDS = "ancestor_category_ids"
const val FIELD_BOOK_TITLE = "book_title"
const val FIELD_LINE_ID = "line_id"
const val FIELD_LINE_INDEX = "line_index"
Expand All @@ -59,6 +60,7 @@ class LuceneTextIndexWriter(
bookId: Long,
bookTitle: String,
categoryId: Long,
ancestorCategoryIds: List<Long>,
lineId: Long,
lineIndex: Int,
normalizedText: String,
Expand All @@ -76,6 +78,14 @@ class LuceneTextIndexWriter(
add(IntPoint(FIELD_CATEGORY_ID, categoryId.toInt()))
add(StoredField(FIELD_BOOK_TITLE, bookTitle))

// Index ancestor category IDs for efficient filtering and retrieval
// IntPoint for filtering (multi-valued)
for (ancestorId in ancestorCategoryIds) {
add(IntPoint(FIELD_ANCESTOR_CATEGORY_IDS, ancestorId.toInt()))
}
// StoredField for retrieval (comma-separated)
add(StoredField(FIELD_ANCESTOR_CATEGORY_IDS, ancestorCategoryIds.joinToString(",")))

add(StoredField(FIELD_LINE_ID, lineId))
add(IntPoint(FIELD_LINE_ID, lineId.toInt()))
add(StoredField(FIELD_LINE_INDEX, lineIndex))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,19 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.index.DirectoryReader
import org.apache.lucene.index.StoredFields
import org.apache.lucene.index.Term
import org.apache.lucene.index.LeafReaderContext
import org.apache.lucene.search.BooleanClause
import org.apache.lucene.search.BooleanQuery
import org.apache.lucene.search.BoostQuery
import org.apache.lucene.search.Collector
import org.apache.lucene.search.FuzzyQuery
import org.apache.lucene.search.IndexSearcher
import org.apache.lucene.search.LeafCollector
import org.apache.lucene.search.PrefixQuery
import org.apache.lucene.search.Query
import org.apache.lucene.search.Scorable
import org.apache.lucene.search.ScoreDoc
import org.apache.lucene.search.ScoreMode
import org.apache.lucene.search.TermQuery
import org.apache.lucene.util.QueryBuilder
import org.apache.lucene.store.FSDirectory
Expand Down Expand Up @@ -147,9 +152,10 @@ class LuceneSearchEngine(
bookFilter: Long?,
categoryFilter: Long?,
bookIds: Collection<Long>?,
lineIds: Collection<Long>?
lineIds: Collection<Long>?,
baseBookOnly: Boolean
): SearchSession? {
val context = buildSearchContext(query, near, bookFilter, categoryFilter, bookIds, lineIds) ?: return null
val context = buildSearchContext(query, near, bookFilter, categoryFilter, bookIds, lineIds, baseBookOnly) ?: return null
val reader = DirectoryReader.open(dir)
return LuceneSearchSession(context.query, context.anchorTerms, context.highlightTerms, reader)
}
Expand Down Expand Up @@ -241,6 +247,68 @@ class LuceneSearchEngine(
// Directory is closed automatically when readers are closed
}

override fun computeFacets(
    query: String,
    near: Int,
    bookFilter: Long?,
    categoryFilter: Long?,
    bookIds: Collection<Long>?,
    lineIds: Collection<Long>?,
    baseBookOnly: Boolean
): SearchFacets? {
    val context = buildSearchContext(query, near, bookFilter, categoryFilter, bookIds, lineIds, baseBookOnly)
        ?: return null

    return withSearcher { searcher ->
        val categoryCounts = mutableMapOf<Long, Int>()
        val bookCounts = mutableMapOf<Long, Int>()
        var totalHits = 0L

        // Only these two stored fields are aggregated. Passing them to
        // StoredFields.document(docID, fieldsToLoad) avoids deserializing every
        // stored field (book title, line ids, ...) for each matching document,
        // which is the dominant cost of stored-field-based faceting.
        val fieldsToLoad = setOf("book_id", "ancestor_category_ids")

        // Lightweight collector that only reads stored fields for aggregation;
        // no scoring, no hit buffering.
        val collector = object : Collector {
            override fun getLeafCollector(leafContext: LeafReaderContext): LeafCollector {
                // StoredFields reader is per-leaf; doc IDs passed to collect()
                // are leaf-relative, matching this reader.
                val storedFields = leafContext.reader().storedFields()

                return object : LeafCollector {
                    override fun setScorer(scorer: Scorable) {
                        // No scoring needed for facet counting.
                    }

                    override fun collect(doc: Int) {
                        totalHits++
                        val luceneDoc = storedFields.document(doc, fieldsToLoad)

                        // Book count (field stored as a numeric StoredField).
                        val bookId = luceneDoc.getField("book_id")?.numericValue()?.toLong()
                        if (bookId != null) {
                            bookCounts[bookId] = (bookCounts[bookId] ?: 0) + 1
                        }

                        // Category counts from ancestors (stored as a
                        // comma-separated string; includes the category itself).
                        // Documents indexed before this field existed simply
                        // contribute no category counts.
                        val ancestorStr = luceneDoc.getField("ancestor_category_ids")?.stringValue() ?: ""
                        if (ancestorStr.isNotEmpty()) {
                            for (idStr in ancestorStr.split(",")) {
                                val catId = idStr.trim().toLongOrNull() ?: continue
                                categoryCounts[catId] = (categoryCounts[catId] ?: 0) + 1
                            }
                        }
                    }
                }
            }

            override fun scoreMode(): ScoreMode = ScoreMode.COMPLETE_NO_SCORES
        }

        searcher.search(context.query, collector)

        SearchFacets(
            totalHits = totalHits,
            categoryCounts = categoryCounts.toMap(),
            bookCounts = bookCounts.toMap()
        )
    }
}

// --- Inner SearchSession class ---

inner class LuceneSearchSession internal constructor(
Expand Down Expand Up @@ -307,7 +375,8 @@ class LuceneSearchEngine(
bookFilter: Long?,
categoryFilter: Long?,
bookIds: Collection<Long>?,
lineIds: Collection<Long>?
lineIds: Collection<Long>?,
baseBookOnly: Boolean = false
): SearchContext? {
val norm = HebrewTextUtils.normalizeHebrew(rawQuery)
if (norm.isBlank()) return null
Expand Down Expand Up @@ -386,6 +455,8 @@ class LuceneSearchEngine(
builder.add(TermQuery(Term("type", "line")), BooleanClause.Occur.FILTER)
if (bookFilter != null) builder.add(IntPoint.newExactQuery("book_id", bookFilter.toInt()), BooleanClause.Occur.FILTER)
if (categoryFilter != null) builder.add(IntPoint.newExactQuery("category_id", categoryFilter.toInt()), BooleanClause.Occur.FILTER)
// Filter by base books only (is_base_book = 1) when baseBookOnly is true
if (baseBookOnly) builder.add(IntPoint.newExactQuery("is_base_book", 1), BooleanClause.Occur.FILTER)
val bookIdsArray = bookIds?.map { it.toInt() }?.toIntArray()
if (bookIdsArray != null && bookIdsArray.isNotEmpty()) {
builder.add(IntPoint.newSetQuery("book_id", *bookIdsArray), BooleanClause.Occur.FILTER)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ interface SearchEngine : Closeable {
* @param categoryFilter Optional category ID to restrict results
* @param bookIds Optional collection of book IDs to restrict results (OR logic)
* @param lineIds Optional collection of line IDs to restrict results (OR logic)
* @param baseBookOnly If true, restrict results to base books only (default: false)
* @return A [SearchSession] for paginated access to results, or null if query is invalid
*/
fun openSession(
Expand All @@ -54,7 +55,8 @@ interface SearchEngine : Closeable {
bookFilter: Long? = null,
categoryFilter: Long? = null,
bookIds: Collection<Long>? = null,
lineIds: Collection<Long>? = null
lineIds: Collection<Long>? = null,
baseBookOnly: Boolean = false
): SearchSession?

/**
Expand Down Expand Up @@ -95,4 +97,30 @@ interface SearchEngine : Closeable {
* @return List of normalized terms to highlight (includes original tokens + expansions)
*/
fun buildHighlightTerms(query: String): List<String>

/**
 * Computes aggregate facet counts without loading full results.
 *
 * Uses a lightweight Lucene collector that only reads book IDs and ancestor
 * category IDs from the index. This is much faster than streaming all results
 * and allows the UI to display the category/book tree immediately.
 *
 * Counts are per matching document (i.e. per indexed line), so a book's count
 * is the number of matching lines it contains, not 0/1.
 *
 * @param query The search query in Hebrew (may contain nikud/teamim)
 * @param near Proximity slop for phrase matching (default: 5)
 * @param bookFilter Optional single book ID to restrict results
 * @param categoryFilter Optional category ID to restrict results
 * @param bookIds Optional collection of book IDs to restrict results (OR logic)
 * @param lineIds Optional collection of line IDs to restrict results (OR logic)
 * @param baseBookOnly If true, restrict results to base books only (default: false)
 * @return [SearchFacets] with counts, or null if query is invalid (e.g. blank after normalization)
 */
fun computeFacets(
    query: String,
    near: Int = 5,
    bookFilter: Long? = null,
    categoryFilter: Long? = null,
    bookIds: Collection<Long>? = null,
    lineIds: Collection<Long>? = null,
    baseBookOnly: Boolean = false
): SearchFacets?
}
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,17 @@ data class LineHit(
val score: Float,
val rawText: String
)

/**
 * Aggregated facet counts from a search query.
 * Computed once via a lightweight Lucene collector without loading full results.
 *
 * Both maps are immutable snapshots; counts are per matching document.
 *
 * @property totalHits Total number of matching documents
 * @property categoryCounts Map of categoryId to count (keys include ancestor
 *   categories of each hit's category, as pre-indexed in the ancestor field)
 * @property bookCounts Map of bookId to count of matching documents in that book
 */
data class SearchFacets(
    val totalHits: Long,
    val categoryCounts: Map<Long, Int>,
    val bookCounts: Map<Long, Int>,
)
Loading