Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Runs the full Gradle test suite on every pull request targeting master/main.
name: CI - Tests

on:
  pull_request:
    branches:
      - master
      - main

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout source
        uses: actions/checkout@v4

      # The build needs the JetBrains Runtime SDK, so it is downloaded and put on
      # JAVA_HOME / PATH for all subsequent steps.
      - name: Install JBR 25
        run: |
          # -f makes curl fail on HTTP errors instead of saving an error page as the archive.
          curl -fL -o jbr.tar.gz "https://cache-redirector.jetbrains.com/intellij-jbr/jbrsdk-25.0.1-linux-x64-b268.52.tar.gz"
          mkdir -p "$RUNNER_TEMP/jbr"
          tar -xzf jbr.tar.gz -C "$RUNNER_TEMP/jbr"
          # Parentheses are required: without them "-type d" only applies to the first -name test.
          JBR_DIR=$(find "$RUNNER_TEMP/jbr" -mindepth 1 -maxdepth 1 -type d \( -name "jbr*" -o -name "jbrsdk*" \) -print -quit)
          # Fail here rather than exporting an empty JAVA_HOME that breaks later steps obscurely.
          if [ -z "$JBR_DIR" ]; then
            echo "JBR directory not found under $RUNNER_TEMP/jbr" >&2
            exit 1
          fi
          echo "JAVA_HOME=$JBR_DIR" >> "$GITHUB_ENV"
          echo "$JBR_DIR/bin" >> "$GITHUB_PATH"

      # gradle/gradle-build-action@v3 is deprecated and only delegates to this action.
      - name: Setup Gradle
        uses: gradle/actions/setup-gradle@v3

      - name: Grant execute permission for gradlew
        run: chmod +x gradlew

      - name: Run all tests
        run: ./gradlew allTests --no-daemon

      # always() keeps the reports available when the test step fails.
      - name: Upload test reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-reports
          path: '**/build/reports/tests/'
          retention-days: 7
4 changes: 3 additions & 1 deletion gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ kotlinx-serialization = "1.9.0"
kotlinx-datetime = "0.7.1"
kermit = "2.0.8"
sqlDelight = "2.1.0"
compose = "1.9.3"
compose = "1.10.0"
androidx-activityCompose = "1.12.1"
commons-compress = "1.28.0"
zstd-jni = "1.5.7-6"
jsoup = "1.22.1"

[libraries]

Expand All @@ -39,6 +40,7 @@ sqlDelight-driver-js = { module = "app.cash.sqldelight:web-worker-driver", versi
commons-compress = { module = "org.apache.commons:commons-compress", version.ref = "commons-compress" }
zstd = { module = "com.github.luben:zstd-jni", version.ref = "zstd-jni" }
androidx-activityCompose = { module = "androidx.activity:activity-compose", version.ref = "androidx-activityCompose" }
jsoup = { module = "org.jsoup:jsoup", version.ref = "jsoup" }

[plugins]

Expand Down
30 changes: 30 additions & 0 deletions search/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Build script for the `search` module: a Kotlin Multiplatform project that
// declares a single JVM target.
plugins {
alias(libs.plugins.multiplatform)
alias(libs.plugins.kotlinx.serialization)
}

group = "io.github.kdroidfilter.seforimlibrary"

kotlin {
// The JDK toolchain version is read from the `jvmToolchain` entry of the version catalog.
jvmToolchain(libs.versions.jvmToolchain.get().toInt())

// Only the JVM target is declared for this module.
jvm()

sourceSets {
jvmMain.dependencies {
// `api` (not `implementation`) so consumers of this module also see :core on their classpath.
api(project(":core"))
implementation(libs.kotlinx.coroutines.core)
implementation(libs.kotlinx.serialization.json)
implementation(libs.lucene.core)
implementation(libs.lucene.analysis.common)
implementation(libs.sqlDelight.driver.sqlite)
implementation(libs.kermit)
implementation(libs.jsoup)
}

// Test-only dependencies for the JVM target.
jvmTest.dependencies {
implementation(kotlin("test"))
implementation(libs.kotlinx.coroutines.test)
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
package io.github.kdroidfilter.seforimlibrary.search

/**
 * Utility functions for Hebrew text processing.
 * Includes normalization, diacritic removal, and final letter handling.
 *
 * All functions are pure: they never mutate their arguments and hold no state
 * beyond immutable constants.
 */
object HebrewTextUtils {

    /**
     * Map of Hebrew final letters (sofit) to their base forms.
     */
    val SOFIT_MAP = mapOf(
        '\u05DA' to '\u05DB', // ך -> כ
        '\u05DD' to '\u05DE', // ם -> מ
        '\u05DF' to '\u05E0', // ן -> נ
        '\u05E3' to '\u05E4', // ף -> פ
        '\u05E5' to '\u05E6', // ץ -> צ
    )

    /** Hebrew punctuation maqaf (hyphen-like word joiner). */
    private const val MAQAF = '\u05BE'

    /** Hebrew punctuation gershayim. */
    private const val GERSHAYIM = "\u05F4"

    /** Hebrew punctuation geresh. */
    private const val GERESH = "\u05F3"

    /** Compiled once: [normalizeHebrew] may be called very frequently. */
    private val WHITESPACE_RUN = Regex("\\s+")

    /**
     * Normalizes Hebrew text by:
     * - Removing teamim (cantillation marks) U+0591–U+05AF
     * - Removing nikud (vowel points) U+05B0–U+05BD plus U+05C1, U+05C2 and U+05C7
     * - Replacing maqaf U+05BE with space
     * - Removing gershayim/geresh
     * - Normalizing final letters to base forms
     * - Collapsing whitespace
     *
     * The set of removed diacritics is exactly the set accepted by
     * [isNikudOrTeamim], so this function and [stripDiacritics] always agree.
     *
     * @param input The input string to normalize
     * @return The normalized string; empty if [input] is blank
     */
    fun normalizeHebrew(input: String): String {
        if (input.isBlank()) return ""

        // Diacritics first, so that a maqaf turned into a space cannot end up
        // separated from neighbouring whitespace by an invisible mark.
        val withoutMarks = stripDiacritics(input.trim())
            .replace(MAQAF, ' ')
            .replace(GERSHAYIM, "")
            .replace(GERESH, "")

        return replaceFinalsWithBase(withoutMarks)
            .replace(WHITESPACE_RUN, " ")
            .trim()
    }

    /**
     * Replaces Hebrew final letters (sofit) with their base forms.
     *
     * The pairs handled here must stay in sync with [SOFIT_MAP]; plain character
     * replacement is used instead of map lookups to avoid boxing every character.
     *
     * @param text The input text
     * @return Text with final letters replaced
     */
    fun replaceFinalsWithBase(text: String): String = text
        .replace('\u05DA', '\u05DB') // ך -> כ
        .replace('\u05DD', '\u05DE') // ם -> מ
        .replace('\u05DF', '\u05E0') // ן -> נ
        .replace('\u05E3', '\u05E4') // ף -> פ
        .replace('\u05E5', '\u05E6') // ץ -> צ

    /**
     * Checks if a character is a Hebrew diacritic (nikud or teamim).
     *
     * @param c The character to check
     * @return true if the character is a diacritic
     */
    fun isNikudOrTeamim(c: Char): Boolean =
        c in '\u0591'..'\u05AF' || // teamim
            c in '\u05B0'..'\u05BD' || // nikud + meteg
            c == '\u05C1' || c == '\u05C2' || c == '\u05C7'

    /**
     * Strips Hebrew diacritics (nikud and teamim) from text and returns
     * both the plain text and an index map from plain indices to original indices.
     *
     * Element `k` of the returned array is the index in [src] of the character
     * that became character `k` of the plain text.
     *
     * @param src The source string
     * @return Pair of (plain text, index map)
     */
    fun stripDiacriticsWithMap(src: String): Pair<String, IntArray> {
        val plain = StringBuilder(src.length)
        // Sized for the worst case (nothing stripped) and trimmed at the end;
        // a primitive array avoids boxing one Integer per kept character.
        val origIndices = IntArray(src.length)
        var kept = 0
        for (i in src.indices) {
            val ch = src[i]
            if (!isNikudOrTeamim(ch)) {
                plain.append(ch)
                origIndices[kept++] = i
            }
        }
        return plain.toString() to origIndices.copyOf(kept)
    }

    /**
     * Strips Hebrew diacritics from text without preserving index mapping.
     *
     * @param text The input text
     * @return Text without diacritics
     */
    fun stripDiacritics(text: String): String =
        text.filterNot { isNikudOrTeamim(it) }

    /**
     * Maps a plain text index back to the original text index.
     *
     * Out-of-range values of [plainIndex] are clamped to the first/last entry of
     * the map. When the map is empty there is nothing to translate through, so
     * [plainIndex] is returned unchanged.
     *
     * @param mapToOrig The index map from stripDiacriticsWithMap
     * @param plainIndex The index in the plain text
     * @return The corresponding index in the original text
     */
    fun mapToOrigIndex(mapToOrig: IntArray, plainIndex: Int): Int {
        if (mapToOrig.isEmpty()) return plainIndex
        return mapToOrig[plainIndex.coerceIn(0, mapToOrig.lastIndex)]
    }
}
Loading
Loading