From 66ee63b0082af173ff5aeeba817d01142e9b61a5 Mon Sep 17 00:00:00 2001 From: tenextractor <139619642+tenextractor@users.noreply.github.com> Date: Tue, 17 Feb 2026 18:35:31 +0530 Subject: [PATCH 1/2] add Vietnamese Telex and VNI combiners --- .../event/combiners/vietnamese/Common.kt | 98 +++++++ .../event/combiners/vietnamese/Telex.kt | 253 ++++++++++++++++++ .../event/combiners/vietnamese/ToneMark.kt | 148 ++++++++++ .../event/combiners/vietnamese/VNI.kt | 133 +++++++++ .../event/combiners/vietnamese/VNICombiner.kt | 58 ++++ .../combiners/vietnamese/VietTelexCombiner.kt | 43 +++ .../inputmethod/v2keyboard/CombinerKind.kt | 4 + 7 files changed, 737 insertions(+) create mode 100644 java/src/org/futo/inputmethod/event/combiners/vietnamese/Common.kt create mode 100644 java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt create mode 100644 java/src/org/futo/inputmethod/event/combiners/vietnamese/ToneMark.kt create mode 100644 java/src/org/futo/inputmethod/event/combiners/vietnamese/VNI.kt create mode 100644 java/src/org/futo/inputmethod/event/combiners/vietnamese/VNICombiner.kt create mode 100644 java/src/org/futo/inputmethod/event/combiners/vietnamese/VietTelexCombiner.kt diff --git a/java/src/org/futo/inputmethod/event/combiners/vietnamese/Common.kt b/java/src/org/futo/inputmethod/event/combiners/vietnamese/Common.kt new file mode 100644 index 0000000000..eb5e4def13 --- /dev/null +++ b/java/src/org/futo/inputmethod/event/combiners/vietnamese/Common.kt @@ -0,0 +1,98 @@ +package org.futo.inputmethod.event.combiners.vietnamese + +/** Code common to both Telex and VNI */ +object Common { + /** get_tone_mark_placement() function from vi-rs/src/editing.rs + * Get nth character to place tone mark + * + * # Rules: + * 1. If a vowel contains ơ or ê, tone mark goes there + * 2. If a vowel contains `oa`, `oe`, `oo`, `oy`, tone mark should be on the + * second character + * + * If the accent style is [`AccentStyle::Old`], then: + * - 3. For vowel length 3 or vowel length 2 with a final consonant, put it on the second vowel character + * - 4. Else, put it on the first vowel character + * + * Otherwise: + * - 3. If a vowel has 2 characters, put the tone mark on the first one + * - 4. Otherwise, put the tone mark on the second vowel character + */ + fun getToneMarkPosition( + outputWithoutTone: CharSequence, + firstVowelIndex: Int, + vowelCount: Int + ): Int { + val specialVowelPairs = setOf("oa", "oe", "oo", "uy", "uo", "ie") + + // If there's only one vowel, then it's guaranteed that the tone mark will go there + if (vowelCount == 1) return firstVowelIndex + + for (i in firstVowelIndex ..< firstVowelIndex + vowelCount) { + when (outputWithoutTone[i]) { + 'ơ', 'Ơ' -> return i + 'ê', 'Ê' -> return i + 'â', 'Â' -> return i + } + } + + val vowel = outputWithoutTone.slice(firstVowelIndex ..< firstVowelIndex + vowelCount) + + // If there is only one vowel with a diacritic (circumflex, breve, horn, etc.), it should + // get the tone mark + val vowelsWithDiacritics = vowel.withIndex().filter { it.value !in VOWELS } + if (vowelsWithDiacritics.size == 1) { + return firstVowelIndex + vowelsWithDiacritics[0].index + } + + // Special vowels require the tone mark to be placed on the second character + if (specialVowelPairs.any { vowel.contains(it, ignoreCase = true) }) + return firstVowelIndex + 1 + + // If a syllable end with 2 character vowel, put it on the first character + if (firstVowelIndex + vowelCount == outputWithoutTone.length && vowelCount == 2) + return firstVowelIndex + + // Else, put tone mark on second vowel + return firstVowelIndex + 1 + } + + + val CONSONANTS = setOf( + 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z') + + val VOWELS = setOf('a', 'e', 'i', 'o', 'u', 'y', 'A', 'E', 'I', 'O', 'U', 'Y') + + /** A map of characters without accent to character with circumflex accent */ + public val CIRCUMFLEX_MAP = mapOf( + 'a' to 'â', + 'e' to 'ê', + 'o' to 'ô', + // uppercase + 'A' to 'Â', + 'E' to 'Ê', + 'O' to 'Ô', + ) + + /** A map of characters without accent to character with dyet (D WITH STROKE) accent */ + public val STROKE_MAP = mapOf( + 'd' to 'đ', + 'D' to 'Đ', + ) + + /** A map of characters without accent to character with horn accent */ + public val HORN_MAP = mapOf( + 'u' to 'ư', + 'o' to 'ơ', + // uppercase + 'U' to 'Ư', + 'O' to 'Ơ', + ) + + /** A map of characters without accent to character with breve accent */ + public val BREVE_MAP = mapOf( + 'a' to 'ă', + // uppercase + 'A' to 'Ă', + ) +} diff --git a/java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt b/java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt new file mode 100644 index 0000000000..3c7685f986 --- /dev/null +++ b/java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt @@ -0,0 +1,253 @@ +package org.futo.inputmethod.event.combiners.vietnamese + +object Telex { + val TONES = mapOf( + 'f' to ToneMark.GRAVE, + 'j' to ToneMark.DOT, + 'r' to ToneMark.HOOK, + 's' to ToneMark.ACUTE, + 'x' to ToneMark.TILDE + ) + + /** These are the modifiers that should only be active if they come after the first vowel letter. + * For example, `sao` should not output any tone marks, but `aso` should output `áo`. + */ + val AFTER_VOWEL_MODIFIERS = setOf('f', 'j', 'r', 's', 'w', 'x') + + /** Convert a string that represents a Vietnamese syllable written in the Telex convention ([input]) + * to a syllable written in Vietnamese orthography. + * Example: input = "vietej", output = "việt" + */ + public fun telexToVietnamese(input: String): String { + + // STAGE 1: calculate modifierIndices, firstVowelIndex, startedFinal and lowercaseVowel + // Example: + // Input: "vietej" + // Output: + // modifierIndices: { 'e': [2, 4], 'j': [5], the rest are empty lists } + // firstVowelIndex: 1 + // startedFinal: true + // lowercaseVowel: "ie" + val lowercaseInput = input.lowercase() + var startedVowel = false + var startedFinal = false + var firstVowelIndex = -1 + + val lowercaseVowel = StringBuilder() + + /** Map of 'modifier' characters that can add a diacritic or tone mark, + * to lists of indices of occurrences of these characters + */ + val modifierIndices: Map> = mapOf( + 'a' to mutableListOf(), + 'd' to mutableListOf(), + 'e' to mutableListOf(), + 'f' to mutableListOf(), + 'j' to mutableListOf(), + 'o' to mutableListOf(), + 'r' to mutableListOf(), + 's' to mutableListOf(), + 'w' to mutableListOf(), + 'x' to mutableListOf(), + ) + + for ((index, ch) in lowercaseInput.withIndex()) { + + if (!startedVowel) { + if (Common.VOWELS.contains(ch)) { + // TODO: this code needs to be refined further + // if a syllable has a weird initial (like 'cl' in 'clown') that we are sure does not belong to Vietnamese, + // then stop the conversion process and just output the input as it is + // if (!(index in 0..3)) return input + // if (index in 2..3) + // if (!INITIALS.contains(lowercaseInput.slice(0.. "đi", "dddi" > "ddi"), one of the characters will be deleted + // and therefore the firstVowelIndex needs to be corrected to account for this + if (modifierIndices['d']!!.size > 1 && modifierIndices['d']!!.last() < firstVowelIndex) + firstVowelIndex-- + + // apply correction to lowercaseVowel: + // "gi" (unless there is no other vowel letter) and "qu" should be considered consonants + if (lowercaseVowel.length > 1 && (lowercaseInput.slice(0..<2) == "gi" || lowercaseInput.slice(0..<2) == "qu")) + lowercaseVowel.deleteAt(0) + + + // STAGE 2: use modifierIndices to apply diacritics (except tone marks) to the syllable + // Example: + // Input: "vietej" with its modifierIndices and firstVowelIndex as detailed in Stage 1 + // Output: + // outputWithoutTone: "viêt" + // tone: ToneMark.DOT + // vowelCount: 2 + val output = StringBuilder() + var tone: ToneMark? = null + var doNotOutputNextChar = false // this handles the "uwow" edge case + var vowelCount = 0 + var wHasBeenUsed = false + + for ((index, ch) in input.withIndex()) { + if (doNotOutputNextChar) { + doNotOutputNextChar = false + continue + } + + val lowercaseCh = lowercaseInput[index] + + when (lowercaseCh) { + 'a', 'd', 'e', 'o' -> { + // handle letters that can be doubled + + val thisModifierIndices = modifierIndices[lowercaseCh]!! + + // if there is a string such as `ddi` (output: `đi`) or `dddi` (output: ddi), + // the last `d` (or any modifier that can be doubled) needs to be omitted from the output + if (thisModifierIndices.size >= 2 && index == thisModifierIndices.last()) continue + + // if there is a string such as `ddi` (output: `đi`), + // a diacritic needs to be applied to the first `d` + if (thisModifierIndices.size == 2 && index == thisModifierIndices[0]) { + if (lowercaseCh == 'd') { + output.append(Common.STROKE_MAP[ch]) + } else if (lowercaseCh == 'o' && lowercaseVowel.contentEquals("oeo")) { + // handle "oeo" edge case (should output "oeo", not "ôe"): + // remove the second 'o''s index from modifierIndices so that it will be outputted + modifierIndices['o']!!.removeAt(modifierIndices['o']!!.lastIndex) + output.append(ch) + } else { + output.append(Common.CIRCUMFLEX_MAP[ch]) + vowelCount++ + } + + continue // after outputting the character with diacritic, + // suppress outputting the original character + } + + val wIndices = modifierIndices['w']!! + + if (wIndices.size == 1 && lowercaseCh == 'a' && !wHasBeenUsed) { + output.append(Common.BREVE_MAP[ch]) + wHasBeenUsed = true + vowelCount++ + continue + } + + if (wIndices.size == 1 && lowercaseCh == 'o' + && !lowercaseVowel.contentEquals("oa") + // ↑ add edge case for "oaw" (should output "oă", not "ơă" or "ơa") + && !(firstVowelIndex != 0 && lowercaseVowel.contentEquals("ou")) + // ↑ add edge case: any initial consonant + vowel "ou" with modifier 'w' + no final + // should output "oư" and not "ơư" + ) { + output.append(Common.HORN_MAP[ch]) + wHasBeenUsed = true + vowelCount++ + continue + } + } + + // handling tones + 'f', 'j', 'r', 's', 'x' -> { + val thisModifierIndices = modifierIndices[lowercaseCh]!! + + if (thisModifierIndices.size == 1) + tone = TONES[lowercaseCh]!! + + if (thisModifierIndices.size >= 1 && index == thisModifierIndices.last()) continue + } + + 'u' -> { + // edge case for `uwow` > `ươ`: + // the first instance of + if (lowercaseInput.length >= index + 4) { + if (lowercaseInput.slice(index.. "huơ" (uowIsNotUwow=true), but "uow" -> "ươ" (uowIsNotUwow=false) + var uowIsNotUwow = false + if ((firstVowelIndex > 0) && !startedFinal && !doNotOutputNextChar + && modifierIndices['w']!!.size == 1 && lowercaseVowel.contentEquals("uo")) { + uowIsNotUwow = true + } + + if (modifierIndices['w']!!.size == 1 && !wHasBeenUsed && !(lowercaseInput[0] == 'q' && index == 1) && !uowIsNotUwow) { + output.append(Common.HORN_MAP[ch]) + vowelCount++ + wHasBeenUsed = true + continue + } + } + + 'w' -> { + if (modifierIndices['w']!!.size >= 1 && index == modifierIndices['w']!!.last() && + lowercaseVowel.any { it == 'a' || it == 'o' || it == 'u'}) continue + } + } + + output.append(ch) // default behavior: just output the character from input as it is + if (Common.VOWELS.contains(lowercaseCh)) vowelCount++ + } + + // STAGE 3: apply a tone mark (if any) + if (tone == null) return output.toString() + + // edge case: "gija" should output "gịa" + if (lowercaseInput == "gija") { + output[1] = tone.map[output[1]] ?: output[1] + return output.toString() + } + + // apply corrections to vowelCount and firstVowelIndex: + // 'gi' (if there is another vowel after it) and 'qu' should be considered as consonants + // There is no Vietnamese word which consists of the initial 'qu' without another vowel letter, + // but for the sake of better error/edge case handling the correction will only be applied + // if there is another vowel letter. + if (vowelCount > 1 && (lowercaseInput.slice(0..<2) == "gi" || lowercaseInput.slice(0..<2) == "qu")) { + vowelCount-- + firstVowelIndex++ + } + + // if there has been some error applying the correction, just output without the tone mark + if (vowelCount <= 0 || firstVowelIndex < 0 || firstVowelIndex + vowelCount - 1 >= output.length) + return output.toString() + + // add tone mark + val toneMarkPosition = Common.getToneMarkPosition(output, firstVowelIndex, vowelCount) + output[toneMarkPosition] = tone.map[output[toneMarkPosition]] ?: + output[toneMarkPosition] + + return output.toString() + } +} diff --git a/java/src/org/futo/inputmethod/event/combiners/vietnamese/ToneMark.kt b/java/src/org/futo/inputmethod/event/combiners/vietnamese/ToneMark.kt new file mode 100644 index 0000000000..2e572b4ada --- /dev/null +++ b/java/src/org/futo/inputmethod/event/combiners/vietnamese/ToneMark.kt @@ -0,0 +1,148 @@ +package org.futo.inputmethod.event.combiners.vietnamese + +/** Vietnamese tone marks. + * + * Represents the five tone marks used in Vietnamese writing system. + */ +enum class ToneMark(val map: Map) { + /** Dấu sắc (acute accent) - rising tone */ + ACUTE(mapOf( + 'a' to 'á', + 'â' to 'ấ', + 'ă' to 'ắ', + 'e' to 'é', + 'ê' to 'ế', + 'i' to 'í', + 'o' to 'ó', + 'ô' to 'ố', + 'ơ' to 'ớ', + 'u' to 'ú', + 'ư' to 'ứ', + 'y' to 'ý', + // uppercase + 'A' to 'Á', + 'Â' to 'Ấ', + 'Ă' to 'Ắ', + 'E' to 'É', + 'Ê' to 'Ế', + 'I' to 'Í', + 'O' to 'Ó', + 'Ô' to 'Ố', + 'Ơ' to 'Ớ', + 'U' to 'Ú', + 'Ư' to 'Ứ', + 'Y' to 'Ý', + )), + /** Dấu huyền (grave accent) - falling tone */ + GRAVE(mapOf( + 'a' to 'à', + 'â' to 'ầ', + 'ă' to 'ằ', + 'e' to 'è', + 'ê' to 'ề', + 'i' to 'ì', + 'o' to 'ò', + 'ô' to 'ồ', + 'ơ' to 'ờ', + 'u' to 'ù', + 'ư' to 'ừ', + 'y' to 'ỳ', + // uppercase + 'A' to 'À', + 'Â' to 'Ầ', + 'Ă' to 'Ằ', + 'E' to 'È', + 'Ê' to 'Ề', + 'I' to 'Ì', + 'O' to 'Ò', + 'Ô' to 'Ồ', + 'Ơ' to 'Ờ', + 'U' to 'Ù', + 'Ư' to 'Ừ', + 'Y' to 'Ỳ', + )), + /** Dấu hỏi (hook above) - dipping tone */ + HOOK(mapOf( + 'a' to 'ả', + 'â' to 'ẩ', + 'ă' to 'ẳ', + 'e' to 'ẻ', + 'ê' to 'ể', + 'i' to 'ỉ', + 'o' to 'ỏ', + 'ô' to 'ổ', + 'ơ' to 'ở', + 'u' to 'ủ', + 'ư' to 'ử', + 'y' to 'ỷ', + // uppercase + 'A' to 'Ả', + 'Ă' to 'Ẳ', + 'Â' to 'Ẩ', + 'E' to 'Ẻ', + 'Ê' to 'Ể', + 'O' to 'Ỏ', + 'Ô' to 'Ổ', + 'Ơ' to 'Ở', + 'I' to 'Ỉ', + 'U' to 'Ủ', + 'Ư' to 'Ử', + 'Y' to 'Ỷ', + )), + /** Dấu ngã (tilde) - creaky rising tone */ + TILDE(mapOf( + 'a' to 'ã', + 'ă' to 'ẵ', + 'â' to 'ẫ', + 'e' to 'ẽ', + 'ê' to 'ễ', + 'o' to 'õ', + 'ô' to 'ỗ', + 'ơ' to 'ỡ', + 'i' to 'ĩ', + 'u' to 'ũ', + 'ư' to 'ữ', + 'y' to 'ỹ', + // uppercase + 'A' to 'Ã', + 'Ă' to 'Ẵ', + 'Â' to 'Ẫ', + 'E' to 'Ẽ', + 'Ê' to 'Ễ', + 'O' to 'Õ', + 'Ô' to 'Ỗ', + 'Ơ' to 'Ỡ', + 'I' to 'Ĩ', + 'U' to 'Ũ', + 'Ư' to 'Ữ', + 'Y' to 'Ỹ', + )), + /** Dấu nặng (dot below) - creaky falling tone */ + DOT(mapOf( + 'a' to 'ạ', + 'ă' to 'ặ', + 'â' to 'ậ', + 'e' to 'ẹ', + 'ê' to 'ệ', + 'o' to 'ọ', + 'ô' to 'ộ', + 'ơ' to 'ợ', + 'i' to 'ị', + 'u' to 'ụ', + 'ư' to 'ự', + 'y' to 'ỵ', + // uppercase + 'A' to 'Ạ', + 'Ă' to 'Ặ', + 'Â' to 'Ậ', + 'E' to 'Ẹ', + 'Ê' to 'Ệ', + 'O' to 'Ọ', + 'Ô' to 'Ộ', + 'Ơ' to 'Ợ', + 'I' to 'Ị', + 'U' to 'Ụ', + 'Ư' to 'Ự', + 'Y' to 'Ỵ', + )), +} diff --git a/java/src/org/futo/inputmethod/event/combiners/vietnamese/VNI.kt b/java/src/org/futo/inputmethod/event/combiners/vietnamese/VNI.kt new file mode 100644 index 0000000000..e5ff613180 --- /dev/null +++ b/java/src/org/futo/inputmethod/event/combiners/vietnamese/VNI.kt @@ -0,0 +1,133 @@ +package org.futo.inputmethod.event.combiners.vietnamese + +object VNI { + val TONES = mapOf( + '1' to ToneMark.ACUTE, + '2' to ToneMark.GRAVE, + '3' to ToneMark.HOOK, + '4' to ToneMark.TILDE, + '5' to ToneMark.DOT + ) + + fun VNIToVietnamese(input: String): String { + val lowercaseInput = input.lowercase() + + val modifierExists = MutableList(10) { false } + + val lowercaseInitial = StringBuilder() + val lowercaseVowel = StringBuilder() + + var hasLetters = false + + var startedVowel = false + var startedFinal = false + + var tone: ToneMark? = null + + // STAGE 1: build modifierIndices and lowercaseVowel + for ((index, ch) in lowercaseInput.withIndex()) { + //if (ch.isAsciiDigit()) modifierIndices[ch.digitToInt()].add(index) + if (ch.isLetter()) hasLetters = true + + // update firstModifierIndex + if (ch.isDigit() && !modifierExists[ch.digitToInt()]) + modifierExists[ch.digitToInt()] = true + + if (!startedVowel && Common.CONSONANTS.contains(ch)) lowercaseInitial.append(ch) + + if (!startedFinal && Common.VOWELS.contains(ch)) { + if (!startedVowel) startedVowel = true + lowercaseVowel.append(ch) + } + + if (startedVowel && Common.CONSONANTS.contains(ch)) + startedFinal = true + + when (ch) { + '1', '2', '3', '4', '5' -> tone = TONES[ch]!! + } + } + + // apply correction to lowercaseInitial and lowercaseVowel + var giQuCorrectionApplied = false + if (lowercaseVowel.length > 1 && (lowercaseInitial.contentEquals("q") && lowercaseVowel[0] == 'u' || + lowercaseInitial.contentEquals("g") && lowercaseVowel[0] == 'i' + )) { + giQuCorrectionApplied = true + lowercaseInitial.append(lowercaseVowel[0]) + lowercaseVowel.deleteAt(0) + } + + if (!hasLetters) return input + + // STAGE 2: remove numbers and add diacritics + val output = StringBuilder() + + /** Tracks if an 'u' has been converted to 'ư'. + * This variable is checked to ensure that only the first 'u' is converted to 'ư' when there are multiple 'u's. + * For example, "uou7" should output "ươu", not "ươư"; "uu7" should output "ưu", not "ưư".*/ + var uHornOutputted = false + + for ((index, ch) in lowercaseInput.withIndex()) { + when (ch) { + // handle numbers + '1', '2', '3', '4', '5', '6', '7', '8', '9', '0' -> continue + + // handle modifiable characters + 'a' -> { + if (modifierExists[8]) { + output.append(Common.BREVE_MAP[input[index]]) + continue + } + + if (modifierExists[6]) { + output.append(Common.CIRCUMFLEX_MAP[input[index]]) + continue + } + } + 'd' -> if (modifierExists[9]) { + output.append(Common.STROKE_MAP[input[index]]) + continue + } + 'e', 'o' -> { + if (modifierExists[6]) { + output.append(Common.CIRCUMFLEX_MAP[input[index]]) + continue + } + + if (ch == 'o' && modifierExists[7] && + !(output.length != 0 && lowercaseVowel.contentEquals("ou") && !startedFinal)) { + output.append(Common.HORN_MAP[input[index]]) + continue + } + } + + 'u' -> if (modifierExists[7] && + !uHornOutputted && + !(output.getOrNull(0)?.lowercaseChar() == 'q' && output.length == 1) && + !(output.length != 0 && lowercaseVowel.contentEquals("uo") && !startedFinal)) { + output.append(Common.HORN_MAP[input[index]]) + uHornOutputted = true + continue + } + } + + //default behavior: output the char in input + output.append(input[index]) + } + + // STAGE 3: add tone mark + if (tone == null) return output.toString() + + //edge case for gi5a > gịa + if (lowercaseInput == "gi5a") { + output[1] = tone.map[output[1]] ?: output[1] + return output.toString() + } + + val toneMarkPosition = Common.getToneMarkPosition(output, lowercaseInitial.length, lowercaseVowel.length) + output[toneMarkPosition] = tone.map[output[toneMarkPosition]] ?: output[toneMarkPosition] + + return output.toString() + } +} diff --git a/java/src/org/futo/inputmethod/event/combiners/vietnamese/VNICombiner.kt b/java/src/org/futo/inputmethod/event/combiners/vietnamese/VNICombiner.kt new file mode 100644 index 0000000000..973e68d202 --- /dev/null +++ b/java/src/org/futo/inputmethod/event/combiners/vietnamese/VNICombiner.kt @@ -0,0 +1,58 @@ +package org.futo.inputmethod.event.combiners.vietnamese + +import android.text.TextUtils +import org.futo.inputmethod.event.Combiner +import org.futo.inputmethod.event.Event +import org.futo.inputmethod.latin.common.Constants +import java.util.ArrayList + +class VNICombiner: Combiner { + private val buffer = StringBuilder() // holds a single Vietnamese word/syllable + + override fun processEvent( + previousEvents: ArrayList?, + event: Event? + ): Event { + if (event == null) return Event.createNotHandledEvent() + if (event.eventType != Event.EVENT_TYPE_INPUT_KEYPRESS) return event + + val keypress = event.mCodePoint.toChar() + + // The normal ASCII digits are left untouched by the combiner and always result in digits + // being committed to the output. On the other hand, fullwidth digits are intercepted by + // this combiner, converted into ASCII digits, and sent to the VNI converter. + // This lets the user explicitly enter numbers that will not get converted into diacritics. + // For example, if ASCII '1' (U+0031 DIGIT ONE) is given to this combiner, it will always + // output an ASCII '1' (U+0031). + // But if a fullwidth '1' (U+FF11 FULLWIDTH DIGIT ONE) is given to this combiner, it will be + // converted to an ASCII '1' (U+0031) and given to the VNI converter, where it might result + // in an acute accent being placed over a letter. + // So, the input sequence [V][i][e][t][U+FF15][U+FF16] will result in the output "Việt" + if (keypress.code in 0xFF10..0xFF19) { + buffer.append((keypress.code - 0xFEE0).toChar()) + return Event.createConsumedEvent(event) + } + + if (!(keypress in 'A'..'Z' || keypress in 'a'..'z')) { + if (!TextUtils.isEmpty(buffer)) { + if (event.mKeyCode == Constants.CODE_DELETE) { + buffer.setLength(buffer.length - 1) + return Event.createConsumedEvent(event) + } + } + + if(!event.isFunctionalKeyEvent) return Event.createResetEvent(event) + return event + } + + buffer.append(keypress) + return Event.createConsumedEvent(event) + } + + override fun getCombiningStateFeedback(): CharSequence? = + VNI.VNIToVietnamese(buffer.toString()) + + override fun reset() { + buffer.clear() + } +} \ No newline at end of file diff --git a/java/src/org/futo/inputmethod/event/combiners/vietnamese/VietTelexCombiner.kt b/java/src/org/futo/inputmethod/event/combiners/vietnamese/VietTelexCombiner.kt new file mode 100644 index 0000000000..59841c534b --- /dev/null +++ b/java/src/org/futo/inputmethod/event/combiners/vietnamese/VietTelexCombiner.kt @@ -0,0 +1,43 @@ +package org.futo.inputmethod.event.combiners.vietnamese + +import android.text.TextUtils +import org.futo.inputmethod.event.Combiner +import org.futo.inputmethod.event.Event +import org.futo.inputmethod.latin.common.Constants +import java.util.ArrayList + +class VietTelexCombiner: Combiner { + private val buffer = StringBuilder() // holds a single Vietnamese word/syllable + + override fun processEvent( + previousEvents: ArrayList?, + event: Event? + ): Event { + if (event == null) return Event.createNotHandledEvent() + if (event.eventType != Event.EVENT_TYPE_INPUT_KEYPRESS) return event + + val keypress = event.mCodePoint.toChar() + + if (!(keypress in 'A'..'Z' || keypress in 'a'..'z')) { + if (!TextUtils.isEmpty(buffer)) { + if (event.mKeyCode == Constants.CODE_DELETE) { + buffer.setLength(buffer.length - 1) + return Event.createConsumedEvent(event) + } + } + + if(!event.isFunctionalKeyEvent) return Event.createResetEvent(event) + return event + } + + buffer.append(keypress) + return Event.createConsumedEvent(event) + } + + override fun getCombiningStateFeedback(): CharSequence? = + Telex.telexToVietnamese(buffer.toString()) + + override fun reset() { + buffer.clear() + } +} \ No newline at end of file diff --git a/java/src/org/futo/inputmethod/v2keyboard/CombinerKind.kt b/java/src/org/futo/inputmethod/v2keyboard/CombinerKind.kt index bf3bb765c4..5bff16fcc9 100644 --- a/java/src/org/futo/inputmethod/v2keyboard/CombinerKind.kt +++ b/java/src/org/futo/inputmethod/v2keyboard/CombinerKind.kt @@ -5,6 +5,8 @@ import org.futo.inputmethod.event.DeadKeyCombiner import org.futo.inputmethod.event.combiners.NFCNormalizingCombiner import org.futo.inputmethod.event.combiners.DeadKeyPreCombiner import org.futo.inputmethod.event.combiners.KoreanCombiner +import org.futo.inputmethod.event.combiners.vietnamese.VNICombiner +import org.futo.inputmethod.event.combiners.vietnamese.VietTelexCombiner import org.futo.inputmethod.event.combiners.wylie.WylieCombiner enum class CombinerKind(val factory: () -> Combiner) { @@ -13,5 +15,7 @@ enum class CombinerKind(val factory: () -> Combiner) { NFCNormalize({ NFCNormalizingCombiner() }), Korean({ KoreanCombiner() }), KoreanCombineInitials({ KoreanCombiner(combineInitials = true) }), + VietTelex( { VietTelexCombiner() }), + VNI( { VNICombiner() }), Wylie({ WylieCombiner() }), } \ No newline at end of file From c4e3f0da82d5e3f677352be0b38e1d5391a6075f Mon Sep 17 00:00:00 2001 From: tenextractor <139619642+tenextractor@users.noreply.github.com> Date: Tue, 17 Feb 2026 20:14:47 +0530 Subject: [PATCH 2/2] fix index out of bounds error --- .../event/combiners/vietnamese/Telex.kt | 23 +-- .../event/combiners/vietnamese/VNI.kt | 145 +++++++++--------- .../event/combiners/vietnamese/VNICombiner.kt | 6 +- .../combiners/vietnamese/VietTelexCombiner.kt | 6 +- 4 files changed, 99 insertions(+), 81 deletions(-) diff --git a/java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt b/java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt index 3c7685f986..32c5152e93 100644 --- a/java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt +++ b/java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt @@ -17,7 +17,7 @@ object Telex { /** Convert a string that represents a Vietnamese syllable written in the Telex convention ([input]) * to a syllable written in Vietnamese orthography. * Example: input = "vietej", output = "việt" - */ + */ public fun telexToVietnamese(input: String): String { // STAGE 1: calculate modifierIndices, firstVowelIndex, startedFinal and lowercaseVowel @@ -90,7 +90,7 @@ object Telex { // and therefore the firstVowelIndex needs to be corrected to account for this if (modifierIndices['d']!!.size > 1 && modifierIndices['d']!!.last() < firstVowelIndex) firstVowelIndex-- - + // apply correction to lowercaseVowel: // "gi" (unless there is no other vowel letter) and "qu" should be considered consonants if (lowercaseVowel.length > 1 && (lowercaseInput.slice(0..<2) == "gi" || lowercaseInput.slice(0..<2) == "qu")) @@ -136,7 +136,7 @@ object Telex { } else if (lowercaseCh == 'o' && lowercaseVowel.contentEquals("oeo")) { // handle "oeo" edge case (should output "oeo", not "ôe"): // remove the second 'o''s index from modifierIndices so that it will be outputted - modifierIndices['o']!!.removeAt(modifierIndices['o']!!.lastIndex) + modifierIndices['o']!!.removeLast() output.append(ch) } else { output.append(Common.CIRCUMFLEX_MAP[ch]) @@ -157,9 +157,9 @@ object Telex { } if (wIndices.size == 1 && lowercaseCh == 'o' - && !lowercaseVowel.contentEquals("oa") - // ↑ add edge case for "oaw" (should output "oă", not "ơă" or "ơa") - && !(firstVowelIndex != 0 && lowercaseVowel.contentEquals("ou")) + && !lowercaseVowel.contentEquals("oa") + // ↑ add edge case for "oaw" (should output "oă", not "ơă" or "ơa") + && !(firstVowelIndex != 0 && lowercaseVowel.contentEquals("ou")) // ↑ add edge case: any initial consonant + vowel "ou" with modifier 'w' + no final // should output "oư" and not "ơư" ) { @@ -199,7 +199,7 @@ object Telex { var uowIsNotUwow = false if ((firstVowelIndex > 0) && !startedFinal && !doNotOutputNextChar && modifierIndices['w']!!.size == 1 && lowercaseVowel.contentEquals("uo")) { - uowIsNotUwow = true + uowIsNotUwow = true } if (modifierIndices['w']!!.size == 1 && !wHasBeenUsed && !(lowercaseInput[0] == 'q' && index == 1) && !uowIsNotUwow) { @@ -235,8 +235,8 @@ object Telex { // but for the sake of better error/edge case handling the correction will only be applied // if there is another vowel letter. if (vowelCount > 1 && (lowercaseInput.slice(0..<2) == "gi" || lowercaseInput.slice(0..<2) == "qu")) { - vowelCount-- - firstVowelIndex++ + vowelCount-- + firstVowelIndex++ } // if there has been some error applying the correction, just output without the tone mark @@ -245,8 +245,11 @@ object Telex { // add tone mark val toneMarkPosition = Common.getToneMarkPosition(output, firstVowelIndex, vowelCount) + // avoid index out of bounds error + if (toneMarkPosition !in 0.. tone = TONES[ch]!! - } + when (ch) { + '1', '2', '3', '4', '5' -> tone = TONES[ch]!! + } } // apply correction to lowercaseInitial and lowercaseVowel var giQuCorrectionApplied = false if (lowercaseVowel.length > 1 && (lowercaseInitial.contentEquals("q") && lowercaseVowel[0] == 'u' || - lowercaseInitial.contentEquals("g") && lowercaseVowel[0] == 'i' - )) { + lowercaseInitial.contentEquals("g") && lowercaseVowel[0] == 'i' + )) { giQuCorrectionApplied = true lowercaseInitial.append(lowercaseVowel[0]) lowercaseVowel.deleteAt(0) @@ -60,74 +60,81 @@ object VNI { if (!hasLetters) return input - // STAGE 2: remove numbers and add diacritics - val output = StringBuilder() - - /** Tracks if an 'u' has been converted to 'ư'. - * This variable is checked to ensure that only the first 'u' is converted to 'ư' when there are multiple 'u's. - * For example, "uou7" should output "ươu", not "ươư"; "uu7" should output "ưu", not "ưư".*/ - var uHornOutputted = false - - for ((index, ch) in lowercaseInput.withIndex()) { - when (ch) { - // handle numbers - '1', '2', '3', '4', '5', '6', '7', '8', '9', '0' -> continue - - // handle modifiable characters - 'a' -> { - if (modifierExists[8]) { - output.append(Common.BREVE_MAP[input[index]]) - continue - } - - if (modifierExists[6]) { - output.append(Common.CIRCUMFLEX_MAP[input[index]]) - continue - } + // STAGE 2: remove numbers and add diacritics + val output = StringBuilder() + + /** Tracks if an 'u' has been converted to 'ư'. + * This variable is checked to ensure that only the first 'u' is converted to 'ư' when there are multiple 'u's. + * For example, "uou7" should output "ươu", not "ươư"; "uu7" should output "ưu", not "ưư".*/ + var uHornOutputted = false + + for ((index, ch) in lowercaseInput.withIndex()) { + when (ch) { + // handle numbers + '1', '2', '3', '4', '5', '6', '7', '8', '9', '0' -> continue + + // handle modifiable characters + 'a' -> { + if (modifierExists[8]) { + output.append(Common.BREVE_MAP[input[index]]) + continue } - 'd' -> if (modifierExists[9]) { - output.append(Common.STROKE_MAP[input[index]]) + + if (modifierExists[6]) { + output.append(Common.CIRCUMFLEX_MAP[input[index]]) continue } - 'e', 'o' -> { - if (modifierExists[6]) { - output.append(Common.CIRCUMFLEX_MAP[input[index]]) - continue - } - - if (ch == 'o' && modifierExists[7] && - !(output.length != 0 && lowercaseVowel.contentEquals("ou") && !startedFinal)) { - output.append(Common.HORN_MAP[input[index]]) - continue - } + } + 'd' -> if (modifierExists[9]) { + output.append(Common.STROKE_MAP[input[index]]) + continue + } + 'e', 'o' -> { + if (modifierExists[6]) { + output.append(Common.CIRCUMFLEX_MAP[input[index]]) + continue } - 'u' -> if (modifierExists[7] && - !uHornOutputted && - !(output.getOrNull(0)?.lowercaseChar() == 'q' && output.length == 1) && - !(output.length != 0 && lowercaseVowel.contentEquals("uo") && !startedFinal)) { + if (ch == 'o' && modifierExists[7] && + !(output.length != 0 && lowercaseVowel.contentEquals("ou") && !startedFinal)) { output.append(Common.HORN_MAP[input[index]]) - uHornOutputted = true continue } } - //default behavior: output the char in input - output.append(input[index]) + 'u' -> if (modifierExists[7] && + !uHornOutputted && + !(output.getOrNull(0)?.lowercaseChar() == 'q' && output.length == 1) && + !(output.length != 0 && lowercaseVowel.contentEquals("uo") && !startedFinal)) { + output.append(Common.HORN_MAP[input[index]]) + uHornOutputted = true + continue + } } - // STAGE 3: add tone mark - if (tone == null) return output.toString() + //default behavior: output the char in input + output.append(input[index]) + } - //edge case for gi5a > gịa - if (lowercaseInput == "gi5a") { - output[1] = tone.map[output[1]] ?: output[1] - return output.toString() - } + // STAGE 3: add tone mark + if (tone == null) return output.toString() + + //edge case for gi5a > gịa + if (lowercaseInput == "gi5a") { + output[1] = tone.map[output[1]] ?: output[1] + return output.toString() + } + + // handle errors + if (lowercaseVowel.isEmpty() || lowercaseInitial.length + lowercaseVowel.length - 1 >= output.length) + return output.toString() - val toneMarkPosition = Common.getToneMarkPosition(output, lowercaseInitial.length, lowercaseVowel.length) - output[toneMarkPosition] = tone.map[output[toneMarkPosition]] ?: output[toneMarkPosition] + val toneMarkPosition = Common.getToneMarkPosition(output, lowercaseInitial.length, lowercaseVowel.length) + // avoid index out of bounds error + if (toneMarkPosition !in 0..