Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions java/src/org/futo/inputmethod/event/combiners/vietnamese/Common.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package org.futo.inputmethod.event.combiners.vietnamese

/**
 * Code common to both Telex and VNI: tone mark placement and the lookup tables that map
 * base letters to their forms with a diacritic.
 */
object Common {
    /**
     * Vowel pairs that take the tone mark on the second vowel character (matched
     * case-insensitively). Kept as a constant so the set is built once rather than on
     * every call to [getToneMarkPosition].
     */
    private val SPECIAL_VOWEL_PAIRS = setOf("oa", "oe", "oo", "uy", "uo", "ie")

    /**
     * Returns the index in [outputWithoutTone] of the character that should carry the tone mark.
     *
     * Based on the `get_tone_mark_placement()` function from vi-rs/src/editing.rs.
     * There is no accent-style parameter here; the rules below are the only ones implemented.
     *
     * # Rules (applied in this order):
     * 1. If there is exactly one vowel character, the tone mark goes there.
     * 2. If the vowel contains ơ, ê or â (either case), the tone mark goes on the first of them.
     * 3. If exactly one character of the vowel is not a plain vowel letter from [VOWELS]
     *    (i.e. it already carries a circumflex, breve, horn, etc.), the tone mark goes there.
     * 4. If the vowel contains `oa`, `oe`, `oo`, `uy`, `uo` or `ie` (ignoring case), the tone
     *    mark goes on the second vowel character.
     * 5. If the vowel has 2 characters and ends the syllable (no final consonant), the tone
     *    mark goes on the first vowel character.
     * 6. Otherwise, the tone mark goes on the second vowel character.
     *
     * When [vowelCount] is greater than 1, the range
     * `firstVowelIndex ..< firstVowelIndex + vowelCount` must lie inside [outputWithoutTone];
     * otherwise indexing the sequence fails with an out-of-bounds exception. Callers are
     * expected to validate the range first.
     *
     * @param outputWithoutTone the syllable with diacritics applied but no tone mark yet
     * @param firstVowelIndex index of the first vowel character in [outputWithoutTone]
     * @param vowelCount number of consecutive vowel characters starting at [firstVowelIndex]
     * @return index of the character that should receive the tone mark
     */
    fun getToneMarkPosition(
        outputWithoutTone: CharSequence,
        firstVowelIndex: Int,
        vowelCount: Int
    ): Int {
        // If there's only one vowel, then it's guaranteed that the tone mark will go there
        if (vowelCount == 1) return firstVowelIndex

        val vowelRange = firstVowelIndex ..< firstVowelIndex + vowelCount

        // ơ, ê and â always attract the tone mark; the first one found wins
        for (i in vowelRange) {
            when (outputWithoutTone[i]) {
                'ơ', 'Ơ', 'ê', 'Ê', 'â', 'Â' -> return i
            }
        }

        val vowel = outputWithoutTone.slice(vowelRange)

        // If there is only one vowel with a diacritic (circumflex, breve, horn, etc.), it should
        // get the tone mark
        val vowelsWithDiacritics = vowel.withIndex().filter { it.value !in VOWELS }
        if (vowelsWithDiacritics.size == 1) {
            return firstVowelIndex + vowelsWithDiacritics[0].index
        }

        // Special vowels require the tone mark to be placed on the second character
        if (SPECIAL_VOWEL_PAIRS.any { vowel.contains(it, ignoreCase = true) }) {
            return firstVowelIndex + 1
        }

        // If a syllable ends with a 2 character vowel, put it on the first character
        if (firstVowelIndex + vowelCount == outputWithoutTone.length && vowelCount == 2) {
            return firstVowelIndex
        }

        // Else, put tone mark on second vowel
        return firstVowelIndex + 1
    }

    /** Lowercase Latin consonant letters (no uppercase entries). */
    val CONSONANTS = setOf(
        'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z'
    )

    /** Plain (undecorated) vowel letters, in both lowercase and uppercase. */
    val VOWELS = setOf('a', 'e', 'i', 'o', 'u', 'y', 'A', 'E', 'I', 'O', 'U', 'Y')

    /** A map of characters without accent to character with circumflex accent */
    val CIRCUMFLEX_MAP = mapOf(
        'a' to 'â',
        'e' to 'ê',
        'o' to 'ô',
        // uppercase
        'A' to 'Â',
        'E' to 'Ê',
        'O' to 'Ô',
    )

    /** A map of characters without accent to character with dyet (D WITH STROKE) accent */
    val STROKE_MAP = mapOf(
        'd' to 'đ',
        'D' to 'Đ',
    )

    /** A map of characters without accent to character with horn accent */
    val HORN_MAP = mapOf(
        'u' to 'ư',
        'o' to 'ơ',
        // uppercase
        'U' to 'Ư',
        'O' to 'Ơ',
    )

    /** A map of characters without accent to character with breve accent */
    val BREVE_MAP = mapOf(
        'a' to 'ă',
        // uppercase
        'A' to 'Ă',
    )
}
256 changes: 256 additions & 0 deletions java/src/org/futo/inputmethod/event/combiners/vietnamese/Telex.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
package org.futo.inputmethod.event.combiners.vietnamese

/**
 * Telex input convention for Vietnamese: the tone key table, the set of keys that only act
 * as modifiers after a vowel, and the syllable converter [telexToVietnamese].
 */
object Telex {
    /** Telex tone keys mapped to the tone mark each one applies. */
    val TONES = mapOf(
        'f' to ToneMark.GRAVE,
        'j' to ToneMark.DOT,
        'r' to ToneMark.HOOK,
        's' to ToneMark.ACUTE,
        'x' to ToneMark.TILDE
    )

    /** These are the modifiers that should only be active if they come after the first vowel letter.
     * For example, `sao` should not output any tone marks, but `aso` should output `áo`.
     */
    val AFTER_VOWEL_MODIFIERS = setOf('f', 'j', 'r', 's', 'w', 'x')

    /** Convert a string that represents a Vietnamese syllable written in the Telex convention ([input])
     * to a syllable written in Vietnamese orthography.
     * Example: input = "vietej", output = "việt"
     *
     * The conversion runs in three stages: (1) scan the input to locate the vowel and record
     * where each modifier key occurs, (2) rebuild the syllable applying circumflex / breve /
     * horn / stroke diacritics and dropping consumed modifier keys, (3) place the tone mark.
     *
     * @param input a single syllable typed with Telex keys; letter case is preserved in the output
     * @return the converted syllable. If no tone key applies, or the vowel bookkeeping fails its
     * sanity checks, the syllable is returned with diacritics but without a tone mark.
     */
    fun telexToVietnamese(input: String): String {

        // STAGE 1: calculate modifierIndices, firstVowelIndex, startedFinal and lowercaseVowel
        // Example:
        //     Input: "vietej"
        //     Output:
        //         modifierIndices: { 'e': [2, 4], 'j': [5], the rest are empty lists }
        //         firstVowelIndex: 1
        //         startedFinal: true
        //         lowercaseVowel: "ie"
        // NOTE(review): the code below indexes lowercaseInput with positions taken from input,
        // which assumes lowercase() preserves the string length — confirm for non-ASCII input.
        val lowercaseInput = input.lowercase()
        var startedVowel = false
        var startedFinal = false
        var firstVowelIndex = -1

        val lowercaseVowel = StringBuilder()

        /** Map of 'modifier' characters that can add a diacritic or tone mark,
         * to lists of indices of occurrences of these characters
         */
        val modifierIndices: Map<Char, MutableList<Int>> = mapOf(
            'a' to mutableListOf(),
            'd' to mutableListOf(),
            'e' to mutableListOf(),
            'f' to mutableListOf(),
            'j' to mutableListOf(),
            'o' to mutableListOf(),
            'r' to mutableListOf(),
            's' to mutableListOf(),
            'w' to mutableListOf(),
            'x' to mutableListOf(),
        )

        for ((index, ch) in lowercaseInput.withIndex()) {

            if (!startedVowel) {
                if (Common.VOWELS.contains(ch)) {
                    // TODO: this code needs to be refined further
                    // if a syllable has a weird initial (like 'cl' in 'clown') that we are sure does not belong to Vietnamese,
                    // then stop the conversion process and just output the input as it is
                    // if (!(index in 0..3)) return input
                    // if (index in 2..3)
                    //     if (!INITIALS.contains(lowercaseInput.slice(0..<index)))
                    //         return input

                    firstVowelIndex = index
                    startedVowel = true
                }
            }

            // Collect the vowel letters until the first final consonant; after-vowel modifier
            // keys are neither part of the vowel nor the start of the final
            if (startedVowel && !startedFinal && !AFTER_VOWEL_MODIFIERS.contains(ch)) {
                if (Common.CONSONANTS.contains(ch)) {
                    startedFinal = true
                } else {
                    lowercaseVowel.append(ch)
                }
            }

            // After-vowel modifiers only count once the vowel has started; the doubling
            // modifiers ('a', 'd', 'e', 'o') are recorded wherever they occur
            if (AFTER_VOWEL_MODIFIERS.contains(ch)) {
                if (startedVowel) modifierIndices[ch]!!.add(index)
            } else if (modifierIndices.containsKey(ch)) {
                modifierIndices[ch]!!.add(index)
            }
        }


        // STAGE 1.5: apply a correction to firstVowelIndex
        // If the input contains more than one 'd' before the vowel starts
        // (example: "ddi" > "đi", "dddi" > "ddi"), one of the characters will be deleted
        // and therefore the firstVowelIndex needs to be corrected to account for this
        if (modifierIndices['d']!!.size > 1 && modifierIndices['d']!!.last() < firstVowelIndex)
            firstVowelIndex--

        // apply correction to lowercaseVowel:
        // "gi" (unless there is no other vowel letter) and "qu" should be considered consonants
        // (the length check runs first, so the input is known to have at least 2 characters here)
        if (lowercaseVowel.length > 1 && (lowercaseInput.slice(0..<2) == "gi" || lowercaseInput.slice(0..<2) == "qu"))
            lowercaseVowel.deleteAt(0)


        // STAGE 2: use modifierIndices to apply diacritics (except tone marks) to the syllable
        // Example:
        //     Input: "vietej" with its modifierIndices and firstVowelIndex as detailed in Stage 1
        //     Output:
        //         outputWithoutTone: "viêt"
        //         tone: ToneMark.DOT
        //         vowelCount: 2
        val output = StringBuilder()
        var tone: ToneMark? = null
        var doNotOutputNextChar = false // this handles the "uwow" edge case
        var vowelCount = 0
        var wHasBeenUsed = false

        for ((index, ch) in input.withIndex()) {
            if (doNotOutputNextChar) {
                doNotOutputNextChar = false
                continue
            }

            val lowercaseCh = lowercaseInput[index]

            when (lowercaseCh) {
                'a', 'd', 'e', 'o' -> {
                    // handle letters that can be doubled

                    val thisModifierIndices = modifierIndices[lowercaseCh]!!

                    // if there is a string such as `ddi` (output: `đi`) or `dddi` (output: ddi),
                    // the last `d` (or any modifier that can be doubled) needs to be omitted from the output
                    if (thisModifierIndices.size >= 2 && index == thisModifierIndices.last()) continue

                    // if there is a string such as `ddi` (output: `đi`),
                    // a diacritic needs to be applied to the first `d`
                    if (thisModifierIndices.size == 2 && index == thisModifierIndices[0]) {
                        if (lowercaseCh == 'd') {
                            output.append(Common.STROKE_MAP[ch])
                        } else if (lowercaseCh == 'o' && lowercaseVowel.contentEquals("oeo")) {
                            // handle "oeo" edge case (should output "oeo", not "ôe"):
                            // remove the second 'o''s index from modifierIndices so that it will be outputted.
                            // removeAt(lastIndex) is used instead of removeLast(): depending on the
                            // JDK / Android SDK compiled against, removeLast() can bind to
                            // java.util.List.removeLast(), which is missing on older Android runtimes.
                            thisModifierIndices.removeAt(thisModifierIndices.lastIndex)
                            output.append(ch)
                        } else {
                            output.append(Common.CIRCUMFLEX_MAP[ch])
                            vowelCount++
                        }

                        continue // after outputting the character with diacritic,
                                 // suppress outputting the original character
                    }

                    val wIndices = modifierIndices['w']!!

                    // a single 'w' turns the first 'a' into 'ă'
                    if (wIndices.size == 1 && lowercaseCh == 'a' && !wHasBeenUsed) {
                        output.append(Common.BREVE_MAP[ch])
                        wHasBeenUsed = true
                        vowelCount++
                        continue
                    }

                    if (wIndices.size == 1 && lowercaseCh == 'o'
                        && !lowercaseVowel.contentEquals("oa")
                        // ↑ add edge case for "oaw" (should output "oă", not "ơă" or "ơa")
                        && !(firstVowelIndex != 0 && lowercaseVowel.contentEquals("ou"))
                        // ↑ add edge case: any initial consonant + vowel "ou" with modifier 'w' + no final
                        // should output "oư" and not "ơư"
                    ) {
                        output.append(Common.HORN_MAP[ch])
                        wHasBeenUsed = true
                        vowelCount++
                        continue
                    }
                }

                // handling tones
                'f', 'j', 'r', 's', 'x' -> {
                    val thisModifierIndices = modifierIndices[lowercaseCh]!!

                    // a tone key only sets the tone when it occurs exactly once after the vowel
                    if (thisModifierIndices.size == 1)
                        tone = TONES[lowercaseCh]!!

                    // the last recorded occurrence of a tone key is consumed (not outputted)
                    if (thisModifierIndices.size >= 1 && index == thisModifierIndices.last()) continue
                }

                'u' -> {
                    // edge case for `uwow` > `ươ`:
                    // when the four characters starting at this 'u' are "uwow" and 'w' was recorded
                    // exactly twice, the first 'w' is dropped from modifierIndices and the 'w' that
                    // immediately follows this 'u' is skipped, leaving a single 'w' modifier
                    if (lowercaseInput.length >= index + 4) {
                        if (lowercaseInput.slice(index..<index+4) == "uwow" && modifierIndices['w']!!.size == 2) {
                            modifierIndices['w']!!.removeAt(0)
                            doNotOutputNextChar = true
                        }
                    }

                    // Check if "uo" with modifier 'w' should output "uơ" instead of "ươ"
                    // This only applies when:
                    //   * There is an initial consonant, i.e. the syllable does not start with a vowel
                    //   * The vowel is only "uo", nothing else
                    //   * There is no final consonant
                    // For example: "huow" -> "huơ" (uowIsNotUwow=true), but "uow" -> "ươ" (uowIsNotUwow=false)
                    var uowIsNotUwow = false
                    if ((firstVowelIndex > 0) && !startedFinal && !doNotOutputNextChar
                        && modifierIndices['w']!!.size == 1 && lowercaseVowel.contentEquals("uo")) {
                        uowIsNotUwow = true
                    }

                    // the 'u' of an initial "qu" never takes the horn
                    if (modifierIndices['w']!!.size == 1 && !wHasBeenUsed && !(lowercaseInput[0] == 'q' && index == 1) && !uowIsNotUwow) {
                        output.append(Common.HORN_MAP[ch])
                        vowelCount++
                        wHasBeenUsed = true
                        continue
                    }
                }

                'w' -> {
                    // the last 'w' is consumed only if the vowel has a letter it can modify
                    if (modifierIndices['w']!!.size >= 1 && index == modifierIndices['w']!!.last() &&
                        lowercaseVowel.any { it == 'a' || it == 'o' || it == 'u'}) continue
                }
            }

            output.append(ch) // default behavior: just output the character from input as it is
            if (Common.VOWELS.contains(lowercaseCh)) vowelCount++
        }

        // STAGE 3: apply a tone mark (if any)
        if (tone == null) return output.toString()

        // edge case: "gija" should output "gịa"
        if (lowercaseInput == "gija") {
            output[1] = tone.map[output[1]] ?: output[1]
            return output.toString()
        }

        // apply corrections to vowelCount and firstVowelIndex:
        // 'gi' (if there is another vowel after it) and 'qu' should be considered as consonants
        // There is no Vietnamese word which consists of the initial 'qu' without another vowel letter,
        // but for the sake of better error/edge case handling the correction will only be applied
        // if there is another vowel letter.
        if (vowelCount > 1 && (lowercaseInput.slice(0..<2) == "gi" || lowercaseInput.slice(0..<2) == "qu")) {
            vowelCount--
            firstVowelIndex++
        }

        // if there has been some error applying the correction, just output without the tone mark
        if (vowelCount <= 0 || firstVowelIndex < 0 || firstVowelIndex + vowelCount - 1 >= output.length)
            return output.toString()

        // add tone mark
        val toneMarkPosition = Common.getToneMarkPosition(output, firstVowelIndex, vowelCount)
        // avoid index out of bounds error
        if (toneMarkPosition !in 0..<output.length)
            return output.toString()
        // characters missing from the tone's map are left unchanged
        output[toneMarkPosition] = tone.map[output[toneMarkPosition]] ?:
                output[toneMarkPosition]

        return output.toString()
    }
}
Loading