From 305a8a07b7fff7c9d666c9fdc1557c2fba6b4f18 Mon Sep 17 00:00:00 2001 From: Hossam Hassan Date: Thu, 25 Sep 2025 16:11:42 +0300 Subject: [PATCH] feat: add phonemes to uthmani mapping output to the quran_phonetizer --- src/quran_transcript/phonetics/phonetizer.py | 82 ++++++++++++++++++-- 1 file changed, 77 insertions(+), 5 deletions(-) diff --git a/src/quran_transcript/phonetics/phonetizer.py b/src/quran_transcript/phonetics/phonetizer.py index 6cf03f8..b3107e7 100644 --- a/src/quran_transcript/phonetics/phonetizer.py +++ b/src/quran_transcript/phonetics/phonetizer.py @@ -1,5 +1,6 @@ import re -from dataclasses import dataclass +from dataclasses import dataclass, field +from difflib import SequenceMatcher from .operations import OPERATION_ORDER from .moshaf_attributes import MoshafAttributes @@ -7,10 +8,65 @@ from .. import alphabet as alph +def _distribute_mapping(segment: list[int | None], new_len: int) -> list[int | None]: + """Spread original indices over a new segment length.""" + if new_len <= 0: + return [] + if not segment: + return [None] * new_len + if len(segment) == 1: + return [segment[0]] * new_len + + distributed: list[int | None] = [] + seg_len = len(segment) + for k in range(new_len): + idx = int(k * seg_len / new_len) + if idx >= seg_len: + idx = seg_len - 1 + distributed.append(segment[idx]) + return distributed + + +def _align_mapping( + before_text: str, after_text: str, before_mapping: list[int | None] +) -> list[int | None]: + """Align mapping list after a text transformation using sequence matching.""" + + matcher = SequenceMatcher(a=before_text, b=after_text) + new_mapping: list[int | None] = [] + + for tag, i1, i2, j1, j2 in matcher.get_opcodes(): + if tag == "equal": + new_mapping.extend(before_mapping[i1:i2]) + elif tag == "replace": + segment = before_mapping[i1:i2] + new_mapping.extend(_distribute_mapping(segment, j2 - j1)) + elif tag == "insert": + new_mapping.extend([None] * (j2 - j1)) + elif tag == "delete": + continue + + return new_mapping + + +def _substitute_with_mapping( + text: str, mapping: list[int | None], pattern: str, replacement: str +) -> tuple[str, list[int | None]]: + """Apply regex substitution while updating the origin mapping via alignment.""" + + new_text = re.sub(pattern, replacement, text) + if new_text == text: + return text, mapping + + new_mapping = _align_mapping(text, new_text, mapping) + return new_text, new_mapping + + @dataclass class QuranPhoneticScriptOutput: phonemes: str sifat: list[SifaOutput] + char_map: list[int | None] = field(default_factory=list) def quran_phonetizer( @@ -18,13 +74,22 @@ def quran_phonetizer( ) -> QuranPhoneticScriptOutput: """الرسم الصوتي للقآن الكريم على طبقتين: طبقة الأحرف وطبقة الصفات""" text = uhtmani_text + mapping: list[int | None] = list(range(len(text))) # cleaning extra scpace - text = re.sub(r"\s+", rf"{alph.uthmani.space}", text) - text = re.sub(r"(\s$|^\s)", r"", text) + text, mapping = _substitute_with_mapping( + text, mapping, r"\s+", rf"{alph.uthmani.space}" + ) + text, mapping = _substitute_with_mapping(text, mapping, r"(\s$|^\s)", r"") for op in OPERATION_ORDER: + prev_text = text text = op.apply(text, moshaf) + if text != prev_text: + # print(f"Applied: {op.arabic_name}") + # print(f" Before: {prev_text}") + # print(f" After : {text}") + mapping = _align_mapping(prev_text, text, mapping) sifat = process_sifat( uthmani_script=uhtmani_text, @@ -33,6 +98,13 @@ def quran_phonetizer( ) if remove_spaces: - text = re.sub(alph.uthmani.space, r"", text) + filtered_chars: list[str] = [] + filtered_map: list[int | None] = [] + for ch, idx in zip(text, mapping): + if ch != alph.uthmani.space: + filtered_chars.append(ch) + filtered_map.append(idx) + text = "".join(filtered_chars) + mapping = filtered_map - return QuranPhoneticScriptOutput(phonemes=text, sifat=sifat) + return QuranPhoneticScriptOutput(phonemes=text, sifat=sifat, char_map=mapping)