diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c212c2..eedeab8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,17 @@ You should also add project tags for each release in Github, see [Managing relea # [Unreleased] +# [2.0.0] - 11/25/2025 +### Added +- A Phonecodes enum class to the phonecodes module, to enforce valid conversion and language pairs more explicitly. +- Support for post-processing after conversion to/from IPA is performed, to allow for reduction to a shared symbol set. This is useful, for example, to apply the standard TIMIT symbol reduction or to map Buckeye and TIMIT to a shared symbol set. + +### Changed +- All codeA2codeB conversion functions in phonecodes now rely on the convert function, which should increase maintainability and reusability of the code. + +### Fixed +- Added missing ARPABET IPA vowels (diphthongs and r-colored vowels) to the set of IPA vowels in phonecode_tables, so that stress markers would be added correctly. Fixes https://github.com/ginic/phonecodes/issues/15. + # [1.2.3] - 10/23/2025 ### Changed - Added python 3.14 to package and pytest GitHub actions diff --git a/README.md b/README.md index ddc376b..b134c85 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Developers may refer to the CONTRIBUTIONS.md for information on the development # Basic Usage ## Converting between Phonetic Alphabets If you want to convert to or from IPA to some other phonetic code, use `phonecodes.phonecodes` as follows: -``` +```python >>> from phonecodes import phonecodes >>> print(phonecodes.CODES) # available phonetic alphabets {'arpabet', 'buckeye', 'ipa', 'timit', 'callhome', 'xsampa', 'disc'} @@ -35,6 +35,27 @@ For 'callhome' and 'disc' you should also specify a language code from the follo - DISC/CELEX: Dutch `'nld'`, English `'eng'`, German `'deu'`. Uses German if unspecified. - Callhome: Spanish `'spa'`, Egyptian Arabic `'arz'`, Mandarin Chinese `'cmn'`. You MUST specify an appropriate language code or you'll get a KeyError. 
+## Additional post-processing +An additional use case when converting between phonecodes is to normalize the final mapping to a subset of IPA symbols. This is useful if you are collapsing similar sounds together to a reduced symbol inventory or if you are standardizing two corpora with different IPA inventories/conventions to a shared subset. + +We support this use case through the `post_conversion_mapping` keyword argument, an optional dictionary remapping provided with all phonecodes conversion functions. You can provide a custom mapping. Be aware that the remapping algorithm is greedy, proceeds in the order that keys appear in the dictionary, and diacritics need to appear with a base symbol in the mapping. + +Additionally, we provide IPA-to-IPA post-processing dictionary mappings in `phonecodes.phonecode_tables`: +- `phonecodes.phonecode_tables.STANDARD_TIMIT_IPA_REDUCTION`: The 'standard' TIMIT label reduction used in Lee and Hon (1989) that reduces the original 64 TIMIT phonetic labels to 39 categories. This reduction is widely used in the speech recognition community. +- `phonecodes.phonecode_tables.BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED` and `phonecodes.phonecode_tables.TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED`: A conservative reduction from the Buckeye and TIMIT IPA inventories, respectively, to a shared symbol set. 
This maps nasalized vowels and flaps to their non-nasalized versions, r-colored vowels ('ɚ', 'ɝ') to syllabic r ('ɹ̩'), and normalizes variants of 'ʌ' and schwa to schwa ('ə'). + +```python +>>> from phonecodes import phonecodes +# Conversion from Buckeye to IPA using the original published Buckeye mapping +>>> phonecodes.convert("B AHN NX AAN NX AH", "buckeye", "ipa") +'b ʌ̃ ɾ̃ ɑ̃ ɾ̃ ʌ' +# Conversion from Buckeye to IPA with postprocessing to an IPA inventory shared with TIMIT +>>> phonecodes.convert("B AHN NX AAN NX AH", "buckeye", "ipa", post_conversion_mapping = phonecodes.phonecode_tables.BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED) +'b ə n ɑ n ə' +# Custom mapping example - note that the nasalized diacritics are not affected by the remapping +>>> phonecodes.convert("B AHN NX AAN NX AH", "buckeye", "ipa", post_conversion_mapping = {'ʌ':'ə'}) +'b ə̃ ɾ̃ ɑ̃ ɾ̃ ə' +``` ## Reading Corpus Files If you are working with specific corpora, you can also convert between certain corpus formats as follows: diff --git a/pyproject.toml b/pyproject.toml index 77b4855..d490b32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,17 +4,15 @@ build-backend = "setuptools.build_meta" [project] name = "phonecodes" -version = "1.2.3" +version = "2.0.0" description = "Tools for loading dictionaries with various phonecodes (IPA, Callhome, X-SAMPA, ARPABET, DISC=CELEX, Buckeye), for converting among those phonecodes, and for searching those dictionaries for word sequences matching a target." 
readme = "README.md" license = {file = "LICENSE.txt"} -requires-python = ">=3.7" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/src/phonecodes/phonecode_tables.py b/src/phonecodes/phonecode_tables.py index b074f4e..2633c29 100644 --- a/src/phonecodes/phonecode_tables.py +++ b/src/phonecodes/phonecode_tables.py @@ -1,5 +1,8 @@ """ -Tables mapping other phonecodes to/from IPA +Tables mapping other phonecodes to/from IPA. +Note that working with unicode symbols can be tricky. +Refer to the Unicode standards at https://www.unicode.org/charts/ and +check symbols against a Unicode character inspector like https://apps.timwhitlock.info/unicode/inspect. """ import re @@ -541,6 +544,10 @@ # TIMIT is written in a variant of ARPABET that includes a couple # of non-standard allophones, and most significantly, includes # separate symbols for the closure and release portions of each stop and affricate. +# This is the official mapping published with the TIMIT corpus, but you +# likely want to post-process to one of the standard shared IPA inventories +# defined below. +# # Because the TIMIT corpus has separate symbols for closure and release, # but IPA only has one corresponding symbol, we need to map all # possibilities for inputs with and without spaces. 
@@ -566,7 +573,7 @@ "DCL JH": "dʒ", "DX": "ɾ", "ENG": "ŋ̩", - "ER": "ɹ̩", + "ER": "ɝ", "EPI": "", "GCL": "ɡ", "GCLG": "ɡ", @@ -608,7 +615,7 @@ "AXN": "ə̃", "IYN": "ĩ", "EYN": "ẽɪ̃", - "OWN": "õʊ̃", + "OWN": "õʊ̃", "DX": "ɾ", "AYN": "ãɪ̃", "AAN": "ɑ̃", @@ -633,10 +640,9 @@ _ipa2buckeye = {v: k for k, v in _buckeye2ipa.items()} - ####################################################################### # IPA -_ipa_vowels = set("aeiouyɑɒɛɪɔʘʊʌʏəɘæʉɨøɜɞɐɤɵœɶ") | set(("ɪ̈", "ʊ̈")) +_ipa_vowels = set("aeiouyɑɒɛɪɔʘʊʌʏəɘæʉɨøɜɞɐɤɵœɶɝɚ") | set(("ɪ̈", "ʊ̈", "oʊ", "aʊ", "eɪ", "ɔɪ", "aɪ")) _ipa_consonants = set("bɓcdɖɗfɡɠhɦjʝklɭɺmnɳpɸqrɽɹɻsʂɕtʈvʋwxɧzʐʑβʙçðɱɣɢʛɥʜɲɟʄɬɮʎʟɯɰŋɴʋɒʁʀʃθʍχħʒɾɫʔʕʢʡꜛꜜǃ|ǀ‖ǁǂ") _ipa_diacritics = set(re.sub(r"◌", "", "◌̈◌̟◌̠◌̌◌̥◌̩◌◌◌̂◌̯◌̚◌◌̃◌̘◌̺◌̏◌◌̜◌̪◌̴◌̂◌◌́◌◌◌◌̰◌̀◌◌̄◌̻◌̼◌◌̹◌̞◌̙◌̌◌◌̝◌̋◌̤◌̬◌◌̆◌̽ːʰˀʷʱʼʲˤ")) _ipa_stressmarkers = set("ˈˌ") @@ -648,3 +654,82 @@ _ipa_tones |= set(x + y for x in _ipa_tones for y in _ipa_tones) _ipa_symbols = _ipa_vowels | _ipa_consonants | _ipa_diacritics + +####################################################################### +# Shared IPA inventories +# Many projects will actually use a subset of the full IPA inventory and it's useful to have an +# explicitly defined mapping to transform and validate. +# These are some standard mappings from an original IPA inventory to a subset of IPA symbols. +# These mappings are expected to be one-to-many reductions, +# Since each replacements are done by iterating over the mapping, cascading +# replacements are supported, but they are not recommended. +# Use the functions in the phonecodes module to check for cascading replacements. + +# N.B. This assumes mapping takes place in dictionary insertion order (this is guaranteed since python 3.7). +# + +# but can support multi-character symbols. + +# This is the standard TIMIT label reduction described by Lee and Hon (1989) +# described in https://drive.google.com/file/d/1QI4_omp8E9EvO71jZQBGdH2GV6Pn7FPh/view?usp=sharing. 
+# Closure symbols are also removed using the standard reduction, but +# this is already handled by _timit2ipa +STANDARD_TIMIT_IPA_REDUCTION = { + "ɔ": "ɑ", + "ɚ": "ɝ", + "ʒ": "ʃ", + "ɦ": "h", + "ɨ": "ɪ", + "ʉ": "u", + # Syllabic markers are dropped + "l̩": "l", + "m̩": "m", + "n̩": "n", + "ŋ̩": "ŋ", + "ʔ": "", + # "ə" and "ʌ" are collapsed + "ə": "ʌ", + "ə̥": "ʌ", +} + +# In many cases it is important that the mapped subsets match, +# especially working with models that are trained on one corpus and evaluated on another. +# These dictionaries map TIMIT and Buckeye IPA inventories to the same IPA subset in post-processing. +BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED = { + # Reduced nasalized vowels and diphthongs to non-nasal versions + "ãʊ̃": "aʊ", + "ẽɪ̃": "eɪ", + "õʊ̃": "oʊ", + "ãɪ̃": "aɪ", + "ɔ̃ɪ̃": "ɔɪ", + "æ̃": "æ", + "ɔ̃": "ɔ", + "ə̃": "ə", + "ĩ": "i", + "ɑ̃": "ɑ", + "ũ": "u", + "ɛ̃": "ɛ", + "ʊ̃": "ʊ", + "ɪ̃": "ɪ", + "ɹ̩̃": "ɹ̩", + # β doesn't appear in TIMIT annotations, so must be reduced + "β": "f", + # Nasalized flap is too inconsistently annotated, so reduce to 'n' + "ɾ̃": "n", + # Use schwa in final vocabulary + "ʌ̃": "ə", + "ʌ": "ə", +} +TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED = { + # These symbols are not present in Buckeye IPA, so must be reduced + "ɦ": "h", + "ɨ": "ɪ", + "ʉ": "u", + # Nasalized flap is too inconsistently annotated, so reduce to 'n' + "ɾ̃": "n", + # Vocalic r all map to ɹ̩ + "ɝ": "ɹ̩", + "ɚ": "ɹ̩", + # Use schwa in final vocabulary + "ʌ": "ə", +} diff --git a/src/phonecodes/phonecodes.py b/src/phonecodes/phonecodes.py index 2e86b07..54d1ead 100644 --- a/src/phonecodes/phonecodes.py +++ b/src/phonecodes/phonecodes.py @@ -16,13 +16,151 @@ # list known IPA symbols of vowels, consonants. 
# for other tables, see phonecode_tables.py """ + +from __future__ import annotations +from collections import abc +from dataclasses import dataclass +from enum import Enum +import re +import warnings + import phonecodes.phonecode_tables as phonecode_tables -CODES = set(("ipa", "arpabet", "xsampa", "disc", "callhome", "buckeye", "timit")) +# Phonecodes constants for easier maintenance +IPA_KEY = "ipa" +ARPABET_KEY = "arpabet" +XSAMPA_KEY = "xsampa" +DISC_KEY = "disc" +CALLHOME_KEY = "callhome" +BUCKEYE_KEY = "buckeye" +TIMIT_KEY = "timit" + +CODES = set((IPA_KEY, ARPABET_KEY, XSAMPA_KEY, DISC_KEY, CALLHOME_KEY, BUCKEYE_KEY, TIMIT_KEY)) LANGUAGES = set(("eng", "deu", "nld", "arz", "cmn", "spa", "yue", "lao", "vie")) -vowels = phonecode_tables._ipa_vowels -consonants = phonecode_tables._ipa_consonants + +class Phonecodes(Enum): + """Defines the set of valid phonecode mapping options supported + and which languages are covered. When language is not specified, + the mapping does not change depending on language. 
+ """ + + # XSAMPA + IPA2XSAMPA = IPA_KEY, XSAMPA_KEY + XSAMPA2IPA = XSAMPA_KEY, IPA_KEY + + # DISC + DISC2IPA = DISC_KEY, IPA_KEY + DISC2IPA_NLD = DISC_KEY, IPA_KEY, "nld" + DISC2IPA_ENG = DISC_KEY, IPA_KEY, "eng" + IPA2DISC = IPA_KEY, DISC_KEY + + # CALLHOME + CALLHOME2IPA_ARZ = CALLHOME_KEY, IPA_KEY, "arz" + CALLHOME2IPA_CMN = CALLHOME_KEY, IPA_KEY, "cmn" + CALLHOME2IPA_SPA = CALLHOME_KEY, IPA_KEY, "spa" + IPA2CALLHOME_ARZ = IPA_KEY, CALLHOME_KEY, "arz" + IPA2CALLHOME_CMN = IPA_KEY, CALLHOME_KEY, "cmn" + IPA2CALLHOME_SPA = IPA_KEY, CALLHOME_KEY, "spa" + + # ARPABET + ARPABET2IPA = ARPABET_KEY, IPA_KEY + IPA2ARPABET = IPA_KEY, ARPABET_KEY + + # TIMIT - There is no way to convert from IPA to TIMIT due to closure symbols + TIMIT2IPA = TIMIT_KEY, IPA_KEY + + # Buckeye + BUCKEYE2IPA = BUCKEYE_KEY, IPA_KEY + IPA2BUCKEYE = IPA_KEY, BUCKEYE_KEY + + def __init__(self, in_code, out_code, language=None): + self.in_code = in_code + self.out_code = out_code + self.language = language + + @classmethod + def as_member(cls, in_code, out_code, language=None): + valid_codes = set(item.value for item in cls) + phonecode_tuple = (in_code, out_code, language) + if phonecode_tuple in valid_codes: + return Phonecodes((in_code, out_code, language)) + + phonecode_tuple = (in_code, out_code) + if (in_code, out_code) in valid_codes: + return Phonecodes((in_code, out_code)) + raise ValueError( + f"Phonecode pairing {phonecode_tuple} is not valid. Must convert to/from 'ipa' in supported languages or leave language unspecified." + ) + + +# Which symbol mapping will be used in conversion? 
+_phonecode_lookup = { + # XSAMPA + Phonecodes.IPA2XSAMPA: phonecode_tables._ipa2xsampa, + Phonecodes.XSAMPA2IPA: phonecode_tables._xsampa_and_diac2ipa, + # DISC + Phonecodes.DISC2IPA: phonecode_tables._disc2ipa, + Phonecodes.DISC2IPA_NLD: phonecode_tables._disc2ipa_dutch, + Phonecodes.DISC2IPA_ENG: phonecode_tables._disc2ipa_english, + Phonecodes.IPA2DISC: phonecode_tables._ipa2disc, + # CALLHOME + Phonecodes.CALLHOME2IPA_ARZ: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_ARZ.language], + Phonecodes.CALLHOME2IPA_CMN: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_CMN.language], + Phonecodes.CALLHOME2IPA_SPA: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_SPA.language], + Phonecodes.IPA2CALLHOME_ARZ: phonecode_tables._ipa2callhome[Phonecodes.IPA2CALLHOME_ARZ.language], + Phonecodes.IPA2CALLHOME_CMN: phonecode_tables._ipa2callhome[Phonecodes.IPA2CALLHOME_CMN.language], + Phonecodes.IPA2CALLHOME_SPA: phonecode_tables._ipa2callhome[Phonecodes.IPA2CALLHOME_SPA.language], + # ARPABET + Phonecodes.ARPABET2IPA: phonecode_tables._arpabet2ipa, + Phonecodes.IPA2ARPABET: phonecode_tables._ipa2arpabet, + # Buckeye + Phonecodes.BUCKEYE2IPA: phonecode_tables._buckeye2ipa, + Phonecodes.IPA2BUCKEYE: phonecode_tables._ipa2buckeye, + # TIMIT + Phonecodes.TIMIT2IPA: phonecode_tables._timit2ipa, +} + + +@dataclass +class AttachStressTonesConfig: + """Stores the settings for tone or stress attachment algorithms, + which can be different depending on the language and corpus format.""" + + tones: abc.Iterable[str] | str + vowels: abc.Iterable[str] | str + searchstep: int + catdir: int + + +# Is there a configuration for adding tones or stress markers to the final output? 
+_tone_stress_settings = { + Phonecodes.CALLHOME2IPA_ARZ: AttachStressTonesConfig( + phonecode_tables._ipa_stressmarkers, phonecode_tables._ipa_vowels, -1, -1 + ), + Phonecodes.CALLHOME2IPA_CMN: AttachStressTonesConfig( + phonecode_tables._ipa_tones, phonecode_tables._ipa_vowels, -1, 1 + ), + Phonecodes.CALLHOME2IPA_SPA: AttachStressTonesConfig( + phonecode_tables._ipa_stressmarkers, + phonecode_tables._ipa_vowels, + -1, + -1, + ), + Phonecodes.IPA2CALLHOME_ARZ: AttachStressTonesConfig( + "012", phonecode_tables._callhome_vowels[Phonecodes.IPA2CALLHOME_ARZ.language], 1, 1 + ), + Phonecodes.IPA2CALLHOME_CMN: AttachStressTonesConfig( + "012345", phonecode_tables._callhome_vowels[Phonecodes.IPA2CALLHOME_CMN.language], -1, 1 + ), + Phonecodes.IPA2CALLHOME_SPA: AttachStressTonesConfig( + "012", phonecode_tables._callhome_vowels[Phonecodes.IPA2CALLHOME_SPA.language], 1, 1 + ), + Phonecodes.ARPABET2IPA: AttachStressTonesConfig( + phonecode_tables._ipa_stressmarkers, phonecode_tables._ipa_vowels, -1, -1 + ), + Phonecodes.IPA2ARPABET: AttachStressTonesConfig("012", phonecode_tables._arpabet_vowels, 1, 1), +} ##################################################################### @@ -61,7 +199,7 @@ def translate_string(s, d): return (tl[::-1], translated[::-1]) -def attach_tones_to_vowels(il, tones, vowels, searchstep, catdir): +def attach_tones_to_vowels(il: list[str], tones, vowels, searchstep, catdir) -> list[str]: """Return a copy of il, with each tone attached to nearest vowel if any. searchstep=1 means search for next vowel, searchstep=-1 means prev vowel. catdir>=0 means concatenate after vowel, catdir<0 means cat before vowel. 
@@ -83,16 +221,14 @@ def attach_tones_to_vowels(il, tones, vowels, searchstep, catdir): ##################################################################### # X-SAMPA -def ipa2xsampa(x, language=None): +def ipa2xsampa(x, language=None, post_conversion_mapping=None): """Attempt to return X-SAMPA equivalent of an IPA phone x.""" - (tl, ttf) = translate_string(x, phonecode_tables._ipa2xsampa) - return "".join(tl) + return convert(x, IPA_KEY, XSAMPA_KEY, language, post_conversion_mapping) -def xsampa2ipa(x, language=None): +def xsampa2ipa(x, language=None, post_conversion_mapping=None): """Return the IPA equivalent of X-SAMPA phone x.""" - (tl, ttf) = translate_string(x, phonecode_tables._xsampa_and_diac2ipa) - return "".join(tl) + return convert(x, XSAMPA_KEY, IPA_KEY, language, post_conversion_mapping) ###################################################################### @@ -103,117 +239,59 @@ def tone2ipa(n, language): ##################################################################### # DISC, the system used by CELEX -def disc2ipa(x, language): +def disc2ipa(x, language=None, post_conversion_mapping=None): """Convert DISC symbol x into IPA, for language L""" - if language == "nld": - (tl, ttf) = translate_string(x, phonecode_tables._disc2ipa_dutch) - return "".join(tl) - elif language == "eng": - (tl, ttf) = translate_string(x, phonecode_tables._disc2ipa_english) - return "".join(tl) - else: - (tl, ttf) = translate_string(x, phonecode_tables._disc2ipa) - return "".join(tl) - - -def ipa2disc(x, language): - """Convert IPA symbol x into DISC""" - (tl, ttf) = translate_string(x, phonecode_tables._ipa2disc) - return "".join(tl) + return convert(x, DISC_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2disc_old(x, language): - """Convert IPA symbol x into DISC for given language""" - # Convert whole thing if possible; otherwise try prefix+vowel; else quit - if x in phonecode_tables._ipa2disc: - return phonecode_tables._ipa2disc[x] - elif x[0] in 
phonecode_tables._ipa2disc and x[1:] in phonecode_tables._ipa2disc: - return phonecode_tables._ipa2disc[x[0]] + phonecode_tables._ipa2disc[x[1:]] - else: - raise KeyError("Unknown IPA symbol %s for language %s" % (x, language)) +def ipa2disc(x, language=None, post_conversion_mapping=None): + """Convert IPA symbol x into DISC""" + return convert(x, IPA_KEY, DISC_KEY, language, post_conversion_mapping) ####################################################################### # Callhome phone codes -def callhome2ipa(x, language): +def callhome2ipa(x, language, post_conversion_mapping=None): """Convert callhome phone symbol x into IPA for given language""" - (il, ttf) = translate_string(x, phonecode_tables._callhome2ipa[language]) - if language == "arz": - ol = attach_tones_to_vowels( - il, - phonecode_tables._ipa_stressmarkers, - phonecode_tables._ipa_vowels, - -1, - -1, - ) - elif language == "cmn": - ol = attach_tones_to_vowels(il, phonecode_tables._ipa_tones, phonecode_tables._ipa_vowels, -1, 1) - elif language == "spa": - ol = attach_tones_to_vowels( - il, - phonecode_tables._ipa_stressmarkers, - phonecode_tables._ipa_vowels, - -1, - -1, - ) - # TODO What to do if language doesn't match - return "".join(ol) + return convert(x, CALLHOME_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2callhome(x, language): +def ipa2callhome(x, language=None, post_conversion_mapping=None): """Convert IPA symbol x into callhome notation for given language""" - (il, ttf) = translate_string(x, phonecode_tables._ipa2callhome[language]) - if language == "arz": - ol = attach_tones_to_vowels(il, "012", phonecode_tables._callhome_vowels["arz"], 1, 1) - elif language == "cmn": - ol = attach_tones_to_vowels(il, "012345", phonecode_tables._callhome_vowels["cmn"], -1, 1) - elif language == "spa": - ol = attach_tones_to_vowels(il, "012", phonecode_tables._callhome_vowels["spa"], 1, 1) - # TODO What to do if language doesn't match? 
- return "".join(ol) + return convert(x, IPA_KEY, CALLHOME_KEY, language, post_conversion_mapping) ######################################################################### # ARPABET, TIMIT, Buckeye -def arpabet2ipa(x, language=None): +def arpabet2ipa(x, language=None, post_conversion_mapping=None): """Convert ARPABET symbol X to IPA""" - (il, ttf) = translate_string(x, phonecode_tables._arpabet2ipa) - ol = attach_tones_to_vowels(il, phonecode_tables._ipa_stressmarkers, phonecode_tables._ipa_vowels, -1, -1) - return "".join(ol) + return convert(x, ARPABET_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2arpabet(x, language=None): +def ipa2arpabet(x, language=None, post_conversion_mapping=None): """Convert IPA symbols to ARPABET""" - (il, ttf) = translate_string(x, phonecode_tables._ipa2arpabet) - ol = attach_tones_to_vowels(il, "012", phonecode_tables._arpabet_vowels, 1, 1) - return "".join(ol) + return convert(x, IPA_KEY, ARPABET_KEY, language, post_conversion_mapping) -def timit2ipa(x, language=None): +def timit2ipa(x, language=None, post_conversion_mapping=None): """Convert TIMIT phone codes to IPA""" - x = x.upper() - (il, ttf) = translate_string(x, phonecode_tables._timit2ipa) - return "".join(il) + return convert(x, TIMIT_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2timit(x, language=None): +def ipa2timit(x, language=None, post_conversion_mapping=None): raise ValueError( - "Converting to 'timit' is unsupported, because TIMIT closure symbols for stops" - " cannot be determined from text." + "Converting to 'timit' is unsupported, because TIMIT closure symbols for stops cannot be determined from text." 
) -def buckeye2ipa(x, language=None): +def buckeye2ipa(x, language=None, post_conversion_mapping=None): """Convert Buckeye phone codes to IPA""" - x = x.upper() - (il, ttf) = translate_string(x, phonecode_tables._buckeye2ipa) - return "".join(il) + return convert(x, BUCKEYE_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2buckeye(x, language=None): +def ipa2buckeye(x, language=None, post_conversion_mapping=None): "Convert IPA symbols to Buckeye phone codes" - (il, ttf) = translate_string(x, phonecode_tables._ipa2buckeye) - return "".join(il) + return convert(x, IPA_KEY, BUCKEYE_KEY, language, post_conversion_mapping) ####################################################################### @@ -221,22 +299,14 @@ def ipa2buckeye(x, language=None): # are used to convert symbols and lists of symbols, respectively, # to or from IPA, by calling appropriate other functions. # -_convertfuncs = { - "arpabet": (arpabet2ipa, ipa2arpabet), - "xsampa": (xsampa2ipa, ipa2xsampa), - "disc": (disc2ipa, ipa2disc), - "callhome": (callhome2ipa, ipa2callhome), - "buckeye": (buckeye2ipa, ipa2buckeye), - "timit": (timit2ipa, ipa2timit), -} - - def _verify_code(code): if code not in CODES: raise ValueError(f"{code} is not a valid phonecode. 
Choose from: {' '.join(CODES)}") -def convert(s0, c0, c1, language=None): +def convert( + s0: str, c0: str, c1: str, language: str | None = None, post_conversion_mapping: dict[str, str] | None = None +) -> str: """Convert a string between a given phonecode and IPA Args: @@ -244,24 +314,111 @@ def convert(s0, c0, c1, language=None): c0 (str): Input phonecode: 'arpabet', 'xsampa','disc', 'callhome' or 'ipa' c1 (str): Output phonecode: 'arpabet', 'xsampa','disc', 'callhome' or 'ipa' language (str | None): The language of the string, optional since it is only required for 'disc' and 'callhome' phonecodes + post_conversion_mapping dict[str, str]: Optional additional normalization mapping that occurs after conversion and stress assignments (greedy, in the same order as the dictionary keys) Raises: ValueError: If the phonecode is not a valid option Returns: - _type_: _description_ + str: String converted and post processed according to the specified phonecode mappings """ _verify_code(c0) _verify_code(c1) - if c0 == "ipa" and c1 != "ipa": - x = _convertfuncs[c1][1](s0, language) - return x - elif c0 != "ipa" and c1 == "ipa": - return _convertfuncs[c0][0](s0, language) - else: - raise ValueError(f"Must convert to/from 'ipa', not '{c0}' to '{c1}'") + # Get the right enumerator for looking up mappings + phonecode_enum = Phonecodes.as_member(c0, c1, language) + + # Most basic mapping + input_string = s0 + translation_mapping = _phonecode_lookup[phonecode_enum] + if phonecode_enum in [Phonecodes.ARPABET2IPA, Phonecodes.BUCKEYE2IPA, Phonecodes.TIMIT2IPA]: + input_string = input_string.upper() + (mapped_string, ttf) = translate_string(input_string, translation_mapping) + + # Add tones/stress if it's configured for this enum + if phonecode_enum in _tone_stress_settings: + stress_config = _tone_stress_settings[phonecode_enum] + mapped_string = attach_tones_to_vowels( + mapped_string, stress_config.tones, stress_config.vowels, stress_config.searchstep, stress_config.catdir + ) + + 
final_string = "".join(mapped_string) + + # Optional post processing normalization + if post_conversion_mapping is not None: + final_string = _post_process_reduction(final_string, translation_mapping, post_conversion_mapping) + + return final_string.strip() -def convertlist(l0, c0, c1, language): - return [convert(s0, c0, c1, language) for s0 in l0] +def convertlist(l0, c0, c1, language, post_ipa_mapping: dict[str, str] | None = None): + return [convert(s0, c0, c1, language, post_ipa_mapping) for s0 in l0] + + +def _post_process_reduction( + input_string: str, original_translation_mapping: dict[str, str], reduction_mapping: dict[str, str] +) -> str: + """Additional normalization step, replaces symbols in the input_str according to the reduction_mapping (original symbol -> desired symbol). + + Before the replacement occurs, checks for conflicting behaviors, such as mapping symbols which don't appear in the + original_translation_mapping symbol inventory or potential cascading replacements. + These checks show a warning, but will not raise an Exception. + + Args: + input_string: str, string to do symbol replacements/substitutions on + original_translation_mapping: dict[str, str], original mapping to/from IPA, used only for validation checks + reduction_mapping: dict[str, str], additional symbol reduction, usually for IPA inventories + """ + cascading_keys = _find_cascading_keys_in_symbol_mapping(reduction_mapping) + if len(cascading_keys) > 0: + warnings.warn( + f"Post-processing does not perfrom cascading replacements, but overlapping key/value pairs are detected. Check that this is intended. These keys are affected: {cascading_keys}." 
+ ) + + new_keys = _get_extra_reduction_keys(original_translation_mapping, reduction_mapping) + if len(new_keys) > 0: + warnings.warn(f"There are keys in post-processing which do not appear in the original phonetable: {new_keys}.") + + # Replacements happen greedily in the order of the post processing map, + # because there may be intentional orderings of substitutions. + pattern = "|".join(re.escape(k) for k in reduction_mapping.keys()) + + return re.sub(pattern, lambda match: reduction_mapping[match.group()], input_string) + + +def _find_cascading_keys_in_symbol_mapping(symbol_inventory_map: dict[str, str]) -> list[tuple[str, str]]: + """Returns any keys that might have values that would cascade to later keys during replacement. + Used as a warning if there seem to be cascading replacements involving the same symbol. + This doesn't impact the behavior of the substitution, but serves as a check + against unexpected cascading replacements. + + Args: + symbol_inventory_map: An ordered dictionary mapping substrings to their desired replacement values. + """ + result = [] + ordered_keys = list(symbol_inventory_map.keys()) + for i, k1 in enumerate(ordered_keys[:-1]): + current_value = symbol_inventory_map[k1] + + # Skip empty strings + if current_value == "": + continue + + for k2 in ordered_keys[i + 1 :]: + if current_value in k2: + result.append((k1, k2)) + + return result + + +def _get_extra_reduction_keys(original_mapping: dict[str, str], reduction_mapping: dict[str, str]) -> set[str]: + """Returns the set of keys in reduction_map that are not used in the corpus' official symbol inventory. + + Args: + original_mapping: The original corpus symbols mapped to/from IPA symbols. + reduction_mapping: An symbol to symbol mapping for standardizing the output. 
+ """ + ipa_original = set(original_mapping.values()) + ipa_reduction_keys = set(reduction_mapping.keys()) + overlap = ipa_reduction_keys - ipa_original + return overlap diff --git a/test/test_phonecodes.py b/test/test_phonecodes.py index 9346951..02aadf5 100644 --- a/test/test_phonecodes.py +++ b/test/test_phonecodes.py @@ -1,8 +1,11 @@ -"""Load some pronlexes from the 'fixtures' subdirectory, -test phone code conversion, and test both word and phone searches. -""" +"""Unit tests for phonecodes.phonecodes functionality""" import phonecodes.phonecodes as phonecodes +from phonecodes.phonecode_tables import ( + BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED, + TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED, + STANDARD_TIMIT_IPA_REDUCTION, +) import pytest @@ -34,6 +37,7 @@ ] +# Test basic functionality of the phonecodes.x2y conversion functions @pytest.mark.parametrize("in_code, out_code, fn_call, language", phonecode_cases) def test_conversion_functions(in_code, out_code, fn_call, language, sentences): result = fn_call(sentences[language][in_code], language) @@ -41,6 +45,7 @@ def test_conversion_functions(in_code, out_code, fn_call, language, sentences): assert result == expected +# Test basic functionality of the convert function with input and output phonecodes @pytest.mark.parametrize("in_code, out_code, fn_call, language", phonecode_cases) def test_convert(in_code, out_code, fn_call, language, sentences): s_in = sentences[language][in_code] @@ -49,6 +54,7 @@ def test_convert(in_code, out_code, fn_call, language, sentences): assert converted == expected +# Invalid phonecode pairs raise a value error @pytest.mark.parametrize( "input_code, output_code", [ @@ -72,10 +78,7 @@ def test_additional_buckeye_examples(ipa_str, buckeye_str): @pytest.mark.parametrize( "ipa_str, timit_str", [ - ( - " tʃ ɑ k l ɨ t ", - "h# ch aa kcl k l ix tcl t h#", - ), # 'chocolate' with start/stop tokens and no initial closure + ("tʃ ɑ k l ɨ t", "h# ch aa kcl k l ix tcl t h#"), # 'chocolate' with start/stop 
tokens and no initial closure ("tʃ ɑ k l ɨ t", "tcl ch aa k l ix tcl t"), # 'chocolate' with mixed closure inclusion ("tʃ ɑ k l ɨ t", "tcl ch aa k l ix t"), # 'chocolate' with mixed closure inclusion ("tʃɑklɨt", "tclchaaklixtclt"), # 'chocolate' with mixed closure inclusion, no spaces @@ -90,7 +93,7 @@ def test_additional_buckeye_examples(ipa_str, buckeye_str): ("bɪɡtɪps", "bihgclgtcltihps"), # 'big tips' lower case no spaces, flip closures # 'This has been attributed to helium film flow in the vapor pressure thermometer.' ( - "ðɪs hɛz bɛn ɪtʃɪbʉɾɪd tʉ ɦɪliɨm fɪlm floʊ ən ðɨ veɪpə pɹɛʃɹ̩ θəmɑmɨɾɚ", + "ðɪs hɛz bɛn ɪtʃɪbʉɾɪd tʉ ɦɪliɨm fɪlm floʊ ən ðɨ veɪpə pɹɛʃɝ θəmɑmɨɾɚ", "DHIHS HHEHZ BCLBEHN IHTCLCHIHBCLBUXDXIHDCL TUX HVIHLIYIXM FIHLM FLOW AXN DHIX VEYPCLPAX PCLPREHSHER THAXMAAMIXDXAXR", ), # 'About dawn he got up to blow.' @@ -108,3 +111,137 @@ def test_additional_buckeye_examples(ipa_str, buckeye_str): ) def test_additional_timit_examples(ipa_str, timit_str): assert phonecodes.timit2ipa(timit_str) == ipa_str + + +# Tests the check for potential cascading keys in postprocessing +@pytest.mark.parametrize( + "mapping, expected_value", + [ + # Valid - key set and value set are disjoint + ({"a": "x", "b": "y"}, []), + ({"a": "b", "bc": "a"}, [("a", "bc")]), + # Diacritical markers alone won't cause cascasdes + ({"◌̩": "", "ɹ̩": "ɝ"}, []), + # These will cause cascades + ({"ax": "b", "bc": "d"}, [("ax", "bc")]), + ({"r": "ɹ", "ɹ̩": "ɝ"}, [("r", "ɹ̩")]), + # These standard reductions should not have any cascading changes + (phonecodes.phonecode_tables.STANDARD_TIMIT_IPA_REDUCTION, []), + (phonecodes.phonecode_tables.BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED, []), + (phonecodes.phonecode_tables.TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED, []), + ], +) +def test_find_cascading_keys_in_inventory_map(mapping, expected_value): + assert phonecodes._find_cascading_keys_in_symbol_mapping(mapping) == expected_value + + +@pytest.mark.parametrize( + "example, incode, outcode, expected", + [ + 
("AE1 D V ER0 T", "arpabet", "ipa", "ˈæ d v ɚ t"), + ("AE1 D V ER1 T", "arpabet", "ipa", "ˈæ d v ˈɝ t"), + ("AE0 ER1 T", "arpabet", "ipa", "æ ˈɝ t"), + ("AE0 D V ER1 T AH0 Z M AH0 N T", "arpabet", "ipa", "æ d v ˈɝ t ə z m ə n t"), + ], +) +def test_arpabet_stress_attachment(example, incode, outcode, expected): + assert phonecodes.convert(example, incode, outcode) == expected + + +# Tests the standard conversion mappings work as expected with the phonecodes.convert function +@pytest.mark.parametrize( + "example, incode, outcode, post_conversion_mapping, expected", + [ + ("h# ch aa kcl k l ix tcl t h#", "timit", "ipa", STANDARD_TIMIT_IPA_REDUCTION, "tʃ ɑ k l ɪ t"), + ("h#chaakclklixzhh#", "timit", "ipa", STANDARD_TIMIT_IPA_REDUCTION, "tʃɑklɪʃ"), + ( + "w iyn w ern k ih n aan nx eh tq", + "buckeye", + "ipa", + BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED, + "w i w ɹ̩ k ɪ n ɑ n ɛ ʔ", + ), + ( + "w ax w axr k ih n aa nx eh q hv zh", + "timit", + "ipa", + TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED, + "w ə w ɹ̩ k ɪ n ɑ n ɛ ʔ h ʒ", + ), + ], +) +def test_convert_with_post_conversion_mapping(example, incode, outcode, post_conversion_mapping, expected): + assert phonecodes.convert(example, incode, outcode, post_conversion_mapping=post_conversion_mapping) == expected + + +# Test cases for _get_extra_reduction_keys +@pytest.mark.parametrize( + "original_mapping, reduction_mapping, expected_extra_keys", + [ + # Case 1: No extra keys - reduction keys are subset of original values + ({"AA": "ɑ", "AE": "æ", "AH": "ə"}, {"ɑ": "a", "æ": "a"}, set()), + # Case 2: Extra keys present - reduction has keys not in original values + ({"AA": "ɑ", "AE": "æ"}, {"ɑ": "a", "ɪ": "i", "ʊ": "u"}, {"ɪ", "ʊ"}), + # Case 3: All extra keys + ({"AA": "ɑ", "AE": "æ"}, {"x": "y", "z": "w"}, {"x", "z"}), + # Case 4: Empty reduction mapping + ({"AA": "ɑ", "AE": "æ"}, {}, set()), + # Case 5: Empty original mapping + ({}, {"ɑ": "a", "æ": "e"}, {"ɑ", "æ"}), + # Standard reductions should not have any extra keys +
(phonecodes.phonecode_tables._timit2ipa, STANDARD_TIMIT_IPA_REDUCTION, set()), + (phonecodes.phonecode_tables._timit2ipa, TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED, set()), + (phonecodes.phonecode_tables._buckeye2ipa, BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED, set()), + ], +) +def test_get_extra_reduction_keys(original_mapping, reduction_mapping, expected_extra_keys): + result = phonecodes._get_extra_reduction_keys(original_mapping, reduction_mapping) + assert result == expected_extra_keys + + +# Test cases for _post_process_reduction without any warnings +@pytest.mark.parametrize( + "input_string, original_mapping, reduction_mapping, expected_output", + [ + # Simple single character substitution + ("ɑ æ ə", {"AA": "ɑ", "AE": "æ", "AH": "ə"}, {"ɑ": "a", "æ": "a", "ə": "a"}, "a a a"), + # Multi-character symbol substitution + ("tʃɑklɪʃ", {"CH": "tʃ"}, {"tʃ": "č"}, "čɑklɪʃ"), + # Partial substitution (some symbols not in reduction map) + ("ɑ b æ c ə", {"AA": "ɑ", "AE": "æ"}, {"ɑ": "a", "æ": "e"}, "a b e c ə"), + # No substitutions needed + ("hello world", {"A": "a", "z": "x"}, {"x": "y"}, "hello world"), + # Empty string + ("", {"A": "a"}, {"a": "b"}, ""), + # Deletion (map to empty string) + ("ɑzæ", {"AA": "ɑ", "x": "z"}, {"z": ""}, "ɑæ"), + # Order matters - greedy left-to-right substitution + ("abc", {"A": "a", "B": "b", "z": "ab"}, {"a": "x", "b": "y", "ab": "z"}, "xyc"), + # Real TIMIT reduction example + ("tʃ ɑ k l ɨ ʃ", phonecodes.phonecode_tables._timit2ipa, STANDARD_TIMIT_IPA_REDUCTION, "tʃ ɑ k l ɪ ʃ"), + ], +) +def test_post_process_reduction_substitution(input_string, original_mapping, reduction_mapping, expected_output): + result = phonecodes._post_process_reduction(input_string, original_mapping, reduction_mapping) + assert result == expected_output + + +# Test warning behavior for _post_process_reduction +def test_post_process_reduction_warns_on_cascading_keys(): + """Test that cascading keys trigger a warning""" + input_string = "abc" + original_mapping = {"A": "a", 
"B": "b", "z": "bc"} + reduction_mapping = {"a": "b", "bc": "d"} # "a"->"b" could cascade to "bc" + + with pytest.warns(UserWarning, match="cascading replacements"): + phonecodes._post_process_reduction(input_string, original_mapping, reduction_mapping) + + +def test_post_process_reduction_warns_on_extra_keys(): + """Test that extra keys trigger a warning""" + input_string = "ɑ æ" + original_mapping = {"AA": "ɑ", "AE": "æ"} + reduction_mapping = {"ɑ": "a", "ɪ": "i"} # "ɪ" not in original mapping values + + with pytest.warns(UserWarning, match="do not appear in the original phonetable"): + phonecodes._post_process_reduction(input_string, original_mapping, reduction_mapping) diff --git a/test/test_pronlex.py b/test/test_pronlex.py index 1f03c27..5a42d4d 100644 --- a/test/test_pronlex.py +++ b/test/test_pronlex.py @@ -1,6 +1,7 @@ -""" Load some pronlexes from the 'fixtures' subdirectory, +"""Load some pronlexes from the 'fixtures' subdirectory, test phone code conversion, and test both word and phone searches. """ + import re import pytest