From dde4b47e4d6c8300340627f1510c7ac71d379a89 Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Thu, 20 Nov 2025 16:50:37 -0500 Subject: [PATCH 01/12] Initial IPA post processing mappings added --- src/phonecodes/phonecode_tables.py | 83 ++++++++++++++++++++++++++++-- src/phonecodes/phonecodes.py | 28 ++++++++-- test/test_pronlex.py | 3 +- 3 files changed, 105 insertions(+), 9 deletions(-) diff --git a/src/phonecodes/phonecode_tables.py b/src/phonecodes/phonecode_tables.py index b074f4e..be0457e 100644 --- a/src/phonecodes/phonecode_tables.py +++ b/src/phonecodes/phonecode_tables.py @@ -541,6 +541,10 @@ # TIMIT is written in a variant of ARPABET that includes a couple # of non-standard allophones, and most significantly, includes # separate symbols for the closure and release portions of each stop and affricate. +# This is the official mapping published with the TIMIT corpus, but you +# likely want to post-process to one of the standard shared IPA inventories +# defined below. +# # Because the TIMIT corpus has separate symbols for closure and release, # but IPA only has one corresponding symbol, we need to map all # possibilities for inputs with and without spaces. 
@@ -566,7 +570,7 @@ "DCL JH": "dʒ", "DX": "ɾ", "ENG": "ŋ̩", - "ER": "ɹ̩", + "ER": "ɝ", "EPI": "", "GCL": "ɡ", "GCLG": "ɡ", @@ -608,7 +612,7 @@ "AXN": "ə̃", "IYN": "ĩ", "EYN": "ẽɪ̃", - "OWN": "õʊ̃", + "OWN": "õʊ̃", "DX": "ɾ", "AYN": "ãɪ̃", "AAN": "ɑ̃", @@ -633,7 +637,6 @@ _ipa2buckeye = {v: k for k, v in _buckeye2ipa.items()} - ####################################################################### # IPA _ipa_vowels = set("aeiouyɑɒɛɪɔʘʊʌʏəɘæʉɨøɜɞɐɤɵœɶ") | set(("ɪ̈", "ʊ̈")) @@ -648,3 +651,77 @@ _ipa_tones |= set(x + y for x in _ipa_tones for y in _ipa_tones) _ipa_symbols = _ipa_vowels | _ipa_consonants | _ipa_diacritics + +####################################################################### +# Shared IPA inventories +# Many projects will actually use a subset of the full IPA inventory and it's useful to have an +# explicitly defined mapping to transform and validate. +# These are some standard mappings from an original IPA inventory to a subset of IPA symbols. +# +# These mappings are expected to be one-to-many reductions, +# but can support multi-character symbols. Symbols will be re-mapped in +# order of longest key to shortest. + +# This is the standard TIMIT label reduction described by Lee and Hon (1989) +# described in https://drive.google.com/file/d/1QI4_omp8E9EvO71jZQBGdH2GV6Pn7FPh/view?usp=sharing. +# Closure symbols are also removed using the standard reduction, but +# this is already handled by _timit2ipa +STANDARD_TIMIT_IPA_REDUCTION = { + "ɔ": "ɑ", + "ɚ": "ɝ", + "ʒ": "ʃ", + "ɦ": "h", + "ɨ": "ɪ", + "ʉ": "u", + # Syllabic markers are dropped + "l̩": "l", + "m̩": "m", + "n̩": "n", + "ŋ̩": "ŋ", + "ʔ": "", + # "ə" and "ʌ" are collapsed + "ə": "ʌ", + "ə̥": "ʌ", +} + +# In many cases it is important that the mapped subsets match, +# especially working with models that are trained on one corpus and evaluated on another. +# These dictionaries map TIMIT and Buckeye IPA inventories to the same IPA subset in post-processing. 
+BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED = { + # Reduced nasalized vowels and diphthongs to non-nasal versions + "ãʊ̃": "aʊ", + "ẽɪ̃": "eɪ", + "õʊ̃": "oʊ", + "ãɪ̃": "aɪ", + "ɔ̃ɪ̃": "ɔɪ", + "æ̃": "æ", + "ɔ̃": "ɔ", + "ə̃": "ə", + "ĩ": "i", + "ɑ̃": "ɑ", + "ũ": "u", + "ɛ̃": "ɛ", + "ʊ̃": "ʊ", + "ɪ̃": "ɪ", + "ɹ̩̃": "ɹ̩", + # β doesn't appear in TIMIT annotations, so must be reduced + "β": "f", + # Nasalized flap is too inconsistently annotated, so reduce to 'n' + "ɾ̃": "n", + # Use schwa in final vocabulary + "ʌ": "ə", + "ʌ̃": "ə", +} +TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED = { + # These symbols are not present in Buckeye IPA, so must be reduced + "ɦ": "h", + "ɨ": "ɪ", + "ʉ": "u", + # Nasalized flap is too inconsistently annotated, so reduce to 'n' + "ɾ̃": "n", + # Vocalic r all map to ɹ̩ + "ɝ": "ɹ̩", + "ɚ": "ɹ̩", + # Use schwa in final vocabulary + "ʌ": "ə", +} diff --git a/src/phonecodes/phonecodes.py b/src/phonecodes/phonecodes.py index 2e86b07..4071723 100644 --- a/src/phonecodes/phonecodes.py +++ b/src/phonecodes/phonecodes.py @@ -16,6 +16,7 @@ # list known IPA symbols of vowels, consonants. # for other tables, see phonecode_tables.py """ + import phonecodes.phonecode_tables as phonecode_tables CODES = set(("ipa", "arpabet", "xsampa", "disc", "callhome", "buckeye", "timit")) @@ -198,8 +199,7 @@ def timit2ipa(x, language=None): def ipa2timit(x, language=None): raise ValueError( - "Converting to 'timit' is unsupported, because TIMIT closure symbols for stops" - " cannot be determined from text." + "Converting to 'timit' is unsupported, because TIMIT closure symbols for stops cannot be determined from text." ) @@ -236,7 +236,7 @@ def _verify_code(code): raise ValueError(f"{code} is not a valid phonecode. 
Choose from: {' '.join(CODES)}") -def convert(s0, c0, c1, language=None): +def convert(s0, c0, c1, language=None, post_ipa_mapping: str[str, str] | None = None): """Convert a string between a given phonecode and IPA Args: @@ -263,5 +263,23 @@ def convert(s0, c0, c1, language=None): raise ValueError(f"Must convert to/from 'ipa', not '{c0}' to '{c1}'") -def convertlist(l0, c0, c1, language): - return [convert(s0, c0, c1, language) for s0 in l0] +def convertlist(l0, c0, c1, language, post_ipa_mapping: dict[str, str] | None = None): + return [convert(s0, c0, c1, language, post_ipa_mapping) for s0 in l0] + + +def _post_process_ipa_inventories(post_ipa_mapping: dict[str, str]): + keys_sorted_by_length = sorted(post_ipa_mapping.keys(), key=len) + for k in keys_sorted_by_length: + # TODO + pass + + +def _validate_post_processing_inventory_map(symbol_inventory_map: dict[str, str]): + """The + Checks that input key is a substring + + Args: + symbol_inventory_map: _description_ + """ + # TODO + pass diff --git a/test/test_pronlex.py b/test/test_pronlex.py index 1f03c27..5a42d4d 100644 --- a/test/test_pronlex.py +++ b/test/test_pronlex.py @@ -1,6 +1,7 @@ -""" Load some pronlexes from the 'fixtures' subdirectory, +"""Load some pronlexes from the 'fixtures' subdirectory, test phone code conversion, and test both word and phone searches. 
""" + import re import pytest From 79ac3d33ef5c9173e45c7ee345c86d12c096d405 Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Fri, 21 Nov 2025 17:55:47 -0500 Subject: [PATCH 02/12] Validation function for postproc mapping with unicode support added --- src/phonecodes/phonecode_tables.py | 16 ++++++++++---- src/phonecodes/phonecodes.py | 34 ++++++++++++++++++++---------- test/test_phonecodes.py | 23 +++++++++++++++++++- 3 files changed, 57 insertions(+), 16 deletions(-) diff --git a/src/phonecodes/phonecode_tables.py b/src/phonecodes/phonecode_tables.py index be0457e..6600749 100644 --- a/src/phonecodes/phonecode_tables.py +++ b/src/phonecodes/phonecode_tables.py @@ -1,5 +1,8 @@ """ -Tables mapping other phonecodes to/from IPA +Tables mapping other phonecodes to/from IPA. +Note that working with unicode symbols can be tricky. +Refer to the Unicode standards at https://www.unicode.org/charts/ and +check symbols against a Unicode character inspector like https://apps.timwhitlock.info/unicode/inspect. """ import re @@ -657,10 +660,15 @@ # Many projects will actually use a subset of the full IPA inventory and it's useful to have an # explicitly defined mapping to transform and validate. # These are some standard mappings from an original IPA inventory to a subset of IPA symbols. -# # These mappings are expected to be one-to-many reductions, -# but can support multi-character symbols. Symbols will be re-mapped in -# order of longest key to shortest. +# Since each replacements are done by iterating over the mapping, cascading +# replacements are supported, but they are not recommended. +# Use the functions in the phonecodes module to check for cascading replacements. + +# N.B. This assumes mapping takes place in dictionary insertion order (this is guaranteed since python 3.7). +# + +# but can support multi-character symbols. 
# This is the standard TIMIT label reduction described by Lee and Hon (1989) # described in https://drive.google.com/file/d/1QI4_omp8E9EvO71jZQBGdH2GV6Pn7FPh/view?usp=sharing. diff --git a/src/phonecodes/phonecodes.py b/src/phonecodes/phonecodes.py index 4071723..5068e9b 100644 --- a/src/phonecodes/phonecodes.py +++ b/src/phonecodes/phonecodes.py @@ -236,7 +236,7 @@ def _verify_code(code): raise ValueError(f"{code} is not a valid phonecode. Choose from: {' '.join(CODES)}") -def convert(s0, c0, c1, language=None, post_ipa_mapping: str[str, str] | None = None): +def convert(s0, c0, c1, language=None, post_ipa_mapping: dict[str, str] | None = None): """Convert a string between a given phonecode and IPA Args: @@ -268,18 +268,30 @@ def convertlist(l0, c0, c1, language, post_ipa_mapping: dict[str, str] | None = def _post_process_ipa_inventories(post_ipa_mapping: dict[str, str]): - keys_sorted_by_length = sorted(post_ipa_mapping.keys(), key=len) - for k in keys_sorted_by_length: - # TODO - pass + # TODO + pass -def _validate_post_processing_inventory_map(symbol_inventory_map: dict[str, str]): - """The - Checks that input key is a substring +def _find_cascading_keys_in_inventory_map(symbol_inventory_map: dict[str, str]) -> list[tuple[str, str]]: + """Returns any keys that might have values that would cascade to later keys during replacement. + Used as a warning if there seem to be cascading replacements involving the same symbol. + This doesn't impact the behavior of the substitution, but serves as a check + against unexpected cascading replacements. Args: - symbol_inventory_map: _description_ + symbol_inventory_map: An ordered dictionary mapping substrings to their desired replacement values. 
""" - # TODO - pass + result = [] + ordered_keys = list(symbol_inventory_map.keys()) + for i, k1 in enumerate(ordered_keys[:-1]): + current_value = symbol_inventory_map[k1] + + # Skip empty strings + if current_value == "": + continue + + for k2 in ordered_keys[i + 1 :]: + if current_value in k2: + result.append((k1, k2)) + + return result diff --git a/test/test_phonecodes.py b/test/test_phonecodes.py index 9346951..ac22907 100644 --- a/test/test_phonecodes.py +++ b/test/test_phonecodes.py @@ -90,7 +90,7 @@ def test_additional_buckeye_examples(ipa_str, buckeye_str): ("bɪɡtɪps", "bihgclgtcltihps"), # 'big tips' lower case no spaces, flip closures # 'This has been attributed to helium film flow in the vapor pressure thermometer.' ( - "ðɪs hɛz bɛn ɪtʃɪbʉɾɪd tʉ ɦɪliɨm fɪlm floʊ ən ðɨ veɪpə pɹɛʃɹ̩ θəmɑmɨɾɚ", + "ðɪs hɛz bɛn ɪtʃɪbʉɾɪd tʉ ɦɪliɨm fɪlm floʊ ən ðɨ veɪpə pɹɛʃɝ θəmɑmɨɾɚ", "DHIHS HHEHZ BCLBEHN IHTCLCHIHBCLBUXDXIHDCL TUX HVIHLIYIXM FIHLM FLOW AXN DHIX VEYPCLPAX PCLPREHSHER THAXMAAMIXDXAXR", ), # 'About dawn he got up to blow.' 
@@ -108,3 +108,24 @@ def test_additional_buckeye_examples(ipa_str, buckeye_str): ) def test_additional_timit_examples(ipa_str, timit_str): assert phonecodes.timit2ipa(timit_str) == ipa_str + + +@pytest.mark.parametrize( + "mapping, expected_value", + [ + # Valid - key set and value set are disjoint + ({"a": "x", "b": "y"}, []), + ({"a": "b", "bc": "a"}, [("a", "bc")]), + # Diacritical markers alone won't cause cascades + ({"◌̩": "", "ɹ̩": "ɝ"}, []), + # These will cause cascades + ({"ax": "b", "bc": "d"}, [("ax", "bc")]), + ({"r": "ɹ", "ɹ̩": "ɝ"}, [("r", "ɹ̩")]), + # These standard reductions should not have any cascading changes + (phonecodes.phonecode_tables.STANDARD_TIMIT_IPA_REDUCTION, []), + (phonecodes.phonecode_tables.BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED, []), + (phonecodes.phonecode_tables.TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED, []), + ], +) +def test_find_cascading_keys_in_inventory_map(mapping, expected_value): + assert phonecodes._find_cascading_keys_in_inventory_map(mapping) == expected_value From df88ecd554af2bc0e201e341cf6b0e8989f3fa4f Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Mon, 24 Nov 2025 17:33:28 -0500 Subject: [PATCH 03/12] Added enumerators for valid phonecodes and postprocess mappings. Things are still failing. 
--- src/phonecodes/phonecodes.py | 218 ++++++++++++++++++++++++++++++----- test/test_phonecodes.py | 2 +- 2 files changed, 188 insertions(+), 32 deletions(-) diff --git a/src/phonecodes/phonecodes.py b/src/phonecodes/phonecodes.py index 5068e9b..8d5159e 100644 --- a/src/phonecodes/phonecodes.py +++ b/src/phonecodes/phonecodes.py @@ -17,13 +17,123 @@ # for other tables, see phonecode_tables.py """ +from collections import abc +from dataclasses import dataclass +from enum import Enum +import re +import warnings + import phonecodes.phonecode_tables as phonecode_tables -CODES = set(("ipa", "arpabet", "xsampa", "disc", "callhome", "buckeye", "timit")) +# Phone codes +IPA_KEY = "ipa" +ARPABET_KEY = "arpabet" +XSAMPA_KEY = "xsampa" +DISC_KEY = "disc" +CALLHOME_KEY = "callhome" +BUCKEYE_KEY = "buckeye" +TIMIT_KEY = "timit" + +CODES = set((IPA_KEY, ARPABET_KEY, XSAMPA_KEY, DISC_KEY, CALLHOME_KEY, BUCKEYE_KEY, TIMIT_KEY)) LANGUAGES = set(("eng", "deu", "nld", "arz", "cmn", "spa", "yue", "lao", "vie")) -vowels = phonecode_tables._ipa_vowels -consonants = phonecode_tables._ipa_consonants + +class Phonecodes(Enum): + """Defines the set of valid phonecode mapping options supported + and which languages are covered. When language is not specified, + the mapping does not change depending on language. 
+ """ + + # XSAMPA + IPA2XSAMPA = IPA_KEY, XSAMPA_KEY + XSAMPA2IPA = XSAMPA_KEY, IPA_KEY + + # DISC + DISC2IPA = DISC_KEY, IPA_KEY + DISC2IPA_NLD = DISC_KEY, IPA_KEY, "nld" + DISC2IPA_ENG = DISC_KEY, IPA_KEY, "eng" + IPA2DISC = IPA_KEY, DISC_KEY + + # CALLHOME + CALLHOME2IPA_ARZ = CALLHOME_KEY, IPA_KEY, "arz" + CALLHOME2IPA_CMN = CALLHOME_KEY, IPA_KEY, "cmn" + CALLHOME2IPA_SPA = CALLHOME_KEY, IPA_KEY, "spa" + IPA2CALLHOME_ARZ = IPA_KEY, CALLHOME_KEY, "arz" + IPA2CALLHOME_CMN = IPA_KEY, CALLHOME_KEY, "cmn" + IPA2CALLHOME_SPA = IPA_KEY, CALLHOME_KEY, "spa" + + # ARPABET + ARPABET2IPA = ARPABET_KEY, IPA_KEY + IPA2ARPABET = IPA_KEY, ARPABET_KEY + + # TIMIT + TIMIT2IPA = TIMIT_KEY, IPA_KEY + + # Buckeye + BUCKEYE2IPA = BUCKEYE_KEY, IPA_KEY + IPA2BUCKEYE = IPA_KEY, BUCKEYE_KEY + + def __init__(self, in_code, out_code, language=None): + self.in_code = in_code + self.out_code = out_code + self.language = language + + +# Which symbol mapping will be used in conversion? +_phonecode_lookup = { + # XSAMPA + Phonecodes.IPA2XSAMPA: phonecode_tables._ipa2xsampa, + Phonecodes.XSAMPA2IPA: phonecode_tables._xsampa_and_diac2ipa, + # DISC + Phonecodes.DISC2IPA: phonecode_tables._disc2ipa, + Phonecodes.DISC2IPA_NLD: phonecode_tables._disc2ipa_dutch, + Phonecodes.DISC2IPA_ENG: phonecode_tables._disc2ipa_english, + Phonecodes.IPA2DISC: phonecode_tables._ipa2disc, + # CALLHOME + Phonecodes.CALLHOME2IPA_ARZ: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_ARZ.language], + Phonecodes.CALLHOME2IPA_CMN: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_CMN.language], + Phonecodes.CALLHOME2IPA_SPA: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_SPA.language], + # ARPABET + Phonecodes.ARPABET2IPA: phonecode_tables._arpabet2ipa, + Phonecodes.IPA2ARPABET: phonecode_tables._ipa2arpabet, +} + + +@dataclass +class AttachStressTonesConfig: + """Stores the settings for tone or stress attachment algorithms, + which can be different depending on the language and corpus format.""" + + 
tones: abc.Iterable[str] | str + vowels: abc.Iterable[str] | str + searchstep: int + catdir: int + + +# Is there a configuration for adding tones or stress markers to the final output? +_tone_stress_settings = { + Phonecodes.CALLHOME2IPA_ARZ: AttachStressTonesConfig( + phonecode_tables._ipa_stressmarkers, phonecode_tables._ipa_vowels, -1, -1 + ), + Phonecodes.CALLHOME2IPA_CMN: AttachStressTonesConfig( + phonecode_tables._ipa_tones, phonecode_tables._ipa_vowels, -1, 1 + ), + Phonecodes.CALLHOME2IPA_SPA: AttachStressTonesConfig( + phonecode_tables._ipa_stressmarkers, + phonecode_tables._ipa_vowels, + -1, + -1, + ), + Phonecodes.IPA2CALLHOME_ARZ: AttachStressTonesConfig( + "012", phonecode_tables._callhome_vowels[Phonecodes.IPA2CALLHOME_ARZ.language], 1, 1 + ), + Phonecodes.IPA2CALLHOME_CMN: AttachStressTonesConfig( + "012345", phonecode_tables._callhome_vowels[Phonecodes.IPA2CALLHOME_CMN.language], -1, 1 + ), + Phonecodes.IPA2CALLHOME_SPA: AttachStressTonesConfig( + "012", phonecode_tables._callhome_vowels[Phonecodes.IPA2CALLHOME_SPA.language], 1, 1 + ), +} ##################################################################### @@ -62,7 +172,7 @@ def translate_string(s, d): return (tl[::-1], translated[::-1]) -def attach_tones_to_vowels(il, tones, vowels, searchstep, catdir): +def attach_tones_to_vowels(il: list[str], tones, vowels, searchstep, catdir) -> list[str]: """Return a copy of il, with each tone attached to nearest vowel if any. searchstep=1 means search for next vowel, searchstep=-1 means prev vowel. catdir>=0 means concatenate after vowel, catdir<0 means cat before vowel. 
@@ -104,7 +214,7 @@ def tone2ipa(n, language): ##################################################################### # DISC, the system used by CELEX -def disc2ipa(x, language): +def disc2ipa(x, language=None): """Convert DISC symbol x into IPA, for language L""" if language == "nld": (tl, ttf) = translate_string(x, phonecode_tables._disc2ipa_dutch) @@ -117,23 +227,12 @@ def disc2ipa(x, language): return "".join(tl) -def ipa2disc(x, language): +def ipa2disc(x, language=None): """Convert IPA symbol x into DISC""" (tl, ttf) = translate_string(x, phonecode_tables._ipa2disc) return "".join(tl) -def ipa2disc_old(x, language): - """Convert IPA symbol x into DISC for given language""" - # Convert whole thing if possible; otherwise try prefix+vowel; else quit - if x in phonecode_tables._ipa2disc: - return phonecode_tables._ipa2disc[x] - elif x[0] in phonecode_tables._ipa2disc and x[1:] in phonecode_tables._ipa2disc: - return phonecode_tables._ipa2disc[x[0]] + phonecode_tables._ipa2disc[x[1:]] - else: - raise KeyError("Unknown IPA symbol %s for language %s" % (x, language)) - - ####################################################################### # Callhome phone codes def callhome2ipa(x, language): @@ -157,11 +256,10 @@ def callhome2ipa(x, language): -1, -1, ) - # TODO What to do if language doesn't match return "".join(ol) -def ipa2callhome(x, language): +def ipa2callhome(x, language=None): """Convert IPA symbol x into callhome notation for given language""" (il, ttf) = translate_string(x, phonecode_tables._ipa2callhome[language]) if language == "arz": @@ -170,7 +268,6 @@ def ipa2callhome(x, language): ol = attach_tones_to_vowels(il, "012345", phonecode_tables._callhome_vowels["cmn"], -1, 1) elif language == "spa": ol = attach_tones_to_vowels(il, "012", phonecode_tables._callhome_vowels["spa"], 1, 1) - # TODO What to do if language doesn't match? 
return "".join(ol) @@ -244,6 +341,9 @@ def convert(s0, c0, c1, language=None, post_ipa_mapping: dict[str, str] | None = c0 (str): Input phonecode: 'arpabet', 'xsampa','disc', 'callhome' or 'ipa' c1 (str): Output phonecode: 'arpabet', 'xsampa','disc', 'callhome' or 'ipa' language (str | None): The language of the string, optional since it is only required for 'disc' and 'callhome' phonecodes + post_ipa_mapping dict[str, str]: Optional additional normalization of + + Raises: ValueError: If the phonecode is not a valid option @@ -254,25 +354,68 @@ def convert(s0, c0, c1, language=None, post_ipa_mapping: dict[str, str] | None = _verify_code(c0) _verify_code(c1) - if c0 == "ipa" and c1 != "ipa": - x = _convertfuncs[c1][1](s0, language) - return x - elif c0 != "ipa" and c1 == "ipa": - return _convertfuncs[c0][0](s0, language) - else: - raise ValueError(f"Must convert to/from 'ipa', not '{c0}' to '{c1}'") + # Get the right enumerator for looking up mappings + valid_phonecodes = Phonecodes.__members__.values() + phonecode_enum = (c0, c1, language) + if phonecode_enum not in valid_phonecodes: + phonecode_enum = (c0, c1, None) # Fall back to language not specified + if phonecode_enum not in valid_phonecodes: + raise ValueError( + f"Phonecode pairing ({c0, c1, language}) is not valid. Must convert to/from 'ipa' in supported languages or leave language unspecified." 
+ ) + + # Most basic mapping + input_string = s0 + translation_mapping = _phonecode_lookup[phonecode_enum] + if phonecode_enum in [Phonecodes.ARPABET2IPA, Phonecodes.BUCKEYE2IPA, Phonecodes.TIMIT2IPA]: + input_string = input_string.upper() + (mapped_string, ttf) = translate_string(input_string, translation_mapping) + + # Add tones/stress if it's configured for this enum + if phonecode_enum in _tone_stress_settings: + stress_config = _tone_stress_settings[phonecode_enum] + mapped_string = attach_tones_to_vowels( + mapped_string, stress_config.tones, stress_config.vowels, stress_config.searchstep, stress_config.catdir + ) + + final_string = "".join(mapped_string) + if post_ipa_mapping is not None: + final_string = _post_process_reduction(final_string, translation_mapping, post_ipa_mapping) + + return final_string def convertlist(l0, c0, c1, language, post_ipa_mapping: dict[str, str] | None = None): return [convert(s0, c0, c1, language, post_ipa_mapping) for s0 in l0] -def _post_process_ipa_inventories(post_ipa_mapping: dict[str, str]): - # TODO - pass +def _post_process_reduction( + input_string: str, original_translation_mapping: dict[str, str], post_ipa_mapping: dict[str, str] +): + """ + + Args: + post_ipa_mapping: _description_ + """ + cascading_keys = _find_cascading_keys_in_symbol_mapping(post_ipa_mapping) + if len(cascading_keys) > 0: + warnings.warn( + f"Post-processing does not perfrom cascading replacements, but overlapping key/value pairs are detected. Check that this is intended. These keys are affected: {cascading_keys}." + ) + + new_keys = _get_extra_reduction_keys(original_translation_mapping, post_ipa_mapping) + if len(new_keys) > 0: + warnings.warn(f"There are keys in post-processing which do not appear in the original phonetable: {new_keys}.") + # Replacements happen greedily in the order of the post processing map, + # because there may be intentional orderings of substitutions. 
+ pattern = "|".join(re.escape(k) for k in post_ipa_mapping.keys()) -def _find_cascading_keys_in_inventory_map(symbol_inventory_map: dict[str, str]) -> list[tuple[str, str]]: + re.sub(pattern, lambda match: post_ipa_mapping[match.group()], input_string) + return + + +def _find_cascading_keys_in_symbol_mapping(symbol_inventory_map: dict[str, str]) -> list[tuple[str, str]]: """Returns any keys that might have values that would cascade to later keys during replacement. Used as a warning if there seem to be cascading replacements involving the same symbol. This doesn't impact the behavior of the substitution, but serves as a check @@ -295,3 +438,16 @@ def _find_cascading_keys_in_inventory_map(symbol_inventory_map: dict[str, str]) result.append((k1, k2)) return result + + +def _get_extra_reduction_keys(to_ipa_map: dict[str, str], ipa_reduction_map: dict[str, str]) -> set[str]: + """Returns the set of keys in ipa_reduction_map that are not used in the corpus' official IPA inventory. + + Args: + to_ipa_map: The original corpus symbols mapped to IPA symbols. + ipa_symbol_inventory_map: An IPA symbols to IPA symbol mapping for standardizing the corpus. 
+ """ + ipa_original = set(to_ipa_map.values()) + ipa_reduction_keys = set(ipa_reduction_map.keys()) + overlap = ipa_reduction_keys - ipa_original + return overlap diff --git a/test/test_phonecodes.py b/test/test_phonecodes.py index ac22907..27f5b33 100644 --- a/test/test_phonecodes.py +++ b/test/test_phonecodes.py @@ -128,4 +128,4 @@ def test_additional_timit_examples(ipa_str, timit_str): ], ) def test_find_cascading_keys_in_inventory_map(mapping, expected_value): - assert phonecodes._find_cascading_keys_in_inventory_map(mapping) == expected_value + assert phonecodes._find_cascading_keys_in_symbol_mapping(mapping) == expected_value From 170c4c7363f5e85b1b1d17c481547f93dc838fb4 Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Mon, 24 Nov 2025 18:14:22 -0500 Subject: [PATCH 04/12] Added missing keys --- src/phonecodes/phonecodes.py | 37 +++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/phonecodes/phonecodes.py b/src/phonecodes/phonecodes.py index 8d5159e..c29530f 100644 --- a/src/phonecodes/phonecodes.py +++ b/src/phonecodes/phonecodes.py @@ -66,7 +66,7 @@ class Phonecodes(Enum): ARPABET2IPA = ARPABET_KEY, IPA_KEY IPA2ARPABET = IPA_KEY, ARPABET_KEY - # TIMIT + # TIMIT - There is no way to convert from IPA to TIMIT due to closure symbols TIMIT2IPA = TIMIT_KEY, IPA_KEY # Buckeye @@ -78,6 +78,20 @@ def __init__(self, in_code, out_code, language=None): self.out_code = out_code self.language = language + @classmethod + def as_member(cls, in_code, out_code, language=None): + valid_codes = set(item.value for item in cls) + phonecode_tuple = (in_code, out_code, language) + if phonecode_tuple in valid_codes: + return Phonecodes((in_code, out_code, language)) + + phonecode_tuple = (in_code, out_code) + if (in_code, out_code) in valid_codes: + return Phonecodes((in_code, out_code)) + raise ValueError( + f"Phonecode pairing {phonecode_tuple} is not valid. 
Must convert to/from 'ipa' in supported languages or leave language unspecified." + ) + # Which symbol mapping will be used in conversion? _phonecode_lookup = { @@ -93,9 +107,17 @@ def __init__(self, in_code, out_code, language=None): Phonecodes.CALLHOME2IPA_ARZ: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_ARZ.language], Phonecodes.CALLHOME2IPA_CMN: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_CMN.language], Phonecodes.CALLHOME2IPA_SPA: phonecode_tables._callhome2ipa[Phonecodes.CALLHOME2IPA_SPA.language], + Phonecodes.IPA2CALLHOME_ARZ: phonecode_tables._ipa2callhome[Phonecodes.IPA2CALLHOME_ARZ.language], + Phonecodes.IPA2CALLHOME_CMN: phonecode_tables._ipa2callhome[Phonecodes.IPA2CALLHOME_CMN.language], + Phonecodes.IPA2CALLHOME_SPA: phonecode_tables._ipa2callhome[Phonecodes.IPA2CALLHOME_SPA.language], # ARPABET Phonecodes.ARPABET2IPA: phonecode_tables._arpabet2ipa, Phonecodes.IPA2ARPABET: phonecode_tables._ipa2arpabet, + # Buckeye + Phonecodes.BUCKEYE2IPA: phonecode_tables._buckeye2ipa, + Phonecodes.IPA2ARPABET: phonecode_tables._ipa2buckeye, + # TIMIT + Phonecodes.TIMIT2IPA: phonecode_tables._timit2ipa, } @@ -133,6 +155,10 @@ class AttachStressTonesConfig: Phonecodes.IPA2CALLHOME_SPA: AttachStressTonesConfig( "012", phonecode_tables._callhome_vowels[Phonecodes.IPA2CALLHOME_SPA.language], 1, 1 ), + Phonecodes.ARPABET2IPA: AttachStressTonesConfig( + phonecode_tables._ipa_stressmarkers, phonecode_tables._ipa_vowels, -1, -1 + ), + Phonecodes.IPA2ARPABET: AttachStressTonesConfig("012", phonecode_tables._arpabet_vowels, 1, 1), } @@ -355,14 +381,7 @@ def convert(s0, c0, c1, language=None, post_ipa_mapping: dict[str, str] | None = _verify_code(c1) # Get the right enumerator for looking up mappings - valid_phonecodes = Phonecodes.__members__.values() - phonecode_enum = (c0, c1, language) - if phonecode_enum not in valid_phonecodes: - phonecode_enum = (c0, c1, None) # Fall back to language not specified - if phonecode_enum not in valid_phonecodes: - 
raise ValueError( - f"Phonecode pairing ({c0, c1, language}) is not valid. Must convert to/from 'ipa' in supported languages or leave language unspecified." - ) + phonecode_enum = Phonecodes.as_member(c0, c1, language) # Most basic mapping input_string = s0 From 92fe28f8fb89aaee7d9c74fc85a69cd177cfd994 Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Mon, 24 Nov 2025 18:44:22 -0500 Subject: [PATCH 05/12] Phonecodes with new conversion function and passing tests --- src/phonecodes/phonecodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phonecodes/phonecodes.py b/src/phonecodes/phonecodes.py index c29530f..bb5f02c 100644 --- a/src/phonecodes/phonecodes.py +++ b/src/phonecodes/phonecodes.py @@ -115,7 +115,7 @@ def as_member(cls, in_code, out_code, language=None): Phonecodes.IPA2ARPABET: phonecode_tables._ipa2arpabet, # Buckeye Phonecodes.BUCKEYE2IPA: phonecode_tables._buckeye2ipa, - Phonecodes.IPA2ARPABET: phonecode_tables._ipa2buckeye, + Phonecodes.IPA2BUCKEYE: phonecode_tables._ipa2buckeye, # TIMIT Phonecodes.TIMIT2IPA: phonecode_tables._timit2ipa, } From 84ef2ddecf11a11c45c70b5b58b75fb736a0530e Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Mon, 24 Nov 2025 18:49:09 -0500 Subject: [PATCH 06/12] Replaced all x2y functions with calls to the convert function --- src/phonecodes/phonecodes.py | 78 +++++------------------------------- 1 file changed, 11 insertions(+), 67 deletions(-) diff --git a/src/phonecodes/phonecodes.py b/src/phonecodes/phonecodes.py index bb5f02c..69d87b4 100644 --- a/src/phonecodes/phonecodes.py +++ b/src/phonecodes/phonecodes.py @@ -222,14 +222,12 @@ def attach_tones_to_vowels(il: list[str], tones, vowels, searchstep, catdir) -> # X-SAMPA def ipa2xsampa(x, language=None): """Attempt to return X-SAMPA equivalent of an IPA phone x.""" - (tl, ttf) = translate_string(x, phonecode_tables._ipa2xsampa) - return "".join(tl) + return convert(x, IPA_KEY, XSAMPA_KEY, language) def xsampa2ipa(x, language=None): 
"""Return the IPA equivalent of X-SAMPA phone x.""" - (tl, ttf) = translate_string(x, phonecode_tables._xsampa_and_diac2ipa) - return "".join(tl) + return convert(x, XSAMPA_KEY, IPA_KEY, language) ###################################################################### @@ -242,82 +240,41 @@ def tone2ipa(n, language): # DISC, the system used by CELEX def disc2ipa(x, language=None): """Convert DISC symbol x into IPA, for language L""" - if language == "nld": - (tl, ttf) = translate_string(x, phonecode_tables._disc2ipa_dutch) - return "".join(tl) - elif language == "eng": - (tl, ttf) = translate_string(x, phonecode_tables._disc2ipa_english) - return "".join(tl) - else: - (tl, ttf) = translate_string(x, phonecode_tables._disc2ipa) - return "".join(tl) + return convert(x, DISC_KEY, IPA_KEY, language) def ipa2disc(x, language=None): """Convert IPA symbol x into DISC""" - (tl, ttf) = translate_string(x, phonecode_tables._ipa2disc) - return "".join(tl) + return convert(x, IPA_KEY, DISC_KEY, language) ####################################################################### # Callhome phone codes def callhome2ipa(x, language): """Convert callhome phone symbol x into IPA for given language""" - (il, ttf) = translate_string(x, phonecode_tables._callhome2ipa[language]) - if language == "arz": - ol = attach_tones_to_vowels( - il, - phonecode_tables._ipa_stressmarkers, - phonecode_tables._ipa_vowels, - -1, - -1, - ) - elif language == "cmn": - ol = attach_tones_to_vowels(il, phonecode_tables._ipa_tones, phonecode_tables._ipa_vowels, -1, 1) - elif language == "spa": - ol = attach_tones_to_vowels( - il, - phonecode_tables._ipa_stressmarkers, - phonecode_tables._ipa_vowels, - -1, - -1, - ) - return "".join(ol) + return convert(x, CALLHOME_KEY, IPA_KEY, language) def ipa2callhome(x, language=None): """Convert IPA symbol x into callhome notation for given language""" - (il, ttf) = translate_string(x, phonecode_tables._ipa2callhome[language]) - if language == "arz": - ol = 
attach_tones_to_vowels(il, "012", phonecode_tables._callhome_vowels["arz"], 1, 1) - elif language == "cmn": - ol = attach_tones_to_vowels(il, "012345", phonecode_tables._callhome_vowels["cmn"], -1, 1) - elif language == "spa": - ol = attach_tones_to_vowels(il, "012", phonecode_tables._callhome_vowels["spa"], 1, 1) - return "".join(ol) + return convert(x, IPA_KEY, CALLHOME_KEY, language) ######################################################################### # ARPABET, TIMIT, Buckeye def arpabet2ipa(x, language=None): """Convert ARPABET symbol X to IPA""" - (il, ttf) = translate_string(x, phonecode_tables._arpabet2ipa) - ol = attach_tones_to_vowels(il, phonecode_tables._ipa_stressmarkers, phonecode_tables._ipa_vowels, -1, -1) - return "".join(ol) + return convert(x, ARPABET_KEY, IPA_KEY, language) def ipa2arpabet(x, language=None): """Convert IPA symbols to ARPABET""" - (il, ttf) = translate_string(x, phonecode_tables._ipa2arpabet) - ol = attach_tones_to_vowels(il, "012", phonecode_tables._arpabet_vowels, 1, 1) - return "".join(ol) + return convert(x, IPA_KEY, ARPABET_KEY, language) def timit2ipa(x, language=None): """Convert TIMIT phone codes to IPA""" - x = x.upper() - (il, ttf) = translate_string(x, phonecode_tables._timit2ipa) - return "".join(il) + return convert(x, TIMIT_KEY, IPA_KEY, language) def ipa2timit(x, language=None): @@ -328,15 +285,12 @@ def ipa2timit(x, language=None): def buckeye2ipa(x, language=None): """Convert Buckeye phone codes to IPA""" - x = x.upper() - (il, ttf) = translate_string(x, phonecode_tables._buckeye2ipa) - return "".join(il) + return convert(x, BUCKEYE_KEY, IPA_KEY, language) def ipa2buckeye(x, language=None): "Convert IPA symbols to Buckeye phone codes" - (il, ttf) = translate_string(x, phonecode_tables._ipa2buckeye) - return "".join(il) + return convert(x, IPA_KEY, BUCKEYE_KEY, language) ####################################################################### @@ -344,16 +298,6 @@ def ipa2buckeye(x, language=None): # are used 
to convert symbols and lists of symbols, respectively, # to or from IPA, by calling appropriate other functions. # -_convertfuncs = { - "arpabet": (arpabet2ipa, ipa2arpabet), - "xsampa": (xsampa2ipa, ipa2xsampa), - "disc": (disc2ipa, ipa2disc), - "callhome": (callhome2ipa, ipa2callhome), - "buckeye": (buckeye2ipa, ipa2buckeye), - "timit": (timit2ipa, ipa2timit), -} - - def _verify_code(code): if code not in CODES: raise ValueError(f"{code} is not a valid phonecode. Choose from: {' '.join(CODES)}") From 1594a025b1ddf13a957d3361d4ba2165aed1b9e2 Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Mon, 24 Nov 2025 19:35:41 -0500 Subject: [PATCH 07/12] Stress on r-colored vowels bugfix and unit test --- src/phonecodes/phonecode_tables.py | 2 +- test/test_phonecodes.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/phonecodes/phonecode_tables.py b/src/phonecodes/phonecode_tables.py index 6600749..3106ddd 100644 --- a/src/phonecodes/phonecode_tables.py +++ b/src/phonecodes/phonecode_tables.py @@ -642,7 +642,7 @@ ####################################################################### # IPA -_ipa_vowels = set("aeiouyɑɒɛɪɔʘʊʌʏəɘæʉɨøɜɞɐɤɵœɶ") | set(("ɪ̈", "ʊ̈")) +_ipa_vowels = set("aeiouyɑɒɛɪɔʘʊʌʏəɘæʉɨøɜɞɐɤɵœɶɝɚ") | set(("ɪ̈", "ʊ̈", "oʊ", "aʊ", "eɪ", "ɔɪ", "aɪ")) _ipa_consonants = set("bɓcdɖɗfɡɠhɦjʝklɭɺmnɳpɸqrɽɹɻsʂɕtʈvʋwxɧzʐʑβʙçðɱɣɢʛɥʜɲɟʄɬɮʎʟɯɰŋɴʋɒʁʀʃθʍχħʒɾɫʔʕʢʡꜛꜜǃ|ǀ‖ǁǂ") _ipa_diacritics = set(re.sub(r"◌", "", "◌̈◌̟◌̠◌̌◌̥◌̩◌◌◌̂◌̯◌̚◌◌̃◌̘◌̺◌̏◌◌̜◌̪◌̴◌̂◌◌́◌◌◌◌̰◌̀◌◌̄◌̻◌̼◌◌̹◌̞◌̙◌̌◌◌̝◌̋◌̤◌̬◌◌̆◌̽ːʰˀʷʱʼʲˤ")) _ipa_stressmarkers = set("ˈˌ") diff --git a/test/test_phonecodes.py b/test/test_phonecodes.py index 27f5b33..2d3252e 100644 --- a/test/test_phonecodes.py +++ b/test/test_phonecodes.py @@ -129,3 +129,16 @@ def test_additional_timit_examples(ipa_str, timit_str): ) def test_find_cascading_keys_in_inventory_map(mapping, expected_value): assert phonecodes._find_cascading_keys_in_symbol_mapping(mapping) == expected_value + + 
+@pytest.mark.parametrize( + "example, incode, outcode, expected", + [ + ("AE1 D V ER0 T", "arpabet", "ipa", "ˈæ d v ɚ t"), + ("AE1 D V ER1 T", "arpabet", "ipa", "ˈæ d v ˈɝ t"), + ("AE0 ER1 T", "arpabet", "ipa", "æ ˈɝ t"), + ("AE0 D V ER1 T AH0 Z M AH0 N T", "arpabet", "ipa", "æ d v ˈɝ t ə z m ə n t"), + ], +) +def test_arpabet_stress_attachment(example, incode, outcode, expected): + assert phonecodes.convert(example, incode, outcode) == expected From a2d653b8d6cc67404ab3e05921529d1a3582327c Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Mon, 24 Nov 2025 19:35:56 -0500 Subject: [PATCH 08/12] Prepare for version bump --- CHANGELOG.md | 11 +++++++++++ pyproject.toml | 6 ++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c212c2..76bcca6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,17 @@ You should also add project tags for each release in Github, see [Managing relea # [Unreleased] +# [2.0.0] - 11/24/2024 +### Added +- A Phonecodes enum class to the phonecodes module, to enforce valid conversion and language pairs more explicitly. +- Support for post-processing after conversion to/from IPA is performed, to allow for reduction to a shared symbol set. This is useful, for example, to convert standard TIMIT symbol reductions or a shared symbol set between Buckeye and TIMIT. + +### Changed +- All codeA2codeB conversion functions in phonecodes now rely on the convert function, which should increase maintainability and reusability of the code. + +### Fixed +- Added missing ARPABET IPA vowels (diphthongs and r-colored vowels) to the set of IPA vowels in phonecode_tables, so that stress markers would be added correctly. Fixes https://github.com/ginic/phonecodes/issues/15. 
+ # [1.2.3] - 10/23/2025 ### Changed - Added python 3.14 to package and pytest GitHub actions diff --git a/pyproject.toml b/pyproject.toml index 77b4855..d490b32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,17 +4,15 @@ build-backend = "setuptools.build_meta" [project] name = "phonecodes" -version = "1.2.3" +version = "2.0.0" description = "Tools for loading dictionaries with various phonecodes (IPA, Callhome, X-SAMPA, ARPABET, DISC=CELEX, Buckeye), for converting among those phonecodes, and for searching those dictionaries for word sequences matching a target." readme = "README.md" license = {file = "LICENSE.txt"} -requires-python = ">=3.7" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", From 96665b63eff8d42156a6fa9274785ebf5d211546 Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Tue, 25 Nov 2025 15:32:12 -0500 Subject: [PATCH 09/12] Type checks and unit tests for conversion with a reduction --- src/phonecodes/phonecodes.py | 102 +++++++++++++++++++---------------- test/test_phonecodes.py | 35 ++++++++++-- 2 files changed, 86 insertions(+), 51 deletions(-) diff --git a/src/phonecodes/phonecodes.py b/src/phonecodes/phonecodes.py index 69d87b4..54d1ead 100644 --- a/src/phonecodes/phonecodes.py +++ b/src/phonecodes/phonecodes.py @@ -17,6 +17,7 @@ # for other tables, see phonecode_tables.py """ +from __future__ import annotations from collections import abc from dataclasses import dataclass from enum import Enum @@ -25,7 +26,7 @@ import phonecodes.phonecode_tables as phonecode_tables -# Phone codes +# Phonecodes constants for easier maintenance IPA_KEY = "ipa" ARPABET_KEY = "arpabet" XSAMPA_KEY = "xsampa" @@ -220,14 +221,14 @@ def attach_tones_to_vowels(il: list[str], tones, vowels, searchstep, catdir) -> 
##################################################################### # X-SAMPA -def ipa2xsampa(x, language=None): +def ipa2xsampa(x, language=None, post_conversion_mapping=None): """Attempt to return X-SAMPA equivalent of an IPA phone x.""" - return convert(x, IPA_KEY, XSAMPA_KEY, language) + return convert(x, IPA_KEY, XSAMPA_KEY, language, post_conversion_mapping) -def xsampa2ipa(x, language=None): +def xsampa2ipa(x, language=None, post_conversion_mapping=None): """Return the IPA equivalent of X-SAMPA phone x.""" - return convert(x, XSAMPA_KEY, IPA_KEY, language) + return convert(x, XSAMPA_KEY, IPA_KEY, language, post_conversion_mapping) ###################################################################### @@ -238,59 +239,59 @@ def tone2ipa(n, language): ##################################################################### # DISC, the system used by CELEX -def disc2ipa(x, language=None): +def disc2ipa(x, language=None, post_conversion_mapping=None): """Convert DISC symbol x into IPA, for language L""" - return convert(x, DISC_KEY, IPA_KEY, language) + return convert(x, DISC_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2disc(x, language=None): +def ipa2disc(x, language=None, post_conversion_mapping=None): """Convert IPA symbol x into DISC""" - return convert(x, IPA_KEY, DISC_KEY, language) + return convert(x, IPA_KEY, DISC_KEY, language, post_conversion_mapping) ####################################################################### # Callhome phone codes -def callhome2ipa(x, language): +def callhome2ipa(x, language, post_conversion_mapping=None): """Convert callhome phone symbol x into IPA for given language""" - return convert(x, CALLHOME_KEY, IPA_KEY, language) + return convert(x, CALLHOME_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2callhome(x, language=None): +def ipa2callhome(x, language=None, post_conversion_mapping=None): """Convert IPA symbol x into callhome notation for given language""" - return convert(x, IPA_KEY, 
CALLHOME_KEY, language) + return convert(x, IPA_KEY, CALLHOME_KEY, language, post_conversion_mapping) ######################################################################### # ARPABET, TIMIT, Buckeye -def arpabet2ipa(x, language=None): +def arpabet2ipa(x, language=None, post_conversion_mapping=None): """Convert ARPABET symbol X to IPA""" - return convert(x, ARPABET_KEY, IPA_KEY, language) + return convert(x, ARPABET_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2arpabet(x, language=None): +def ipa2arpabet(x, language=None, post_conversion_mapping=None): """Convert IPA symbols to ARPABET""" - return convert(x, IPA_KEY, ARPABET_KEY, language) + return convert(x, IPA_KEY, ARPABET_KEY, language, post_conversion_mapping) -def timit2ipa(x, language=None): +def timit2ipa(x, language=None, post_conversion_mapping=None): """Convert TIMIT phone codes to IPA""" - return convert(x, TIMIT_KEY, IPA_KEY, language) + return convert(x, TIMIT_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2timit(x, language=None): +def ipa2timit(x, language=None, post_conversion_mapping=None): raise ValueError( "Converting to 'timit' is unsupported, because TIMIT closure symbols for stops cannot be determined from text." ) -def buckeye2ipa(x, language=None): +def buckeye2ipa(x, language=None, post_conversion_mapping=None): """Convert Buckeye phone codes to IPA""" - return convert(x, BUCKEYE_KEY, IPA_KEY, language) + return convert(x, BUCKEYE_KEY, IPA_KEY, language, post_conversion_mapping) -def ipa2buckeye(x, language=None): +def ipa2buckeye(x, language=None, post_conversion_mapping=None): "Convert IPA symbols to Buckeye phone codes" - return convert(x, IPA_KEY, BUCKEYE_KEY, language) + return convert(x, IPA_KEY, BUCKEYE_KEY, language, post_conversion_mapping) ####################################################################### @@ -303,7 +304,9 @@ def _verify_code(code): raise ValueError(f"{code} is not a valid phonecode. 
Choose from: {' '.join(CODES)}") -def convert(s0, c0, c1, language=None, post_ipa_mapping: dict[str, str] | None = None): +def convert( + s0: str, c0: str, c1: str, language: str | None = None, post_conversion_mapping: dict[str, str] | None = None +) -> str: """Convert a string between a given phonecode and IPA Args: @@ -311,15 +314,13 @@ def convert(s0, c0, c1, language=None, post_ipa_mapping: dict[str, str] | None = c0 (str): Input phonecode: 'arpabet', 'xsampa','disc', 'callhome' or 'ipa' c1 (str): Output phonecode: 'arpabet', 'xsampa','disc', 'callhome' or 'ipa' language (str | None): The language of the string, optional since it is only required for 'disc' and 'callhome' phonecodes - post_ipa_mapping dict[str, str]: Optional additional normalization of - - + post_conversion_mapping dict[str, str]: Optional additional normalization mapping that occurs after conversion and stress assignments (greedy, in the same order as the dictionary keys) Raises: ValueError: If the phonecode is not a valid option Returns: - _type_: _description_ + str: String converted and post processed according to the specified phonecode mappings """ _verify_code(c0) _verify_code(c1) @@ -342,10 +343,12 @@ def convert(s0, c0, c1, language=None, post_ipa_mapping: dict[str, str] | None = ) final_string = "".join(mapped_string) - if post_ipa_mapping is not None: - final_string = _post_process_reduction(final_string, translation_mapping, post_ipa_mapping) - return final_string + # Optional post processing normalization + if post_conversion_mapping is not None: + final_string = _post_process_reduction(final_string, translation_mapping, post_conversion_mapping) + + return final_string.strip() def convertlist(l0, c0, c1, language, post_ipa_mapping: dict[str, str] | None = None): @@ -353,29 +356,34 @@ def convertlist(l0, c0, c1, language, post_ipa_mapping: dict[str, str] | None = def _post_process_reduction( - input_string: str, original_translation_mapping: dict[str, str], post_ipa_mapping: 
dict[str, str] -): - """ + input_string: str, original_translation_mapping: dict[str, str], reduction_mapping: dict[str, str] +) -> str: + """Additional normalization step, replaces symbols in the input_str according to the reduction_mapping (original symbol -> desired symbol). + + Before the replacement occurs, checks for conflicting behaviors, such as mapping symbols which don't appear in the + original_translation_mapping symbol inventory or potential cascading replacements. + These checks show a warning, but will not raise an Exception. Args: - post_ipa_mapping: _description_ + input_string: str, string to do symbol replacements/substitutions on + original_translation_mapping: dict[str, str], original mapping to/from IPA, used only for validation checks + reduction_mapping: dict[str, str], additional symbol reduction, usually for IPA inventories """ - cascading_keys = _find_cascading_keys_in_symbol_mapping(post_ipa_mapping) + cascading_keys = _find_cascading_keys_in_symbol_mapping(reduction_mapping) if len(cascading_keys) > 0: warnings.warn( f"Post-processing does not perfrom cascading replacements, but overlapping key/value pairs are detected. Check that this is intended. These keys are affected: {cascading_keys}." ) - new_keys = _get_extra_reduction_keys(original_translation_mapping, post_ipa_mapping) + new_keys = _get_extra_reduction_keys(original_translation_mapping, reduction_mapping) if len(new_keys) > 0: warnings.warn(f"There are keys in post-processing which do not appear in the original phonetable: {new_keys}.") # Replacements happen greedily in the order of the post processing map, # because there may be intentional orderings of substitutions. 
- pattern = "|".join(re.escape(k) for k in post_ipa_mapping.keys()) + pattern = "|".join(re.escape(k) for k in reduction_mapping.keys()) - re.sub(pattern, lambda match: post_ipa_mapping[match.group()], input_string) - return + return re.sub(pattern, lambda match: reduction_mapping[match.group()], input_string) def _find_cascading_keys_in_symbol_mapping(symbol_inventory_map: dict[str, str]) -> list[tuple[str, str]]: @@ -403,14 +411,14 @@ def _find_cascading_keys_in_symbol_mapping(symbol_inventory_map: dict[str, str]) return result -def _get_extra_reduction_keys(to_ipa_map: dict[str, str], ipa_reduction_map: dict[str, str]) -> set[str]: - """Returns the set of keys in ipa_reduction_map that are not used in the corpus' official IPA inventory. +def _get_extra_reduction_keys(original_mapping: dict[str, str], reduction_mapping: dict[str, str]) -> set[str]: + """Returns the set of keys in reduction_mapping that are not used in the corpus' official symbol inventory. Args: - to_ipa_map: The original corpus symbols mapped to IPA symbols. - ipa_symbol_inventory_map: An IPA symbols to IPA symbol mapping for standardizing the corpus. + original_mapping: The original corpus symbols mapped to/from IPA symbols. + reduction_mapping: A symbol-to-symbol mapping for standardizing the output.
""" - ipa_original = set(to_ipa_map.values()) - ipa_reduction_keys = set(ipa_reduction_map.keys()) + ipa_original = set(original_mapping.values()) + ipa_reduction_keys = set(reduction_mapping.keys()) overlap = ipa_reduction_keys - ipa_original return overlap diff --git a/test/test_phonecodes.py b/test/test_phonecodes.py index 2d3252e..c9ddff0 100644 --- a/test/test_phonecodes.py +++ b/test/test_phonecodes.py @@ -3,6 +3,11 @@ """ import phonecodes.phonecodes as phonecodes +from phonecodes.phonecode_tables import ( + BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED, + TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED, + STANDARD_TIMIT_IPA_REDUCTION, +) import pytest @@ -72,10 +77,7 @@ def test_additional_buckeye_examples(ipa_str, buckeye_str): @pytest.mark.parametrize( "ipa_str, timit_str", [ - ( - " tʃ ɑ k l ɨ t ", - "h# ch aa kcl k l ix tcl t h#", - ), # 'chocolate' with start/stop tokens and no initial closure + ("tʃ ɑ k l ɨ t", "h# ch aa kcl k l ix tcl t h#"), # 'chocolate' with start/stop tokens and no initial closure ("tʃ ɑ k l ɨ t", "tcl ch aa k l ix tcl t"), # 'chocolate' with mixed closure inclusion ("tʃ ɑ k l ɨ t", "tcl ch aa k l ix t"), # 'chocolate' with mixed closure inclusion ("tʃɑklɨt", "tclchaaklixtclt"), # 'chocolate' with mixed closure inclusion, no spaces @@ -142,3 +144,28 @@ def test_find_cascading_keys_in_inventory_map(mapping, expected_value): ) def test_arpabet_stress_attachment(example, incode, outcode, expected): assert phonecodes.convert(example, incode, outcode) == expected + + +@pytest.mark.parametrize( + "example, incode, outcode, post_conversion_mapping, expected", + [ + ("h# ch aa kcl k l ix tcl t h#", "timit", "ipa", STANDARD_TIMIT_IPA_REDUCTION, "tʃ ɑ k l ɪ t"), + ("h#chaakclklixzhh#", "timit", "ipa", STANDARD_TIMIT_IPA_REDUCTION, "tʃɑklɪʃ"), + ( + "w iyn w ern k ih n aan nx eh tq", + "buckeye", + "ipa", + BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED, + "w i w ɹ̩ k ɪ n ɑ n ɛ ʔ", + ), + ( + "w ax w axr k ih n aa nx eh q hv zh", + "timit", + "ipa", + 
TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED, + "w ə w ɹ̩ k ɪ n ɑ n ɛ ʔ h ʒ", + ), + ], +) +def test_convert_with_post_conversion_mapping(example, incode, outcode, post_conversion_mapping, expected): + assert phonecodes.convert(example, incode, outcode, post_conversion_mapping=post_conversion_mapping) == expected From 3504356119ad7106c0f3f59985081c5153477c1e Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Tue, 25 Nov 2025 15:56:39 -0500 Subject: [PATCH 10/12] Happy with unit tests --- test/test_phonecodes.py | 82 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/test/test_phonecodes.py b/test/test_phonecodes.py index c9ddff0..02aadf5 100644 --- a/test/test_phonecodes.py +++ b/test/test_phonecodes.py @@ -1,6 +1,4 @@ -"""Load some pronlexes from the 'fixtures' subdirectory, -test phone code conversion, and test both word and phone searches. -""" +"""Unit tests for phonecodes.phonecodes functionality""" import phonecodes.phonecodes as phonecodes from phonecodes.phonecode_tables import ( @@ -39,6 +37,7 @@ ] +# Test basic functionality of the phonecodes.x2y conversion functions @pytest.mark.parametrize("in_code, out_code, fn_call, language", phonecode_cases) def test_conversion_functions(in_code, out_code, fn_call, language, sentences): result = fn_call(sentences[language][in_code], language) @@ -46,6 +45,7 @@ def test_conversion_functions(in_code, out_code, fn_call, language, sentences): assert result == expected +# Test basic functionality of the convert function with input and output phonecodes @pytest.mark.parametrize("in_code, out_code, fn_call, language", phonecode_cases) def test_convert(in_code, out_code, fn_call, language, sentences): s_in = sentences[language][in_code] @@ -54,6 +54,7 @@ def test_convert(in_code, out_code, fn_call, language, sentences): assert converted == expected +# Invalid phonecode pairs raise a value error @pytest.mark.parametrize( "input_code, output_code", [ @@ -112,6 +113,7 @@ def 
test_additional_timit_examples(ipa_str, timit_str): assert phonecodes.timit2ipa(timit_str) == ipa_str +# Tests the check for potential cascading keys in postprocessing @pytest.mark.parametrize( "mapping, expected_value", [ @@ -146,6 +148,7 @@ def test_arpabet_stress_attachment(example, incode, outcode, expected): assert phonecodes.convert(example, incode, outcode) == expected +# Tests the standard conversion mappings work as expected with the phonecodes.convert function @pytest.mark.parametrize( "example, incode, outcode, post_conversion_mapping, expected", [ @@ -169,3 +172,76 @@ def test_arpabet_stress_attachment(example, incode, outcode, expected): ) def test_convert_with_post_conversion_mapping(example, incode, outcode, post_conversion_mapping, expected): assert phonecodes.convert(example, incode, outcode, post_conversion_mapping=post_conversion_mapping) == expected + + +# Test cases for _get_extra_reduction_keys +@pytest.mark.parametrize( + "original_mapping, reduction_mapping, expected_extra_keys", + [ + # Case 1: No extra keys - reduction keys are subset of original values + ({"AA": "ɑ", "AE": "æ", "AH": "ə"}, {"ɑ": "a", "æ": "a"}, set()), + # Case 2: Extra keys present - reduction has keys not in original values + ({"AA": "ɑ", "AE": "æ"}, {"ɑ": "a", "ɪ": "i", "ʊ": "u"}, {"ɪ", "ʊ"}), + # Case 3: All extra keys + ({"AA": "ɑ", "AE": "æ"}, {"x": "y", "z": "w"}, {"x", "z"}), + # Case 4: Empty reduction mapping + ({"AA": "ɑ", "AE": "æ"}, {}, set()), + # Case 5: Empty original mapping + ({}, {"ɑ": "a", "æ": "e"}, {"ɑ", "æ"}), + # Standard reductions should not have any extra keys + (phonecodes.phonecode_tables._timit2ipa, STANDARD_TIMIT_IPA_REDUCTION, set()), + (phonecodes.phonecode_tables._timit2ipa, TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED, set()), + (phonecodes.phonecode_tables._buckeye2ipa, BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED, set()), + ], +) +def test_get_extra_reduction_keys(original_mapping, reduction_mapping, expected_extra_keys): + result =
phonecodes._get_extra_reduction_keys(original_mapping, reduction_mapping) + assert result == expected_extra_keys + + +# Test cases for _post_process_reduction without any warnings +@pytest.mark.parametrize( + "input_string, original_mapping, reduction_mapping, expected_output", + [ + # Simple single character substitution + ("ɑ æ ə", {"AA": "ɑ", "AE": "æ", "AH": "ə"}, {"ɑ": "a", "æ": "a", "ə": "a"}, "a a a"), + # Multi-character symbol substitution + ("tʃɑklɪʃ", {"CH": "tʃ"}, {"tʃ": "č"}, "čɑklɪʃ"), + # Partial substitution (some symbols not in reduction map) + ("ɑ b æ c ə", {"AA": "ɑ", "AE": "æ"}, {"ɑ": "a", "æ": "e"}, "a b e c ə"), + # No substitutions needed + ("hello world", {"A": "a", "z": "x"}, {"x": "y"}, "hello world"), + # Empty string + ("", {"A": "a"}, {"a": "b"}, ""), + # Deletion (map to empty string) + ("ɑzæ", {"AA": "ɑ", "x": "z"}, {"z": ""}, "ɑæ"), + # Order matters - greedy left-to-right substitution + ("abc", {"A": "a", "B": "b", "z": "ab"}, {"a": "x", "b": "y", "ab": "z"}, "xyc"), + # Real TIMIT reduction example + ("tʃ ɑ k l ɨ ʃ", phonecodes.phonecode_tables._timit2ipa, STANDARD_TIMIT_IPA_REDUCTION, "tʃ ɑ k l ɪ ʃ"), + ], +) +def test_post_process_reduction_substitution(input_string, original_mapping, reduction_mapping, expected_output): + result = phonecodes._post_process_reduction(input_string, original_mapping, reduction_mapping) + assert result == expected_output + + +# Test warning behavior for _post_process_reduction +def test_post_process_reduction_warns_on_cascading_keys(): + """Test that cascading keys trigger a warning""" + input_string = "abc" + original_mapping = {"A": "a", "B": "b", "z": "bc"} + reduction_mapping = {"a": "b", "bc": "d"} # "a"->"b" could cascade to "bc" + + with pytest.warns(UserWarning, match="cascading replacements"): + phonecodes._post_process_reduction(input_string, original_mapping, reduction_mapping) + + +def test_post_process_reduction_warns_on_extra_keys(): + """Test that extra keys trigger a warning""" + 
input_string = "ɑ æ" + original_mapping = {"AA": "ɑ", "AE": "æ"} + reduction_mapping = {"ɑ": "a", "ɪ": "i"} # "ɪ" not in original mapping values + + with pytest.warns(UserWarning, match="do not appear in the original phonetable"): + phonecodes._post_process_reduction(input_string, original_mapping, reduction_mapping) From a42f3ef3d8b33e30a0d828470f188b3b36fc5948 Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Tue, 25 Nov 2025 16:29:22 -0500 Subject: [PATCH 11/12] Caught ordering issue in buckeye conversion mapping. Added remapping example to readme. --- README.md | 23 ++++++++++++++++++++++- src/phonecodes/phonecode_tables.py | 2 +- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ddc376b..b134c85 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Developers may refer to the CONTRIBUTIONS.md for information on the development # Basic Usage ## Converting between Phonetic Alphabets If you want to convert to or from IPA to some other phonetic code, use `phonecodes.phonecodes` as follows: -``` +```python >>> from phonecodes import phonecodes >>> print(phonecodes.CODES) # available phonetic alphabets {'arpabet', 'buckeye', 'ipa', 'timit', 'callhome', 'xsampa', 'disc'} @@ -35,6 +35,27 @@ For 'callhome' and 'disc' you should also specify a language code from the follo - DISC/CELEX: Dutch `'nld'`, English `'eng'`, German `'deu'`. Uses German if unspecified. - Callhome: Spanish `'spa'`, Egyptian Arabic `'arz'`, Mandarin Chinese `'cmn'`. You MUST specify an appropriate language code or you'll get a KeyError. +## Additional post-processing +An additional use case when converting between phonecodes is to normalize the final mapping to a subset of IPA symbols. This is useful if you are collapsing similar sounds together to a reduced symbol inventory or if you are standardizing two corpora with different IPA inventories/conventions to a shared subset. 
+ +We support this use case through the `post_conversion_mapping` keyword argument, an optional dictionary remapping provided with all phonecodes conversion functions. You can provide a custom mapping. Be aware that the remapping algorithm is greedy, proceeds in the order that keys appear in the dictionary, and diacritics need to appear with a base symbol in the mapping. + +Additionally, we provide IPA-to-IPA post-processing dictionary mappings in `phonecodes.phonecode_tables`: +- `phonecodes.phonecode_tables.STANDARD_TIMIT_IPA_REDUCTION`: The 'standard' TIMIT label reduction used in Lee and Hon (1989) that reduces the original 64 TIMIT phonetic labels to 39 categories. This reduction is widely used in the speech recognition community. +- `phonecodes.phonecode_tables.BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED` and `phonecodes.phonecode_tables.TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED`: A conservative reduction from the Buckeye and TIMIT IPA inventories, respectively, to a shared symbol set. This maps nasalized vowels and flaps to their non-nasalized versions, r-colored vowels ('ɚ', 'ɝ') to syllabic r ('ɹ̩'), and normalizes variants of 'ʌ' and schwa to schwa ('ə'). + +```python +>>> from phonecodes import phonecodes +# Conversion from Buckeye to IPA using the original published Buckeye mapping +>>> phonecodes.convert("B AHN NX AAN NX AH", "buckeye", "ipa") +'b ʌ̃ ɾ̃ ɑ̃ ɾ̃ ʌ' +# Conversion from Buckeye to IPA with postprocessing to an IPA inventory shared with TIMIT +>>> phonecodes.convert("B AHN NX AAN NX AH", "buckeye", "ipa", post_conversion_mapping = phonecodes.phonecode_tables.BUCKEYE_IPA_TO_TIMIT_BUCKEYE_SHARED) +'b ə n ɑ n ə' +# Custom mapping example - note that the nasalized diacritics are not affected by the remapping +>>> phonecodes.convert("B AHN NX AAN NX AH", "buckeye", "ipa", post_conversion_mapping = {'ʌ':'ə'}) +'b ə̃ ɾ̃ ɑ̃ ɾ̃ ə' +``` ## Reading Corpus Files If you are working with specific corpora, you can also convert between certain corpus formats as follows: diff --git
a/src/phonecodes/phonecode_tables.py b/src/phonecodes/phonecode_tables.py index 3106ddd..2633c29 100644 --- a/src/phonecodes/phonecode_tables.py +++ b/src/phonecodes/phonecode_tables.py @@ -717,8 +717,8 @@ # Nasalized flap is too inconsistently annotated, so reduce to 'n' "ɾ̃": "n", # Use schwa in final vocabulary - "ʌ": "ə", "ʌ̃": "ə", + "ʌ": "ə", } TIMIT_IPA_TO_TIMIT_BUCKEYE_SHARED = { # These symbols are not present in Buckeye IPA, so must be reduced From 7da5591e112efe1494e6604a59085006bc7f2028 Mon Sep 17 00:00:00 2001 From: Virginia Partridge Date: Tue, 25 Nov 2025 16:55:34 -0500 Subject: [PATCH 12/12] Fix release date for 2.0.0 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76bcca6..eedeab8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ You should also add project tags for each release in Github, see [Managing relea # [Unreleased] -# [2.0.0] - 11/24/2024 +# [2.0.0] - 11/25/2025 ### Added - A Phonecodes enum class to the phonecodes module, to enforce valid conversion and language pairs more explicitly. - Support for post-processing after conversion to/from IPA is performed, to allow for reduction to a shared symbol set. This is useful, for example, to convert standard TIMIT symbol reductions or a shared symbol set between Buckeye and TIMIT.