From 5b8987fbaa8bd9d3ff9efc9e8f24d9b79c47da9b Mon Sep 17 00:00:00 2001
From: Lijiawei <1456470136@qq.com>
Date: Wed, 21 Jan 2026 17:04:02 +0800
Subject: [PATCH] fix: support multiple language codes per entry for WhisperLiveKit compatibility

When using WhisperLiveKit with NLLW, the --lan parameter is passed to both
Whisper and NLLW. Whisper only accepts "zh" for Chinese while NLLW previously
only accepted "zh-CN", causing incompatibility.

Changes:
- Allow language_code field to be a list (e.g., ["zh-CN", "zh"])
- Add helper functions _get_language_codes() and _match_language_code()
- Update dictionary building to map all codes to the same nllb code
- Update lookup functions to support list type language_code
---
 nllw/languages.py | 47 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/nllw/languages.py b/nllw/languages.py
index 2883bef..b231d5f 100644
--- a/nllw/languages.py
+++ b/nllw/languages.py
@@ -199,20 +199,34 @@
     {"name": "Eastern Yiddish", "nllb": "ydd_Hebr", "language_code": "yi"},
     {"name": "Yoruba", "nllb": "yor_Latn", "language_code": "yo"},
     {"name": "Yue Chinese", "nllb": "yue_Hant", "language_code": "yue"},
-    {"name": "Chinese (Simplified)", "nllb": "zho_Hans", "language_code": "zh-CN"},
+    {"name": "Chinese (Simplified)", "nllb": "zho_Hans", "language_code": ["zh-CN", "zh"]},
     {"name": "Chinese (Traditional)", "nllb": "zho_Hant", "language_code": "zh-TW"},
     {"name": "Standard Malay", "nllb": "zsm_Latn", "language_code": "ms"},
     {"name": "Zulu", "nllb": "zul_Latn", "language_code": "zu"},
 ]
 
 NAME_TO_NLLB = {lang["name"]: lang["nllb"] for lang in LANGUAGES}
-NAME_TO_LANGUAGE_CODE = {lang["name"]: lang["language_code"] for lang in LANGUAGES}
-LANGUAGE_CODE_TO_NLLB = {lang["language_code"]: lang["nllb"] for lang in LANGUAGES}
-NLLB_TO_LANGUAGE_CODE = {lang["nllb"]: lang["language_code"] for lang in LANGUAGES}
-LANGUAGE_CODE_TO_NAME = {lang["language_code"]: lang["name"] for lang in LANGUAGES}
 NLLB_TO_NAME = {lang["nllb"]: lang["name"] for lang in LANGUAGES}
 
 
+# Helper function to normalize language_code to list
+def _get_language_codes(lang):
+    codes = lang["language_code"]
+    return codes if isinstance(codes, list) else [codes]
+
+
+# Build dictionaries that support multiple language_codes per entry
+NAME_TO_LANGUAGE_CODE = {lang["name"]: _get_language_codes(lang)[0] for lang in LANGUAGES}
+NLLB_TO_LANGUAGE_CODE = {lang["nllb"]: _get_language_codes(lang)[0] for lang in LANGUAGES}
+
+LANGUAGE_CODE_TO_NLLB = {}
+LANGUAGE_CODE_TO_NAME = {}
+for lang in LANGUAGES:
+    for code in _get_language_codes(lang):
+        LANGUAGE_CODE_TO_NLLB[code] = lang["nllb"]
+        LANGUAGE_CODE_TO_NAME[code] = lang["name"]
+
+
 def get_nllb_code(language_code_code):
     result = LANGUAGE_CODE_TO_NLLB.get(language_code_code, None)
     if result is not None:
@@ -241,14 +255,20 @@ def get_language_name_by_nllb(nllb_code):
     return NLLB_TO_NAME.get(nllb_code)
 
 
+def _match_language_code(lang, identifier):
+    """Check if identifier matches any of the language codes (case-insensitive)."""
+    codes = _get_language_codes(lang)
+    identifier_lower = identifier.lower()
+    return any(code == identifier or code.lower() == identifier_lower for code in codes)
+
+
 def get_language_info(identifier, identifier_type="auto"):
     if identifier_type == "auto":
         for lang in LANGUAGES:
-            if (lang["name"].lower() == identifier.lower() or
-                lang["nllb"] == identifier or
-                lang["nllb"].lower() == identifier.lower() or
-                lang["language_code"] == identifier or
-                lang["language_code"].lower() == identifier.lower()):
+            if (lang["name"].lower() == identifier.lower() or
+                lang["nllb"] == identifier or
+                lang["nllb"].lower() == identifier.lower() or
+                _match_language_code(lang, identifier)):
                 return lang
     elif identifier_type == "name":
         for lang in LANGUAGES:
@@ -260,7 +280,7 @@ def get_language_info(identifier, identifier_type="auto"):
                 return lang
     elif identifier_type == "language_code":
         for lang in LANGUAGES:
-            if lang["language_code"] == identifier or lang["language_code"].lower() == identifier.lower():
+            if _match_language_code(lang, identifier):
                 return lang
     return None
 
@@ -286,4 +306,7 @@ def list_all_nllb_codes():
 
 
 def list_all_language_code_codes():
-    return [lang["language_code"] for lang in LANGUAGES]
+    codes = []
+    for lang in LANGUAGES:
+        codes.extend(_get_language_codes(lang))
+    return codes