From 3f68d87191fdb06391db66c393d7e20bcc75f38b Mon Sep 17 00:00:00 2001 From: Hanish Keloth Date: Mon, 16 Feb 2026 09:23:23 +0530 Subject: [PATCH] feat: Add Korean language support with CosyVoice2 TTS pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a parallel Korean conversation pipeline using: - faster-whisper for Korean ASR (speech-to-text) - OpenAI-compatible LLM API for Korean text generation - CosyVoice2-0.5B for Korean TTS (text-to-speech, streaming) Backend: - New moshi/moshi/korean/ package with ASR, LLM, TTS, and pipeline modules - Korean voice presets (4 voices: 2 female, 2 male) - New /api/chat-ko WebSocket endpoint using same binary protocol - CLI args: --language, --llm-endpoint, --llm-model, --whisper-model, --cosyvoice-model Frontend: - i18n system with English and Korean translations - Language selector toggle (EN / 한국어) on the Queue page - Korean text prompt presets and voice options - All hardcoded UI strings replaced with i18n keys The existing English end-to-end mode remains untouched. 
--- README.md | 76 +++++++ client/src/app.tsx | 40 +++- client/src/i18n/en.json | 48 +++++ client/src/i18n/index.ts | 45 ++++ client/src/i18n/ko.json | 48 +++++ .../src/pages/Conversation/Conversation.tsx | 20 +- .../components/ServerAudio/ServerAudio.tsx | 6 +- .../ServerAudio/ServerAudioStats.tsx | 12 +- .../components/ServerInfo/ServerInfo.tsx | 22 +- client/src/pages/Queue/Queue.tsx | 115 ++++++++-- moshi/moshi/korean/__init__.py | 17 ++ moshi/moshi/korean/asr.py | 119 +++++++++++ moshi/moshi/korean/llm.py | 86 ++++++++ moshi/moshi/korean/pipeline.py | 202 ++++++++++++++++++ moshi/moshi/korean/tts.py | 116 ++++++++++ moshi/moshi/korean/voices.py | 64 ++++++ moshi/moshi/server.py | 65 ++++++ moshi/requirements.txt | 7 +- 18 files changed, 1058 insertions(+), 50 deletions(-) create mode 100644 client/src/i18n/en.json create mode 100644 client/src/i18n/index.ts create mode 100644 client/src/i18n/ko.json create mode 100644 moshi/moshi/korean/__init__.py create mode 100644 moshi/moshi/korean/asr.py create mode 100644 moshi/moshi/korean/llm.py create mode 100644 moshi/moshi/korean/pipeline.py create mode 100644 moshi/moshi/korean/tts.py create mode 100644 moshi/moshi/korean/voices.py diff --git a/README.md b/README.md index 301cc652..3aff65b2 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,82 @@ Personaplex finetunes Moshi and benefits from the generalization capabilities of You enjoy having a good conversation. Have a technical discussion about fixing a reactor core on a spaceship to Mars. You are an astronaut on a Mars mission. Your name is Alex. You are already dealing with a reactor core meltdown on a Mars mission. Several ship systems are failing, and continued instability will lead to catastrophic failure. You explain what is happening and you urgently ask for help thinking through how to stabilize the reactor. 
``` +## Korean Language Support + +PersonaPlex supports Korean conversations through a parallel pipeline using best-in-class open-source components: + +- **ASR**: `faster-whisper` (Korean speech to text) +- **LLM**: Any OpenAI-compatible API (Korean text generation) +- **TTS**: CosyVoice2-0.5B (Korean text to speech, streaming) + +### Additional Dependencies + +Install Korean language support dependencies: +```bash +pip install faster-whisper>=1.0.0 openai>=1.0.0 librosa>=0.10.0 +``` + +For CosyVoice2 TTS, follow the [CosyVoice2 installation guide](https://github.com/FunAudioLLM/CosyVoice). + +### LLM Backend Setup + +Korean mode requires an OpenAI-compatible LLM backend. The easiest option is [Ollama](https://ollama.ai): + +```bash +# Install and start Ollama, then pull a Korean-capable model +ollama pull qwen2.5:7b +``` + +### Launching with Korean Support + +```bash +# English + Korean (both pipelines) +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --language all \ + --llm-endpoint http://localhost:11434/v1 --llm-model qwen2.5:7b + +# Korean only +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --language ko \ + --llm-endpoint http://localhost:11434/v1 --llm-model qwen2.5:7b +``` + +### Korean CLI Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `--language` | `en` | Language mode: `en`, `ko`, or `all` | +| `--llm-endpoint` | `http://localhost:11434/v1` | OpenAI-compatible LLM API endpoint | +| `--llm-model` | `qwen2.5:7b` | LLM model name | +| `--llm-api-key` | `ollama` | API key for the LLM endpoint | +| `--whisper-model` | `large-v3` | Whisper model size for Korean ASR | +| `--cosyvoice-model` | `FunAudioLLM/CosyVoice2-0.5B` | CosyVoice2 model for Korean TTS | + +### Korean Voices + +| Voice | Description | Gender | +|-------|------------|--------| +| 한국어 여성 1 | Korean Female Natural | F | +| 한국어 여성 2 | Korean Female Expressive | F | +| 한국어 남성 1 | Korean Male Natural | M | +| 한국어 남성 2 | 
Korean Male Expressive | M | + +### Korean Pipeline Architecture + +``` +User Mic → Opus → WebSocket → faster-whisper (Korean ASR) + ↓ + Korean text + ↓ + LLM (OpenAI-compatible API) + ↓ + Korean response text + ↓ + CosyVoice2-0.5B (Korean TTS, streaming) + ↓ + PCM → Opus → WebSocket → Client Speaker +``` + +The Korean pipeline runs on a separate WebSocket endpoint (`/api/chat-ko`) and uses the same binary protocol as the English pipeline. Users select their language from the UI before connecting. + ## License The present code is provided under the MIT license. The weights for the models are released under the NVIDIA Open Model license. diff --git a/client/src/app.tsx b/client/src/app.tsx index 49d6d3b0..8459d942 100644 --- a/client/src/app.tsx +++ b/client/src/app.tsx @@ -1,3 +1,4 @@ +import { useState, useCallback, useMemo } from "react"; import ReactDOM from "react-dom/client"; import { createBrowserRouter, @@ -5,14 +6,39 @@ import { } from "react-router-dom"; import "./index.css"; import { Queue } from "./pages/Queue/Queue"; +import { I18nContext, Language, translate } from "./i18n"; -const router = createBrowserRouter([ - { - path: "/", - element: , - }, -]); +const App = () => { + const [language, setLanguage] = useState("en"); + + const t = useCallback( + (key: string) => translate(language, key), + [language], + ); + + const i18nValue = useMemo( + () => ({ language, setLanguage, t }), + [language, t], + ); + + const router = useMemo( + () => + createBrowserRouter([ + { + path: "/", + element: , + }, + ]), + [], + ); + + return ( + + + + ); +}; ReactDOM.createRoot(document.getElementById("root") as HTMLElement).render( - + ); diff --git a/client/src/i18n/en.json b/client/src/i18n/en.json new file mode 100644 index 00000000..bc62f4f2 --- /dev/null +++ b/client/src/i18n/en.json @@ -0,0 +1,48 @@ +{ + "app": { + "title": "PersonaPlex", + "description": "Full duplex conversational AI with text and voice control." 
+ }, + "queue": { + "textPromptLabel": "Text Prompt:", + "examplesLabel": "Examples:", + "textPromptPlaceholder": "Enter your text prompt...", + "voiceLabel": "Voice:", + "connectButton": "Connect", + "microphoneError": "Please enable your microphone before proceeding", + "languageLabel": "Language:" + }, + "conversation": { + "newConversation": "New Conversation", + "disconnect": "Disconnect", + "connecting": "Connecting...", + "downloadAudio": "Download audio", + "connectionIssue": "A connection issue has been detected, you've been reconnected", + "dismiss": "Dismiss" + }, + "serverInfo": { + "header": "Our server is running on the following configuration:", + "textTemperature": "Text temperature", + "textTopk": "Text topk", + "audioTemperature": "Audio temperature", + "audioTopk": "Audio topk", + "padMult": "Pad mult", + "repeatPenaltyLastN": "Repeat penalty last N", + "repeatPenalty": "Repeat penalty", + "lmModelFile": "LM model file", + "instanceName": "Instance name" + }, + "stats": { + "title": "Server Audio Stats", + "audioPlayed": "Audio played:", + "missedAudio": "Missed audio:", + "latency": "Latency:", + "minMaxBuffer": "Min/Max buffer:" + }, + "presets": { + "assistant": "Assistant (default)", + "medical": "Medical office (service)", + "bank": "Bank (service)", + "astronaut": "Astronaut (fun)" + } +} diff --git a/client/src/i18n/index.ts b/client/src/i18n/index.ts new file mode 100644 index 00000000..7c391589 --- /dev/null +++ b/client/src/i18n/index.ts @@ -0,0 +1,45 @@ +import { createContext, useContext } from "react"; +import en from "./en.json"; +import ko from "./ko.json"; + +export type Language = "en" | "ko"; + +const translations: Record = { en, ko }; + +export type I18nContextType = { + language: Language; + setLanguage: (lang: Language) => void; + t: (key: string) => string; +}; + +/** + * Get a nested value from an object using a dot-separated key path. 
+ */ +function getNestedValue(obj: Record, keyPath: string): string { + const keys = keyPath.split("."); + let current: unknown = obj; + for (const k of keys) { + if (current === null || current === undefined || typeof current !== "object") { + return keyPath; + } + current = (current as Record)[k]; + } + return typeof current === "string" ? current : keyPath; +} + +export function translate(language: Language, key: string): string { + return getNestedValue( + translations[language] as unknown as Record, + key, + ); +} + +export const I18nContext = createContext({ + language: "en", + setLanguage: () => {}, + t: (key: string) => translate("en", key), +}); + +export function useI18n(): I18nContextType { + return useContext(I18nContext); +} diff --git a/client/src/i18n/ko.json b/client/src/i18n/ko.json new file mode 100644 index 00000000..7df6e463 --- /dev/null +++ b/client/src/i18n/ko.json @@ -0,0 +1,48 @@ +{ + "app": { + "title": "PersonaPlex", + "description": "텍스트와 음성 제어가 가능한 전이중 대화형 AI." 
+ }, + "queue": { + "textPromptLabel": "텍스트 프롬프트:", + "examplesLabel": "예시:", + "textPromptPlaceholder": "텍스트 프롬프트를 입력하세요...", + "voiceLabel": "음성:", + "connectButton": "연결", + "microphoneError": "진행하기 전에 마이크를 활성화해 주세요", + "languageLabel": "언어:" + }, + "conversation": { + "newConversation": "새 대화", + "disconnect": "연결 해제", + "connecting": "연결 중...", + "downloadAudio": "오디오 다운로드", + "connectionIssue": "연결 문제가 감지되어 다시 연결되었습니다", + "dismiss": "닫기" + }, + "serverInfo": { + "header": "서버 구성 정보:", + "textTemperature": "텍스트 온도", + "textTopk": "텍스트 Top-K", + "audioTemperature": "오디오 온도", + "audioTopk": "오디오 Top-K", + "padMult": "패드 배수", + "repeatPenaltyLastN": "반복 패널티 마지막 N", + "repeatPenalty": "반복 패널티", + "lmModelFile": "LM 모델 파일", + "instanceName": "인스턴스 이름" + }, + "stats": { + "title": "서버 오디오 통계", + "audioPlayed": "재생된 오디오:", + "missedAudio": "누락된 오디오:", + "latency": "지연 시간:", + "minMaxBuffer": "최소/최대 버퍼:" + }, + "presets": { + "assistant": "AI 비서 (기본)", + "medical": "의료 상담 (서비스)", + "bank": "은행 상담 (서비스)", + "astronaut": "우주비행사 (재미)" + } +} diff --git a/client/src/pages/Conversation/Conversation.tsx b/client/src/pages/Conversation/Conversation.tsx index 4c7711a0..7b79d519 100644 --- a/client/src/pages/Conversation/Conversation.tsx +++ b/client/src/pages/Conversation/Conversation.tsx @@ -13,6 +13,7 @@ import { ModelParamsValues, useModelParams } from "./hooks/useModelParams"; import fixWebmDuration from "webm-duration-fix"; import { getMimeType, getExtension } from "./getMimeType"; import { type ThemeType } from "./hooks/useSystemTheme"; +import { useI18n, type Language } from "../../i18n"; type ConversationProps = { workerAddr: string; @@ -21,6 +22,7 @@ type ConversationProps = { sessionId?: number; email?: string; theme: ThemeType; + language?: Language; audioContext: MutableRefObject; worklet: MutableRefObject; onConversationEnd?: () => void; @@ -36,6 +38,7 @@ const buildURL = ({ email, textSeed, audioSeed, + language = "en", }: { workerAddr: string; params: 
ModelParamsValues; @@ -43,6 +46,7 @@ const buildURL = ({ email?: string; textSeed: number; audioSeed: number; + language?: Language; }) => { const newWorkerAddr = useMemo(() => { if (workerAddr == "same" || workerAddr == "") { @@ -53,7 +57,8 @@ const buildURL = ({ return workerAddr; }, [workerAddr]); const wsProtocol = (window.location.protocol === 'https:') ? 'wss' : 'ws'; - const url = new URL(`${wsProtocol}://${newWorkerAddr}/api/chat`); + const chatEndpoint = language === "ko" ? "/api/chat-ko" : "/api/chat"; + const url = new URL(`${wsProtocol}://${newWorkerAddr}${chatEndpoint}`); if(workerAuthId) { url.searchParams.append("worker_auth_id", workerAuthId); } @@ -88,8 +93,10 @@ export const Conversation:FC = ({ isBypass=false, email, theme, + language = "en", ...params }) => { + const { t } = useI18n(); const getAudioStats = useRef<() => AudioStats>(() => ({ playedAudioDuration: 0, missedAudioDuration: 0, @@ -120,6 +127,7 @@ export const Conversation:FC = ({ email: email, textSeed: textSeed, audioSeed: audioSeed, + language, }); const onDisconnect = useCallback(() => { @@ -223,14 +231,14 @@ export const Conversation:FC = ({ const socketButtonMsg = useMemo(() => { if (isOver) { - return 'New Conversation'; + return t("conversation.newConversation"); } if (socketStatus === "connected") { - return 'Disconnect'; + return t("conversation.disconnect"); } else { - return 'Connecting...'; + return t("conversation.connecting"); } - }, [isOver, socketStatus]); + }, [isOver, socketStatus, t]); return ( = ({ />
- {audioURL && } + {audioURL && }
diff --git a/client/src/pages/Conversation/components/ServerAudio/ServerAudio.tsx b/client/src/pages/Conversation/components/ServerAudio/ServerAudio.tsx index f6bb4584..7f1c4852 100644 --- a/client/src/pages/Conversation/components/ServerAudio/ServerAudio.tsx +++ b/client/src/pages/Conversation/components/ServerAudio/ServerAudio.tsx @@ -2,6 +2,7 @@ import { FC, useRef } from "react"; import { AudioStats, useServerAudio } from "../../hooks/useServerAudio"; import { ServerVisualizer } from "../AudioVisualizer/ServerVisualizer"; import { type ThemeType } from "../../hooks/useSystemTheme"; +import { useI18n } from "../../../../i18n"; type ServerAudioProps = { setGetAudioStats: (getAudioStats: () => AudioStats) => void; @@ -12,18 +13,19 @@ export const ServerAudio: FC = ({ setGetAudioStats, theme }) = setGetAudioStats, }); const containerRef = useRef(null); + const { t } = useI18n(); return ( <> {hasCriticalDelay && (
-

A connection issue has been detected, you've been reconnected

+

{t("conversation.connectionIssue")}

)} diff --git a/client/src/pages/Conversation/components/ServerAudio/ServerAudioStats.tsx b/client/src/pages/Conversation/components/ServerAudio/ServerAudioStats.tsx index a802bf23..ac591bde 100644 --- a/client/src/pages/Conversation/components/ServerAudio/ServerAudioStats.tsx +++ b/client/src/pages/Conversation/components/ServerAudio/ServerAudioStats.tsx @@ -1,4 +1,5 @@ import { useState, useEffect, useRef } from "react"; +import { useI18n } from "../../../../i18n"; type ServerAudioStatsProps = { getAudioStats: React.MutableRefObject< @@ -15,6 +16,7 @@ type ServerAudioStatsProps = { export const ServerAudioStats = ({ getAudioStats }: ServerAudioStatsProps) => { const [audioStats, setAudioStats] = useState(getAudioStats.current()); + const { t } = useI18n(); const movingAverageSum = useRef(0.); const movingAverageCount = useRef(0.); @@ -51,23 +53,23 @@ export const ServerAudioStats = ({ getAudioStats }: ServerAudioStatsProps) => { return (
-

Server Audio Stats

+

{t("stats.title")}

- + - + - + - + diff --git a/client/src/pages/Conversation/components/ServerInfo/ServerInfo.tsx b/client/src/pages/Conversation/components/ServerInfo/ServerInfo.tsx index 290d4f31..caf3360a 100644 --- a/client/src/pages/Conversation/components/ServerInfo/ServerInfo.tsx +++ b/client/src/pages/Conversation/components/ServerInfo/ServerInfo.tsx @@ -1,22 +1,24 @@ import { useServerInfo } from "../../hooks/useServerInfo"; +import { useI18n } from "../../../../i18n"; export const ServerInfo = () => { const { serverInfo } = useServerInfo(); + const { t } = useI18n(); if (!serverInfo) { return null; } return (
- Our server is running on the following configuration: -
Text temperature: {serverInfo.text_temperature}
-
Text topk: {serverInfo.text_topk}
-
Audio temperature: {serverInfo.audio_temperature}
-
Audio topk: {serverInfo.audio_topk}
-
Pad mult: {serverInfo.pad_mult}
-
Repeat penalty last N: {serverInfo.repetition_penalty_context}
-
Repeat penalty: {serverInfo.repetition_penalty}
-
LM model file: {serverInfo.lm_model_file}
-
Instance name: {serverInfo.instance_name}
+ {t("serverInfo.header")} +
{t("serverInfo.textTemperature")}: {serverInfo.text_temperature}
+
{t("serverInfo.textTopk")}: {serverInfo.text_topk}
+
{t("serverInfo.audioTemperature")}: {serverInfo.audio_temperature}
+
{t("serverInfo.audioTopk")}: {serverInfo.audio_topk}
+
{t("serverInfo.padMult")}: {serverInfo.pad_mult}
+
{t("serverInfo.repeatPenaltyLastN")}: {serverInfo.repetition_penalty_context}
+
{t("serverInfo.repeatPenalty")}: {serverInfo.repetition_penalty}
+
{t("serverInfo.lmModelFile")}: {serverInfo.lm_model_file}
+
{t("serverInfo.instanceName")}: {serverInfo.instance_name}
); }; diff --git a/client/src/pages/Queue/Queue.tsx b/client/src/pages/Queue/Queue.tsx index 7d0d44bb..95350d3d 100644 --- a/client/src/pages/Queue/Queue.tsx +++ b/client/src/pages/Queue/Queue.tsx @@ -7,6 +7,7 @@ import { Button } from "../../components/Button/Button"; import { useModelParams } from "../Conversation/hooks/useModelParams"; import { env } from "../../env"; import { prewarmDecoderWorker } from "../../decoder/decoderWorker"; +import { useI18n, Language } from "../../i18n"; const VOICE_OPTIONS = [ "NATF0.pt", "NATF1.pt", "NATF2.pt", "NATF3.pt", @@ -15,6 +16,13 @@ const VOICE_OPTIONS = [ "VARM0.pt", "VARM1.pt", "VARM2.pt", "VARM3.pt", "VARM4.pt", ]; +const KOREAN_VOICE_OPTIONS = [ + { key: "ko_female_1", label: "한국어 여성 1 (Natural)" }, + { key: "ko_female_2", label: "한국어 여성 2 (Expressive)" }, + { key: "ko_male_1", label: "한국어 남성 1 (Natural)" }, + { key: "ko_male_2", label: "한국어 남성 2 (Expressive)" }, +]; + const TEXT_PROMPT_PRESETS = [ { label: "Assistant (default)", @@ -34,6 +42,25 @@ const TEXT_PROMPT_PRESETS = [ }, ]; +const KOREAN_TEXT_PROMPT_PRESETS = [ + { + label: "AI 비서 (기본)", + text: "당신은 친절한 AI 비서입니다. 사용자의 질문에 한국어로 자연스럽게 대답하세요.", + }, + { + label: "은행 상담 (서비스)", + text: "당신은 은행 고객 서비스 상담원입니다. 정중하게 고객을 도와주세요.", + }, + { + label: "의료 상담 (서비스)", + text: "당신은 의료 상담 안내원입니다. 환자의 질문에 친절하게 답변하세요.", + }, + { + label: "우주비행사 (재미)", + text: "당신은 화성 임무 중인 우주비행사입니다. 우주선의 원자로 문제를 해결하기 위해 도움을 요청하고 있습니다. 긴급한 상황을 설명하고 함께 해결책을 찾아보세요.", + }, +]; + interface HomepageProps { showMicrophoneAccessMessage: boolean; startConnection: () => Promise; @@ -51,24 +78,66 @@ const Homepage = ({ voicePrompt, setVoicePrompt, }: HomepageProps) => { + const { language, setLanguage, t } = useI18n(); + + const presets = language === "ko" ? KOREAN_TEXT_PROMPT_PRESETS : TEXT_PROMPT_PRESETS; + const isKorean = language === "ko"; + return (
-

PersonaPlex

+

{t("app.title")}

- Full duplex conversational AI with text and voice control. + {t("app.description")}

+ {/* Language selector */} +
+ +
+ + +
+
+
- Examples: + {t("queue.examplesLabel")}
- {TEXT_PROMPT_PRESETS.map((preset) => ( + {presets.map((preset) => ( + +
); @@ -127,6 +202,7 @@ const Homepage = ({ export const Queue:FC = () => { const theme = "light" as const; // Always use light theme + const { language } = useI18n(); const [searchParams] = useSearchParams(); const overrideWorkerAddr = searchParams.get("worker_addr"); const [hasMicrophoneAccess, setHasMicrophoneAccess] = useState(false); @@ -135,7 +211,7 @@ export const Queue:FC = () => { const audioContext = useRef(null); const worklet = useRef(null); - + // enable eruda in development useEffect(() => { if(env.VITE_ENV === "development") { @@ -199,6 +275,7 @@ export const Queue:FC = () => { worklet={worklet as MutableRefObject} theme={theme} startConnection={startConnection} + language={language} {...modelParams} /> ) : ( diff --git a/moshi/moshi/korean/__init__.py b/moshi/moshi/korean/__init__.py new file mode 100644 index 00000000..da4a37e8 --- /dev/null +++ b/moshi/moshi/korean/__init__.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: MIT + +from .pipeline import KoreanPipeline +from .asr import KoreanASR +from .tts import KoreanTTS +from .llm import KoreanLLM +from .voices import KOREAN_VOICES, DEFAULT_KOREAN_VOICE + +__all__ = [ + "KoreanPipeline", + "KoreanASR", + "KoreanTTS", + "KoreanLLM", + "KOREAN_VOICES", + "DEFAULT_KOREAN_VOICE", +] diff --git a/moshi/moshi/korean/asr.py b/moshi/moshi/korean/asr.py new file mode 100644 index 00000000..ed2b3cf3 --- /dev/null +++ b/moshi/moshi/korean/asr.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: MIT + +import logging +from typing import Optional + +import numpy as np + +logger = logging.getLogger(__name__) + +# Silence detection threshold (RMS energy) +SILENCE_THRESHOLD = 0.01 +# Minimum audio length in seconds before attempting transcription +MIN_AUDIO_LENGTH_SEC = 0.5 +# Maximum buffer length in seconds before forcing transcription +MAX_BUFFER_LENGTH_SEC = 10.0 + + +class KoreanASR: + """Korean ASR using faster-whisper for speech-to-text.""" + + def __init__( + self, + model_size: str = "large-v3", + device: str = "cuda", + compute_type: str = "float16", + sample_rate: int = 24000, + ): + from faster_whisper import WhisperModel + + self.sample_rate = sample_rate + self.model = WhisperModel( + model_size, + device=device, + compute_type=compute_type, + ) + self._buffer: Optional[np.ndarray] = None + self._silence_frames = 0 + # Number of consecutive silent frames to trigger transcription + self._silence_trigger = int(0.5 * sample_rate / 480) # ~0.5s of silence + logger.info(f"KoreanASR initialized with model={model_size}, device={device}") + + def reset(self): + self._buffer = None + self._silence_frames = 0 + + def _is_silent(self, audio: np.ndarray) -> bool: + rms = np.sqrt(np.mean(audio ** 2)) + return rms < SILENCE_THRESHOLD + + def add_audio(self, audio_pcm: np.ndarray) -> Optional[str]: + """Add audio frames and return transcription when ready. + + Accumulates audio frames and triggers transcription when silence + is detected or the buffer is full. + + Args: + audio_pcm: Audio samples as float32 numpy array, mono, at self.sample_rate. + + Returns: + Transcribed Korean text, or None if more audio is needed. 
+ """ + if self._buffer is None: + self._buffer = audio_pcm + else: + self._buffer = np.concatenate([self._buffer, audio_pcm]) + + buffer_duration = len(self._buffer) / self.sample_rate + + if self._is_silent(audio_pcm): + self._silence_frames += 1 + else: + self._silence_frames = 0 + + should_transcribe = False + if buffer_duration >= MAX_BUFFER_LENGTH_SEC: + should_transcribe = True + elif ( + self._silence_frames >= self._silence_trigger + and buffer_duration >= MIN_AUDIO_LENGTH_SEC + ): + should_transcribe = True + + if should_transcribe: + text = self.transcribe(self._buffer) + self._buffer = None + self._silence_frames = 0 + return text + + return None + + def transcribe(self, audio_pcm: np.ndarray) -> str: + """Transcribe audio to Korean text. + + Args: + audio_pcm: Audio samples as float32 numpy array, mono, at self.sample_rate. + + Returns: + Transcribed Korean text. + """ + segments, info = self.model.transcribe( + audio_pcm, + language="ko", + beam_size=5, + vad_filter=True, + vad_parameters=dict( + min_silence_duration_ms=300, + speech_pad_ms=200, + ), + ) + + text_parts = [] + for segment in segments: + text_parts.append(segment.text.strip()) + + result = " ".join(text_parts).strip() + if result: + logger.debug(f"Transcribed: {result}") + return result diff --git a/moshi/moshi/korean/llm.py b/moshi/moshi/korean/llm.py new file mode 100644 index 00000000..9da9f0b8 --- /dev/null +++ b/moshi/moshi/korean/llm.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: MIT + +import logging +from typing import AsyncGenerator, List, Dict, Optional + +logger = logging.getLogger(__name__) + +DEFAULT_KOREAN_SYSTEM_PROMPT = ( + "당신은 친절한 AI 비서입니다. 사용자의 질문에 한국어로 자연스럽게 대답하세요. " + "답변은 간결하고 대화체로 해주세요." 
+) + + +class KoreanLLM: + """Korean LLM using OpenAI-compatible API (works with vLLM, Ollama, or cloud).""" + + def __init__( + self, + endpoint: str = "http://localhost:11434/v1", + model: str = "qwen2.5:7b", + api_key: str = "ollama", + max_tokens: int = 256, + ): + import openai + + self.model = model + self.max_tokens = max_tokens + self.client = openai.AsyncOpenAI( + base_url=endpoint, + api_key=api_key, + ) + self._conversation_history: List[Dict[str, str]] = [] + logger.info( + f"KoreanLLM initialized: endpoint={endpoint}, model={model}" + ) + + def reset(self): + self._conversation_history = [] + + async def generate( + self, + user_text: str, + system_prompt: Optional[str] = None, + ) -> AsyncGenerator[str, None]: + """Generate a Korean response via streaming. + + Args: + user_text: User's message in Korean. + system_prompt: Optional system prompt override. Defaults to Korean assistant prompt. + + Yields: + Text chunks as they arrive from the LLM. + """ + if system_prompt is None: + system_prompt = DEFAULT_KOREAN_SYSTEM_PROMPT + + messages = [{"role": "system", "content": system_prompt}] + messages.extend(self._conversation_history) + messages.append({"role": "user", "content": user_text}) + + self._conversation_history.append({"role": "user", "content": user_text}) + + full_response = [] + stream = await self.client.chat.completions.create( + model=self.model, + messages=messages, + max_tokens=self.max_tokens, + stream=True, + temperature=0.7, + ) + + async for chunk in stream: + if chunk.choices and chunk.choices[0].delta.content: + text_piece = chunk.choices[0].delta.content + full_response.append(text_piece) + yield text_piece + + assistant_message = "".join(full_response) + self._conversation_history.append( + {"role": "assistant", "content": assistant_message} + ) + + # Keep conversation history manageable + if len(self._conversation_history) > 20: + self._conversation_history = self._conversation_history[-16:] diff --git 
# --- moshi/moshi/korean/pipeline.py (new file) ---
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT

"""Korean conversation pipeline: ASR -> LLM -> TTS over a Moshi-style WebSocket.

Binary protocol (first byte of every frame is the message kind):
  0x00  server handshake
  0x01  opus audio (client mic audio in, TTS audio out)
  0x02  UTF-8 text (user transcript and streamed LLM output)
  0x03  control (0x03 0x03 = restart / reset session state)
  0x05  error text
"""

import asyncio
import logging
from typing import Optional

import numpy as np
import sphn
from aiohttp import web
import aiohttp

from .asr import KoreanASR
from .llm import KoreanLLM
from .tts import KoreanTTS
from .voices import DEFAULT_KOREAN_VOICE
from ..utils.logging import ColorizedLog

logger = logging.getLogger(__name__)

# PersonaPlex native sample rate
SAMPLE_RATE = 24000
# Frame size matching Moshi's frame size at 24kHz
FRAME_SIZE = 1920


class KoreanPipeline:
    """Orchestrates Korean ASR -> LLM -> TTS pipeline over WebSocket."""

    def __init__(
        self,
        asr: KoreanASR,
        llm: KoreanLLM,
        tts: KoreanTTS,
    ):
        """Store the three stages; the model objects are shared across sessions.

        NOTE(review): the lock below only serializes the per-session reset, not
        the whole session — two concurrent clients would interleave calls into
        the same ASR/LLM/TTS instances. Confirm those objects tolerate sharing.
        """
        self.asr = asr
        self.llm = llm
        self.tts = tts
        self.lock = asyncio.Lock()
        logger.info("KoreanPipeline initialized")

    async def handle_chat(self, request: web.Request) -> web.WebSocketResponse:
        """WebSocket handler for Korean conversation, matching the Moshi protocol.

        Query parameters:
            text_prompt:  optional system prompt forwarded to the LLM.
            voice_prompt: voice key for TTS (defaults to DEFAULT_KOREAN_VOICE).
        """
        ws = web.WebSocketResponse()
        await ws.prepare(request)
        clog = ColorizedLog.randomize()
        peer = request.remote
        # FIX: get_extra_info("peername") can return None (e.g. unix socket or
        # an already-closed transport); the original indexed it unconditionally
        # and would raise TypeError before the session even started.
        peername = request.transport.get_extra_info("peername")
        peer_port = peername[1] if peername else "?"
        clog.log("info", f"[KO] Incoming connection from {peer}:{peer_port}")

        text_prompt = request.query.get("text_prompt", "")
        voice_prompt = request.query.get("voice_prompt", DEFAULT_KOREAN_VOICE)

        close = False
        opus_reader = sphn.OpusStreamReader(SAMPLE_RATE)
        opus_writer = sphn.OpusStreamWriter(SAMPLE_RATE)

        # Pending text from ASR awaiting LLM processing. Single-slot mailbox:
        # a new finished utterance overwrites an unprocessed one.
        pending_text: Optional[str] = None
        pending_text_event = asyncio.Event()

        async def recv_loop():
            """Receive audio from client and run ASR."""
            nonlocal close, pending_text
            try:
                async for message in ws:
                    if message.type == aiohttp.WSMsgType.ERROR:
                        clog.log("error", f"{ws.exception()}")
                        break
                    elif message.type in (
                        aiohttp.WSMsgType.CLOSED,
                        aiohttp.WSMsgType.CLOSE,
                    ):
                        break
                    elif message.type != aiohttp.WSMsgType.BINARY:
                        clog.log("error", f"unexpected message type {message.type}")
                        continue

                    data = message.data
                    if not isinstance(data, bytes) or len(data) == 0:
                        continue

                    kind = data[0]
                    if kind == 1:  # audio
                        opus_reader.append_bytes(data[1:])
                        pcm = opus_reader.read_pcm()
                        if pcm.shape[-1] == 0:
                            continue

                        # assumes add_audio buffers PCM and returns a finalized
                        # utterance string (falsy while mid-utterance) — confirm
                        # against KoreanASR.
                        transcription = self.asr.add_audio(pcm)
                        if transcription:
                            clog.log("info", f"[KO] ASR: {transcription}")
                            # Echo the transcript back to the client.
                            text_msg = b"\x02" + bytes(
                                f"[User] {transcription}", encoding="utf8"
                            )
                            await ws.send_bytes(text_msg)
                            pending_text = transcription
                            pending_text_event.set()
                    elif kind == 3:  # control
                        if len(data) > 1 and data[1] == 0x03:  # restart
                            self.asr.reset()
                            self.llm.reset()
                    else:
                        clog.log("warning", f"unknown message kind {kind}")
            finally:
                close = True
                clog.log("info", "[KO] recv_loop closed")

        async def process_loop():
            """Process ASR text through LLM and TTS."""
            nonlocal close, pending_text
            while not close:
                await pending_text_event.wait()
                pending_text_event.clear()

                if close or pending_text is None:
                    continue

                user_text = pending_text
                pending_text = None

                try:
                    # Stream the LLM response, forwarding text chunks as they
                    # arrive.
                    llm_text_buffer = []
                    async for text_chunk in self.llm.generate(
                        user_text, system_prompt=text_prompt or None
                    ):
                        if close:
                            break
                        llm_text_buffer.append(text_chunk)

                        text_msg = b"\x02" + bytes(text_chunk, encoding="utf8")
                        await ws.send_bytes(text_msg)

                    if close:
                        continue

                    # Synthesize the complete LLM response.
                    full_response = "".join(llm_text_buffer)
                    if not full_response.strip():
                        continue

                    clog.log("info", f"[KO] LLM: {full_response[:80]}...")

                    # TTS: convert response to audio and stream it.
                    # NOTE(review): tts.synthesize looks synchronous, so long
                    # utterances will block the event loop — consider
                    # run_in_executor. Confirm against KoreanTTS.
                    for audio_chunk in self.tts.synthesize(full_response, voice_prompt):
                        if close:
                            break
                        opus_writer.append_pcm(audio_chunk)

                except Exception as e:
                    clog.log("error", f"[KO] process error: {e}")
                    error_msg = b"\x05" + bytes(str(e), encoding="utf8")
                    try:
                        await ws.send_bytes(error_msg)
                    except Exception:
                        pass

        async def send_loop():
            """Send TTS audio back to client."""
            while not close:
                await asyncio.sleep(0.001)
                msg = opus_writer.read_bytes()
                if len(msg) > 0:
                    try:
                        await ws.send_bytes(b"\x01" + msg)
                    except Exception:
                        break

        # Reset shared model state before starting this session.
        async with self.lock:
            self.asr.reset()
            self.llm.reset()

        # Send handshake
        await ws.send_bytes(b"\x00")
        clog.log("info", "[KO] sent handshake")

        tasks = [
            asyncio.create_task(recv_loop()),
            asyncio.create_task(process_loop()),
            asyncio.create_task(send_loop()),
        ]

        done, pending_tasks = await asyncio.wait(
            tasks, return_when=asyncio.FIRST_COMPLETED
        )
        close = True
        pending_text_event.set()  # unblock process_loop

        # FIX: surface exceptions from whichever task finished first; the
        # original never retrieved them, so failures died silently (and asyncio
        # logs "exception was never retrieved" at GC time).
        for task in done:
            exc = task.exception()
            if exc is not None:
                clog.log("error", f"[KO] task failed: {exc!r}")

        for task in pending_tasks:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        await ws.close()
        clog.log("info", "[KO] session closed")

        return ws
# SPDX-License-Identifier: MIT

"""Korean text-to-speech via CosyVoice2, resampled to PersonaPlex's 24 kHz."""

import logging
from typing import Generator, Optional

import numpy as np
import torch

from .voices import KOREAN_VOICES, DEFAULT_KOREAN_VOICE, get_voice

logger = logging.getLogger(__name__)

# PersonaPlex native sample rate
TARGET_SAMPLE_RATE = 24000


class KoreanTTS:
    """Korean TTS using CosyVoice2-0.5B for text-to-speech synthesis.

    The CosyVoice2 model is loaded lazily on first synthesis, so constructing
    this object is cheap and the `cosyvoice` package is only required once the
    Korean pipeline is actually used.
    """

    def __init__(
        self,
        model_id: str = "FunAudioLLM/CosyVoice2-0.5B",
        device: str = "cuda",
    ):
        """Remember configuration; no model is loaded here.

        Args:
            model_id: CosyVoice2 checkpoint identifier.
            device: "cuda" or "cpu". NOTE(review): `device` is stored but never
                passed to CosyVoice2 in _ensure_model — confirm how the model
                chooses its device.
        """
        self.device = device
        self.model_id = model_id
        self._model = None
        self._model_sample_rate: Optional[int] = None
        logger.info(f"KoreanTTS will load model={model_id} on device={device}")

    def _ensure_model(self):
        """Lazy-load the CosyVoice2 model on first use."""
        if self._model is not None:
            return

        from cosyvoice import CosyVoice2

        logger.info(f"Loading CosyVoice2 model: {self.model_id}")
        self._model = CosyVoice2(self.model_id, load_jit=True, load_trt=False)
        self._model_sample_rate = self._model.sample_rate
        logger.info(
            f"CosyVoice2 loaded, native sample_rate={self._model_sample_rate}"
        )

    def _resample(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Resample audio to target sample rate (returns input unchanged when
        rates already match; librosa is only imported when actually needed)."""
        if orig_sr == target_sr:
            return audio
        import librosa

        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

    def synthesize(
        self,
        text: str,
        voice: str = DEFAULT_KOREAN_VOICE,
    ) -> Generator[np.ndarray, None, None]:
        """Synthesize Korean text to audio, yielding chunks as they're generated.

        Args:
            text: Korean text to synthesize.
            voice: Voice key from KOREAN_VOICES.

        Yields:
            Audio chunks as float32 numpy arrays at TARGET_SAMPLE_RATE (24kHz).

        Raises:
            ValueError: if `voice` is not a known voice key (via get_voice).
        """
        self._ensure_model()

        voice_config = get_voice(voice)
        speaker_id = voice_config.speaker_id

        logger.debug(f"Synthesizing: '{text[:50]}...' with voice={voice}")

        # NOTE(review): inference_zero_shot_streaming kwargs mirror the patch
        # author's usage — confirm against the installed cosyvoice version.
        for chunk_result in self._model.inference_zero_shot_streaming(
            tts_text=text,
            prompt_text="",
            prompt_speech_16k=None,
            stream=True,
            speed=1.0,
            speaker_id=speaker_id,
        ):
            # The model may yield dicts ({"tts_speech": tensor}) or raw tensors.
            if isinstance(chunk_result, dict) and "tts_speech" in chunk_result:
                audio_tensor = chunk_result["tts_speech"]
            elif isinstance(chunk_result, torch.Tensor):
                audio_tensor = chunk_result
            else:
                continue

            audio_np = audio_tensor.cpu().numpy().squeeze()
            # Skip 0-d scalars and empty chunks.
            if audio_np.ndim == 0 or len(audio_np) == 0:
                continue

            audio_np = self._resample(
                audio_np, self._model_sample_rate, TARGET_SAMPLE_RATE
            )
            # FIX: the docstring promises float32, but librosa.resample (and a
            # float64 model output) can yield float64 — cast before handing the
            # chunk to the opus writer.
            yield audio_np.astype(np.float32, copy=False)

    def synthesize_full(
        self,
        text: str,
        voice: str = DEFAULT_KOREAN_VOICE,
    ) -> np.ndarray:
        """Synthesize Korean text to a single audio array.

        Args:
            text: Korean text to synthesize.
            voice: Voice key from KOREAN_VOICES.

        Returns:
            Complete audio as float32 numpy array at TARGET_SAMPLE_RATE (24kHz);
            empty array if the model produced no chunks.
        """
        chunks = list(self.synthesize(text, voice))
        if not chunks:
            return np.array([], dtype=np.float32)
        return np.concatenate(chunks)
# SPDX-License-Identifier: MIT

"""Korean voice presets exposed to the frontend and the CosyVoice2 TTS backend."""

from dataclasses import dataclass
from typing import Dict, List


@dataclass
class KoreanVoice:
    """A selectable Korean TTS voice preset."""

    name: str  # Display name (Korean) shown in the UI.
    description: str  # Short English description of the voice character.
    gender: str  # "F" or "M".
    speaker_id: str  # CosyVoice2 speaker identifier used at synthesis time.


# (key, display name, description, gender, CosyVoice2 speaker id)
_PRESETS = (
    ("ko_female_1", "한국어 여성 1", "Korean Female Natural", "F", "korean_female_natural_1"),
    ("ko_female_2", "한국어 여성 2", "Korean Female Expressive", "F", "korean_female_expressive_1"),
    ("ko_male_1", "한국어 남성 1", "Korean Male Natural", "M", "korean_male_natural_1"),
    ("ko_male_2", "한국어 남성 2", "Korean Male Expressive", "M", "korean_male_expressive_1"),
)

# Insertion order matters: list_voices() presents voices in this order.
KOREAN_VOICES: Dict[str, KoreanVoice] = {
    key: KoreanVoice(name=name, description=desc, gender=gender, speaker_id=spk)
    for key, name, desc, gender, spk in _PRESETS
}

DEFAULT_KOREAN_VOICE = "ko_female_1"


def get_voice(voice_key: str) -> KoreanVoice:
    """Look up a voice preset by key.

    Raises:
        ValueError: if `voice_key` is not one of KOREAN_VOICES.
    """
    voice = KOREAN_VOICES.get(voice_key)
    if voice is not None:
        return voice
    raise ValueError(
        f"Unknown Korean voice '{voice_key}'. "
        f"Available voices: {list(KOREAN_VOICES.keys())}"
    )


def list_voices() -> List[Dict[str, str]]:
    """Return JSON-ready metadata for every preset (speaker_id deliberately omitted)."""
    entries: List[Dict[str, str]] = []
    for key, voice in KOREAN_VOICES.items():
        entries.append(
            {
                "key": key,
                "name": voice.name,
                "description": voice.description,
                "gender": voice.gender,
            }
        )
    return entries
+ ) + parser.add_argument( + "--llm-model", + type=str, + default="qwen2.5:7b", + help="LLM model name for Korean mode." + ) + parser.add_argument( + "--llm-api-key", + type=str, + default="ollama", + help="API key for the LLM endpoint." + ) + parser.add_argument( + "--whisper-model", + type=str, + default="large-v3", + help="Whisper model size for Korean ASR." + ) + parser.add_argument( + "--cosyvoice-model", + type=str, + default="FunAudioLLM/CosyVoice2-0.5B", + help="CosyVoice2 model for Korean TTS." + ) + args = parser.parse_args() args.voice_prompt_dir = _get_voice_prompt_dir( args.voice_prompt_dir, @@ -458,6 +497,32 @@ def main(): state.warmup() app = web.Application() app.router.add_get("/api/chat", state.handle_chat) + + # Korean language pipeline + if args.language in ("ko", "all"): + logger.info("initializing Korean language pipeline") + from .korean.asr import KoreanASR + from .korean.llm import KoreanLLM + from .korean.tts import KoreanTTS + from .korean.pipeline import KoreanPipeline + + ko_device = "cuda" if args.device.type == "cuda" else "cpu" + ko_asr = KoreanASR( + model_size=args.whisper_model, + device=ko_device, + ) + ko_llm = KoreanLLM( + endpoint=args.llm_endpoint, + model=args.llm_model, + api_key=args.llm_api_key, + ) + ko_tts = KoreanTTS( + model_id=args.cosyvoice_model, + device=ko_device, + ) + ko_pipeline = KoreanPipeline(asr=ko_asr, llm=ko_llm, tts=ko_tts) + app.router.add_get("/api/chat-ko", ko_pipeline.handle_chat) + logger.info("Korean pipeline ready at /api/chat-ko") if static_path is not None: async def handle_root(_): return web.FileResponse(os.path.join(static_path, "index.html")) diff --git a/moshi/requirements.txt b/moshi/requirements.txt index e0608228..41477f33 100644 --- a/moshi/requirements.txt +++ b/moshi/requirements.txt @@ -6,4 +6,9 @@ sentencepiece==0.2 sounddevice==0.5 sphn>=0.1.4,<0.2 torch>=2.2.0,<2.5 -aiohttp>=3.10.5,<3.11 \ No newline at end of file +aiohttp>=3.10.5,<3.11 + +# Korean language support 
+faster-whisper>=1.0.0
+openai>=1.0.0
+librosa>=0.10.0
+# NOTE: the `cosyvoice` package imported by moshi/moshi/korean/tts.py is not
+# listed here — install CosyVoice2 separately (see its upstream instructions)
+# before running with --language ko or --language all.
{t("stats.audioPlayed")} {convertMinSecs(audioStats.playedAudioDuration)}
{t("stats.missedAudio")} {convertMinSecs(audioStats.missedAudioDuration)}
{t("stats.latency")} {(movingAverageSum.current / movingAverageCount.current).toFixed(3)}
{t("stats.minMaxBuffer")} {audioStats.minPlaybackDelay.toFixed(3)} / {audioStats.maxPlaybackDelay.toFixed(3)}