@@ -75,15 +75,13 @@ def extract_voice_prompt(
7575 self ,
7676 ref_audio : Tuple [Any , int ],
7777 ref_text : Optional [str ] = None ,
78- language : str = "English" ,
7978 ) -> str :
8079 """
8180 Extract a reusable voice prompt from reference audio.
8281
8382 Args:
8483 ref_audio: Tuple of (audio_data, sample_rate)
8584 ref_text: Optional transcript of reference audio
86- language: Language of the reference audio
8785
8886 Returns:
8987 Base64-encoded voice prompt that can be reused
@@ -166,11 +164,15 @@ def synthesize(
166164
167165 if ref_audio :
168166 # Voice cloning path - use Base model
167+ # Normalize ref_text - strip surrounding whitespace; None or "" becomes None (note: whitespace-only input strips to "", not None)
168+ normalized_ref_text = ref_text .strip () if ref_text else None
169+
169170 wavs , sr = self .base_model .generate_voice_clone (
170171 text = text ,
171172 language = language ,
172173 ref_audio = ref_audio ,
173- ref_text = ref_text ,
174+ ref_text = normalized_ref_text ,
175+ max_new_tokens = 2048 ,
174176 )
175177 else :
176178 # Basic TTS path - use CustomVoice model with preset speaker
@@ -213,17 +215,18 @@ def extract_voice_prompt(
213215 self ,
214216 ref_audio : Tuple [Any , int ],
215217 ref_text : Optional [str ] = None ,
216- language : str = "English" ,
217218 ) -> str :
218219 """Extract a reusable voice prompt from reference audio."""
219220 if self .base_model is None :
220221 raise RuntimeError ("Base model not loaded" )
221222
223+ # Normalize ref_text - strip surrounding whitespace; None or "" becomes None (note: whitespace-only input strips to "", not None)
224+ normalized_ref_text = ref_text .strip () if ref_text else None
225+
222226 # Use the Base model's create_voice_clone_prompt method
223227 voice_prompt = self .base_model .create_voice_clone_prompt (
224228 ref_audio = ref_audio ,
225- ref_text = ref_text ,
226- language = language ,
229+ ref_text = normalized_ref_text ,
227230 )
228231
229232 # Serialize to base64 - voice_prompt is typically tensor data
@@ -259,6 +262,7 @@ def synthesize_with_prompt(
259262 text = text ,
260263 language = language ,
261264 voice_clone_prompt = prompt_tensor ,
265+ max_new_tokens = 2048 ,
262266 )
263267
264268 # Convert to WAV bytes
@@ -309,7 +313,7 @@ async def lifespan(app: FastAPI):
309313app = FastAPI (
310314 title = "TTS Server" ,
311315 description = "Multi-model text-to-speech API with voice cloning support" ,
312- version = "0.3.0 " ,
316+ version = "0.3.1 " ,
313317 lifespan = lifespan ,
314318)
315319
@@ -340,10 +344,9 @@ async def list_speakers():
340344
341345
342346@app .post ("/v1/voice/extract" )
343- async def extract_voice_prompt (
347+ async def extract_voice_prompt_endpoint (
344348 ref_audio : UploadFile = File (..., description = "Reference audio for voice extraction" ),
345349 ref_text : Optional [str ] = Form (None , description = "Transcript of reference audio" ),
346- language : str = Form ("English" , description = "Language of the reference audio" ),
347350):
348351 """
349352 Extract a reusable voice prompt from reference audio.
@@ -369,14 +372,12 @@ async def extract_voice_prompt(
369372 voice_prompt = backend .extract_voice_prompt (
370373 ref_audio = ref_audio_data ,
371374 ref_text = ref_text ,
372- language = language ,
373375 )
374376
375377 return JSONResponse ({
376378 "voice_prompt" : voice_prompt ,
377379 "format" : "base64-numpy" ,
378380 "ref_text" : ref_text ,
379- "language" : language ,
380381 })
381382
382383 except NotImplementedError as e :
0 commit comments