Skip to content

Commit b4bb456

Browse files
committed
fix: correct qwen_tts API parameter issues
Bug fixes:
1. Remove the unsupported 'language' param from create_voice_clone_prompt.
2. Normalize ref_text (strip whitespace, convert empty to None).
3. Add max_new_tokens=2048 to generate_voice_clone calls.

The ref_text parameter must be properly normalized to avoid the model misinterpreting the voice cloning request.
1 parent 9cd19f1 commit b4bb456

File tree

2 files changed

+14
-14
lines changed

2 files changed

+14
-14
lines changed

server.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,13 @@ def extract_voice_prompt(
7575
self,
7676
ref_audio: Tuple[Any, int],
7777
ref_text: Optional[str] = None,
78-
language: str = "English",
7978
) -> str:
8079
"""
8180
Extract a reusable voice prompt from reference audio.
8281
8382
Args:
8483
ref_audio: Tuple of (audio_data, sample_rate)
8584
ref_text: Optional transcript of reference audio
86-
language: Language of the reference audio
8785
8886
Returns:
8987
Base64-encoded voice prompt that can be reused
@@ -166,11 +164,15 @@ def synthesize(
166164

167165
if ref_audio:
168166
# Voice cloning path - use Base model
167+
# Normalize ref_text - strip whitespace, convert empty to None
168+
normalized_ref_text = ref_text.strip() if ref_text else None
169+
169170
wavs, sr = self.base_model.generate_voice_clone(
170171
text=text,
171172
language=language,
172173
ref_audio=ref_audio,
173-
ref_text=ref_text,
174+
ref_text=normalized_ref_text,
175+
max_new_tokens=2048,
174176
)
175177
else:
176178
# Basic TTS path - use CustomVoice model with preset speaker
@@ -213,17 +215,18 @@ def extract_voice_prompt(
213215
self,
214216
ref_audio: Tuple[Any, int],
215217
ref_text: Optional[str] = None,
216-
language: str = "English",
217218
) -> str:
218219
"""Extract a reusable voice prompt from reference audio."""
219220
if self.base_model is None:
220221
raise RuntimeError("Base model not loaded")
221222

223+
# Normalize ref_text - strip whitespace, convert empty to None
224+
normalized_ref_text = ref_text.strip() if ref_text else None
225+
222226
# Use the Base model's create_voice_clone_prompt method
223227
voice_prompt = self.base_model.create_voice_clone_prompt(
224228
ref_audio=ref_audio,
225-
ref_text=ref_text,
226-
language=language,
229+
ref_text=normalized_ref_text,
227230
)
228231

229232
# Serialize to base64 - voice_prompt is typically tensor data
@@ -259,6 +262,7 @@ def synthesize_with_prompt(
259262
text=text,
260263
language=language,
261264
voice_clone_prompt=prompt_tensor,
265+
max_new_tokens=2048,
262266
)
263267

264268
# Convert to WAV bytes
@@ -309,7 +313,7 @@ async def lifespan(app: FastAPI):
309313
app = FastAPI(
310314
title="TTS Server",
311315
description="Multi-model text-to-speech API with voice cloning support",
312-
version="0.3.0",
316+
version="0.3.1",
313317
lifespan=lifespan,
314318
)
315319

@@ -340,10 +344,9 @@ async def list_speakers():
340344

341345

342346
@app.post("/v1/voice/extract")
343-
async def extract_voice_prompt(
347+
async def extract_voice_prompt_endpoint(
344348
ref_audio: UploadFile = File(..., description="Reference audio for voice extraction"),
345349
ref_text: Optional[str] = Form(None, description="Transcript of reference audio"),
346-
language: str = Form("English", description="Language of the reference audio"),
347350
):
348351
"""
349352
Extract a reusable voice prompt from reference audio.
@@ -369,14 +372,12 @@ async def extract_voice_prompt(
369372
voice_prompt = backend.extract_voice_prompt(
370373
ref_audio=ref_audio_data,
371374
ref_text=ref_text,
372-
language=language,
373375
)
374376

375377
return JSONResponse({
376378
"voice_prompt": voice_prompt,
377379
"format": "base64-numpy",
378380
"ref_text": ref_text,
379-
"language": language,
380381
})
381382

382383
except NotImplementedError as e:

tests/test_server.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@ def mock_custom_voice(text, language, speaker):
2424
return [audio], sample_rate
2525

2626
# Mock generate_voice_clone for voice cloning
27-
def mock_voice_clone(text, language, ref_audio=None, ref_text=None, voice_clone_prompt=None):
27+
def mock_voice_clone(text, language, ref_audio=None, ref_text=None, voice_clone_prompt=None, max_new_tokens=2048):
2828
sample_rate = 24000
2929
audio = np.zeros(sample_rate, dtype=np.float32)
3030
return [audio], sample_rate
3131

3232
# Mock create_voice_clone_prompt for voice embedding extraction
33-
def mock_create_prompt(ref_audio, ref_text=None, language="English"):
33+
def mock_create_prompt(ref_audio, ref_text=None):
3434
# Return a mock tensor-like object
3535
return np.zeros((256,), dtype=np.float32)
3636

@@ -194,7 +194,6 @@ def test_extract_voice_prompt(self, client, mock_qwen3_model):
194194
"/v1/voice/extract",
195195
data={
196196
"ref_text": "This is a reference transcript",
197-
"language": "English"
198197
},
199198
files={"ref_audio": ("ref.wav", wav_buffer, "audio/wav")}
200199
)

0 commit comments

Comments
 (0)