diff --git a/README.md b/README.md index 2f91a14..752f599 100644 --- a/README.md +++ b/README.md @@ -128,17 +128,15 @@ curl http://localhost:8000/api/v1/health **Batch Transcription** ```bash -curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper" \ - -H "accept: application/json" \ - -H "Content-Type: multipart/form-data" \ - -F "file=@/path/to/audio.wav" +curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper&language=en" \ + -F "audio=@/path/to/audio.wav" ``` **Real-time Streaming (SSE)** ```bash curl -N -X POST "http://localhost:8000/api/v1/stt/transcribe/stream?engine=whisper" \ - -F "file=@/path/to/audio.wav" + -F "audio=@/path/to/audio.wav" ``` ### 🔊 Text-to-Speech (TTS) @@ -146,13 +144,24 @@ curl -N -X POST "http://localhost:8000/api/v1/stt/transcribe/stream?engine=whisp **Batch Synthesis** ```bash -curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world" +curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \ + -F "text=Hello world" ``` **Streaming Synthesis** ```bash -curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm&text=Hello%20world" +curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm" \ + -F "text=Hello world" +``` + +**Voice Cloning** + +```bash +curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \ + -F "text=Hello world" \ + -F "reference_audio=@/path/to/reference.wav" \ + -F "reference_text=This is the reference transcript" ``` --- @@ -182,7 +191,7 @@ Detailed documentation is available in the `docs/` directory: | Engine | Backend | Status | Features | | :--- | :--- | :---: | :--- | -| **VoxCPM** | `voxcpm` | ✅ Ready | Zero-shot voice cloning, streaming, 24kHz | +| **VoxCPM** | `voxcpm` | ✅ Ready | Zero-shot voice cloning, streaming | | **Coqui TTS** | `TTS` | 🚧 Planned | High-quality open source voices | | **OpenAI TTS** | OpenAI API | 🚧 Planned | Natural sounding commercial voices | diff --git a/app/api/routers/stt.py b/app/api/routers/stt.py index 4e4cc35..436ce09 100644 --- a/app/api/routers/stt.py +++ b/app/api/routers/stt.py @@ -4,6 +4,7 @@ from fastapi import ( APIRouter, Depends, + Form, HTTPException, Query, WebSocket, @@ -23,7 +24,7 @@ async def transcribe_audio( audio: Annotated[bytes, Depends(validate_audio_upload)], language: str | None = Query(None, description="Language hint"), - engine_params: str | None = Query(None, description="JSON engine parameters"), + engine_params: str | None = Form(None, description="JSON engine parameters"), stt_engine: BaseSTTEngine = Depends(get_stt_engine), ): """ @@ -32,6 +33,8 @@ async def transcribe_audio( Query params: - engine: STT engine name (required, e.g., "whisper") - language: Optional language hint + + Form params: - engine_params: Optional JSON engine parameters Returns complete transcription with segments and metrics. @@ -50,8 +53,8 @@ async def transcribe_audio( @router.post("/transcribe/stream") async def transcribe_audio_stream( audio: Annotated[bytes, Depends(validate_audio_upload)], - language: str | None = Query(None), - engine_params: str | None = Query(None), + language: str | None = Query(None, description="Language hint"), + engine_params: str | None = Form(None, description="JSON engine parameters"), stt_engine: BaseSTTEngine = Depends(get_stt_engine), ): """ @@ -59,6 +62,10 @@ async def transcribe_audio_stream( Query params: - engine: STT engine name (required, e.g., "whisper") + - language: Optional language hint + + Form params: + - engine_params: Optional JSON engine parameters Returns progressive chunks followed by final response. Event types: "chunk" (STTChunk), "complete" (STTResponse) diff --git a/app/api/routers/tts.py b/app/api/routers/tts.py index ae83231..8ee1ea6 100644 --- a/app/api/routers/tts.py +++ b/app/api/routers/tts.py @@ -1,6 +1,6 @@ import json -from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile from sse_starlette.sse import EventSourceResponse from app.api.deps import get_tts_engine @@ -12,10 +12,16 @@ @router.post("/synthesize", response_model=TTSResponse) async def synthesize_text( - text: str = Query(..., description="Text to synthesize"), + text: str = Form(..., description="Text to synthesize"), + reference_audio: UploadFile | None = File( + None, description="Reference audio for voice cloning" + ), + reference_text: str | None = Form( + None, description="Transcript of reference audio" + ), + engine_params: str | None = Form(None, description="JSON engine parameters"), voice: str | None = Query(None, description="Voice name/ID to use"), speed: float = Query(1.0, gt=0, le=3.0, description="Speech speed multiplier"), - engine_params: str | None = Query(None, description="JSON engine parameters"), tts_engine: BaseTTSEngine = Depends(get_tts_engine), ): """ @@ -23,9 +29,13 @@ async def synthesize_text( Query params: - engine: TTS engine name (required) - - text: Text to synthesize (required) - voice: Optional voice name/ID - speed: Speech speed multiplier (0 < speed <= 3.0) + + Form params: + - text: Text to synthesize (required) + - reference_audio: Optional reference audio file for voice cloning + - reference_text: Optional transcript of reference audio - engine_params: Optional JSON engine parameters Returns complete audio (base64 encoded) with metrics. @@ -37,17 +47,34 @@ async def synthesize_text( except json.JSONDecodeError as e: raise HTTPException(400, "Invalid engine_params JSON") from e - result = await tts_engine.synthesize(text, voice=voice, speed=speed, **kwargs) + # Read reference audio bytes if provided + reference_audio_bytes = None + if reference_audio: + reference_audio_bytes = await reference_audio.read() + result = await tts_engine.synthesize( + text, + voice=voice, + speed=speed, + reference_audio=reference_audio_bytes, + reference_text=reference_text, + **kwargs, + ) return result @router.post("/synthesize/stream") async def synthesize_text_stream( - text: str = Query(..., description="Text to synthesize"), + text: str = Form(..., description="Text to synthesize"), + reference_audio: UploadFile | None = File( + None, description="Reference audio for voice cloning" + ), + reference_text: str | None = Form( + None, description="Transcript of reference audio" + ), + engine_params: str | None = Form(None, description="JSON engine parameters"), voice: str | None = Query(None, description="Voice name/ID to use"), speed: float = Query(1.0, gt=0, le=3.0, description="Speech speed multiplier"), - engine_params: str | None = Query(None, description="JSON engine parameters"), tts_engine: BaseTTSEngine = Depends(get_tts_engine), ): """ @@ -55,9 +82,13 @@ async def synthesize_text_stream( Query params: - engine: TTS engine name (required) - - text: Text to synthesize (required) - voice: Optional voice name/ID - speed: Speech speed multiplier (0 < speed <= 3.0) + + Form params: + - text: Text to synthesize (required) + - reference_audio: Optional reference audio file for voice cloning + - reference_text: Optional transcript of reference audio - engine_params: Optional JSON engine parameters Returns progressive audio chunks followed by final response. @@ -70,9 +101,19 @@ async def synthesize_text_stream( except json.JSONDecodeError as e: raise HTTPException(400, "Invalid engine_params JSON") from e + # Read reference audio bytes if provided + reference_audio_bytes = None + if reference_audio: + reference_audio_bytes = await reference_audio.read() + async def event_generator(): async for result in tts_engine.synthesize_stream( - text, voice=voice, speed=speed, **kwargs + text, + voice=voice, + speed=speed, + reference_audio=reference_audio_bytes, + reference_text=reference_text, + **kwargs, ): if isinstance(result, TTSChunk): yield {"event": "chunk", "data": result.model_dump_json()} diff --git a/app/engines/base.py b/app/engines/base.py index 86c687c..d3e9ce1 100644 --- a/app/engines/base.py +++ b/app/engines/base.py @@ -194,7 +194,13 @@ class BaseTTSEngine(BaseEngine): @abstractmethod async def synthesize( - self, text: str, voice: str | None = None, speed: float = 1.0, **kwargs + self, + text: str, + voice: str | None = None, + speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, + **kwargs, ) -> TTSResponse: """ Synthesize text to speech (invoke/batch mode) @@ -203,6 +209,8 @@ async def synthesize( text: Text to synthesize voice: Optional voice name (overrides config default) speed: Speech speed (1.0 = normal, overrides config default) + reference_audio: Reference audio bytes for voice cloning + reference_text: Transcript of reference audio for voice cloning **kwargs: Additional engine-specific parameters (passed via engine_params) Returns: @@ -212,7 +220,13 @@ async def synthesize( @abstractmethod async def synthesize_stream( - self, text: str, **kwargs + self, + text: str, + voice: str | None = None, + speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, + **kwargs, ) -> AsyncIterator[TTSChunk | TTSResponse]: """ Synthesize text to speech (streaming mode) @@ -222,7 +236,11 @@ async def synthesize_stream( Args: text: Text to synthesize - **kwargs: Engine-specific params (voice, speed, etc.) + voice: Optional voice name (overrides config default) + speed: Speech speed (1.0 = normal, overrides config default) + reference_audio: Reference audio bytes for voice cloning + reference_text: Transcript of reference audio for voice cloning + **kwargs: Additional engine-specific parameters (passed via engine_params) Yields: TTSChunk: Audio chunks with progressive generation diff --git a/app/engines/tts/voxcpm/engine.py b/app/engines/tts/voxcpm/engine.py index 585e170..cf9d9fc 100644 --- a/app/engines/tts/voxcpm/engine.py +++ b/app/engines/tts/voxcpm/engine.py @@ -6,7 +6,6 @@ import time from collections.abc import AsyncIterator -from pathlib import Path import numpy as np @@ -15,7 +14,7 @@ from app.exceptions import EngineNotReadyError, SynthesisError from app.models.engine import TTSChunk, TTSResponse from app.models.metrics import TTSPerformanceMetrics -from app.utils.audio import AudioProcessor +from app.utils.audio import AudioProcessor, temp_audio_file class VoxCPMEngine(BaseTTSEngine): @@ -72,6 +71,8 @@ async def synthesize( text: str, voice: str | None = None, speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, **kwargs, ) -> TTSResponse: """ @@ -79,11 +80,10 @@ async def synthesize( Args: text: Text to synthesize - voice: Not used (voice cloning via kwargs instead) + voice: Not used (voice cloning via reference_audio instead) speed: Not directly supported by VoxCPM - **kwargs: Additional parameters: - - prompt_wav_path: Path to reference audio for voice cloning - - prompt_text: Transcript of reference audio + reference_audio: Reference audio bytes for voice cloning + reference_text: Transcript of reference audio Returns: TTSResponse with audio data and metrics @@ -95,43 +95,36 @@ async def synthesize( if self._model is None: raise EngineNotReadyError("VoxCPM model not loaded") - # Extract voice cloning parameters - prompt_wav_path = kwargs.get("prompt_wav_path") - prompt_text = kwargs.get("prompt_text") - - # Validate prompt audio if provided - if prompt_wav_path is not None: - prompt_path = Path(prompt_wav_path) - if not prompt_path.exists(): - raise SynthesisError(f"Prompt audio file not found: {prompt_wav_path}") - - processing_start = time.time() - try: - # Generate audio using VoxCPM - wav = self._model.generate( - text=text, - prompt_wav_path=prompt_wav_path, - prompt_text=prompt_text, - cfg_value=self.voxcpm_config.cfg_value, - inference_timesteps=self.voxcpm_config.inference_timesteps, - normalize=self.voxcpm_config.normalize, - denoise=self.voxcpm_config.denoise, - retry_badcase=self.voxcpm_config.retry_badcase, - retry_badcase_max_times=self.voxcpm_config.retry_badcase_max_times, - retry_badcase_ratio_threshold=self.voxcpm_config.retry_badcase_ratio_threshold, - ) + with temp_audio_file(reference_audio) as prompt_wav_path: + processing_start = time.time() + try: + # Generate audio using VoxCPM + wav = self._model.generate( + text=text, + prompt_wav_path=prompt_wav_path, + prompt_text=reference_text, + cfg_value=self.voxcpm_config.cfg_value, + inference_timesteps=self.voxcpm_config.inference_timesteps, + normalize=self.voxcpm_config.normalize, + denoise=self.voxcpm_config.denoise, + retry_badcase=self.voxcpm_config.retry_badcase, + retry_badcase_max_times=self.voxcpm_config.retry_badcase_max_times, + retry_badcase_ratio_threshold=self.voxcpm_config.retry_badcase_ratio_threshold, + ) - # Get sample rate from model - sample_rate = self._model.tts_model.sample_rate + # Get sample rate from model + sample_rate = self._model.tts_model.sample_rate - # Convert numpy array to bytes (16-bit PCM WAV) - audio_bytes = self.audio_processor.numpy_to_wav_bytes(wav, sample_rate) + # Convert numpy array to bytes (16-bit PCM WAV) + audio_bytes = self.audio_processor.numpy_to_wav_bytes(wav, sample_rate) - # Calculate duration - duration_seconds = len(wav) / sample_rate + # Calculate duration + duration_seconds = len(wav) / sample_rate - except Exception as e: - raise SynthesisError(f"VoxCPM synthesis failed: {e}") from e + except Exception as e: + if isinstance(e, (EngineNotReadyError, SynthesisError)): + raise + raise SynthesisError(f"VoxCPM synthesis failed: {e}") from e processing_end = time.time() end_time = time.time() @@ -164,6 +157,10 @@ async def synthesize( async def synthesize_stream( self, text: str, + voice: str | None = None, + speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, **kwargs, ) -> AsyncIterator[TTSChunk | TTSResponse]: """ @@ -171,7 +168,10 @@ async def synthesize_stream( Args: text: Text to synthesize - **kwargs: Additional parameters (same as synthesize) + voice: Not used (voice cloning via reference_audio instead) + speed: Not directly supported by VoxCPM + reference_audio: Reference audio bytes for voice cloning + reference_text: Transcript of reference audio Yields: TTSChunk: Audio chunks with progressive generation @@ -187,104 +187,95 @@ async def synthesize_stream( if self._model is None: raise EngineNotReadyError("VoxCPM model not loaded") - # Extract voice cloning parameters - prompt_wav_path = kwargs.get("prompt_wav_path") - prompt_text = kwargs.get("prompt_text") - - # Validate prompt audio if provided - if prompt_wav_path is not None: - prompt_path = Path(prompt_wav_path) - if not prompt_path.exists(): - raise SynthesisError(f"Prompt audio file not found: {prompt_wav_path}") - - try: - # Stream audio chunks - for chunk in self._model.generate_streaming( - text=text, - prompt_wav_path=prompt_wav_path, - prompt_text=prompt_text, - cfg_value=self.voxcpm_config.cfg_value, - inference_timesteps=self.voxcpm_config.inference_timesteps, - normalize=self.voxcpm_config.normalize, - denoise=self.voxcpm_config.denoise, - retry_badcase=self.voxcpm_config.retry_badcase, - retry_badcase_max_times=self.voxcpm_config.retry_badcase_max_times, - retry_badcase_ratio_threshold=self.voxcpm_config.retry_badcase_ratio_threshold, - ): - chunk_time = time.time() - - if first_chunk_time is None: - first_chunk_time = chunk_time - - # Store raw numpy chunk for final concatenation - all_audio_chunks.append(chunk) - - # Get sample rate from model - sample_rate = self._model.tts_model.sample_rate - - # Convert chunk to bytes - chunk_bytes = self.audio_processor.numpy_to_wav_bytes( - chunk, sample_rate + with temp_audio_file(reference_audio) as prompt_wav_path: + try: + # Stream audio chunks + for chunk in self._model.generate_streaming( + text=text, + prompt_wav_path=prompt_wav_path, + prompt_text=reference_text, + cfg_value=self.voxcpm_config.cfg_value, + inference_timesteps=self.voxcpm_config.inference_timesteps, + normalize=self.voxcpm_config.normalize, + denoise=self.voxcpm_config.denoise, + retry_badcase=self.voxcpm_config.retry_badcase, + retry_badcase_max_times=self.voxcpm_config.retry_badcase_max_times, + retry_badcase_ratio_threshold=self.voxcpm_config.retry_badcase_ratio_threshold, + ): + chunk_time = time.time() + + if first_chunk_time is None: + first_chunk_time = chunk_time + + # Store raw numpy chunk for final concatenation + all_audio_chunks.append(chunk) + + # Get sample rate from model + sample_rate = self._model.tts_model.sample_rate + + # Convert chunk to bytes + chunk_bytes = self.audio_processor.numpy_to_wav_bytes( + chunk, sample_rate + ) + + chunk_latency_ms = (chunk_time - start_time) * 1000 + + yield TTSChunk( + audio_data=chunk_bytes, + sequence_number=total_chunks, + chunk_latency_ms=chunk_latency_ms, + ) + + total_chunks += 1 + + # Final response + end_time = time.time() + + # Concatenate all chunks + if all_audio_chunks: + full_audio = np.concatenate(all_audio_chunks) + sample_rate = self._model.tts_model.sample_rate + audio_bytes = self.audio_processor.numpy_to_wav_bytes( + full_audio, sample_rate + ) + duration_seconds = len(full_audio) / sample_rate + else: + audio_bytes = b"" + sample_rate = 16000 + duration_seconds = 0.0 + + total_duration_ms = (end_time - start_time) * 1000 + time_to_first_byte_ms = ( + (first_chunk_time - start_time) * 1000 if first_chunk_time else None ) - chunk_latency_ms = (chunk_time - start_time) * 1000 - - yield TTSChunk( - audio_data=chunk_bytes, - sequence_number=total_chunks, - chunk_latency_ms=chunk_latency_ms, + metrics = TTSPerformanceMetrics( + latency_ms=total_duration_ms, + processing_time_ms=total_duration_ms, + audio_duration_ms=duration_seconds * 1000, + real_time_factor=( + total_duration_ms / (duration_seconds * 1000) + if duration_seconds > 0 + else None + ), + characters_processed=len(text), + time_to_first_byte_ms=time_to_first_byte_ms, + total_stream_duration_ms=total_duration_ms, + total_chunks=total_chunks, ) - total_chunks += 1 - - # Final response - end_time = time.time() - - # Concatenate all chunks - if all_audio_chunks: - full_audio = np.concatenate(all_audio_chunks) - sample_rate = self._model.tts_model.sample_rate - audio_bytes = self.audio_processor.numpy_to_wav_bytes( - full_audio, sample_rate + yield TTSResponse( + audio_data=audio_bytes, + sample_rate=sample_rate, + duration_seconds=duration_seconds, + format="wav", + performance_metrics=metrics, ) - duration_seconds = len(full_audio) / sample_rate - else: - audio_bytes = b"" - sample_rate = 16000 - duration_seconds = 0.0 - - total_duration_ms = (end_time - start_time) * 1000 - time_to_first_byte_ms = ( - (first_chunk_time - start_time) * 1000 if first_chunk_time else None - ) - - metrics = TTSPerformanceMetrics( - latency_ms=total_duration_ms, - processing_time_ms=total_duration_ms, - audio_duration_ms=duration_seconds * 1000, - real_time_factor=( - total_duration_ms / (duration_seconds * 1000) - if duration_seconds > 0 - else None - ), - characters_processed=len(text), - time_to_first_byte_ms=time_to_first_byte_ms, - total_stream_duration_ms=total_duration_ms, - total_chunks=total_chunks, - ) - yield TTSResponse( - audio_data=audio_bytes, - sample_rate=sample_rate, - duration_seconds=duration_seconds, - format="wav", - performance_metrics=metrics, - ) - - except Exception as e: - if isinstance(e, (EngineNotReadyError, SynthesisError)): - raise - raise SynthesisError(f"VoxCPM streaming failed: {e}") from e + except Exception as e: + if isinstance(e, (EngineNotReadyError, SynthesisError)): + raise + raise SynthesisError(f"VoxCPM streaming failed: {e}") from e @property def supported_voices(self) -> list[str]: diff --git a/app/utils/audio.py b/app/utils/audio.py index 1569a4c..85a6fff 100644 --- a/app/utils/audio.py +++ b/app/utils/audio.py @@ -2,6 +2,10 @@ import io import pathlib +import tempfile +from collections.abc import Generator +from contextlib import contextmanager +from pathlib import Path import librosa import numpy as np @@ -11,6 +15,39 @@ from app.types.audio import AudioInput +@contextmanager +def temp_audio_file( + audio_bytes: bytes | None, suffix: str = ".wav" +) -> Generator[str | None]: + """ + Context manager that saves audio bytes to a temp file and cleans up after use. + + Args: + audio_bytes: Audio data as bytes, or None (yields None without creating file) + suffix: File extension (default: ".wav") + + Yields: + Path to the temporary file, or None if audio_bytes is None + + Example: + with temp_audio_file(audio_bytes) as temp_path: + model.generate(prompt_wav_path=temp_path) + # temp file is automatically deleted + """ + if audio_bytes is None: + yield None + return + + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file: + temp_file.write(audio_bytes) + temp_path = temp_file.name + + try: + yield temp_path + finally: + Path(temp_path).unlink(missing_ok=True) + + class AudioProcessor: """Handles audio format conversion, resampling, and validation""" diff --git a/docs/api.md b/docs/api.md index 28fd72e..e159d07 100644 --- a/docs/api.md +++ b/docs/api.md @@ -79,18 +79,23 @@ Batch transcription - upload audio file and receive complete transcription. |-----------|------|----------|-------------| | `engine` | string | Yes | Engine name (e.g., "whisper") | | `language` | string | No | Language hint (e.g., "en", "es") | + +**Form Parameters:** + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `audio` | file | Yes | Audio file (wav, mp3, flac, ogg, m4a, opus) | | `engine_params` | string | No | JSON string with engine-specific parameters | **Request:** - Content-Type: `multipart/form-data` -- Body: `file` - Audio file (wav, mp3, flac, ogg, m4a, opus) **Example:** ```bash curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper&language=en" \ - -F "file=@audio.wav" + -F "audio=@audio.wav" ``` **Response:** @@ -249,15 +254,36 @@ Batch synthesis - convert text to speech audio. | Parameter | Type | Required | Description | |-----------|------|----------|-------------| | `engine` | string | Yes | Engine name (e.g., "voxcpm") | -| `text` | string | Yes | Text to synthesize | | `voice` | string | No | Voice name/ID to use | | `speed` | float | No | Speech speed multiplier (0 < speed <= 3.0, default: 1.0) | + +**Form Parameters:** + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `text` | string | Yes | Text to synthesize | +| `reference_audio` | file | No | Reference audio file for voice cloning | +| `reference_text` | string | No | Transcript of reference audio (required with reference_audio) | | `engine_params` | string | No | JSON string with engine-specific parameters | +**Request:** + +- Content-Type: `multipart/form-data` + **Example:** ```bash -curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world" +curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \ + -F "text=Hello world" +``` + +**Voice Cloning Example:** + +```bash +curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \ + -F "text=Hello world" \ + -F "reference_audio=@/path/to/reference.wav" \ + -F "reference_text=This is the reference transcript" ``` **Response:** @@ -286,6 +312,14 @@ SSE (Server-Sent Events) streaming synthesis - receive progressive audio chunks. Same as `/tts/synthesize`. +**Form Parameters:** + +Same as `/tts/synthesize`. + +**Request:** + +- Content-Type: `multipart/form-data` + **Response:** Server-Sent Events stream with two event types: @@ -307,32 +341,32 @@ data: {"audio_data": "", "sample_rate": 22050, "duration_seco **Example:** ```bash -curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm&text=Hello%20world" +curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm" \ + -F "text=Hello world" ``` **JavaScript Client Example:** ```javascript -const params = new URLSearchParams({ - engine: 'voxcpm', - text: 'Hello, how are you today?' -}); +const formData = new FormData(); +formData.append('text', 'Hello, how are you today?'); -const eventSource = new EventSource( - `http://localhost:8000/api/v1/tts/synthesize/stream?${params}` +const response = await fetch( + 'http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm', + { method: 'POST', body: formData } ); -eventSource.addEventListener('chunk', (event) => { - const chunk = JSON.parse(event.data); - // Process audio chunk - console.log('Chunk:', chunk.sequence_number); -}); +const reader = response.body.getReader(); +const decoder = new TextDecoder(); -eventSource.addEventListener('complete', (event) => { - const result = JSON.parse(event.data); - console.log('Complete, duration:', result.duration_seconds); - eventSource.close(); -}); +while (true) { + const { done, value } = await reader.read(); + if (done) break; + + const text = decoder.decode(value); + // Parse SSE events from text + console.log('Received:', text); +} ``` --- @@ -474,7 +508,7 @@ All errors follow this format: ### Whisper (faster-whisper) -Pass these via `engine_params` query parameter as JSON string. +Pass these via `engine_params` form parameter as JSON string. | Parameter | Type | Default | Description | |-----------|------|---------|-------------| @@ -489,29 +523,34 @@ Pass these via `engine_params` query parameter as JSON string. **Example:** ```bash -curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper&engine_params={\"beam_size\":3,\"vad_filter\":true}" \ - -F "file=@audio.wav" +curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper" \ + -F "audio=@audio.wav" \ + -F 'engine_params={"beam_size":3,"vad_filter":true}' ``` ### VoxCPM (Text-to-Speech) -Pass these via `engine_params` query parameter as JSON string. +VoxCPM supports zero-shot voice cloning using reference audio. Pass the reference audio and its transcript using the top-level form parameters. -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `prompt_wav_path` | string | null | Path to reference audio for zero-shot voice cloning | -| `prompt_text` | string | null | Transcript of the reference audio (required with prompt_wav_path) | +| Parameter | Type | Description | +|-----------|------|-------------| +| `reference_audio` | file | Reference audio file for voice cloning (wav format recommended) | +| `reference_text` | string | Transcript of the reference audio (required with reference_audio) | **Basic Example:** ```bash -curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world" +curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \ + -F "text=Hello world" ``` **Voice Cloning Example:** ```bash -curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world&engine_params={\"prompt_wav_path\":\"/path/to/reference.wav\",\"prompt_text\":\"This%20is%20the%20reference%20transcript\"}" +curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \ + -F "text=Hello world" \ + -F "reference_audio=@/path/to/reference.wav" \ + -F "reference_text=This is the reference transcript" ``` --- diff --git a/docs/custom-engines.md b/docs/custom-engines.md index 80d6cfc..f2afa0d 100644 --- a/docs/custom-engines.md +++ b/docs/custom-engines.md @@ -422,6 +422,8 @@ class MyTTSEngine(BaseTTSEngine): text: str, voice: str | None = None, speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, **kwargs ) -> TTSResponse: """ @@ -431,6 +433,8 @@ class MyTTSEngine(BaseTTSEngine): text: Text to synthesize voice: Voice name (uses default if None) speed: Speech speed multiplier + reference_audio: Reference audio bytes for voice cloning (optional) + reference_text: Transcript of reference audio (required with reference_audio) **kwargs: Additional engine parameters Returns: @@ -448,6 +452,21 @@ class MyTTSEngine(BaseTTSEngine): processing_start = time.time() try: # Your synthesis logic here + # If your model supports voice cloning: + if reference_audio is not None: + # Option 1: Model accepts bytes directly + # audio_data = self._model.synthesize( + # text, reference_audio=reference_audio, reference_text=reference_text + # ) + + # Option 2: Model requires file path - use temp_audio_file helper + # from app.utils.audio import temp_audio_file + # with temp_audio_file(reference_audio) as ref_path: + # audio_data = self._model.synthesize( + # text, prompt_wav_path=ref_path, prompt_text=reference_text + # ) + pass + audio_data = b"..." # Generated audio bytes duration_seconds = 1.0 # Calculated duration @@ -482,12 +501,24 @@ class MyTTSEngine(BaseTTSEngine): async def synthesize_stream( self, text: str, + voice: str | None = None, + speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, **kwargs ) -> AsyncIterator[TTSChunk | TTSResponse]: """ Streaming synthesis. Yields TTSChunk for audio chunks, then TTSResponse for final. + + Args: + text: Text to synthesize + voice: Voice name (uses default if None) + speed: Speech speed multiplier + reference_audio: Reference audio bytes for voice cloning (optional) + reference_text: Transcript of reference audio (required with reference_audio) + **kwargs: Additional engine parameters """ start_time = time.time() total_chunks = 0 @@ -627,6 +658,8 @@ class BaseTTSEngine(BaseEngine): text: str, voice: str | None = None, speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, **kwargs ) -> TTSResponse: ... @@ -634,6 +667,10 @@ class BaseTTSEngine(BaseEngine): async def synthesize_stream( self, text: str, + voice: str | None = None, + speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, **kwargs ) -> AsyncIterator[TTSChunk | TTSResponse]: ... diff --git a/tests/unit/api/conftest.py b/tests/unit/api/conftest.py index f36e4c0..ebb08d8 100644 --- a/tests/unit/api/conftest.py +++ b/tests/unit/api/conftest.py @@ -63,7 +63,14 @@ def mock_tts_engine(): ) engine.synthesize = AsyncMock(return_value=mock_response) - async def mock_stream_generator(text, **kwargs): + async def mock_stream_generator( + text, + voice=None, + speed=1.0, + reference_audio=None, + reference_text=None, + **kwargs, + ): yield TTSChunk(audio_data=b"chunk1", sequence_number=0) yield TTSChunk(audio_data=b"chunk2", sequence_number=1) yield mock_response diff --git a/tests/unit/api/test_stt_router.py b/tests/unit/api/test_stt_router.py index a8a3cf2..922794a 100644 --- a/tests/unit/api/test_stt_router.py +++ b/tests/unit/api/test_stt_router.py @@ -55,7 +55,8 @@ def test_returns_400_for_invalid_json_params(self, client, test_audio_bytes): """Returns 400 for invalid JSON in engine_params""" response = client.post( "/api/v1/stt/transcribe", - params={"engine": "default", "engine_params": "invalid json{"}, + params={"engine": "default"}, + data={"engine_params": "invalid json{"}, files={"audio": ("test.wav", test_audio_bytes, "audio/wav")}, ) @@ -68,7 +69,8 @@ def test_passes_valid_engine_params( """Passes valid engine_params to engine""" response = client.post( "/api/v1/stt/transcribe", - params={"engine": "default", "engine_params": '{"beam_size": 5}'}, + params={"engine": "default"}, + data={"engine_params": '{"beam_size": 5}'}, files={"audio": ("test.wav", test_audio_bytes, "audio/wav")}, ) @@ -181,7 +183,8 @@ def test_returns_400_for_invalid_json_params(self, client, test_audio_bytes): """Returns 400 for invalid JSON""" response = client.post( "/api/v1/stt/transcribe/stream", - params={"engine": "default", "engine_params": "bad json"}, + params={"engine": "default"}, + data={"engine_params": "bad json"}, files={"audio": ("test.wav", test_audio_bytes, "audio/wav")}, ) @@ -307,7 +310,8 @@ def test_handles_non_dict_json_params(self, client, test_audio_bytes): # should convert to 500. response = client.post( "/api/v1/stt/transcribe", - params={"engine": "default", "engine_params": '["array"]'}, + params={"engine": "default"}, + data={"engine_params": '["array"]'}, files={"audio": ("test.wav", test_audio_bytes, "audio/wav")}, ) diff --git a/tests/unit/api/test_tts_router.py b/tests/unit/api/test_tts_router.py index e0dbc39..2dcbdf0 100644 --- a/tests/unit/api/test_tts_router.py +++ b/tests/unit/api/test_tts_router.py @@ -10,7 +10,8 @@ def test_returns_200_with_valid_text(self, client_both): """Returns 200 with valid text and audio data""" response = client_both.post( "/api/v1/tts/synthesize", - params={"text": "Hello world", "engine": "default"}, + params={"engine": "default"}, + data={"text": "Hello world"}, ) assert response.status_code == 200 @@ -37,7 +38,8 @@ def test_returns_404_when_engine_not_found(self, client_both): """Returns 404 when engine not found""" response = client_both.post( "/api/v1/tts/synthesize", - params={"text": "Hello", "engine": "nonexistent"}, + params={"engine": "nonexistent"}, + data={"text": "Hello"}, ) assert response.status_code == 404 @@ -46,9 +48,9 @@ def test_returns_400_for_invalid_json_params(self, client_both): """Returns 400 for invalid JSON in engine_params""" response = client_both.post( "/api/v1/tts/synthesize", - params={ + params={"engine": "default"}, + data={ "text": "Hello", - "engine": "default", "engine_params": "invalid json{", }, ) @@ -60,9 +62,9 @@ def test_passes_valid_engine_params(self, client_both, mock_tts_engine): """Passes valid engine_params to engine""" response = client_both.post( "/api/v1/tts/synthesize", - params={ + params={"engine": "default"}, + data={ "text": "Hello", - "engine": "default", "engine_params": '{"pitch": 1.5}', }, ) @@ -78,11 +80,11 @@ def test_passes_voice_and_speed_params(self, client_both, mock_tts_engine): response = client_both.post( "/api/v1/tts/synthesize", params={ - "text": "Hello", "engine": "default", "voice": "voice2", "speed": 1.5, }, + data={"text": "Hello"}, ) assert response.status_code == 200 @@ -90,6 +92,23 @@ def test_passes_voice_and_speed_params(self, client_both, mock_tts_engine): assert call_kwargs.get("voice") == "voice2" assert call_kwargs.get("speed") == 1.5 + def test_passes_reference_audio_and_text(self, client_both, mock_tts_engine): + """Passes reference_audio and reference_text to engine for voice cloning""" + response = client_both.post( + "/api/v1/tts/synthesize", + params={"engine": "default"}, + data={ + "text": "Hello", + "reference_text": "Reference transcript", + }, + files={"reference_audio": ("ref.wav", b"fake audio data", "audio/wav")}, + ) + + assert response.status_code == 200 + call_kwargs = mock_tts_engine.synthesize.call_args.kwargs + assert call_kwargs.get("reference_audio") == b"fake audio data" + assert call_kwargs.get("reference_text") == "Reference transcript" + class TestSynthesizeStreamEndpoint: """POST /synthesize/stream tests""" @@ -98,7 +117,8 @@ def test_returns_200_with_event_stream(self, client_both): """Returns 200 with event-stream content type""" response = client_both.post( "/api/v1/tts/synthesize/stream", - params={"text": "Hello", "engine": "default"}, + params={"engine": "default"}, + data={"text": "Hello"}, ) assert response.status_code == 200 @@ -108,7 +128,8 @@ def test_streams_chunks_and_complete(self, client_both): """Streams chunk events and complete event""" response = client_both.post( "/api/v1/tts/synthesize/stream", - params={"text": "Hello", "engine": "default"}, + params={"engine": "default"}, + data={"text": "Hello"}, ) # Parse SSE events @@ -128,9 +149,9 @@ def test_returns_400_for_invalid_json_params(self, client_both): """Returns 400 for invalid JSON in engine_params""" response = client_both.post( "/api/v1/tts/synthesize/stream", - params={ + params={"engine": "default"}, + data={ "text": "Hello", - "engine": "default", "engine_params": "bad json", }, ) @@ -143,9 +164,9 @@ def test_passes_engine_params_stream(self, client_both, mock_tts_engine): """Passes engine_params to engine in stream mode""" response = client_both.post( "/api/v1/tts/synthesize/stream", - params={ + params={"engine": "default"}, + data={ "text": "Hello", - "engine": "default", "engine_params": '{"style": "happy"}', }, ) @@ -156,3 +177,20 @@ def test_passes_engine_params_stream(self, client_both, mock_tts_engine): mock_tts_engine.synthesize_stream.assert_called_once() call_kwargs = mock_tts_engine.synthesize_stream.call_args.kwargs assert call_kwargs.get("style") == "happy" + + def test_passes_reference_audio_stream(self, client_both, mock_tts_engine): + """Passes reference_audio to engine in stream mode""" + response = client_both.post( + "/api/v1/tts/synthesize/stream", + params={"engine": "default"}, + data={ + "text": "Hello", + "reference_text": "Reference", + }, + files={"reference_audio": ("ref.wav", b"audio bytes", "audio/wav")}, + ) + + assert response.status_code == 200 + call_kwargs = mock_tts_engine.synthesize_stream.call_args.kwargs + assert call_kwargs.get("reference_audio") == b"audio bytes" + assert call_kwargs.get("reference_text") == "Reference" diff --git a/tests/unit/engines/test_base.py b/tests/unit/engines/test_base.py index 7da3554..7199adc 100644 --- a/tests/unit/engines/test_base.py +++ b/tests/unit/engines/test_base.py @@ -88,6 +88,9 @@ async def synthesize( text: str, voice: str | None = None, speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, + **kwargs, ) -> TTSResponse: await self._ensure_ready() # Auto-initialize if needed return TTSResponse( @@ -100,7 +103,13 @@ async def synthesize( ) async def synthesize_stream( - self, text: str, **kwargs + self, + text: str, + voice: str | None = None, + speed: float = 1.0, + reference_audio: bytes | None = None, + reference_text: str | None = None, + **kwargs, ) -> AsyncIterator[TTSChunk | TTSResponse]: await self._ensure_ready() yield TTSChunk( diff --git a/tests/unit/engines/tts/voxcpm/test_voxcpm_engine.py b/tests/unit/engines/tts/voxcpm/test_voxcpm_engine.py index acbd3ff..2a45029 100644 --- a/tests/unit/engines/tts/voxcpm/test_voxcpm_engine.py +++ b/tests/unit/engines/tts/voxcpm/test_voxcpm_engine.py @@ -40,12 +40,13 @@ def mock_voxcpm_model(): # Mock generate method - returns numpy array mock_model.generate.return_value = np.zeros(16000, dtype=np.float32) # 1 second - # Mock generate_streaming - yields chunks + # Mock generate_streaming - yields chunks (use MagicMock to track calls) def mock_streaming(*args, **kwargs): yield np.zeros(4000, dtype=np.float32) # 0.25 second chunk yield np.zeros(4000, dtype=np.float32) # 0.25 second chunk - mock_model.generate_streaming = mock_streaming + # Wrap in MagicMock to track call arguments + mock_model.generate_streaming = MagicMock(side_effect=mock_streaming) return mock_model @@ -206,58 +207,53 @@ async def test_synthesize_passes_config_params(self, config, mock_voxcpm_model): assert call_kwargs["denoise"] == config.denoise @pytest.mark.asyncio - async def test_synthesize_with_voice_cloning( - self, config, mock_voxcpm_model, tmp_path - ): + async def test_synthesize_with_voice_cloning(self, config, mock_voxcpm_model): """Synthesize should accept voice cloning parameters.""" engine = VoxCPMEngine(config) engine._model = mock_voxcpm_model engine._initialized = True - # Create a temporary audio file - prompt_file = tmp_path / "prompt.wav" - prompt_file.touch() + # Use bytes for reference audio + reference_audio = b"fake audio bytes" await engine.synthesize( "Hello", - prompt_wav_path=str(prompt_file), - prompt_text="Reference text", + reference_audio=reference_audio, + reference_text="Reference text", ) call_kwargs = mock_voxcpm_model.generate.call_args.kwargs - assert call_kwargs["prompt_wav_path"] == str(prompt_file) + # Engine saves bytes to temp file and passes path + assert call_kwargs["prompt_wav_path"] is not None assert call_kwargs["prompt_text"] == "Reference text" @pytest.mark.asyncio - async def test_synthesize_error_on_missing_prompt_file( - self, config, mock_voxcpm_model - ): - """Synthesize should raise error if prompt file doesn't exist.""" + async def test_synthesize_wraps_model_errors(self, config, mock_voxcpm_model): + """Model errors should be wrapped in SynthesisError.""" engine = VoxCPMEngine(config) engine._model = mock_voxcpm_model engine._initialized = True + mock_voxcpm_model.generate.side_effect = RuntimeError("Model failed") + with pytest.raises(SynthesisError) as exc_info: - await engine.synthesize( - "Hello", - prompt_wav_path="/nonexistent/file.wav", - ) + await engine.synthesize("Test") - assert "not found" in str(exc_info.value).lower() + assert "Model failed" in str(exc_info.value) @pytest.mark.asyncio - async def test_synthesize_wraps_model_errors(self, config, mock_voxcpm_model): - """Model errors should be wrapped in SynthesisError.""" + async def test_synthesize_reraises_typed_errors(self, config, mock_voxcpm_model): + """Synthesize should re-raise SynthesisError as-is without wrapping.""" engine = VoxCPMEngine(config) engine._model = mock_voxcpm_model engine._initialized = True - mock_voxcpm_model.generate.side_effect = RuntimeError("Model failed") + mock_voxcpm_model.generate.side_effect = SynthesisError("Already typed error") with pytest.raises(SynthesisError) as exc_info: await engine.synthesize("Test") - assert "Model failed" in str(exc_info.value) + assert "Already typed error" in str(exc_info.value) @pytest.mark.asyncio async def test_synthesize_model_not_loaded(self, config): @@ -338,19 +334,25 @@ async def test_stream_model_not_loaded(self, config): assert "not loaded" in str(exc_info.value) @pytest.mark.asyncio - async def test_stream_missing_prompt_file(self, config, mock_voxcpm_model): - """Stream should raise SynthesisError if prompt file doesn't exist.""" + async def test_stream_with_reference_audio(self, config, mock_voxcpm_model): + """Stream should accept reference_audio bytes.""" engine = VoxCPMEngine(config) engine._model = mock_voxcpm_model engine._initialized = True - with pytest.raises(SynthesisError) as exc_info: - async for _ in engine.synthesize_stream( - "Test", prompt_wav_path="/nonexistent.wav" - ): - pass + reference_audio = b"fake audio bytes" + results = [] + async for item in engine.synthesize_stream( + "Test", + reference_audio=reference_audio, + reference_text="Reference", + ): + results.append(item) - assert "not found" in str(exc_info.value) + # Verify model was called with prompt_wav_path (temp file path) + call_kwargs = mock_voxcpm_model.generate_streaming.call_args[1] + assert call_kwargs["prompt_wav_path"] is not None + assert call_kwargs["prompt_text"] == "Reference" @pytest.mark.asyncio async def test_stream_empty_result(self, config, mock_voxcpm_model):