Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,31 +128,40 @@ curl http://localhost:8000/api/v1/health
**Batch Transcription**

```bash
curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper" \
-H "accept: application/json" \
-H "Content-Type: multipart/form-data" \
-F "file=@/path/to/audio.wav"
curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper&language=en" \
-F "audio=@/path/to/audio.wav"
```

**Real-time Streaming (SSE)**

```bash
curl -N -X POST "http://localhost:8000/api/v1/stt/transcribe/stream?engine=whisper" \
-F "file=@/path/to/audio.wav"
-F "audio=@/path/to/audio.wav"
```

### 🔊 Text-to-Speech (TTS)

**Batch Synthesis**

```bash
curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world"
curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
-F "text=Hello world"
```

**Streaming Synthesis**

```bash
curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm&text=Hello%20world"
curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm" \
-F "text=Hello world"
```

**Voice Cloning**

```bash
curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
-F "text=Hello world" \
-F "reference_audio=@/path/to/reference.wav" \
-F "reference_text=This is the reference transcript"
```

---
Expand Down Expand Up @@ -182,7 +191,7 @@ Detailed documentation is available in the `docs/` directory:

| Engine | Backend | Status | Features |
| :--- | :--- | :---: | :--- |
| **VoxCPM** | `voxcpm` | ✅ Ready | Zero-shot voice cloning, streaming, 24kHz |
| **VoxCPM** | `voxcpm` | ✅ Ready | Zero-shot voice cloning, streaming |
| **Coqui TTS** | `TTS` | 🚧 Planned | High-quality open source voices |
| **OpenAI TTS** | OpenAI API | 🚧 Planned | Natural sounding commercial voices |

Expand Down
13 changes: 10 additions & 3 deletions app/api/routers/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from fastapi import (
APIRouter,
Depends,
Form,
HTTPException,
Query,
WebSocket,
Expand All @@ -23,7 +24,7 @@
async def transcribe_audio(
audio: Annotated[bytes, Depends(validate_audio_upload)],
language: str | None = Query(None, description="Language hint"),
engine_params: str | None = Query(None, description="JSON engine parameters"),
engine_params: str | None = Form(None, description="JSON engine parameters"),
stt_engine: BaseSTTEngine = Depends(get_stt_engine),
):
"""
Expand All @@ -32,6 +33,8 @@ async def transcribe_audio(
Query params:
- engine: STT engine name (required, e.g., "whisper")
- language: Optional language hint

Form params:
- engine_params: Optional JSON engine parameters

Returns complete transcription with segments and metrics.
Expand All @@ -50,15 +53,19 @@ async def transcribe_audio(
@router.post("/transcribe/stream")
async def transcribe_audio_stream(
audio: Annotated[bytes, Depends(validate_audio_upload)],
language: str | None = Query(None),
engine_params: str | None = Query(None),
language: str | None = Query(None, description="Language hint"),
engine_params: str | None = Form(None, description="JSON engine parameters"),
stt_engine: BaseSTTEngine = Depends(get_stt_engine),
):
"""
Transcribe audio with Server-Sent Events streaming

Query params:
- engine: STT engine name (required, e.g., "whisper")
- language: Optional language hint

Form params:
- engine_params: Optional JSON engine parameters

Returns progressive chunks followed by final response.
Event types: "chunk" (STTChunk), "complete" (STTResponse)
Expand Down
59 changes: 50 additions & 9 deletions app/api/routers/tts.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json

from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
from sse_starlette.sse import EventSourceResponse

from app.api.deps import get_tts_engine
Expand All @@ -12,20 +12,30 @@

@router.post("/synthesize", response_model=TTSResponse)
async def synthesize_text(
text: str = Query(..., description="Text to synthesize"),
text: str = Form(..., description="Text to synthesize"),
reference_audio: UploadFile | None = File(
None, description="Reference audio for voice cloning"
),
reference_text: str | None = Form(
None, description="Transcript of reference audio"
),
engine_params: str | None = Form(None, description="JSON engine parameters"),
voice: str | None = Query(None, description="Voice name/ID to use"),
speed: float = Query(1.0, gt=0, le=3.0, description="Speech speed multiplier"),
engine_params: str | None = Query(None, description="JSON engine parameters"),
tts_engine: BaseTTSEngine = Depends(get_tts_engine),
):
"""
Synthesize text to speech (invoke mode)

Query params:
- engine: TTS engine name (required)
- text: Text to synthesize (required)
- voice: Optional voice name/ID
- speed: Speech speed multiplier (0 < speed <= 3.0)

Form params:
- text: Text to synthesize (required)
- reference_audio: Optional reference audio file for voice cloning
- reference_text: Optional transcript of reference audio
- engine_params: Optional JSON engine parameters

Returns complete audio (base64 encoded) with metrics.
Expand All @@ -37,27 +47,48 @@ async def synthesize_text(
except json.JSONDecodeError as e:
raise HTTPException(400, "Invalid engine_params JSON") from e

result = await tts_engine.synthesize(text, voice=voice, speed=speed, **kwargs)
# Read reference audio bytes if provided
reference_audio_bytes = None
if reference_audio:
reference_audio_bytes = await reference_audio.read()

result = await tts_engine.synthesize(
text,
voice=voice,
speed=speed,
reference_audio=reference_audio_bytes,
reference_text=reference_text,
**kwargs,
)
return result


@router.post("/synthesize/stream")
async def synthesize_text_stream(
text: str = Query(..., description="Text to synthesize"),
text: str = Form(..., description="Text to synthesize"),
reference_audio: UploadFile | None = File(
None, description="Reference audio for voice cloning"
),
reference_text: str | None = Form(
None, description="Transcript of reference audio"
),
engine_params: str | None = Form(None, description="JSON engine parameters"),
voice: str | None = Query(None, description="Voice name/ID to use"),
speed: float = Query(1.0, gt=0, le=3.0, description="Speech speed multiplier"),
engine_params: str | None = Query(None, description="JSON engine parameters"),
tts_engine: BaseTTSEngine = Depends(get_tts_engine),
):
"""
Synthesize text to speech with streaming

Query params:
- engine: TTS engine name (required)
- text: Text to synthesize (required)
- voice: Optional voice name/ID
- speed: Speech speed multiplier (0 < speed <= 3.0)

Form params:
- text: Text to synthesize (required)
- reference_audio: Optional reference audio file for voice cloning
- reference_text: Optional transcript of reference audio
- engine_params: Optional JSON engine parameters

Returns progressive audio chunks followed by final response.
Expand All @@ -70,9 +101,19 @@ async def synthesize_text_stream(
except json.JSONDecodeError as e:
raise HTTPException(400, "Invalid engine_params JSON") from e

# Read reference audio bytes if provided
reference_audio_bytes = None
if reference_audio:
reference_audio_bytes = await reference_audio.read()

async def event_generator():
async for result in tts_engine.synthesize_stream(
text, voice=voice, speed=speed, **kwargs
text,
voice=voice,
speed=speed,
reference_audio=reference_audio_bytes,
reference_text=reference_text,
**kwargs,
):
if isinstance(result, TTSChunk):
yield {"event": "chunk", "data": result.model_dump_json()}
Expand Down
24 changes: 21 additions & 3 deletions app/engines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,13 @@ class BaseTTSEngine(BaseEngine):

@abstractmethod
async def synthesize(
self, text: str, voice: str | None = None, speed: float = 1.0, **kwargs
self,
text: str,
voice: str | None = None,
speed: float = 1.0,
reference_audio: bytes | None = None,
reference_text: str | None = None,
**kwargs,
) -> TTSResponse:
"""
Synthesize text to speech (invoke/batch mode)
Expand All @@ -203,6 +209,8 @@ async def synthesize(
text: Text to synthesize
voice: Optional voice name (overrides config default)
speed: Speech speed (1.0 = normal, overrides config default)
reference_audio: Reference audio bytes for voice cloning
reference_text: Transcript of reference audio for voice cloning
**kwargs: Additional engine-specific parameters (passed via engine_params)

Returns:
Expand All @@ -212,7 +220,13 @@ async def synthesize(

@abstractmethod
async def synthesize_stream(
self, text: str, **kwargs
self,
text: str,
voice: str | None = None,
speed: float = 1.0,
reference_audio: bytes | None = None,
reference_text: str | None = None,
**kwargs,
) -> AsyncIterator[TTSChunk | TTSResponse]:
"""
Synthesize text to speech (streaming mode)
Expand All @@ -222,7 +236,11 @@ async def synthesize_stream(

Args:
text: Text to synthesize
**kwargs: Engine-specific params (voice, speed, etc.)
voice: Optional voice name (overrides config default)
speed: Speech speed (1.0 = normal, overrides config default)
reference_audio: Reference audio bytes for voice cloning
reference_text: Transcript of reference audio for voice cloning
**kwargs: Additional engine-specific parameters (passed via engine_params)

Yields:
TTSChunk: Audio chunks with progressive generation
Expand Down
Loading