Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions app/api/routers/tts.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import json

from fastapi import APIRouter, Depends, HTTPException, Query
from sse_starlette.sse import EventSourceResponse

from app.api.deps import get_tts_engine
from app.engines.base import BaseTTSEngine
from app.models.engine import TTSChunk, TTSResponse

router = APIRouter()


@router.post("/synthesize")
@router.post("/synthesize", response_model=TTSResponse)
async def synthesize_text(
text: str = Query(..., description="Text to synthesize"),
voice: str | None = Query(None, description="Voice name/ID to use"),
Expand All @@ -24,9 +28,18 @@ async def synthesize_text(
- speed: Speech speed multiplier (0 < speed <= 3.0)
- engine_params: Optional JSON engine parameters

Returns complete audio with metrics.
Returns complete audio (base64 encoded) with metrics.
"""
raise HTTPException(501, "TTS not implemented yet")
kwargs = {}
if engine_params:
try:
kwargs = json.loads(engine_params)
except json.JSONDecodeError as e:
raise HTTPException(400, "Invalid engine_params JSON") from e

result = await tts_engine.synthesize(text, voice=voice, speed=speed, **kwargs)

return result


@router.post("/synthesize/stream")
Expand All @@ -48,5 +61,22 @@ async def synthesize_text_stream(
- engine_params: Optional JSON engine parameters

Returns progressive audio chunks followed by final response.
Event types: "chunk" (TTSChunk with base64 audio), "complete" (full response)
"""
raise HTTPException(501, "TTS streaming not implemented yet")
kwargs = {}
if engine_params:
try:
kwargs = json.loads(engine_params)
except json.JSONDecodeError as e:
raise HTTPException(400, "Invalid engine_params JSON") from e

async def event_generator():
    """Translate engine stream items into SSE event dicts.

    Each ``TTSChunk`` becomes a ``"chunk"`` event and each ``TTSResponse``
    becomes a ``"complete"`` event; the payload is the item's JSON dump.
    Items of any other type are skipped without emitting an event.
    """
    stream = tts_engine.synthesize_stream(
        text, voice=voice, speed=speed, **kwargs
    )
    async for item in stream:
        # TTSChunk is tested first, mirroring the precedence of the
        # original if/elif chain.
        if isinstance(item, TTSChunk):
            event_name = "chunk"
        elif isinstance(item, TTSResponse):
            event_name = "complete"
        else:
            continue
        yield {"event": event_name, "data": item.model_dump_json()}

return EventSourceResponse(event_generator())
5 changes: 5 additions & 0 deletions app/engines/tts/voxcpm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# VoxCPM TTS Engine
from app.engines.tts.voxcpm.config import VoxCPMConfig
from app.engines.tts.voxcpm.engine import VoxCPMEngine

__all__ = ["VoxCPMConfig", "VoxCPMEngine"]
61 changes: 61 additions & 0 deletions app/engines/tts/voxcpm/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
VoxCPM TTS Engine Configuration

Configuration class for VoxCPM - a tokenizer-free TTS system with voice cloning support.
"""

from pydantic import Field

from app.models.engine import EngineConfig


class VoxCPMConfig(EngineConfig):
    """
    Settings for the VoxCPM TTS engine, layered on top of ``EngineConfig``.

    Only the VoxCPM-specific options are declared in this class. Other
    settings (e.g. ``model_name`` such as "openbmb/VoxCPM-0.5B", and
    ``device``) are presumably supplied by ``EngineConfig`` -- confirm
    against that class.

    Attributes:
        cfg_value: LM guidance value for LocDiT; must be >= 0, default 2.0.
        inference_timesteps: LocDiT inference timesteps; 1..50, default 10.
        normalize: Whether the external text normalization tool is enabled;
            off by default.
        denoise: Whether the external denoiser is enabled (restricts to
            16kHz); off by default.
        retry_badcase: Whether retrying mode for bad cases is enabled;
            on by default.
        retry_badcase_max_times: Maximum retry attempts for bad cases;
            must be >= 1, default 3.
        retry_badcase_ratio_threshold: Length restriction threshold used
            for bad case detection; must be >= 1.0, default 6.0.
    """

    # --- LocDiT generation controls ---
    cfg_value: float = Field(
        2.0,
        description="LM guidance on LocDiT, higher for better adherence to prompt",
        ge=0.0,
    )
    inference_timesteps: int = Field(
        10,
        description="LocDiT inference timesteps, higher for better quality",
        ge=1,
        le=50,
    )

    # --- Optional external pre/post-processing tools ---
    normalize: bool = Field(
        False, description="Enable external text normalization tool"
    )
    denoise: bool = Field(
        False, description="Enable external denoiser (restricts to 16kHz)"
    )

    # --- Bad-case retry behaviour ---
    retry_badcase: bool = Field(
        True, description="Enable retrying mode for bad cases"
    )
    retry_badcase_max_times: int = Field(
        3, description="Maximum retry attempts for bad cases", ge=1
    )
    retry_badcase_ratio_threshold: float = Field(
        6.0,
        description="Length restriction threshold for bad case detection",
        ge=1.0,
    )
Loading