minhsaco99 · minhsaco99 · Jan 24, 2026 · Jan 23, 2026 · Jan 24, 2026 · Jan 24, 2026
diff --git a/app/api/routers/tts.py b/app/api/routers/tts.py
@@ -1,12 +1,16 @@
+import json
+
 from fastapi import APIRouter, Depends, HTTPException, Query
+from sse_starlette.sse import EventSourceResponse
 
 from app.api.deps import get_tts_engine
 from app.engines.base import BaseTTSEngine
+from app.models.engine import TTSChunk, TTSResponse
 
 router = APIRouter()
 
 
-@router.post("/synthesize")
+@router.post("/synthesize", response_model=TTSResponse)
 async def synthesize_text(
     text: str = Query(..., description="Text to synthesize"),
     voice: str | None = Query(None, description="Voice name/ID to use"),
@@ -24,9 +28,24 @@ async def synthesize_text(
     - speed: Speech speed multiplier (0 < speed <= 3.0)
     - engine_params: Optional JSON engine parameters
 
-    Returns complete audio with metrics.
+    Returns complete audio (base64 encoded) with metrics.
+
+    Raises HTTP 400 when engine_params is not valid JSON or not a JSON object.
     """
-    raise HTTPException(501, "TTS not implemented yet")
+    kwargs = {}
+    if engine_params:
+        try:
+            kwargs = json.loads(engine_params)
+        except json.JSONDecodeError as e:
+            raise HTTPException(400, "Invalid engine_params JSON") from e
+        # Valid JSON such as `[1, 2]` or `"x"` is not a mapping; expanding it
+        # with `**kwargs` below would raise TypeError and surface as HTTP 500.
+        if not isinstance(kwargs, dict):
+            raise HTTPException(400, "engine_params must be a JSON object")
+
+    result = await tts_engine.synthesize(text, voice=voice, speed=speed, **kwargs)
+
+    return result
 
 
 @router.post("/synthesize/stream")
@@ -48,5 +61,31 @@ async def synthesize_text_stream(
     - engine_params: Optional JSON engine parameters
 
     Returns progressive audio chunks followed by final response.
+    Event types: "chunk" (TTSChunk with base64 audio), "complete" (full response)
+
+    Raises HTTP 400 when engine_params is not valid JSON or not a JSON object.
     """
-    raise HTTPException(501, "TTS streaming not implemented yet")
+    kwargs = {}
+    if engine_params:
+        try:
+            kwargs = json.loads(engine_params)
+        except json.JSONDecodeError as e:
+            raise HTTPException(400, "Invalid engine_params JSON") from e
+        # Valid JSON such as `[1, 2]` or `"x"` is not a mapping. The `**kwargs`
+        # expansion happens lazily inside event_generator, where a TypeError
+        # can no longer be reported as a 400, so reject it here up front.
+        if not isinstance(kwargs, dict):
+            raise HTTPException(400, "engine_params must be a JSON object")
+
+    async def event_generator():
+        # Map each item yielded by the engine to a named SSE event; items that
+        # are neither TTSChunk nor TTSResponse are skipped.
+        async for result in tts_engine.synthesize_stream(
+            text, voice=voice, speed=speed, **kwargs
+        ):
+            if isinstance(result, TTSChunk):
+                yield {"event": "chunk", "data": result.model_dump_json()}
+            elif isinstance(result, TTSResponse):
+                yield {"event": "complete", "data": result.model_dump_json()}
+
+    return EventSourceResponse(event_generator())
diff --git a/app/engines/tts/voxcpm/__init__.py b/app/engines/tts/voxcpm/__init__.py
@@ -0,0 +1,5 @@
+# VoxCPM TTS Engine
+from app.engines.tts.voxcpm.config import VoxCPMConfig
+from app.engines.tts.voxcpm.engine import VoxCPMEngine
+
+__all__ = ["VoxCPMConfig", "VoxCPMEngine"]
diff --git a/app/engines/tts/voxcpm/config.py b/app/engines/tts/voxcpm/config.py
@@ -0,0 +1,61 @@
+"""
+VoxCPM TTS Engine Configuration
+
+Configuration class for VoxCPM - a tokenizer-free TTS system with voice cloning support.
+"""
+
+from pydantic import Field
+
+from app.models.engine import EngineConfig
+
+
+class VoxCPMConfig(EngineConfig):
+    """
+    Tunable settings for the VoxCPM TTS engine.
+
+    ``model_name`` (HuggingFace model ID, e.g. "openbmb/VoxCPM-0.5B" or
+    "openbmb/VoxCPM1.5") and ``device`` ("cpu", "cuda", "mps") are not declared
+    here; they are presumably inherited from EngineConfig.
+
+    Fields declared by this class:
+        cfg_value: LM guidance value for LocDiT (higher = better adherence to prompt)
+        inference_timesteps: LocDiT timesteps, 1-50 (higher = better quality, slower)
+        normalize: Enable the external text normalization tool
+        denoise: Enable the external denoiser (restricts output to 16kHz)
+        retry_badcase: Retrying mode for bad cases (unstoppable generation)
+        retry_badcase_max_times: Maximum retry attempts for bad cases
+        retry_badcase_ratio_threshold: Length restriction threshold for bad
+            case detection
+    """
+
+    # LocDiT guidance and sampling
+    cfg_value: float = Field(
+        2.0,
+        description="LM guidance on LocDiT, higher for better adherence to prompt",
+        ge=0.0,
+    )
+    inference_timesteps: int = Field(
+        10,
+        description="LocDiT inference timesteps, higher for better quality",
+        ge=1,
+        le=50,
+    )
+
+    # Optional external pre-/post-processing tools
+    normalize: bool = Field(
+        False, description="Enable external text normalization tool"
+    )
+    denoise: bool = Field(
+        False, description="Enable external denoiser (restricts to 16kHz)"
+    )
+
+    # Bad-case retry behaviour
+    retry_badcase: bool = Field(True, description="Enable retrying mode for bad cases")
+    retry_badcase_max_times: int = Field(
+        3, ge=1, description="Maximum retry attempts for bad cases"
+    )
+    retry_badcase_ratio_threshold: float = Field(
+        6.0,
+        description="Length restriction threshold for bad case detection",
+        ge=1.0,
+    )