minhsaco99 · minhsaco99 · Jan 26, 2026 · Jan 24, 2026 · Jan 24, 2026 · Jan 24, 2026
diff --git a/README.md b/README.md
@@ -128,31 +128,40 @@ curl http://localhost:8000/api/v1/health
 **Batch Transcription**
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper" \
-  -H "accept: application/json" \
-  -H "Content-Type: multipart/form-data" \
-  -F "file=@/path/to/audio.wav"
+curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper&language=en" \
+  -F "audio=@/path/to/audio.wav"
 ```
 
 **Real-time Streaming (SSE)**
 
 ```bash
 curl -N -X POST "http://localhost:8000/api/v1/stt/transcribe/stream?engine=whisper" \
-  -F "file=@/path/to/audio.wav"
+  -F "audio=@/path/to/audio.wav"
 ```
 
 ### 🔊 Text-to-Speech (TTS)
 
 **Batch Synthesis**
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world"
+curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
+  -F "text=Hello world"
 ```
 
 **Streaming Synthesis**
 
 ```bash
-curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm&text=Hello%20world"
+curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm" \
+  -F "text=Hello world"
+```
+
+**Voice Cloning**
+
+```bash
+curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
+  -F "text=Hello world" \
+  -F "reference_audio=@/path/to/reference.wav" \
+  -F "reference_text=This is the reference transcript"
 ```
 
 ---
@@ -182,7 +191,7 @@ Detailed documentation is available in the `docs/` directory:
 
 | Engine | Backend | Status | Features |
 | :--- | :--- | :---: | :--- |
-| **VoxCPM** | `voxcpm` | ✅ Ready | Zero-shot voice cloning, streaming, 24kHz |
+| **VoxCPM** | `voxcpm` | ✅ Ready | Zero-shot voice cloning, streaming |
 | **Coqui TTS** | `TTS` | 🚧 Planned | High-quality open source voices |
 | **OpenAI TTS** | OpenAI API | 🚧 Planned | Natural sounding commercial voices |
 

diff --git a/app/api/routers/stt.py b/app/api/routers/stt.py
@@ -4,6 +4,7 @@
 from fastapi import (
     APIRouter,
     Depends,
+    Form,
     HTTPException,
     Query,
     WebSocket,
@@ -23,7 +24,7 @@
 async def transcribe_audio(
     audio: Annotated[bytes, Depends(validate_audio_upload)],
     language: str | None = Query(None, description="Language hint"),
-    engine_params: str | None = Query(None, description="JSON engine parameters"),
+    engine_params: str | None = Form(None, description="JSON engine parameters"),
     stt_engine: BaseSTTEngine = Depends(get_stt_engine),
 ):
     """
@@ -32,6 +33,8 @@ async def transcribe_audio(
     Query params:
     - engine: STT engine name (required, e.g., "whisper")
     - language: Optional language hint
+
+    Form params:
     - engine_params: Optional JSON engine parameters
 
     Returns complete transcription with segments and metrics.
@@ -50,15 +53,19 @@ async def transcribe_audio(
 @router.post("/transcribe/stream")
 async def transcribe_audio_stream(
     audio: Annotated[bytes, Depends(validate_audio_upload)],
-    language: str | None = Query(None),
-    engine_params: str | None = Query(None),
+    language: str | None = Query(None, description="Language hint"),
+    engine_params: str | None = Form(None, description="JSON engine parameters"),
     stt_engine: BaseSTTEngine = Depends(get_stt_engine),
 ):
     """
     Transcribe audio with Server-Sent Events streaming
 
     Query params:
     - engine: STT engine name (required, e.g., "whisper")
+    - language: Optional language hint
+
+    Form params:
+    - engine_params: Optional JSON engine parameters
 
     Returns progressive chunks followed by final response.
     Event types: "chunk" (STTChunk), "complete" (STTResponse)

diff --git a/app/api/routers/tts.py b/app/api/routers/tts.py
@@ -1,6 +1,6 @@
 import json
 
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
 from sse_starlette.sse import EventSourceResponse
 
 from app.api.deps import get_tts_engine
@@ -12,20 +12,30 @@
 
 @router.post("/synthesize", response_model=TTSResponse)
 async def synthesize_text(
-    text: str = Query(..., description="Text to synthesize"),
+    text: str = Form(..., description="Text to synthesize"),
+    reference_audio: UploadFile | None = File(
+        None, description="Reference audio for voice cloning"
+    ),
+    reference_text: str | None = Form(
+        None, description="Transcript of reference audio"
+    ),
+    engine_params: str | None = Form(None, description="JSON engine parameters"),
     voice: str | None = Query(None, description="Voice name/ID to use"),
     speed: float = Query(1.0, gt=0, le=3.0, description="Speech speed multiplier"),
-    engine_params: str | None = Query(None, description="JSON engine parameters"),
     tts_engine: BaseTTSEngine = Depends(get_tts_engine),
 ):
     """
     Synthesize text to speech (invoke mode)
 
     Query params:
     - engine: TTS engine name (required)
-    - text: Text to synthesize (required)
     - voice: Optional voice name/ID
     - speed: Speech speed multiplier (0 < speed <= 3.0)
+
+    Form params:
+    - text: Text to synthesize (required)
+    - reference_audio: Optional reference audio file for voice cloning
+    - reference_text: Optional transcript of reference audio
     - engine_params: Optional JSON engine parameters
 
     Returns complete audio (base64 encoded) with metrics.
@@ -37,27 +47,48 @@ async def synthesize_text(
         except json.JSONDecodeError as e:
             raise HTTPException(400, "Invalid engine_params JSON") from e
 
-    result = await tts_engine.synthesize(text, voice=voice, speed=speed, **kwargs)
+    # Read reference audio bytes if provided
+    reference_audio_bytes = None
+    if reference_audio:
+        reference_audio_bytes = await reference_audio.read()
 
+    result = await tts_engine.synthesize(
+        text,
+        voice=voice,
+        speed=speed,
+        reference_audio=reference_audio_bytes,
+        reference_text=reference_text,
+        **kwargs,
+    )
     return result
 
 
 @router.post("/synthesize/stream")
 async def synthesize_text_stream(
-    text: str = Query(..., description="Text to synthesize"),
+    text: str = Form(..., description="Text to synthesize"),
+    reference_audio: UploadFile | None = File(
+        None, description="Reference audio for voice cloning"
+    ),
+    reference_text: str | None = Form(
+        None, description="Transcript of reference audio"
+    ),
+    engine_params: str | None = Form(None, description="JSON engine parameters"),
     voice: str | None = Query(None, description="Voice name/ID to use"),
     speed: float = Query(1.0, gt=0, le=3.0, description="Speech speed multiplier"),
-    engine_params: str | None = Query(None, description="JSON engine parameters"),
     tts_engine: BaseTTSEngine = Depends(get_tts_engine),
 ):
     """
     Synthesize text to speech with streaming
 
     Query params:
     - engine: TTS engine name (required)
-    - text: Text to synthesize (required)
     - voice: Optional voice name/ID
     - speed: Speech speed multiplier (0 < speed <= 3.0)
+
+    Form params:
+    - text: Text to synthesize (required)
+    - reference_audio: Optional reference audio file for voice cloning
+    - reference_text: Optional transcript of reference audio
     - engine_params: Optional JSON engine parameters
 
     Returns progressive audio chunks followed by final response.
@@ -70,9 +101,19 @@ async def synthesize_text_stream(
         except json.JSONDecodeError as e:
             raise HTTPException(400, "Invalid engine_params JSON") from e
 
+    # Read reference audio now (if provided) so event_generator only captures bytes
+    reference_audio_bytes = None
+    if reference_audio:
+        reference_audio_bytes = await reference_audio.read()
+
     async def event_generator():
         async for result in tts_engine.synthesize_stream(
-            text, voice=voice, speed=speed, **kwargs
+            text,
+            voice=voice,
+            speed=speed,
+            reference_audio=reference_audio_bytes,
+            reference_text=reference_text,
+            **kwargs,
         ):
             if isinstance(result, TTSChunk):
                 yield {"event": "chunk", "data": result.model_dump_json()}

diff --git a/app/engines/base.py b/app/engines/base.py
@@ -194,7 +194,13 @@ class BaseTTSEngine(BaseEngine):
 
     @abstractmethod
     async def synthesize(
-        self, text: str, voice: str | None = None, speed: float = 1.0, **kwargs
+        self,
+        text: str,
+        voice: str | None = None,
+        speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
+        **kwargs,
     ) -> TTSResponse:
         """
         Synthesize text to speech (invoke/batch mode)
@@ -203,6 +209,8 @@ async def synthesize(
             text: Text to synthesize
             voice: Optional voice name (overrides config default)
             speed: Speech speed (1.0 = normal, overrides config default)
+            reference_audio: Optional reference audio bytes for voice cloning
+            reference_text: Optional transcript of the reference audio
             **kwargs: Additional engine-specific parameters (passed via engine_params)
 
         Returns:
@@ -212,7 +220,13 @@ async def synthesize(
 
     @abstractmethod
     async def synthesize_stream(
-        self, text: str, **kwargs
+        self,
+        text: str,
+        voice: str | None = None,
+        speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
+        **kwargs,
     ) -> AsyncIterator[TTSChunk | TTSResponse]:
         """
         Synthesize text to speech (streaming mode)
@@ -222,7 +236,11 @@ async def synthesize_stream(
 
         Args:
             text: Text to synthesize
-            **kwargs: Engine-specific params (voice, speed, etc.)
+            voice: Optional voice name (overrides config default)
+            speed: Speech speed (1.0 = normal, overrides config default)
+            reference_audio: Optional reference audio bytes for voice cloning
+            reference_text: Optional transcript of the reference audio
+            **kwargs: Additional engine-specific parameters (passed via engine_params)
 
         Yields:
             TTSChunk: Audio chunks with progressive generation