diff --git a/README.md b/README.md
index 2f91a14..752f599 100644
--- a/README.md
+++ b/README.md
@@ -128,17 +128,15 @@ curl http://localhost:8000/api/v1/health
 **Batch Transcription**
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper" \
-  -H "accept: application/json" \
-  -H "Content-Type: multipart/form-data" \
-  -F "file=@/path/to/audio.wav"
+curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper&language=en" \
+  -F "audio=@/path/to/audio.wav"
 ```
 
 **Real-time Streaming (SSE)**
 
 ```bash
 curl -N -X POST "http://localhost:8000/api/v1/stt/transcribe/stream?engine=whisper" \
-  -F "file=@/path/to/audio.wav"
+  -F "audio=@/path/to/audio.wav"
 ```
 
 ### 🔊 Text-to-Speech (TTS)
@@ -146,13 +144,24 @@ curl -N -X POST "http://localhost:8000/api/v1/stt/transcribe/stream?engine=whisp
 **Batch Synthesis**
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world"
+curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
+  -F "text=Hello world"
 ```
 
 **Streaming Synthesis**
 
 ```bash
-curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm&text=Hello%20world"
+curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm" \
+  -F "text=Hello world"
+```
+
+**Voice Cloning**
+
+```bash
+curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
+  -F "text=Hello world" \
+  -F "reference_audio=@/path/to/reference.wav" \
+  -F "reference_text=This is the reference transcript"
 ```
 
 ---
@@ -182,7 +191,7 @@ Detailed documentation is available in the `docs/` directory:
 
 | Engine | Backend | Status | Features |
 | :--- | :--- | :---: | :--- |
-| **VoxCPM** | `voxcpm` | ✅ Ready | Zero-shot voice cloning, streaming, 24kHz |
+| **VoxCPM** | `voxcpm` | ✅ Ready | Zero-shot voice cloning, streaming |
 | **Coqui TTS** | `TTS` | 🚧 Planned | High-quality open source voices |
 | **OpenAI TTS** | OpenAI API | 🚧 Planned | Natural sounding commercial voices |
 
diff --git a/app/api/routers/stt.py b/app/api/routers/stt.py
index 4e4cc35..436ce09 100644
--- a/app/api/routers/stt.py
+++ b/app/api/routers/stt.py
@@ -4,6 +4,7 @@
 from fastapi import (
     APIRouter,
     Depends,
+    Form,
     HTTPException,
     Query,
     WebSocket,
@@ -23,7 +24,7 @@
 async def transcribe_audio(
     audio: Annotated[bytes, Depends(validate_audio_upload)],
     language: str | None = Query(None, description="Language hint"),
-    engine_params: str | None = Query(None, description="JSON engine parameters"),
+    engine_params: str | None = Form(None, description="JSON engine parameters"),
     stt_engine: BaseSTTEngine = Depends(get_stt_engine),
 ):
     """
@@ -32,6 +33,8 @@ async def transcribe_audio(
     Query params:
     - engine: STT engine name (required, e.g., "whisper")
     - language: Optional language hint
+
+    Form params:
     - engine_params: Optional JSON engine parameters
 
     Returns complete transcription with segments and metrics.
@@ -50,8 +53,8 @@ async def transcribe_audio(
 @router.post("/transcribe/stream")
 async def transcribe_audio_stream(
     audio: Annotated[bytes, Depends(validate_audio_upload)],
-    language: str | None = Query(None),
-    engine_params: str | None = Query(None),
+    language: str | None = Query(None, description="Language hint"),
+    engine_params: str | None = Form(None, description="JSON engine parameters"),
     stt_engine: BaseSTTEngine = Depends(get_stt_engine),
 ):
     """
@@ -59,6 +62,10 @@ async def transcribe_audio_stream(
 
     Query params:
     - engine: STT engine name (required, e.g., "whisper")
+    - language: Optional language hint
+
+    Form params:
+    - engine_params: Optional JSON engine parameters
 
     Returns progressive chunks followed by final response.
     Event types: "chunk" (STTChunk), "complete" (STTResponse)
diff --git a/app/api/routers/tts.py b/app/api/routers/tts.py
index ae83231..8ee1ea6 100644
--- a/app/api/routers/tts.py
+++ b/app/api/routers/tts.py
@@ -1,6 +1,6 @@
 import json
 
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
 from sse_starlette.sse import EventSourceResponse
 
 from app.api.deps import get_tts_engine
@@ -12,10 +12,16 @@
 
 @router.post("/synthesize", response_model=TTSResponse)
 async def synthesize_text(
-    text: str = Query(..., description="Text to synthesize"),
+    text: str = Form(..., description="Text to synthesize"),
+    reference_audio: UploadFile | None = File(
+        None, description="Reference audio for voice cloning"
+    ),
+    reference_text: str | None = Form(
+        None, description="Transcript of reference audio"
+    ),
+    engine_params: str | None = Form(None, description="JSON engine parameters"),
     voice: str | None = Query(None, description="Voice name/ID to use"),
     speed: float = Query(1.0, gt=0, le=3.0, description="Speech speed multiplier"),
-    engine_params: str | None = Query(None, description="JSON engine parameters"),
     tts_engine: BaseTTSEngine = Depends(get_tts_engine),
 ):
     """
@@ -23,9 +29,13 @@ async def synthesize_text(
 
     Query params:
     - engine: TTS engine name (required)
-    - text: Text to synthesize (required)
     - voice: Optional voice name/ID
     - speed: Speech speed multiplier (0 < speed <= 3.0)
+
+    Form params:
+    - text: Text to synthesize (required)
+    - reference_audio: Optional reference audio file for voice cloning
+    - reference_text: Optional transcript of reference audio
     - engine_params: Optional JSON engine parameters
 
     Returns complete audio (base64 encoded) with metrics.
@@ -37,17 +47,34 @@ async def synthesize_text(
         except json.JSONDecodeError as e:
             raise HTTPException(400, "Invalid engine_params JSON") from e
 
-    result = await tts_engine.synthesize(text, voice=voice, speed=speed, **kwargs)
+    # Read reference audio bytes if provided
+    reference_audio_bytes = None
+    if reference_audio:
+        reference_audio_bytes = await reference_audio.read()
 
+    result = await tts_engine.synthesize(
+        text,
+        voice=voice,
+        speed=speed,
+        reference_audio=reference_audio_bytes,
+        reference_text=reference_text,
+        **kwargs,
+    )
     return result
 
 
 @router.post("/synthesize/stream")
 async def synthesize_text_stream(
-    text: str = Query(..., description="Text to synthesize"),
+    text: str = Form(..., description="Text to synthesize"),
+    reference_audio: UploadFile | None = File(
+        None, description="Reference audio for voice cloning"
+    ),
+    reference_text: str | None = Form(
+        None, description="Transcript of reference audio"
+    ),
+    engine_params: str | None = Form(None, description="JSON engine parameters"),
     voice: str | None = Query(None, description="Voice name/ID to use"),
     speed: float = Query(1.0, gt=0, le=3.0, description="Speech speed multiplier"),
-    engine_params: str | None = Query(None, description="JSON engine parameters"),
     tts_engine: BaseTTSEngine = Depends(get_tts_engine),
 ):
     """
@@ -55,9 +82,13 @@ async def synthesize_text_stream(
 
     Query params:
     - engine: TTS engine name (required)
-    - text: Text to synthesize (required)
     - voice: Optional voice name/ID
     - speed: Speech speed multiplier (0 < speed <= 3.0)
+
+    Form params:
+    - text: Text to synthesize (required)
+    - reference_audio: Optional reference audio file for voice cloning
+    - reference_text: Optional transcript of reference audio
     - engine_params: Optional JSON engine parameters
 
     Returns progressive audio chunks followed by final response.
@@ -70,9 +101,19 @@ async def synthesize_text_stream(
         except json.JSONDecodeError as e:
             raise HTTPException(400, "Invalid engine_params JSON") from e
 
+    # Read reference audio bytes if provided
+    reference_audio_bytes = None
+    if reference_audio:
+        reference_audio_bytes = await reference_audio.read()
+
     async def event_generator():
         async for result in tts_engine.synthesize_stream(
-            text, voice=voice, speed=speed, **kwargs
+            text,
+            voice=voice,
+            speed=speed,
+            reference_audio=reference_audio_bytes,
+            reference_text=reference_text,
+            **kwargs,
         ):
             if isinstance(result, TTSChunk):
                 yield {"event": "chunk", "data": result.model_dump_json()}
diff --git a/app/engines/base.py b/app/engines/base.py
index 86c687c..d3e9ce1 100644
--- a/app/engines/base.py
+++ b/app/engines/base.py
@@ -194,7 +194,13 @@ class BaseTTSEngine(BaseEngine):
 
     @abstractmethod
     async def synthesize(
-        self, text: str, voice: str | None = None, speed: float = 1.0, **kwargs
+        self,
+        text: str,
+        voice: str | None = None,
+        speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
+        **kwargs,
     ) -> TTSResponse:
         """
         Synthesize text to speech (invoke/batch mode)
@@ -203,6 +209,8 @@ async def synthesize(
             text: Text to synthesize
             voice: Optional voice name (overrides config default)
             speed: Speech speed (1.0 = normal, overrides config default)
+            reference_audio: Reference audio bytes for voice cloning
+            reference_text: Transcript of reference audio for voice cloning
             **kwargs: Additional engine-specific parameters (passed via engine_params)
 
         Returns:
@@ -212,7 +220,13 @@ async def synthesize(
 
     @abstractmethod
     async def synthesize_stream(
-        self, text: str, **kwargs
+        self,
+        text: str,
+        voice: str | None = None,
+        speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
+        **kwargs,
     ) -> AsyncIterator[TTSChunk | TTSResponse]:
         """
         Synthesize text to speech (streaming mode)
@@ -222,7 +236,11 @@ async def synthesize_stream(
 
         Args:
             text: Text to synthesize
-            **kwargs: Engine-specific params (voice, speed, etc.)
+            voice: Optional voice name (overrides config default)
+            speed: Speech speed (1.0 = normal, overrides config default)
+            reference_audio: Reference audio bytes for voice cloning
+            reference_text: Transcript of reference audio for voice cloning
+            **kwargs: Additional engine-specific parameters (passed via engine_params)
 
         Yields:
             TTSChunk: Audio chunks with progressive generation
diff --git a/app/engines/tts/voxcpm/engine.py b/app/engines/tts/voxcpm/engine.py
index 585e170..cf9d9fc 100644
--- a/app/engines/tts/voxcpm/engine.py
+++ b/app/engines/tts/voxcpm/engine.py
@@ -6,7 +6,6 @@
 
 import time
 from collections.abc import AsyncIterator
-from pathlib import Path
 
 import numpy as np
 
@@ -15,7 +14,7 @@
 from app.exceptions import EngineNotReadyError, SynthesisError
 from app.models.engine import TTSChunk, TTSResponse
 from app.models.metrics import TTSPerformanceMetrics
-from app.utils.audio import AudioProcessor
+from app.utils.audio import AudioProcessor, temp_audio_file
 
 
 class VoxCPMEngine(BaseTTSEngine):
@@ -72,6 +71,8 @@ async def synthesize(
         text: str,
         voice: str | None = None,
         speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
         **kwargs,
     ) -> TTSResponse:
         """
@@ -79,11 +80,10 @@ async def synthesize(
 
         Args:
             text: Text to synthesize
-            voice: Not used (voice cloning via kwargs instead)
+            voice: Not used (voice cloning via reference_audio instead)
             speed: Not directly supported by VoxCPM
-            **kwargs: Additional parameters:
-                - prompt_wav_path: Path to reference audio for voice cloning
-                - prompt_text: Transcript of reference audio
+            reference_audio: Reference audio bytes for voice cloning
+            reference_text: Transcript of reference audio
 
         Returns:
             TTSResponse with audio data and metrics
@@ -95,43 +95,36 @@ async def synthesize(
         if self._model is None:
             raise EngineNotReadyError("VoxCPM model not loaded")
 
-        # Extract voice cloning parameters
-        prompt_wav_path = kwargs.get("prompt_wav_path")
-        prompt_text = kwargs.get("prompt_text")
-
-        # Validate prompt audio if provided
-        if prompt_wav_path is not None:
-            prompt_path = Path(prompt_wav_path)
-            if not prompt_path.exists():
-                raise SynthesisError(f"Prompt audio file not found: {prompt_wav_path}")
-
-        processing_start = time.time()
-        try:
-            # Generate audio using VoxCPM
-            wav = self._model.generate(
-                text=text,
-                prompt_wav_path=prompt_wav_path,
-                prompt_text=prompt_text,
-                cfg_value=self.voxcpm_config.cfg_value,
-                inference_timesteps=self.voxcpm_config.inference_timesteps,
-                normalize=self.voxcpm_config.normalize,
-                denoise=self.voxcpm_config.denoise,
-                retry_badcase=self.voxcpm_config.retry_badcase,
-                retry_badcase_max_times=self.voxcpm_config.retry_badcase_max_times,
-                retry_badcase_ratio_threshold=self.voxcpm_config.retry_badcase_ratio_threshold,
-            )
+        with temp_audio_file(reference_audio) as prompt_wav_path:
+            processing_start = time.time()
+            try:
+                # Generate audio using VoxCPM
+                wav = self._model.generate(
+                    text=text,
+                    prompt_wav_path=prompt_wav_path,
+                    prompt_text=reference_text,
+                    cfg_value=self.voxcpm_config.cfg_value,
+                    inference_timesteps=self.voxcpm_config.inference_timesteps,
+                    normalize=self.voxcpm_config.normalize,
+                    denoise=self.voxcpm_config.denoise,
+                    retry_badcase=self.voxcpm_config.retry_badcase,
+                    retry_badcase_max_times=self.voxcpm_config.retry_badcase_max_times,
+                    retry_badcase_ratio_threshold=self.voxcpm_config.retry_badcase_ratio_threshold,
+                )
 
-            # Get sample rate from model
-            sample_rate = self._model.tts_model.sample_rate
+                # Get sample rate from model
+                sample_rate = self._model.tts_model.sample_rate
 
-            # Convert numpy array to bytes (16-bit PCM WAV)
-            audio_bytes = self.audio_processor.numpy_to_wav_bytes(wav, sample_rate)
+                # Convert numpy array to bytes (16-bit PCM WAV)
+                audio_bytes = self.audio_processor.numpy_to_wav_bytes(wav, sample_rate)
 
-            # Calculate duration
-            duration_seconds = len(wav) / sample_rate
+                # Calculate duration
+                duration_seconds = len(wav) / sample_rate
 
-        except Exception as e:
-            raise SynthesisError(f"VoxCPM synthesis failed: {e}") from e
+            except Exception as e:
+                if isinstance(e, (EngineNotReadyError, SynthesisError)):
+                    raise
+                raise SynthesisError(f"VoxCPM synthesis failed: {e}") from e
 
         processing_end = time.time()
         end_time = time.time()
@@ -164,6 +157,10 @@ async def synthesize(
     async def synthesize_stream(
         self,
         text: str,
+        voice: str | None = None,
+        speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
         **kwargs,
     ) -> AsyncIterator[TTSChunk | TTSResponse]:
         """
@@ -171,7 +168,10 @@ async def synthesize_stream(
 
         Args:
             text: Text to synthesize
-            **kwargs: Additional parameters (same as synthesize)
+            voice: Not used (voice cloning via reference_audio instead)
+            speed: Not directly supported by VoxCPM
+            reference_audio: Reference audio bytes for voice cloning
+            reference_text: Transcript of reference audio
 
         Yields:
             TTSChunk: Audio chunks with progressive generation
@@ -187,104 +187,95 @@ async def synthesize_stream(
         if self._model is None:
             raise EngineNotReadyError("VoxCPM model not loaded")
 
-        # Extract voice cloning parameters
-        prompt_wav_path = kwargs.get("prompt_wav_path")
-        prompt_text = kwargs.get("prompt_text")
-
-        # Validate prompt audio if provided
-        if prompt_wav_path is not None:
-            prompt_path = Path(prompt_wav_path)
-            if not prompt_path.exists():
-                raise SynthesisError(f"Prompt audio file not found: {prompt_wav_path}")
-
-        try:
-            # Stream audio chunks
-            for chunk in self._model.generate_streaming(
-                text=text,
-                prompt_wav_path=prompt_wav_path,
-                prompt_text=prompt_text,
-                cfg_value=self.voxcpm_config.cfg_value,
-                inference_timesteps=self.voxcpm_config.inference_timesteps,
-                normalize=self.voxcpm_config.normalize,
-                denoise=self.voxcpm_config.denoise,
-                retry_badcase=self.voxcpm_config.retry_badcase,
-                retry_badcase_max_times=self.voxcpm_config.retry_badcase_max_times,
-                retry_badcase_ratio_threshold=self.voxcpm_config.retry_badcase_ratio_threshold,
-            ):
-                chunk_time = time.time()
-
-                if first_chunk_time is None:
-                    first_chunk_time = chunk_time
-
-                # Store raw numpy chunk for final concatenation
-                all_audio_chunks.append(chunk)
-
-                # Get sample rate from model
-                sample_rate = self._model.tts_model.sample_rate
-
-                # Convert chunk to bytes
-                chunk_bytes = self.audio_processor.numpy_to_wav_bytes(
-                    chunk, sample_rate
+        with temp_audio_file(reference_audio) as prompt_wav_path:
+            try:
+                # Stream audio chunks
+                for chunk in self._model.generate_streaming(
+                    text=text,
+                    prompt_wav_path=prompt_wav_path,
+                    prompt_text=reference_text,
+                    cfg_value=self.voxcpm_config.cfg_value,
+                    inference_timesteps=self.voxcpm_config.inference_timesteps,
+                    normalize=self.voxcpm_config.normalize,
+                    denoise=self.voxcpm_config.denoise,
+                    retry_badcase=self.voxcpm_config.retry_badcase,
+                    retry_badcase_max_times=self.voxcpm_config.retry_badcase_max_times,
+                    retry_badcase_ratio_threshold=self.voxcpm_config.retry_badcase_ratio_threshold,
+                ):
+                    chunk_time = time.time()
+
+                    if first_chunk_time is None:
+                        first_chunk_time = chunk_time
+
+                    # Store raw numpy chunk for final concatenation
+                    all_audio_chunks.append(chunk)
+
+                    # Get sample rate from model
+                    sample_rate = self._model.tts_model.sample_rate
+
+                    # Convert chunk to bytes
+                    chunk_bytes = self.audio_processor.numpy_to_wav_bytes(
+                        chunk, sample_rate
+                    )
+
+                    chunk_latency_ms = (chunk_time - start_time) * 1000
+
+                    yield TTSChunk(
+                        audio_data=chunk_bytes,
+                        sequence_number=total_chunks,
+                        chunk_latency_ms=chunk_latency_ms,
+                    )
+
+                    total_chunks += 1
+
+                # Final response
+                end_time = time.time()
+
+                # Concatenate all chunks
+                if all_audio_chunks:
+                    full_audio = np.concatenate(all_audio_chunks)
+                    sample_rate = self._model.tts_model.sample_rate
+                    audio_bytes = self.audio_processor.numpy_to_wav_bytes(
+                        full_audio, sample_rate
+                    )
+                    duration_seconds = len(full_audio) / sample_rate
+                else:
+                    audio_bytes = b""
+                    sample_rate = 16000
+                    duration_seconds = 0.0
+
+                total_duration_ms = (end_time - start_time) * 1000
+                time_to_first_byte_ms = (
+                    (first_chunk_time - start_time) * 1000 if first_chunk_time else None
                 )
 
-                chunk_latency_ms = (chunk_time - start_time) * 1000
-
-                yield TTSChunk(
-                    audio_data=chunk_bytes,
-                    sequence_number=total_chunks,
-                    chunk_latency_ms=chunk_latency_ms,
+                metrics = TTSPerformanceMetrics(
+                    latency_ms=total_duration_ms,
+                    processing_time_ms=total_duration_ms,
+                    audio_duration_ms=duration_seconds * 1000,
+                    real_time_factor=(
+                        total_duration_ms / (duration_seconds * 1000)
+                        if duration_seconds > 0
+                        else None
+                    ),
+                    characters_processed=len(text),
+                    time_to_first_byte_ms=time_to_first_byte_ms,
+                    total_stream_duration_ms=total_duration_ms,
+                    total_chunks=total_chunks,
                 )
 
-                total_chunks += 1
-
-            # Final response
-            end_time = time.time()
-
-            # Concatenate all chunks
-            if all_audio_chunks:
-                full_audio = np.concatenate(all_audio_chunks)
-                sample_rate = self._model.tts_model.sample_rate
-                audio_bytes = self.audio_processor.numpy_to_wav_bytes(
-                    full_audio, sample_rate
+                yield TTSResponse(
+                    audio_data=audio_bytes,
+                    sample_rate=sample_rate,
+                    duration_seconds=duration_seconds,
+                    format="wav",
+                    performance_metrics=metrics,
                 )
-                duration_seconds = len(full_audio) / sample_rate
-            else:
-                audio_bytes = b""
-                sample_rate = 16000
-                duration_seconds = 0.0
-
-            total_duration_ms = (end_time - start_time) * 1000
-            time_to_first_byte_ms = (
-                (first_chunk_time - start_time) * 1000 if first_chunk_time else None
-            )
-
-            metrics = TTSPerformanceMetrics(
-                latency_ms=total_duration_ms,
-                processing_time_ms=total_duration_ms,
-                audio_duration_ms=duration_seconds * 1000,
-                real_time_factor=(
-                    total_duration_ms / (duration_seconds * 1000)
-                    if duration_seconds > 0
-                    else None
-                ),
-                characters_processed=len(text),
-                time_to_first_byte_ms=time_to_first_byte_ms,
-                total_stream_duration_ms=total_duration_ms,
-                total_chunks=total_chunks,
-            )
 
-            yield TTSResponse(
-                audio_data=audio_bytes,
-                sample_rate=sample_rate,
-                duration_seconds=duration_seconds,
-                format="wav",
-                performance_metrics=metrics,
-            )
-
-        except Exception as e:
-            if isinstance(e, (EngineNotReadyError, SynthesisError)):
-                raise
-            raise SynthesisError(f"VoxCPM streaming failed: {e}") from e
+            except Exception as e:
+                if isinstance(e, (EngineNotReadyError, SynthesisError)):
+                    raise
+                raise SynthesisError(f"VoxCPM streaming failed: {e}") from e
 
     @property
     def supported_voices(self) -> list[str]:
diff --git a/app/utils/audio.py b/app/utils/audio.py
index 1569a4c..85a6fff 100644
--- a/app/utils/audio.py
+++ b/app/utils/audio.py
@@ -2,6 +2,10 @@
 
 import io
 import pathlib
+import tempfile
+from collections.abc import Generator
+from contextlib import contextmanager
+from pathlib import Path
 
 import librosa
 import numpy as np
@@ -11,6 +15,39 @@
 from app.types.audio import AudioInput
 
 
+@contextmanager
+def temp_audio_file(
+    audio_bytes: bytes | None, suffix: str = ".wav"
+) -> Generator[str | None, None, None]:
+    """
+    Context manager that saves audio bytes to a temp file and cleans up after use.
+
+    Args:
+        audio_bytes: Audio data as bytes, or None (yields None without creating file)
+        suffix: File extension (default: ".wav")
+
+    Yields:
+        Path to the temporary file, or None if audio_bytes is None
+
+    Example:
+        with temp_audio_file(audio_bytes) as temp_path:
+            model.generate(prompt_wav_path=temp_path)
+        # temp file is deleted on exit, even if the write or the body raises
+    """
+    if audio_bytes is None:
+        yield None
+        return
+
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+    try:
+        # Write inside the guarded region so a failed write cannot leak the file;
+        # the handle is closed before yielding so consumers can open the path.
+        with temp_file:
+            temp_file.write(audio_bytes)
+        yield temp_file.name
+    finally:
+        Path(temp_file.name).unlink(missing_ok=True)
+
+
 class AudioProcessor:
     """Handles audio format conversion, resampling, and validation"""
 
diff --git a/docs/api.md b/docs/api.md
index 28fd72e..e159d07 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -79,18 +79,23 @@ Batch transcription - upload audio file and receive complete transcription.
 |-----------|------|----------|-------------|
 | `engine` | string | Yes | Engine name (e.g., "whisper") |
 | `language` | string | No | Language hint (e.g., "en", "es") |
+
+**Form Parameters:**
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `audio` | file | Yes | Audio file (wav, mp3, flac, ogg, m4a, opus) |
 | `engine_params` | string | No | JSON string with engine-specific parameters |
 
 **Request:**
 
 - Content-Type: `multipart/form-data`
-- Body: `file` - Audio file (wav, mp3, flac, ogg, m4a, opus)
 
 **Example:**
 
 ```bash
 curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper&language=en" \
-  -F "file=@audio.wav"
+  -F "audio=@audio.wav"
 ```
 
 **Response:**
@@ -249,15 +254,36 @@ Batch synthesis - convert text to speech audio.
 | Parameter | Type | Required | Description |
 |-----------|------|----------|-------------|
 | `engine` | string | Yes | Engine name (e.g., "voxcpm") |
-| `text` | string | Yes | Text to synthesize |
 | `voice` | string | No | Voice name/ID to use |
 | `speed` | float | No | Speech speed multiplier (0 < speed <= 3.0, default: 1.0) |
+
+**Form Parameters:**
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `text` | string | Yes | Text to synthesize |
+| `reference_audio` | file | No | Reference audio file for voice cloning |
+| `reference_text` | string | No | Transcript of reference audio (required with reference_audio) |
 | `engine_params` | string | No | JSON string with engine-specific parameters |
 
+**Request:**
+
+- Content-Type: `multipart/form-data`
+
 **Example:**
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world"
+curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
+  -F "text=Hello world"
+```
+
+**Voice Cloning Example:**
+
+```bash
+curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
+  -F "text=Hello world" \
+  -F "reference_audio=@/path/to/reference.wav" \
+  -F "reference_text=This is the reference transcript"
 ```
 
 **Response:**
@@ -286,6 +312,14 @@ SSE (Server-Sent Events) streaming synthesis - receive progressive audio chunks.
 
 Same as `/tts/synthesize`.
 
+**Form Parameters:**
+
+Same as `/tts/synthesize`.
+
+**Request:**
+
+- Content-Type: `multipart/form-data`
+
 **Response:**
 
 Server-Sent Events stream with two event types:
@@ -307,32 +341,32 @@ data: {"audio_data": "<base64-full-audio>", "sample_rate": 22050, "duration_seco
 **Example:**
 
 ```bash
-curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm&text=Hello%20world"
+curl -N -X POST "http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm" \
+  -F "text=Hello world"
 ```
 
 **JavaScript Client Example:**
 
 ```javascript
-const params = new URLSearchParams({
-  engine: 'voxcpm',
-  text: 'Hello, how are you today?'
-});
+const formData = new FormData();
+formData.append('text', 'Hello, how are you today?');
 
-const eventSource = new EventSource(
-  `http://localhost:8000/api/v1/tts/synthesize/stream?${params}`
+const response = await fetch(
+  'http://localhost:8000/api/v1/tts/synthesize/stream?engine=voxcpm',
+  { method: 'POST', body: formData }
 );
 
-eventSource.addEventListener('chunk', (event) => {
-  const chunk = JSON.parse(event.data);
-  // Process audio chunk
-  console.log('Chunk:', chunk.sequence_number);
-});
+const reader = response.body.getReader();
+const decoder = new TextDecoder();
 
-eventSource.addEventListener('complete', (event) => {
-  const result = JSON.parse(event.data);
-  console.log('Complete, duration:', result.duration_seconds);
-  eventSource.close();
-});
+while (true) {
+  const { done, value } = await reader.read();
+  if (done) break;
+
+  const text = decoder.decode(value, { stream: true });
+  // Parse SSE events from text
+  console.log('Received:', text);
+}
 ```
 
 ---
@@ -474,7 +508,7 @@ All errors follow this format:
 
 ### Whisper (faster-whisper)
 
-Pass these via `engine_params` query parameter as JSON string.
+Pass these via `engine_params` form parameter as JSON string.
 
 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
@@ -489,29 +523,34 @@ Pass these via `engine_params` query parameter as JSON string.
 **Example:**
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper&engine_params={\"beam_size\":3,\"vad_filter\":true}" \
-  -F "file=@audio.wav"
+curl -X POST "http://localhost:8000/api/v1/stt/transcribe?engine=whisper" \
+  -F "audio=@audio.wav" \
+  -F 'engine_params={"beam_size":3,"vad_filter":true}'
 ```
 
 ### VoxCPM (Text-to-Speech)
 
-Pass these via `engine_params` query parameter as JSON string.
+VoxCPM supports zero-shot voice cloning from a reference recording. Upload the recording and its transcript as the top-level `reference_audio` and `reference_text` form parameters (they are no longer passed inside `engine_params`).
 
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `prompt_wav_path` | string | null | Path to reference audio for zero-shot voice cloning |
-| `prompt_text` | string | null | Transcript of the reference audio (required with prompt_wav_path) |
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `reference_audio` | file | Reference audio file for voice cloning (wav format recommended) |
+| `reference_text` | string | Transcript of the reference audio (required with reference_audio) |
 
 **Basic Example:**
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world"
+curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
+  -F "text=Hello world"
 ```
 
 **Voice Cloning Example:**
 
 ```bash
-curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm&text=Hello%20world&engine_params={\"prompt_wav_path\":\"/path/to/reference.wav\",\"prompt_text\":\"This%20is%20the%20reference%20transcript\"}"
+curl -X POST "http://localhost:8000/api/v1/tts/synthesize?engine=voxcpm" \
+  -F "text=Hello world" \
+  -F "reference_audio=@/path/to/reference.wav" \
+  -F "reference_text=This is the reference transcript"
 ```
 
 ---
diff --git a/docs/custom-engines.md b/docs/custom-engines.md
index 80d6cfc..f2afa0d 100644
--- a/docs/custom-engines.md
+++ b/docs/custom-engines.md
@@ -422,6 +422,8 @@ class MyTTSEngine(BaseTTSEngine):
         text: str,
         voice: str | None = None,
         speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
         **kwargs
     ) -> TTSResponse:
         """
@@ -431,6 +433,8 @@ class MyTTSEngine(BaseTTSEngine):
             text: Text to synthesize
             voice: Voice name (uses default if None)
             speed: Speech speed multiplier
+            reference_audio: Reference audio bytes for voice cloning (optional)
+            reference_text: Transcript of reference audio (required with reference_audio)
             **kwargs: Additional engine parameters
 
         Returns:
@@ -448,6 +452,21 @@ class MyTTSEngine(BaseTTSEngine):
         processing_start = time.time()
         try:
             # Your synthesis logic here
+            # If your model supports voice cloning:
+            if reference_audio is not None:
+                # Option 1: Model accepts bytes directly
+                # audio_data = self._model.synthesize(
+                #     text, reference_audio=reference_audio, reference_text=reference_text
+                # )
+
+                # Option 2: Model requires file path - use temp_audio_file helper
+                # from app.utils.audio import temp_audio_file
+                # with temp_audio_file(reference_audio) as ref_path:
+                #     audio_data = self._model.synthesize(
+                #         text, prompt_wav_path=ref_path, prompt_text=reference_text
+                #     )
+                pass
+
             audio_data = b"..."  # Generated audio bytes
             duration_seconds = 1.0  # Calculated duration
 
@@ -482,12 +501,24 @@ class MyTTSEngine(BaseTTSEngine):
     async def synthesize_stream(
         self,
         text: str,
+        voice: str | None = None,
+        speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
         **kwargs
     ) -> AsyncIterator[TTSChunk | TTSResponse]:
         """
         Streaming synthesis.
 
         Yields TTSChunk for audio chunks, then TTSResponse for final.
+
+        Args:
+            text: Text to synthesize
+            voice: Voice name (uses default if None)
+            speed: Speech speed multiplier
+            reference_audio: Reference audio bytes for voice cloning (optional)
+            reference_text: Transcript of reference audio (required with reference_audio)
+            **kwargs: Additional engine parameters
         """
         start_time = time.time()
         total_chunks = 0
@@ -627,6 +658,8 @@ class BaseTTSEngine(BaseEngine):
         text: str,
         voice: str | None = None,
         speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
         **kwargs
     ) -> TTSResponse: ...
 
@@ -634,6 +667,10 @@ class BaseTTSEngine(BaseEngine):
     async def synthesize_stream(
         self,
         text: str,
+        voice: str | None = None,
+        speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
         **kwargs
     ) -> AsyncIterator[TTSChunk | TTSResponse]: ...
 
diff --git a/tests/unit/api/conftest.py b/tests/unit/api/conftest.py
index f36e4c0..ebb08d8 100644
--- a/tests/unit/api/conftest.py
+++ b/tests/unit/api/conftest.py
@@ -63,7 +63,14 @@ def mock_tts_engine():
     )
     engine.synthesize = AsyncMock(return_value=mock_response)
 
-    async def mock_stream_generator(text, **kwargs):
+    async def mock_stream_generator(
+        text,
+        voice=None,
+        speed=1.0,
+        reference_audio=None,
+        reference_text=None,
+        **kwargs,
+    ):
         yield TTSChunk(audio_data=b"chunk1", sequence_number=0)
         yield TTSChunk(audio_data=b"chunk2", sequence_number=1)
         yield mock_response
diff --git a/tests/unit/api/test_stt_router.py b/tests/unit/api/test_stt_router.py
index a8a3cf2..922794a 100644
--- a/tests/unit/api/test_stt_router.py
+++ b/tests/unit/api/test_stt_router.py
@@ -55,7 +55,8 @@ def test_returns_400_for_invalid_json_params(self, client, test_audio_bytes):
         """Returns 400 for invalid JSON in engine_params"""
         response = client.post(
             "/api/v1/stt/transcribe",
-            params={"engine": "default", "engine_params": "invalid json{"},
+            params={"engine": "default"},
+            data={"engine_params": "invalid json{"},
             files={"audio": ("test.wav", test_audio_bytes, "audio/wav")},
         )
 
@@ -68,7 +69,8 @@ def test_passes_valid_engine_params(
         """Passes valid engine_params to engine"""
         response = client.post(
             "/api/v1/stt/transcribe",
-            params={"engine": "default", "engine_params": '{"beam_size": 5}'},
+            params={"engine": "default"},
+            data={"engine_params": '{"beam_size": 5}'},
             files={"audio": ("test.wav", test_audio_bytes, "audio/wav")},
         )
 
@@ -181,7 +183,8 @@ def test_returns_400_for_invalid_json_params(self, client, test_audio_bytes):
         """Returns 400 for invalid JSON"""
         response = client.post(
             "/api/v1/stt/transcribe/stream",
-            params={"engine": "default", "engine_params": "bad json"},
+            params={"engine": "default"},
+            data={"engine_params": "bad json"},
             files={"audio": ("test.wav", test_audio_bytes, "audio/wav")},
         )
 
@@ -307,7 +310,8 @@ def test_handles_non_dict_json_params(self, client, test_audio_bytes):
         # should convert to 500.
         response = client.post(
             "/api/v1/stt/transcribe",
-            params={"engine": "default", "engine_params": '["array"]'},
+            params={"engine": "default"},
+            data={"engine_params": '["array"]'},
             files={"audio": ("test.wav", test_audio_bytes, "audio/wav")},
         )
 
diff --git a/tests/unit/api/test_tts_router.py b/tests/unit/api/test_tts_router.py
index e0dbc39..2dcbdf0 100644
--- a/tests/unit/api/test_tts_router.py
+++ b/tests/unit/api/test_tts_router.py
@@ -10,7 +10,8 @@ def test_returns_200_with_valid_text(self, client_both):
         """Returns 200 with valid text and audio data"""
         response = client_both.post(
             "/api/v1/tts/synthesize",
-            params={"text": "Hello world", "engine": "default"},
+            params={"engine": "default"},
+            data={"text": "Hello world"},
         )
 
         assert response.status_code == 200
@@ -37,7 +38,8 @@ def test_returns_404_when_engine_not_found(self, client_both):
         """Returns 404 when engine not found"""
         response = client_both.post(
             "/api/v1/tts/synthesize",
-            params={"text": "Hello", "engine": "nonexistent"},
+            params={"engine": "nonexistent"},
+            data={"text": "Hello"},
         )
 
         assert response.status_code == 404
@@ -46,9 +48,9 @@ def test_returns_400_for_invalid_json_params(self, client_both):
         """Returns 400 for invalid JSON in engine_params"""
         response = client_both.post(
             "/api/v1/tts/synthesize",
-            params={
+            params={"engine": "default"},
+            data={
                 "text": "Hello",
-                "engine": "default",
                 "engine_params": "invalid json{",
             },
         )
@@ -60,9 +62,9 @@ def test_passes_valid_engine_params(self, client_both, mock_tts_engine):
         """Passes valid engine_params to engine"""
         response = client_both.post(
             "/api/v1/tts/synthesize",
-            params={
+            params={"engine": "default"},
+            data={
                 "text": "Hello",
-                "engine": "default",
                 "engine_params": '{"pitch": 1.5}',
             },
         )
@@ -78,11 +80,11 @@ def test_passes_voice_and_speed_params(self, client_both, mock_tts_engine):
         response = client_both.post(
             "/api/v1/tts/synthesize",
             params={
-                "text": "Hello",
                 "engine": "default",
                 "voice": "voice2",
                 "speed": 1.5,
             },
+            data={"text": "Hello"},
         )
 
         assert response.status_code == 200
@@ -90,6 +92,23 @@ def test_passes_voice_and_speed_params(self, client_both, mock_tts_engine):
         assert call_kwargs.get("voice") == "voice2"
         assert call_kwargs.get("speed") == 1.5
 
+    def test_passes_reference_audio_and_text(self, client_both, mock_tts_engine):
+        """Passes reference_audio and reference_text to engine for voice cloning"""
+        response = client_both.post(
+            "/api/v1/tts/synthesize",
+            params={"engine": "default"},
+            data={
+                "text": "Hello",
+                "reference_text": "Reference transcript",
+            },
+            files={"reference_audio": ("ref.wav", b"fake audio data", "audio/wav")},
+        )
+
+        assert response.status_code == 200
+        call_kwargs = mock_tts_engine.synthesize.call_args.kwargs
+        assert call_kwargs.get("reference_audio") == b"fake audio data"
+        assert call_kwargs.get("reference_text") == "Reference transcript"
+
 
 class TestSynthesizeStreamEndpoint:
     """POST /synthesize/stream tests"""
@@ -98,7 +117,8 @@ def test_returns_200_with_event_stream(self, client_both):
         """Returns 200 with event-stream content type"""
         response = client_both.post(
             "/api/v1/tts/synthesize/stream",
-            params={"text": "Hello", "engine": "default"},
+            params={"engine": "default"},
+            data={"text": "Hello"},
         )
 
         assert response.status_code == 200
@@ -108,7 +128,8 @@ def test_streams_chunks_and_complete(self, client_both):
         """Streams chunk events and complete event"""
         response = client_both.post(
             "/api/v1/tts/synthesize/stream",
-            params={"text": "Hello", "engine": "default"},
+            params={"engine": "default"},
+            data={"text": "Hello"},
         )
 
         # Parse SSE events
@@ -128,9 +149,9 @@ def test_returns_400_for_invalid_json_params(self, client_both):
         """Returns 400 for invalid JSON in engine_params"""
         response = client_both.post(
             "/api/v1/tts/synthesize/stream",
-            params={
+            params={"engine": "default"},
+            data={
                 "text": "Hello",
-                "engine": "default",
                 "engine_params": "bad json",
             },
         )
@@ -143,9 +164,9 @@ def test_passes_engine_params_stream(self, client_both, mock_tts_engine):
         """Passes engine_params to engine in stream mode"""
         response = client_both.post(
             "/api/v1/tts/synthesize/stream",
-            params={
+            params={"engine": "default"},
+            data={
                 "text": "Hello",
-                "engine": "default",
                 "engine_params": '{"style": "happy"}',
             },
         )
@@ -156,3 +177,20 @@ def test_passes_engine_params_stream(self, client_both, mock_tts_engine):
         mock_tts_engine.synthesize_stream.assert_called_once()
         call_kwargs = mock_tts_engine.synthesize_stream.call_args.kwargs
         assert call_kwargs.get("style") == "happy"
+
+    def test_passes_reference_audio_stream(self, client_both, mock_tts_engine):
+        """Passes reference_audio and reference_text to engine in stream mode"""
+        response = client_both.post(
+            "/api/v1/tts/synthesize/stream",
+            params={"engine": "default"},
+            data={
+                "text": "Hello",
+                "reference_text": "Reference",
+            },
+            files={"reference_audio": ("ref.wav", b"audio bytes", "audio/wav")},
+        )
+
+        assert response.status_code == 200
+        call_kwargs = mock_tts_engine.synthesize_stream.call_args.kwargs
+        assert call_kwargs.get("reference_audio") == b"audio bytes"
+        assert call_kwargs.get("reference_text") == "Reference"
diff --git a/tests/unit/engines/test_base.py b/tests/unit/engines/test_base.py
index 7da3554..7199adc 100644
--- a/tests/unit/engines/test_base.py
+++ b/tests/unit/engines/test_base.py
@@ -88,6 +88,9 @@ async def synthesize(
         text: str,
         voice: str | None = None,
         speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
+        **kwargs,
     ) -> TTSResponse:
         await self._ensure_ready()  # Auto-initialize if needed
         return TTSResponse(
@@ -100,7 +103,13 @@ async def synthesize(
         )
 
     async def synthesize_stream(
-        self, text: str, **kwargs
+        self,
+        text: str,
+        voice: str | None = None,
+        speed: float = 1.0,
+        reference_audio: bytes | None = None,
+        reference_text: str | None = None,
+        **kwargs,
     ) -> AsyncIterator[TTSChunk | TTSResponse]:
         await self._ensure_ready()
         yield TTSChunk(
diff --git a/tests/unit/engines/tts/voxcpm/test_voxcpm_engine.py b/tests/unit/engines/tts/voxcpm/test_voxcpm_engine.py
index acbd3ff..2a45029 100644
--- a/tests/unit/engines/tts/voxcpm/test_voxcpm_engine.py
+++ b/tests/unit/engines/tts/voxcpm/test_voxcpm_engine.py
@@ -40,12 +40,13 @@ def mock_voxcpm_model():
     # Mock generate method - returns numpy array
     mock_model.generate.return_value = np.zeros(16000, dtype=np.float32)  # 1 second
 
-    # Mock generate_streaming - yields chunks
+    # Mock generate_streaming - generator function yielding two audio chunks
     def mock_streaming(*args, **kwargs):
         yield np.zeros(4000, dtype=np.float32)  # 0.25 second chunk
         yield np.zeros(4000, dtype=np.float32)  # 0.25 second chunk
 
-    mock_model.generate_streaming = mock_streaming
+    # Wrap in MagicMock to track call arguments
+    mock_model.generate_streaming = MagicMock(side_effect=mock_streaming)
 
     return mock_model
 
@@ -206,58 +207,53 @@ async def test_synthesize_passes_config_params(self, config, mock_voxcpm_model):
         assert call_kwargs["denoise"] == config.denoise
 
     @pytest.mark.asyncio
-    async def test_synthesize_with_voice_cloning(
-        self, config, mock_voxcpm_model, tmp_path
-    ):
+    async def test_synthesize_with_voice_cloning(self, config, mock_voxcpm_model):
         """Synthesize should accept voice cloning parameters."""
         engine = VoxCPMEngine(config)
         engine._model = mock_voxcpm_model
         engine._initialized = True
 
-        # Create a temporary audio file
-        prompt_file = tmp_path / "prompt.wav"
-        prompt_file.touch()
+        # Use bytes for reference audio
+        reference_audio = b"fake audio bytes"
 
         await engine.synthesize(
             "Hello",
-            prompt_wav_path=str(prompt_file),
-            prompt_text="Reference text",
+            reference_audio=reference_audio,
+            reference_text="Reference text",
         )
 
         call_kwargs = mock_voxcpm_model.generate.call_args.kwargs
-        assert call_kwargs["prompt_wav_path"] == str(prompt_file)
+        # Engine saves bytes to temp file and passes path
+        assert call_kwargs["prompt_wav_path"] is not None
         assert call_kwargs["prompt_text"] == "Reference text"
 
     @pytest.mark.asyncio
-    async def test_synthesize_error_on_missing_prompt_file(
-        self, config, mock_voxcpm_model
-    ):
-        """Synthesize should raise error if prompt file doesn't exist."""
+    async def test_synthesize_wraps_model_errors(self, config, mock_voxcpm_model):
+        """Model errors should be wrapped in SynthesisError."""
         engine = VoxCPMEngine(config)
         engine._model = mock_voxcpm_model
         engine._initialized = True
 
+        mock_voxcpm_model.generate.side_effect = RuntimeError("Model failed")
+
         with pytest.raises(SynthesisError) as exc_info:
-            await engine.synthesize(
-                "Hello",
-                prompt_wav_path="/nonexistent/file.wav",
-            )
+            await engine.synthesize("Test")
 
-        assert "not found" in str(exc_info.value).lower()
+        assert "Model failed" in str(exc_info.value)
 
     @pytest.mark.asyncio
-    async def test_synthesize_wraps_model_errors(self, config, mock_voxcpm_model):
-        """Model errors should be wrapped in SynthesisError."""
+    async def test_synthesize_reraises_typed_errors(self, config, mock_voxcpm_model):
+        """Synthesize should re-raise SynthesisError as-is without wrapping."""
         engine = VoxCPMEngine(config)
         engine._model = mock_voxcpm_model
         engine._initialized = True
 
-        mock_voxcpm_model.generate.side_effect = RuntimeError("Model failed")
+        mock_voxcpm_model.generate.side_effect = SynthesisError("Already typed error")
 
         with pytest.raises(SynthesisError) as exc_info:
             await engine.synthesize("Test")
 
-        assert "Model failed" in str(exc_info.value)
+        assert "Already typed error" in str(exc_info.value)
 
     @pytest.mark.asyncio
     async def test_synthesize_model_not_loaded(self, config):
@@ -338,19 +334,25 @@ async def test_stream_model_not_loaded(self, config):
         assert "not loaded" in str(exc_info.value)
 
     @pytest.mark.asyncio
-    async def test_stream_missing_prompt_file(self, config, mock_voxcpm_model):
-        """Stream should raise SynthesisError if prompt file doesn't exist."""
+    async def test_stream_with_reference_audio(self, config, mock_voxcpm_model):
+        """Stream should pass reference_audio to the model as prompt_wav_path."""
         engine = VoxCPMEngine(config)
         engine._model = mock_voxcpm_model
         engine._initialized = True
 
-        with pytest.raises(SynthesisError) as exc_info:
-            async for _ in engine.synthesize_stream(
-                "Test", prompt_wav_path="/nonexistent.wav"
-            ):
-                pass
+        reference_audio = b"fake audio bytes"
+        results = []
+        async for item in engine.synthesize_stream(
+            "Test",
+            reference_audio=reference_audio,
+            reference_text="Reference",
+        ):
+            results.append(item)
 
-        assert "not found" in str(exc_info.value)
+        # Verify model was called with prompt_wav_path (temp file path)
+        call_kwargs = mock_voxcpm_model.generate_streaming.call_args[1]
+        assert call_kwargs["prompt_wav_path"] is not None
+        assert call_kwargs["prompt_text"] == "Reference"
 
     @pytest.mark.asyncio
     async def test_stream_empty_result(self, config, mock_voxcpm_model):