From 10db1bda8cee09805fcfe254516c2b25b2d9cb80 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 25 Aug 2025 13:59:46 +0300 Subject: [PATCH 01/22] Add a dependency group for Respeecher --- pyproject.toml | 1 + uv.lock | 22 +++++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d61c3efe3d..a4ea467639 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,6 +85,7 @@ openrouter = [] perplexity = [] playht = [ "websockets>=13.1,<15.0" ] qwen = [] +respeecher = [ "respeecher~=1.0" ] rime = [ "websockets>=13.1,<15.0" ] riva = [ "nvidia-riva-client~=2.21.1" ] runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.117.0", "pipecat-ai-small-webrtc-prebuilt>=1.0.0"] diff --git a/uv.lock b/uv.lock index bbe6a1ca87..19612d0a31 100644 --- a/uv.lock +++ b/uv.lock @@ -4315,6 +4315,9 @@ openpipe = [ playht = [ { name = "websockets" }, ] +respeecher = [ + { name = "respeecher" }, +] rime = [ { name = "websockets" }, ] @@ -4452,6 +4455,7 @@ requires-dist = [ { name = "python-dotenv", marker = "extra == 'runner'", specifier = ">=1.0.0,<2.0.0" }, { name = "pyvips", extras = ["binary"], marker = "extra == 'moondream'", specifier = "~=3.0.0" }, { name = "resampy", specifier = "~=0.4.3" }, + { name = "respeecher", marker = "extra == 'respeecher'", specifier = "~=1.0" }, { name = "sentry-sdk", marker = "extra == 'sentry'", specifier = "~=2.23.1" }, { name = "simli-ai", marker = "extra == 'simli'", specifier = "~=0.1.10" }, { name = "soundfile", marker = "extra == 'soundfile'", specifier = "~=0.13.0" }, @@ -4483,7 +4487,7 @@ requires-dist = [ { name = "websockets", marker = "extra == 'soniox'", specifier = ">=13.1,<15.0" }, { name = "websockets", marker = "extra == 'websocket'", specifier = ">=13.1,<15.0" }, ] -provides-extras = ["anthropic", "assemblyai", "asyncai", "aws", "aws-nova-sonic", "azure", "cartesia", "cerebras", "deepseek", "daily", "deepgram", "elevenlabs", "fal", "fireworks", "fish", "gladia", "google", "grok", "groq", "gstreamer", "heygen", "inworld", "krisp", "koala", "langchain", "livekit", "lmnt", "local", "mcp", "mem0", "mistral", "mlx-whisper", "moondream", "nim", "neuphonic", "noisereduce", "openai", "openpipe", "openrouter", "perplexity", "playht", "qwen", "rime", "riva", "runner", "sambanova", "sentry", "local-smart-turn", "remote-smart-turn", "silero", "simli", "soniox", "soundfile", "speechmatics", "tavus", "together", "tracing", "ultravox", "webrtc", "websocket", "whisper"] +provides-extras = ["anthropic", "assemblyai", "asyncai", "aws", "aws-nova-sonic", "azure", "cartesia", "cerebras", "deepseek", "daily", "deepgram", "elevenlabs", "fal", "fireworks", "fish", "gladia", "google", "grok", "groq", "gstreamer", "heygen", "inworld", "krisp", "koala", "langchain", "livekit", "lmnt", "local", "mcp", "mem0", "mistral", "mlx-whisper", "moondream", "nim", "neuphonic", "noisereduce", "openai", "openpipe", "openrouter", "perplexity", "playht", "qwen", "respeecher", "rime", "riva", "runner", "sambanova", "sentry", "local-smart-turn", "remote-smart-turn", "silero", "simli", "soniox", "soundfile", "speechmatics", "tavus", "together", "tracing", "ultravox", "webrtc", "websocket", "whisper"] [package.metadata.requires-dev] dev = [ @@ -5749,6 +5753,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/b9/3b00ac340a1aab3389ebcc52c779914a44aadf7b0cb7a3bf053195735607/resampy-0.4.3-py3-none-any.whl", hash = "sha256:ad2ed64516b140a122d96704e32bc0f92b23f45419e8b8f478e5a05f83edcebd", size = 3076529, upload-time = "2024-03-05T20:36:02.439Z" }, ] +[[package]] +name = "respeecher" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/a5/9eed1454f3c737733dffbd66c2212ea6a72168442393db6b52c138ba4dc7/respeecher-1.0.1.tar.gz", hash = "sha256:74a1f423d8a5fdd600b1634a1830525a0cb3996252687cc477c76254ce4d79a1", size = 28762, upload-time = "2025-08-25T10:50:53.533Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/bf/adc9d59abfdc6fee22ea8af97088b1713e090d4ab640a53d49e01e38b863/respeecher-1.0.1-py3-none-any.whl", hash = "sha256:7c940791cc310f939b2a9b7dc99f11830402b6b7eb934d7fb0282b38386f1f5c", size = 50565, upload-time = "2025-08-25T10:50:52.537Z" }, +] + [[package]] name = "rich" version = "14.1.0" From 24565b20615e62a6aea5c4b4cd6ccb145a95b80f Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 25 Aug 2025 15:33:12 +0300 Subject: [PATCH 02/22] Add Respeecher to env.example --- env.example | 3 +++ 1 file changed, 3 insertions(+) diff --git a/env.example b/env.example index fae6bb2a4d..81d662425a 100644 --- a/env.example +++ b/env.example @@ -140,3 +140,6 @@ SENTRY_DSN=... # Heygen HEYGEN_API_KEY=... + +# Respeecher +RESPEECHER_API_KEY=... From c488706a2728f9b095d755fd3c3821908017fe1f Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 25 Aug 2025 15:36:53 +0300 Subject: [PATCH 03/22] Spit the audio context functionality out --- src/pipecat/services/tts_service.py | 188 +++++++++++++++++++--------- 1 file changed, 129 insertions(+), 59 deletions(-) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 5562764944..77d1286935 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -702,29 +702,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): self._bot_speaking = False -class AudioContextWordTTSService(WebsocketWordTTSService): - """Websocket-based TTS service with word timestamps and audio context management. - - This is a base class for websocket-based TTS services that support word - timestamps and also allow correlating the generated audio with the requested - text. - - Each request could be multiple sentences long which are grouped by - context. For this to work, the TTS service needs to support handling - multiple requests at once (i.e. multiple simultaneous contexts). - - The audio received from the TTS will be played in context order. That is, if - we requested audio for a context "A" and then audio for context "B", the - audio from context ID "A" will be played first. - """ - - def __init__(self, **kwargs): - """Initialize the Audio Context Word TTS service. +class _AudioContextContainer: + """A container for audio contexts.""" - Args: - **kwargs: Additional arguments passed to the parent WebsocketWordTTSService. - """ - super().__init__(**kwargs) + def __init__(self): + """Initialize the container.""" self._contexts: Dict[str, asyncio.Queue] = {} self._audio_context_task = None @@ -777,43 +759,6 @@ def audio_context_available(self, context_id: str) -> bool: """ return context_id in self._contexts - async def start(self, frame: StartFrame): - """Start the audio context TTS service. - - Args: - frame: The start frame containing initialization parameters. - """ - await super().start(frame) - self._create_audio_context_task() - - async def stop(self, frame: EndFrame): - """Stop the audio context TTS service. - - Args: - frame: The end frame. - """ - await super().stop(frame) - if self._audio_context_task: - # Indicate no more audio contexts are available. this will end the - # task cleanly after all contexts have been processed. - await self._contexts_queue.put(None) - await self.wait_for_task(self._audio_context_task) - self._audio_context_task = None - - async def cancel(self, frame: CancelFrame): - """Cancel the audio context TTS service. - - Args: - frame: The cancel frame. - """ - await super().cancel(frame) - await self._stop_audio_context_task() - - async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): - await super()._handle_interruption(frame, direction) - await self._stop_audio_context_task() - self._create_audio_context_task() - def _create_audio_context_task(self): if not self._audio_context_task: self._contexts_queue = WatchdogQueue(self.task_manager) @@ -868,3 +813,128 @@ async def _handle_audio_context(self, context_id: str): # We didn't get audio, so let's consider this context finished. logger.trace(f"{self} time out on audio context {context_id}") break + + +class AudioContextTTSService(WebsocketTTSService, _AudioContextContainer): + """Websocket-based TTS service with audio context management. + + This is a base class for websocket-based TTS services that allow correlating + the generated audio with the requested text. + + Each request could be multiple sentences long which are grouped by + context. For this to work, the TTS service needs to support handling + multiple requests at once (i.e. multiple simultaneous contexts). + + The audio received from the TTS will be played in context order. That is, if + we requested audio for a context "A" and then audio for context "B", the + audio from context ID "A" will be played first. + """ + + def __init__(self, **kwargs): + """Initialize the Audio Context TTS service. + + Args: + **kwargs: Additional arguments passed to the parent WebsocketTTSService. + """ + WebSocketTTSService.__init__(self, **kwargs) + _AudioContextContainer.__init__(self) + + async def start(self, frame: StartFrame): + """Start the audio context TTS service. + + Args: + frame: The start frame containing initialization parameters. + """ + await super().start(frame) + self._create_audio_context_task() + + async def stop(self, frame: EndFrame): + """Stop the audio context TTS service. + + Args: + frame: The end frame. + """ + await super().stop(frame) + if self._audio_context_task: + # Indicate no more audio contexts are available. this will end the + # task cleanly after all contexts have been processed. + await self._contexts_queue.put(None) + await self.wait_for_task(self._audio_context_task) + self._audio_context_task = None + + async def cancel(self, frame: CancelFrame): + """Cancel the audio context TTS service. + + Args: + frame: The cancel frame. + """ + await super().cancel(frame) + await self._stop_audio_context_task() + + async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): + await super()._handle_interruption(frame, direction) + await self._stop_audio_context_task() + self._create_audio_context_task() + + +class AudioContextWordTTSService(WebsocketWordTTSService, _AudioContextContainer): + """Websocket-based TTS service with word timestamps and audio context management. + + This is a base class for websocket-based TTS services that support word + timestamps and also allow correlating the generated audio with the requested + text. + + Each request could be multiple sentences long which are grouped by + context. For this to work, the TTS service needs to support handling + multiple requests at once (i.e. multiple simultaneous contexts). + + The audio received from the TTS will be played in context order. That is, if + we requested audio for a context "A" and then audio for context "B", the + audio from context ID "A" will be played first. + """ + + def __init__(self, **kwargs): + """Initialize the Audio Context Word TTS service. + + Args: + **kwargs: Additional arguments passed to the parent WebsocketWordTTSService. + """ + WebSocketTTSService.__init__(self, **kwargs) + _AudioContextContainer.__init__(self) + + async def start(self, frame: StartFrame): + """Start the audio context TTS service. + + Args: + frame: The start frame containing initialization parameters. + """ + await super().start(frame) + self._create_audio_context_task() + + async def stop(self, frame: EndFrame): + """Stop the audio context TTS service. + + Args: + frame: The end frame. + """ + await super().stop(frame) + if self._audio_context_task: + # Indicate no more audio contexts are available. this will end the + # task cleanly after all contexts have been processed. + await self._contexts_queue.put(None) + await self.wait_for_task(self._audio_context_task) + self._audio_context_task = None + + async def cancel(self, frame: CancelFrame): + """Cancel the audio context TTS service. + + Args: + frame: The cancel frame. + """ + await super().cancel(frame) + await self._stop_audio_context_task() + + async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): + await super()._handle_interruption(frame, direction) + await self._stop_audio_context_task() + self._create_audio_context_task() From 1417791aaa7e626ba43608c1dddcac0e47718974 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Thu, 28 Aug 2025 15:38:25 +0300 Subject: [PATCH 04/22] Fix some typos --- src/pipecat/services/tts_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 77d1286935..966babb35a 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -836,7 +836,7 @@ def __init__(self, **kwargs): Args: **kwargs: Additional arguments passed to the parent WebsocketTTSService. """ - WebSocketTTSService.__init__(self, **kwargs) + WebsocketTTSService.__init__(self, **kwargs) _AudioContextContainer.__init__(self) async def start(self, frame: StartFrame): @@ -899,7 +899,7 @@ def __init__(self, **kwargs): Args: **kwargs: Additional arguments passed to the parent WebsocketWordTTSService. """ - WebSocketTTSService.__init__(self, **kwargs) + WebsocketWordTTSService.__init__(self, **kwargs) _AudioContextContainer.__init__(self) async def start(self, frame: StartFrame): From 051c5fc9f0c9111e1d8dad39fb190c30aabf762c Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Thu, 28 Aug 2025 16:11:07 +0300 Subject: [PATCH 05/22] Make the type checker happy --- src/pipecat/services/tts_service.py | 33 ++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 966babb35a..e224ef5b5a 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -7,8 +7,8 @@ """Base classes for Text-to-speech services.""" import asyncio -from abc import abstractmethod -from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Sequence, Tuple +from abc import ABC, abstractmethod +from typing import Any, AsyncGenerator, Coroutine, Dict, List, Mapping, Optional, Sequence, Tuple from loguru import logger @@ -37,6 +37,7 @@ from pipecat.services.ai_service import AIService from pipecat.services.websocket_service import WebsocketService from pipecat.transcriptions.language import Language +from pipecat.utils.asyncio.task_manager import BaseTaskManager from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue from pipecat.utils.text.base_text_aggregator import BaseTextAggregator from pipecat.utils.text.base_text_filter import BaseTextFilter @@ -702,7 +703,7 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): self._bot_speaking = False -class _AudioContextContainer: +class _AudioContextContainer(ABC): """A container for audio contexts.""" def __init__(self): @@ -814,6 +815,32 @@ async def _handle_audio_context(self, context_id: str): logger.trace(f"{self} time out on audio context {context_id}") break + @abstractmethod + def reset_watchdog(self) -> None: + pass + + @abstractmethod + def create_task(self, coroutine: Coroutine) -> asyncio.Task: + pass + + @abstractmethod + async def cancel_task(self, task: asyncio.Task) -> None: + pass + + @abstractmethod + async def push_frame(self, frame: Frame) -> None: + pass + + @property + @abstractmethod + def sample_rate(self) -> int: + pass + + @property + @abstractmethod + def task_manager(self) -> BaseTaskManager: + pass + class AudioContextTTSService(WebsocketTTSService, _AudioContextContainer): """Websocket-based TTS service with audio context management. From ee2f11dc4c9af6981638e7b6a465dc96b45189cd Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Thu, 28 Aug 2025 18:06:20 +0300 Subject: [PATCH 06/22] Rename a class --- src/pipecat/services/tts_service.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index e224ef5b5a..f4cd1891eb 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -703,11 +703,11 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): self._bot_speaking = False -class _AudioContextContainer(ABC): - """A container for audio contexts.""" +class _AudioContextBaseService(ABC): + """A service that supports audio contexts.""" def __init__(self): - """Initialize the container.""" + """Initialize the service.""" self._contexts: Dict[str, asyncio.Queue] = {} self._audio_context_task = None @@ -842,7 +842,7 @@ def task_manager(self) -> BaseTaskManager: pass -class AudioContextTTSService(WebsocketTTSService, _AudioContextContainer): +class AudioContextTTSService(WebsocketTTSService, _AudioContextBaseService): """Websocket-based TTS service with audio context management. This is a base class for websocket-based TTS services that allow correlating @@ -864,7 +864,7 @@ def __init__(self, **kwargs): **kwargs: Additional arguments passed to the parent WebsocketTTSService. """ WebsocketTTSService.__init__(self, **kwargs) - _AudioContextContainer.__init__(self) + _AudioContextBaseService.__init__(self) async def start(self, frame: StartFrame): """Start the audio context TTS service. @@ -904,7 +904,7 @@ async def _handle_interruption(self, frame: StartInterruptionFrame, direction: F self._create_audio_context_task() -class AudioContextWordTTSService(WebsocketWordTTSService, _AudioContextContainer): +class AudioContextWordTTSService(WebsocketWordTTSService, _AudioContextBaseService): """Websocket-based TTS service with word timestamps and audio context management. This is a base class for websocket-based TTS services that support word @@ -927,7 +927,7 @@ def __init__(self, **kwargs): **kwargs: Additional arguments passed to the parent WebsocketWordTTSService. """ WebsocketWordTTSService.__init__(self, **kwargs) - _AudioContextContainer.__init__(self) + _AudioContextBaseService.__init__(self) async def start(self, frame: StartFrame): """Start the audio context TTS service. From eb467d4e27322a768086062672efb2ed9d1e033b Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Thu, 28 Aug 2025 19:43:02 +0300 Subject: [PATCH 07/22] Add a link to the future docs page --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb66a4f271..7c086c8bfc 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ You can connect to Pipecat from any platform using our official SDKs: | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [SambaNova (Whisper)](https://docs.pipecat.ai/server/services/stt/sambanova), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | | LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova) [Together AI](https://docs.pipecat.ai/server/services/llm/together) | -| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | +| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Respeecher](https://docs.pipecat.ai/server/services/tts/respeecher), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | | Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai) | | Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), Local | | Serializers | [Plivo](https://docs.pipecat.ai/server/utilities/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/utilities/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/utilities/serializers/telnyx) | From c9a12b47089342c5464f57203103a0fd2d6b2b65 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Thu, 28 Aug 2025 19:48:20 +0300 Subject: [PATCH 08/22] Add initial implementation --- src/pipecat/services/respeecher/__init__.py | 0 src/pipecat/services/respeecher/tts.py | 290 ++++++++++++++++++++ 2 files changed, 290 insertions(+) create mode 100644 src/pipecat/services/respeecher/__init__.py create mode 100644 src/pipecat/services/respeecher/tts.py diff --git a/src/pipecat/services/respeecher/__init__.py b/src/pipecat/services/respeecher/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py new file mode 100644 index 0000000000..00c60b95eb --- /dev/null +++ b/src/pipecat/services/respeecher/tts.py @@ -0,0 +1,290 @@ +# +# Copyright (c) 2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Respeecher Space text-to-speech service implementation.""" + +import base64 +import json +import uuid +from typing import AsyncGenerator, Optional + +from loguru import logger +from pydantic import BaseModel + +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + ErrorFrame, + Frame, + StartFrame, + StartInterruptionFrame, + TTSAudioRawFrame, + TTSStartedFrame, + TTSStoppedFrame, +) +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.tts_service import AudioContextTTSService +from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator +from pipecat.utils.tracing.service_decorators import traced_tts + +# See .env.example for Respeecher configuration needed +try: + from respeecher import SamplingParams + from respeecher.tts import StreamingEncoding + from websockets.asyncio.client import connect as websocket_connect + from websockets.protocol import State +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use Respeecher, you need to `pip install pipecat-ai[respeecher]`.") + raise Exception(f"Missing module: {e}") + + +class RespeecherTTSService(AudioContextTTSService): + """Respeecher Space TTS service with WebSocket streaming and audio contexts. + + Provides text-to-speech using Respeecher's streaming WebSocket API. + Supports audio context management and voice customization via sampling parameters. + """ + + class InputParams(BaseModel): + """Input parameters for Respeecher TTS configuration. + + Parameters: + sampling_params: Sampling parameters used for speech synthesis. + """ + + sampling_params: SamplingParams = SamplingParams() + + def __init__( + self, + *, + api_key: str, + voice_id: str, + model: str = "public/tts/en-rt", + url: str = "wss://api.respeecher.com/v1", + sample_rate: Optional[int] = None, + encoding: StreamingEncoding = "pcm_s16le", + params: Optional[InputParams] = None, + **kwargs, + ): + """Initialize the Respeecher TTS service. + + Args: + api_key: Respeecher API key for authentication. + voice_id: ID of the voice to use for synthesis. + model: Model path for the Respeecher TTS API. + url: WebSocket base URL for Respeecher TTS API. + sample_rate: Audio sample rate. If None, uses default. + encoding: Audio encoding format. + params: Additional input parameters for voice customization. + **kwargs: Additional arguments passed to the parent service. + """ + super().__init__( + push_text_frames=False, + pause_frame_processing=True, + sample_rate=sample_rate, + **kwargs, + ) + + self._params = params or RespeecherTTSService.InputParams() + self._api_key = api_key + self._url = url + self._output_format = { + "encoding": encoding, + "sample_rate": sample_rate, + } + self.set_model_name(model) + self.set_voice(voice_id) + + self._context_id = None + self._receive_task = None + + def can_generate_metrics(self) -> bool: + """Check if this service can generate processing metrics. + + Returns: + True + """ + return True + + async def set_model(self, model: str): + """Set the TTS model. + + Args: + model: The model name to use for synthesis. + """ + self._model_id = model + await super().set_model(model) + await self._disconnect() + await self._connect() + logger.info(f"Switching TTS model to: [{model}]") + + def _build_msg(self, text: str = "", continue_transcript: bool = True): + msg = { + "transcript": text, + "continue": continue_transcript, + "context_id": self._context_id, + "voice": { + "id": self._voice_id, + "sampling_params": self._params.sampling_params.model_dump(), + }, + "output_format": self._output_format, + } + + return json.dumps(msg) + + async def start(self, frame: StartFrame): + """Start the Respeecher TTS service. + + Args: + frame: The start frame containing initialization parameters. + """ + await super().start(frame) + self._output_format["sample_rate"] = self.sample_rate + await self._connect() + + async def stop(self, frame: EndFrame): + """Stop the Respeecher TTS service. + + Args: + frame: The end frame. + """ + await super().stop(frame) + await self._disconnect() + + async def cancel(self, frame: CancelFrame): + """Stop the Respeecher TTS service. + + Args: + frame: The end frame. + """ + await super().cancel(frame) + await self._disconnect() + + async def _connect(self): + await self._connect_websocket() + + if self._websocket and not self._receive_task: + self._receive_task = self.create_task(self._receive_task_handler(self._report_error)) + + async def _disconnect(self): + if self._receive_task: + await self.cancel_task(self._receive_task) + self._receive_task = None + + await self._disconnect_websocket() + + async def _connect_websocket(self): + try: + if self._websocket and self._websocket.state is State.OPEN: + return + logger.debug("Connecting to Respeecher") + self._websocket = await websocket_connect( + f"{self._url}/{self._model_name}?api_key={self._api_key}" + ) + except Exception as e: + logger.error(f"{self} initialization error: {e}") + self._websocket = None + await self._call_event_handler("on_connection_error", f"{e}") + + async def _disconnect_websocket(self): + try: + await self.stop_all_metrics() + + if self._websocket: + logger.debug("Disconnecting from Respeecher") + await self._websocket.close() + except Exception as e: + logger.error(f"{self} error closing websocket: {e}") + finally: + self._context_id = None + self._websocket = None + + def _get_websocket(self): + if self._websocket: + return self._websocket + raise Exception("Websocket not connected") + + async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): + await super()._handle_interruption(frame, direction) + await self.stop_all_metrics() + if self._context_id: + cancel_msg = json.dumps({"context_id": self._context_id, "cancel": True}) + await self._get_websocket().send(cancel_msg) + self._context_id = None + + async def flush_audio(self): + """Flush any pending audio and finalize the current context.""" + if not self._context_id or not self._websocket: + return + logger.trace(f"{self}: flushing audio") + msg = self._build_msg(text="", continue_transcript=False) + await self._websocket.send(msg) + self._context_id = None + + async def _receive_messages(self): + async for message in WatchdogAsyncIterator( + self._get_websocket(), manager=self.task_manager + ): + msg = json.loads(message) + if not msg or not self.audio_context_available(msg["context_id"]): + continue + if msg["type"] == "done": + await self.stop_ttfb_metrics() + await self.remove_audio_context(msg["context_id"]) + elif msg["type"] == "chunk": + await self.stop_ttfb_metrics() + frame = TTSAudioRawFrame( + audio=base64.b64decode(msg["data"]), + sample_rate=self.sample_rate, + num_channels=1, + ) + await self.append_to_audio_context(msg["context_id"], frame) + elif msg["type"] == "error": + logger.error(f"{self} error: {msg}") + await self.push_frame(TTSStoppedFrame()) + await self.stop_all_metrics() + await self.push_error(ErrorFrame(f"{self} error: {msg['error']}")) + self._context_id = None + else: + logger.error(f"{self} error, unknown message type: {msg}") + + @traced_tts + async def run_tts(self, text: str) -> AsyncGenerator[Frame | None, None]: + """Generate speech from text using Respeecher's streaming API. + + Args: + text: The text to synthesize into speech. + + Yields: + Frame: Audio frames containing the synthesized speech. + """ + logger.debug(f"{self}: Generating TTS [{text}]") + + try: + if not self._websocket or self._websocket.state is State.CLOSED: + await self._connect() + + if not self._context_id: + await self.start_ttfb_metrics() + yield TTSStartedFrame() + self._context_id = str(uuid.uuid4()) + await self.create_audio_context(self._context_id) + + msg = self._build_msg(text=text) + + try: + await self._get_websocket().send(msg) + await self.start_tts_usage_metrics(text) + except Exception as e: + logger.error(f"{self} error sending message: {e}") + yield TTSStoppedFrame() + await self._disconnect() + await self._connect() + return + yield None + except Exception as e: + logger.error(f"{self} exception: {e}") From 5efe01be3906c4239c21ffd9031b2ee073b925ca Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Fri, 29 Aug 2025 21:00:39 +0300 Subject: [PATCH 09/22] Fix some issues --- src/pipecat/services/respeecher/tts.py | 37 +++++++++----------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index 00c60b95eb..598b83ed40 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -99,7 +99,6 @@ def __init__( self.set_model_name(model) self.set_voice(voice_id) - self._context_id = None self._receive_task = None def can_generate_metrics(self) -> bool: @@ -118,18 +117,17 @@ async def set_model(self, model: str): """ self._model_id = model await super().set_model(model) + logger.info(f"Switching TTS model to: [{model}]") await self._disconnect() await self._connect() - logger.info(f"Switching TTS model to: [{model}]") - def _build_msg(self, text: str = "", continue_transcript: bool = True): + def _build_msg(self, text: str, context_id: str): msg = { "transcript": text, - "continue": continue_transcript, - "context_id": self._context_id, + "context_id": context_id, "voice": { "id": self._voice_id, - "sampling_params": self._params.sampling_params.model_dump(), + "sampling_params": self._params.sampling_params.model_dump(exclude_none=True), }, "output_format": self._output_format, } @@ -183,7 +181,7 @@ async def _connect_websocket(self): return logger.debug("Connecting to Respeecher") self._websocket = await websocket_connect( - f"{self._url}/{self._model_name}?api_key={self._api_key}" + f"{self._url}/{self._model_name}/tts/websocket?api_key={self._api_key}" ) except Exception as e: logger.error(f"{self} initialization error: {e}") @@ -200,7 +198,6 @@ async def _disconnect_websocket(self): except Exception as e: logger.error(f"{self} error closing websocket: {e}") finally: - self._context_id = None self._websocket = None def _get_websocket(self): @@ -211,19 +208,10 @@ def _get_websocket(self): async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): await super()._handle_interruption(frame, direction) await self.stop_all_metrics() - if self._context_id: - cancel_msg = json.dumps({"context_id": self._context_id, "cancel": True}) + for context_id in self._contexts: + cancel_msg = json.dumps({"context_id": context_id, "cancel": True}) await self._get_websocket().send(cancel_msg) - self._context_id = None - - async def flush_audio(self): - """Flush any pending audio and finalize the current context.""" - if not self._context_id or not self._websocket: - return - logger.trace(f"{self}: flushing audio") - msg = self._build_msg(text="", continue_transcript=False) - await self._websocket.send(msg) - self._context_id = None + await self.remove_audio_context(context_id) async def _receive_messages(self): async for message in WatchdogAsyncIterator( @@ -248,7 +236,6 @@ async def _receive_messages(self): await self.push_frame(TTSStoppedFrame()) await self.stop_all_metrics() await self.push_error(ErrorFrame(f"{self} error: {msg['error']}")) - self._context_id = None else: logger.error(f"{self} error, unknown message type: {msg}") @@ -268,13 +255,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame | None, None]: if not self._websocket or self._websocket.state is State.CLOSED: await self._connect() - if not self._context_id: + if not self._contexts: await self.start_ttfb_metrics() yield TTSStartedFrame() - self._context_id = str(uuid.uuid4()) - await self.create_audio_context(self._context_id) - msg = self._build_msg(text=text) + context_id = str(uuid.uuid4()) + await self.create_audio_context(context_id) + msg = self._build_msg(text=text, context_id=context_id) try: await self._get_websocket().send(msg) From adadc6ab7c866d91eec170e96e7c0d221d6c6198 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 12:54:01 +0300 Subject: [PATCH 10/22] Refactor --- src/pipecat/services/respeecher/tts.py | 45 ++++++++++------ src/pipecat/services/tts_service.py | 74 ++++++++++++++++++-------- 2 files changed, 81 insertions(+), 38 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index 598b83ed40..2bb87ec4de 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -12,7 +12,7 @@ from typing import AsyncGenerator, Optional from loguru import logger -from pydantic import BaseModel +from pydantic import BaseModel, TypeAdapter, ValidationError from pipecat.frames.frames import ( CancelFrame, @@ -33,6 +33,7 @@ # See .env.example for Respeecher configuration needed try: from respeecher import SamplingParams + from respeecher.tts import Response as TTSResponse from respeecher.tts import StreamingEncoding from websockets.asyncio.client import connect as websocket_connect from websockets.protocol import State @@ -68,6 +69,7 @@ def __init__( sample_rate: Optional[int] = None, encoding: StreamingEncoding = "pcm_s16le", params: Optional[InputParams] = None, + additional_silence_between_sentences_time_s: float = 0, **kwargs, ): """Initialize the Respeecher TTS service. @@ -80,12 +82,14 @@ def __init__( sample_rate: Audio sample rate. If None, uses default. encoding: Audio encoding format. params: Additional input parameters for voice customization. + additional_silence_between_sentences_time_s: Duration of additional silence to insert between sentences in seconds. **kwargs: Additional arguments passed to the parent service. """ super().__init__( push_text_frames=False, pause_frame_processing=True, sample_rate=sample_rate, + silence_between_contexts_time_s=additional_silence_between_sentences_time_s, **kwargs, ) @@ -217,27 +221,38 @@ async def _receive_messages(self): async for message in WatchdogAsyncIterator( self._get_websocket(), manager=self.task_manager ): - msg = json.loads(message) - if not msg or not self.audio_context_available(msg["context_id"]): + try: + response = TypeAdapter(TTSResponse).validate_json(message) + except ValidationError as e: + logger.error(f"{self} cannot parse message: {e}") + continue + + if response.context_id is not None and not self.audio_context_available( + response.context_id + ): + logger.error( + f"{self} error, received {response.type} for unknown context_id: {response.context_id}" + ) continue - if msg["type"] == "done": + + if response.type == "error": + logger.error(f"{self} error: {response}") + await self.push_frame(TTSStoppedFrame()) + await self.stop_all_metrics() + await self.push_error(ErrorFrame(f"{self} error: {response.error}")) + continue + + if response.type == "done": await self.stop_ttfb_metrics() - await self.remove_audio_context(msg["context_id"]) - elif msg["type"] == "chunk": + await self.remove_audio_context(response.context_id) + elif response.type == "chunk": await self.stop_ttfb_metrics() frame = TTSAudioRawFrame( - audio=base64.b64decode(msg["data"]), + audio=base64.b64decode(response.data), sample_rate=self.sample_rate, num_channels=1, ) - await self.append_to_audio_context(msg["context_id"], frame) - elif msg["type"] == "error": - logger.error(f"{self} error: {msg}") - await self.push_frame(TTSStoppedFrame()) - await self.stop_all_metrics() - await self.push_error(ErrorFrame(f"{self} error: {msg['error']}")) - else: - logger.error(f"{self} error, unknown message type: {msg}") + await self.append_to_audio_context(response.context_id, frame) @traced_tts async def run_tts(self, text: str) -> AsyncGenerator[Frame | None, None]: diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index f4cd1891eb..4415a79ddc 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -347,12 +347,7 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect direction: The direction to push the frame. """ if self._push_silence_after_stop and isinstance(frame, TTSStoppedFrame): - silence_num_bytes = int(self._silence_time_s * self.sample_rate * 2) # 16-bit - silence_frame = TTSAudioRawFrame( - audio=b"\x00" * silence_num_bytes, - sample_rate=self.sample_rate, - num_channels=1, - ) + silence_frame = self.silence_frame(self._silence_time_s) silence_frame.transport_destination = self._transport_destination await self.push_frame(silence_frame) @@ -369,6 +364,20 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect ): await self._stop_frame_queue.put(frame) + def silence_frame(self, duration_s: float) -> TTSAudioRawFrame: + """Create a frame of silence. + + Args: + duration_s: Silence duration in seconds. + """ + silence_num_bytes = int(duration_s * self.sample_rate * 2) # 16-bit + + return TTSAudioRawFrame( + audio=b"\x00" * silence_num_bytes, + sample_rate=self.sample_rate, + num_channels=1, + ) + async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): self._processing_text = False await self._text_aggregator.handle_interruption() @@ -703,13 +712,25 @@ async def process_frame(self, frame: Frame, direction: FrameDirection): self._bot_speaking = False -class _AudioContextBaseService(ABC): - """A service that supports audio contexts.""" +class _AudioContextServiceMixin(ABC): + """A service that supports audio contexts. + + This class does not inherit from other service base classes to avoid + diamond inheritance. + """ - def __init__(self): - """Initialize the service.""" + def __init__( + self, + silence_between_contexts_time_s: float = 1.0, + ): + """Initialize the service. + + Args: + silence_between_contexts_time_s: Duration of silence to push between contexts. + """ self._contexts: Dict[str, asyncio.Queue] = {} self._audio_context_task = None + self._silence_between_contexts_time_s = silence_between_contexts_time_s async def create_audio_context(self, context_id: str): """Create a new audio context for grouping related audio. @@ -787,11 +808,9 @@ async def _audio_context_task_handler(self): del self._contexts[context_id] # Append some silence between sentences. - silence = b"\x00" * self.sample_rate - frame = TTSAudioRawFrame( - audio=silence, sample_rate=self.sample_rate, num_channels=1 - ) - await self.push_frame(frame) + if self._silence_between_contexts_time_s > 0: + silence_frame = self.silence_frame(self._silence_between_contexts_time_s) + await self.push_frame(silence_frame) else: running = False @@ -831,9 +850,8 @@ async def cancel_task(self, task: asyncio.Task) -> None: async def push_frame(self, frame: Frame) -> None: pass - @property @abstractmethod - def sample_rate(self) -> int: + def silence_frame(self, duration_s: float) -> TTSAudioRawFrame: pass @property @@ -842,7 +860,7 @@ def task_manager(self) -> BaseTaskManager: pass -class AudioContextTTSService(WebsocketTTSService, _AudioContextBaseService): +class AudioContextTTSService(WebsocketTTSService, _AudioContextServiceMixin): """Websocket-based TTS service with audio context management. This is a base class for websocket-based TTS services that allow correlating @@ -857,14 +875,19 @@ class AudioContextTTSService(WebsocketTTSService, _AudioContextBaseService): audio from context ID "A" will be played first. """ - def __init__(self, **kwargs): + def __init__( + self, + silence_between_contexts_time_s: float = 1.0, + **kwargs, + ): """Initialize the Audio Context TTS service. Args: + silence_between_contexts_time_s: Duration of silence to push between contexts. **kwargs: Additional arguments passed to the parent WebsocketTTSService. """ WebsocketTTSService.__init__(self, **kwargs) - _AudioContextBaseService.__init__(self) + _AudioContextServiceMixin.__init__(self, silence_between_contexts_time_s) async def start(self, frame: StartFrame): """Start the audio context TTS service. @@ -904,7 +927,7 @@ async def _handle_interruption(self, frame: StartInterruptionFrame, direction: F self._create_audio_context_task() -class AudioContextWordTTSService(WebsocketWordTTSService, _AudioContextBaseService): +class AudioContextWordTTSService(WebsocketWordTTSService, _AudioContextServiceMixin): """Websocket-based TTS service with word timestamps and audio context management. This is a base class for websocket-based TTS services that support word @@ -920,14 +943,19 @@ class AudioContextWordTTSService(WebsocketWordTTSService, _AudioContextBaseServi audio from context ID "A" will be played first. """ - def __init__(self, **kwargs): + def __init__( + self, + silence_between_contexts_time_s: float = 1.0, + **kwargs, + ): """Initialize the Audio Context Word TTS service. Args: + silence_between_contexts_time_s: Duration of silence to push between contexts. **kwargs: Additional arguments passed to the parent WebsocketWordTTSService. """ WebsocketWordTTSService.__init__(self, **kwargs) - _AudioContextBaseService.__init__(self) + _AudioContextServiceMixin.__init__(self, silence_between_contexts_time_s) async def start(self, frame: StartFrame): """Start the audio context TTS service. From 554ff8e83fc097b287313d3fc5b07a81bd847544 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 13:02:44 +0300 Subject: [PATCH 11/22] Forbid overriding encoding --- src/pipecat/services/respeecher/tts.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index 2bb87ec4de..65eee0472b 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -34,7 +34,6 @@ try: from respeecher import SamplingParams from respeecher.tts import Response as TTSResponse - from respeecher.tts import StreamingEncoding from websockets.asyncio.client import connect as websocket_connect from websockets.protocol import State except ModuleNotFoundError as e: @@ -67,7 +66,6 @@ def __init__( model: str = "public/tts/en-rt", url: str = "wss://api.respeecher.com/v1", sample_rate: Optional[int] = None, - encoding: StreamingEncoding = "pcm_s16le", params: Optional[InputParams] = None, additional_silence_between_sentences_time_s: float = 0, **kwargs, @@ -80,7 +78,6 @@ def __init__( model: Model path for the Respeecher TTS API. url: WebSocket base URL for Respeecher TTS API. sample_rate: Audio sample rate. If None, uses default. - encoding: Audio encoding format. params: Additional input parameters for voice customization. additional_silence_between_sentences_time_s: Duration of additional silence to insert between sentences in seconds. **kwargs: Additional arguments passed to the parent service. @@ -97,7 +94,7 @@ def __init__( self._api_key = api_key self._url = url self._output_format = { - "encoding": encoding, + "encoding": "pcm_s16le", "sample_rate": sample_rate, } self.set_model_name(model) From d26e2fb0488369664c28406980c1ec8118b7f3f1 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 13:13:59 +0300 Subject: [PATCH 12/22] Use a TypedDict for convenience --- src/pipecat/services/respeecher/tts.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index 65eee0472b..d1324231e7 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -34,6 +34,9 @@ try: from respeecher import SamplingParams from respeecher.tts import Response as TTSResponse + from respeecher.voices import ( + SamplingParamsParams as SamplingParams, # TypedDict instead of a Pydantic model + ) from websockets.asyncio.client import connect as websocket_connect from websockets.protocol import State except ModuleNotFoundError as e: @@ -56,7 +59,7 @@ class InputParams(BaseModel): sampling_params: Sampling parameters used for speech synthesis. """ - sampling_params: SamplingParams = SamplingParams() + sampling_params: SamplingParams = {} def __init__( self, @@ -128,7 +131,7 @@ def _build_msg(self, text: str, context_id: str): "context_id": context_id, "voice": { "id": self._voice_id, - "sampling_params": self._params.sampling_params.model_dump(exclude_none=True), + "sampling_params": self._params.sampling_params, }, "output_format": self._output_format, } From 44b3ed044db70dd4c3e4523a96896b0952e40d15 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 13:21:01 +0300 Subject: [PATCH 13/22] Add some more type checking --- src/pipecat/services/respeecher/tts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index d1324231e7..d0ad06f2f1 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -32,7 +32,7 @@ # See .env.example for Respeecher configuration needed try: - from respeecher import SamplingParams + from respeecher.tts import ContextfulGenerationRequestParams, StreamingOutputFormatParams from respeecher.tts import Response as TTSResponse from respeecher.voices import ( SamplingParamsParams as SamplingParams, # TypedDict instead of a Pydantic model @@ -96,9 +96,9 @@ def __init__( self._params = params or RespeecherTTSService.InputParams() self._api_key = api_key self._url = url - self._output_format = { + self._output_format: StreamingOutputFormatParams = { "encoding": "pcm_s16le", - "sample_rate": sample_rate, + "sample_rate": sample_rate or 0, } self.set_model_name(model) self.set_voice(voice_id) @@ -126,7 +126,7 @@ async def set_model(self, model: str): await self._connect() def _build_msg(self, text: str, context_id: str): - msg = { + msg: ContextfulGenerationRequestParams = { "transcript": text, "context_id": context_id, "voice": { From 999d073ef7d3631666a98e4d4e73c5b4b0b97f7d Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 14:28:59 +0300 Subject: [PATCH 14/22] Add a foundational example --- .../07ad-interruptible-respeecher.py | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 examples/foundational/07ad-interruptible-respeecher.py diff --git a/examples/foundational/07ad-interruptible-respeecher.py b/examples/foundational/07ad-interruptible-respeecher.py new file mode 100644 index 0000000000..a70fd4d33f --- /dev/null +++ b/examples/foundational/07ad-interruptible-respeecher.py @@ -0,0 +1,124 @@ +# +# Copyright (c) 2024–2025, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext +from pipecat.runner.types import RunnerArguments +from pipecat.runner.utils import create_transport +from pipecat.services.deepgram.stt import DeepgramSTTService +from pipecat.services.openai.llm import OpenAILLMService +from pipecat.services.respeecher.tts import RespeecherTTSService +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams +from pipecat.transports.services.daily import DailyParams + +load_dotenv(override=True) + +# We store functions so objects (e.g. SileroVADAnalyzer) don't get +# instantiated. The function will be called when the desired transport gets +# selected. +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + audio_out_enabled=True, + vad_analyzer=SileroVADAnalyzer(), + ), +} + + +async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): + logger.info(f"Starting bot") + + stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + + tts = RespeecherTTSService( + api_key=os.getenv("RESPEECHER_API_KEY"), + voice_id="samantha", + sampling_params={ + # Optional sampling params overrides + # See https://space.respeecher.com/docs/api/tts/sampling-params-guide + # "temperature": 0.5 + }, + ) + + llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + context = OpenAILLMContext(messages) + context_aggregator = llm.create_context_aggregator(context) + + pipeline = Pipeline( + [ + transport.input(), # Transport user input + stt, + context_aggregator.user(), # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + context_aggregator.assistant(), # Assistant spoken responses + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(transport, client): + logger.info(f"Client connected") + # Kick off the conversation. + messages.append({"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frames([context_aggregator.user().get_context_frame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) + + await runner.run(task) + + +async def bot(runner_args: RunnerArguments): + """Main bot entry point compatible with Pipecat Cloud.""" + transport = await create_transport(runner_args, transport_params) + await run_bot(transport, runner_args) + + +if __name__ == "__main__": + from pipecat.runner.run import main + + main() From f640eae9fbe70d18ff2fedc8d111a66798b3ada7 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 15:13:33 +0300 Subject: [PATCH 15/22] Fix an example --- .../foundational/07ad-interruptible-respeecher.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/foundational/07ad-interruptible-respeecher.py b/examples/foundational/07ad-interruptible-respeecher.py index a70fd4d33f..c07183075a 100644 --- a/examples/foundational/07ad-interruptible-respeecher.py +++ b/examples/foundational/07ad-interruptible-respeecher.py @@ -55,11 +55,13 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): tts = RespeecherTTSService( api_key=os.getenv("RESPEECHER_API_KEY"), voice_id="samantha", - sampling_params={ - # Optional sampling params overrides - # See https://space.respeecher.com/docs/api/tts/sampling-params-guide - # "temperature": 0.5 - }, + params=RespeecherTTSService.InputParams( + sampling_params={ + # Optional sampling params overrides + # See https://space.respeecher.com/docs/api/tts/sampling-params-guide + # "temperature": 0.5 + }, + ), ) llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) From 1e19dac2df74d23db3db24238dc583819f9137b4 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 15:26:50 +0300 Subject: [PATCH 16/22] Move sampling params into settings --- src/pipecat/services/respeecher/tts.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index d0ad06f2f1..a8438c9c01 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -93,13 +93,15 @@ def __init__( **kwargs, ) - self._params = params or RespeecherTTSService.InputParams() + params = params or RespeecherTTSService.InputParams() + self._api_key = api_key self._url = url self._output_format: StreamingOutputFormatParams = { "encoding": "pcm_s16le", "sample_rate": sample_rate or 0, } + self._settings = {"sampling_params": params.sampling_params} self.set_model_name(model) self.set_voice(voice_id) @@ -131,7 +133,7 @@ def _build_msg(self, text: str, context_id: str): "context_id": context_id, "voice": { "id": self._voice_id, - "sampling_params": self._params.sampling_params, + "sampling_params": self._settings["sampling_params"], }, "output_format": self._output_format, } From d565fc0a4a28d67a92da06efc743efff2af06ec9 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 17:14:12 +0300 Subject: [PATCH 17/22] Switch back to single context --- src/pipecat/services/respeecher/tts.py | 32 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index a8438c9c01..f9bbbc2c91 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -105,6 +105,7 @@ def __init__( self.set_model_name(model) self.set_voice(voice_id) + self._context_id: str | None = None self._receive_task = None def can_generate_metrics(self) -> bool: @@ -127,10 +128,13 @@ async def set_model(self, model: str): await self._disconnect() await self._connect() - def _build_msg(self, text: str, context_id: str): + def _build_msg(self, text: str, continue_transcript: bool = True): + assert self._context_id is not None + msg: ContextfulGenerationRequestParams = { "transcript": text, - "context_id": context_id, + "continue": continue_transcript, + "context_id": self._context_id, "voice": { "id": self._voice_id, "sampling_params": self._settings["sampling_params"], @@ -191,6 +195,7 @@ async def _connect_websocket(self): ) except Exception as e: logger.error(f"{self} initialization error: {e}") + self._context_id = None self._websocket = None await self._call_event_handler("on_connection_error", f"{e}") @@ -214,10 +219,19 @@ def _get_websocket(self): async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): await super()._handle_interruption(frame, direction) await self.stop_all_metrics() - for context_id in self._contexts: - cancel_msg = json.dumps({"context_id": context_id, "cancel": True}) + if self._context_id: + cancel_msg = json.dumps({"context_id": self._context_id, "cancel": True}) await self._get_websocket().send(cancel_msg) - await self.remove_audio_context(context_id) + self._context_id = None + + async def flush_audio(self): + """Flush any pending audio and finalize the current context.""" + if not self._context_id or not self._websocket: + return + logger.trace(f"{self}: flushing audio") + msg = self._build_msg(text="", continue_transcript=False) + await self._websocket.send(msg) + self._context_id = None async def _receive_messages(self): async for message in WatchdogAsyncIterator( @@ -272,13 +286,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame | None, None]: if not self._websocket or self._websocket.state is State.CLOSED: await self._connect() - if not self._contexts: + if not self._context_id: await self.start_ttfb_metrics() yield TTSStartedFrame() + self._context_id = str(uuid.uuid4()) + await self.create_audio_context(self._context_id) - context_id = str(uuid.uuid4()) - await self.create_audio_context(context_id) - msg = self._build_msg(text=text, context_id=context_id) + msg = self._build_msg(text=text) try: await self._get_websocket().send(msg) From fa4624b14e9677306e0bd63e99e9a967734d101d Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 18:07:29 +0300 Subject: [PATCH 18/22] Refactor --- src/pipecat/services/respeecher/tts.py | 37 +++++++++++++++++-------- src/pipecat/services/tts_service.py | 38 +++++++------------------- 2 files changed, 35 insertions(+), 40 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index f9bbbc2c91..a5433726fb 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -19,6 +19,7 @@ EndFrame, ErrorFrame, Frame, + LLMFullResponseEndFrame, StartFrame, StartInterruptionFrame, TTSAudioRawFrame, @@ -70,7 +71,7 @@ def __init__( url: str = "wss://api.respeecher.com/v1", sample_rate: Optional[int] = None, params: Optional[InputParams] = None, - additional_silence_between_sentences_time_s: float = 0, + aggregate_sentences: bool = False, **kwargs, ): """Initialize the Respeecher TTS service. @@ -82,14 +83,14 @@ def __init__( url: WebSocket base URL for Respeecher TTS API. sample_rate: Audio sample rate. If None, uses default. params: Additional input parameters for voice customization. - additional_silence_between_sentences_time_s: Duration of additional silence to insert between sentences in seconds. + aggregate_sentences: Whether to aggregate text into sentences client-side. **kwargs: Additional arguments passed to the parent service. """ super().__init__( push_text_frames=False, pause_frame_processing=True, sample_rate=sample_rate, - silence_between_contexts_time_s=additional_silence_between_sentences_time_s, + aggregate_sentences=aggregate_sentences, **kwargs, ) @@ -128,10 +129,10 @@ async def set_model(self, model: str): await self._disconnect() await self._connect() - def _build_msg(self, text: str, continue_transcript: bool = True): + def _build_request(self, text: str, continue_transcript: bool = True): assert self._context_id is not None - msg: ContextfulGenerationRequestParams = { + request: ContextfulGenerationRequestParams = { "transcript": text, "continue": continue_transcript, "context_id": self._context_id, @@ -142,7 +143,7 @@ def _build_msg(self, text: str, continue_transcript: bool = True): "output_format": self._output_format, } - return json.dumps(msg) + return json.dumps(request) async def start(self, frame: StartFrame): """Start the Respeecher TTS service. @@ -220,17 +221,29 @@ async def _handle_interruption(self, frame: StartInterruptionFrame, direction: F await super()._handle_interruption(frame, direction) await self.stop_all_metrics() if self._context_id: - cancel_msg = json.dumps({"context_id": self._context_id, "cancel": True}) - await self._get_websocket().send(cancel_msg) + cancel_request = json.dumps({"context_id": self._context_id, "cancel": True}) + await self._get_websocket().send(cancel_request) self._context_id = None + async def process_frame(self, frame: Frame, direction: FrameDirection): + """Process frames with context awareness. + + Args: + frame: The frame to process. + direction: The direction of frame processing. + """ + await super().process_frame(frame, direction) + + if isinstance(frame, (LLMFullResponseEndFrame, EndFrame)): + await self.flush_audio() + async def flush_audio(self): """Flush any pending audio and finalize the current context.""" if not self._context_id or not self._websocket: return logger.trace(f"{self}: flushing audio") - msg = self._build_msg(text="", continue_transcript=False) - await self._websocket.send(msg) + flush_request = self._build_request(text="", continue_transcript=False) + await self._websocket.send(flush_request) self._context_id = None async def _receive_messages(self): @@ -292,10 +305,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame | None, None]: self._context_id = str(uuid.uuid4()) await self.create_audio_context(self._context_id) - msg = self._build_msg(text=text) + generation_request = self._build_request(text=text) try: - await self._get_websocket().send(msg) + await self._get_websocket().send(generation_request) await self.start_tts_usage_metrics(text) except Exception as e: logger.error(f"{self} error sending message: {e}") diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 4415a79ddc..3cf0e73d73 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -719,18 +719,10 @@ class _AudioContextServiceMixin(ABC): diamond inheritance. """ - def __init__( - self, - silence_between_contexts_time_s: float = 1.0, - ): - """Initialize the service. - - Args: - silence_between_contexts_time_s: Duration of silence to push between contexts. - """ + def __init__(self): + """Initialize the service.""" self._contexts: Dict[str, asyncio.Queue] = {} self._audio_context_task = None - self._silence_between_contexts_time_s = silence_between_contexts_time_s async def create_audio_context(self, context_id: str): """Create a new audio context for grouping related audio. @@ -807,10 +799,10 @@ async def _audio_context_task_handler(self): # We just finished processing the context, so we can safely remove it. del self._contexts[context_id] - # Append some silence between sentences. - if self._silence_between_contexts_time_s > 0: - silence_frame = self.silence_frame(self._silence_between_contexts_time_s) - await self.push_frame(silence_frame) + # Append some silence between contexts. + SILENCE_BETWEEN_CONTEXTS = 1 + silence_frame = self.silence_frame(SILENCE_BETWEEN_CONTEXTS) + await self.push_frame(silence_frame) else: running = False @@ -875,19 +867,14 @@ class AudioContextTTSService(WebsocketTTSService, _AudioContextServiceMixin): audio from context ID "A" will be played first. """ - def __init__( - self, - silence_between_contexts_time_s: float = 1.0, - **kwargs, - ): + def __init__(self, **kwargs): """Initialize the Audio Context TTS service. Args: - silence_between_contexts_time_s: Duration of silence to push between contexts. **kwargs: Additional arguments passed to the parent WebsocketTTSService. """ WebsocketTTSService.__init__(self, **kwargs) - _AudioContextServiceMixin.__init__(self, silence_between_contexts_time_s) + _AudioContextServiceMixin.__init__(self) async def start(self, frame: StartFrame): """Start the audio context TTS service. @@ -943,19 +930,14 @@ class AudioContextWordTTSService(WebsocketWordTTSService, _AudioContextServiceMi audio from context ID "A" will be played first. """ - def __init__( - self, - silence_between_contexts_time_s: float = 1.0, - **kwargs, - ): + def __init__(self, **kwargs): """Initialize the Audio Context Word TTS service. Args: - silence_between_contexts_time_s: Duration of silence to push between contexts. **kwargs: Additional arguments passed to the parent WebsocketWordTTSService. """ WebsocketWordTTSService.__init__(self, **kwargs) - _AudioContextServiceMixin.__init__(self, silence_between_contexts_time_s) + _AudioContextServiceMixin.__init__(self) async def start(self, frame: StartFrame): """Start the audio context TTS service. From 233fef46cf2e0a17b0676b6b5cbefabcdc206852 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Mon, 1 Sep 2025 19:07:23 +0300 Subject: [PATCH 19/22] Remove the watchdog --- src/pipecat/services/respeecher/tts.py | 5 +---- src/pipecat/services/tts_service.py | 7 ------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index a5433726fb..1106a6fa6b 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -28,7 +28,6 @@ ) from pipecat.processors.frame_processor import FrameDirection from pipecat.services.tts_service import AudioContextTTSService -from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator from pipecat.utils.tracing.service_decorators import traced_tts # See .env.example for Respeecher configuration needed @@ -247,9 +246,7 @@ async def flush_audio(self): self._context_id = None async def _receive_messages(self): - async for message in WatchdogAsyncIterator( - self._get_websocket(), manager=self.task_manager - ): + async for message in self._get_websocket(): try: response = TypeAdapter(TTSResponse).validate_json(message) except ValidationError as e: diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index aede2dc1e9..62bfdfbd7a 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -38,7 +38,6 @@ from pipecat.services.websocket_service import WebsocketService from pipecat.transcriptions.language import Language from pipecat.utils.asyncio.task_manager import BaseTaskManager -from pipecat.utils.asyncio.watchdog_queue import WatchdogQueue from pipecat.utils.text.base_text_aggregator import BaseTextAggregator from pipecat.utils.text.base_text_filter import BaseTextFilter from pipecat.utils.text.simple_text_aggregator import SimpleTextAggregator @@ -820,20 +819,14 @@ async def _handle_audio_context(self, context_id: str): while running: try: frame = await asyncio.wait_for(queue.get(), timeout=AUDIO_CONTEXT_TIMEOUT) - self.reset_watchdog() if frame: await self.push_frame(frame) running = frame is not None except asyncio.TimeoutError: - self.reset_watchdog() # We didn't get audio, so let's consider this context finished. logger.trace(f"{self} time out on audio context {context_id}") break - @abstractmethod - def reset_watchdog(self) -> None: - pass - @abstractmethod def create_task(self, coroutine: Coroutine) -> asyncio.Task: pass From b0e971bc765abdbf71483c8a45270a519799ca79 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Tue, 2 Sep 2025 18:27:37 +0300 Subject: [PATCH 20/22] Fix a docstring --- src/pipecat/services/respeecher/tts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/respeecher/tts.py b/src/pipecat/services/respeecher/tts.py index 1106a6fa6b..ad834eb422 100644 --- a/src/pipecat/services/respeecher/tts.py +++ b/src/pipecat/services/respeecher/tts.py @@ -164,10 +164,10 @@ async def stop(self, frame: EndFrame): await self._disconnect() async def cancel(self, frame: CancelFrame): - """Stop the Respeecher TTS service. + """Cancel the Respeecher TTS service. Args: - frame: The end frame. + frame: The cancel frame. """ await super().cancel(frame) await self._disconnect() From fbdb3502ebf7d0be99553b9776bf05612ba8026e Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Fri, 5 Sep 2025 11:35:44 +0300 Subject: [PATCH 21/22] Adjust a dependency specifier --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d942a53c12..243c9fc30f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,7 +87,7 @@ openrouter = [] perplexity = [] playht = [ "websockets>=13.1,<15.0" ] qwen = [] -respeecher = [ "respeecher~=1.0" ] +respeecher = [ "respeecher>=1.0,<2" ] rime = [ "websockets>=13.1,<15.0" ] riva = [ "nvidia-riva-client~=2.21.1" ] runner = [ "python-dotenv>=1.0.0,<2.0.0", "uvicorn>=0.32.0,<1.0.0", "fastapi>=0.115.6,<0.117.0", "pipecat-ai-small-webrtc-prebuilt>=1.0.0"] diff --git a/uv.lock b/uv.lock index d2d86eb857..b2d02ce29e 100644 --- a/uv.lock +++ b/uv.lock @@ -4470,7 +4470,7 @@ requires-dist = [ { name = "python-dotenv", marker = "extra == 'runner'", specifier = ">=1.0.0,<2.0.0" }, { name = "pyvips", extras = ["binary"], marker = "extra == 'moondream'", specifier = "~=3.0.0" }, { name = "resampy", specifier = "~=0.4.3" }, - { name = "respeecher", marker = "extra == 'respeecher'", specifier = "~=1.0" }, + { name = "respeecher", marker = "extra == 'respeecher'", specifier = ">=1.0,<2" }, { name = "sentry-sdk", marker = "extra == 'sentry'", specifier = "~=2.23.1" }, { name = "simli-ai", marker = "extra == 'simli'", specifier = "~=0.1.10" }, { name = "soundfile", marker = "extra == 'soundfile'", specifier = "~=0.13.0" }, From e1a9c1baa39648252abd23ede3fe583ef6623246 Mon Sep 17 00:00:00 2001 From: Nazar Vinnichuk Date: Tue, 23 Sep 2025 17:44:10 +0300 Subject: [PATCH 22/22] Fix a type hint --- src/pipecat/services/tts_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pipecat/services/tts_service.py b/src/pipecat/services/tts_service.py index 20cba81329..de0936f0d2 100644 --- a/src/pipecat/services/tts_service.py +++ b/src/pipecat/services/tts_service.py @@ -968,7 +968,7 @@ async def cancel(self, frame: CancelFrame): await super().cancel(frame) await self._stop_audio_context_task() - async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection): + async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection): await super()._handle_interruption(frame, direction) await self._stop_audio_context_task() self._create_audio_context_task()