From 65267e842ba1c4490d287b4908972eed5bf440a5 Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Fri, 22 Aug 2025 16:15:20 +0300 Subject: [PATCH 01/16] Add Respeecher TTS plugin --- .../livekit-plugins-respeecher/README.md | 20 + .../livekit/plugins/respeecher/__init__.py | 44 +++ .../livekit/plugins/respeecher/log.py | 3 + .../livekit/plugins/respeecher/models.py | 51 +++ .../livekit/plugins/respeecher/tts.py | 363 ++++++++++++++++++ .../livekit/plugins/respeecher/version.py | 15 + .../livekit-plugins-respeecher/pyproject.toml | 38 ++ tests/docker-compose.yml | 2 + tests/test_tts.py | 15 + uv.lock | 15 + 10 files changed, 566 insertions(+) create mode 100644 livekit-plugins/livekit-plugins-respeecher/README.md create mode 100644 livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py create mode 100644 livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/log.py create mode 100644 livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py create mode 100644 livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py create mode 100644 livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/version.py create mode 100644 livekit-plugins/livekit-plugins-respeecher/pyproject.toml diff --git a/livekit-plugins/livekit-plugins-respeecher/README.md b/livekit-plugins/livekit-plugins-respeecher/README.md new file mode 100644 index 0000000000..89dc1fbe09 --- /dev/null +++ b/livekit-plugins/livekit-plugins-respeecher/README.md @@ -0,0 +1,20 @@ +# LiveKit Plugins - Respeecher + +Support for the [Respeecher](https://respeecher.com/) TTS. + +See [https://docs.livekit.io/agents/integrations/tts/respeecher/](https://docs.livekit.io/agents/integrations/tts/respeecher/) for more information. + +## Installation + +```bash +pip install livekit-plugins-respeecher +``` + +## Pre-requisites + +You'll need an API key from ElevenLabs. 
It can be set as an environment variable: `RESPEECHER_API_KEY` + + +## API Reference + +https://space.respeecher.com/docs/quickstart diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py new file mode 100644 index 0000000000..3f7dadc751 --- /dev/null +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py @@ -0,0 +1,44 @@ +# Copyright 2023 LiveKit, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Respeecher plugin for LiveKit Agents + +Voice cloning and synthesis plugin for LiveKit Agents using Respeecher API. 
+""" + +from .tts import TTS, ChunkedStream +from .version import __version__ + +__all__ = ["TTS", "ChunkedStream", "__version__"] + +from livekit.agents import Plugin + +from .log import logger + + +class RespeecherPlugin(Plugin): + def __init__(self) -> None: + super().__init__(__name__, __version__, __package__, logger) + + +Plugin.register_plugin(RespeecherPlugin()) + +# Cleanup docs of unexported modules +_module = dir() +NOT_IN_ALL = [m for m in _module if m not in __all__] + +__pdoc__ = {} + +for n in NOT_IN_ALL: + __pdoc__[n] = False \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/log.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/log.py new file mode 100644 index 0000000000..5d2a79d7bb --- /dev/null +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/log.py @@ -0,0 +1,3 @@ +import logging + +logger = logging.getLogger("livekit.plugins.respeecher") \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py new file mode 100644 index 0000000000..ce740d0dda --- /dev/null +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -0,0 +1,51 @@ +from typing import Literal, Optional +from dataclasses import dataclass + +TTSModels = Literal[ + # Respeecher's English model, multilanguage models will be added later + "v1/public/tts/en-rt", +] + +TTSEncoding = Literal[ + "pcm_s16le", + "pcm_f32le", +] + +TTSLanguages = Literal[ + "en", +] + +TTSSampleRates = [ + 8000, + 11025, + 16000, + 22050, + 44100, + 48000, +] + +@dataclass +class SamplingParam: + """Check https://space.respeecher.com/docs/api/tts/sampling-params-guide for details""" + seed: Optional[int] = None + temparature: Optional[float] = None + top_k: Optional[int] = None + top_p: Optional[float] = None + min_p: 
Optional[float] = None + presence_penalty: Optional[float] = None + repetition_penalty: Optional[float] = None + frequency_penalty: Optional[float] = None + +@dataclass +class VoiceSettings: + """Voice settings for Respeecher TTS""" + sampling_param: Optional[SamplingParam] = None + +@dataclass +class Voice: + """Voice model for Respeecher""" + id: str + gender: Optional[str] = None + accent: Optional[str] = None + age: Optional[str] = None + sampling_param: Optional[SamplingParam] = None \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py new file mode 100644 index 0000000000..703516eb10 --- /dev/null +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -0,0 +1,363 @@ +# Copyright 2025 LiveKit, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import asyncio +import base64 +import dataclasses +import json +import os +import weakref +from dataclasses import dataclass +from typing import Optional + +import aiohttp + +from livekit.agents import ( + APIConnectionError, + APIConnectOptions, + APIError, + APIStatusError, + APITimeoutError, + tts, + utils, +) +from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr +from livekit.agents.utils import is_given + +from .log import logger +from .models import TTSEncoding, TTSModels, Voice, VoiceSettings + +API_AUTH_HEADER = "X-API-Key" +API_VERSION_HEADER = "LiveKit-Plugin-Respeecher-Version" +API_VERSION = "2025-08-20" +API_BASE_URL = "https://api.respeecher.com" + +@dataclass +class _TTSOptions: + model: TTSModels | str + encoding: TTSEncoding + sample_rate: int + voice_id: str + voice_settings: NotGivenOr[VoiceSettings] + api_key: str + base_url: str + + +class TTS(tts.TTS): + def __init__( + self, + *, + api_key: NotGivenOr[str] = NOT_GIVEN, + model: TTSModels | str = "/v1/public/tts/en-rt", + encoding: TTSEncoding = "pcm_s16le", + voice_id: str = "samantha", + voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN, + sample_rate: int = 22050, + http_session: aiohttp.ClientSession | None = None, + base_url: str = API_BASE_URL, + ) -> None: + """ + Create a new instance of Respeecher TTS. + + Args: + api_key: Respeecher API key. If not provided, uses RESPEECHER_API_KEY env variable. + model: The Respeecher TTS model to use. + encoding: Audio encoding format. + voice_id: ID of the voice to use. Different set of voices is available for different models. Thus, update the value after getting list_voices() API. + voice_settings: Optional voice settings including sampling parameters. + sample_rate: Audio sample rate in Hz. Use one of allowed values from TTSSampleRates. + http_session: Optional aiohttp session to use for requests. + base_url: The base URL for the Respeecher API. 
+ """ + + super().__init__( + capabilities=tts.TTSCapabilities( + streaming=True, + aligned_transcript=False, + ), + sample_rate=sample_rate, + num_channels=1, + ) + + respeecher_api_key = api_key if is_given(api_key) else os.environ.get("RESPEECHER_API_KEY") + if not respeecher_api_key: + raise ValueError("RESPEECHER_API_KEY must be set") + + self._opts = _TTSOptions( + model=model, + encoding=encoding, + sample_rate=sample_rate, + voice_id=voice_id, + voice_settings=voice_settings, + api_key=respeecher_api_key, + base_url=base_url, + ) + self._session = http_session + self._streams = weakref.WeakSet[SynthesizeStream]() + + def _ensure_session(self) -> aiohttp.ClientSession: + if not self._session: + self._session = utils.http_context.http_session() + return self._session + + + async def list_voices(self) -> list[Voice]: + """List available voices from Respeecher API""" + async with self._ensure_session().get( + f"{self._opts.base_url}{self._opts.model}/voices", + headers={ + API_AUTH_HEADER: self._opts.api_key, + API_VERSION_HEADER: API_VERSION, + }, + ) as resp: + resp.raise_for_status() + data = await resp.json() + voices = [] + for voice_data in data: + voices.append( + Voice( + id=voice_data["id"], + gender=voice_data.get("gender"), + accent=voice_data.get("accent"), + age=voice_data.get("age"), + sampling_param=voice_data.get("sampling_param"), + ) + ) + + if len(voices) == 0: + raise APIError(f"No voices are available") + + return voices + + def update_options( + self, + *, + voice_id: NotGivenOr[str] = NOT_GIVEN, + voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN, + model: NotGivenOr[TTSModels | str] = NOT_GIVEN, + ) -> None: + """Update TTS options""" + if is_given(model): + self._opts.model = model + if is_given(voice_id): + self._opts.voice_id = voice_id + if is_given(voice_settings): + self._opts.voice_settings = voice_settings + + def synthesize( + self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS + ) -> 
ChunkedStream: + return ChunkedStream(tts=self, input_text=text, conn_options=conn_options) + + def stream( + self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS + ) -> SynthesizeStream: + stream = SynthesizeStream(tts=self, conn_options=conn_options) + self._streams.add(stream) + return stream + + async def aclose(self) -> None: + for stream in list(self._streams): + await stream.aclose() + + self._streams.clear() + + if self._session: + await self._session.close() + self._session = None + + +# TODO: ask if it's better to use SSE instead of bytes here +class ChunkedStream(tts.ChunkedStream): + """Synthesize text using Respeecher HTTPS endpoint""" + + def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None: + super().__init__(tts=tts, input_text=input_text, conn_options=conn_options) + self._tts: TTS = tts + + async def _run(self, output_emitter: tts.AudioEmitter) -> None: + """Run the TTS synthesis""" + + json_data = { + "transcript": self._input_text, + "voice": { + "id": self._tts._opts.voice_id, + }, + "output_format": { + "sample_rate": self._tts._opts.sample_rate, + "encoding": self._tts._opts.encoding, + } + } + + if is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_param: + json_data["voice"]["sampling_param"] = dataclasses.asdict(self._tts._opts.voice_settings.sampling_param) + + http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" + try: + async with self._tts._ensure_session().post( + http_url, + headers={ + API_AUTH_HEADER: self._tts._opts.api_key, + API_VERSION_HEADER: API_VERSION, + "Content-Type": "application/json", + }, + json=json_data, + timeout=aiohttp.ClientTimeout(total=30, sock_connect=self._conn_options.timeout), + ) as resp: + resp.raise_for_status() + + output_emitter.initialize( + request_id=utils.shortuuid(), + sample_rate=self._tts._opts.sample_rate, + num_channels=1, + mime_type="audio/wav", + ) + + async for data, _ in 
resp.content.iter_chunks(): + output_emitter.push(data) + + output_emitter.flush() + + except asyncio.TimeoutError: + raise APITimeoutError() from None + except aiohttp.ClientResponseError as e: + raise APIStatusError( + message=e.message, status_code=e.status, request_id=None, body=None + ) from None + except Exception as e: + raise APIConnectionError() from e + + +class SynthesizeStream(tts.SynthesizeStream): + """Streamed API using WebSocket for real-time synthesis""" + + def __init__(self, *, tts: TTS, conn_options: APIConnectOptions): + super().__init__(tts=tts, conn_options=conn_options) + + async def _run(self, output_emitter: tts.AudioEmitter) -> None: + request_id = utils.shortuuid() + output_emitter.initialize( + request_id=request_id, + sample_rate=self._tts._opts.sample_rate, + num_channels=1, + stream=True, + mime_type="audio/pcm", + ) + + ws_url = self._tts._opts.base_url.replace("https://", "wss://").replace("http://", "ws://") + full_ws_url = f"{ws_url}{self._tts._opts.model}/tts/websocket?api_key={self._tts._opts.api_key}&source={API_VERSION_HEADER}&version={API_VERSION}" + + try: + async with self._tts._ensure_session().ws_connect(full_ws_url) as ws: + @utils.log_exceptions(logger=logger) + async def send_task() -> None: + async for input in self._input_ch: + if isinstance(input, str): + generate_request = { + "context_id": request_id, + "transcript": input, + "voice": { + "id": self._tts._opts.voice_id, + }, + "continue": True, + "output_format": { + "encoding": self._tts._opts.encoding, + "sample_rate": self._tts._opts.sample_rate, + } + } + if is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_param: + generate_request["voice"]["sampling_param"] = dataclasses.asdict(self._tts._opts.voice_settings.sampling_param) + + self._mark_started() + await ws.send_str(json.dumps(generate_request)) + + # Send final message with continue=False to signal end of stream + end_request = { + "context_id": request_id, + 
"transcript": "", + "voice": { + "id": self._tts._opts.voice_id, + }, + "continue": False, + "output_format": { + "encoding": self._tts._opts.encoding, + "sample_rate": self._tts._opts.sample_rate, + } + } + if is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_param: + end_request["voice"]["sampling_param"] = dataclasses.asdict(self._tts._opts.voice_settings.sampling_param) + + await ws.send_str(json.dumps(end_request)) + + @utils.log_exceptions(logger=logger) + async def recv_task() -> None: + current_segment_id: str | None = None + while True: + msg = await ws.receive() + if msg.type in ( + aiohttp.WSMsgType.CLOSED, + aiohttp.WSMsgType.CLOSE, + aiohttp.WSMsgType.CLOSING, + ): + raise APIStatusError( + "Respeecher connection closed unexpectedly", + ) + + if msg.type != aiohttp.WSMsgType.TEXT: + logger.warning("unexpected Respeecher message type %s", msg.type) + continue + + data = json.loads(msg.data) + + if data.get("type") == "error": + raise APIError(message=data.get("error")) + + if data.get("type") == "chunk": + if current_segment_id is None: + current_segment_id = request_id + output_emitter.start_segment(segment_id=current_segment_id) + + audio_data = base64.b64decode(data["data"]) + output_emitter.push(audio_data) + elif data.get("type") == "done": + # End the current segment if one was started + if current_segment_id is not None: + output_emitter.end_segment() + current_segment_id = None + + output_emitter.end_input() + return + else: + raise APIError("Unexpected websocket message type") + + tasks = [ + asyncio.create_task(send_task()), + asyncio.create_task(recv_task()), + ] + + try: + await asyncio.gather(*tasks) + finally: + await utils.aio.gracefully_cancel(*tasks) + + except asyncio.TimeoutError: + raise APITimeoutError() from None + except aiohttp.ClientResponseError as e: + raise APIStatusError( + message=e.message, status_code=e.status, request_id=request_id, body=None + ) from None + except Exception as e: + 
raise APIConnectionError() from e \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/version.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/version.py new file mode 100644 index 0000000000..db2bb80a21 --- /dev/null +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/version.py @@ -0,0 +1,15 @@ +# Copyright 2025 LiveKit, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.1.0" \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-respeecher/pyproject.toml b/livekit-plugins/livekit-plugins-respeecher/pyproject.toml new file mode 100644 index 0000000000..ee827f1c0d --- /dev/null +++ b/livekit-plugins/livekit-plugins-respeecher/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "livekit-plugins-respeecher" +dynamic = ["version"] +description = "LiveKit Agents Plugin for Respeecher" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.9.0" +authors = [{ name = "LiveKit", email = "hello@livekit.io" }] +keywords = ["realtime", "audio", "livekit", "tts"] +classifiers = [ + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Topic :: Multimedia :: Sound/Audio", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Programming Language :: Python :: 3", + "Programming Language :: 
Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3 :: Only", +] +dependencies = ["livekit-agents>=1.2.6", "aiohttp"] + +[project.urls] +Documentation = "https://docs.livekit.io" +Website = "https://livekit.io/" +Source = "https://github.com/livekit/agents" + +[tool.hatch.version] +path = "livekit/plugins/respeecher/version.py" + +[tool.hatch.build.targets.wheel] +packages = ["livekit"] + +[tool.hatch.build.targets.sdist] +include = ["/livekit"] \ No newline at end of file diff --git a/tests/docker-compose.yml b/tests/docker-compose.yml index f4c10f8174..6792729dd6 100644 --- a/tests/docker-compose.yml +++ b/tests/docker-compose.yml @@ -54,6 +54,7 @@ services: - AWS_SECRET_ACCESS_KEY - NEUPHONIC_API_KEY - RESEMBLE_API_KEY + - RESPEECHER_API_KEY - SPEECHIFY_API_KEY - HUME_API_KEY - SPITCH_API_KEY @@ -75,6 +76,7 @@ services: - "api.play.ht:172.30.0.10" - "f.cluster.resemble.ai:172.30.0.10" - "websocket.cluster.resemble.ai:172.30.0.10" + - "api.respeecher.com:172.30.0.10" - "users.rime.ai:172.30.0.10" - "api.hume.ai:172.30.0.10" - "api.lmnt.com:172.30.0.10" diff --git a/tests/test_tts.py b/tests/test_tts.py index 1dec0cf3c3..8c6ee7d691 100644 --- a/tests/test_tts.py +++ b/tests/test_tts.py @@ -33,6 +33,7 @@ openai, playai, resemble, + respeecher, rime, speechify, spitch, @@ -222,6 +223,13 @@ async def assert_valid_synthesized_audio( }, id="resemble", ), + pytest.param( + lambda: { + "tts": respeecher.TTS(), + "proxy-upstream": "api.respeecher.com:443", + }, + id="respeecher", + ), pytest.param( lambda: { "tts": rime.TTS(), @@ -435,6 +443,13 @@ async def test_tts_synthesize_error_propagation(): }, id="resemble", ), + pytest.param( + lambda: { + "tts": respeecher.TTS(), + "proxy-upstream": "api.respeecher.com:443", + }, + id="respeecher", + ), pytest.param( lambda: { "tts": google.TTS(), diff --git a/uv.lock b/uv.lock index 7846bbeb00..f7478cb030 100644 --- a/uv.lock +++ b/uv.lock @@ -53,6 +53,7 @@ members = [ 
"livekit-plugins-openai", "livekit-plugins-playai", "livekit-plugins-resemble", + "livekit-plugins-respeecher", "livekit-plugins-rime", "livekit-plugins-sarvam", "livekit-plugins-silero", @@ -2370,6 +2371,20 @@ dependencies = [ [package.metadata] requires-dist = [{ name = "livekit-agents", editable = "livekit-agents" }] +[[package]] +name = "livekit-plugins-respeecher" +source = { editable = "livekit-plugins/livekit-plugins-respeecher" } +dependencies = [ + { name = "aiohttp" }, + { name = "livekit-agents" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiohttp" }, + { name = "livekit-agents", editable = "livekit-agents" }, +] + [[package]] name = "livekit-plugins-rime" source = { editable = "livekit-plugins/livekit-plugins-rime" } From 3409bb58ff7ad02964b882ce244e10d1467909f0 Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Fri, 22 Aug 2025 17:13:10 +0300 Subject: [PATCH 02/16] Format code with ruff --- .../livekit/plugins/respeecher/__init__.py | 2 +- .../livekit/plugins/respeecher/log.py | 2 +- .../livekit/plugins/respeecher/models.py | 28 +++--- .../livekit/plugins/respeecher/tts.py | 94 +++++++++++-------- .../livekit/plugins/respeecher/version.py | 2 +- 5 files changed, 75 insertions(+), 53 deletions(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py index 3f7dadc751..8e5ddfea9c 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py @@ -41,4 +41,4 @@ def __init__(self) -> None: __pdoc__ = {} for n in NOT_IN_ALL: - __pdoc__[n] = False \ No newline at end of file + __pdoc__[n] = False diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/log.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/log.py index 5d2a79d7bb..61bea91261 100644 
--- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/log.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/log.py @@ -1,3 +1,3 @@ import logging -logger = logging.getLogger("livekit.plugins.respeecher") \ No newline at end of file +logger = logging.getLogger("livekit.plugins.respeecher") diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index ce740d0dda..7039920912 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -1,5 +1,5 @@ -from typing import Literal, Optional from dataclasses import dataclass +from typing import Literal, Optional TTSModels = Literal[ # Respeecher's English model, multilanguage models will be added later @@ -11,22 +11,22 @@ "pcm_f32le", ] -TTSLanguages = Literal[ - "en", -] +TTSLanguages = Literal["en"] TTSSampleRates = [ - 8000, - 11025, - 16000, - 22050, - 44100, + 8000, + 11025, + 16000, + 22050, + 44100, 48000, ] + @dataclass class SamplingParam: """Check https://space.respeecher.com/docs/api/tts/sampling-params-guide for details""" + seed: Optional[int] = None temparature: Optional[float] = None top_k: Optional[int] = None @@ -36,16 +36,20 @@ class SamplingParam: repetition_penalty: Optional[float] = None frequency_penalty: Optional[float] = None + @dataclass class VoiceSettings: """Voice settings for Respeecher TTS""" + sampling_param: Optional[SamplingParam] = None - -@dataclass + + +@dataclass class Voice: """Voice model for Respeecher""" + id: str gender: Optional[str] = None accent: Optional[str] = None age: Optional[str] = None - sampling_param: Optional[SamplingParam] = None \ No newline at end of file + sampling_param: Optional[SamplingParam] = None diff --git 
a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 703516eb10..67bc45b4ca 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -21,7 +21,6 @@ import os import weakref from dataclasses import dataclass -from typing import Optional import aiohttp @@ -45,6 +44,7 @@ API_VERSION = "2025-08-20" API_BASE_URL = "https://api.respeecher.com" + @dataclass class _TTSOptions: model: TTSModels | str @@ -113,7 +113,6 @@ def _ensure_session(self) -> aiohttp.ClientSession: self._session = utils.http_context.http_session() return self._session - async def list_voices(self) -> list[Voice]: """List available voices from Respeecher API""" async with self._ensure_session().get( @@ -138,7 +137,7 @@ async def list_voices(self) -> list[Voice]: ) if len(voices) == 0: - raise APIError(f"No voices are available") + raise APIError("No voices are available") return voices @@ -172,9 +171,9 @@ def stream( async def aclose(self) -> None: for stream in list(self._streams): await stream.aclose() - + self._streams.clear() - + if self._session: await self._session.close() self._session = None @@ -187,26 +186,31 @@ class ChunkedStream(tts.ChunkedStream): def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None: super().__init__(tts=tts, input_text=input_text, conn_options=conn_options) self._tts: TTS = tts - + async def _run(self, output_emitter: tts.AudioEmitter) -> None: """Run the TTS synthesis""" - - json_data = { - "transcript": self._input_text, - "voice": { - "id": self._tts._opts.voice_id, - }, - "output_format": { - "sample_rate": self._tts._opts.sample_rate, - "encoding": self._tts._opts.encoding, + + async def _http_operation(): + json_data = { + "transcript": self._input_text, + "voice": { + "id": self._tts._opts.voice_id, + 
}, + "output_format": { + "sample_rate": self._tts._opts.sample_rate, + "encoding": self._tts._opts.encoding, + }, } - } - - if is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_param: - json_data["voice"]["sampling_param"] = dataclasses.asdict(self._tts._opts.voice_settings.sampling_param) - http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" - try: + if ( + is_given(self._tts._opts.voice_settings) + and self._tts._opts.voice_settings.sampling_param + ): + json_data["voice"]["sampling_param"] = dataclasses.asdict( + self._tts._opts.voice_settings.sampling_param + ) + + http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" async with self._tts._ensure_session().post( http_url, headers={ @@ -215,7 +219,6 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: "Content-Type": "application/json", }, json=json_data, - timeout=aiohttp.ClientTimeout(total=30, sock_connect=self._conn_options.timeout), ) as resp: resp.raise_for_status() @@ -230,7 +233,9 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: output_emitter.push(data) output_emitter.flush() - + + try: + await asyncio.wait_for(_http_operation(), timeout=self._conn_options.timeout) except asyncio.TimeoutError: raise APITimeoutError() from None except aiohttp.ClientResponseError as e: @@ -259,9 +264,10 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: ws_url = self._tts._opts.base_url.replace("https://", "wss://").replace("http://", "ws://") full_ws_url = f"{ws_url}{self._tts._opts.model}/tts/websocket?api_key={self._tts._opts.api_key}&source={API_VERSION_HEADER}&version={API_VERSION}" - - try: + + async def _ws_operation(): async with self._tts._ensure_session().ws_connect(full_ws_url) as ws: + @utils.log_exceptions(logger=logger) async def send_task() -> None: async for input in self._input_ch: @@ -276,14 +282,19 @@ async def send_task() -> None: "output_format": { "encoding": 
self._tts._opts.encoding, "sample_rate": self._tts._opts.sample_rate, - } + }, } - if is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_param: - generate_request["voice"]["sampling_param"] = dataclasses.asdict(self._tts._opts.voice_settings.sampling_param) + if ( + is_given(self._tts._opts.voice_settings) + and self._tts._opts.voice_settings.sampling_param + ): + generate_request["voice"]["sampling_param"] = dataclasses.asdict( + self._tts._opts.voice_settings.sampling_param + ) self._mark_started() await ws.send_str(json.dumps(generate_request)) - + # Send final message with continue=False to signal end of stream end_request = { "context_id": request_id, @@ -295,11 +306,16 @@ async def send_task() -> None: "output_format": { "encoding": self._tts._opts.encoding, "sample_rate": self._tts._opts.sample_rate, - } + }, } - if is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_param: - end_request["voice"]["sampling_param"] = dataclasses.asdict(self._tts._opts.voice_settings.sampling_param) - + if ( + is_given(self._tts._opts.voice_settings) + and self._tts._opts.voice_settings.sampling_param + ): + end_request["voice"]["sampling_param"] = dataclasses.asdict( + self._tts._opts.voice_settings.sampling_param + ) + await ws.send_str(json.dumps(end_request)) @utils.log_exceptions(logger=logger) @@ -329,7 +345,7 @@ async def recv_task() -> None: if current_segment_id is None: current_segment_id = request_id output_emitter.start_segment(segment_id=current_segment_id) - + audio_data = base64.b64decode(data["data"]) output_emitter.push(audio_data) elif data.get("type") == "done": @@ -337,7 +353,7 @@ async def recv_task() -> None: if current_segment_id is not None: output_emitter.end_segment() current_segment_id = None - + output_emitter.end_input() return else: @@ -347,12 +363,14 @@ async def recv_task() -> None: asyncio.create_task(send_task()), asyncio.create_task(recv_task()), ] - + try: await 
asyncio.gather(*tasks) finally: await utils.aio.gracefully_cancel(*tasks) - + + try: + await asyncio.wait_for(_ws_operation(), timeout=self._conn_options.timeout) except asyncio.TimeoutError: raise APITimeoutError() from None except aiohttp.ClientResponseError as e: @@ -360,4 +378,4 @@ async def recv_task() -> None: message=e.message, status_code=e.status, request_id=request_id, body=None ) from None except Exception as e: - raise APIConnectionError() from e \ No newline at end of file + raise APIConnectionError() from e diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/version.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/version.py index db2bb80a21..a951f9245a 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/version.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.1.0" \ No newline at end of file +__version__ = "0.1.0" From 3cfb2abb0221c11641a96b3d6907c22a27407b1f Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 25 Aug 2025 12:15:44 +0300 Subject: [PATCH 03/16] Apply code review suggetions --- .../livekit-plugins-respeecher/README.md | 4 +-- .../livekit/plugins/respeecher/models.py | 12 +++---- .../livekit/plugins/respeecher/tts.py | 34 ++++++++++++------- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/README.md b/livekit-plugins/livekit-plugins-respeecher/README.md index 89dc1fbe09..bfc5a7b3f2 100644 --- a/livekit-plugins/livekit-plugins-respeecher/README.md +++ b/livekit-plugins/livekit-plugins-respeecher/README.md @@ -12,9 +12,9 @@ pip install livekit-plugins-respeecher ## Pre-requisites -You'll need an API key from ElevenLabs. 
It can be set as an environment variable: `RESPEECHER_API_KEY` +You'll need an API key from Respeecher. It can be set as an environment variable: `RESPEECHER_API_KEY` ## API Reference -https://space.respeecher.com/docs/quickstart +[Respeecher Quickstart](https://space.respeecher.com/docs/quickstart) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index 7039920912..39c9d06407 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -3,7 +3,7 @@ TTSModels = Literal[ # Respeecher's English model, multilanguage models will be added later - "v1/public/tts/en-rt", + "/v1/public/tts/en-rt", ] TTSEncoding = Literal[ @@ -13,7 +13,7 @@ TTSLanguages = Literal["en"] -TTSSampleRates = [ +TTSSampleRates = Literal[ 8000, 11025, 16000, @@ -24,11 +24,11 @@ @dataclass -class SamplingParam: +class SamplingParams: """Check https://space.respeecher.com/docs/api/tts/sampling-params-guide for details""" seed: Optional[int] = None - temparature: Optional[float] = None + temperature: Optional[float] = None top_k: Optional[int] = None top_p: Optional[float] = None min_p: Optional[float] = None @@ -41,7 +41,7 @@ class SamplingParam: class VoiceSettings: """Voice settings for Respeecher TTS""" - sampling_param: Optional[SamplingParam] = None + sampling_params: Optional[SamplingParams] = None @dataclass @@ -52,4 +52,4 @@ class Voice: gender: Optional[str] = None accent: Optional[str] = None age: Optional[str] = None - sampling_param: Optional[SamplingParam] = None + sampling_params: Optional[SamplingParams] = None diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 67bc45b4ca..bfb1d6c314 100644 --- 
a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -37,11 +37,12 @@ from livekit.agents.utils import is_given from .log import logger -from .models import TTSEncoding, TTSModels, Voice, VoiceSettings +from .models import SamplingParams, TTSEncoding, TTSModels, Voice, VoiceSettings +from .version import __version__ +API_VERSION = __version__ API_AUTH_HEADER = "X-API-Key" API_VERSION_HEADER = "LiveKit-Plugin-Respeecher-Version" -API_VERSION = "2025-08-20" API_BASE_URL = "https://api.respeecher.com" @@ -132,7 +133,11 @@ async def list_voices(self) -> list[Voice]: gender=voice_data.get("gender"), accent=voice_data.get("accent"), age=voice_data.get("age"), - sampling_param=voice_data.get("sampling_param"), + sampling_params=( + SamplingParams(**voice_data["sampling_params"]) + if isinstance(voice_data.get("sampling_params"), dict) + else None + ), ) ) @@ -204,10 +209,10 @@ async def _http_operation(): if ( is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_param + and self._tts._opts.voice_settings.sampling_params ): - json_data["voice"]["sampling_param"] = dataclasses.asdict( - self._tts._opts.voice_settings.sampling_param + json_data["voice"]["sampling_params"] = dataclasses.asdict( + self._tts._opts.voice_settings.sampling_params ) http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" @@ -286,10 +291,10 @@ async def send_task() -> None: } if ( is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_param + and self._tts._opts.voice_settings.sampling_params ): - generate_request["voice"]["sampling_param"] = dataclasses.asdict( - self._tts._opts.voice_settings.sampling_param + generate_request["voice"]["sampling_params"] = dataclasses.asdict( + self._tts._opts.voice_settings.sampling_params ) self._mark_started() @@ -310,10 +315,10 @@ async def send_task() -> None: } if ( 
is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_param + and self._tts._opts.voice_settings.sampling_params ): - end_request["voice"]["sampling_param"] = dataclasses.asdict( - self._tts._opts.voice_settings.sampling_param + end_request["voice"]["sampling_params"] = dataclasses.asdict( + self._tts._opts.voice_settings.sampling_params ) await ws.send_str(json.dumps(end_request)) @@ -329,7 +334,10 @@ async def recv_task() -> None: aiohttp.WSMsgType.CLOSING, ): raise APIStatusError( - "Respeecher connection closed unexpectedly", + message="Respeecher websocket closed unexpectedly", + status_code=500, + request_id=request_id, + body=None, ) if msg.type != aiohttp.WSMsgType.TEXT: From 7b72aba7fa9cb028cc53ed7f883310aa4d8b0076 Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Tue, 2 Sep 2025 17:07:15 +0300 Subject: [PATCH 04/16] Add sentence tokenization --- .../livekit/plugins/respeecher/__init__.py | 2 +- .../livekit/plugins/respeecher/tts.py | 78 ++++++++++++------- 2 files changed, 51 insertions(+), 29 deletions(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py index 8e5ddfea9c..944fadd4cb 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2023 LiveKit, Inc. +# Copyright 2025 LiveKit, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index bfb1d6c314..a028ccca59 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -30,6 +30,7 @@ APIError, APIStatusError, APITimeoutError, + tokenize, tts, utils, ) @@ -67,6 +68,7 @@ def __init__( voice_id: str = "samantha", voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN, sample_rate: int = 22050, + tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN, http_session: aiohttp.ClientSession | None = None, base_url: str = API_BASE_URL, ) -> None: @@ -108,6 +110,9 @@ def __init__( ) self._session = http_session self._streams = weakref.WeakSet[SynthesizeStream]() + self._sentence_tokenizer = ( + tokenizer if is_given(tokenizer) else tokenize.blingfire.SentenceTokenizer() + ) def _ensure_session(self) -> aiohttp.ClientSession: if not self._session: @@ -184,7 +189,6 @@ async def aclose(self) -> None: self._session = None -# TODO: ask if it's better to use SSE instead of bytes here class ChunkedStream(tts.ChunkedStream): """Synthesize text using Respeecher HTTPS endpoint""" @@ -270,35 +274,48 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: ws_url = self._tts._opts.base_url.replace("https://", "wss://").replace("http://", "ws://") full_ws_url = f"{ws_url}{self._tts._opts.model}/tts/websocket?api_key={self._tts._opts.api_key}&source={API_VERSION_HEADER}&version={API_VERSION}" + sent_tokenizer_stream = self._tts._sentence_tokenizer.stream() + async def _ws_operation(): async with self._tts._ensure_session().ws_connect(full_ws_url) as ws: + logger.debug(f"WebSocket connected {full_ws_url}") + + @utils.log_exceptions(logger=logger) + async def input_task() -> None: + async for data in self._input_ch: + if isinstance(data, self._FlushSentinel): + 
sent_tokenizer_stream.flush() + continue + + sent_tokenizer_stream.push_text(data) + + sent_tokenizer_stream.end_input() @utils.log_exceptions(logger=logger) async def send_task() -> None: - async for input in self._input_ch: - if isinstance(input, str): - generate_request = { - "context_id": request_id, - "transcript": input, - "voice": { - "id": self._tts._opts.voice_id, - }, - "continue": True, - "output_format": { - "encoding": self._tts._opts.encoding, - "sample_rate": self._tts._opts.sample_rate, - }, - } - if ( - is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_params - ): - generate_request["voice"]["sampling_params"] = dataclasses.asdict( - self._tts._opts.voice_settings.sampling_params - ) - - self._mark_started() - await ws.send_str(json.dumps(generate_request)) + async for sent in sent_tokenizer_stream: + generate_request = { + "context_id": request_id, + "transcript": sent.token, + "voice": { + "id": self._tts._opts.voice_id, + }, + "continue": True, # Always True for streamed sentences + "output_format": { + "encoding": self._tts._opts.encoding, + "sample_rate": self._tts._opts.sample_rate, + }, + } + if ( + is_given(self._tts._opts.voice_settings) + and self._tts._opts.voice_settings.sampling_params + ): + generate_request["voice"]["sampling_params"] = dataclasses.asdict( + self._tts._opts.voice_settings.sampling_params + ) + + self._mark_started() + await ws.send_str(json.dumps(generate_request)) # Send final message with continue=False to signal end of stream end_request = { @@ -341,7 +358,7 @@ async def recv_task() -> None: ) if msg.type != aiohttp.WSMsgType.TEXT: - logger.warning("unexpected Respeecher message type %s", msg.type) + logger.warning("Unexpected Respeecher message type %s", msg.type) continue data = json.loads(msg.data) @@ -362,12 +379,16 @@ async def recv_task() -> None: output_emitter.end_segment() current_segment_id = None - output_emitter.end_input() - return + # Only end input when the 
sentence tokenizer stream is closed + # and we've received the final done message + if sent_tokenizer_stream.closed: + output_emitter.end_input() + return else: raise APIError("Unexpected websocket message type") tasks = [ + asyncio.create_task(input_task()), asyncio.create_task(send_task()), asyncio.create_task(recv_task()), ] @@ -375,6 +396,7 @@ async def recv_task() -> None: try: await asyncio.gather(*tasks) finally: + await sent_tokenizer_stream.aclose() await utils.aio.gracefully_cancel(*tasks) try: From 3c645128be26a58d6a82e05c63bd1a94176702d1 Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Wed, 3 Sep 2025 09:48:47 +0300 Subject: [PATCH 05/16] Define sample_rate as integer instead of list of allowed values --- .../livekit/plugins/respeecher/models.py | 9 --------- .../livekit/plugins/respeecher/tts.py | 4 ++-- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index 39c9d06407..8ca654f697 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -13,15 +13,6 @@ TTSLanguages = Literal["en"] -TTSSampleRates = Literal[ - 8000, - 11025, - 16000, - 22050, - 44100, - 48000, -] - @dataclass class SamplingParams: diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index a028ccca59..827305af51 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -67,7 +67,7 @@ def __init__( encoding: TTSEncoding = "pcm_s16le", voice_id: str = "samantha", voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN, - sample_rate: int = 22050, + 
sample_rate: int = 24000, tokenizer: NotGivenOr[tokenize.SentenceTokenizer] = NOT_GIVEN, http_session: aiohttp.ClientSession | None = None, base_url: str = API_BASE_URL, @@ -81,7 +81,7 @@ def __init__( encoding: Audio encoding format. voice_id: ID of the voice to use. Different set of voices is available for different models. Thus, update the value after getting list_voices() API. voice_settings: Optional voice settings including sampling parameters. - sample_rate: Audio sample rate in Hz. Use one of allowed values from TTSSampleRates. + sample_rate: Audio sample rate in Hz. http_session: Optional aiohttp session to use for requests. base_url: The base URL for the Respeecher API. """ From 78d800644171589b8a45bb0cc6d3df3a152c287a Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Wed, 3 Sep 2025 15:16:54 +0300 Subject: [PATCH 06/16] Move version from model to base URL --- .../livekit/plugins/respeecher/models.py | 2 +- .../livekit/plugins/respeecher/tts.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index 8ca654f697..74ba444a22 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -3,7 +3,7 @@ TTSModels = Literal[ # Respeecher's English model, multilanguage models will be added later - "/v1/public/tts/en-rt", + "/public/tts/en-rt", ] TTSEncoding = Literal[ diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 827305af51..76a0795c74 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -44,7 +44,7 @@ 
API_VERSION = __version__ API_AUTH_HEADER = "X-API-Key" API_VERSION_HEADER = "LiveKit-Plugin-Respeecher-Version" -API_BASE_URL = "https://api.respeecher.com" +API_BASE_URL = "https://api.respeecher.com/v1" @dataclass @@ -63,7 +63,7 @@ def __init__( self, *, api_key: NotGivenOr[str] = NOT_GIVEN, - model: TTSModels | str = "/v1/public/tts/en-rt", + model: TTSModels | str = "/public/tts/en-rt", encoding: TTSEncoding = "pcm_s16le", voice_id: str = "samantha", voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN, From d82e18d893cb3a17c48d4db33244ec6736a05465 Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 8 Sep 2025 18:54:45 +0900 Subject: [PATCH 07/16] Apply review suggestions --- examples/other/text-to-speech/README.md | 31 +++++++++- .../other/text-to-speech/requirements.txt | 1 + .../other/text-to-speech/respeecher_tts.py | 57 +++++++++++++++++++ .../livekit-plugins-respeecher/README.md | 19 +++++-- .../livekit/plugins/respeecher/models.py | 18 +----- .../livekit/plugins/respeecher/tts.py | 21 ++----- 6 files changed, 108 insertions(+), 39 deletions(-) create mode 100644 examples/other/text-to-speech/respeecher_tts.py diff --git a/examples/other/text-to-speech/README.md b/examples/other/text-to-speech/README.md index 0165b8dfc8..915e5b9364 100644 --- a/examples/other/text-to-speech/README.md +++ b/examples/other/text-to-speech/README.md @@ -1,3 +1,30 @@ -# Text-to-speech +# Text-to-Speech Examples -This small example shows how you can generate real-time audio data from text. \ No newline at end of file +These examples demonstrate real-time text-to-speech generation using various TTS plugins with LiveKit. + +## Environment Variables + +### Plugin API Keys +Set the API key for your chosen plugin. 
+ +### LiveKit Connection +For connecting to LiveKit Cloud: +- `LIVEKIT_URL` - Your LiveKit server URL +- `LIVEKIT_API_KEY` - LiveKit API key +- `LIVEKIT_API_SECRET` - LiveKit API secret + +## Running Examples + +Execute the example to connect to a LiveKit room and stream TTS audio: + +```bash +uv run examples/other/text-to-speech/{your_plugin}_tts.py start +``` + +The agent will join the room and stream synthesized speech to participants. + +### Running Locally + +Running the examples with `console` mode won't play audio since the examples use `rtc.LocalAudioTrack`, which requires the LiveKit room infrastructure for audio playback. The `LocalAudioTrack` is designed to publish audio streams to LiveKit rooms where they are processed and distributed to participants. Without a room connection, the audio frames are generated but not routed to any playback device. + +To test TTS output locally without a LiveKit room, you would need to modify the example file to save the generated audio frames to a WAV file instead of publishing them to a track. The saved WAV file can then be played using any audio player on your system. 
diff --git a/examples/other/text-to-speech/requirements.txt b/examples/other/text-to-speech/requirements.txt index 5da9ab7014..99f3655204 100644 --- a/examples/other/text-to-speech/requirements.txt +++ b/examples/other/text-to-speech/requirements.txt @@ -2,5 +2,6 @@ livekit-agents>=0.12.18 livekit-plugins-openai>=0.12.2 livekit-plugins-cartesia>=0.4.11 livekit-plugins-elevenlabs>=0.8.1 +livekit-plugins-respeecher>=0.0.1 livekit-plugins-speechify>=0.1.0 python-dotenv~=1.0 diff --git a/examples/other/text-to-speech/respeecher_tts.py b/examples/other/text-to-speech/respeecher_tts.py new file mode 100644 index 0000000000..490a9726bf --- /dev/null +++ b/examples/other/text-to-speech/respeecher_tts.py @@ -0,0 +1,57 @@ +import asyncio +import logging +import wave +import os + +from dotenv import load_dotenv + +from livekit import rtc +from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli +from livekit.plugins import respeecher + +load_dotenv() + +logger = logging.getLogger("respeecher-tts-demo") +logger.setLevel(logging.INFO) + + +async def entrypoint(job: JobContext): + logger.info("starting tts example agent") + + tts = respeecher.TTS() + + source = rtc.AudioSource(tts.sample_rate, tts.num_channels) + track = rtc.LocalAudioTrack.create_audio_track("agent-mic", source) + options = rtc.TrackPublishOptions() + options.source = rtc.TrackSource.SOURCE_MICROPHONE + + await job.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_NONE) + publication = await job.room.local_participant.publish_track(track, options) + await publication.wait_for_subscription() + + async with tts.stream() as stream: + async def _playback_task(): + count = 0 + async for audio in stream: + count += 1 + await source.capture_frame(audio.frame) + + task = asyncio.create_task(_playback_task()) + + text = "Hello from Respeecher! I hope you are having a great day." 
+ + # split into two word chunks to simulate LLM streaming + words = text.split() + for i in range(0, len(words), 2): + chunk = " ".join(words[i : i + 2]) + if chunk: + logger.info(f'pushing chunk: "{chunk} "') + stream.push_text(chunk + " ") + + # Mark end of input segment + stream.flush() + stream.end_input() + await asyncio.gather(task) + +if __name__ == "__main__": + cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-respeecher/README.md b/livekit-plugins/livekit-plugins-respeecher/README.md index bfc5a7b3f2..65fd0fed0c 100644 --- a/livekit-plugins/livekit-plugins-respeecher/README.md +++ b/livekit-plugins/livekit-plugins-respeecher/README.md @@ -1,8 +1,8 @@ -# LiveKit Plugins - Respeecher +# Respeecher plugin for LiveKit Agents -Support for the [Respeecher](https://respeecher.com/) TTS. +Support for [Respeecher](https://respeecher.com/)'s TTS in LiveKit Agents. -See [https://docs.livekit.io/agents/integrations/tts/respeecher/](https://docs.livekit.io/agents/integrations/tts/respeecher/) for more information. +More information is available in the docs for the [Respeecher](https://docs.livekit.io/agents/integrations/tts/respeecher/) integration. ## Installation @@ -12,9 +12,16 @@ pip install livekit-plugins-respeecher ## Pre-requisites -You'll need an API key from Respeecher. It can be set as an environment variable: `RESPEECHER_API_KEY` +You'll need an API key from Respeecher. It can be set as an environment variable: `RESPEECHER_API_KEY` or passed to the `respeecher.TTS()` constructor. +To get the key, log in to [Respeecher Space](https://space.respeecher.com/). 
-## API Reference +## Example -[Respeecher Quickstart](https://space.respeecher.com/docs/quickstart) +To try out the Respeecher plugin, run the example: + +```bash +uv run python examples/other/text-to-speech/respeecher_tts.py start +``` + +Check [`examples/other/text-to-speech/README.md`](../../examples/other/text-to-speech/README.md) for running details. \ No newline at end of file diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index 74ba444a22..3028adccc5 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Literal, Optional +from typing import Any, Dict, Literal, Optional TTSModels = Literal[ # Respeecher's English model, multilanguage models will be added later @@ -11,21 +11,9 @@ "pcm_f32le", ] -TTSLanguages = Literal["en"] - -@dataclass -class SamplingParams: - """Check https://space.respeecher.com/docs/api/tts/sampling-params-guide for details""" - - seed: Optional[int] = None - temperature: Optional[float] = None - top_k: Optional[int] = None - top_p: Optional[float] = None - min_p: Optional[float] = None - presence_penalty: Optional[float] = None - repetition_penalty: Optional[float] = None - frequency_penalty: Optional[float] = None +"""Check https://space.respeecher.com/docs/api/tts/sampling-params-guide for details""" +SamplingParams = Dict[str, Any] @dataclass diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 76a0795c74..611efa0cc7 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -16,7 
+16,6 @@ import asyncio import base64 -import dataclasses import json import os import weakref @@ -38,7 +37,7 @@ from livekit.agents.utils import is_given from .log import logger -from .models import SamplingParams, TTSEncoding, TTSModels, Voice, VoiceSettings +from .models import TTSEncoding, TTSModels, Voice, VoiceSettings from .version import __version__ API_VERSION = __version__ @@ -138,11 +137,7 @@ async def list_voices(self) -> list[Voice]: gender=voice_data.get("gender"), accent=voice_data.get("accent"), age=voice_data.get("age"), - sampling_params=( - SamplingParams(**voice_data["sampling_params"]) - if isinstance(voice_data.get("sampling_params"), dict) - else None - ), + sampling_params=voice_data.get("sampling_params"), ) ) @@ -215,9 +210,7 @@ async def _http_operation(): is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_params ): - json_data["voice"]["sampling_params"] = dataclasses.asdict( - self._tts._opts.voice_settings.sampling_params - ) + json_data["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" async with self._tts._ensure_session().post( @@ -310,9 +303,7 @@ async def send_task() -> None: is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_params ): - generate_request["voice"]["sampling_params"] = dataclasses.asdict( - self._tts._opts.voice_settings.sampling_params - ) + generate_request["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params self._mark_started() await ws.send_str(json.dumps(generate_request)) @@ -334,9 +325,7 @@ async def send_task() -> None: is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_params ): - end_request["voice"]["sampling_params"] = dataclasses.asdict( - self._tts._opts.voice_settings.sampling_params - ) + end_request["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params 
await ws.send_str(json.dumps(end_request)) From 614b6c89fdeee4e6ed7ea6c889afc96bf64edd1b Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 8 Sep 2025 19:01:16 +0900 Subject: [PATCH 08/16] Fix ruff formatting --- examples/other/text-to-speech/respeecher_tts.py | 6 +++--- .../livekit/plugins/respeecher/models.py | 4 ++-- .../livekit/plugins/respeecher/tts.py | 12 +++++++++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/other/text-to-speech/respeecher_tts.py b/examples/other/text-to-speech/respeecher_tts.py index 490a9726bf..f525c48d58 100644 --- a/examples/other/text-to-speech/respeecher_tts.py +++ b/examples/other/text-to-speech/respeecher_tts.py @@ -1,7 +1,5 @@ import asyncio import logging -import wave -import os from dotenv import load_dotenv @@ -30,6 +28,7 @@ async def entrypoint(job: JobContext): await publication.wait_for_subscription() async with tts.stream() as stream: + async def _playback_task(): count = 0 async for audio in stream: @@ -53,5 +52,6 @@ async def _playback_task(): stream.end_input() await asyncio.gather(task) + if __name__ == "__main__": - cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) \ No newline at end of file + cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index 3028adccc5..881d4b516c 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Any, Dict, Literal, Optional +from typing import Any, Literal, Optional TTSModels = Literal[ # Respeecher's English model, multilanguage models will be added later @@ -13,7 +13,7 @@ """Check https://space.respeecher.com/docs/api/tts/sampling-params-guide for details""" -SamplingParams = 
Dict[str, Any] +SamplingParams = dict[str, Any] @dataclass diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 611efa0cc7..1eb6d5d22d 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -210,7 +210,9 @@ async def _http_operation(): is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_params ): - json_data["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params + json_data["voice"]["sampling_params"] = ( + self._tts._opts.voice_settings.sampling_params + ) http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" async with self._tts._ensure_session().post( @@ -303,7 +305,9 @@ async def send_task() -> None: is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_params ): - generate_request["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params + generate_request["voice"]["sampling_params"] = ( + self._tts._opts.voice_settings.sampling_params + ) self._mark_started() await ws.send_str(json.dumps(generate_request)) @@ -325,7 +329,9 @@ async def send_task() -> None: is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_params ): - end_request["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params + end_request["voice"]["sampling_params"] = ( + self._tts._opts.voice_settings.sampling_params + ) await ws.send_str(json.dumps(end_request)) From 2b09286bbbe1929430ed1209c61dfe4b21119f78 Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 8 Sep 2025 20:36:18 +0900 Subject: [PATCH 09/16] Make Voice a dictionary and update timeout logic for tests --- .../livekit/plugins/respeecher/models.py | 22 +++--- .../livekit/plugins/respeecher/tts.py | 71 
+++++++++---------- uv.lock | 64 ++++++++++++++--- 3 files changed, 102 insertions(+), 55 deletions(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index 881d4b516c..b6320d2a3c 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -23,12 +23,18 @@ class VoiceSettings: sampling_params: Optional[SamplingParams] = None -@dataclass -class Voice: - """Voice model for Respeecher""" +class Voice(dict): + """Voice model for Respeecher - behaves like a dict with guaranteed `id` and optional `sampling_params`""" - id: str - gender: Optional[str] = None - accent: Optional[str] = None - age: Optional[str] = None - sampling_params: Optional[SamplingParams] = None + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if "id" not in self: + raise ValueError("Voice must have an 'id' field") + + @property + def id(self) -> str: + return self["id"] + + @property + def sampling_params(self) -> Optional[SamplingParams]: + return self.get("sampling_params") diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 1eb6d5d22d..56e8a07fa8 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -131,15 +131,7 @@ async def list_voices(self) -> list[Voice]: data = await resp.json() voices = [] for voice_data in data: - voices.append( - Voice( - id=voice_data["id"], - gender=voice_data.get("gender"), - accent=voice_data.get("accent"), - age=voice_data.get("age"), - sampling_params=voice_data.get("sampling_params"), - ) - ) + voices.append(Voice(voice_data)) if len(voices) == 0: raise 
APIError("No voices are available") @@ -193,28 +185,26 @@ def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions async def _run(self, output_emitter: tts.AudioEmitter) -> None: """Run the TTS synthesis""" + json_data = { + "transcript": self._input_text, + "voice": { + "id": self._tts._opts.voice_id, + }, + "output_format": { + "sample_rate": self._tts._opts.sample_rate, + "encoding": self._tts._opts.encoding, + }, + } - async def _http_operation(): - json_data = { - "transcript": self._input_text, - "voice": { - "id": self._tts._opts.voice_id, - }, - "output_format": { - "sample_rate": self._tts._opts.sample_rate, - "encoding": self._tts._opts.encoding, - }, - } - - if ( - is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_params - ): - json_data["voice"]["sampling_params"] = ( - self._tts._opts.voice_settings.sampling_params - ) + if ( + is_given(self._tts._opts.voice_settings) + and self._tts._opts.voice_settings.sampling_params + ): + json_data["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params + + http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" - http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" + try: async with self._tts._ensure_session().post( http_url, headers={ @@ -223,6 +213,7 @@ async def _http_operation(): "Content-Type": "application/json", }, json=json_data, + timeout=aiohttp.ClientTimeout(total=30, sock_connect=self._conn_options.timeout), ) as resp: resp.raise_for_status() @@ -237,9 +228,6 @@ async def _http_operation(): output_emitter.push(data) output_emitter.flush() - - try: - await asyncio.wait_for(_http_operation(), timeout=self._conn_options.timeout) except asyncio.TimeoutError: raise APITimeoutError() from None except aiohttp.ClientResponseError as e: @@ -272,8 +260,15 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: sent_tokenizer_stream = self._tts._sentence_tokenizer.stream() async def 
_ws_operation(): - async with self._tts._ensure_session().ws_connect(full_ws_url) as ws: - logger.debug(f"WebSocket connected {full_ws_url}") + try: + ws = await asyncio.wait_for( + self._tts._ensure_session().ws_connect(full_ws_url), + timeout=self._conn_options.timeout, + ) + except asyncio.TimeoutError: + raise APITimeoutError() from None + + async with ws: @utils.log_exceptions(logger=logger) async def input_task() -> None: @@ -359,6 +354,7 @@ async def recv_task() -> None: data = json.loads(msg.data) if data.get("type") == "error": + logger.error(f"Respeecher API error: {data.get('error')}") raise APIError(message=data.get("error")) if data.get("type") == "chunk": @@ -369,6 +365,7 @@ async def recv_task() -> None: audio_data = base64.b64decode(data["data"]) output_emitter.push(audio_data) elif data.get("type") == "done": + logger.debug(f"Received done message: {data}") # End the current segment if one was started if current_segment_id is not None: output_emitter.end_segment() @@ -395,9 +392,9 @@ async def recv_task() -> None: await utils.aio.gracefully_cancel(*tasks) try: - await asyncio.wait_for(_ws_operation(), timeout=self._conn_options.timeout) - except asyncio.TimeoutError: - raise APITimeoutError() from None + await _ws_operation() + except APITimeoutError: + raise except aiohttp.ClientResponseError as e: raise APIStatusError( message=e.message, status_code=e.status, request_id=request_id, body=None diff --git a/uv.lock b/uv.lock index f7478cb030..b4d9886572 100644 --- a/uv.lock +++ b/uv.lock @@ -59,11 +59,14 @@ members = [ "livekit-plugins-silero", "livekit-plugins-simli", "livekit-plugins-smallestai", + "livekit-plugins-soniox", "livekit-plugins-speechify", "livekit-plugins-speechmatics", "livekit-plugins-spitch", "livekit-plugins-tavus", "livekit-plugins-turn-detector", + "livekit-plugins-ultravox", + "livekit-plugins-upliftai", ] constraints = [{ name = "onnxruntime", marker = "python_full_version == '3.9.*'", specifier = "<1.20.0" }] @@ -1860,6 
+1863,12 @@ tavus = [ turn-detector = [ { name = "livekit-plugins-turn-detector" }, ] +ultravox = [ + { name = "livekit-plugins-ultravox" }, +] +upliftai = [ + { name = "livekit-plugins-upliftai" }, +] [package.metadata] requires-dist = [ @@ -1871,7 +1880,7 @@ requires-dist = [ { name = "docstring-parser", specifier = ">=0.16" }, { name = "eval-type-backport" }, { name = "livekit", specifier = ">=1.0.12,<2" }, - { name = "livekit-api", specifier = ">=1.0.4,<2" }, + { name = "livekit-api", specifier = ">=1.0.5,<2" }, { name = "livekit-blingfire", specifier = "~=1.0" }, { name = "livekit-plugins-anam", marker = "extra == 'anam'", editable = "livekit-plugins/livekit-plugins-anam" }, { name = "livekit-plugins-anthropic", marker = "extra == 'anthropic'", editable = "livekit-plugins/livekit-plugins-anthropic" }, @@ -1910,6 +1919,8 @@ requires-dist = [ { name = "livekit-plugins-spitch", marker = "extra == 'spitch'", editable = "livekit-plugins/livekit-plugins-spitch" }, { name = "livekit-plugins-tavus", marker = "extra == 'tavus'", editable = "livekit-plugins/livekit-plugins-tavus" }, { name = "livekit-plugins-turn-detector", marker = "extra == 'turn-detector'", editable = "livekit-plugins/livekit-plugins-turn-detector" }, + { name = "livekit-plugins-ultravox", marker = "extra == 'ultravox'", editable = "livekit-plugins/livekit-plugins-ultravox" }, + { name = "livekit-plugins-upliftai", marker = "extra == 'upliftai'", editable = "livekit-plugins/livekit-plugins-upliftai" }, { name = "livekit-protocol", specifier = "~=1.0" }, { name = "mcp", marker = "python_full_version >= '3.10' and extra == 'mcp'", specifier = ">=1.10.0,<2" }, { name = "nest-asyncio", specifier = ">=1.6.0" }, @@ -1925,15 +1936,15 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.0,<3" }, { name = "pyjwt", specifier = ">=2.0" }, { name = "sounddevice", specifier = ">=0.5" }, - { name = "types-protobuf", specifier = ">=4,<5" }, + { name = "types-protobuf", specifier = ">=4" }, { name = 
"typing-extensions", specifier = ">=4.12" }, { name = "watchfiles", specifier = ">=1.0" }, ] -provides-extras = ["anam", "anthropic", "assemblyai", "aws", "azure", "baseten", "bey", "bithuman", "cartesia", "clova", "codecs", "deepgram", "elevenlabs", "fal", "gladia", "google", "groq", "hedra", "hume", "images", "inworld", "langchain", "lmnt", "mcp", "mistralai", "neuphonic", "nltk", "openai", "playai", "resemble", "rime", "sarvam", "silero", "simli", "smallestai", "speechify", "speechmatics", "spitch", "tavus", "turn-detector"] +provides-extras = ["anam", "anthropic", "assemblyai", "aws", "azure", "baseten", "bey", "bithuman", "cartesia", "clova", "codecs", "deepgram", "elevenlabs", "fal", "gladia", "google", "groq", "hedra", "hume", "images", "inworld", "langchain", "lmnt", "mcp", "mistralai", "neuphonic", "nltk", "openai", "playai", "resemble", "rime", "sarvam", "silero", "simli", "smallestai", "speechify", "speechmatics", "spitch", "tavus", "turn-detector", "ultravox", "upliftai"] [[package]] name = "livekit-api" -version = "1.0.4" +version = "1.0.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -1942,9 +1953,9 @@ dependencies = [ { name = "pyjwt" }, { name = "types-protobuf" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b3/3a/abf4135de6ac7b43b5cc467a68731ee1adbdaa9dadea8149d37a4b0ef85f/livekit_api-1.0.4.tar.gz", hash = "sha256:90d68423e9d398834cbef300e65b7dc8f0f380f279ba25764d35296df1703152", size = 15032, upload-time = "2025-07-23T09:30:46.535Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6e/af/a3ecf8d204330a07cfeff60c42318df788601a9ade72fc032221bb272f21/livekit_api-1.0.5.tar.gz", hash = "sha256:1607f640ebef177208e3257098ac1fa25e37d1f72a87d0f9953d616d6eb9f18e", size = 15117, upload-time = "2025-07-24T16:43:02.467Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/1a/13f4cfdd0fb60b7057adf0a1ffeb2f0bdb13ad140d59fd34cff45e0c52a3/livekit_api-1.0.4-py3-none-any.whl", 
hash = "sha256:583004fc6aa7255d53932c5863145dbec16a4645fbf3944f0fdcbf2bb8ed9f86", size = 17472, upload-time = "2025-07-23T09:30:45.412Z" }, + { url = "https://files.pythonhosted.org/packages/3a/6f/8d978416467af2a14c4c8ff4c0285c7b2d92507da58b1f3c14cba48930f8/livekit_api-1.0.5-py3-none-any.whl", hash = "sha256:6af149b58b182f43e9a5d2d764582ed6f083c80b520ab3d489c817cea554255e", size = 17577, upload-time = "2025-07-24T16:43:00.961Z" }, ] [[package]] @@ -2200,16 +2211,14 @@ source = { editable = "livekit-plugins/livekit-plugins-groq" } dependencies = [ { name = "aiohttp" }, { name = "livekit" }, - { name = "livekit-agents", extra = ["codecs"] }, - { name = "livekit-plugins-openai" }, + { name = "livekit-agents", extra = ["codecs", "openai"] }, ] [package.metadata] requires-dist = [ { name = "aiohttp" }, { name = "livekit" }, - { name = "livekit-agents", extras = ["codecs"], editable = "livekit-agents" }, - { name = "livekit-plugins-openai", editable = "livekit-plugins/livekit-plugins-openai" }, + { name = "livekit-agents", extras = ["codecs", "openai"], editable = "livekit-agents" }, ] [[package]] @@ -2453,6 +2462,16 @@ requires-dist = [ { name = "numpy", specifier = ">=1.26" }, ] +[[package]] +name = "livekit-plugins-soniox" +source = { editable = "livekit-plugins/livekit-plugins-soniox" } +dependencies = [ + { name = "livekit-agents" }, +] + +[package.metadata] +requires-dist = [{ name = "livekit-agents", editable = "livekit-agents" }] + [[package]] name = "livekit-plugins-speechify" source = { editable = "livekit-plugins/livekit-plugins-speechify" } @@ -2523,6 +2542,31 @@ requires-dist = [ { name = "transformers", specifier = ">=4.47.1" }, ] +[[package]] +name = "livekit-plugins-ultravox" +source = { editable = "livekit-plugins/livekit-plugins-ultravox" } +dependencies = [ + { name = "livekit-agents", extra = ["codecs"] }, +] + +[package.metadata] +requires-dist = [{ name = "livekit-agents", extras = ["codecs"], editable = "livekit-agents" }] + +[[package]] +name = 
"livekit-plugins-upliftai" +source = { editable = "livekit-plugins/livekit-plugins-upliftai" } +dependencies = [ + { name = "livekit-agents", extra = ["codecs"] }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, +] + +[package.metadata] +requires-dist = [ + { name = "livekit-agents", extras = ["codecs"], editable = "livekit-agents" }, + { name = "numpy", specifier = ">=1.26" }, +] + [[package]] name = "livekit-protocol" version = "1.0.4" From cc93c160ee08988bb5c9fe41e941c197cda9daa6 Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Wed, 10 Sep 2025 01:53:34 +0900 Subject: [PATCH 10/16] Reuse websocket connection across different requests --- .../livekit/plugins/respeecher/tts.py | 265 +++++++++--------- pyproject.toml | 1 + 2 files changed, 133 insertions(+), 133 deletions(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 56e8a07fa8..71a16dc764 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -112,6 +112,19 @@ def __init__( self._sentence_tokenizer = ( tokenizer if is_given(tokenizer) else tokenize.blingfire.SentenceTokenizer() ) + self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse]( + connect_cb=self._connect_ws, + close_cb=self._close_ws, + ) + + async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse: + session = self._ensure_session() + ws_url = self._opts.base_url.replace("https://", "wss://").replace("http://", "ws://") + full_ws_url = 
f"{ws_url}{self._opts.model}/tts/websocket?api_key={self._opts.api_key}&source={API_VERSION_HEADER}&version={API_VERSION}" + return await asyncio.wait_for(session.ws_connect(full_ws_url), timeout) + + async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None: + await ws.close() def _ensure_session(self) -> aiohttp.ClientSession: if not self._session: @@ -146,8 +159,15 @@ def update_options( model: NotGivenOr[TTSModels | str] = NOT_GIVEN, ) -> None: """Update TTS options""" - if is_given(model): + if is_given(model) and model != self._opts.model: self._opts.model = model + # Clear the connection pool when model changes to force reconnection + asyncio.create_task(self._pool.aclose()) + self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse]( + connect_cb=self._connect_ws, + close_cb=self._close_ws, + ) + if is_given(voice_id): self._opts.voice_id = voice_id if is_given(voice_settings): @@ -158,6 +178,9 @@ def synthesize( ) -> ChunkedStream: return ChunkedStream(tts=self, input_text=text, conn_options=conn_options) + def prewarm(self) -> None: + self._pool.prewarm() + def stream( self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS ) -> SynthesizeStream: @@ -170,6 +193,7 @@ async def aclose(self) -> None: await stream.aclose() self._streams.clear() + await self._pool.aclose() if self._session: await self._session.close() @@ -244,145 +268,121 @@ class SynthesizeStream(tts.SynthesizeStream): def __init__(self, *, tts: TTS, conn_options: APIConnectOptions): super().__init__(tts=tts, conn_options=conn_options) + async def aclose(self) -> None: + await super().aclose() + async def _run(self, output_emitter: tts.AudioEmitter) -> None: - request_id = utils.shortuuid() + context_id = utils.shortuuid() output_emitter.initialize( - request_id=request_id, + request_id=context_id, sample_rate=self._tts._opts.sample_rate, num_channels=1, stream=True, mime_type="audio/pcm", ) - - ws_url = self._tts._opts.base_url.replace("https://", 
"wss://").replace("http://", "ws://") - full_ws_url = f"{ws_url}{self._tts._opts.model}/tts/websocket?api_key={self._tts._opts.api_key}&source={API_VERSION_HEADER}&version={API_VERSION}" + output_emitter.start_segment(segment_id=context_id) sent_tokenizer_stream = self._tts._sentence_tokenizer.stream() - async def _ws_operation(): - try: - ws = await asyncio.wait_for( - self._tts._ensure_session().ws_connect(full_ws_url), - timeout=self._conn_options.timeout, + async def _input_task() -> None: + async for data in self._input_ch: + if isinstance(data, self._FlushSentinel): + sent_tokenizer_stream.flush() + continue + sent_tokenizer_stream.push_text(data) + sent_tokenizer_stream.end_input() + + async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None: + async for sent in sent_tokenizer_stream: + generate_request = { + "context_id": context_id, + "transcript": sent.token, + "voice": { + "id": self._tts._opts.voice_id, + }, + "continue": True, + "output_format": { + "encoding": self._tts._opts.encoding, + "sample_rate": self._tts._opts.sample_rate, + }, + } + if ( + is_given(self._tts._opts.voice_settings) + and self._tts._opts.voice_settings.sampling_params + ): + generate_request["voice"]["sampling_params"] = ( + self._tts._opts.voice_settings.sampling_params + ) + + self._mark_started() + await ws.send_str(json.dumps(generate_request)) + + # Send final message with continue=False + end_request = { + "context_id": context_id, + "transcript": "", + "voice": { + "id": self._tts._opts.voice_id, + }, + "continue": False, + "output_format": { + "encoding": self._tts._opts.encoding, + "sample_rate": self._tts._opts.sample_rate, + }, + } + if ( + is_given(self._tts._opts.voice_settings) + and self._tts._opts.voice_settings.sampling_params + ): + end_request["voice"]["sampling_params"] = ( + self._tts._opts.voice_settings.sampling_params ) - except asyncio.TimeoutError: - raise APITimeoutError() from None - - async with ws: - - 
@utils.log_exceptions(logger=logger) - async def input_task() -> None: - async for data in self._input_ch: - if isinstance(data, self._FlushSentinel): - sent_tokenizer_stream.flush() - continue - - sent_tokenizer_stream.push_text(data) - - sent_tokenizer_stream.end_input() - - @utils.log_exceptions(logger=logger) - async def send_task() -> None: - async for sent in sent_tokenizer_stream: - generate_request = { - "context_id": request_id, - "transcript": sent.token, - "voice": { - "id": self._tts._opts.voice_id, - }, - "continue": True, # Always True for streamed sentences - "output_format": { - "encoding": self._tts._opts.encoding, - "sample_rate": self._tts._opts.sample_rate, - }, - } - if ( - is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_params - ): - generate_request["voice"]["sampling_params"] = ( - self._tts._opts.voice_settings.sampling_params - ) - - self._mark_started() - await ws.send_str(json.dumps(generate_request)) - - # Send final message with continue=False to signal end of stream - end_request = { - "context_id": request_id, - "transcript": "", - "voice": { - "id": self._tts._opts.voice_id, - }, - "continue": False, - "output_format": { - "encoding": self._tts._opts.encoding, - "sample_rate": self._tts._opts.sample_rate, - }, - } - if ( - is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_params - ): - end_request["voice"]["sampling_params"] = ( - self._tts._opts.voice_settings.sampling_params - ) - - await ws.send_str(json.dumps(end_request)) - - @utils.log_exceptions(logger=logger) - async def recv_task() -> None: - current_segment_id: str | None = None - while True: - msg = await ws.receive() - if msg.type in ( - aiohttp.WSMsgType.CLOSED, - aiohttp.WSMsgType.CLOSE, - aiohttp.WSMsgType.CLOSING, - ): - raise APIStatusError( - message="Respeecher websocket closed unexpectedly", - status_code=500, - request_id=request_id, - body=None, - ) - - if msg.type != 
aiohttp.WSMsgType.TEXT: - logger.warning("Unexpected Respeecher message type %s", msg.type) - continue - - data = json.loads(msg.data) - - if data.get("type") == "error": - logger.error(f"Respeecher API error: {data.get('error')}") - raise APIError(message=data.get("error")) - - if data.get("type") == "chunk": - if current_segment_id is None: - current_segment_id = request_id - output_emitter.start_segment(segment_id=current_segment_id) - - audio_data = base64.b64decode(data["data"]) - output_emitter.push(audio_data) - elif data.get("type") == "done": - logger.debug(f"Received done message: {data}") - # End the current segment if one was started - if current_segment_id is not None: - output_emitter.end_segment() - current_segment_id = None - - # Only end input when the sentence tokenizer stream is closed - # and we've received the final done message - if sent_tokenizer_stream.closed: - output_emitter.end_input() - return - else: - raise APIError("Unexpected websocket message type") + await ws.send_str(json.dumps(end_request)) + + async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None: + while True: + msg = await ws.receive() + if msg.type in ( + aiohttp.WSMsgType.CLOSED, + aiohttp.WSMsgType.CLOSE, + aiohttp.WSMsgType.CLOSING, + ): + raise APIStatusError( + "Respeecher connection closed unexpectedly", request_id=context_id + ) + + if msg.type != aiohttp.WSMsgType.TEXT: + logger.warning("Unexpected Respeecher message type %s", msg.type) + continue + + data = json.loads(msg.data) + + if data.get("context_id") != context_id: + logger.warning( + "Received a message with context_id=%s instead of expected %s", + data.get("context_id"), + context_id, + ) + continue + + if data.get("type") == "error": + raise APIError(f"Respeecher returned error: {data.get('error')}") + + if data.get("type") == "chunk": + audio_data = base64.b64decode(data["data"]) + output_emitter.push(audio_data) + + elif data.get("type") == "done": + if sent_tokenizer_stream.closed: + 
output_emitter.end_input() + break + try: + async with self._tts._pool.connection(timeout=self._conn_options.timeout) as ws: tasks = [ - asyncio.create_task(input_task()), - asyncio.create_task(send_task()), - asyncio.create_task(recv_task()), + asyncio.create_task(_input_task()), + asyncio.create_task(_sentence_stream_task(ws)), + asyncio.create_task(_recv_task(ws)), ] try: @@ -390,14 +390,13 @@ async def recv_task() -> None: finally: await sent_tokenizer_stream.aclose() await utils.aio.gracefully_cancel(*tasks) - - try: - await _ws_operation() - except APITimeoutError: - raise + except asyncio.TimeoutError: + raise APITimeoutError() from None except aiohttp.ClientResponseError as e: raise APIStatusError( - message=e.message, status_code=e.status, request_id=request_id, body=None + message=e.message, status_code=e.status, request_id=None, body=None ) from None except Exception as e: raise APIConnectionError() from e + finally: + output_emitter.end_segment() diff --git a/pyproject.toml b/pyproject.toml index c440d856d4..d8a588d226 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ livekit-plugins-nltk = { workspace = true } livekit-plugins-openai = { workspace = true } livekit-plugins-playai = { workspace = true } livekit-plugins-resemble = { workspace = true } +livekit-plugins-respeecher = { workspace = true } livekit-plugins-rime = { workspace = true } livekit-plugins-sarvam = { workspace = true } livekit-plugins-silero = { workspace = true } From ddbdabaf43d02e21e4a53a25f6c2b6deeb7be94b Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 13 Oct 2025 18:01:14 +0900 Subject: [PATCH 11/16] Reset uv.lock changes --- uv.lock | 64 +++++++++------------------------------------------------ 1 file changed, 10 insertions(+), 54 deletions(-) diff --git a/uv.lock b/uv.lock index b4d9886572..f7478cb030 100644 --- a/uv.lock +++ b/uv.lock @@ -59,14 +59,11 @@ members = [ "livekit-plugins-silero", "livekit-plugins-simli", "livekit-plugins-smallestai", 
- "livekit-plugins-soniox", "livekit-plugins-speechify", "livekit-plugins-speechmatics", "livekit-plugins-spitch", "livekit-plugins-tavus", "livekit-plugins-turn-detector", - "livekit-plugins-ultravox", - "livekit-plugins-upliftai", ] constraints = [{ name = "onnxruntime", marker = "python_full_version == '3.9.*'", specifier = "<1.20.0" }] @@ -1863,12 +1860,6 @@ tavus = [ turn-detector = [ { name = "livekit-plugins-turn-detector" }, ] -ultravox = [ - { name = "livekit-plugins-ultravox" }, -] -upliftai = [ - { name = "livekit-plugins-upliftai" }, -] [package.metadata] requires-dist = [ @@ -1880,7 +1871,7 @@ requires-dist = [ { name = "docstring-parser", specifier = ">=0.16" }, { name = "eval-type-backport" }, { name = "livekit", specifier = ">=1.0.12,<2" }, - { name = "livekit-api", specifier = ">=1.0.5,<2" }, + { name = "livekit-api", specifier = ">=1.0.4,<2" }, { name = "livekit-blingfire", specifier = "~=1.0" }, { name = "livekit-plugins-anam", marker = "extra == 'anam'", editable = "livekit-plugins/livekit-plugins-anam" }, { name = "livekit-plugins-anthropic", marker = "extra == 'anthropic'", editable = "livekit-plugins/livekit-plugins-anthropic" }, @@ -1919,8 +1910,6 @@ requires-dist = [ { name = "livekit-plugins-spitch", marker = "extra == 'spitch'", editable = "livekit-plugins/livekit-plugins-spitch" }, { name = "livekit-plugins-tavus", marker = "extra == 'tavus'", editable = "livekit-plugins/livekit-plugins-tavus" }, { name = "livekit-plugins-turn-detector", marker = "extra == 'turn-detector'", editable = "livekit-plugins/livekit-plugins-turn-detector" }, - { name = "livekit-plugins-ultravox", marker = "extra == 'ultravox'", editable = "livekit-plugins/livekit-plugins-ultravox" }, - { name = "livekit-plugins-upliftai", marker = "extra == 'upliftai'", editable = "livekit-plugins/livekit-plugins-upliftai" }, { name = "livekit-protocol", specifier = "~=1.0" }, { name = "mcp", marker = "python_full_version >= '3.10' and extra == 'mcp'", specifier = ">=1.10.0,<2" 
}, { name = "nest-asyncio", specifier = ">=1.6.0" }, @@ -1936,15 +1925,15 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.0,<3" }, { name = "pyjwt", specifier = ">=2.0" }, { name = "sounddevice", specifier = ">=0.5" }, - { name = "types-protobuf", specifier = ">=4" }, + { name = "types-protobuf", specifier = ">=4,<5" }, { name = "typing-extensions", specifier = ">=4.12" }, { name = "watchfiles", specifier = ">=1.0" }, ] -provides-extras = ["anam", "anthropic", "assemblyai", "aws", "azure", "baseten", "bey", "bithuman", "cartesia", "clova", "codecs", "deepgram", "elevenlabs", "fal", "gladia", "google", "groq", "hedra", "hume", "images", "inworld", "langchain", "lmnt", "mcp", "mistralai", "neuphonic", "nltk", "openai", "playai", "resemble", "rime", "sarvam", "silero", "simli", "smallestai", "speechify", "speechmatics", "spitch", "tavus", "turn-detector", "ultravox", "upliftai"] +provides-extras = ["anam", "anthropic", "assemblyai", "aws", "azure", "baseten", "bey", "bithuman", "cartesia", "clova", "codecs", "deepgram", "elevenlabs", "fal", "gladia", "google", "groq", "hedra", "hume", "images", "inworld", "langchain", "lmnt", "mcp", "mistralai", "neuphonic", "nltk", "openai", "playai", "resemble", "rime", "sarvam", "silero", "simli", "smallestai", "speechify", "speechmatics", "spitch", "tavus", "turn-detector"] [[package]] name = "livekit-api" -version = "1.0.5" +version = "1.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -1953,9 +1942,9 @@ dependencies = [ { name = "pyjwt" }, { name = "types-protobuf" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6e/af/a3ecf8d204330a07cfeff60c42318df788601a9ade72fc032221bb272f21/livekit_api-1.0.5.tar.gz", hash = "sha256:1607f640ebef177208e3257098ac1fa25e37d1f72a87d0f9953d616d6eb9f18e", size = 15117, upload-time = "2025-07-24T16:43:02.467Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/b3/3a/abf4135de6ac7b43b5cc467a68731ee1adbdaa9dadea8149d37a4b0ef85f/livekit_api-1.0.4.tar.gz", hash = "sha256:90d68423e9d398834cbef300e65b7dc8f0f380f279ba25764d35296df1703152", size = 15032, upload-time = "2025-07-23T09:30:46.535Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/6f/8d978416467af2a14c4c8ff4c0285c7b2d92507da58b1f3c14cba48930f8/livekit_api-1.0.5-py3-none-any.whl", hash = "sha256:6af149b58b182f43e9a5d2d764582ed6f083c80b520ab3d489c817cea554255e", size = 17577, upload-time = "2025-07-24T16:43:00.961Z" }, + { url = "https://files.pythonhosted.org/packages/0b/1a/13f4cfdd0fb60b7057adf0a1ffeb2f0bdb13ad140d59fd34cff45e0c52a3/livekit_api-1.0.4-py3-none-any.whl", hash = "sha256:583004fc6aa7255d53932c5863145dbec16a4645fbf3944f0fdcbf2bb8ed9f86", size = 17472, upload-time = "2025-07-23T09:30:45.412Z" }, ] [[package]] @@ -2211,14 +2200,16 @@ source = { editable = "livekit-plugins/livekit-plugins-groq" } dependencies = [ { name = "aiohttp" }, { name = "livekit" }, - { name = "livekit-agents", extra = ["codecs", "openai"] }, + { name = "livekit-agents", extra = ["codecs"] }, + { name = "livekit-plugins-openai" }, ] [package.metadata] requires-dist = [ { name = "aiohttp" }, { name = "livekit" }, - { name = "livekit-agents", extras = ["codecs", "openai"], editable = "livekit-agents" }, + { name = "livekit-agents", extras = ["codecs"], editable = "livekit-agents" }, + { name = "livekit-plugins-openai", editable = "livekit-plugins/livekit-plugins-openai" }, ] [[package]] @@ -2462,16 +2453,6 @@ requires-dist = [ { name = "numpy", specifier = ">=1.26" }, ] -[[package]] -name = "livekit-plugins-soniox" -source = { editable = "livekit-plugins/livekit-plugins-soniox" } -dependencies = [ - { name = "livekit-agents" }, -] - -[package.metadata] -requires-dist = [{ name = "livekit-agents", editable = "livekit-agents" }] - [[package]] name = "livekit-plugins-speechify" source = { editable = 
"livekit-plugins/livekit-plugins-speechify" } @@ -2542,31 +2523,6 @@ requires-dist = [ { name = "transformers", specifier = ">=4.47.1" }, ] -[[package]] -name = "livekit-plugins-ultravox" -source = { editable = "livekit-plugins/livekit-plugins-ultravox" } -dependencies = [ - { name = "livekit-agents", extra = ["codecs"] }, -] - -[package.metadata] -requires-dist = [{ name = "livekit-agents", extras = ["codecs"], editable = "livekit-agents" }] - -[[package]] -name = "livekit-plugins-upliftai" -source = { editable = "livekit-plugins/livekit-plugins-upliftai" } -dependencies = [ - { name = "livekit-agents", extra = ["codecs"] }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, -] - -[package.metadata] -requires-dist = [ - { name = "livekit-agents", extras = ["codecs"], editable = "livekit-agents" }, - { name = "numpy", specifier = ">=1.26" }, -] - [[package]] name = "livekit-protocol" version = "1.0.4" From f1882d2accc31cb43f193d01c02a0fef9e7e6ace Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 13 Oct 2025 18:19:41 +0900 Subject: [PATCH 12/16] Run mypy to type checks --- .../livekit/plugins/respeecher/models.py | 4 +-- .../livekit/plugins/respeecher/py.typed | 0 .../livekit/plugins/respeecher/tts.py | 34 +++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) create mode 100644 livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/py.typed diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index b6320d2a3c..03caf3c159 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ 
b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -26,14 +26,14 @@ class VoiceSettings: class Voice(dict): """Voice model for Respeecher - behaves like a dict with guaranteed `id` and optional `sampling_params`""" - def __init__(self, *args, **kwargs): + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) if "id" not in self: raise ValueError("Voice must have an 'id' field") @property def id(self) -> str: - return self["id"] + return str(self["id"]) @property def sampling_params(self) -> Optional[SamplingParams]: diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/py.typed b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 71a16dc764..33b2254bfd 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -224,7 +224,7 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: is_given(self._tts._opts.voice_settings) and self._tts._opts.voice_settings.sampling_params ): - json_data["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params + json_data["voice"]["sampling_params"] = self._tts._opts.voice_settings.sampling_params # type: ignore[index] http_url = f"{self._tts._opts.base_url}{self._tts._opts.model}/tts/bytes" @@ -275,14 +275,14 @@ async def _run(self, output_emitter: tts.AudioEmitter) -> None: context_id = utils.shortuuid() output_emitter.initialize( request_id=context_id, - sample_rate=self._tts._opts.sample_rate, + sample_rate=self._tts._opts.sample_rate, # type: ignore[attr-defined] num_channels=1, stream=True, mime_type="audio/pcm", ) 
output_emitter.start_segment(segment_id=context_id) - sent_tokenizer_stream = self._tts._sentence_tokenizer.stream() + sent_tokenizer_stream = self._tts._sentence_tokenizer.stream() # type: ignore[attr-defined] async def _input_task() -> None: async for data in self._input_ch: @@ -298,20 +298,20 @@ async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None: "context_id": context_id, "transcript": sent.token, "voice": { - "id": self._tts._opts.voice_id, + "id": self._tts._opts.voice_id, # type: ignore[attr-defined] }, "continue": True, "output_format": { - "encoding": self._tts._opts.encoding, - "sample_rate": self._tts._opts.sample_rate, + "encoding": self._tts._opts.encoding, # type: ignore[attr-defined] + "sample_rate": self._tts._opts.sample_rate, # type: ignore[attr-defined] }, } if ( - is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_params + is_given(self._tts._opts.voice_settings) # type: ignore[attr-defined] + and self._tts._opts.voice_settings.sampling_params # type: ignore[attr-defined] ): generate_request["voice"]["sampling_params"] = ( - self._tts._opts.voice_settings.sampling_params + self._tts._opts.voice_settings.sampling_params # type: ignore[attr-defined] ) self._mark_started() @@ -322,20 +322,20 @@ async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None: "context_id": context_id, "transcript": "", "voice": { - "id": self._tts._opts.voice_id, + "id": self._tts._opts.voice_id, # type: ignore[attr-defined] }, "continue": False, "output_format": { - "encoding": self._tts._opts.encoding, - "sample_rate": self._tts._opts.sample_rate, + "encoding": self._tts._opts.encoding, # type: ignore[attr-defined] + "sample_rate": self._tts._opts.sample_rate, # type: ignore[attr-defined] }, } if ( - is_given(self._tts._opts.voice_settings) - and self._tts._opts.voice_settings.sampling_params + is_given(self._tts._opts.voice_settings) # type: ignore[attr-defined] + and 
self._tts._opts.voice_settings.sampling_params # type: ignore[attr-defined] ): - end_request["voice"]["sampling_params"] = ( - self._tts._opts.voice_settings.sampling_params + end_request["voice"]["sampling_params"] = ( # type: ignore[index] + self._tts._opts.voice_settings.sampling_params # type: ignore[attr-defined] ) await ws.send_str(json.dumps(end_request)) @@ -378,7 +378,7 @@ async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None: break try: - async with self._tts._pool.connection(timeout=self._conn_options.timeout) as ws: + async with self._tts._pool.connection(timeout=self._conn_options.timeout) as ws: # type: ignore[attr-defined] tasks = [ asyncio.create_task(_input_task()), asyncio.create_task(_sentence_stream_task(ws)), From 742b7ddd53aa283773cc71ebd8f96001ae8d3f8a Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 13 Oct 2025 18:24:29 +0900 Subject: [PATCH 13/16] Remove pcm_f32le --- .../livekit/plugins/respeecher/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index 03caf3c159..72c588bcee 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -8,7 +8,6 @@ TTSEncoding = Literal[ "pcm_s16le", - "pcm_f32le", ] From 3fd16460d9f11c01548df2ce07233fe7851dcd4b Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 13 Oct 2025 20:11:23 +0900 Subject: [PATCH 14/16] Simplify string replace --- .../livekit/plugins/respeecher/tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 33b2254bfd..4795d0ab59 100644 --- 
a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -119,7 +119,7 @@ def __init__( async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse: session = self._ensure_session() - ws_url = self._opts.base_url.replace("https://", "wss://").replace("http://", "ws://") + ws_url = self._opts.base_url.replace("http", "ws") full_ws_url = f"{ws_url}{self._opts.model}/tts/websocket?api_key={self._opts.api_key}&source={API_VERSION_HEADER}&version={API_VERSION}" return await asyncio.wait_for(session.ws_connect(full_ws_url), timeout) From b344a0e27fe9cd668713dcbaa87869c40e0a802d Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 13 Oct 2025 20:25:50 +0900 Subject: [PATCH 15/16] Run ruff formatter --- .../livekit/plugins/respeecher/models.py | 4 +--- .../livekit/plugins/respeecher/tts.py | 7 ++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py index 72c588bcee..e378a6a0e7 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/models.py @@ -6,9 +6,7 @@ "/public/tts/en-rt", ] -TTSEncoding = Literal[ - "pcm_s16le", -] +TTSEncoding = Literal["pcm_s16le",] """Check https://space.respeecher.com/docs/api/tts/sampling-params-guide for details""" diff --git a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py index 4795d0ab59..547c685f82 100644 --- a/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py +++ b/livekit-plugins/livekit-plugins-respeecher/livekit/plugins/respeecher/tts.py @@ -119,7 +119,12 @@ def __init__( async def 
_connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse: session = self._ensure_session() - ws_url = self._opts.base_url.replace("http", "ws") + # WebSocket protocol does not support custom headers, using query parameter + ws_url = self._opts.base_url.replace("https://", "wss://").replace("http://", "ws://") + if not ws_url.startswith("wss://"): + logger.error("Insecure WebSocket connection detected, wss:// required") + raise APIConnectionError("Secure WebSocket connection (wss://) required") + full_ws_url = f"{ws_url}{self._opts.model}/tts/websocket?api_key={self._opts.api_key}&source={API_VERSION_HEADER}&version={API_VERSION}" return await asyncio.wait_for(session.ws_connect(full_ws_url), timeout) From 80047b2544d7707a9bd6beb6aae4cde79ebe7db4 Mon Sep 17 00:00:00 2001 From: Anna Mitrushchienkova Date: Mon, 13 Oct 2025 20:29:17 +0900 Subject: [PATCH 16/16] Reset uv.lock --- uv.lock | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/uv.lock b/uv.lock index 4820c6d6f3..077565c71c 100644 --- a/uv.lock +++ b/uv.lock @@ -55,7 +55,6 @@ members = [ "livekit-plugins-openai", "livekit-plugins-playai", "livekit-plugins-resemble", - "livekit-plugins-respeecher", "livekit-plugins-rime", "livekit-plugins-sarvam", "livekit-plugins-silero", @@ -2418,20 +2417,6 @@ dependencies = [ [package.metadata] requires-dist = [{ name = "livekit-agents", editable = "livekit-agents" }] -[[package]] -name = "livekit-plugins-respeecher" -source = { editable = "livekit-plugins/livekit-plugins-respeecher" } -dependencies = [ - { name = "aiohttp" }, - { name = "livekit-agents" }, -] - -[package.metadata] -requires-dist = [ - { name = "aiohttp" }, - { name = "livekit-agents", editable = "livekit-agents" }, -] - [[package]] name = "livekit-plugins-rime" source = { editable = "livekit-plugins/livekit-plugins-rime" }