From da383ad065ef21cef4d45f93c267f5d9335de107 Mon Sep 17 00:00:00 2001 From: Jaebong Jeong Date: Mon, 30 Mar 2026 16:01:01 +0900 Subject: [PATCH 1/2] added TypecastEngine using typecast-python SDK Co-Authored-By: Claude Sonnet 4.6 --- RealtimeTTS/__init__.py | 16 ++ RealtimeTTS/engines/__init__.py | 12 +- RealtimeTTS/engines/typecast_engine.py | 236 +++++++++++++++++++++++++ 3 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 RealtimeTTS/engines/typecast_engine.py diff --git a/RealtimeTTS/__init__.py b/RealtimeTTS/__init__.py index 7ee4d37..a78fd82 100644 --- a/RealtimeTTS/__init__.py +++ b/RealtimeTTS/__init__.py @@ -24,6 +24,7 @@ "MiniMaxEngine", "MiniMaxVoice", "CartesiaEngine", "CartesiaVoice", "FasterQwenEngine", "FasterQwenVoice", + "TypecastEngine", "TypecastVoice", ] @@ -276,6 +277,19 @@ def _load_fasterqwen_engine(): return FasterQwenEngine +def _load_typecast_engine(): + try: + from .engines.typecast_engine import TypecastEngine, TypecastVoice + except ImportError as e: + raise ImportError( + "Failed to load TypecastEngine and TypecastVoice. " + "Make sure the typecast-python package is installed: pip install typecast-python" + ) from e + globals()["TypecastEngine"] = TypecastEngine + globals()["TypecastVoice"] = TypecastVoice + return TypecastEngine + + # Mapping names to their lazy loader functions. 
_lazy_imports = { "SystemEngine": _load_system_engine, @@ -316,6 +330,8 @@ def _load_fasterqwen_engine(): "CartesiaVoice": _load_cartesia_engine, "FasterQwenEngine": _load_fasterqwen_engine, "FasterQwenVoice": _load_fasterqwen_engine, + "TypecastEngine": _load_typecast_engine, + "TypecastVoice": _load_typecast_engine, } diff --git a/RealtimeTTS/engines/__init__.py b/RealtimeTTS/engines/__init__.py index 7d91602..5426a5c 100644 --- a/RealtimeTTS/engines/__init__.py +++ b/RealtimeTTS/engines/__init__.py @@ -21,7 +21,8 @@ "ModelsLabEngine", "ModelsLabVoice", "MiniMaxEngine", "MiniMaxVoice", "CartesiaEngine", "CartesiaVoice", - "FasterQwenEngine", "FasterQwenVoice" + "FasterQwenEngine", "FasterQwenVoice", + "TypecastEngine", "TypecastVoice" ] @@ -166,6 +167,13 @@ def _load_fasterqwen_engine(): return FasterQwenEngine +def _load_typecast_engine(): + from .typecast_engine import TypecastEngine, TypecastVoice + globals()["TypecastEngine"] = TypecastEngine + globals()["TypecastVoice"] = TypecastVoice + return TypecastEngine + + # Map attribute names to lazy loader functions. 
_lazy_imports = { "AzureEngine": _load_azure_engine, @@ -208,6 +216,8 @@ def _load_fasterqwen_engine(): "CartesiaVoice": _load_cartesia_engine, "FasterQwenEngine": _load_fasterqwen_engine, "FasterQwenVoice": _load_fasterqwen_engine, + "TypecastEngine": _load_typecast_engine, + "TypecastVoice": _load_typecast_engine, } diff --git a/RealtimeTTS/engines/typecast_engine.py b/RealtimeTTS/engines/typecast_engine.py new file mode 100644 index 0000000..9e515da --- /dev/null +++ b/RealtimeTTS/engines/typecast_engine.py @@ -0,0 +1,236 @@ +from .base_engine import BaseEngine +from typing import Union +import os +import io +import wave +import pyaudio + + +class TypecastVoice: + def __init__(self, name: str, voice_id: str, gender: str = "", age: str = "", + use_cases: list = None, models: list = None): + self.name = name + self.voice_id = voice_id + self.gender = gender + self.age = age + self.use_cases = use_cases or [] + self.models = models or [] + + def __repr__(self): + return f"{self.name} ({self.voice_id})" + + +class TypecastEngine(BaseEngine): + def __init__( + self, + voice_id: str = None, + model: str = "ssfm-v30", + tempo: float = None, + pitch: int = None, + volume: int = None, + language: str = None, + emotion_type: str = "preset", + emotion_preset: str = "normal", + emotion_intensity: float = None, + seed: int = None, + api_key: str = None, + host: str = None, + debug: bool = False, + chunk_size: int = 4096, + ): + """ + Initializes a Typecast TTS engine using the official typecast-python SDK. + + Args: + voice_id (str): Typecast voice ID. If unspecified, uses TYPECAST_VOICE_ID env var. + model (str): Model version. "ssfm-v30" or "ssfm-v21". Defaults to "ssfm-v30". + tempo (float): Speech speed multiplier (0.5–2.0). + pitch (int): Pitch in semitones (-12 to +12). + volume (int): Volume (0–200). + language (str): Language code (ISO 639-3). Auto-detected if None. + emotion_type (str): Emotion mode. "preset" or "smart" (ssfm-v30 only). 
+ "smart" infers emotion from context automatically. + emotion_preset (str): Emotion preset when emotion_type is "preset". + ssfm-v30: normal, happy, sad, angry, whisper, toneup, tonedown. + ssfm-v21: normal, happy, sad, angry. + emotion_intensity (float): Emotion intensity (0.0–2.0). Defaults to 1.0. + seed (int): Random seed for reproducible output. + api_key (str): Typecast API key. If unspecified, uses TYPECAST_API_KEY env var. + host (str): API host URL override. If unspecified, the SDK uses + TYPECAST_API_HOST env var or the default endpoint. + debug (bool): Print debug information. + chunk_size (int): Audio chunk size in bytes for streaming. + """ + self.api_key = api_key or os.environ.get("TYPECAST_API_KEY") + if not self.api_key: + raise ValueError( + "Typecast API key is required. Provide it via api_key parameter " + "or TYPECAST_API_KEY environment variable." + ) + + self.voice_id = voice_id or os.environ.get("TYPECAST_VOICE_ID") + self.model = model + self.tempo = tempo + self.pitch = pitch + self.volume = volume + self.language = language + self.emotion_type = emotion_type + self.emotion_preset = emotion_preset + self.emotion_intensity = emotion_intensity + self.seed = seed + self.host = host # If None, SDK falls back to TYPECAST_API_HOST env var or default + self.debug = debug + self.chunk_size = chunk_size + self._client = None + + def post_init(self): + self.engine_name = "typecast" + + def _get_client(self): + if self._client is None: + try: + from typecast import Typecast + except ImportError as e: + raise ImportError( + "typecast-python package is required. 
Install with:\n" + "pip install typecast-python" + ) from e + kwargs = {"api_key": self.api_key} + if self.host: + kwargs["host"] = self.host + self._client = Typecast(**kwargs) + return self._client + + def _build_prompt(self): + """Builds the appropriate prompt object based on emotion_type and model.""" + from typecast.models import Prompt, PresetPrompt, SmartPrompt + + if self.model == "ssfm-v21": + kwargs = {} + if self.emotion_preset is not None: + kwargs["emotion_preset"] = self.emotion_preset + if self.emotion_intensity is not None: + kwargs["emotion_intensity"] = self.emotion_intensity + return Prompt(**kwargs) if kwargs else None + + # ssfm-v30 + if self.emotion_type == "smart": + return SmartPrompt() + else: + kwargs = {} + if self.emotion_preset is not None: + kwargs["emotion_preset"] = self.emotion_preset + if self.emotion_intensity is not None: + kwargs["emotion_intensity"] = self.emotion_intensity + return PresetPrompt(**kwargs) + + def get_stream_info(self): + """ + Returns PyAudio stream configuration for Typecast (WAV: mono, 44100Hz, 16-bit). + """ + return pyaudio.paInt16, 1, 44100 + + def synthesize(self, text: str) -> bool: + """ + Synthesizes text to audio using the typecast-python SDK and streams to queue. + + Args: + text (str): Text to synthesize. + + Returns: + bool: True on success, False on failure. + """ + from typecast.models import TTSRequest, TTSModel, Output + + if not self.voice_id: + print("[TypecastEngine] Error: voice_id is not set. 
" + "Pass voice_id to the constructor or set TYPECAST_VOICE_ID.") + return False + + if self.debug: + print(f"[TypecastEngine] Synthesizing: \"{text}\"") + + try: + output_kwargs = {"audio_format": "wav"} + if self.tempo is not None: + output_kwargs["audio_tempo"] = self.tempo + if self.pitch is not None: + output_kwargs["audio_pitch"] = self.pitch + if self.volume is not None: + output_kwargs["volume"] = self.volume + + request = TTSRequest( + text=text, + voice_id=self.voice_id, + model=TTSModel(self.model), + language=self.language, + prompt=self._build_prompt(), + seed=self.seed, + output=Output(**output_kwargs), + ) + + client = self._get_client() + response = client.text_to_speech(request) + + with wave.open(io.BytesIO(response.audio_data), "rb") as wf: + frames_per_chunk = self.chunk_size // (wf.getsampwidth() * wf.getnchannels()) + while not self.stop_synthesis_event.is_set(): + data = wf.readframes(frames_per_chunk) + if not data: + break + self.queue.put(data) + + except Exception as e: + print(f"[TypecastEngine] Error during synthesis: {e}") + return False + + return True + + def get_voices(self) -> list: + """ + Returns available Typecast voices via the SDK. + """ + try: + voices = self._get_client().voices_v2() + return [ + TypecastVoice( + name=v.voice_name, + voice_id=v.voice_id, + gender=v.gender.value if v.gender else "", + age=v.age.value if v.age else "", + use_cases=v.use_cases or [], + models=[m.version for m in v.models] if v.models else [], + ) + for v in voices + ] + except Exception as e: + print(f"[TypecastEngine] Error fetching voices: {e}") + return [] + + def set_voice(self, voice: Union[str, TypecastVoice]): + """ + Sets the voice for synthesis. + + Args: + voice: Voice name/ID string or TypecastVoice object. 
+ """ + if isinstance(voice, TypecastVoice): + self.voice_id = voice.voice_id + return + for v in self.get_voices(): + if voice.lower() in v.name.lower() or voice == v.voice_id: + self.voice_id = v.voice_id + return + # Fallback: treat as raw voice_id + self.voice_id = voice + + def set_voice_parameters(self, **voice_parameters): + """ + Sets voice parameters. + + Supported keys: tempo, pitch, volume, language, seed, model, + emotion_type, emotion_preset, emotion_intensity + """ + for key, value in voice_parameters.items(): + if hasattr(self, key): + setattr(self, key, value) From e056a4d95f00fd4abbea5b76bdaf856ab0ecd39c Mon Sep 17 00:00:00 2001 From: Jaebong Jeong Date: Tue, 31 Mar 2026 14:39:27 +0900 Subject: [PATCH 2/2] Add tests, setup.py extra, and README for TypecastEngine - Add tests/typecast_test.py with basic synthesis and emotion switching - Add realtimetts[typecast] install extra (typecast-python) in setup.py - Update README: engine list, install options, TypecastEngine section Co-Authored-By: Claude Sonnet 4.6 --- README.md | 11 ++++++- setup.py | 2 ++ tests/typecast_test.py | 72 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 tests/typecast_test.py diff --git a/README.md b/README.md index 8b400ed..45ed71d 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ https://github.com/KoljaB/RealtimeTTS/assets/7604638/87dcd9a5-3a4e-4f57-be45-837 - **High-Quality Audio** - generates clear and natural-sounding speech - **Multiple TTS Engine Support** - - supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS, Kokoro, Cartesia, Faster Qwen 3, NeuTTS, PocketTTS, Modelslab, CAMB AI, MiniMax and System TTS + - supports OpenAI TTS, Elevenlabs, Azure Speech Services, Coqui TTS, StyleTTS2, Piper, gTTS, Edge TTS, Parler TTS, Kokoro, Cartesia, Faster Qwen 3, NeuTTS, PocketTTS, Modelslab, CAMB AI, MiniMax, Typecast and System TTS - **Multilingual** - 
**Robust and Reliable**: - ensures continuous operation through a fallback mechanism @@ -140,6 +140,7 @@ This library uses: - **OrpheusEngine** 🏠: Llama‑powered TTS with emotion tags - **CambEngine** 🌐: CAMB AI MARS models with 140+ languages - **MiniMaxEngine** 🌐: MiniMax Cloud TTS with 12 voice presets + - **TypecastEngine** 🌐: Typecast Cloud TTS with emotion and prosody control - **ZipVoiceEngine** 🏠: 123M zero‑shot model, state‑of‑the‑art quality - **PocketTTSEngine** 🏠: Kyutai Labs 100M model, CPU-optimized with voice cloning - **NeuTTSEngine** 🏠: Voice cloning with 3-second reference audio @@ -183,6 +184,7 @@ Install only required dependencies using these options: - **coqui**: Coqui TTS engine - **camb**: CAMB AI MARS TTS - **minimax**: MiniMax Cloud TTS +- **typecast**: Typecast Cloud TTS - **minimal**: Core package only (for custom engine development) Example: `pip install realtimetts[all]`, `pip install realtimetts[azure]`, `pip install realtimetts[azure,elevenlabs,openai]` @@ -235,6 +237,13 @@ To use the `MiniMaxEngine`, you need: - Available models: `speech-2.8-hd` (high quality), `speech-2.8-turbo` (fast) - 12 voice presets including English and multilingual options +### TypecastEngine +To use the `TypecastEngine`, you need: +- Typecast API key (provided via TypecastEngine constructor parameter "api_key" or in the environment variable TYPECAST_API_KEY) +- Voice ID (provided via TypecastEngine constructor parameter "voice_id" or in the environment variable TYPECAST_VOICE_ID) +- Available models: `ssfm-v30` (default, supports smart emotion), `ssfm-v21` +- Emotion presets (ssfm-v30): `normal`, `happy`, `sad`, `angry`, `whisper`, `toneup`, `tonedown` + ### ElevenlabsEngine For the `ElevenlabsEngine`, you need: - Elevenlabs API key (provided via ElevenlabsEngine constructor parameter "api_key" or in the environment variable ELEVENLABS_API_KEY) diff --git a/setup.py b/setup.py index 27086fc..0db3e26 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ - 
**minimax**: MiniMax Cloud TTS - **cartesia** Cartesia API integration - **qwen** Faster Qwen3 TTS integration +- **typecast**: Typecast Cloud TTS - **minimal**: Core package only (for custom engine development) You can install multiple engines by separating them with commas. For example: @@ -94,6 +95,7 @@ def parse_requirements(filename): "minimax": base_requirements + [requirements.get("requests", "requests")], "orpheus": base_requirements + [requirements.get("snac", "snac")], "qwen": base_requirements + [requirements.get("faster-qwen3-tts", "faster-qwen3-tts")], + "typecast": base_requirements + ["typecast-python"], "jp": base_requirements +["mecab-python3>=1.0.6", "unidic-lite>=1.0.8", "cutlet", "fugashi>=1.4.0", "jaconv>=0.4.0", "mojimoji>=0.0.13", "pyopenjtalk>=0.4.0"], "zh": base_requirements +["pypinyin>=0.53.0", "ordered_set>=4.1.0", "jieba>=0.42.1", "cn2an>=0.5.23"], "ko": base_requirements +["hangul_romanize"], diff --git a/tests/typecast_test.py b/tests/typecast_test.py new file mode 100644 index 0000000..98df5a0 --- /dev/null +++ b/tests/typecast_test.py @@ -0,0 +1,72 @@ +""" +Typecast TTS – Quick Test + +A simple test that synthesizes text using the Typecast API. + +Env vars: TYPECAST_API_KEY, TYPECAST_VOICE_ID +""" + +if __name__ == "__main__": + import os + import sys + import time + from dotenv import load_dotenv + + load_dotenv() + + from RealtimeTTS import TextToAudioStream, TypecastEngine + + # Initialise engine + engine = TypecastEngine( + voice_id=os.environ.get("TYPECAST_VOICE_ID"), + model="ssfm-v30", + emotion_preset="normal", + debug=True, + ) + stream = TextToAudioStream(engine) + + print("Typecast TTS engine ready.") + print() + + # Simple synthesis test + test_text = "Hello! This is a test of the Typecast text to speech engine." 
+ print(f"Synthesizing: {test_text}") + stream.feed(test_text) + stream.play_async() + while stream.is_playing(): + time.sleep(0.1) + + print() + + # Test emotion preset switching + print("Switching to happy emotion...") + engine.set_voice_parameters(emotion_preset="happy") + test_text2 = "Now I am speaking with a happy emotion preset!" + print(f"Synthesizing: {test_text2}") + stream.feed(test_text2) + stream.play_async() + while stream.is_playing(): + time.sleep(0.1) + + print() + + # Interactive mode + print("Type text to synthesize (or 'quit' to exit):") + while True: + try: + user_input = input("> ").strip() + except (EOFError, KeyboardInterrupt): + print() + break + + if not user_input or user_input.lower() == "quit": + break + + stream.feed(user_input) + stream.play_async() + while stream.is_playing(): + time.sleep(0.1) + + stream.stop() + engine.shutdown() + print("Done!")